In [28]:
import psycopg2
import time

# establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname = "dw_cs", 
    user = "postgres", 
    host= 'localhost',
    #host = '172.30.160.1',
    password = "postgres",
    port = 5432
)

In [29]:
def check_indexes(table_name):

    with conn.cursor() as cur:
        query = f"""
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = '{table_name}';
        """

        cur.execute(query)      
        
        indexes = cur.fetchall()
        
        for index in indexes:
            print(f"Index Name: {index[0]}")
            print(f"Index Definition: {index[1]}\n")

# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    conn.rollback()
    with conn.cursor() as cur:
        if analyze:
            cur.execute(f"EXPLAIN ANALYZE {query}")
        else:
            cur.execute(f"EXPLAIN {query}")
        explain = cur.fetchall()

        for line in explain:
            print(line[0])

## Check indexes

In [30]:
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UNIQUE INDEX partsupp_pkey ON public.partsupp USING btree (ps_partkey, ps_suppkey)

Index Name: orders_pkey
Index Definition: CREATE UNIQUE INDEX orders_pkey ON public.orders USING btree (o_orderkey)



In [31]:
conn.rollback()
with conn.cursor() as cur:
    query = """
    drop index if exists idx_o_orderdate;
    """
    cur.execute(query)
    conn.commit()

## Query 1

is it necessary to create a mv for this query? we don't expect so much gain because of the sequential scan on shipdate. we could perform a group by, but in this way we can't add shipdate and adding this information to


## Materialized on part lineitem

In [32]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem AS
SELECT
    l_returnflag,
    l_linestatus,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_shipdate,
    l_partkey,
    p_partkey,
    p_brand,
    p_container,
    SUBSTRING(p_type FROM 1 FOR 5) AS p_type_prefix,
    0.2 * AVG(l_quantity) OVER (PARTITION BY l_partkey) AS avg_quantity
FROM
    lineitem l
JOIN
    part p ON l.l_partkey = p.p_partkey;

"""

with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 310.9520151615143 seconds


we allowed to use hash join since we are not interested in indexes performance.

In [33]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    size = cur.fetchall()
    print(f"Size of materialised view: {size[0][0]/(1024**2)} MB")
    print(f"Size of materialised view: {size[0][0]/(1024**3)} GB")

Size of materialised view: 6581.5078125 MB
Size of materialised view: 6.427253723144531 GB


In [34]:
conn.rollback()
check_indexes('part_lineitem')

### Query 14

In [35]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    cur.execute("SET enable_material = on;")
    conn.commit()

query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type_prefix LIKE 'PROMO'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

In [36]:
explain_analyze(query_14)

Finalize Aggregate  (cost=970534.18..970534.19 rows=1 width=32) (actual time=36047.043..36049.397 rows=1 loops=1)
  ->  Gather  (cost=970533.94..970534.15 rows=2 width=64) (actual time=36044.942..36049.075 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=969533.94..969533.95 rows=1 width=64) (actual time=36035.166..36035.168 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..968796.80 rows=42122 width=96) (actual time=0.279..35848.123 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 0.798 ms
Execution Time: 36049.752 ms


the problem is that it does a seq scan because there is no index on shipdate.

Size of the result table

In [37]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_14 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


### Query 17

Here we expect to get the most gain.

In [38]:
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < avg_quantity;
"""

In [39]:
explain_analyze(query_17)

Finalize Aggregate  (cost=990858.00..990858.01 rows=1 width=32) (actual time=15263.910..15265.350 rows=1 loops=1)
  ->  Gather  (cost=990857.78..990857.99 rows=2 width=32) (actual time=15263.831..15265.340 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=989857.78..989857.79 rows=1 width=32) (actual time=15259.045..15259.045 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..989857.60 rows=70 width=32) (actual time=79.155..15258.522 rows=1842 loops=3)
                    Filter: ((l_quantity < avg_quantity) AND (p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
                    Rows Removed by Filter: 19993509
Planning Time: 665.693 ms
Execution Time: 15265.404 ms


Size of result table

In [42]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_1 result table: 0.015625 MB


## Indexes on part lineitem

In [43]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_shipdate ON part_lineitem (l_shipdate);")
    end_time = time.time()
    print(f"Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): 30.162781953811646 seconds


In [44]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_brand ON part_lineitem (p_brand);")
    end_time = time.time()
    print(f"Time to create idx_pl_brand ON part_lineitem (p_brand): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_brand ON part_lineitem (p_brand): 58.95485210418701 seconds


In [45]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_container ON part_lineitem (p_container);")
    end_time = time.time()
    print(f"Time to create idx_pl_container ON part_lineitem (p_container): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_container ON part_lineitem (p_container): 60.34345507621765 seconds


In [46]:
explain_analyze(query_14)

Finalize Aggregate  (cost=1214874.74..1214874.76 rows=1 width=32) (actual time=19381.340..19395.889 rows=1 loops=1)
  ->  Gather  (cost=1214874.51..1214874.72 rows=2 width=64) (actual time=19380.980..19395.876 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1213874.51..1213874.52 rows=1 width=64) (actual time=19375.856..19375.857 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem  (cost=9882.66..1208594.56 rows=301711 width=18) (actual time=107.708..19034.300 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 10773120
                    Heap Blocks: exact=11467 lossy=154789
                    ->  Bitmap Index Scan on idx_pl_shipdate  (cost=0.00..9701.64 rows=724107 width=0) (actual time=88.514..88.514 rows=749223 loops=1)
                  

In [47]:
explain_analyze(query_17)

Aggregate  (cost=218970.71..218970.72 rows=1 width=32) (actual time=1422.670..1422.673 rows=1 loops=1)
  ->  Bitmap Heap Scan on part_lineitem  (cost=41274.20..218923.70 rows=18805 width=8) (actual time=143.534..1421.748 rows=5526 loops=1)
        Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
        Rows Removed by Index Recheck: 346780
        Filter: (l_quantity < avg_quantity)
        Rows Removed by Filter: 55859
        Heap Blocks: exact=3124 lossy=4777
        ->  BitmapAnd  (cost=41274.20..41274.20 rows=56414 width=0) (actual time=140.630..140.631 rows=0 loops=1)
              ->  Bitmap Index Scan on idx_pl_container  (cost=0.00..16102.00 rows=1471658 width=0) (actual time=54.228..54.228 rows=1504601 loops=1)
                    Index Cond: (p_container = 'MED BOX'::bpchar)
              ->  Bitmap Index Scan on idx_pl_brand  (cost=0.00..25162.55 rows=2299465 width=0) (actual time=82.913..82.913 rows=2391264 loops=1)
                    

## Materialized customer_order_lineitem_nation

In [48]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

query_materialized = """

CREATE MATERIALIZED VIEW customer_order_lineitem_nation AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    n.n_name,
    c.c_address,
    c.c_phone,
    c.c_comment,
    -- c.c_nationkey, not needed in the query, so not included
    l.l_returnflag,
    -- l.l_orderkey, not needed in the query, so not included
    l.l_discount,
    l.l_extendedprice,
    o.o_orderdate

FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    lineitem l ON l.l_orderkey = o.o_orderkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;
"""

explain_analyze(query_materialized, analyze = False)

Hash Join  (cost=773460.56..4508342.52 rows=59986052 width=265)
  Hash Cond: (c.c_nationkey = n.n_nationkey)
  ->  Hash Join  (cost=773459.00..4324183.78 rows=59986052 width=165)
        Hash Cond: (o.o_custkey = c.c_custkey)
        ->  Hash Join  (cost=671655.00..3329726.95 rows=59986052 width=22)
              Hash Cond: (l.l_orderkey = o.o_orderkey)
              ->  Seq Scan on lineitem l  (cost=0.00..1724403.52 rows=59986052 width=18)
              ->  Hash  (cost=410912.00..410912.00 rows=15000000 width=12)
                    ->  Seq Scan on orders o  (cost=0.00..410912.00 rows=15000000 width=12)
        ->  Hash  (cost=50827.00..50827.00 rows=1500000 width=147)
              ->  Seq Scan on customer c  (cost=0.00..50827.00 rows=1500000 width=147)
  ->  Hash  (cost=1.25..1.25 rows=25 width=108)
        ->  Seq Scan on nation n  (cost=0.00..1.25 rows=25 width=108)


In [50]:
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 437.10683703422546 seconds


In [51]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute("SELECT pg_total_relation_size('customer_order_lineitem_nation');")
    size = cur.fetchall()
    print(f"Size of materialised view: {size[0][0]/(1024**2)} MB")
    print(f"Size of materialised view: {size[0][0]/(1024**3)} GB")

Size of materialised view: 12938.5078125 MB
Size of materialised view: 12.635261535644531 GB


In [52]:
check_indexes('customer_order_lineitem_nation')

In [53]:
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer_order_lineitem_nation
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""


In [54]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

explain_analyze(query_10)

Sort  (cost=1790056.27..1790057.41 rows=455 width=654) (actual time=75326.455..75486.829 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1789970.48..1790036.18 rows=455 width=654) (actual time=74060.532..74838.424 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=1789970.48..1790021.94 rows=380 width=654) (actual time=74059.794..74549.460 rows=449695 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1788970.46..1788978.06 rows=190 width=654) (actual time=73965.544..74267.850 rows=149898 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=1788970.46..1788970.93 rows=190 width=686) (actual time=73965.241..74014.

In [55]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_10 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## indexes on customer order lineitem nation

In [56]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate);")
    end_time = time.time()
    print(f"Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): 38.008803844451904 seconds


In [77]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_coln_l_returnflag ON customer_order_lineitem_nation (l_returnflag);")
    end_time = time.time()
    print(f"Time to create idx_coln_l_returnflag ON customer_order_lineitem_nation (l_returnflag): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_coln_l_returnflag ON customer_order_lineitem_nation (l_returnflag): 64.28667688369751 seconds


Tried an index also on return flag but not useful, drop it.

In [78]:
explain_analyze(query_10)

Sort  (cost=1993769.88..1995073.30 rows=521368 width=201) (actual time=27572.214..27623.325 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1762926.53..1840898.93 rows=521368 width=201) (actual time=26512.469..27185.829 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=1762926.53..1824201.39 rows=452464 width=201) (actual time=26512.460..26902.711 rows=471492 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1761926.51..1770975.79 rows=226232 width=201) (actual time=26440.076..26725.181 rows=157164 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=1761926.51..1762492.09 rows=226232 width=181) (actual time=26

## mixed approach order lineitem part

In [58]:
query_10_1 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    part_lineitem_order
    JOIN customer c ON c.c_custkey = o_custkey
    JOIN nation n ON c.c_nationkey = n.n_nationkey
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

query_14_1 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type_prefix LIKE 'PROMO'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem_order
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

query_17_1 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem_order
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < avg_quantity;
"""

In [59]:
query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem_order AS
SELECT
    l_returnflag,
    l_linestatus,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_shipdate,
    l_partkey,
    p_partkey,
    p_brand,
    p_container,
    SUBSTRING(p_type FROM 1 FOR 5) AS p_type_prefix,
    0.2 * AVG(l_quantity) OVER (PARTITION BY l_partkey) AS avg_quantity,
    o_orderkey,
    o.o_custkey,
    o.o_orderdate
FROM
    lineitem l
JOIN
    part p ON l.l_partkey = p.p_partkey
JOIN
    orders o ON l.l_orderkey = o.o_orderkey;

"""

In [60]:
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 366.5083260536194 seconds


In [61]:
check_indexes('part_lineitem_order')

In [62]:
explain_analyze(query_10_1)

Sort  (cost=1063800.11..1063801.41 rows=522 width=279) (actual time=43185.387..43235.769 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1063709.34..1063776.54 rows=522 width=279) (actual time=41871.651..42792.799 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Gather Merge  (cost=1063709.34..1063765.66 rows=436 width=279) (actual time=41871.638..42419.588 rows=719582 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1062709.32..1062715.31 rows=218 width=279) (actual time=41759.503..42098.971 rows=239861 loops=3)
                    Group Key: c.c_custkey, n.n_name
                    ->  Sort  (cost=1062709.32..1062709.86 rows=218 width=311) (actual time=41759.475..41843.263 rows=382361 loops=3)
                          S

In [63]:
with conn.cursor() as cur:
    
    cur.execute("drop index if exists idx_plo_shipdate;")
    cur.execute("drop index if exists idx_plo_o_orderdate;")
    cur.execute("drop index if exists idx_plo_brand;")
    cur.execute("drop index if exists idx_plo_container;")
    cur.execute("drop index if exists idx_plo_custkey;")
    conn.commit()

In [64]:
explain_analyze(query_14_1)

Finalize Aggregate  (cost=1040909.30..1040909.32 rows=1 width=32) (actual time=28027.154..28028.463 rows=1 loops=1)
  ->  Gather  (cost=1040909.07..1040909.28 rows=2 width=64) (actual time=28026.830..28028.450 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1039909.07..1039909.08 rows=1 width=64) (actual time=27912.234..27912.235 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem_order  (cost=0.00..1039147.20 rows=43535 width=96) (actual time=0.573..27773.552 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 7.480 ms
Execution Time: 28028.536 ms


In [65]:
explain_analyze(query_17_1)

Finalize Aggregate  (cost=1061914.80..1061914.82 rows=1 width=32) (actual time=15161.995..15163.137 rows=1 loops=1)
  ->  Gather  (cost=1061914.58..1061914.79 rows=2 width=32) (actual time=15161.930..15163.128 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1060914.58..1060914.59 rows=1 width=32) (actual time=15151.031..15151.031 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem_order  (cost=0.00..1060914.40 rows=72 width=32) (actual time=10.426..15150.673 rows=1842 loops=3)
                    Filter: ((l_quantity < avg_quantity) AND (p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
                    Rows Removed by Filter: 19993509
Planning Time: 0.057 ms
Execution Time: 15163.162 ms


## indexes on order_lineitem_part

In [66]:
with conn.cursor() as cur:
    
    cur.execute("drop index if exists idx_plo_shipdate;")
    cur.execute("drop index if exists idx_plo_o_orderdate;")
    cur.execute("drop index if exists idx_plo_brand;")
    cur.execute("drop index if exists idx_plo_container;")
    conn.commit()

In [67]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_shipdate ON part_lineitem_order (l_shipdate);")
    end_time = time.time()
    print(f"Time to create idx_plo_shipdate ON part_lineitem_order (l_shipdate): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_shipdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_shipdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_shipdate: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_shipdate ON part_lineitem_order (l_shipdate): 47.12177610397339 seconds
Size of idx_plo_shipdate: 397.546875 MB
Size of idx_plo_shipdate: 0.3882293701171875 GB


In [94]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_o_orderdate ON part_lineitem_order (o_orderdate);")
    end_time = time.time()
    print(f"Time to create idx_plo_o_orderdate ON part_lineitem_order (o_orderdate): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_o_orderdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_o_orderdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_o_orderdate: {index_size/(1024**3)} GB")

    conn.commit()

DuplicateTable: relation "idx_plo_o_orderdate" already exists


In [69]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_brand ON part_lineitem_order (p_brand);")
    end_time = time.time()
    print(f"Time to create idx_plo_brand ON part_lineitem_order (p_brand): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_brand');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_brand: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_brand: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_brand ON part_lineitem_order (p_brand): 49.67605805397034 seconds
Size of idx_plo_brand: 403.140625 MB
Size of idx_plo_brand: 0.3936920166015625 GB


In [70]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_container ON part_lineitem_order (p_container);")
    end_time = time.time()
    print(f"Time to create idx_plo_container ON part_lineitem_order (p_container): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_container');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_container: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_container: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_container ON part_lineitem_order (p_container): 59.78866696357727 seconds
Size of idx_plo_container: 403.1484375 MB
Size of idx_plo_container: 0.39369964599609375 GB


tried also an index on return flag, but it is not used

In [87]:
conn.rollback()
with conn.cursor() as cur:

    cur.execute("SET enable_seqscan = off;")
    conn.commit()

explain_analyze(query_10_1)

Sort  (cost=1770017.13..1771473.82 rows=582677 width=279) (actual time=28996.049..29047.533 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1389553.95..1562858.00 rows=582677 width=279) (actual time=27234.573..28578.806 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Gather Merge  (cost=1389553.95..1550718.90 rows=485564 width=279) (actual time=27234.563..28201.333 rows=720080 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1388553.93..1493672.74 rows=242782 width=279) (actual time=27220.221..28077.161 rows=240027 loops=3)
                    Group Key: c.c_custkey, n.n_name
                    ->  Incremental Sort  (cost=1388553.93..1487603.19 rows=242782 width=259) (actual time=27220.210..27818.267 rows=382361 loops=3)


In [72]:
explain_analyze(query_14_1)

Finalize Aggregate  (cost=953355.33..953355.34 rows=1 width=32) (actual time=17837.472..17857.493 rows=1 loops=1)
  ->  Gather  (cost=953355.09..953355.30 rows=2 width=64) (actual time=17836.303..17857.465 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=952355.09..952355.10 rows=1 width=64) (actual time=17795.332..17795.333 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem_order  (cost=4094.85..950168.09 rows=124971 width=96) (actual time=49.563..17500.610 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 9982702
                    Heap Blocks: exact=16214 lossy=155070
                    ->  Bitmap Index Scan on idx_plo_shipdate  (cost=0.00..4019.87 rows=299930 width=0) (actual time=79.299..79.299 rows=749223 loops=1)
                    

In [73]:
explain_analyze(query_17_1)

Aggregate  (cost=12413.42..12413.43 rows=1 width=32) (actual time=1258.450..1258.455 rows=1 loops=1)
  ->  Bitmap Heap Scan on part_lineitem_order  (cost=6572.58..12412.17 rows=500 width=32) (actual time=157.935..1257.492 rows=5526 loops=1)
        Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
        Rows Removed by Index Recheck: 302709
        Filter: (l_quantity < avg_quantity)
        Rows Removed by Filter: 55859
        Heap Blocks: exact=3108 lossy=4520
        ->  BitmapAnd  (cost=6572.58..6572.58 rows=1500 width=0) (actual time=156.401..156.404 rows=0 loops=1)
              ->  Bitmap Index Scan on idx_plo_container  (cost=0.00..3286.04 rows=299930 width=0) (actual time=66.810..66.810 rows=1504601 loops=1)
                    Index Cond: (p_container = 'MED BOX'::bpchar)
              ->  Bitmap Index Scan on idx_plo_brand  (cost=0.00..3286.04 rows=299930 width=0) (actual time=86.063..86.063 rows=2391264 loops=1)
                    Inde

## indexes used

In [91]:
conn.rollback()
check_indexes('part_lineitem_order')

Index Name: idx_plo_shipdate
Index Definition: CREATE INDEX idx_plo_shipdate ON public.part_lineitem_order USING btree (l_shipdate)

Index Name: idx_plo_o_orderdate
Index Definition: CREATE INDEX idx_plo_o_orderdate ON public.part_lineitem_order USING btree (o_orderdate)

Index Name: idx_plo_brand
Index Definition: CREATE INDEX idx_plo_brand ON public.part_lineitem_order USING btree (p_brand)

Index Name: idx_plo_container
Index Definition: CREATE INDEX idx_plo_container ON public.part_lineitem_order USING btree (p_container)



In [92]:
check_indexes('customer_order_lineitem_nation') # drop returnflag index 

Index Name: idx_coln_orderdate
Index Definition: CREATE INDEX idx_coln_orderdate ON public.customer_order_lineitem_nation USING btree (o_orderdate)

Index Name: idx_coln_l_returnflag
Index Definition: CREATE INDEX idx_coln_l_returnflag ON public.customer_order_lineitem_nation USING btree (l_returnflag)



In [96]:
conn.rollback()
check_indexes('part_lineitem_order') 

Index Name: idx_plo_shipdate
Index Definition: CREATE INDEX idx_plo_shipdate ON public.part_lineitem_order USING btree (l_shipdate)

Index Name: idx_plo_o_orderdate
Index Definition: CREATE INDEX idx_plo_o_orderdate ON public.part_lineitem_order USING btree (o_orderdate)

Index Name: idx_plo_brand
Index Definition: CREATE INDEX idx_plo_brand ON public.part_lineitem_order USING btree (p_brand)

Index Name: idx_plo_container
Index Definition: CREATE INDEX idx_plo_container ON public.part_lineitem_order USING btree (p_container)



## Conclusion

We can see that the gain of using 2 different materialized views wrt using a single one with part order lineitem are the same. the only useful indexes on part order lineitem are p_brand and p_container that speed up a lot the query.

since we risparmiare time and space by creating only one mv, we choose to use that, also because it can be helpful for many other queries, in fact the tables part order lineitem are the "core" of the db.