In [1]:
import psycopg2
import time

# establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname = "dw_cs", 
    host= 'localhost',      # change this to your host
    password = "postgres",  # change this to your password
    password = "postgres",
    port = 5432
)

In [2]:
def check_indexes(table_name):

    with conn.cursor() as cur:
        query = f"""
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = '{table_name}';
        """

        cur.execute(query)      
        
        indexes = cur.fetchall()
        
        for index in indexes:
            print(f"Index Name: {index[0]}")
            print(f"Index Definition: {index[1]}\n")

# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    conn.rollback()
    with conn.cursor() as cur:
        if analyze:
            cur.execute(f"EXPLAIN ANALYZE {query}")
        else:
            cur.execute(f"EXPLAIN {query}")
        explain = cur.fetchall()

        for line in explain:
            print(line[0])
            

def collect_size(table : str) -> None:
    conn.rollback()
    with conn.cursor() as cur:

        cur.execute(f"SELECT pg_relation_size('{table}');")
        size = cur.fetchall()
        size_mb = size[0][0] / (1024**2)
        size_gb = size[0][0] / (1024**3)
        print(f"Relation size: {size_mb:.2f} MB")
        print(f"Relation size: {size_gb:.2f} GB")

        cur.execute(f"SELECT pg_table_size('{table}');")
        size = cur.fetchall()
        size_mb = size[0][0] / (1024**2)
        size_gb = size[0][0] / (1024**3)
        print(f"Table (relation + TOAST) size: {size_mb:.2f} MB")
        print(f"Table (relation + TOAST) size: {size_gb:.2f} GB")

        cur.execute(f"SELECT pg_indexes_size('{table}');")
        size = cur.fetchall()
        print(f"Index size for table {table}: {size[0][0] / (1024**2):.2f} MB")
        print(f"Index size for table {table}: {size[0][0] / (1024**3):.2f} GB")

        cur.execute(f"SELECT pg_total_relation_size('{table}');")
        size = cur.fetchall()
        size_mb = size[0][0] / (1024**2)
        size_gb = size[0][0] / (1024**3)
        print(f"Total size (relation + TOAST + index): {size_mb:.2f} MB")
        print(f"Total size (relation + TOAST + index): {size_gb:.2f} GB \n")

## Check indexes

In [3]:
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UNIQUE INDEX partsupp_pkey ON public.partsupp USING btree (ps_partkey, ps_suppkey)

Index Name: orders_pkey
Index Definition: CREATE UNIQUE INDEX orders_pkey ON public.orders USING btree (o_orderkey)



## Query 1

Is it useful to create a mv for this query? We don't expect so much gain because of the sequential scan on shipdate.

We try to filter for shipdate, creating a mv, then we put an index on (l_returnflag, l_linestatus) and then we perform a CLUSTER on this index.

We don't do this on the original table for two reasons: one is to use a mv and the other is not to modify the original table and possibly alter the timings of the queries.


In [9]:
query_materialized = """

CREATE MATERIALIZED VIEW lineitem_aggregates AS
SELECT
    l_returnflag,
    l_linestatus,
    l_extendedprice,
    l_discount,
    l_tax,
    l_quantity
FROM
    lineitem
WHERE 
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY;

"""

conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 132.15457582473755 seconds


In [4]:
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem_aggregates
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

In [11]:
explain_analyze(query_1)

Finalize GroupAggregate  (cost=2736532.36..2749566.35 rows=40000 width=248) (actual time=25929.538..25942.595 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2736532.36..2745866.35 rows=80000 width=248) (actual time=25929.509..25942.553 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2735532.34..2735632.34 rows=40000 width=248) (actual time=25851.397..25851.399 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2378645.97..2727823.30 rows=40000 width=248) (actual time=25850.276..25850.321 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Planned Partitions: 8  Batches: 1  Memory Usage: 217kB
                    Worker 0:  Batche

## Trying to use index and cluster on lineitem_aggregates

We don't want to alter the original table lineitem, so we saved it in lineitem_aggregates filtering for shipdate.

In [5]:
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_lineitem_returnflag_linestatus ON lineitem_aggregates (l_returnflag, l_linestatus);")
    end_time = time.time()
    print(f"Time taken to create index: {end_time - start_time} seconds")
    start_time = time.time()
    cur.execute("CLUSTER lineitem_aggregates USING idx_lineitem_returnflag_linestatus;")
    end_time = time.time()
    print(f"Time taken to cluster table: {end_time - start_time} seconds")
    conn.commit()


Time taken to create index: 91.87562990188599 seconds
Time taken to cluster table: 148.17869091033936 seconds


In [15]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_bitmapscan = off;")
    cur.execute("SET enable_indexscan = off;")
    conn.commit()
explain_analyze(query_1)

Finalize GroupAggregate  (cost=1543490.50..1543492.45 rows=6 width=236) (actual time=22392.926..22395.471 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=1543490.50..1543491.90 rows=12 width=236) (actual time=22392.893..22395.431 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=1542490.47..1542490.49 rows=6 width=236) (actual time=22331.585..22331.586 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=1542490.26..1542490.40 rows=6 width=236) (actual time=22331.548..22331.553 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

In [11]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_bitmapscan = off;")
    cur.execute("SET enable_indexscan = on;")
    conn.commit()
explain_analyze(query_1)

Finalize GroupAggregate  (cost=1000.59..2608431.78 rows=6 width=236) (actual time=30108.231..30109.194 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=1000.59..2608431.22 rows=12 width=236) (actual time=30106.595..30109.140 rows=10 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial GroupAggregate  (cost=0.56..2607429.82 rows=6 width=236) (actual time=6124.939..22104.443 rows=3 loops=3)
              Group Key: l_returnflag, l_linestatus
              ->  Parallel Index Scan using idx_lineitem_returnflag_linestatus on lineitem_aggregates  (cost=0.56..1744915.83 rows=24643253 width=25) (actual time=0.304..7210.118 rows=19714203 loops=3)
Planning Time: 4.898 ms
Execution Time: 30109.282 ms


It seems that it doesn't use the partition.

In [13]:
check = """
SELECT
    t.relname AS table_name,
    i.relname AS index_name,
    ix.indisclustered
FROM
    pg_class t
    JOIN pg_index ix ON t.oid = ix.indrelid
    JOIN pg_class i ON ix.indexrelid = i.oid
WHERE
    t.relname = 'lineitem_aggregates';
"""

conn.rollback()
with conn.cursor() as cur:
    cur.execute(check)
    row = cur.fetchall()
    print(row)


[('lineitem_aggregates', 'idx_lineitem_returnflag_linestatus', True)]


This strategy is not interesting and we don't analyze it no more.

In [17]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("drop materialized view lineitem_aggregates;")
    conn.commit()

## Materialized on part lineitem

In [32]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem AS
SELECT
    l_returnflag,
    l_linestatus,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_shipdate,
    l_partkey,
    p_partkey,
    p_brand,
    p_container,
    SUBSTRING(p_type FROM 1 FOR 5) AS p_type_prefix,
    0.2 * AVG(l_quantity) OVER (PARTITION BY l_partkey) AS avg_quantity
FROM
    lineitem l
JOIN
    part p ON l.l_partkey = p.p_partkey;

"""

with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 310.9520151615143 seconds


We enable hash join since we are not interested in indexes performance in executing the join.

In [33]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    size = cur.fetchall()
    print(f"Size of materialised view: {size[0][0]/(1024**2)} MB")
    print(f"Size of materialised view: {size[0][0]/(1024**3)} GB")

Size of materialised view: 6581.5078125 MB
Size of materialised view: 6.427253723144531 GB


In [34]:
conn.rollback()
check_indexes('part_lineitem')

### Query 14

In [35]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    cur.execute("SET enable_material = on;")
    conn.commit()

query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type_prefix LIKE 'PROMO'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

In [36]:
explain_analyze(query_14)

Finalize Aggregate  (cost=970534.18..970534.19 rows=1 width=32) (actual time=36047.043..36049.397 rows=1 loops=1)
  ->  Gather  (cost=970533.94..970534.15 rows=2 width=64) (actual time=36044.942..36049.075 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=969533.94..969533.95 rows=1 width=64) (actual time=36035.166..36035.168 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..968796.80 rows=42122 width=96) (actual time=0.279..35848.123 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 0.798 ms
Execution Time: 36049.752 ms


The problem with the lack of gain in performance is that the optimizer performs a sequential scan because there is no index on shipdate.

Validate and size of the result table.

In [37]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_14 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


### Query 17

Here we expect to get the most gain in performance.

In [38]:
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < avg_quantity;
"""

In [39]:
explain_analyze(query_17)

Finalize Aggregate  (cost=990858.00..990858.01 rows=1 width=32) (actual time=15263.910..15265.350 rows=1 loops=1)
  ->  Gather  (cost=990857.78..990857.99 rows=2 width=32) (actual time=15263.831..15265.340 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=989857.78..989857.79 rows=1 width=32) (actual time=15259.045..15259.045 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..989857.60 rows=70 width=32) (actual time=79.155..15258.522 rows=1842 loops=3)
                    Filter: ((l_quantity < avg_quantity) AND (p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
                    Rows Removed by Filter: 19993509
Planning Time: 665.693 ms
Execution Time: 15265.404 ms


Very good! We can see that the only cost is in scanning the table and filtering for the conditions.

Validate and size of result table.

In [42]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_1 result table: 0.015625 MB


## Indexes on part lineitem

In [43]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_shipdate ON part_lineitem (l_shipdate);")
    end_time = time.time()
    print(f"Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): 30.162781953811646 seconds


In [30]:
with conn.cursor() as cur:    
    cur.execute("SELECT pg_relation_size('idx_pl_shipdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_pl_shipdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_pl_shipdate: {index_size/(1024**3)} GB")

Size of idx_pl_shipdate: 397.546875 MB
Size of idx_pl_shipdate: 0.3882293701171875 GB


In [44]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_brand ON part_lineitem (p_brand);")
    end_time = time.time()
    print(f"Time to create idx_pl_brand ON part_lineitem (p_brand): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_brand ON part_lineitem (p_brand): 58.95485210418701 seconds


In [31]:
with conn.cursor() as cur:    
    cur.execute("SELECT pg_relation_size('idx_pl_brand');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_pl_brand: {index_size/(1024**2)} MB")
    print(f"Size of idx_pl_brand: {index_size/(1024**3)} GB")

Size of idx_pl_brand: 403.140625 MB
Size of idx_pl_brand: 0.3936920166015625 GB


In [45]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_container ON part_lineitem (p_container);")
    end_time = time.time()
    print(f"Time to create idx_pl_container ON part_lineitem (p_container): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_container ON part_lineitem (p_container): 60.34345507621765 seconds


In [32]:
with conn.cursor() as cur:    
    cur.execute("SELECT pg_relation_size('idx_pl_container');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_pl_container: {index_size/(1024**2)} MB")
    print(f"Size of idx_pl_container: {index_size/(1024**3)} GB")

Size of idx_pl_container: 403.1484375 MB
Size of idx_pl_container: 0.39369964599609375 GB


In [46]:
explain_analyze(query_14)

Finalize Aggregate  (cost=1214874.74..1214874.76 rows=1 width=32) (actual time=19381.340..19395.889 rows=1 loops=1)
  ->  Gather  (cost=1214874.51..1214874.72 rows=2 width=64) (actual time=19380.980..19395.876 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1213874.51..1213874.52 rows=1 width=64) (actual time=19375.856..19375.857 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem  (cost=9882.66..1208594.56 rows=301711 width=18) (actual time=107.708..19034.300 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 10773120
                    Heap Blocks: exact=11467 lossy=154789
                    ->  Bitmap Index Scan on idx_pl_shipdate  (cost=0.00..9701.64 rows=724107 width=0) (actual time=88.514..88.514 rows=749223 loops=1)
                  

We can see that the index on l_shipdate is used and it brings a significative improvement.

In [47]:
explain_analyze(query_17)

Aggregate  (cost=218970.71..218970.72 rows=1 width=32) (actual time=1422.670..1422.673 rows=1 loops=1)
  ->  Bitmap Heap Scan on part_lineitem  (cost=41274.20..218923.70 rows=18805 width=8) (actual time=143.534..1421.748 rows=5526 loops=1)
        Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
        Rows Removed by Index Recheck: 346780
        Filter: (l_quantity < avg_quantity)
        Rows Removed by Filter: 55859
        Heap Blocks: exact=3124 lossy=4777
        ->  BitmapAnd  (cost=41274.20..41274.20 rows=56414 width=0) (actual time=140.630..140.631 rows=0 loops=1)
              ->  Bitmap Index Scan on idx_pl_container  (cost=0.00..16102.00 rows=1471658 width=0) (actual time=54.228..54.228 rows=1504601 loops=1)
                    Index Cond: (p_container = 'MED BOX'::bpchar)
              ->  Bitmap Index Scan on idx_pl_brand  (cost=0.00..25162.55 rows=2299465 width=0) (actual time=82.913..82.913 rows=2391264 loops=1)
                    

We can see that the index on p_brand and p_container are used and they bring a very significative improvement!

## Materialized customer_order_lineitem_nation

In [48]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

query_materialized = """

CREATE MATERIALIZED VIEW customer_order_lineitem_nation AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    n.n_name,
    c.c_address,
    c.c_phone,
    c.c_comment,
    -- c.c_nationkey, not needed in the query, so not included
    l.l_returnflag,
    -- l.l_orderkey, not needed in the query, so not included
    l.l_discount,
    l.l_extendedprice,
    o.o_orderdate

FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    lineitem l ON l.l_orderkey = o.o_orderkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;
"""

explain_analyze(query_materialized, analyze = False)

Hash Join  (cost=773460.56..4508342.52 rows=59986052 width=265)
  Hash Cond: (c.c_nationkey = n.n_nationkey)
  ->  Hash Join  (cost=773459.00..4324183.78 rows=59986052 width=165)
        Hash Cond: (o.o_custkey = c.c_custkey)
        ->  Hash Join  (cost=671655.00..3329726.95 rows=59986052 width=22)
              Hash Cond: (l.l_orderkey = o.o_orderkey)
              ->  Seq Scan on lineitem l  (cost=0.00..1724403.52 rows=59986052 width=18)
              ->  Hash  (cost=410912.00..410912.00 rows=15000000 width=12)
                    ->  Seq Scan on orders o  (cost=0.00..410912.00 rows=15000000 width=12)
        ->  Hash  (cost=50827.00..50827.00 rows=1500000 width=147)
              ->  Seq Scan on customer c  (cost=0.00..50827.00 rows=1500000 width=147)
  ->  Hash  (cost=1.25..1.25 rows=25 width=108)
        ->  Seq Scan on nation n  (cost=0.00..1.25 rows=25 width=108)


In [50]:
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 437.10683703422546 seconds


In [51]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute("SELECT pg_total_relation_size('customer_order_lineitem_nation');")
    size = cur.fetchall()
    print(f"Size of materialised view: {size[0][0]/(1024**2)} MB")
    print(f"Size of materialised view: {size[0][0]/(1024**3)} GB")

Size of materialised view: 12938.5078125 MB
Size of materialised view: 12.635261535644531 GB


In [112]:
check_indexes('customer_order_lineitem_nation')

In [113]:
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer_order_lineitem_nation
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""


In [114]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

explain_analyze(query_10)

Sort  (cost=2365597.61..2366901.03 rows=521368 width=201) (actual time=41540.225..41591.007 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=2134754.26..2212726.66 rows=521368 width=201) (actual time=40507.944..41159.491 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=2134754.26..2196029.12 rows=452464 width=201) (actual time=40507.920..40884.069 rows=450028 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=2133754.24..2142803.52 rows=226232 width=201) (actual time=40481.177..40766.210 rows=150009 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=2133754.24..2134319.82 rows=226232 width=181) (actual time=40

In [55]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_10 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## Indexes on customer order lineitem nation

In [115]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate);")
    end_time = time.time()
    print(f"Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): 58.30237102508545 seconds


In [33]:
with conn.cursor() as cur:    
    cur.execute("SELECT pg_relation_size('idx_coln_orderdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_coln_orderdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_coln_orderdate: {index_size/(1024**3)} GB")

Size of idx_coln_orderdate: 397.5078125 MB
Size of idx_coln_orderdate: 0.38819122314453125 GB


In [116]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_coln_l_returnflag ON customer_order_lineitem_nation (l_returnflag);")
    end_time = time.time()
    print(f"Time to create idx_coln_l_returnflag ON customer_order_lineitem_nation (l_returnflag): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_coln_l_returnflag ON customer_order_lineitem_nation (l_returnflag): 50.617753982543945 seconds


In [34]:
with conn.cursor() as cur:    
    cur.execute("SELECT pg_relation_size('idx_coln_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_coln_l_returnflag: {index_size/(1024**2)} MB")
    print(f"Size of idx_coln_l_returnflag: {index_size/(1024**3)} GB")

Size of idx_coln_l_returnflag: 396.4609375 MB
Size of idx_coln_l_returnflag: 0.38716888427734375 GB


In [120]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

explain_analyze(query_10)

Sort  (cost=1993769.88..1995073.30 rows=521368 width=201) (actual time=27869.242..27920.840 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1762926.53..1840898.93 rows=521368 width=201) (actual time=26794.582..27470.836 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=1762926.53..1824201.39 rows=452464 width=201) (actual time=26794.572..27186.217 rows=471540 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1761926.51..1770975.79 rows=226232 width=201) (actual time=26778.571..27064.860 rows=157180 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=1761926.51..1762492.09 rows=226232 width=181) (actual time=26

We can see that the indexes are used and reduce the execution time.

## Mixed approach on order lineitem part

In [4]:
query_10_1 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    part_lineitem_order
    JOIN customer c ON c.c_custkey = o_custkey
    JOIN nation n ON c.c_nationkey = n.n_nationkey
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

query_14_1 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type_prefix LIKE 'PROMO'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem_order
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

query_17_1 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem_order
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < avg_quantity;
"""

In [5]:
query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem_order AS
SELECT
    l_returnflag,
    l_linestatus,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_shipdate,
    l_partkey,
    p_partkey,
    p_brand,
    p_container,
    SUBSTRING(p_type FROM 1 FOR 5) AS p_type_prefix,
    0.2 * AVG(l_quantity) OVER (PARTITION BY l_partkey) AS avg_quantity,
    o_orderkey,
    o.o_custkey,
    o.o_orderdate
FROM
    lineitem l
JOIN
    part p ON l.l_partkey = p.p_partkey
JOIN
    orders o ON l.l_orderkey = o.o_orderkey;

"""

In [6]:
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 339.64209485054016 seconds


In [51]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

explain_analyze(query_10_1)

Sort  (cost=1783342.71..1784778.05 rows=574138 width=279) (actual time=31616.511..31677.911 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1372477.75..1579281.43 rows=574138 width=279) (actual time=29109.471..31058.942 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Incremental Sort  (cost=1372477.75..1564927.98 rows=574138 width=259) (actual time=29109.463..30459.317 rows=1147084 loops=1)
              Sort Key: c.c_custkey, n.n_name
              Presorted Key: c.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1372477.45..1539091.77 rows=574138 width=259) (actual time=29109.413..30107.430 rows=1147084 loops=1)
                    ->  Merge Join  (cost=1372477.30..1525052.30 rows=574138 width=159) (actu

This result is interesting because it performs better than the previous query on customer_order_lineitem_nation with indexes.

In [8]:
explain_analyze(query_14_1)

Finalize Aggregate  (cost=1040982.56..1040982.57 rows=1 width=32) (actual time=22416.982..22429.483 rows=1 loops=1)
  ->  Gather  (cost=1040982.32..1040982.53 rows=2 width=64) (actual time=22416.905..22429.474 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1039982.32..1039982.33 rows=1 width=64) (actual time=22228.012..22228.013 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem_order  (cost=0.00..1039220.40 rows=43538 width=96) (actual time=1.920..22113.036 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 15.610 ms
Execution Time: 22432.104 ms


In [9]:
explain_analyze(query_17_1)

Finalize Aggregate  (cost=1061989.54..1061989.55 rows=1 width=32) (actual time=17460.999..17461.806 rows=1 loops=1)
  ->  Gather  (cost=1061989.32..1061989.53 rows=2 width=32) (actual time=17460.951..17461.800 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1060989.32..1060989.33 rows=1 width=32) (actual time=17410.178..17410.179 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem_order  (cost=0.00..1060989.13 rows=72 width=32) (actual time=34.027..17409.744 rows=1842 loops=3)
                    Filter: ((l_quantity < avg_quantity) AND (p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
                    Rows Removed by Filter: 19993509
Planning Time: 6.260 ms
Execution Time: 17461.827 ms


The execution times of query 14 and 17 are similar to the ones using only part_lineitem.

## Indexes on order_lineitem_part

In [10]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_shipdate ON part_lineitem_order (l_shipdate);")
    end_time = time.time()
    print(f"Time to create idx_plo_shipdate ON part_lineitem_order (l_shipdate): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_shipdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_shipdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_shipdate: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_shipdate ON part_lineitem_order (l_shipdate): 39.903138160705566 seconds
Size of idx_plo_shipdate: 397.546875 MB
Size of idx_plo_shipdate: 0.3882293701171875 GB


In [34]:
conn.rollback()
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_o_orderdate ON part_lineitem_order (o_orderdate);")
    end_time = time.time()
    print(f"Time to create idx_plo_o_orderdate ON part_lineitem_order (o_orderdate): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_o_orderdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_o_orderdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_o_orderdate: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_o_orderdate ON part_lineitem_order (o_orderdate): 45.10997986793518 seconds
Size of idx_plo_o_orderdate: 397.5078125 MB
Size of idx_plo_o_orderdate: 0.38819122314453125 GB


In [35]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_l_returnflag ON part_lineitem_order (l_returnflag);")
    end_time = time.time()
    print(f"Time to create idx_plo_l_returnflag ON part_lineitem_order (l_returnflag): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_l_returnflag: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_l_returnflag: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_l_returnflag ON part_lineitem_order (l_returnflag): 51.507787227630615 seconds
Size of idx_plo_l_returnflag: 396.4609375 MB
Size of idx_plo_l_returnflag: 0.38716888427734375 GB


In [13]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_brand ON part_lineitem_order (p_brand);")
    end_time = time.time()
    print(f"Time to create idx_plo_brand ON part_lineitem_order (p_brand): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_brand');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_brand: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_brand: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_brand ON part_lineitem_order (p_brand): 49.85310411453247 seconds
Size of idx_plo_brand: 403.140625 MB
Size of idx_plo_brand: 0.3936920166015625 GB


In [14]:
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_container ON part_lineitem_order (p_container);")
    end_time = time.time()
    print(f"Time to create idx_plo_container ON part_lineitem_order (p_container): {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_container');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_plo_container: {index_size/(1024**2)} MB")
    print(f"Size of idx_plo_container: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_plo_container ON part_lineitem_order (p_container): 48.44050908088684 seconds
Size of idx_plo_container: 403.1484375 MB
Size of idx_plo_container: 0.39369964599609375 GB


We tried also an index on o_custkey but it doesn't help neither is used by the optimizer.

In [36]:
conn.rollback()
with conn.cursor() as cur:

    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexscan = on;")
    conn.commit()

explain_analyze(query_10_1)

Sort  (cost=1783342.71..1784778.05 rows=574138 width=279) (actual time=24681.894..24732.574 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1372477.75..1579281.43 rows=574138 width=279) (actual time=22392.395..24324.839 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Incremental Sort  (cost=1372477.75..1564927.98 rows=574138 width=259) (actual time=22392.382..23771.424 rows=1147084 loops=1)
              Sort Key: c.c_custkey, n.n_name
              Presorted Key: c.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1372477.45..1539091.77 rows=574138 width=259) (actual time=22392.291..23445.765 rows=1147084 loops=1)
                    ->  Merge Join  (cost=1372477.30..1525052.30 rows=574138 width=159) (actu

In [37]:
conn.rollback()
with conn.cursor() as cur:

    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexscan = on;")
    conn.commit()

explain_analyze(query_10_1)

Sort  (cost=1764463.50..1765898.84 rows=574138 width=279) (actual time=31118.361..31169.077 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1388486.85..1560402.22 rows=574138 width=279) (actual time=29412.414..30743.582 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Gather Merge  (cost=1388486.85..1548441.01 rows=478448 width=279) (actual time=29412.406..30368.193 rows=720217 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1387486.83..1492216.22 rows=239224 width=279) (actual time=29395.420..30238.699 rows=240072 loops=3)
                    Group Key: c.c_custkey, n.n_name
                    ->  Incremental Sort  (cost=1387486.83..1486235.62 rows=239224 width=259) (actual time=29395.405..29984.904 rows=382361 loops=3)


We note that the indexes worsen the results.

If we force the optimizer to use them we get worse results, so we drop them.

In [38]:
with conn.cursor() as cur:
    
    cur.execute("drop index if exists idx_plo_o_orderdate;")

    conn.commit()

In [39]:
explain_analyze(query_10_1)

Sort  (cost=1898491.04..1899926.38 rows=574138 width=279) (actual time=31421.579..31472.768 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1522514.39..1694429.76 rows=574138 width=279) (actual time=29710.085..31036.011 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Gather Merge  (cost=1522514.39..1682468.55 rows=478448 width=279) (actual time=29710.047..30655.132 rows=719135 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1521514.37..1626243.76 rows=239224 width=279) (actual time=29693.407..30537.172 rows=239712 loops=3)
                    Group Key: c.c_custkey, n.n_name
                    ->  Incremental Sort  (cost=1521514.37..1620263.16 rows=239224 width=259) (actual time=29693.378..30281.841 rows=382361 loops=3)


In [40]:
with conn.cursor() as cur:
    
    cur.execute("drop index if exists idx_plo_l_returnflag;")

    conn.commit()

In [41]:
explain_analyze(query_10_1)

Sort  (cost=10001748447.81..10001749883.16 rows=574138 width=279) (actual time=22364.280..22415.015 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=10001372471.17..10001544386.53 rows=574138 width=279) (actual time=20634.133..21979.730 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Gather Merge  (cost=10001372471.17..10001532425.33 rows=478448 width=279) (actual time=20634.123..21602.432 rows=719814 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=10001371471.14..10001476200.53 rows=239224 width=279) (actual time=20606.636..21469.478 rows=239938 loops=3)
                    Group Key: c.c_custkey, n.n_name
                    ->  Incremental Sort  (cost=10001371471.14..10001470219.93 rows=239224 width=259) (actual time=20

In [20]:
explain_analyze(query_14_1)

Finalize Aggregate  (cost=953373.48..953373.50 rows=1 width=32) (actual time=20537.260..20543.182 rows=1 loops=1)
  ->  Gather  (cost=953373.25..953373.46 rows=2 width=64) (actual time=20536.383..20543.122 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=952373.25..952373.26 rows=1 width=64) (actual time=20472.197..20472.199 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem_order  (cost=4094.85..950186.25 rows=124971 width=96) (actual time=87.988..20026.335 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 9983234
                    Heap Blocks: exact=16276 lossy=155345
                    ->  Bitmap Index Scan on idx_plo_shipdate  (cost=0.00..4019.87 rows=299930 width=0) (actual time=145.760..145.760 rows=749223 loops=1)
                  

In [25]:
explain_analyze(query_17_1)

Aggregate  (cost=12413.43..12413.44 rows=1 width=32) (actual time=1461.615..1461.622 rows=1 loops=1)
  ->  Bitmap Heap Scan on part_lineitem_order  (cost=6572.58..12412.17 rows=500 width=32) (actual time=154.493..1460.190 rows=5526 loops=1)
        Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
        Rows Removed by Index Recheck: 294271
        Filter: (l_quantity < avg_quantity)
        Rows Removed by Filter: 55859
        Heap Blocks: exact=2653 lossy=4541
        ->  BitmapAnd  (cost=6572.58..6572.58 rows=1500 width=0) (actual time=153.482..153.484 rows=0 loops=1)
              ->  Bitmap Index Scan on idx_plo_container  (cost=0.00..3286.04 rows=299930 width=0) (actual time=62.933..62.933 rows=1504601 loops=1)
                    Index Cond: (p_container = 'MED BOX'::bpchar)
              ->  Bitmap Index Scan on idx_plo_brand  (cost=0.00..3286.04 rows=299930 width=0) (actual time=87.636..87.637 rows=2391264 loops=1)
                    Inde

## Indexes used

In [26]:
conn.rollback()
check_indexes('part_lineitem_order')

Index Name: idx_plo_shipdate
Index Definition: CREATE INDEX idx_plo_shipdate ON public.part_lineitem_order USING btree (l_shipdate)

Index Name: idx_plo_brand
Index Definition: CREATE INDEX idx_plo_brand ON public.part_lineitem_order USING btree (p_brand)

Index Name: idx_plo_container
Index Definition: CREATE INDEX idx_plo_container ON public.part_lineitem_order USING btree (p_container)



## Size of materialized views

In [29]:
collect_size('part_lineitem')

Relation size: 6581.50 MB
Relation size: 6.43 GB
Table (relation + TOAST) size: 6583.35 MB
Table (relation + TOAST) size: 6.43 GB
Index size for table part_lineitem: 1203.84 MB
Index size for table part_lineitem: 1.18 GB
Total size (relation + TOAST + index): 7787.19 MB
Total size (relation + TOAST + index): 7.60 GB 



In [28]:
collect_size('part_lineitem_order')

Relation size: 7098.00 MB
Relation size: 6.93 GB
Table (relation + TOAST) size: 7099.99 MB
Table (relation + TOAST) size: 6.93 GB
Index size for table part_lineitem_order: 1203.84 MB
Index size for table part_lineitem_order: 1.18 GB
Total size (relation + TOAST + index): 8303.83 MB
Total size (relation + TOAST + index): 8.11 GB 



In [27]:
collect_size('customer_order_lineitem_nation')

Relation size: 12938.50 MB
Relation size: 12.64 GB
Table (relation + TOAST) size: 12942.11 MB
Table (relation + TOAST) size: 12.64 GB
Index size for table customer_order_lineitem_nation: 793.97 MB
Index size for table customer_order_lineitem_nation: 0.78 GB
Total size (relation + TOAST + index): 13736.08 MB
Total size (relation + TOAST + index): 13.41 GB 



## Conclusion

We can see that the gain of using 2 different materialized views wrt using a single one with part order lineitem are the same. the only useful indexes on part order lineitem are p_brand and p_container that speed up a lot the query.

since we risparmiare time and space by creating only one mv, we choose to use that, also because it can be helpful for many other queries, in fact the tables part order lineitem are the "core" of the db.

so we keep only part lineitem order as mv, drop the others

In [44]:
with conn.cursor() as cur:
    cur.execute("drop materialized view part_lineitem;")
    cur.execute("drop materialized view customer_order_lineitem_nation;")
    conn.commit()