In [6]:
import psycopg2
import time

In [7]:
# establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname = "dw_cs", 
    user = "postgres", 
    host= 'localhost',      # change this to your host
    password = "postgres",  # change this to your password
    port = 5432
)

In [8]:
# function to check the indexes on a table

def check_indexes(table_name):

    with conn.cursor() as cur:
        query = f"""
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = '{table_name}';
        """

        cur.execute(query)      
        
        indexes = cur.fetchall()
        
        for index in indexes:
            print(f"Index Name: {index[0]}")
            print(f"Index Definition: {index[1]}\n")


# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    conn.rollback()
    with conn.cursor() as cur:
        if analyze:
            cur.execute(f"EXPLAIN ANALYZE {query}")
        else:
            cur.execute(f"EXPLAIN {query}")
        explain = cur.fetchall()

        for line in explain:
            print(line[0])

## Initial database size

In [96]:
conn.rollback()
with conn.cursor() as cur:

    cur.execute("SELECT pg_database_size('dw_cs');")
    all_rows = cur.fetchall()
    print(f"Database size: {all_rows[0][0] / (1024**2):.2f} MB")
    print(f"Database size: {all_rows[0][0] / (1024**3):.2f} GB \n")

Database size: 19715.57 MB
Database size: 19.25 GB 



## Initial indexes

In [97]:
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UNIQUE INDEX partsupp_pkey ON public.partsupp USING btree (ps_partkey, ps_suppkey)

Index Name: orders_pkey
Index Definition: CREATE UNIQUE INDEX orders_pkey ON public.orders USING btree (o_orderkey)



## Baseline queries

In [9]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = off;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_tidscan = off;")
    cur.execute("SET enable_material = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_hashagg = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_partition_pruning = off;")
    cur.execute("SET enable_partitionwise_join = off;")
    cur.execute("SET enable_partitionwise_aggregate = off;")
    conn.commit()

In [10]:
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            p_partkey = l_partkey
    );
"""


In [11]:
explain_analyze(query_1, analyze = True)

Finalize GroupAggregate  (cost=2300104.98..2300106.93 rows=6 width=236) (actual time=41971.743..41972.816 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2300104.98..2300106.38 rows=12 width=236) (actual time=41971.694..41972.759 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2299104.95..2299104.97 rows=6 width=236) (actual time=41958.535..41958.536 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2299104.74..2299104.87 rows=6 width=236) (actual time=41954.829..41954.833 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

In [101]:
explain_analyze(query_10, analyze = True)

Sort  (cost=3344325.21..3345720.08 rows=557947 width=279) (actual time=31990.264..32040.663 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=2942891.02..3146127.82 rows=557947 width=279) (actual time=29727.038..31635.053 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=2942891.02..3132179.14 rows=557947 width=259) (actual time=29727.025..31081.010 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=2942890.72..3107071.53 rows=557947 width=259) (actual time=29726.951..30755.197 rows=1147084 loops=1)
                    ->  Merge Join  (cost=2942890.57..3093377.71 rows=557947 width=1

In [102]:
explain_analyze(query_14, analyze = True)

Aggregate  (cost=1800922.34..1800922.35 rows=1 width=32) (actual time=28778.747..28778.848 rows=1 loops=1)
  ->  Merge Join  (cost=1676444.04..1786617.50 rows=817419 width=33) (actual time=27950.390..28572.123 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92913.43 rows=2000000 width=25) (actual time=0.131..330.069 rows=1999994 loops=1)
        ->  Sort  (cost=1676443.20..1678486.75 rows=817419 width=16) (actual time=27950.251..28009.924 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24288kB
              ->  Gather  (cost=1000.00..1582197.72 rows=817419 width=16) (actual time=0.452..27458.587 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on lineitem  (cost=0.00..1499455.82 rows=340591 width=16) (actual time=0.537..27610.615 rows=249741 

We set hashjoin and hashaggregate off, we only analyze the situation with indexes.

## Query 1

We tried to put two indexes on l_returnflag and l_linestatus but we saw that they were not used. We have to note that they do not have many distinct values: (3,2) respectively, we could use a bitmap index, but in postgre it is not implemented.

Let's see if with an index on shipdate we get better results.

In [13]:
conn.rollback()
with conn.cursor() as cur:

    # tried also this index, but it is not helpful
    # cur.execute("CREATE EXTENSION IF NOT EXISTS btree_gin;")
    # cur.execute("CREATE INDEX idx_l_shipdate_gin ON lineitem USING gin (l_shipdate);")

    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_shipdate ON lineitem (l_shipdate);")
    end_time = time.time()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_l_shipdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_shipdate ON lineitem (l_shipdate): {index_size/(1024**2)} MB")
    print(f"Size of idx_l_shipdate ON lineitem (l_shipdate): {index_size/(1024**3)} GB")

    conn.commit()

Time to create index on lineitem: 54.73046016693115 seconds
Size of idx_l_shipdate ON lineitem (l_shipdate): 397.546875 MB
Size of idx_l_shipdate ON lineitem (l_shipdate): 0.3882293701171875 GB


In [104]:
conn.rollback()
with conn.cursor() as cur:

    # tried also hash index but it doesn't work, after 5 minutes it was still running
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_returnflag ON lineitem (l_returnflag);")
    end_time = time.time()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    cur.execute("SELECT pg_relation_size('idx_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag): {index_size/(1024**2)} MB")
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag): {index_size/(1024**3)} GB")

    conn.commit()

Time to create index on lineitem: 61.83437705039978 seconds
Size of idx_l_returnflag ON lineitem (l_returnflag): 396.4609375 MB
Size of idx_l_returnflag ON lineitem (l_returnflag): 0.38716888427734375 GB


We create an index also on return flag to see if it helps both in query 1 and 10.

In [14]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = on;")
    cur.execute("SET enable_tidscan = off;")
    cur.execute("SET enable_material = off;")
    cur.execute("SET enable_nestloop = off;")
    cur.execute("SET enable_mergejoin = off;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    conn.commit()

explain_analyze(query_1)

Finalize GroupAggregate  (cost=2958736.61..2958738.57 rows=6 width=236) (actual time=32916.266..32919.690 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2958736.61..2958738.01 rows=12 width=236) (actual time=32916.244..32919.654 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2957736.59..2957736.60 rows=6 width=236) (actual time=32910.412..32910.414 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2957736.37..2957736.51 rows=6 width=236) (actual time=32910.371..32910.376 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

We can see an improvement using an index on l_shipdate with a bitmapscan. 

What I would suggest is to keep the index on l_shipdate since it may help us also in query 14, where shipdate has high selectivity and probably will help more.

The index on l_returnflag is not used, I also tried to use an index on (l_returnflag, l_linestatus), but it was not used.

### Validate and record the size of the table

In [106]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_tidscan = off;")
    cur.execute("SET enable_material = off;")
    cur.execute("SET enable_nestloop = off;")
    cur.execute("SET enable_mergejoin = off;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    conn.commit()
    
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_1};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 ('A', 'F', Decimal('377518399'), Decimal('566065727797.25'), Decimal('537759104278.0656'), Decimal('559276670892.116819'), Decimal('25.5009751030070973'), Decimal('38237.151008958546'), Decimal('0.05000657454024320463'), 14804077)
Size of query_1 result table: 0.015625 MB


## Query 10

o_orderdate is quite selective 3 months / 7*12 months, so probably an index can be beneficial.

ordering by revenue, which is computed in the query, can't be optimised.

what can be optimised is the join, where there are already indexes on the pks.

l_return flag is not selective, but we saw it helps in speeding up the query.

i put an index on o_custkey and l_orderkey but the first was never used and the second brings worse results.


In [4]:
conn.rollback()
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_o_orderdate ON orders (o_orderdate);")
    end_time = time.time()
    print(f"Time to create idx_o_orderdate ON orders (o_orderdate): {end_time - start_time} seconds")
    cur.execute("SELECT pg_relation_size('idx_o_orderdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_o_orderdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_o_orderdate: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_o_orderdate ON orders (o_orderdate): 11.052881002426147 seconds
Size of idx_o_orderdate: 100.1796875 MB
Size of idx_o_orderdate: 0.09783172607421875 GB


In [108]:
check_indexes('lineitem')
check_indexes('orders')

Index Name: idx_l_shipdate
Index Definition: CREATE INDEX idx_l_shipdate ON public.lineitem USING btree (l_shipdate)

Index Name: idx_l_returnflag
Index Definition: CREATE INDEX idx_l_returnflag ON public.lineitem USING btree (l_returnflag)



In [109]:
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = off;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_tidscan = off;")
    cur.execute("SET enable_material = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    conn.commit()

explain_analyze(query_10)

Sort  (cost=34913159.69..34914554.56 rows=557947 width=279) (actual time=30247.351..30297.432 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=34545629.72..34714962.30 rows=557947 width=279) (actual time=28776.713..29877.905 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=34545629.72..34703338.40 rows=464956 width=279) (actual time=28776.685..29587.500 rows=451109 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=34544629.70..34648670.92 rows=232478 width=279) (actual time=28761.180..29549.346 rows=150370 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=34544629.70..34642858.97 rows=232478 width=259) (actual time=28761.163..29338.986 rows=3823

we note that here we get better results using index scan and not bitmapscan.

We can see that we get a small improvement using the index on l_returnflag, but the index on orderdate is not used because of the query definition, so we drop it.

In [110]:
with conn.cursor() as cur:
    
    cur.execute("DROP INDEX idx_o_orderdate;")

Validate and record the size of the table

In [111]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_10 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## Query 14

shipdate is selective and we know that we have already an index on it, so we can leverage it 

we know we have a btree index in p_partkey

since l_partkey will be fundamental for the next query, we can create now the index and check if it helps/is used.

In [112]:
conn.rollback()
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_partkey ON lineitem (l_partkey);")    
    end_time = time.time()
    print(f"Time to create idx_l_partkey ON lineitem (l_partkey): {end_time - start_time} seconds")
    cur.execute("SELECT pg_relation_size('idx_l_partkey');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_partkey: {index_size/(1024**2)} MB")
    print(f"Size of idx_l_partkey: {index_size/(1024**3)} GB")
    conn.commit()

Time to create idx_l_partkey ON lineitem (l_partkey): 46.996431827545166 seconds
Size of idx_l_partkey: 429.5078125 MB
Size of idx_l_partkey: 0.41944122314453125 GB


In [113]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    
    conn.commit()

explain_analyze(query_14)

Aggregate  (cost=1752182.24..1752182.26 rows=1 width=32) (actual time=22055.189..22055.719 rows=1 loops=1)
  ->  Merge Join  (cost=1627703.30..1737877.40 rows=817419 width=33) (actual time=21220.749..21849.587 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92913.43 rows=2000000 width=25) (actual time=0.181..333.497 rows=1999994 loops=1)
        ->  Sort  (cost=1627702.87..1629746.42 rows=817419 width=16) (actual time=21220.556..21284.103 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24288kB
              ->  Gather  (cost=12155.11..1533457.40 rows=817419 width=16) (actual time=76.699..20330.170 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=11155.11..1450715.50 rows=340591 width=16) (actual time=71.580..20643.8

We can see that the index on l_partkey is not used, but this is in line with the definition of the query.

Validate and record size result table

In [114]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_14 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


## Query 17

This query takes a lot of time if we don't use indexes.

The index on l_partkey is fundamental.

We tried hash indexes on p_brand and p_container but they are not used.

In [115]:
check_indexes('lineitem')

Index Name: idx_l_shipdate
Index Definition: CREATE INDEX idx_l_shipdate ON public.lineitem USING btree (l_shipdate)

Index Name: idx_l_returnflag
Index Definition: CREATE INDEX idx_l_returnflag ON public.lineitem USING btree (l_returnflag)

Index Name: idx_l_partkey
Index Definition: CREATE INDEX idx_l_partkey ON public.lineitem USING btree (l_partkey)



In [116]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = on;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

explain_analyze(query_17, analyze=True)

Aggregate  (cost=9428617.44..9428617.45 rows=1 width=32) (actual time=10340.977..10340.979 rows=1 loops=1)
  ->  Nested Loop  (cost=0.87..9428567.40 rows=20015 width=8) (actual time=4.552..10340.009 rows=5526 loops=1)
        ->  Index Scan using part_pkey on part  (cost=0.43..102913.43 rows=2002 width=4) (actual time=0.418..400.213 rows=2044 loops=1)
              Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
              Rows Removed by Filter: 1997956
        ->  Index Scan using idx_l_partkey on lineitem  (cost=0.44..4658.06 rows=11 width=17) (actual time=4.653..4.862 rows=3 loops=2044)
              Index Cond: (l_partkey = part.p_partkey)
              Filter: (l_quantity < (SubPlan 1))
              Rows Removed by Filter: 27
              SubPlan 1
                ->  Aggregate  (cost=137.10..137.11 rows=1 width=32) (actual time=0.149..0.149 rows=1 loops=61385)
                      ->  Index Scan using idx_l_partkey on lineitem lineitem_1  (co

In [117]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchall()
    for row in result:
        print(row)

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

(Decimal('3295493.512857142857'),)
Size of query_17 result table: 0.015625 MB


## Indexes used

In [118]:
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: idx_l_shipdate
Index Definition: CREATE INDEX idx_l_shipdate ON public.lineitem USING btree (l_shipdate)

Index Name: idx_l_returnflag
Index Definition: CREATE INDEX idx_l_returnflag ON public.lineitem USING btree (l_returnflag)

Index Name: idx_l_partkey
Index Definition: CREATE INDEX idx_l_partkey ON public.lineitem USING btree (l_partkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UN

## Final db size

In [119]:
conn.rollback()
with conn.cursor() as cur:

    cur.execute("SELECT pg_database_size('dw_cs');")
    all_rows = cur.fetchall()
    print(f"Database size: {all_rows[0][0] / (1024**2):.2f} MB")
    print(f"Database size: {all_rows[0][0] / (1024**3):.2f} GB \n")

Database size: 21039.29 MB
Database size: 20.55 GB 



In [120]:
cur.close()
conn.close()
