In [1]:
import os
import time

import psycopg2

In [2]:
# Establish a connection to the PostgreSQL database.
# Each setting is taken from the standard libpq environment variable when it is
# set and otherwise falls back to the local default used by this notebook, so
# real credentials never have to be written into the notebook itself.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    port=int(os.environ.get("PGPORT", "5432")),
)

In [3]:
# function to check the indexes on a table

def check_indexes(table_name):
    """Print the name and definition of every index on ``table_name``.

    Queries the ``pg_indexes`` view through the shared module-level
    connection ``conn`` and prints one "Index Name" / "Index Definition"
    pair per returned row. Nothing is printed when no row matches.

    Args:
        table_name: Table name compared for equality with
            ``pg_indexes.tablename``.
    """
    # The table name is sent as a bound parameter instead of being pasted into
    # the SQL text, so quotes in the name cannot break or alter the statement.
    query = """
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = %s;
    """

    with conn.cursor() as cur:
        cur.execute(query, (table_name,))
        for index_name, index_def in cur.fetchall():
            print(f"Index Name: {index_name}")
            print(f"Index Definition: {index_def}\n")


# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    """Print the PostgreSQL plan for ``query``, one plan line per output line.

    Any transaction pending on the shared connection ``conn`` is rolled back
    first. With ``analyze`` true the statement is sent as ``EXPLAIN ANALYZE``
    (PostgreSQL then actually executes the query and reports timings);
    otherwise it is sent as plain ``EXPLAIN``.

    Args:
        query: SQL text appended verbatim after the EXPLAIN keyword(s).
        analyze: Selects ``EXPLAIN ANALYZE`` (default) or ``EXPLAIN``.
    """
    conn.rollback()
    command = "EXPLAIN ANALYZE" if analyze else "EXPLAIN"
    with conn.cursor() as cursor:
        cursor.execute(f"{command} {query}")
        # Each result row is a 1-tuple whose first element is one line of the plan.
        for plan_row in cursor.fetchall():
            print(plan_row[0])

## Initial indexes

In [5]:
# Tables whose indexes are printed below.
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

# Discard any pending transaction on the shared connection before querying the catalog.
conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: idx_p_brand
Index Definition: CREATE INDEX idx_p_brand ON public.part USING gin (p_brand)

Index Name: idx_p_container
Index Definition: CREATE INDEX idx_p_container ON public.part USING gin (p_container)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: idx_l_shipdate
Index Definition: CREATE INDEX idx_l_shipdate ON public.lineitem USING btree (l_shipdate)

Index Name: idx_l_partkey
Index Definition: CREATE INDEX idx_l_partkey ON public.lineitem USING btree (l_partkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON 

## Baseline queries

In [4]:
# Baseline planner configuration for this session. One SET statement is issued
# per entry, in the listed order, and the transaction is then committed.
conn.rollback()
baseline_planner_settings = [
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_hashagg", "on"),
    ("enable_sort", "on"),
    ("enable_partition_pruning", "off"),
    ("enable_partitionwise_join", "off"),
    ("enable_partitionwise_aggregate", "off"),
]
with conn.cursor() as cur:
    for setting_name, setting_value in baseline_planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [4]:
# Query 1: per (l_returnflag, l_linestatus) group, sums and averages of
# quantity, price and discount plus a row count, over line items shipped on or
# before 1998-12-01 minus 90 days; result ordered by the two grouping columns.
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

# Query 10: discounted revenue per customer from line items flagged as returned
# (l_returnflag = 'R'), restricted to orders placed in the three months starting
# 1993-10-01; joins customer, orders, lineitem and nation, sorted by revenue
# in descending order.
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

# Query 14: percentage of discounted revenue that comes from parts whose p_type
# starts with 'PROMO', over line items shipped in the month starting 1995-09-01.
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

# Query 17: SUM(l_extendedprice) / 7.0 over line items of 'Brand#23' parts in
# 'MED BOX' containers whose quantity is below 20% of the average quantity of
# the line items for that same part (correlated subquery on l_partkey).
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            p_partkey = l_partkey
    );
"""


In [12]:
# Baseline plan and timings for query 1 (sent as EXPLAIN ANALYZE).
explain_analyze(query_1, analyze = True)

Finalize GroupAggregate  (cost=2300104.98..2300106.93 rows=6 width=236) (actual time=37413.172..37414.824 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2300104.98..2300106.38 rows=12 width=236) (actual time=37413.155..37414.782 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2299104.95..2299104.97 rows=6 width=236) (actual time=37407.591..37407.593 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2299104.74..2299104.87 rows=6 width=236) (actual time=37407.549..37407.554 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

In [13]:
# Baseline plan and timings for query 10 (sent as EXPLAIN ANALYZE).
explain_analyze(query_10, analyze = True)

Sort  (cost=3344325.21..3345720.08 rows=557947 width=279) (actual time=32089.471..32140.473 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=2942891.02..3146127.82 rows=557947 width=279) (actual time=29752.701..31714.944 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=2942891.02..3132179.14 rows=557947 width=259) (actual time=29752.688..31153.161 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=2942890.72..3107071.53 rows=557947 width=259) (actual time=29752.600..30822.790 rows=1147084 loops=1)
                    ->  Merge Join  (cost=2942890.57..3093377.71 rows=557947 width=1

In [14]:
# Baseline plan and timings for query 14 (sent as EXPLAIN ANALYZE).
explain_analyze(query_14, analyze = True)

Aggregate  (cost=1800922.34..1800922.35 rows=1 width=32) (actual time=29053.529..29053.639 rows=1 loops=1)
  ->  Merge Join  (cost=1676444.04..1786617.50 rows=817419 width=33) (actual time=28216.458..28841.330 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92913.43 rows=2000000 width=25) (actual time=0.501..325.632 rows=1999994 loops=1)
        ->  Sort  (cost=1676443.20..1678486.75 rows=817419 width=16) (actual time=28215.933..28277.619 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24296kB
              ->  Gather  (cost=1000.00..1582197.72 rows=817419 width=16) (actual time=0.245..27746.114 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on lineitem  (cost=0.00..1499455.82 rows=340591 width=16) (actual time=2.027..27878.044 rows=249741 

We set hashjoin and hashaggregate off, we only analyze the situation with indexes.

Setting bitmap scan on or off doesn't affect the choices of the optimizer, so we set it off.

## Query 1

We tried to put two indexes on l_returnflag and l_linestatus, but we saw that they were not used. Note that these columns have few distinct values (3 and 2, respectively); a bitmap index would suit them, but PostgreSQL does not implement bitmap indexes.

Let's see if with an index on shipdate we get better results.

In [22]:
conn.rollback()
with conn.cursor() as cur:

    # tried also this index, but it is not helpful
    # Install btree_gin (no-op when already present) before building the GIN index on l_shipdate.
    cur.execute("CREATE EXTENSION IF NOT EXISTS btree_gin;")
    
    # Wall-clock time of the CREATE INDEX statement only.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_shipdate_gin ON lineitem USING gin (l_shipdate);")
    end_time = time.time()
    print(f"Time to create idx_l_shipdate_gin: {end_time - start_time} seconds")

    # pg_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_relation_size('idx_l_shipdate_gin');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_shipdate_gin: {index_size/(1024**2)} MB")

    conn.commit()

Time to create idx_l_shipdate_gin: 28.46113610267639 seconds
Size of idx_l_shipdate_gin: 192.0625 MB


In [16]:
# Planner switches for the query 1 experiment: sequential scans and all join
# methods are turned off, the index-based scan types are turned on. Statements
# are issued in the listed order and then committed.
conn.rollback()
query_1_planner_settings = {
    "enable_seqscan": "off",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "on",
    "enable_tidscan": "off",
    "enable_material": "off",
    "enable_nestloop": "off",
    "enable_mergejoin": "off",
    "enable_hashjoin": "off",
    "enable_sort": "on",
    "enable_hashagg": "on",
}
with conn.cursor() as cur:
    for guc, state in query_1_planner_settings.items():
        cur.execute(f"SET {guc} = {state};")
    conn.commit()

explain_analyze(query_1)

GroupAggregate  (cost=10013748937.34..10015965854.48 rows=6 width=236) (actual time=74891.132..119123.310 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Sort  (cost=10013748937.34..10013896731.80 rows=59117786 width=25) (actual time=59907.064..75449.645 rows=59142609 loops=1)
        Sort Key: l_returnflag, l_linestatus
        Sort Method: external merge  Disk: 2155336kB
        ->  Seq Scan on lineitem  (cost=10000000000.00..10001874368.65 rows=59117786 width=25) (actual time=0.154..10817.424 rows=59142609 loops=1)
              Filter: (l_shipdate <= '1998-09-02 00:00:00'::timestamp without time zone)
              Rows Removed by Filter: 843443
Planning Time: 3.503 ms
Execution Time: 119198.998 ms


Shipdate with a btree+gin index doesn't bring any advantage, so we drop it.

In [19]:
conn.rollback()
with conn.cursor() as cur:

    # Remove the GIN index on l_shipdate; IF EXISTS keeps the cell re-runnable.
    cur.execute("DROP INDEX IF EXISTS idx_l_shipdate_gin;")
    conn.commit()

In [20]:
conn.rollback()
with conn.cursor() as cur:

    # Index on l_shipdate with the default access method; only the CREATE INDEX is timed.
    # NOTE(review): no IF NOT EXISTS, so re-running raises if idx_l_shipdate already exists.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_shipdate ON lineitem (l_shipdate);")
    end_time = time.time()
    print(f"Time to create idx_l_shipdate: {end_time - start_time} seconds")

    # pg_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_relation_size('idx_l_shipdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_shipdate: {index_size/(1024**2)} MB")

    conn.commit()

Time to create idx_l_shipdate: 50.349547147750854 seconds
Size of idx_l_shipdate: 397.546875 MB


In [23]:
# Plan and timings for query 1 once idx_l_shipdate has been created.
explain_analyze(query_1)

Finalize GroupAggregate  (cost=2958736.61..2958738.57 rows=6 width=236) (actual time=28416.919..28419.355 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2958736.61..2958738.01 rows=12 width=236) (actual time=28416.899..28419.313 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2957736.59..2957736.60 rows=6 width=236) (actual time=28393.543..28393.544 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2957736.37..2957736.51 rows=6 width=236) (actual time=28393.511..28393.516 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

We create an index also on return flag to see if it helps both in query 1 and 10.

We tried a hash index on (l_returnflag, l_linestatus), but hash indexes do not support multiple columns.

We try a hash index on l_returnflag.

In [27]:
conn.rollback()
with conn.cursor() as cur:

    # tried also hash index but it doesn't work, after 5 minutes it was still running
    # Multicolumn GIN index on (l_returnflag, l_linestatus); only the CREATE INDEX is timed.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_returnflag ON lineitem USING gin (l_returnflag, l_linestatus);")
    end_time = time.time()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    # Report the size in MB and GB; the labels name both indexed columns so the
    # printed line matches the index that was actually built.
    cur.execute("SELECT pg_relation_size('idx_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag, l_linestatus): {index_size/(1024**2)} MB")
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag, l_linestatus): {index_size/(1024**3)} GB")

    conn.commit()

Time to create index on lineitem: 28.43353796005249 seconds
Size of idx_l_returnflag ON lineitem (l_returnflag): 127.328125 MB
Size of idx_l_returnflag ON lineitem (l_returnflag): 0.1243438720703125 GB


In [28]:
# Plan and timings for query 1 with the idx_l_returnflag GIN index available.
explain_analyze(query_1)

Finalize GroupAggregate  (cost=2958736.61..2958738.57 rows=6 width=236) (actual time=34572.293..34575.039 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2958736.61..2958738.01 rows=12 width=236) (actual time=34572.267..34574.994 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2957736.59..2957736.60 rows=6 width=236) (actual time=34551.508..34551.512 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2957736.37..2957736.51 rows=6 width=236) (actual time=34551.454..34551.461 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

This is not used so we drop it.

In [29]:
conn.rollback()
with conn.cursor() as cur:
    # Drop only the unused index. The btree_gin extension is deliberately left
    # installed: DROP EXTENSION ... CASCADE would also silently drop every other
    # GIN index that depends on its operator classes (the GIN indexes on part
    # shown in the initial index listing are candidates), and later cells
    # build further GIN indexes that need the extension.
    cur.execute("DROP INDEX IF EXISTS idx_l_returnflag;")
    conn.commit()

We can see an improvement using an index on l_shipdate with a bitmapscan. 

What I would suggest is to keep the index on l_shipdate since it may help us also in query 14, where shipdate has high selectivity and probably will help more.

The index on l_returnflag is not used, I also tried to use an index on (l_returnflag, l_linestatus), but it was not used, anyway it will be helpful in query 10.

### Validate and record the size of the table

In [106]:
# Planner settings for the validation run, applied with SET and committed.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_tidscan = off;")
    cur.execute("SET enable_material = off;")
    cur.execute("SET enable_nestloop = off;")
    cur.execute("SET enable_mergejoin = off;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    conn.commit()
    
# Materialize the result of query 1 in a temporary table, show its first row,
# report the table's total size, then drop the table again.
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_1};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    # pg_total_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 ('A', 'F', Decimal('377518399'), Decimal('566065727797.25'), Decimal('537759104278.0656'), Decimal('559276670892.116819'), Decimal('25.5009751030070973'), Decimal('38237.151008958546'), Decimal('0.05000657454024320463'), 14804077)
Size of query_1 result table: 0.015625 MB


## Query 10

o_orderdate is quite selective (3 months out of 7 × 12 months), so an index will probably be beneficial.

ordering by revenue, which is computed in the query, can't be optimised.

what can be optimised is the join, where there are already indexes on the pks.

l_returnflag is not selective, but we saw that it helps in speeding up the query.

I put an index on o_custkey and one on l_orderkey, but the first was never used and the second gave worse results.


In [18]:
conn.rollback()
with conn.cursor() as cur:

    # tried hash index but it doesn't work, after 15 minutes it was still running
    # Hash index on l_returnflag; only the CREATE INDEX statement is timed.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_returnflag ON lineitem USING hash (l_returnflag);")
    end_time = time.time()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    # pg_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_relation_size('idx_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag): {index_size/(1024**2)} MB")

    conn.commit()

In [None]:
# Plan and timings for query 10 after the hash-index attempt.
explain_analyze(query_10)

In [None]:
conn.rollback()
with conn.cursor() as cur:
    # IF EXISTS keeps this cleanup re-runnable: the hash index build was noted
    # as still running after 15 minutes, so the index may never have been
    # created, in which case a plain DROP INDEX would raise.
    cur.execute("DROP INDEX IF EXISTS idx_l_returnflag;")
    conn.commit()

In [27]:
conn.rollback()
with conn.cursor() as cur:

    # Install btree_gin before the clock starts, so that the reported time
    # covers the CREATE INDEX statement alone and not the extension setup.
    cur.execute("create EXTENSION IF not EXISTS btree_gin;") # otherwise it doesn't work
    print("extension created")

    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_returnflag ON lineitem USING gin (l_returnflag);")
    end_time = time.time()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    # pg_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_relation_size('idx_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag): {index_size/(1024**2)} MB")

    conn.commit()

extension cerated
Time to create index on lineitem: 19.962371826171875 seconds
Size of idx_l_returnflag ON lineitem (l_returnflag): 64.25 MB


In [28]:
# Discourage sequential scans while leaving both index scans and bitmap scans enabled.
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")

    conn.commit()

# Plan and timings for query 10 under these settings.
explain_analyze(query_10)

Sort  (cost=3578165.76..3579560.62 rows=557947 width=279) (actual time=42074.437..42125.476 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=3210635.78..3379968.36 rows=557947 width=279) (actual time=40584.766..41685.294 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=3210635.78..3368344.46 rows=464956 width=279) (actual time=40584.751..41395.453 rows=464801 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=3209635.76..3313676.98 rows=232478 width=279) (actual time=40567.435..41338.972 rows=154934 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=3209635.76..3307865.03 rows=232478 width=259) (actual time=40567.415..41131.195 rows=382361 loops=3

In [29]:
# Same experiment with bitmap scans disabled as well, leaving plain index scans.
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = off;")

    conn.commit()

# Plan and timings for query 10 under these settings.
explain_analyze(query_10)

Sort  (cost=10003486042.81..10003487437.68 rows=557947 width=279) (actual time=33693.838..33744.622 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=10003118512.84..10003287845.41 rows=557947 width=279) (actual time=32212.419..33320.381 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=10003118512.84..10003276221.52 rows=464956 width=279) (actual time=32212.413..33027.966 rows=450382 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=10003117512.81..10003221554.03 rows=232478 width=279) (actual time=32168.229..32949.752 rows=150127 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=10003117512.81..10003215742.08 rows=232478 width=259) (actual time=

In [26]:
conn.rollback()
with conn.cursor() as cur:
    # Remove the current idx_l_returnflag (no IF EXISTS: raises if the index is missing).
    cur.execute("drop INDEX idx_l_returnflag;")
    conn.commit()

In [18]:
conn.rollback()
with conn.cursor() as cur:

    # Index on l_returnflag with the default access method; only the CREATE INDEX is timed.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_returnflag ON lineitem (l_returnflag);")
    end_time = time.time()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    # pg_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_relation_size('idx_l_returnflag');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_returnflag ON lineitem (l_returnflag): {index_size/(1024**2)} MB")

    conn.commit()

Time to create index on lineitem: 59.03293704986572 seconds
Size of idx_l_returnflag ON lineitem (l_returnflag): 396.4609375 MB


In [19]:
# Plan and timings for query 10 with the default-method index on l_returnflag.
explain_analyze(query_10)

Sort  (cost=34913159.69..34914554.56 rows=557947 width=279) (actual time=26842.652..26893.302 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=34545629.72..34714962.30 rows=557947 width=279) (actual time=25321.837..26437.974 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=34545629.72..34703338.40 rows=464956 width=279) (actual time=25321.829..26144.228 rows=450868 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=34544629.70..34648670.92 rows=232478 width=279) (actual time=25306.516..26090.030 rows=150289 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=34544629.70..34642858.97 rows=232478 width=259) (actual time=25306.499..25878.258 rows=3823

For the same reason that we did not keep a btree+gin index on shipdate, we also create a plain btree index on o_orderdate.

In [20]:
conn.rollback()
with conn.cursor() as cur:

    # Index on orders.o_orderdate with the default access method; only the CREATE INDEX is timed.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_o_orderdate ON orders (o_orderdate);")
    end_time = time.time()
    print(f"Time to create idx_o_orderdate ON orders (o_orderdate): {end_time - start_time} seconds")
    # pg_relation_size reports bytes; shown below in both MB and GB.
    cur.execute("SELECT pg_relation_size('idx_o_orderdate');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_o_orderdate: {index_size/(1024**2)} MB")
    print(f"Size of idx_o_orderdate: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_o_orderdate ON orders (o_orderdate): 10.481804132461548 seconds
Size of idx_o_orderdate: 100.1796875 MB
Size of idx_o_orderdate: 0.09783172607421875 GB


In [21]:
conn.rollback()
with conn.cursor() as cur:

    # GIN index on lineitem.l_orderkey; only the CREATE INDEX statement is timed.
    start_time = time.time()
    cur.execute("CREATE INDEX idx_l_orderkey ON lineitem USING gin (l_orderkey);")
    end_time = time.time()
    # The label names lineitem, the table the index is actually created on.
    print(f"Time to create idx_l_orderkey ON lineitem (l_orderkey): {end_time - start_time} seconds")
    # pg_relation_size reports bytes; shown below in both MB and GB.
    cur.execute("SELECT pg_relation_size('idx_l_orderkey');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_orderkey: {index_size/(1024**2)} MB")
    print(f"Size of idx_l_orderkey: {index_size/(1024**3)} GB")

    conn.commit()

Time to create idx_l_orderkey ON orders (l_orderkey): 52.030105113983154 seconds
Size of idx_l_orderkey: 999.71875 MB
Size of idx_l_orderkey: 0.976287841796875 GB


Let's check the selectivity of l_returnflag = 'R'.

In [16]:
# Row count matching l_returnflag = 'R' followed by the total row count of
# lineitem; every returned row is printed as a tuple.
conn.rollback()
count_statements = (
    "SELECT COUNT(*) FROM lineitem WHERE l_returnflag = 'R';",
    "SELECT COUNT(*) FROM lineitem;",
)
with conn.cursor() as cur:
    for statement in count_statements:
        cur.execute(statement)
        rows = cur.fetchall()
        for row in rows:
            print(row)

(14808183,)
(59986052,)


In [22]:
# Re-enable sequential, index and bitmap scans so the planner can choose freely among them.
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    conn.commit()

# Plan and timings for query 10 under these settings.
explain_analyze(query_10)

Sort  (cost=3324959.15..3326354.02 rows=557947 width=279) (actual time=41403.214..41454.335 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=2923524.96..3126761.75 rows=557947 width=279) (actual time=39229.932..41026.039 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=2923524.96..3112813.08 rows=557947 width=259) (actual time=39229.917..40471.320 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=2923524.66..3087705.46 rows=557947 width=259) (actual time=39229.846..40144.081 rows=1147084 loops=1)
                    ->  Merge Join  (cost=2923524.51..3074011.64 rows=557947 width=1

In [23]:
# Discourage sequential scans and disable bitmap scans, leaving plain index scans.
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = off;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = off;")
    conn.commit()

# Plan and timings for query 10 under these settings.
explain_analyze(query_10)

Sort  (cost=34913159.69..34914554.56 rows=557947 width=279) (actual time=29314.443..29365.071 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=34545629.72..34714962.30 rows=557947 width=279) (actual time=27868.519..28947.569 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=34545629.72..34703338.40 rows=464956 width=279) (actual time=27868.504..28664.107 rows=451154 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=34544629.70..34648670.92 rows=232478 width=279) (actual time=27828.736..28585.299 rows=150385 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=34544629.70..34642858.97 rows=232478 width=259) (actual time=27828.719..28381.246 rows=3823

We note that here we get better results using an index scan rather than a bitmap scan.

We can see that we get a small improvement using the index on l_returnflag, but the index on orderdate is not used because of the query definition, so we drop it.

In [24]:
# Drop the two indexes created for query 10 on o_orderdate and l_orderkey
# (no IF EXISTS: either statement raises if its index is missing).
with conn.cursor() as cur:
    
    cur.execute("DROP INDEX idx_o_orderdate;")
    cur.execute("DROP INDEX idx_l_orderkey;")
    conn.commit()

In [25]:
# Plan and timings for query 10 after dropping idx_o_orderdate and idx_l_orderkey.
explain_analyze(query_10)

Sort  (cost=34913159.69..34914554.56 rows=557947 width=279) (actual time=28433.389..28484.250 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=34545629.72..34714962.30 rows=557947 width=279) (actual time=26959.210..28041.918 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=34545629.72..34703338.40 rows=464956 width=279) (actual time=26959.202..27758.215 rows=451144 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=34544629.70..34648670.92 rows=232478 width=279) (actual time=26931.973..27693.280 rows=150381 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=34544629.70..34642858.97 rows=232478 width=259) (actual time=26931.954..27488.675 rows=3823

Validate and record the size of the table

In [111]:
# Materialize the result of query 10 in a temporary table, show its first row,
# report the table's total size, then drop the table again.
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    # pg_total_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_10 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## Query 14

shipdate is selective and we know that we have already an index on it, so we can leverage it 

we know we have a btree index in p_partkey

since l_partkey will be fundamental for the next query, we can create now the index and check if it helps/is used.

In [9]:
# Planner switches for query 14: all scan types allowed except index-only scans,
# hash joins disabled. One SET per entry, in order, then commit.
conn.rollback()
query_14_planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
)
with conn.cursor() as cur:
    for option, switch in query_14_planner_settings:
        cur.execute(f"SET {option} = {switch};")

    conn.commit()

explain_analyze(query_14)

Aggregate  (cost=1752182.01..1752182.03 rows=1 width=32) (actual time=20325.198..20325.602 rows=1 loops=1)
  ->  Merge Join  (cost=1627703.71..1737877.17 rows=817419 width=33) (actual time=19561.520..20119.528 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92913.43 rows=2000000 width=25) (actual time=0.861..266.734 rows=1999994 loops=1)
        ->  Sort  (cost=1627702.87..1629746.42 rows=817419 width=16) (actual time=19560.646..19620.759 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24288kB
              ->  Gather  (cost=12155.11..1533457.40 rows=817419 width=16) (actual time=85.373..18451.143 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=11155.11..1450715.50 rows=340591 width=16) (actual time=74.458..18929.1

Validate and record size result table

In [114]:
# Materialize the result of query 14 in a temporary table, show its first row,
# report the table's total size, then drop the table again.
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    # pg_total_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_14 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


## Query 17

This query takes a lot of time if we don't use indexes.

The index on l_partkey is fundamental.

We tried hash indexes on p_brand and p_container but they are not used.

In [5]:
conn.rollback()
with conn.cursor() as cur:

    # GIN index on lineitem.l_partkey; only the CREATE INDEX statement is timed.
    # NOTE(review): the extension line is commented out, so this presumably relies
    # on btree_gin already being installed by an earlier cell - confirm on a fresh run.
    start_time = time.time()
    #cur.execute("CREATE EXTENSION IF NOT EXISTS btree_gin;") # otherwise it doesn't work
    cur.execute("CREATE INDEX idx_l_partkey ON lineitem USING gin (l_partkey);")    
    end_time = time.time()
    print(f"Time to create idx_l_partkey ON lineitem (l_partkey): {end_time - start_time} seconds")
    # pg_relation_size reports bytes; divide by 1024**2 to display MB.
    cur.execute("SELECT pg_relation_size('idx_l_partkey');")
    index_size = cur.fetchone()[0]
    print(f"Size of idx_l_partkey: {index_size/(1024**2)} MB")
    conn.commit()

Time to create idx_l_partkey ON lineitem (l_partkey): 144.65980434417725 seconds
Size of idx_l_partkey: 416.328125 MB


In [5]:
# Planner switches for this run: everything enabled except index-only scans
# and hash joins.
planner_settings = (
    "SET enable_seqscan = on;",
    "SET enable_indexscan = on;",
    "SET enable_bitmapscan = on;",
    "SET enable_indexonlyscan = off;",
    "SET enable_nestloop = on;",
    "SET enable_mergejoin = on;",
    "SET enable_hashjoin = off;",
    "SET enable_sort = on;",
    "SET enable_hashagg = on;",
)

conn.rollback()
with conn.cursor() as cur:
    for statement in planner_settings:
        cur.execute(statement)
    # Commit so the settings survive the rollback that explain_analyze
    # performs before running the query.
    conn.commit()

explain_analyze(query_17, analyze=True)

Aggregate  (cost=10797725.67..10797725.68 rows=1 width=32) (actual time=18863.726..18863.733 rows=1 loops=1)
  ->  Nested Loop  (cost=20.06..10797675.63 rows=20015 width=8) (actual time=106.231..18862.449 rows=5526 loops=1)
        ->  Seq Scan on part  (cost=0.00..70962.00 rows=2002 width=4) (actual time=24.950..349.157 rows=2044 loops=1)
              Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
              Rows Removed by Filter: 1997956
        ->  Bitmap Heap Scan on lineitem  (cost=20.06..5357.89 rows=11 width=17) (actual time=8.713..9.056 rows=3 loops=2044)
              Recheck Cond: (l_partkey = part.p_partkey)
              Filter: (l_quantity < (SubPlan 1))
              Rows Removed by Filter: 27
              Heap Blocks: exact=61385
              ->  Bitmap Index Scan on idx_l_partkey  (cost=0.00..20.06 rows=33 width=0) (actual time=0.531..0.531 rows=30 loops=2044)
                    Index Cond: (l_partkey = part.p_partkey)
           

In [5]:
# Remove the current idx_l_partkey so another index type can be tried under the same name.
conn.rollback()
with conn.cursor() as cur:
    # IF EXISTS keeps the cell re-runnable (a second run would otherwise raise
    # because the index is already gone) and matches the other DROP INDEX cell.
    cur.execute("DROP INDEX IF EXISTS idx_l_partkey;")
    conn.commit()

In [6]:
# Build a hash index on lineitem.l_partkey and report build time and size.
conn.rollback()  # discard any open or aborted transaction first
with conn.cursor() as cur:
    # Time only the index build itself.
    build_started = time.time()
    cur.execute("CREATE INDEX idx_l_partkey ON lineitem USING hash (l_partkey);")
    build_seconds = time.time() - build_started
    print(f"Time to create idx_l_partkey ON lineitem (l_partkey): {build_seconds} seconds")

    # pg_relation_size returns bytes; report it in MB.
    cur.execute("SELECT pg_relation_size('idx_l_partkey');")
    (index_size,) = cur.fetchone()
    print(f"Size of idx_l_partkey: {index_size / 1024 ** 2} MB")
    conn.commit()

Time to create idx_l_partkey ON lineitem (l_partkey): 139.54524493217468 seconds
Size of idx_l_partkey: 1896.65625 MB


In [7]:
# Plan and execute query_17 again after the hash index cell above.
# explain_analyze rolls back any open transaction, then prints the
# EXPLAIN ANALYZE output line by line.
explain_analyze(query_17, analyze=True)

Aggregate  (cost=1937971.06..1937971.07 rows=1 width=32) (actual time=140288.301..140288.418 rows=1 loops=1)
  ->  Hash Join  (cost=54687.22..1937921.02 rows=20015 width=8) (actual time=1004.053..140286.656 rows=5526 loops=1)
        Hash Cond: (lineitem.l_partkey = part.p_partkey)
        Join Filter: (lineitem.l_quantity < (SubPlan 1))
        Rows Removed by Join Filter: 55859
        ->  Seq Scan on lineitem  (cost=0.00..1724403.52 rows=59986052 width=17) (actual time=0.206..21705.621 rows=59986052 loops=1)
        ->  Hash  (cost=54662.20..54662.20 rows=2002 width=4) (actual time=963.074..963.186 rows=2044 loops=1)
              Buckets: 2048  Batches: 1  Memory Usage: 88kB
              ->  Gather  (cost=1000.00..54662.20 rows=2002 width=4) (actual time=3.697..961.572 rows=2044 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on part  (cost=0.00..53462.00 rows=834 width=4) (actual time=1.310..952.566

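We can see that the hash index is not useful: the planner ignores it and falls back to a sequential scan on lineitem with a hash join, taking about 140 s.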

In [8]:
# Remove the hash idx_l_partkey before building the B-tree variant under the same name.
conn.rollback()
with conn.cursor() as cur:
    # IF EXISTS keeps the cell re-runnable (a second run would otherwise raise
    # because the index is already gone) and matches the other DROP INDEX cell.
    cur.execute("DROP INDEX IF EXISTS idx_l_partkey;")
    conn.commit()

In [9]:
# Build the default (B-tree) index on lineitem.l_partkey and report build time and size.
conn.rollback()  # discard any open or aborted transaction first
with conn.cursor() as cur:
    # Time only the index build itself.
    build_started = time.time()
    cur.execute("CREATE INDEX idx_l_partkey ON lineitem (l_partkey);")
    build_seconds = time.time() - build_started
    print(f"Time to create idx_l_partkey ON lineitem (l_partkey): {build_seconds} seconds")

    # pg_relation_size returns bytes; report it in MB.
    cur.execute("SELECT pg_relation_size('idx_l_partkey');")
    (index_size,) = cur.fetchone()
    print(f"Size of idx_l_partkey: {index_size / 1024 ** 2} MB")
    conn.commit()

Time to create idx_l_partkey ON lineitem (l_partkey): 48.10466003417969 seconds
Size of idx_l_partkey: 429.5078125 MB


In [14]:
# Same measurement after the B-tree index cell above; `analyze` defaults to
# True, so the query is actually executed (EXPLAIN ANALYZE).
explain_analyze(query_17)

Aggregate  (cost=9428617.44..9428617.45 rows=1 width=32) (actual time=10267.602..10267.603 rows=1 loops=1)
  ->  Nested Loop  (cost=0.87..9428567.40 rows=20015 width=8) (actual time=7.576..10266.618 rows=5526 loops=1)
        ->  Index Scan using part_pkey on part  (cost=0.43..102913.43 rows=2002 width=4) (actual time=1.014..404.634 rows=2044 loops=1)
              Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
              Rows Removed by Filter: 1997956
        ->  Index Scan using idx_l_partkey on lineitem  (cost=0.44..4658.06 rows=11 width=17) (actual time=4.590..4.824 rows=3 loops=2044)
              Index Cond: (l_partkey = part.p_partkey)
              Filter: (l_quantity < (SubPlan 1))
              Rows Removed by Filter: 27
              SubPlan 1
                ->  Aggregate  (cost=137.10..137.11 rows=1 width=32) (actual time=0.148..0.148 rows=1 loops=61385)
                      ->  Index Scan using idx_l_partkey on lineitem lineitem_1  (co

We can see that the B-tree index on l_partkey gives better results than the GIN (btree_gin) one: about 10.3 s versus about 18.9 s of execution time.

In [15]:
# Build a GIN index on part.p_brand and report build time and size (MB and GB).
conn.rollback()  # discard any open or aborted transaction first
with conn.cursor() as cur:
    # Time only the index build itself.
    build_started = time.time()
    cur.execute("CREATE INDEX idx_p_brand ON part USING gin (p_brand);")
    build_seconds = time.time() - build_started
    print(f"Time to create idx_p_brand ON part (p_brand): {build_seconds} seconds")

    # pg_relation_size returns bytes; report it in both units.
    cur.execute("SELECT pg_relation_size('idx_p_brand');")
    (index_size,) = cur.fetchone()
    size_mb = index_size / 1024 ** 2
    size_gb = index_size / 1024 ** 3
    print(f"Size of idx_p_brand: {size_mb} MB")
    print(f"Size of idx_p_brand: {size_gb} GB")
    conn.commit()

Time to create idx_p_brand ON part (p_brand): 0.5964438915252686 seconds
Size of idx_p_brand: 3.140625 MB
Size of idx_p_brand: 0.0030670166015625 GB


In [16]:
# Build a GIN index on part.p_container and report build time and size (MB and GB).
conn.rollback()  # discard any open or aborted transaction first
with conn.cursor() as cur:
    # Time only the index build itself.
    build_started = time.time()
    cur.execute("CREATE INDEX idx_p_container ON part USING gin (p_container);")
    build_seconds = time.time() - build_started
    print(f"Time to create idx_p_container ON part (p_container): {build_seconds} seconds")

    # pg_relation_size returns bytes; report it in both units.
    cur.execute("SELECT pg_relation_size('idx_p_container');")
    (index_size,) = cur.fetchone()
    size_mb = index_size / 1024 ** 2
    size_gb = index_size / 1024 ** 3
    print(f"Size of idx_p_container: {size_mb} MB")
    print(f"Size of idx_p_container: {size_gb} GB")
    conn.commit()

Time to create idx_p_container ON part (p_container): 0.7287259101867676 seconds
Size of idx_p_container: 3.765625 MB
Size of idx_p_container: 0.0036773681640625 GB


In [17]:
# Planner switches for this run: sequential scans and hash joins disabled,
# everything else (including index-only scans) enabled.
planner_settings = (
    "SET enable_seqscan = off;",
    "SET enable_indexscan = on;",
    "SET enable_bitmapscan = on;",
    "SET enable_indexonlyscan = on;",
    "SET enable_nestloop = on;",
    "SET enable_mergejoin = on;",
    "SET enable_hashjoin = off;",
    "SET enable_sort = on;",
    "SET enable_hashagg = on;",
)

conn.rollback()
with conn.cursor() as cur:
    for statement in planner_settings:
        cur.execute(statement)
    # Commit so the settings survive the rollback that explain_analyze
    # performs before running the query.
    conn.commit()

explain_analyze(query_17, analyze=True)

Aggregate  (cost=9333134.47..9333134.48 rows=1 width=32) (actual time=10055.052..10055.054 rows=1 loops=1)
  ->  Nested Loop  (cost=862.17..9333084.43 rows=20015 width=8) (actual time=33.863..10054.087 rows=5526 loops=1)
        ->  Bitmap Heap Scan on part  (cost=861.73..7430.46 rows=2002 width=4) (actual time=27.656..45.512 rows=2044 loops=1)
              Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
              Heap Blocks: exact=1998
              ->  BitmapAnd  (cost=861.73..861.73 rows=2002 width=0) (actual time=27.258..27.259 rows=0 loops=1)
                    ->  Bitmap Index Scan on idx_p_container  (cost=0.00..323.66 rows=49000 width=0) (actual time=11.837..11.837 rows=50186 loops=1)
                          Index Cond: (p_container = 'MED BOX'::bpchar)
                    ->  Bitmap Index Scan on idx_p_brand  (cost=0.00..536.82 rows=81733 width=0) (actual time=13.332..13.332 rows=79826 loops=1)
                          Index Cond:

Comparing the GIN and B-tree indexes on l_partkey, the estimated cost of the inner scan on lineitem is:

Bitmap Heap Scan on lineitem  (cost=20.06..5357.89 rows=11 width=17) vs 

Index Scan using idx_l_partkey on lineitem  (cost=0.44..4658.06 rows=11 width=17)

Since the creation time is also better for the B-tree index on l_partkey (about 48 s vs 145 s) and the space needed is similar (416 MB for GIN vs 429 MB for B-tree), we keep the B-tree index on l_partkey.

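The indexes on p_brand and p_container do appear in the plan above (run with enable_seqscan = off), but the total execution time barely changes (about 10.06 s vs 10.27 s with only the B-tree index on l_partkey), so we drop them.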

In [28]:
# Remove the two indexes on part; IF EXISTS makes the cell safe to re-run.
with conn.cursor() as cur:
    for drop_statement in (
        "DROP INDEX IF EXISTS idx_p_container;",
        "DROP INDEX IF EXISTS idx_p_brand;",
    ):
        cur.execute(drop_statement)
    conn.commit()

In [117]:
# Materialise query_17 into a scratch table to print its rows and to measure
# how much space the result occupies.
conn.rollback()  # start from a clean transaction, like the other cells do
try:
    with conn.cursor() as cur:

        cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
        cur.execute("SELECT * FROM temp_result;")
        result = cur.fetchall()
        for row in result:
            print(row)

        # pg_total_relation_size returns bytes; report it in MB.
        cur.execute("SELECT pg_total_relation_size('temp_result');")
        size = cur.fetchall()
        print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
        cur.execute("DROP TABLE temp_result;")
finally:
    # Nothing here needs to persist: ending the transaction discards the
    # scratch table even if a statement above failed, and avoids leaving the
    # connection idle inside an open (or aborted) transaction.
    conn.rollback()

(Decimal('3295493.512857142857'),)
Size of query_17 result table: 0.015625 MB


## Indexes used

In [34]:
# Print the indexes present on each table at the end of the experiments.
tables_to_inspect = [
    'nation',
    'part',
    'supplier',
    'customer',
    'lineitem',
    'region',
    'partsupp',
    'orders',
]

conn.rollback()  # clear any pending transaction before querying pg_indexes
for table_name in tables_to_inspect:
    check_indexes(table_name)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: idx_l_shipdate
Index Definition: CREATE INDEX idx_l_shipdate ON public.lineitem USING btree (l_shipdate)

Index Name: idx_l_returnflag
Index Definition: CREATE INDEX idx_l_returnflag ON public.lineitem USING btree (l_returnflag)

Index Name: idx_l_partkey
Index Definition: CREATE INDEX idx_l_partkey ON public.lineitem USING btree (l_partkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UN

In [30]:
# Release the database connection at the end of the notebook.
# The cursors in the cells shown are opened with `with conn.cursor() as cur:`,
# which already closes them on exit, so no separate cur.close() is needed
# (and the cell no longer depends on a `cur` name leaked from an earlier cell).
conn.close()