In [3]:
import os
import time

import psycopg2

In [4]:
# Establish a connection to the PostgreSQL database.
# Each setting is read from the matching libpq environment variable
# (PGDATABASE, PGUSER, PGHOST, PGPASSWORD, PGPORT) when it is set, so real
# credentials never have to be written into the notebook. The fallbacks are
# the values this notebook used before, so an unconfigured environment
# behaves exactly as it did.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "172.30.160.1"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    port=int(os.environ.get("PGPORT", "5432")),
)

In [5]:
# cur.close()
# conn.close()

In [6]:
def check_indexes(table_name):
    """Print the name and definition of every index on a table.

    Looks the table up in the ``pg_indexes`` catalog view through the
    module-level ``conn`` connection and prints one "Index Name" /
    "Index Definition" pair per index found. Nothing is printed when the
    table has no indexes (or does not exist).

    Args:
        table_name: Name of the table to inspect. It is sent to the server
            as a bound parameter, never spliced into the SQL text, so
            quotes or other SQL metacharacters in it cannot alter the query.

    Returns:
        None. The result is written to stdout.
    """
    query = """
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = %s;
    """

    with conn.cursor() as cur:
        # The driver handles quoting of the bound value.
        cur.execute(query, (table_name,))

        for index_name, index_definition in cur.fetchall():
            print(f"Index Name: {index_name}")
            print(f"Index Definition: {index_definition}\n")


def explain_analyze(query, analyze = True):
    """Print the PostgreSQL execution plan for ``query``.

    Rolls back the module-level ``conn`` first, then runs the statement
    prefixed with ``EXPLAIN ANALYZE`` (when ``analyze`` is truthy) or plain
    ``EXPLAIN`` and prints the first column of every returned row.

    Args:
        query: SQL text to explain; it is appended verbatim after the
            EXPLAIN keyword(s).
        analyze: Whether to use ``EXPLAIN ANALYZE`` instead of ``EXPLAIN``.

    Returns:
        None. The plan is written to stdout, one line per row.
    """
    conn.rollback()

    keyword = "EXPLAIN ANALYZE" if analyze else "EXPLAIN"

    with conn.cursor() as cur:
        cur.execute(f"{keyword} {query}")
        plan_rows = cur.fetchall()

        for plan_row in plan_rows:
            print(plan_row[0])

## First step
Compute size and time for executing the queries without additional structure support. Record the size of the result set.


In the `cost=A..B` field shown for each plan node, the first value (A) is the startup cost and the second (B) is the total cost.

Startup Cost: This represents the amount of work the query planner estimates is required before the first row can be returned. For a sequential scan (Seq Scan), this value is typically very low or zero because the first row can be returned almost immediately.

Total Cost: This represents the total estimated cost to execute the entire query. It is the sum of the startup cost and the cost to process all rows.

## how to compute the size needed to execute the query?

## Query 1

In [5]:
# Query 1: for each (l_returnflag, l_linestatus) group of lineitem rows with
# l_shipdate <= 1998-12-01 minus 90 days, compute the sums of quantity, price,
# discounted price and charge, the averages of quantity, price and discount,
# and the row count; results are ordered by the two grouping columns.
# The statement's terminating ";" is part of the string.
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

In [6]:
# Planner switches for the baseline run: only sequential scans stay enabled.
# Each pair becomes one "SET name = value;" statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "off"),
    ("enable_bitmapscan", "off"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "off"),
    ("enable_partition_pruning", "off"),
    ("enable_partitionwise_join", "off"),
    ("enable_partitionwise_aggregate", "off"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

explain_analyze(query_1)

GroupAggregate  (cost=10013743919.39..10015959934.91 rows=6 width=236) (actual time=199101.899..326429.833 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Sort  (cost=10013743919.39..10013891653.74 rows=59093743 width=25) (actual time=155823.036..197391.279 rows=59142609 loops=1)
        Sort Key: l_returnflag, l_linestatus
        Sort Method: external merge  Disk: -2041039kB
        ->  Seq Scan on lineitem  (cost=0.00..1874349.70 rows=59093743 width=25) (actual time=0.632..37558.095 rows=59142609 loops=1)
              Filter: (l_shipdate <= '1998-09-02 00:00:00'::timestamp without time zone)
              Rows Removed by Filter: 843443
Planning Time: 24.311 ms
Execution Time: 326708.396 ms


Let's try an index on (l_returnflag, l_linestatus). Since these columns have few distinct values (3 and 2 respectively), a bitmap index would be a natural choice, but PostgreSQL does not provide a bitmap index type.

Let's use a B-tree instead: a hash index cannot be built on a pair of columns.

Maybe we can leverage bitmap scan.

In [17]:
# Build the composite index on (l_returnflag, l_linestatus) and report how
# long the CREATE INDEX statement took.
with conn.cursor() as cur:

    # perf_counter() is a monotonic clock meant for measuring intervals;
    # time.time() can jump if the system clock is adjusted during the build.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_l_returnflag_linestatus ON lineitem (l_returnflag, l_linestatus);")
    end_time = time.perf_counter()
    print(f"Time to create index on lineitem (l_returnflag, l_linestatus): {end_time - start_time} seconds")

    # Commit so the new index persists beyond this transaction.
    conn.commit()

Time to create index on lineitem (l_returnflag, l_linestatus): 83.83242297172546 seconds


In [12]:
# Report the on-disk size of idx_l_returnflag_linestatus in MB and GB.
with conn.cursor() as cur:

    cur.execute("SELECT pg_relation_size('idx_l_returnflag_linestatus');")
    index_size = cur.fetchone()[0]  # size in bytes
    # The index was created without a USING clause, so it is a B-tree;
    # the previous label wrongly called it a gin index.
    print(f"Size of B-tree index on lineitem (l_returnflag, l_linestatus): {index_size/(1024**2)} MB")
    print(f"Size of B-tree index on lineitem (l_returnflag, l_linestatus): {index_size/(1024**3)} GB")

Size of gin index on lineitem (l_returnflag, l_linestatus): 396.4609375 MB
Size of gin index on lineitem (l_returnflag, l_linestatus): 0.38716888427734375 GB


In [8]:
# Planner switches for this run; each pair becomes one "SET name = value;"
# statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

explain_analyze(query_1)

Finalize GroupAggregate  (cost=2299574.59..2299576.54 rows=6 width=236) (actual time=53554.178..53557.148 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2299574.59..2299575.99 rows=12 width=236) (actual time=53554.137..53557.081 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2298574.56..2298574.58 rows=6 width=236) (actual time=53488.994..53488.997 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2298574.35..2298574.48 rows=6 width=236) (actual time=53488.926..53488.934 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

In [6]:
# Planner switches for this run; each pair becomes one "SET name = value;"
# statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

explain_analyze(query_1)

Finalize GroupAggregate  (cost=2299574.59..2299576.54 rows=6 width=236) (actual time=53311.966..53313.894 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2299574.59..2299575.99 rows=12 width=236) (actual time=53311.940..53313.849 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2298574.56..2298574.58 rows=6 width=236) (actual time=53288.852..53288.858 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2298574.35..2298574.48 rows=6 width=236) (actual time=53287.944..53287.952 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

In [18]:
# Build the index on l_shipdate and report how long the CREATE INDEX
# statement took.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter() is a monotonic clock meant for measuring intervals;
    # time.time() can jump if the system clock is adjusted during the build.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_l_shipdate ON lineitem (l_shipdate);")
    end_time = time.perf_counter()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    # Commit so the new index persists beyond this transaction.
    conn.commit()

Time to create index on lineitem: 52.21028995513916 seconds


In [11]:
# Report the on-disk size of idx_l_shipdate in MB and GB.
with conn.cursor() as cur:

    cur.execute("SELECT pg_relation_size('idx_l_shipdate');")
    index_size = cur.fetchone()[0]  # size in bytes
    # idx_l_shipdate was created without a USING clause, so it is a B-tree;
    # the previous label wrongly called it a gin index.
    print(f"Size of B-tree index on lineitem (l_shipdate): {index_size/(1024**2)} MB")
    print(f"Size of B-tree index on lineitem (l_shipdate): {index_size/(1024**3)} GB")

Size of gin index on lineitem (l_shipdate): 397.546875 MB
Size of gin index on lineitem (l_shipdate): 0.3882293701171875 GB


In [9]:
# Planner switches for this run (sequential scans off, index and bitmap
# scans on); each pair becomes one "SET name = value;" statement, issued
# in this order.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [10]:
# Print the EXPLAIN ANALYZE plan for query_1 (analyze defaults to True).
explain_analyze(query_1)

Finalize GroupAggregate  (cost=2957794.83..2957796.79 rows=6 width=236) (actual time=49054.213..49072.826 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2957794.83..2957796.23 rows=12 width=236) (actual time=49054.173..49072.770 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2956794.81..2956794.82 rows=6 width=236) (actual time=49016.941..49016.946 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2956794.60..2956794.73 rows=6 width=236) (actual time=49016.872..49016.881 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

### Disable hash aggregate

In [19]:
# Planner switches for this run, with hash aggregation turned off; each
# pair becomes one "SET name = value;" statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_hashagg", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [20]:
# Print the EXPLAIN ANALYZE plan for query_1 (analyze defaults to True).
explain_analyze(query_1)

Finalize GroupAggregate  (cost=6886663.57..7810032.42 rows=6 width=236) (actual time=144068.798..161691.701 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=6886663.57..7810031.86 rows=12 width=236) (actual time=143910.709..161691.319 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial GroupAggregate  (cost=6885663.54..7809030.45 rows=6 width=236) (actual time=121862.243..142262.407 rows=4 loops=3)
              Group Key: l_returnflag, l_linestatus
              ->  Sort  (cost=6885663.54..6947221.33 rows=24623114 width=25) (actual time=115009.475..121910.610 rows=19714203 loops=3)
                    Sort Key: l_returnflag, l_linestatus
                    Sort Method: external merge  Disk: 723504kB
                    Worker 0:  Sort Method: external merge  Disk: 712960kB
                    Worker 1:  Sort Method: external merge  Disk: 716928kB
                    ->  Parallel Bitmap Heap Scan on lineitem  (co

In [21]:
# Planner switches for this run (bitmap scans off, index-only scans on);
# each pair becomes one "SET name = value;" statement, issued in this order.
# NOTE(review): enable_hashagg is not set here, so it keeps whatever value
# the session already has -- confirm that is intended for this run.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "off"),
    ("enable_indexonlyscan", "on"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [22]:
# Print the EXPLAIN ANALYZE plan for query_1 (analyze defaults to True).
explain_analyze(query_1)

Finalize GroupAggregate  (cost=1000.59..124685817.11 rows=6 width=236) (actual time=107006.181..107006.254 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=1000.59..124685816.55 rows=12 width=236) (actual time=106929.952..107006.208 rows=10 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial GroupAggregate  (cost=0.56..124684815.15 rows=6 width=236) (actual time=19294.237..77704.922 rows=3 loops=3)
              Group Key: l_returnflag, l_linestatus
              ->  Parallel Index Scan using idx_l_returnflag_linestatus on lineitem  (cost=0.56..123823006.02 rows=24623114 width=25) (actual time=0.803..57590.155 rows=19714203 loops=3)
                    Filter: (l_shipdate <= '1998-09-02 00:00:00'::timestamp without time zone)
                    Rows Removed by Filter: 281148
Planning Time: 2.877 ms
Execution Time: 107006.392 ms


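In the end, the planner does not use the index on (l_returnflag, l_linestatus) for sorting or grouping unless sequential and bitmap scans are disabled, and the forced index scan is slower (about 107 s), so I would not keep it.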

By contrast, we see an improvement (about 49 s versus about 53 s) when using the index on l_shipdate with bitmap scans enabled. With bitmap scans disabled and index-only scans enabled, the results are worse.

My suggestion is to keep the index on l_shipdate, since it may also help in Query 14, even though the predicate has low selectivity here and the index probably will not help much.

Validate and record the size of the table

In [9]:
# Planner switches used while materialising the query_1 result; each pair
# becomes one "SET name = value;" statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()
    
# Materialise the query_1 result in a temporary table, show its first row,
# report the table's total size in MB, then drop the table again.
with conn.cursor() as cur:
    create_statement = "CREATE TEMP TABLE temp_result AS " + query_1 + ";"
    cur.execute(create_statement)

    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print("First row: \n {}".format(result))

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    size_in_mb = size[0][0] / (1024**2)  # the server reports bytes
    print(f"Size of query_1 result table: {size_in_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 ('A', 'F', Decimal('377518399'), Decimal('566065727797.25'), Decimal('537759104278.0656'), Decimal('559276670892.116819'), Decimal('25.5009751030070973'), Decimal('38237.151008958546'), Decimal('0.05000657454024320463'), 14804077)
Size of query_1 result table: 0.015625 MB


In [11]:
# Original estimate: l_shipdate has selectivity of 90/(6*12*365) = 0,003424657534,
# so an index may be useful.
# NOTE(review): this estimate does not match the measured baseline plan for
# query_1, whose Seq Scan keeps 59142609 rows and removes only 843443, i.e. the
# l_shipdate <= ... predicate retains almost all of lineitem and is not selective.
# Because the predicate is a range (<=), a hash index cannot be used.
# An inverted list (GIN) was considered instead.

# Tried a GIN index through the btree_gin extension, but it gave worse results.
# The experiment is kept below, disabled inside a string literal.

"""
conn.rollback()
with conn.cursor() as cur:
    cur.execute("CREATE EXTENSION IF NOT EXISTS btree_gin;")
    cur.execute("CREATE INDEX idx_lineitem_shipdate ON lineitem USING gin (l_shipdate);")

"""
# We may also try an index on (l_returnflag, l_linestatus), since those columns
# are used in the GROUP BY and ORDER BY clauses.

Time to create gin index on lineitem (l_shipdate): 70.45620393753052 seconds


## Query 10

In [4]:
# Query 10: revenue (sum of l_extendedprice * (1 - l_discount)) per customer,
# over line items with l_returnflag = 'R' whose order date falls in the three
# months starting 1993-10-01; customer, orders, lineitem and nation are joined
# through the WHERE clause and the result is ordered by revenue, descending.
# The statement's terminating ";" is part of the string.
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

The `o_orderdate` predicate is not very selective, so a sequential scan is probably fine for that clause (although a bitmap scan may be beneficial).
Ordering by `revenue` cannot be optimised, because the value is computed in the query.
What can be optimised is the join.

In [5]:
# Planner switches for this run (sequential scans with merge joins only);
# each pair becomes one "SET name = value;" statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "off"),
    ("enable_bitmapscan", "off"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
)

with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [6]:
# Print the EXPLAIN ANALYZE plan for query_10 (analyze defaults to True).
explain_analyze(query_10)

Sort  (cost=3775904.16..3777256.44 rows=540913 width=279) (actual time=47746.573..47825.689 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=3514388.77..3583884.62 rows=540913 width=279) (actual time=46125.765..47087.550 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=3514388.77..3572615.61 rows=450760 width=279) (actual time=46125.688..46642.507 rows=450378 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=3513388.75..3519586.70 rows=225380 width=279) (actual time=46096.908..46474.597 rows=150126 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Sort  (cost=3513388.75..3513952.20 rows=225380 width=259) (actual time=46096.865..46163.459 rows=382361 loops=3)
          

In [7]:
# Planner switches for this run (sequential scans off, index/bitmap scans
# and all join methods on); each pair becomes one "SET name = value;"
# statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [8]:
# Print the EXPLAIN ANALYZE plan for query_10 (analyze defaults to True).
explain_analyze(query_10)

Sort  (cost=2655567.02..2656919.30 rows=540913 width=279) (actual time=41879.185..42004.789 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=2296819.47..2463547.48 rows=540913 width=279) (actual time=39346.447..41312.665 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=2296819.47..2452278.46 rows=450760 width=279) (actual time=39346.397..40837.657 rows=453770 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=2295819.45..2399249.55 rows=225380 width=279) (actual time=39336.334..40734.674 rows=151257 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=2295819.45..2393615.05 rows=225380 width=259) (actual time=39336.290..40402.003 rows=382361 loops=3

It is interesting to note that the plan leverages an index for the l_returnflag predicate, because we have an index on (l_returnflag, l_linestatus). However, that index may be dropped, so we should not rely on it.

Validate and record the size of the table

In [9]:
# Materialise the query_10 result in a temporary table, show its first row,
# report the table's total size in MB, then drop the table again.
with conn.cursor() as cur:
    create_statement = "CREATE TEMP TABLE temp_result AS " + query_10 + ";"
    cur.execute(create_statement)

    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print("First row: \n {}".format(result))

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    size_in_mb = size[0][0] / (1024**2)  # the server reports bytes
    print(f"Size of query_10 result table: {size_in_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## Query 14

In [5]:
# Query 14: percentage of discounted revenue that comes from parts whose
# p_type starts with 'PROMO', over line items shipped in the month starting
# 1995-09-01; lineitem and part are joined on l_partkey = p_partkey.
# The statement's terminating ";" is part of the string.
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

The `l_shipdate` predicate is not selective, so a sequential scan should be fine for it.
We can optimise the join with indexes: we know there is a B-tree index on both `l_partkey` and `p_partkey`.

In [6]:
# Planner switches for this run (nested loop is the only join method left
# on); each pair becomes one "SET name = value;" statement, issued in this
# order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [7]:
# Print the EXPLAIN ANALYZE plan for query_14 (analyze defaults to True).
explain_analyze(query_14)

Finalize Aggregate  (cost=3040576.20..3040576.22 rows=1 width=32) (actual time=54118.429..54143.827 rows=1 loops=1)
  ->  Gather  (cost=3040575.97..3040576.18 rows=2 width=64) (actual time=54116.979..54143.758 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=3039575.97..3039575.98 rows=1 width=64) (actual time=54025.019..54025.020 rows=1 loops=3)
              ->  Nested Loop  (cost=10693.37..3033861.86 rows=326520 width=33) (actual time=178.893..52919.409 rows=249741 loops=3)
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=10692.95..1436018.61 rows=326520 width=16) (actual time=176.917..37625.034 rows=249741 loops=3)
                          Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                          Rows Removed by Index Recheck: 6278762
                          Heap Blocks: exact=14648 lossy=122085
                  

This takes a lot of time: after 45 minutes I gave up.

Let's try to not use the indexes but only hash join.

In [46]:
# Planner switches for this run (all three join methods enabled); each
# pair becomes one "SET name = value;" statement, issued in this order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

In [47]:
# Print the EXPLAIN ANALYZE plan for query_14 (analyze defaults to True).
explain_analyze(query_14)

Finalize Aggregate  (cost=1496775.77..1496775.79 rows=1 width=32) (actual time=20258.203..20292.782 rows=1 loops=1)
  ->  Gather  (cost=1496775.54..1496775.75 rows=2 width=64) (actual time=20257.364..20292.767 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1495775.54..1495775.55 rows=1 width=64) (actual time=20249.104..20249.108 rows=1 loops=3)
              ->  Parallel Hash Join  (cost=75519.02..1490373.79 rows=308671 width=33) (actual time=19799.402..20132.108 rows=249741 loops=3)
                    Hash Cond: (lineitem.l_partkey = part.p_partkey)
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=10109.87..1415441.38 rows=308671 width=16) (actual time=241.545..18951.927 rows=249741 loops=3)
                          Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                          Rows Removed by Index Recheck: 6279367
    

We can see that the optimizer does not use the index on p_partkey; it prefers a hash join.
The bitmap scan is used for the shipdate condition and seems to reduce the execution time by about 9 seconds.

Let's also use the indexes together with the bitmap scan.

In [41]:
# Planner switches for this run (hash join off, merge join and nested loop
# on); each pair becomes one "SET name = value;" statement, issued in this
# order.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting_name, setting_value in planner_settings:
        cur.execute(f"SET {setting_name} = {setting_value};")
    conn.commit()

explain_analyze(query_14, analyze=True)

Aggregate  (cost=1697062.00..1697062.02 rows=1 width=32) (actual time=19220.117..19221.248 rows=1 loops=1)
  ->  Merge Join  (cost=1575410.50..1684097.82 rows=740810 width=33) (actual time=17863.853..18877.299 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92909.67 rows=2000016 width=25) (actual time=0.017..520.218 rows=1999994 loops=1)
        ->  Sort  (cost=1575409.70..1577261.73 rows=740810 width=16) (actual time=17863.821..17969.999 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24288kB
              ->  Gather  (cost=11109.87..1490522.38 rows=740810 width=16) (actual time=123.398..16730.733 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=10109.87..1415441.38 rows=308671 width=16) (actual time=111.862..17221

Using a sort seems to improve the result by about 5 seconds.

The bitmap scan improves the time by about 9 seconds.

As the plan makes clear, the index on lineitem is not used.

let's try with index nested loop

In [48]:
# Start from a clean transaction before changing the session's planner switches.
conn.rollback()
with conn.cursor() as cur:
    # Only nested-loop joins remain available: merge join, hash join and sort are off
    # (index-only scans are off as well).
    for statement in (
        "SET enable_seqscan = on;",
        "SET enable_indexscan = on;",
        "SET enable_bitmapscan = on;",
        "SET enable_indexonlyscan = off;",
        "SET enable_nestloop = on;",
        "SET enable_mergejoin = off;",
        "SET enable_hashjoin = off;",
        "SET enable_sort = off;",
    ):
        cur.execute(statement)
    # Commit the settings before explain_analyze() runs (it begins with conn.rollback()).
    conn.commit()

explain_analyze(query_14, analyze=True)

Finalize Aggregate  (cost=2932787.70..2932787.71 rows=1 width=32) (actual time=21599.203..21601.892 rows=1 loops=1)
  ->  Gather  (cost=2932787.46..2932787.67 rows=2 width=64) (actual time=21597.924..21601.858 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=2931787.46..2931787.47 rows=1 width=64) (actual time=21582.093..21582.095 rows=1 loops=3)
              ->  Nested Loop  (cost=10110.30..2926385.72 rows=308671 width=33) (actual time=133.630..21343.967 rows=249741 loops=3)
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=10109.87..1415441.38 rows=308671 width=16) (actual time=132.549..18454.968 rows=249741 loops=3)
                          Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                          Rows Removed by Index Recheck: 6279367
                          Heap Blocks: exact=14645 lossy=122171
                  

Validate the result and record the size of the result table.

In [20]:
# Materialise the result of query_14 into a temporary table, show its first row and
# report the table's total on-disk size in MB.
#
# Like the other cells, begin with a rollback so the cell also works when an earlier
# statement left the connection in an aborted transaction.
conn.rollback()
try:
    with conn.cursor() as cur:
        cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")

        # Validation: look at the first row of the result.
        cur.execute("SELECT * FROM temp_result;")
        result = cur.fetchone()
        print(f"First row: \n {result}")

        # pg_total_relation_size is divided by 1024**2 to report MB.
        cur.execute("SELECT pg_total_relation_size('temp_result');")
        size = cur.fetchall()
        print(f"Size of query_14 result table: {size[0][0]/(1024**2)} MB")

        cur.execute("DROP TABLE temp_result;")
finally:
    # Nothing here needs to persist: close the transaction so the temp table never
    # outlives the cell (even on failure) and the session is not left idle in a transaction.
    conn.rollback()

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


## Query 17

In [7]:
# Query 17: SUM(l_extendedprice) / 7.0 over the lineitems of parts with
# p_brand = 'Brand#23' and p_container = 'MED BOX', keeping only lineitems whose
# l_quantity is below 0.2 * the average l_quantity of the same part
# (computed by the correlated subquery on lineitem).
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            p_partkey = l_partkey
    );
"""

In [8]:
# Start from a clean transaction before changing the session's planner switches.
conn.rollback()
with conn.cursor() as cur:
    # Merge join, hash join and index-only scans are off; the rest stays enabled.
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = off;")
    cur.execute("SET enable_hashjoin = off;")
    cur.execute("SET enable_sort = on;")
    # Commit the settings before explain_analyze() runs (it begins with conn.rollback()).
    conn.commit()

# Plan only (EXPLAIN without ANALYZE): the recorded output for this cell contains
# estimated costs but no "actual time", and with these settings the query is too slow
# to execute, so ANALYZE would make a Restart & Run All hang on this cell.
explain_analyze(query_17, analyze=False)

Aggregate  (cost=111088725898511.00..111088725898511.02 rows=1 width=32)
  ->  Nested Loop  (cost=0.44..111088725898458.64 rows=20945 width=8)
        ->  Seq Scan on lineitem  (cost=0.00..1724406.52 rows=59986052 width=17)
        ->  Memoize  (cost=0.44..1874376.62 rows=1 width=4)
              Cache Key: lineitem.l_quantity, lineitem.l_partkey
              Cache Mode: binary
              ->  Index Scan using part_pkey on part  (cost=0.43..1874376.61 rows=1 width=4)
                    Index Cond: (p_partkey = lineitem.l_partkey)
                    Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar) AND (lineitem.l_quantity < (SubPlan 1)))
                    SubPlan 1
                      ->  Aggregate  (cost=1874371.73..1874371.74 rows=1 width=32)
                            ->  Seq Scan on lineitem lineitem_1  (cost=0.00..1874371.65 rows=31 width=5)
                                  Filter: (part.p_partkey = l_partkey)


This query takes a lot of time if we don't use indexes.

In [11]:
# Start from a clean transaction before changing the session's planner switches.
conn.rollback()
with conn.cursor() as cur:
    # Every scan and join method is enabled for this run.
    for statement in (
        "SET enable_seqscan = on;",
        "SET enable_indexscan = on;",
        "SET enable_bitmapscan = on;",      # noted alternative: off
        "SET enable_indexonlyscan = on;",   # noted alternative: off
        "SET enable_nestloop = on;",
        "SET enable_mergejoin = on;",
        "SET enable_hashjoin = on;",        # noted alternative: off
        "SET enable_sort = on;",
    ):
        cur.execute(statement)
    # Commit the settings before explain_analyze() runs (it begins with conn.rollback()).
    conn.commit()

explain_analyze(query_17, analyze=True)

We can see that it is already a fast query, but we can try adding indexes on p_container and p_brand as well to see if they speed it up. They have 40 and 25 distinct values respectively, so they are not very selective, but we can try.

In [14]:
# Build a hash index on part.p_brand and report how long the CREATE INDEX took.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_p_brand ON part USING hash (p_brand);")
    end_time = time.perf_counter()
    print(f"Time to create index on part: {end_time - start_time} seconds")
    # Persist the new index (the commit is not part of the measured time).
    conn.commit()

Time to create index on lineitem: 55.932111978530884 seconds


In [15]:
# Build a hash index on part.p_container and report how long the CREATE INDEX took.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_p_container ON part USING hash (p_container);")
    end_time = time.perf_counter()
    print(f"Time to create index on part: {end_time - start_time} seconds")
    # Persist the new index (the commit is not part of the measured time).
    conn.commit()

Time to create index on lineitem: 36.967782974243164 seconds


In [16]:
# Print the name and definition of every index currently defined on `part`,
# to confirm the two hash indexes created above are in place.
check_indexes("part")

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: idx_p_brand
Index Definition: CREATE INDEX idx_p_brand ON public.part USING hash (p_brand)

Index Name: idx_p_container
Index Definition: CREATE INDEX idx_p_container ON public.part USING hash (p_container)



In [17]:
# Start from a clean transaction before changing the session's planner switches.
conn.rollback()
with conn.cursor() as cur:
    # Sequential scans, hash joins and index-only scans are off, steering the planner
    # towards index / bitmap access paths.
    for statement in (
        "SET enable_seqscan = off;",
        "SET enable_indexscan = on;",
        "SET enable_bitmapscan = on;",
        "SET enable_indexonlyscan = off;",
        "SET enable_nestloop = on;",
        "SET enable_mergejoin = on;",
        "SET enable_hashjoin = off;",
        "SET enable_sort = on;",
    ):
        cur.execute(statement)
    # Commit the settings before explain_analyze() runs (it begins with conn.rollback()).
    conn.commit()

explain_analyze(query_17, analyze=True)

Aggregate  (cost=7132257.53..7132257.55 rows=1 width=32) (actual time=19185.859..19185.862 rows=1 loops=1)
  ->  Nested Loop  (cost=4089.74..7132208.29 rows=19695 width=8) (actual time=53.152..19183.788 rows=5526 loops=1)
        ->  Bitmap Heap Scan on part  (cost=4089.74..10564.34 rows=1970 width=4) (actual time=28.314..249.393 rows=2044 loops=1)
              Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
              Heap Blocks: exact=1998
              ->  BitmapAnd  (cost=4089.74..4089.74 rows=1970 width=0) (actual time=27.397..27.398 rows=0 loops=1)
                    ->  Bitmap Index Scan on idx_p_container  (cost=0.00..1567.00 rows=49467 width=0) (actual time=18.831..18.831 rows=50186 loops=1)
                          Index Cond: (p_container = 'MED BOX'::bpchar)
                    ->  Bitmap Index Scan on idx_p_brand  (cost=0.00..2521.50 rows=79667 width=0) (actual time=7.327..7.327 rows=79826 loops=1)
                          Index

We get a small improvement; I don't think it is worth keeping those indexes.

In [None]:
# Disabled cell: everything below sits inside a bare triple-quoted string, so none of it runs.
# If enabled, it would materialise query_17 into a temp table, print that table's
# total size in MB, and drop the table again.
"""

with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

"""

## Part 2: Indexes

Some useful commands:

To create an index:

default is b+tree
CREATE INDEX idx_customer_name ON customer (c_name);

available indexes:

B-tree: The default and most common type of index.

Hash: Used for equality comparisons.

GIN (Generalized Inverted Index): Useful for indexing array values and full-text search.

GiST (Generalized Search Tree): Supports many types of queries, including full-text search.

SP-GiST (Space-Partitioned Generalized Search Tree): Useful for partitioning data.

BRIN (Block Range INdexes): Efficient for large tables where the column values are correlated with their physical location.

other possibilities:

Partial Indexes
Description: Indexes only a portion of a table, based on a condition.
Use Case: When you frequently query a subset of rows.

CREATE INDEX idx_active_customers ON customer (c_name) WHERE active = true;

Expression Indexes
Description: Indexes the result of an expression or function rather than a raw column.
Use Case: When queries involve expressions or function calls.

CREATE INDEX idx_lower_customer_name ON customer ((lower(c_name)));


To disable the indexscan

SET enable_seqscan = on;
SET enable_indexscan = off;
SET enable_bitmapscan = off;

In [9]:
# Build a hash index on lineitem.l_partkey and report how long the CREATE INDEX took.
#
# Like the other index-creation cells, begin with a rollback so the cell also works
# when an earlier statement left the connection in an aborted transaction.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_lineitem_partkey ON lineitem USING hash (l_partkey);")
    end_time = time.perf_counter()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    # Persist the new index (the commit is not part of the measured time).
    conn.commit()

Time to create index on lineitem: 146.5280566215515 seconds


In [10]:
# Report the size of idx_lineitem_partkey, once in MB and once in GB.
with conn.cursor() as cur:
    cur.execute("SELECT pg_relation_size('idx_lineitem_partkey');")
    # The query returns a single row with a single column: the raw size value.
    (index_size,) = cur.fetchone()

print(f"Size of index on lineitem: {index_size/(1024**2)} MB")
print(f"Size of index on lineitem: {index_size/(1024**3)} GB")


Size of index on lineitem: 1896.65625 MB
Size of index on lineitem: 1.852203369140625 GB


In [11]:
# Run EXPLAIN ANALYZE on query_17 with the current planner settings and print the plan.
conn.rollback()
with conn.cursor() as cur:
    cur.execute(f"EXPLAIN ANALYZE {query_17}")
    explain_result = cur.fetchall()

# Each fetched row carries one line of the plan text in its first column.
print("EXPLAIN ANALYZE result:")
for row in explain_result:
    print(row[0])

EXPLAIN ANALYZE result:
Aggregate  (cost=1937805.96..1937805.97 rows=1 width=32) (actual time=240896.545..240896.670 rows=1 loops=1)
  ->  Hash Join  (cost=54683.72..1937756.72 rows=19695 width=8) (actual time=1156.271..240892.848 rows=5526 loops=1)
        Hash Cond: (lineitem.l_partkey = part.p_partkey)
        Join Filter: (lineitem.l_quantity < (SubPlan 1))
        Rows Removed by Join Filter: 55859
        ->  Seq Scan on lineitem  (cost=0.00..1724403.52 rows=59986052 width=17) (actual time=0.335..23018.323 rows=59986052 loops=1)
        ->  Hash  (cost=54659.10..54659.10 rows=1970 width=4) (actual time=989.198..989.319 rows=2044 loops=1)
              Buckets: 2048  Batches: 1  Memory Usage: 88kB
              ->  Gather  (cost=1000.00..54659.10 rows=1970 width=4) (actual time=3.643..975.057 rows=2044 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on part  (cost=0.00..53462.10 rows=821 width=4) (ac

Very good! A query that could barely be executed before now completes in about 4 minutes! But the cost is huge: 1.85 GB ...

In [None]:
# Materialise the result of query_17 into a temporary table, show its first row and
# report the table's total on-disk size in MB.
#
# The printed format matches the output recorded for this cell ("First row: ...") and
# the equivalent query_14 cell. Begin with a rollback so the cell also works when an
# earlier statement left the connection in an aborted transaction.
conn.rollback()
try:
    with conn.cursor() as cur:
        cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")

        # Validation: query_17 is a single aggregate, so the first row is the whole result.
        cur.execute("SELECT * FROM temp_result;")
        result = cur.fetchone()
        print(f"First row: \n {result}")

        # pg_total_relation_size is divided by 1024**2 to report MB.
        cur.execute("SELECT pg_total_relation_size('temp_result');")
        size = cur.fetchall()
        print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")

        cur.execute("DROP TABLE temp_result;")
finally:
    # Nothing here needs to persist: close the transaction so the temp table never
    # outlives the cell (even on failure) and the session is not left idle in a transaction.
    conn.rollback()

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_17 result table: 0.015625 MB


## Materialised view

In [34]:
# Materialised view holding the part ⋈ lineitem join, restricted to the columns
# that Query 17 needs.
query_materialized = """
CREATE MATERIALIZED VIEW part_lineitem AS
SELECT part.p_partkey, part.p_brand, part.p_container, lineitem.l_quantity, lineitem.l_extendedprice
FROM part JOIN lineitem ON p_partkey = l_partkey;
"""

# Start from a clean transaction, as the other cells do.
conn.rollback()
with conn.cursor() as cur:
    # EXPLAIN ANALYZE really executes the statement, so this both creates the view and
    # returns the plan with timings. Print it instead of discarding it, so the
    # creation cost is recorded in the notebook.
    cur.execute(f"EXPLAIN ANALYZE {query_materialized}")
    for line in cur.fetchall():
        print(line[0])
    # Persist the newly created materialised view.
    conn.commit()

In [12]:
# Report the total on-disk size of the part_lineitem materialised view in MB.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    size = cur.fetchall()

# Single row, single column: the raw size, converted to MB for display.
print(f"Size of materialised view: {size[0][0]/(1024**2)} MB")


Size of materialised view: 3906.609375 MB


In [9]:
# Query 17 rewritten on top of the part_lineitem materialised view.
# The string holds two statements:
#   1. a helper view with the average l_quantity per part;
#   2. the SELECT applying the Brand#23 / MED BOX filter and the 0.2 * average threshold.
# CREATE OR REPLACE keeps this re-runnable: a plain CREATE VIEW fails as soon as
# average_quantity already exists from a previous run.
# NOTE(review): unlike query_17, the final SELECT groups by p_partkey, so it returns one
# row per part instead of a single avg_yearly value — confirm this is intended.
query_17_1 = """

CREATE OR REPLACE VIEW average_quantity AS
SELECT
    p_partkey,
    AVG(l_quantity) AS avg_quantity
FROM
    part_lineitem
GROUP BY
    p_partkey;

SELECT
    p_partkey,
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * avg_quantity
        FROM
            average_quantity
        WHERE 
            average_quantity.p_partkey = part_lineitem.p_partkey
    )
GROUP BY
    p_partkey;
"""

In [10]:
# Execute the statements stored in query_17_1 and commit. The result set of the
# final SELECT is not fetched here.
conn.rollback()
with conn.cursor() as cur:
    cur.execute(query_17_1)
conn.commit()

This takes more time than the index-based version. Is something wrong? Maybe there is no index on p_partkey, and that is the problem?