In [2]:
import os
import time
from getpass import getpass

import psycopg2

# establish a connection to the PostgreSQL database
# The password is deliberately not stored in the notebook: it is read from the
# PGPASSWORD environment variable, and prompted for interactively when that
# variable is unset or empty.
conn = psycopg2.connect(
    dbname = "dw_cs",
    user = "postgres",
    host= 'localhost',
    password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password for user 'postgres': "),
    port = 5432
)

In [3]:
def check_indexes(table_name):
    """Print the name and definition of every index on ``table_name``.

    Queries the ``pg_indexes`` catalog view through the module-level
    connection ``conn`` and prints one "Index Name" / "Index Definition"
    pair per matching row. Nothing is printed when no index matches.

    Args:
        table_name: Unqualified relation name compared against
            ``pg_indexes.tablename``.
    """
    query = """
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = %s;
        """

    with conn.cursor() as cur:
        # The name is bound as a query parameter rather than formatted into
        # the SQL text, so quotes in it cannot break or alter the statement.
        cur.execute(query, (table_name,))

        for index_name, index_def in cur.fetchall():
            print(f"Index Name: {index_name}")
            print(f"Index Definition: {index_def}\n")

# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    """Print the PostgreSQL plan for ``query``, one plan line per output line.

    Any open transaction on the module-level ``conn`` is rolled back first.
    With ``analyze`` true the statement is prefixed with ``EXPLAIN ANALYZE``,
    otherwise with plain ``EXPLAIN``.

    Args:
        query: SQL text to be explained.
        analyze: Selects ``EXPLAIN ANALYZE`` (True) or ``EXPLAIN`` (False).
    """
    keyword = "EXPLAIN ANALYZE" if analyze else "EXPLAIN"
    statement = f"{keyword} {query}"

    conn.rollback()
    with conn.cursor() as cur:
        cur.execute(statement)
        # Each returned row carries one line of the plan in its first column.
        for plan_row in cur.fetchall():
            print(plan_row[0])

## Check indexes and drop non pk indexes

In [11]:
# Tables whose indexes are listed below.
a = [
    'nation',
    'part',
    'supplier',
    'customer',
    'lineitem',
    'region',
    'partsupp',
    'orders',
]

# Discard any open (possibly aborted) transaction before querying the catalog.
conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UNIQUE INDEX partsupp_pkey ON public.partsupp USING btree (ps_partkey, ps_suppkey)

Index Name: orders_pkey
Index Definition: CREATE UNIQUE INDEX orders_pkey ON public.orders USING btree (o_orderkey)



## Query 1

Is it necessary to create a materialized view (MV) for this query? We do not expect much gain, because of the sequential scan on shipdate.


In [None]:
# Query 1: window aggregates (SUM / AVG / COUNT) partitioned by
# (l_returnflag, l_linestatus), emitted for every row of lineitem; the
# statement has no WHERE clause and no GROUP BY.
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    l_shipdate,
    SUM(l_quantity) OVER (PARTITION BY l_returnflag, l_linestatus) AS sum_qty,
    SUM(l_extendedprice) OVER (PARTITION BY l_returnflag, l_linestatus) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) OVER (PARTITION BY l_returnflag, l_linestatus) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) OVER (PARTITION BY l_returnflag, l_linestatus) AS sum_charge,
    AVG(l_quantity) OVER (PARTITION BY l_returnflag, l_linestatus) AS avg_qty,
    AVG(l_extendedprice) OVER (PARTITION BY l_returnflag, l_linestatus) AS avg_price,
    AVG(l_discount) OVER (PARTITION BY l_returnflag, l_linestatus) AS avg_disc,
    COUNT(*) OVER (PARTITION BY l_returnflag, l_linestatus) AS count_order
FROM
    lineitem

"""

# analyze defaults to True, so this issues EXPLAIN ANALYZE for the query.
explain_analyze(query_1)

## First try with smaller table

In [None]:
# Planner switches applied before building the part/lineitem view: hash join,
# hash aggregation and index-only scans are turned off, everything else is on.
planner_settings = {
    "enable_seqscan": "on",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "off",
    "enable_sort": "on",
    "enable_hashagg": "off",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# Join of part and lineitem on the part key, keeping only the listed columns.
query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem AS

SELECT 
    part.p_brand, 
    part.p_container, 
    part.p_type,
    lineitem.l_partkey,
    lineitem.l_quantity, 
    lineitem.l_extendedprice, 
    lineitem.l_shipdate, 
    lineitem.l_discount
FROM part JOIN lineitem ON p_partkey = l_partkey;


"""

# Plain EXPLAIN (analyze = False): show the plan only.
explain_analyze(query_materialized, analyze = False)

In [None]:
# Build the materialized view and report how long the build took.
with conn.cursor() as cur:
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments during this long-running statement.
    start_time = time.perf_counter()
    cur.execute(query_materialized)
    end_time = time.perf_counter()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

In this case I used a merge join.

Using a hash join, the time was 3 minutes and 40 seconds.

In [8]:
# Report the total size of the part_lineitem relation in MB and in GB.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    size = cur.fetchall()

    # Single row, single column: the size in bytes.
    size_in_bytes = size[0][0]
    size_in_mb = size_in_bytes/(1024**2)
    size_in_gb = size_in_bytes/(1024**3)
    print(f"Size of materialised view: {size_in_mb} MB")
    print(f"Size of materialised view: {size_in_gb} GB")

Size of materialised view: 5782.5078125 MB


In [4]:
# Discard any open transaction, then list the indexes defined on part_lineitem.
conn.rollback()
check_indexes('part_lineitem')

Index Name: idx_pl_shipdate
Index Definition: CREATE INDEX idx_pl_shipdate ON public.part_lineitem USING btree (l_shipdate)



### Query 14

In [6]:
# Planner switches for query 14: everything enabled except index-only scans.
planner_settings = {
    "enable_seqscan": "on",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "on",
    "enable_sort": "on",
    "enable_hashagg": "on",
    "enable_material": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# Query 14 over the materialized view: percentage of discounted revenue that
# comes from parts whose type starts with 'PROMO', for rows shipped in the
# month starting 1995-09-01.
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""


In [7]:
# analyze defaults to True, so this issues EXPLAIN ANALYZE for query 14.
explain_analyze(query_14)

Finalize Aggregate  (cost=1167103.21..1167103.23 rows=1 width=32) (actual time=25889.548..25891.726 rows=1 loops=1)
  ->  Gather  (cost=1167102.98..1167103.19 rows=2 width=64) (actual time=25889.330..25891.705 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1166102.98..1166102.99 rows=1 width=64) (actual time=25860.486..25860.487 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..1160576.82 rows=315780 width=33) (actual time=0.140..25640.208 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 11.105 ms
Execution Time: 25892.157 ms


The problem is that it does a sequential scan, because there is no index on shipdate.

Size of the result table

In [10]:
# Store the query 14 result in a temporary table so that its size can be
# measured, then drop the table again.
with conn.cursor() as cur:
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")

    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    size_in_mb = size[0][0]/(1024**2)  # bytes -> MB
    print(f"Size of query_14 result table: {size_in_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_1 result table: 0.015625 MB


### Query 17

In [12]:
# Query 17, view-based formulation. This string holds TWO statements:
#   1. CREATE VIEW l_quantity_avg: average l_quantity per l_partkey;
#   2. EXPLAIN ANALYZE of the query that joins part_lineitem to that view.
# Because it is a multi-statement script it cannot be embedded in another
# statement (for example after "CREATE TEMP TABLE ... AS").
query_17 = """

CREATE VIEW l_quantity_avg AS

SELECT 
    l_partkey,
    AVG(l_quantity) AS avg_quantity
FROM
    part_lineitem
GROUP BY l_partkey;

EXPLAIN ANALYZE

SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem JOIN l_quantity_avg q_avg ON part_lineitem.l_partkey = q_avg.l_partkey
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < ( 0.2 * q_avg.avg_quantity);
    
"""

In [13]:
# Run the multi-statement query_17 script (CREATE VIEW, then EXPLAIN ANALYZE).
# NOTE(review): the EXPLAIN output is never fetched and nothing is committed
# in this cell - confirm whether that is intended.
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute(query_17)

In [14]:
# The SELECT behind the l_quantity_avg view on its own, to inspect its plan.
query_view = """  
SELECT 
    l_partkey,
    AVG(l_quantity) AS avg_quantity
FROM
    part_lineitem
GROUP BY l_partkey;
"""

# Plain EXPLAIN (analyze = False): show the plan only.
explain_analyze(query_view, analyze = False)

Finalize GroupAggregate  (cost=2910482.68..2968369.91 rows=221918 width=36)
  Group Key: l_partkey
  ->  Gather Merge  (cost=2910482.68..2962267.16 rows=443836 width=36)
        Workers Planned: 2
        ->  Sort  (cost=2909482.66..2910037.45 rows=221918 width=36)
              Sort Key: l_partkey
              ->  Partial HashAggregate  (cost=2636843.60..2883707.71 rows=221918 width=36)
                    Group Key: l_partkey
                    Planned Partitions: 16
                    ->  Parallel Seq Scan on part_lineitem  (cost=0.00..1035612.30 rows=24994830 width=9)


In [14]:
# Query 17 as a single SELECT: the per-part average quantity is computed by a
# correlated subquery on part_lineitem instead of going through a view.
query_17_1 = """

SELECT
    SUM(pl1.l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem pl1
WHERE
    pl1.p_brand = 'Brand#23'
    AND pl1.p_container = 'MED BOX'
    AND pl1.l_quantity < (
        SELECT
            0.2 * AVG(pl2.l_quantity)
        FROM
            part_lineitem pl2
        WHERE
            pl2.l_partkey = pl1.l_partkey
        GROUP BY
            pl2.l_partkey
    );

"""

# EXPLAIN ANALYZE for the single-SELECT formulation.
explain_analyze(query_17_1, analyze = True)

Aggregate  (cost=44694664571.84..44694664571.86 rows=1 width=32) (actual time=16144.149..16144.150 rows=1 loops=1)
  ->  Seq Scan on part_lineitem pl1  (cost=0.00..44694664526.49 rows=18139 width=8) (actual time=4.731..16132.086 rows=5526 loops=1)
        Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar) AND (l_quantity < (SubPlan 1)))
        Rows Removed by Filter: 59980526
        SubPlan 1
          ->  GroupAggregate  (cost=0.44..745.05 rows=1 width=36) (actual time=0.098..0.098 rows=1 loops=61385)
                ->  Index Scan using idx_pl_partkey on part_lineitem pl2  (cost=0.44..744.36 rows=270 width=9) (actual time=0.046..0.096 rows=31 loops=61385)
                      Index Cond: (l_partkey = pl1.l_partkey)
Planning Time: 0.169 ms
Execution Time: 16144.216 ms


Since there is no index on l_partkey, the GROUP BY is very costly anyway.

Size of result table

In [None]:
# Store the query 17 result in a temporary table so that its size can be
# measured, then drop the table again.
#
# query_17 cannot be used here: it is a multi-statement script (CREATE VIEW
# followed by EXPLAIN ANALYZE), so "CREATE TEMP TABLE ... AS {query_17}" is
# not valid SQL. The single-SELECT formulation query_17_1 is used instead.
with conn.cursor() as cur:

    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17_1};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    # pg_total_relation_size reports bytes; convert to MB for display.
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

## Trying with small table on query 10

In [10]:
# Planner switches for building the customer/orders/lineitem/nation view:
# everything enabled except index-only scans.
planner_settings = {
    "enable_seqscan": "on",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "on",
    "enable_sort": "on",
    "enable_hashagg": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# Four-way join of customer, orders, lineitem and nation, keeping only the
# columns listed in the SELECT.
query_materialized = """

CREATE MATERIALIZED VIEW customer_order_lineitem_nation AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    n.n_name,
    c.c_address,
    c.c_phone,
    c.c_comment,
    -- c.c_nationkey, not needed in the query, so not included
    l.l_returnflag,
    -- l.l_orderkey, not needed in the query, so not included
    l.l_discount,
    l.l_extendedprice,
    o.o_orderdate

FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    lineitem l ON l.l_orderkey = o.o_orderkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;
"""

# Plain EXPLAIN (analyze = False): show the plan only.
explain_analyze(query_materialized, analyze = False)

Hash Join  (cost=773460.56..4508576.09 rows=59991596 width=265)
  Hash Cond: (c.c_nationkey = n.n_nationkey)
  ->  Hash Join  (cost=773459.00..4324400.33 rows=59991596 width=165)
        Hash Cond: (o.o_custkey = c.c_custkey)
        ->  Hash Join  (cost=671655.00..3329862.94 rows=59991596 width=22)
              Hash Cond: (l.l_orderkey = o.o_orderkey)
              ->  Seq Scan on lineitem l  (cost=0.00..1724458.96 rows=59991596 width=18)
              ->  Hash  (cost=410912.00..410912.00 rows=15000000 width=12)
                    ->  Seq Scan on orders o  (cost=0.00..410912.00 rows=15000000 width=12)
        ->  Hash  (cost=50827.00..50827.00 rows=1500000 width=147)
              ->  Seq Scan on customer c  (cost=0.00..50827.00 rows=1500000 width=147)
  ->  Hash  (cost=1.25..1.25 rows=25 width=108)
        ->  Seq Scan on nation n  (cost=0.00..1.25 rows=25 width=108)


In [11]:
# Build the materialized view and report how long the build took.
with conn.cursor() as cur:
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments during this long-running statement.
    start_time = time.perf_counter()
    cur.execute(query_materialized)
    end_time = time.perf_counter()
    print(f"Time taken to create materialized view: {end_time - start_time} seconds")
    conn.commit()

Time taken to create materialized view: 428.36515831947327 seconds


In [9]:
# List the indexes defined on the customer_order_lineitem_nation view.
check_indexes('customer_order_lineitem_nation')

Index Name: idx_l_returnflag_linestatus
Index Definition: CREATE INDEX idx_l_returnflag_linestatus ON public.customer_order_lineitem_nation USING btree (o_orderdate)



In [8]:
# Query 10 over the materialized view: revenue per customer for returned
# items (l_returnflag = 'R') ordered in the three months starting 1993-10-01,
# sorted by revenue in descending order.
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer_order_lineitem_nation
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""


In [13]:
# Planner switches for query 10: everything enabled except index-only scans.
planner_settings = {
    "enable_seqscan": "on",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "on",
    "enable_sort": "on",
    "enable_hashagg": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# EXPLAIN ANALYZE for query 10 under these settings.
explain_analyze(query_10)

Sort  (cost=2394038.81..2395473.69 rows=573953 width=202) (actual time=80025.838..80097.090 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=2139180.42..2225353.56 rows=573953 width=202) (actual time=78299.289..79409.874 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=2139180.42..2206923.97 rows=500230 width=202) (actual time=78299.272..78984.661 rows=447671 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=2138180.40..2148185.00 rows=250115 width=202) (actual time=78212.294..78715.281 rows=149224 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=2138180.40..2138805.68 rows=250115 width=182) (actual time=78

In [None]:
# Store the query 10 result in a temporary table so that its size can be
# measured, then drop the table again.
with conn.cursor() as cur:
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")

    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    size_in_mb = size[0][0]/(1024**2)  # bytes -> MB
    print(f"Size of query_10 result table: {size_in_mb} MB")

    cur.execute("DROP TABLE temp_result;")

## Materialisation + indexes

To optimize query 14 we could put an index on shipdate.

To optimize query 17 we could put an index on l_partkey.

To optimize query 10 we could put an index on orderdate.

In [4]:
# Create the l_shipdate index on part_lineitem and report the build time.
with conn.cursor() as cur:

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments while the index is being built.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_pl_shipdate ON part_lineitem (l_shipdate);")
    end_time = time.perf_counter()
    print(f"Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): 50.39820599555969 seconds


In [8]:
# Turn sequential scans off so the planner is pushed towards the l_shipdate
# index; index scans, bitmap scans and materialization stay on.
planner_settings = {
    "enable_seqscan": "off",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_material": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# EXPLAIN ANALYZE for query 14 under these settings.
explain_analyze(query_14)

Finalize Aggregate  (cost=1175828.39..1175828.40 rows=1 width=32) (actual time=25657.520..25660.617 rows=1 loops=1)
  ->  Gather  (cost=1175828.15..1175828.36 rows=2 width=64) (actual time=25657.054..25660.586 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1174828.15..1174828.16 rows=1 width=64) (actual time=25627.173..25627.174 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem  (cost=10340.74..1169302.00 rows=315780 width=33) (actual time=78.836..25405.574 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 10732897
                    Heap Blocks: exact=18043 lossy=143761
                    ->  Bitmap Index Scan on idx_pl_shipdate  (cost=0.00..10151.27 rows=757871 width=0) (actual time=94.152..94.152 rows=749223 loops=1)
                 

In [26]:
# Create the l_partkey index on part_lineitem and report the build time.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments while the index is being built.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_pl_partkey ON part_lineitem (l_partkey);")
    end_time = time.perf_counter()
    print(f"Time to create index idx_pl_partkey ON part_lineitem (l_partkey): {end_time - start_time} seconds")

    conn.commit()

Time to create index idx_pl_partkey ON part_lineitem (l_partkey): 45.294238328933716 seconds


In [27]:
# List the indexes on part_lineitem again, after the CREATE INDEX cells.
check_indexes('part_lineitem')

Index Name: idx_pl_shipdate
Index Definition: CREATE INDEX idx_pl_shipdate ON public.part_lineitem USING btree (l_shipdate)

Index Name: idx_pl_partkey
Index Definition: CREATE INDEX idx_pl_partkey ON public.part_lineitem USING btree (l_partkey)



In [30]:
# Sequential scans and index-only scans off, every other planner switch on.
planner_settings = {
    "enable_seqscan": "off",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "on",
    "enable_sort": "on",
    "enable_hashagg": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# EXPLAIN ANALYZE for the single-SELECT form of query 17.
explain_analyze(query_17_1)

Aggregate  (cost=54694664571.84..54694664571.86 rows=1 width=32) (actual time=12511.120..12511.123 rows=1 loops=1)
  ->  Seq Scan on part_lineitem pl1  (cost=10000000000.00..54694664526.49 rows=18139 width=8) (actual time=70.244..12509.829 rows=5526 loops=1)
        Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar) AND (l_quantity < (SubPlan 1)))
        Rows Removed by Filter: 59980526
        SubPlan 1
          ->  GroupAggregate  (cost=0.44..745.05 rows=1 width=36) (actual time=0.048..0.048 rows=1 loops=61385)
                ->  Index Scan using idx_pl_partkey on part_lineitem pl2  (cost=0.44..744.36 rows=270 width=9) (actual time=0.020..0.044 rows=31 loops=61385)
                      Index Cond: (l_partkey = pl1.l_partkey)
Planning Time: 0.321 ms
Execution Time: 12511.285 ms


In [31]:
# Store the query_17_1 result in a temporary table so that its size can be
# measured, then drop the table again.
with conn.cursor() as cur:
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17_1};")

    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    size_in_mb = size[0][0]/(1024**2)  # bytes -> MB
    print(f"Size of query_17 result table: {size_in_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_10 result table: 0.015625 MB


In [36]:
# Re-run the view-based query_17 script after discarding any open transaction.
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute(f"{query_17};")
    # query_17 ends with an EXPLAIN ANALYZE statement, so the row fetched here
    # is presumably the first line of the plan - TODO confirm.
    result = cur.fetchone()
    print(f"First row: \n {result}")

# This is much slower; I would keep query_17_1.


In [5]:
# Create the o_orderdate index on the customer_order_lineitem_nation view and
# report the build time.
with conn.cursor() as cur:

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments while the index is being built.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate);")
    end_time = time.perf_counter()
    print(f"Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): 63.60222887992859 seconds


In [6]:
# Create the c_custkey index on the customer_order_lineitem_nation view and
# report the build time.
with conn.cursor() as cur:

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments while the index is being built.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX idx_coln_customer ON customer_order_lineitem_nation (c_custkey);")
    end_time = time.perf_counter()
    print(f"Time to create idx_coln_customer ON customer_order_lineitem_nation (c_custkey): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_coln_customer ON customer_order_lineitem_nation (c_custkey): 64.0774028301239 seconds


In [14]:
# Sequential scans and index-only scans off, every other planner switch on.
planner_settings = {
    "enable_seqscan": "off",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "on",
    "enable_sort": "on",
    "enable_hashagg": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()


# NOTE: the saved output of this cell was produced without the custkey index.
# Do not re-run this cell, so that the output stays available for comparison.
explain_analyze(query_10)

Sort  (cost=2498190.35..2499625.23 rows=573953 width=202) (actual time=59691.971..59780.698 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=2243331.96..2329505.10 rows=573953 width=202) (actual time=57336.988..58705.652 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=2243331.96..2311075.52 rows=500230 width=202) (actual time=57336.961..58160.311 rows=470726 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=2242331.94..2252336.54 rows=250115 width=202) (actual time=57295.321..57826.542 rows=156909 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=2242331.94..2242957.22 rows=250115 width=182) (actual time=57

In [9]:
# Sequential scans and index-only scans off, every other planner switch on.
planner_settings = {
    "enable_seqscan": "off",
    "enable_indexscan": "on",
    "enable_bitmapscan": "on",
    "enable_indexonlyscan": "off",
    "enable_nestloop": "on",
    "enable_mergejoin": "on",
    "enable_hashjoin": "on",
    "enable_sort": "on",
    "enable_hashagg": "on",
}

conn.rollback()
with conn.cursor() as cur:
    for setting, state in planner_settings.items():
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# EXPLAIN ANALYZE for query 10 under these settings.
explain_analyze(query_10)

Sort  (cost=2498190.35..2499625.23 rows=573953 width=202) (actual time=57602.135..57672.339 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=2243331.96..2329505.10 rows=573953 width=202) (actual time=55203.079..56845.415 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=2243331.96..2311075.52 rows=500230 width=202) (actual time=55203.055..56439.821 rows=470817 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=2242331.94..2252336.54 rows=250115 width=202) (actual time=55164.143..55882.190 rows=156939 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=2242331.94..2242957.22 rows=250115 width=182) (actual time=55

It seems that the planner does not use the index on custkey.