In [1]:
import psycopg2
import time

In [2]:
# establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname = "dw_cs", 
    user = "postgres", 
    host= '172.31.160.1',
    password = "test31415",
    port = 5432
)

In [3]:
# function to check the indexes on a table

def check_indexes(table_name):

    with conn.cursor() as cur:
        query = f"""
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = '{table_name}';
        """

        cur.execute(query)      
        
        indexes = cur.fetchall()
        
        for index in indexes:
            print(f"Index Name: {index[0]}")
            print(f"Index Definition: {index[1]}\n")


# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    conn.rollback()
    with conn.cursor() as cur:
        if analyze:
            cur.execute(f"EXPLAIN ANALYZE {query}")
        else:
            cur.execute(f"EXPLAIN {query}")
        explain = cur.fetchall()

        for line in explain:
            print(line[0])

In [7]:
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UNIQUE INDEX partsupp_pkey ON public.partsupp USING btree (ps_partkey, ps_suppkey)



In [8]:
conn.rollback()
with conn.cursor() as cur:

    cur.execute("SELECT pg_database_size('dw_cs');")
    all_rows = cur.fetchall()
    print(f"Database size: {all_rows[0][0] / (1024**2):.2f} MB")
    print(f"Database size: {all_rows[0][0] / (1024**3):.2f} GB \n")

Database size: 12510.64 MB
Database size: 12.22 GB 



## Query 1

In [4]:
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

In [6]:
conn.rollback()
with conn.cursor() as cur:
        cur.execute("SET enable_seqscan = on;")
        cur.execute("SET enable_indexscan = on;")
        cur.execute("SET enable_bitmapscan = off;")
        cur.execute("SET enable_indexonlyscan = off;")
        cur.execute("SET enable_tidscan = off;")
        cur.execute("SET enable_material = off;")
        cur.execute("SET enable_nestloop = on;")
        cur.execute("SET enable_mergejoin = on;")
        cur.execute("SET enable_hashjoin = off;")
        cur.execute("SET enable_sort = on;")
        cur.execute("SET enable_partition_pruning = on;")
        cur.execute("SET enable_partitionwise_join = on;")
        cur.execute("SET enable_partitionwise_aggregate = on;")
        cur.execute("SET constraint_exclusion = on;")
        conn.commit()
    
explain_analyze(query_1)

Finalize GroupAggregate  (cost=2285091.92..2286484.01 rows=40000 width=236) (actual time=113951.273..113967.152 rows=4 loops=1)
  Group Key: lineitem_67.l_returnflag, lineitem_67.l_linestatus
  ->  Gather Merge  (cost=2285091.92..2285320.37 rows=1958 width=236) (actual time=113950.178..113966.065 rows=45 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2284091.90..2284094.35 rows=979 width=236) (actual time=113830.712..113830.894 rows=15 loops=3)
              Sort Key: lineitem_67.l_returnflag, lineitem_67.l_linestatus
              Sort Method: quicksort  Memory: 35kB
              Worker 0:  Sort Method: quicksort  Memory: 31kB
              Worker 1:  Sort Method: quicksort  Memory: 31kB
              ->  Parallel Append  (cost=88465.10..2284043.27 rows=979 width=236) (actual time=6492.443..113829.350 rows=15 loops=3)
                    ->  Partial HashAggregate  (cost=88635.63..88635.66 rows=1 width=236) (actual time=9076.098..9076.101 rows=

## Query 10

In [7]:
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

In [8]:
conn.rollback()
with conn.cursor() as cur:
        cur.execute("SET enable_seqscan = on;")
        cur.execute("SET enable_indexscan = on;")
        cur.execute("SET enable_bitmapscan = on;")
        cur.execute("SET enable_indexonlyscan = off;")
        cur.execute("SET enable_tidscan = off;")
        cur.execute("SET enable_material = off;")
        cur.execute("SET enable_nestloop = on;")
        cur.execute("SET enable_mergejoin = on;")
        cur.execute("SET enable_hashjoin = off;")
        cur.execute("SET enable_sort = on;")
        cur.execute("SET enable_partition_pruning = on;")
        cur.execute("SET enable_partitionwise_join = on;")
        cur.execute("SET enable_partitionwise_aggregate = on;")
        cur.execute("SET constraint_exclusion = on;")
        conn.commit()

In [9]:
explain_analyze(query_10, analyze=True)

Sort  (cost=2525852187.28..2525945937.28 rows=37500000 width=280) (actual time=30192.629..30200.907 rows=0 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: quicksort  Memory: 25kB
  ->  Finalize GroupAggregate  (cost=1752112632.14..2506522793.67 rows=37500000 width=280) (actual time=30192.610..30200.887 rows=0 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=1752112632.14..2505304043.67 rows=75000000 width=280) (actual time=30192.609..30200.885 rows=0 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1752111632.12..2496646182.71 rows=37500000 width=280) (actual time=30133.596..30133.609 rows=0 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=1752111632.12..2408406811.93 rows=7021649662 width=260) (actual time=30133.594..30133.606 row

## Query 14

In [6]:
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

In [7]:
conn.rollback()
with conn.cursor() as cur:
        cur.execute("SET enable_seqscan = on;")
        cur.execute("SET enable_indexscan = on;")
        cur.execute("SET enable_bitmapscan = on;")
        cur.execute("SET enable_indexonlyscan = off;")
        cur.execute("SET enable_tidscan = off;")
        cur.execute("SET enable_material = off;")
        cur.execute("SET enable_nestloop = on;")
        cur.execute("SET enable_mergejoin = on;")
        cur.execute("SET enable_hashjoin = off;")
        cur.execute("SET enable_sort = on;")
        cur.execute("SET enable_partition_pruning = on;")
        cur.execute("SET enable_partitionwise_join = on;")
        cur.execute("SET enable_partitionwise_aggregate = on;")
        cur.execute("SET constraint_exclusion = on;")
        conn.commit()

In [8]:
explain_analyze(query_14)

Finalize Aggregate  (cost=264122.54..264122.55 rows=1 width=32) (actual time=4294.356..4310.474 rows=1 loops=1)
  ->  Gather  (cost=264122.30..264122.51 rows=2 width=64) (actual time=4280.737..4310.416 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=263122.30..263122.31 rows=1 width=64) (actual time=4213.017..4213.020 rows=1 loops=3)
              ->  Merge Join  (cost=167357.56..257613.09 rows=314812 width=33) (actual time=1819.193..3959.160 rows=250423 loops=3)
                    Merge Cond: (part.p_partkey = lineitem.l_partkey)
                    ->  Parallel Index Scan using part_pkey on part  (cost=0.43..81246.76 rows=833333 width=25) (actual time=0.506..1681.460 rows=666665 loops=3)
                    ->  Sort  (cost=167357.13..169246.00 rows=755548 width=16) (actual time=1807.646..2007.003 rows=751161 loops=3)
                          Sort Key: lineitem.l_partkey
                          Sort Method: external sort 

## Query 17

In [5]:
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            p_partkey = l_partkey
    );
"""

In [7]:
conn.rollback()
with conn.cursor() as cur:
        cur.execute("SET enable_seqscan = on;")
        cur.execute("SET enable_indexscan = on;")
        cur.execute("SET enable_bitmapscan = off;")
        cur.execute("SET enable_indexonlyscan = off;")
        cur.execute("SET enable_tidscan = off;")
        cur.execute("SET enable_material = off;")
        cur.execute("SET enable_nestloop = on;")
        cur.execute("SET enable_mergejoin = on;")
        cur.execute("SET enable_hashjoin = off;")
        cur.execute("SET enable_sort = on;")
        cur.execute("SET enable_partition_pruning = on;")
        cur.execute("SET enable_partitionwise_join = on;")
        cur.execute("SET enable_partitionwise_aggregate = on;")
        cur.execute("SET constraint_exclusion = on;")
        conn.commit()

explain_analyze(query_17, analyze=True)

Aggregate  (cost=149050118.48..149050118.49 rows=1 width=32) (actual time=216692.960..216693.125 rows=1 loops=1)
  ->  Merge Join  (cost=13524380.35..149050070.07 rows=19362 width=8) (actual time=158996.674..216690.631 rows=3840 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        Join Filter: (lineitem.l_quantity < (SubPlan 1))
        Rows Removed by Join Filter: 57720
        ->  Sort  (cost=54761.29..54766.13 rows=1936 width=4) (actual time=1018.677..1020.761 rows=2044 loops=1)
              Sort Key: part.p_partkey
              Sort Method: quicksort  Memory: 49kB
              ->  Gather  (cost=1000.00..54655.60 rows=1936 width=4) (actual time=9.140..1017.880 rows=2044 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on part  (cost=0.00..53462.00 rows=807 width=4) (actual time=7.158..917.794 rows=681 loops=3)
                          Filter: ((p_brand = 'Brand#23'::bpchar) AND

In [1]:
conn.close()

NameError: name 'conn' is not defined