In [1]:
import psycopg2
import time

# establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname = "dw_cs", 
    user = "postgres", 
    host= 'localhost',
    password = "Mu34zi72",
    port = 5432
)

In [2]:
def check_indexes(table_name):

    with conn.cursor() as cur:
        query = f"""
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = '{table_name}';
        """

        cur.execute(query)      
        
        indexes = cur.fetchall()
        
        for index in indexes:
            print(f"Index Name: {index[0]}")
            print(f"Index Definition: {index[1]}\n")

# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    conn.rollback()
    with conn.cursor() as cur:
        if analyze:
            cur.execute(f"EXPLAIN ANALYZE {query}")
        else:
            cur.execute(f"EXPLAIN {query}")
        explain = cur.fetchall()

        for line in explain:
            print(line[0])

## Check indexes and drop non pk indexes

In [3]:
a = ['nation', 'part', 'supplier', 'customer', 'lineitem', 'region', 'partsupp', 'orders']

for table in a:
    check_indexes(table)
    print("\n")

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)





In [17]:
query_drop_idx = """
DROP INDEX IF EXISTS idx_p_brand;
DROP INDEX IF EXISTS idx_p_container;
DROP INDEX IF EXISTS idx_lineitem_partkey;
DROP INDEX IF EXISTS idx_l_returnflag_linestatus;
DROP INDEX IF EXISTS idx_l_shipdate;
DROP INDEX IF EXISTS idx_o_orderdate;
"""

with conn.cursor() as cur:
    cur.execute(query_drop_idx) 
    conn.commit()

In [None]:
for table in a:
    check_indexes(table)
    print("\n")

## First try with smaller table

In [None]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem AS

SELECT 
    part.p_brand, 
    part.p_container, 
    part.p_type,
    lineitem.l_partkey, 
    lineitem.l_quantity, 
    lineitem.l_extendedprice, 
    lineitem.l_shipdate, 
    lineitem.l_discount, 
FROM part JOIN lineitem ON p_partkey = l_partkey;


"""

explain_analyze(query_materialized, analyze = False)

WindowAgg  (cost=6497685.03..15203148.40 rows=59986052 width=110)
  ->  Merge Join  (cost=6497685.03..14303357.62 rows=59986052 width=78)
        Merge Cond: (lineitem.l_partkey = part.p_partkey)
        ->  Gather Merge  (cost=6496970.92..13483335.20 rows=59986052 width=35)
              Workers Planned: 2
              ->  Sort  (cost=6495970.90..6558456.37 rows=24994188 width=35)
                    Sort Key: lineitem.l_partkey
                    ->  Parallel Seq Scan on lineitem  (cost=0.00..1374484.88 rows=24994188 width=35)
        ->  Index Scan using part_pkey on part  (cost=0.43..92909.43 rows=2000000 width=47)


In [None]:
conn.rollback()
with conn.cursor() as cur:
    
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    size = cur.fetchall()
    print(f"Size of materialised view: {size[0][0]/(1024**2)} MB")

### Query 14

In [None]:
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

explain_analyze(query_14)

Size of the result table

In [None]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

### Query 17

In [None]:
query_17 = """

CREATE VIEW l_quantity_avg AS

SELECT 
    l_partkey,
    AVG(l_quantity) AS avg_quantity
FROM
    part_lineitem
GROUP BY l_partkey;

SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem JOIN l_quantity_avg q_avg ON l_partkey = q_avg.l_partkey
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (0.2 * q_avg.avg_quantity);
"""

explain_analyze(query_17)

Size of result table

In [None]:
with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

## Trying with small table on query 10

In [3]:
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")
    cur.execute("SET enable_indexonlyscan = off;")
    cur.execute("SET enable_nestloop = on;")
    cur.execute("SET enable_mergejoin = on;")
    cur.execute("SET enable_hashjoin = on;")
    cur.execute("SET enable_sort = on;")
    cur.execute("SET enable_hashagg = on;")
    conn.commit()

query_materialized = """

CREATE MATERIALIZED VIEW customer_order_lineitem_nation AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    n.n_name,
    c.c_address,
    c.c_phone,
    c.c_comment,
    c.c_nationkey,
    c.c_custkey
    l.l_returnflag,
    l.l_orderkey,
    o.o_orderdate,

FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    lineitem l ON l.l_orderkey = o.o_orderkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;
"""

with conn.cursor() as cur:
    cur.execute(query_materialized)
    for row in cur.fetchall():
        print(row)

('Hash Join  (cost=897529.89..74639748.12 rows=59986052 width=114)',)
('  Hash Cond: (lineitem.l_orderkey = o.o_orderkey)',)
('  ->  WindowAgg  (cost=1000.45..71535435.25 rows=59986052 width=110)',)
('        ->  Gather Merge  (cost=1000.45..70635644.47 rows=59986052 width=78)',)
('              Workers Planned: 2',)
('              ->  Nested Loop  (cost=0.43..63710765.64 rows=24994188 width=78)',)
('                    ->  Parallel Index Scan using part_pkey on part  (cost=0.43..81242.76 rows=833333 width=47)',)
('                    ->  Index Scan using idx_lineitem_partkey on lineitem  (cost=0.00..76.07 rows=29 width=35)',)
('                          Index Cond: (l_partkey = part.p_partkey)',)
('  ->  Hash  (cost=650435.44..650435.44 rows=15000000 width=8)',)
('        ->  Index Scan using orders_pkey on orders o  (cost=0.43..650435.44 rows=15000000 width=8)',)


In [None]:
check_indexes('lineitem_orders_part')

In [14]:
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer_order_lineitem_nation
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

explain_analyze(query_10)

Sort  (cost=2285986.57..2287338.86 rows=540913 width=279) (actual time=94283.995..94529.354 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1894222.39..2093967.03 rows=540913 width=279) (actual time=84268.371..92207.632 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=1894222.39..2080444.21 rows=540913 width=259) (actual time=84268.332..90311.893 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1894222.08..2056103.12 rows=540913 width=259) (actual time=84268.138..89108.177 rows=1147084 loops=1)
                    ->  Merge Join  (cost=1894221.93..2042509.49 rows=540913 width=1