In [2]:
import os
import time

import psycopg2

# Establish a connection to the PostgreSQL database.
# Connection settings are read from the standard libpq environment variables
# so that no credentials live in the notebook. The non-secret settings fall
# back to the local development values.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    # Deliberately no default: export PGPASSWORD before starting the kernel.
    # If it is unset, no password is supplied here and authentication relies
    # on whatever libpq finds on its own (e.g. ~/.pgpass).
    password=os.environ.get("PGPASSWORD"),
    port=int(os.environ.get("PGPORT", "5432")),
)

In [3]:
def check_indexes(table_name):
    """Print the name and definition of every index on ``table_name``.

    The lookup goes through the ``pg_indexes`` catalog view using the
    module-level ``conn`` connection.

    Args:
        table_name: Relation name, compared for equality against
            ``pg_indexes.tablename``.

    Returns:
        None. For each index found, an "Index Name" line and an
        "Index Definition" line (followed by a blank line) are printed;
        nothing is printed when no index matches.
    """
    query = """
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = %s;
        """

    with conn.cursor() as cur:
        # Bind the name as a query parameter instead of formatting it into
        # the SQL text, so names containing quotes cannot break or alter
        # the statement.
        cur.execute(query, (table_name,))
        indexes = cur.fetchall()

    for index_name, index_def in indexes:
        print(f"Index Name: {index_name}")
        print(f"Index Definition: {index_def}\n")

# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    """Print the PostgreSQL plan for ``query``, one plan line per output line.

    Any pending transaction on the module-level ``conn`` is rolled back
    first. With ``analyze`` true the statement is prefixed with
    ``EXPLAIN ANALYZE``; otherwise it is prefixed with plain ``EXPLAIN``.

    Args:
        query: SQL text to place after the EXPLAIN keyword(s).
        analyze: Selects ``EXPLAIN ANALYZE`` (True) or ``EXPLAIN`` (False).

    Returns:
        None. The first column of every returned row is printed.
    """
    conn.rollback()
    prefix = "EXPLAIN ANALYZE" if analyze else "EXPLAIN"
    with conn.cursor() as cur:
        cur.execute(f"{prefix} {query}")
        plan_rows = cur.fetchall()
        # Each row carries a single text column holding one line of the plan.
        for plan_row in plan_rows:
            print(plan_row[0])

In [7]:
# Clear any pending transaction, then pin the planner's enable_* switches.
# The SET statements are committed so that the rollbacks issued by later
# cells do not undo them.
conn.rollback()
with conn.cursor() as cur:
    for setting, state in (
        ("enable_seqscan", "off"),
        ("enable_indexscan", "on"),
        ("enable_bitmapscan", "on"),
        ("enable_indexonlyscan", "off"),
        ("enable_nestloop", "on"),
        ("enable_mergejoin", "on"),
        ("enable_hashjoin", "on"),
        ("enable_sort", "on"),
        ("enable_hashagg", "on"),
    ):
        cur.execute(f"SET {setting} = {state};")
    conn.commit()

# Define the part/lineitem view (with the per-part average quantity as a
# window column) and print the planner's estimate for joining it with orders.
# Despite the variable name, part_lineitem is a plain view, not a
# materialized one.
# CREATE OR REPLACE keeps this cell re-runnable once the view has been
# committed by another cell; a bare CREATE VIEW would fail with
# "relation already exists".
query_materialized = """

CREATE OR REPLACE VIEW part_lineitem AS

SELECT 
    part.p_brand, 
    part.p_container, 
    part.p_type,
    lineitem.l_partkey, 
    lineitem.l_orderkey, 
    lineitem.l_quantity, 
    lineitem.l_extendedprice, 
    lineitem.l_returnflag, 
    lineitem.l_shipdate, 
    lineitem.l_discount, 
    lineitem.l_tax,
    AVG(lineitem.l_quantity) OVER (PARTITION BY lineitem.l_partkey) AS avg_quantity
FROM part JOIN lineitem ON p_partkey = l_partkey;

EXPLAIN

SELECT 
    pl.p_brand, 
    pl.p_container, 
    pl.p_type,
    pl.l_partkey, 
    pl.l_orderkey, 
    pl.l_quantity, 
    pl.l_extendedprice, 
    pl.l_returnflag, 
    pl.l_shipdate, 
    pl.l_discount, 
    pl.l_tax, 
    pl.avg_quantity,
    o.o_orderdate
FROM part_lineitem pl JOIN orders o ON pl.l_orderkey = o.o_orderkey;

"""

with conn.cursor() as cur:
    # Both statements are sent in one call; the rows fetched below are the
    # output of the last one (the EXPLAIN).
    cur.execute(query_materialized)
    for row in cur.fetchall():
        print(row)

('Hash Join  (cost=897529.89..74639748.12 rows=59986052 width=114)',)
('  Hash Cond: (lineitem.l_orderkey = o.o_orderkey)',)
('  ->  WindowAgg  (cost=1000.45..71535435.25 rows=59986052 width=110)',)
('        ->  Gather Merge  (cost=1000.45..70635644.47 rows=59986052 width=78)',)
('              Workers Planned: 2',)
('              ->  Nested Loop  (cost=0.43..63710765.64 rows=24994188 width=78)',)
('                    ->  Parallel Index Scan using part_pkey on part  (cost=0.43..81242.76 rows=833333 width=47)',)
('                    ->  Index Scan using idx_lineitem_partkey on lineitem  (cost=0.00..76.07 rows=29 width=35)',)
('                          Index Cond: (l_partkey = part.p_partkey)',)
('  ->  Hash  (cost=650435.44..650435.44 rows=15000000 width=8)',)
('        ->  Index Scan using orders_pkey on orders o  (cost=0.43..650435.44 rows=15000000 width=8)',)


In [8]:
# Build the part/lineitem view and materialize its join with orders as
# lineitem_orders_part, timing the whole statement batch.
#
# NOTE(review): the join used to be executed as a bare SELECT, which makes
# psycopg2 pull the entire result set into client memory and leaves nothing
# behind in the database. The following cells (size check, index check,
# query_14, query_17) all read a relation named lineitem_orders_part with
# exactly these columns, so the join is materialized under that name here.
# Confirm that this matches the intended experiment.
#
# OR REPLACE / IF NOT EXISTS keep the cell re-runnable. When the materialized
# view already exists the second statement is skipped, so the printed time is
# only meaningful on the first build.
query_materialized = """

CREATE OR REPLACE VIEW part_lineitem AS

SELECT 
    part.p_brand, 
    part.p_container, 
    part.p_type,
    lineitem.l_partkey, 
    lineitem.l_orderkey, 
    lineitem.l_quantity, 
    lineitem.l_extendedprice, 
    lineitem.l_returnflag, 
    lineitem.l_shipdate, 
    lineitem.l_discount, 
    lineitem.l_tax,
    AVG(lineitem.l_quantity) OVER (PARTITION BY lineitem.l_partkey) AS avg_quantity
FROM part JOIN lineitem ON p_partkey = l_partkey;

CREATE MATERIALIZED VIEW IF NOT EXISTS lineitem_orders_part AS
SELECT 
    pl.p_brand, 
    pl.p_container, 
    pl.p_type,
    pl.l_partkey, 
    pl.l_orderkey, 
    pl.l_quantity, 
    pl.l_extendedprice, 
    pl.l_returnflag, 
    pl.l_shipdate, 
    pl.l_discount, 
    pl.l_tax, 
    pl.avg_quantity,
    o.o_orderdate
FROM part_lineitem pl JOIN orders o ON pl.l_orderkey = o.o_orderkey;
"""

# Discard any transaction left open (or aborted) by a previous cell.
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Execution Time: {end_time - start_time}")
    conn.commit()

In [None]:
# Report the size of lineitem_orders_part, as returned by
# pg_total_relation_size, converted from bytes to MB (1024**2 bytes).
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('lineitem_orders_part');")
    rows = cur.fetchall()
    total_bytes = rows[0][0]
    bytes_per_mb = 1024 ** 2
    print(f"Size of materialised view: {total_bytes / bytes_per_mb} MB")

In [None]:
# Print the name and definition of each index on lineitem_orders_part.
check_indexes('lineitem_orders_part')

In [14]:
# Revenue per customer from returned line items (l_returnflag = 'R') on
# orders placed in the three months starting 1993-10-01, joined across
# customer, orders, lineitem and nation and sorted by revenue, highest first.
# Revenue is SUM(l_extendedprice * (1 - l_discount)).
# NOTE(review): there is no LIMIT, so every qualifying customer is sorted and
# returned -- confirm that is intended.
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

# Runs the statement under EXPLAIN ANALYZE and prints the plan.
explain_analyze(query_10)

Sort  (cost=2285986.57..2287338.86 rows=540913 width=279) (actual time=94283.995..94529.354 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1894222.39..2093967.03 rows=540913 width=279) (actual time=84268.371..92207.632 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=1894222.39..2080444.21 rows=540913 width=259) (actual time=84268.332..90311.893 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1894222.08..2056103.12 rows=540913 width=259) (actual time=84268.138..89108.177 rows=1147084 loops=1)
                    ->  Merge Join  (cost=1894221.93..2042509.49 rows=540913 width=1

In [17]:
# Promo revenue share: the percentage of discounted revenue
# (l_extendedprice * (1 - l_discount)) contributed by rows whose p_type
# starts with 'PROMO', over rows shipped in the month starting 1995-09-01.
# Reads from lineitem_orders_part, which must already exist.
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem_orders_part
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

# Runs the statement under EXPLAIN ANALYZE and prints the plan.
explain_analyze(query_14)

Finalize Aggregate  (cost=1100326.86..1100326.87 rows=1 width=32) (actual time=23610.942..23611.029 rows=1 loops=1)
  ->  Gather  (cost=1100326.62..1100326.83 rows=2 width=64) (actual time=23610.912..23611.004 rows=1 loops=1)
        Workers Planned: 2
        Workers Launched: 0
        ->  Partial Aggregate  (cost=1099326.62..1099326.64 rows=1 width=64) (actual time=23583.150..23583.153 rows=1 loops=1)
              ->  Parallel Seq Scan on lineitem_orders_part  (cost=0.00..1098552.00 rows=44264 width=132) (actual time=2.179..23142.598 rows=749223 loops=1)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 59236829
Planning Time: 9.038 ms
Execution Time: 23615.344 ms


In [18]:
# Sum of l_extendedprice, divided by 7.0, over rows of lineitem_orders_part
# with brand 'Brand#23' and container 'MED BOX' whose quantity is below 20%
# of the row's avg_quantity column.
# NOTE(review): avg_quantity is presumably the per-part average quantity
# precomputed in the view, and 7.0 presumably the number of years covered --
# confirm both against the view definition and the data.
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem_orders_part
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (0.2 * avg_quantity);
"""

# Runs the statement under EXPLAIN ANALYZE and prints the plan.
explain_analyze(query_17)

Finalize Aggregate  (cost=1143816.41..1143816.42 rows=1 width=32) (actual time=26930.818..26930.899 rows=1 loops=1)
  ->  Gather  (cost=1143816.19..1143816.40 rows=2 width=32) (actual time=26930.798..26930.880 rows=1 loops=1)
        Workers Planned: 2
        Workers Launched: 0
        ->  Partial Aggregate  (cost=1142816.19..1142816.20 rows=1 width=32) (actual time=26908.808..26908.808 rows=1 loops=1)
              ->  Parallel Seq Scan on lineitem_orders_part  (cost=0.00..1142816.00 rows=74 width=32) (actual time=63.061..26907.140 rows=5526 loops=1)
                    Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar) AND (l_quantity < (0.2 * avg_quantity)))
                    Rows Removed by Filter: 59980526
Planning Time: 18.729 ms
Execution Time: 26930.992 ms


## Trying a single large ("giga") denormalized materialized view

In [None]:
# Materialize one wide, denormalized relation joining customer, orders,
# lineitem, part and nation, with the per-part average quantity precomputed
# as a window column. The build is timed and committed.
# IF NOT EXISTS keeps the cell re-runnable; when the view already exists the
# statement is skipped and the printed time is not a build time.
query_materialized = """

CREATE MATERIALIZED VIEW IF NOT EXISTS customer_order_lineitem_part_nation AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    c.c_phone,
    c.c_address,
    c.c_comment,
    n.n_name,
    o.o_orderkey,
    o.o_orderdate,
    l.l_partkey,
    l.l_extendedprice,
    l.l_discount,
    l.l_returnflag,
    l.l_quantity,
    p.p_brand,
    p.p_container,
    p.p_type,
    AVG(l.l_quantity) OVER (PARTITION BY l.l_partkey) AS avg_quantity
FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    lineitem l ON o.o_orderkey = l.l_orderkey
JOIN
    part p ON l.l_partkey = p.p_partkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;
"""

# Discard any transaction left open (or aborted) by a previous cell.
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Execution Time: {end_time - start_time}")
    conn.commit()

In [None]:
# Define the part/lineitem view, then materialize its join with customer,
# orders and nation as customer_order_partlineitem_nation_mv. The build is
# timed and committed.
# OR REPLACE / IF NOT EXISTS keep the cell re-runnable: part_lineitem may
# already have been created and committed by an earlier cell, in which case a
# bare CREATE VIEW fails with "relation already exists". When the
# materialized view already exists it is not rebuilt, so the printed time is
# only meaningful on the first build.
query_materialized = """

CREATE OR REPLACE VIEW part_lineitem AS
SELECT 
    part.p_brand, 
    part.p_container, 
    part.p_type,
    lineitem.l_partkey, 
    lineitem.l_orderkey, 
    lineitem.l_quantity, 
    lineitem.l_extendedprice, 
    lineitem.l_returnflag, 
    lineitem.l_shipdate, 
    lineitem.l_discount, 
    lineitem.l_tax,
    AVG(lineitem.l_quantity) OVER (PARTITION BY lineitem.l_partkey) AS avg_quantity
FROM part 
JOIN lineitem ON part.p_partkey = lineitem.l_partkey;

CREATE MATERIALIZED VIEW IF NOT EXISTS customer_order_partlineitem_nation_mv AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    c.c_phone,
    c.c_address,
    c.c_comment,
    n.n_name,
    o.o_orderkey,
    o.o_orderdate,
    pl.p_brand,
    pl.p_container,
    pl.p_type,
    pl.l_partkey,
    pl.l_orderkey,
    pl.l_quantity,
    pl.l_extendedprice,
    pl.l_returnflag,
    pl.l_shipdate,
    pl.l_discount,
    pl.l_tax,
    pl.avg_quantity
FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    part_lineitem pl ON o.o_orderkey = pl.l_orderkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;


"""

# Discard any transaction left open (or aborted) by a previous cell.
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    end_time = time.time()
    print(f"Execution Time: {end_time - start_time}")
    conn.commit()