In [1]:
import os
import time

import psycopg2

# Establish a connection to the PostgreSQL database.
# Each setting can be overridden through the standard libpq environment
# variables (PGDATABASE, PGUSER, PGHOST, PGPASSWORD, PGPORT) so credentials do
# not have to live in the notebook; the fallbacks reproduce the local setup the
# notebook was written against. An alternate host (172.30.160.1) was previously
# kept here as a commented-out line: set PGHOST to use it.
conn = psycopg2.connect(
    dbname = os.environ.get("PGDATABASE", "dw_cs"),
    user = os.environ.get("PGUSER", "postgres"),
    host = os.environ.get("PGHOST", "localhost"),
    password = os.environ.get("PGPASSWORD", "postgres"),
    port = int(os.environ.get("PGPORT", "5432")),
)

In [2]:
def check_indexes(table_name):
    """Print the name and definition of every index on ``table_name``.

    The lookup runs against the ``pg_indexes`` catalog view through the
    module-level ``conn`` connection and filters on ``tablename`` only (no
    schema filter). Nothing is printed when no row matches.

    Args:
        table_name: Name of the table (or materialized view) to inspect.
    """
    query = """
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = %s;
    """

    with conn.cursor() as cur:
        # Bind the table name as a query parameter instead of formatting it
        # into the SQL text: names containing quotes no longer break the
        # statement and cannot inject SQL.
        cur.execute(query, (table_name,))

        for index_name, index_def in cur.fetchall():
            print(f"Index Name: {index_name}")
            print(f"Index Definition: {index_def}\n")

# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    """Print the PostgreSQL plan for ``query``, one plan line per output line.

    Any transaction left open on the shared ``conn`` is rolled back first.
    With ``analyze`` truthy (the default) the statement is prefixed with
    ``EXPLAIN ANALYZE``, which makes PostgreSQL actually run it; otherwise it
    is prefixed with plain ``EXPLAIN``.

    Args:
        query: SQL text of the statement to explain.
        analyze: Whether to use ``EXPLAIN ANALYZE`` instead of ``EXPLAIN``.
    """
    conn.rollback()
    prefix = "EXPLAIN ANALYZE" if analyze else "EXPLAIN"
    with conn.cursor() as cur:
        cur.execute(f"{prefix} {query}")
        plan_rows = cur.fetchall()

    # Each result row holds a single text column with one line of the plan.
    for plan_row in plan_rows:
        print(plan_row[0])

## Check indexes

In [3]:
# Base tables whose indexes we want to list.
a = [
    'nation', 'part', 'supplier', 'customer',
    'lineitem', 'region', 'partsupp', 'orders',
]

# Discard any pending transaction, then print the indexes table by table.
conn.rollback()
for table in a:
    check_indexes(table)

Index Name: nation_pkey
Index Definition: CREATE UNIQUE INDEX nation_pkey ON public.nation USING btree (n_nationkey)

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: supplier_pkey
Index Definition: CREATE UNIQUE INDEX supplier_pkey ON public.supplier USING btree (s_suppkey)

Index Name: customer_pkey
Index Definition: CREATE UNIQUE INDEX customer_pkey ON public.customer USING btree (c_custkey)

Index Name: region_pkey
Index Definition: CREATE UNIQUE INDEX region_pkey ON public.region USING btree (r_regionkey)

Index Name: partsupp_pkey
Index Definition: CREATE UNIQUE INDEX partsupp_pkey ON public.partsupp USING btree (ps_partkey, ps_suppkey)

Index Name: orders_pkey
Index Definition: CREATE UNIQUE INDEX orders_pkey ON public.orders USING btree (o_orderkey)

Index Name: idx_o_orderdate
Index Definition: CREATE INDEX idx_o_orderdate ON public.orders USING btree (o_orderdate)



In [4]:
# Remove the idx_l_returnflag index if a previous experiment left it behind.
query = """
    drop index if exists idx_l_returnflag;
    """
conn.rollback()
with conn.cursor() as cur:
    cur.execute(query)
conn.commit()

## Query 1

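Is it necessary to create a materialized view for this query? We do not expect much gain, because the query would still need a sequential scan on `l_shipdate`. We could pre-aggregate with a GROUP BY, but then `l_shipdate` could not be kept in the view, and adding this information to … (TODO: this sentence was left unfinished).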


## Materialized on part lineitem

In [6]:
# Planner switches used while building the part_lineitem view: every strategy
# is enabled except index-only scans.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()

# Definition of the part_lineitem materialized view: lineitem joined to part,
# with two precomputed columns:
#   - p_type_prefix: the first 5 characters of p_type
#   - avg_quantity:  0.2 * the average l_quantity over all rows sharing the
#                    same l_partkey (window function)
query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem AS
SELECT
    l_returnflag,
    l_linestatus,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_shipdate,
    l_partkey,
    p_partkey,
    p_brand,
    p_container,
    SUBSTRING(p_type FROM 1 FOR 5) AS p_type_prefix,
    0.2 * AVG(l_quantity) OVER (PARTITION BY l_partkey) AS avg_quantity
FROM
    lineitem l
JOIN
    part p ON l.l_partkey = p.p_partkey;

"""

# Build the view and report the wall-clock time of the CREATE statement alone
# (the commit is not included in the measurement).
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    elapsed = time.time() - start_time
    print(f"Time taken to create materialized view: {elapsed} seconds")
    conn.commit()

Time taken to create materialized view: 299.80979585647583 seconds


We allowed the planner to use hash joins here, since at this stage we are not interested in index performance.

In [7]:
# Report the on-disk size of the part_lineitem view in MB and GB.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    size = cur.fetchall()
    total_bytes = size[0][0]
    print(f"Size of materialised view: {total_bytes/(1024**2)} MB")
    print(f"Size of materialised view: {total_bytes/(1024**3)} GB")

Size of materialised view: 6582.0078125 MB
Size of materialised view: 6.427742004394531 GB


In [8]:
conn.rollback()
check_indexes('part_lineitem')

### Query 14

In [4]:
# Planner switches for the query 14 baseline: every strategy is enabled except
# index-only scans.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
    ("enable_material", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()

# Query 14 rewritten against the part_lineitem materialized view: percentage of
# discounted revenue coming from rows whose p_type_prefix is 'PROMO', restricted
# to l_shipdate in [1995-09-01, 1995-09-01 + 1 month).
# The LIKE pattern has no wildcard because p_type_prefix already holds only the
# first 5 characters of p_type.
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type_prefix LIKE 'PROMO'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

In [10]:
explain_analyze(query_14)

Finalize Aggregate  (cost=970607.83..970607.84 rows=1 width=32) (actual time=27278.221..27281.427 rows=1 loops=1)
  ->  Gather  (cost=970607.59..970607.80 rows=2 width=64) (actual time=27277.773..27281.379 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=969607.59..969607.60 rows=1 width=64) (actual time=27265.298..27265.302 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..968870.40 rows=42125 width=96) (actual time=0.432..27126.996 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 1.226 ms
Execution Time: 27281.538 ms


The problem is that the planner performs a sequential scan, because there is no index on `l_shipdate`.

Size of the result table

In [11]:
# Materialise the result of query_14 in a temporary table to show its first
# row and measure its size; the table is dropped again at the end.
with conn.cursor() as cur:
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")

    cur.execute("SELECT * FROM temp_result;")
    first_row = cur.fetchone()
    print(f"First row: \n {first_row}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    total_bytes = cur.fetchall()[0][0]
    print(f"Size of query_14 result table: {total_bytes/(1024**2)} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


### Query 17

Here we expect to get the most gain.

In [None]:
# Query 17 rewritten against the part_lineitem materialized view: sum of
# l_extendedprice divided by 7.0 for 'Brand#23' / 'MED BOX' rows whose
# l_quantity is below the precomputed avg_quantity column of the view.
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < avg_quantity;
"""

In [13]:
explain_analyze(query_17)

Finalize Aggregate  (cost=990933.20..990933.21 rows=1 width=32) (actual time=16940.366..16941.821 rows=1 loops=1)
  ->  Gather  (cost=990932.98..990933.19 rows=2 width=32) (actual time=16940.296..16941.812 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=989932.98..989932.99 rows=1 width=32) (actual time=16935.286..16935.287 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem  (cost=0.00..989932.80 rows=70 width=32) (actual time=50.863..16934.722 rows=1842 loops=3)
                    Filter: ((l_quantity < avg_quantity) AND (p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
                    Rows Removed by Filter: 19993509
Planning Time: 2.369 ms
Execution Time: 16941.847 ms


Size of result table

In [14]:
# Materialise the result of query_17 in a temporary table to show its first
# row and measure its size; the table is dropped again at the end.
with conn.cursor() as cur:

    # Use query_17 (the part_lineitem variant defined in this section).
    # query_17_1 targets a different view and is only defined further down, so
    # referencing it here raised NameError on a fresh Restart-and-Run-All.
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    # The label previously said "query_1", a copy-paste leftover.
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_1 result table: 0.015625 MB


## Materialized customer_order_lineitem_nation

In [5]:
# Planner switches used while building customer_order_lineitem_nation: every
# strategy is enabled except index-only scans.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()

# Definition of the customer_order_lineitem_nation materialized view:
# customer joined to orders, lineitem and nation, keeping only the columns
# that query 10 reads. Note that this rebinds `query_materialized`, which
# earlier held the part_lineitem definition.
query_materialized = """

CREATE MATERIALIZED VIEW customer_order_lineitem_nation AS
SELECT
    c.c_custkey,
    c.c_name,
    c.c_acctbal,
    n.n_name,
    c.c_address,
    c.c_phone,
    c.c_comment,
    -- c.c_nationkey, not needed in the query, so not included
    l.l_returnflag,
    -- l.l_orderkey, not needed in the query, so not included
    l.l_discount,
    l.l_extendedprice,
    o.o_orderdate

FROM
    customer c
JOIN
    orders o ON c.c_custkey = o.o_custkey
JOIN
    lineitem l ON l.l_orderkey = o.o_orderkey
JOIN
    nation n ON c.c_nationkey = n.n_nationkey;
"""

# Plain EXPLAIN (analyze=False): only show the plan for the CREATE statement,
# without using EXPLAIN ANALYZE.
explain_analyze(query_materialized, analyze = False)

Hash Join  (cost=773460.56..4508342.52 rows=59986052 width=265)
  Hash Cond: (c.c_nationkey = n.n_nationkey)
  ->  Hash Join  (cost=773459.00..4324183.78 rows=59986052 width=165)
        Hash Cond: (o.o_custkey = c.c_custkey)
        ->  Hash Join  (cost=671655.00..3329726.95 rows=59986052 width=22)
              Hash Cond: (l.l_orderkey = o.o_orderkey)
              ->  Seq Scan on lineitem l  (cost=0.00..1724403.52 rows=59986052 width=18)
              ->  Hash  (cost=410912.00..410912.00 rows=15000000 width=12)
                    ->  Seq Scan on orders o  (cost=0.00..410912.00 rows=15000000 width=12)
        ->  Hash  (cost=50827.00..50827.00 rows=1500000 width=147)
              ->  Seq Scan on customer c  (cost=0.00..50827.00 rows=1500000 width=147)
  ->  Hash  (cost=1.25..1.25 rows=25 width=108)
        ->  Seq Scan on nation n  (cost=0.00..1.25 rows=25 width=108)


In [None]:
# Drop a previous build of the view so the next cell can recreate it.
conn.rollback()
with conn.cursor() as cur:
    # IF EXISTS keeps this cleanup cell runnable on a database where the view
    # has not been created yet (e.g. a fresh Restart-and-Run-All).
    cur.execute("DROP MATERIALIZED VIEW IF EXISTS customer_order_lineitem_nation;")
    conn.commit()

In [7]:
# Execute the statement currently held in `query_materialized` and report the
# wall-clock time of the CREATE statement alone (commit not included).
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    elapsed = time.time() - start_time
    print(f"Time taken to create materialized view: {elapsed} seconds")
    conn.commit()

Time taken to create materialized view: 551.8813228607178 seconds


In [11]:
# Report the on-disk size of customer_order_lineitem_nation in MB and GB.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('customer_order_lineitem_nation');")
    size = cur.fetchall()
    total_bytes = size[0][0]
    print(f"Size of materialised view: {total_bytes/(1024**2)} MB")
    print(f"Size of materialised view: {total_bytes/(1024**3)} GB")

Size of materialised view: 12939.0859375 MB
Size of materialised view: 12.635826110839844 GB


In [4]:
check_indexes('customer_order_lineitem_nation')

Index Name: idx_coln_orderdate
Index Definition: CREATE INDEX idx_coln_orderdate ON public.customer_order_lineitem_nation USING btree (o_orderdate)



In [9]:
# Query 10 rewritten against the customer_order_lineitem_nation materialized
# view: discounted revenue per customer for rows with l_returnflag = 'R' and
# o_orderdate in [1993-10-01, 1993-10-01 + 3 months), sorted by revenue
# descending. No joins are needed because the view already contains them.
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer_order_lineitem_nation
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""


In [10]:
# Planner switches for the query 10 baseline: every strategy is enabled except
# index-only scans.
planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()

# Run query 10 on the materialized view and print the measured plan.
explain_analyze(query_10)

Sort  (cost=1790056.27..1790057.41 rows=455 width=654) (actual time=62449.084..62499.962 rows=381105 loops=1)
  Sort Key: (sum((l_extendedprice * ('1'::numeric - l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1789970.48..1790036.18 rows=455 width=654) (actual time=61336.128..61992.943 rows=381105 loops=1)
        Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
        ->  Gather Merge  (cost=1789970.48..1790021.94 rows=380 width=654) (actual time=61329.519..61709.788 rows=449735 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1788970.46..1788978.06 rows=190 width=654) (actual time=61310.738..61596.312 rows=149912 loops=3)
                    Group Key: c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
                    ->  Sort  (cost=1788970.46..1788970.93 rows=190 width=686) (actual time=61308.963..61351.

In [20]:
# Materialise the result of query_10 in a temporary table to show its first
# row and measure its size; the table is dropped again at the end.
with conn.cursor() as cur:
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")

    cur.execute("SELECT * FROM temp_result;")
    first_row = cur.fetchone()
    print(f"First row: \n {first_row}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    total_bytes = cur.fetchall()[0][0]
    print(f"Size of query_10 result table: {total_bytes/(1024**2)} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## mixed approach order lineitem part

In [6]:
# Query 10 against the single part_lineitem_order view. Customer and nation
# are not part of that view, so they are joined in at query time through
# o_custkey and c_nationkey.
query_10_1 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    part_lineitem_order
    JOIN customer c ON c.c_custkey = o_custkey
    JOIN nation n ON c.c_nationkey = n.n_nationkey
WHERE
    o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

# Query 14 against part_lineitem_order; identical to query_14 except for the
# relation in the FROM clause.
query_14_1 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type_prefix LIKE 'PROMO'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    part_lineitem_order
WHERE
    l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;
"""

# Query 17 against part_lineitem_order; identical to query_17 except for the
# relation in the FROM clause.
query_17_1 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem_order
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < avg_quantity;
"""

In [None]:
# Definition of the part_lineitem_order materialized view: the same columns as
# part_lineitem (including the precomputed p_type_prefix and avg_quantity),
# extended with o_orderkey, o_custkey and o_orderdate through a join on orders.
# Note that this rebinds `query_materialized` once more.
query_materialized = """

CREATE MATERIALIZED VIEW part_lineitem_order AS
SELECT
    l_returnflag,
    l_linestatus,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_shipdate,
    l_partkey,
    p_partkey,
    p_brand,
    p_container,
    SUBSTRING(p_type FROM 1 FOR 5) AS p_type_prefix,
    0.2 * AVG(l_quantity) OVER (PARTITION BY l_partkey) AS avg_quantity,
    o_orderkey,
    o.o_custkey,
    o.o_orderdate
FROM
    lineitem l
JOIN
    part p ON l.l_partkey = p.p_partkey
JOIN
    orders o ON l.l_orderkey = o.o_orderkey;

"""

In [None]:
# Execute the statement currently held in `query_materialized` and report the
# wall-clock time of the CREATE statement alone (commit not included).
conn.rollback()
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute(query_materialized)
    elapsed = time.time() - start_time
    print(f"Time taken to create materialized view: {elapsed} seconds")
    conn.commit()

In [10]:
check_indexes('part_lineitem_order')

Index Name: idx_plo_shipdate
Index Definition: CREATE INDEX idx_plo_shipdate ON public.part_lineitem_order USING btree (l_shipdate)

Index Name: idx_plo_o_orderdate
Index Definition: CREATE INDEX idx_plo_o_orderdate ON public.part_lineitem_order USING btree (o_orderdate)

Index Name: idx_plo_brand
Index Definition: CREATE INDEX idx_plo_brand ON public.part_lineitem_order USING btree (p_brand)

Index Name: idx_plo_container
Index Definition: CREATE INDEX idx_plo_container ON public.part_lineitem_order USING btree (p_container)

Index Name: idx_plo_custkey
Index Definition: CREATE INDEX idx_plo_custkey ON public.part_lineitem_order USING btree (o_custkey)



In [12]:
explain_analyze(query_10_1)

Sort  (cost=1776499.22..1777907.79 rows=563430 width=279) (actual time=25666.602..25716.922 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1371906.12..1576317.73 rows=563430 width=279) (actual time=23400.287..25290.271 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Incremental Sort  (cost=1371906.12..1562231.98 rows=563430 width=259) (actual time=23400.272..24731.624 rows=1147084 loops=1)
              Sort Key: c.c_custkey, n.n_name
              Presorted Key: c.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1371905.81..1536877.63 rows=563430 width=259) (actual time=23400.199..24404.519 rows=1147084 loops=1)
                    ->  Merge Join  (cost=1371905.66..1523099.92 rows=563430 width=159) (actu

In [11]:
# Drop every index on part_lineitem_order so the following plans are measured
# without index support.
plo_indexes = (
    "idx_plo_shipdate",
    "idx_plo_o_orderdate",
    "idx_plo_brand",
    "idx_plo_container",
    "idx_plo_custkey",
)

with conn.cursor() as cur:
    for index_name in plo_indexes:
        cur.execute(f"drop index if exists {index_name};")
    conn.commit()

In [13]:
explain_analyze(query_14_1)

Finalize Aggregate  (cost=1289948.88..1289948.89 rows=1 width=32) (actual time=22442.989..22445.220 rows=1 loops=1)
  ->  Gather  (cost=1289948.64..1289948.85 rows=2 width=64) (actual time=22442.756..22445.209 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1288948.64..1288948.65 rows=1 width=64) (actual time=22422.299..22422.300 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem_order  (cost=0.00..1283456.82 rows=313818 width=18) (actual time=1.258..22276.169 rows=249741 loops=3)
                    Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Filter: 19745610
Planning Time: 0.095 ms
Execution Time: 22445.263 ms


In [14]:
explain_analyze(query_17_1)

Finalize Aggregate  (cost=1346964.43..1346964.44 rows=1 width=32) (actual time=22503.711..22505.520 rows=1 loops=1)
  ->  Gather  (cost=1346964.20..1346964.41 rows=2 width=32) (actual time=22503.504..22505.489 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1345964.20..1345964.21 rows=1 width=32) (actual time=22498.851..22498.852 rows=1 loops=3)
              ->  Parallel Seq Scan on part_lineitem_order  (cost=0.00..1345942.30 rows=8762 width=8) (actual time=20.351..22498.051 rows=1842 loops=3)
                    Filter: ((l_quantity < avg_quantity) AND (p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
                    Rows Removed by Filter: 19993509
Planning Time: 0.161 ms
Execution Time: 22505.596 ms


## indexes on order_lineitem_part

In [16]:
# Start from a clean slate: drop the four indexes that the next cells recreate
# one by one (idx_plo_custkey is not touched here).
plo_indexes = (
    "idx_plo_shipdate",
    "idx_plo_o_orderdate",
    "idx_plo_brand",
    "idx_plo_container",
)

with conn.cursor() as cur:
    for index_name in plo_indexes:
        cur.execute(f"drop index if exists {index_name};")
    conn.commit()

In [17]:
# Build the l_shipdate index on part_lineitem_order; report build time and size.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_shipdate ON part_lineitem_order (l_shipdate);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_plo_shipdate ON part_lineitem_order (l_shipdate): {elapsed} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_shipdate');")
    index_size = cur.fetchone()[0]
    size_mb = index_size/(1024**2)
    size_gb = index_size/(1024**3)
    print(f"Size of idx_plo_shipdate: {size_mb} MB")
    print(f"Size of idx_plo_shipdate: {size_gb} GB")

    conn.commit()

Time to create idx_plo_shipdate ON part_lineitem_order (l_shipdate): 42.57509398460388 seconds
Size of idx_plo_shipdate: 397.546875 MB
Size of idx_plo_shipdate: 0.3882293701171875 GB


In [18]:
# Build the o_orderdate index on part_lineitem_order; report build time and size.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_o_orderdate ON part_lineitem_order (o_orderdate);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_plo_o_orderdate ON part_lineitem_order (o_orderdate): {elapsed} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_o_orderdate');")
    index_size = cur.fetchone()[0]
    size_mb = index_size/(1024**2)
    size_gb = index_size/(1024**3)
    print(f"Size of idx_plo_o_orderdate: {size_mb} MB")
    print(f"Size of idx_plo_o_orderdate: {size_gb} GB")

    conn.commit()

Time to create idx_plo_o_orderdate ON part_lineitem_order (o_orderdate): 44.36399292945862 seconds
Size of idx_plo_o_orderdate: 397.5078125 MB
Size of idx_plo_o_orderdate: 0.38819122314453125 GB


In [19]:
# Build the p_brand index on part_lineitem_order; report build time and size.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_brand ON part_lineitem_order (p_brand);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_plo_brand ON part_lineitem_order (p_brand): {elapsed} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_brand');")
    index_size = cur.fetchone()[0]
    size_mb = index_size/(1024**2)
    size_gb = index_size/(1024**3)
    print(f"Size of idx_plo_brand: {size_mb} MB")
    print(f"Size of idx_plo_brand: {size_gb} GB")

    conn.commit()

Time to create idx_plo_brand ON part_lineitem_order (p_brand): 58.09077000617981 seconds
Size of idx_plo_brand: 403.140625 MB
Size of idx_plo_brand: 0.3936920166015625 GB


In [20]:
# Build the p_container index on part_lineitem_order; report build time and size.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_plo_container ON part_lineitem_order (p_container);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_plo_container ON part_lineitem_order (p_container): {elapsed} seconds")

    cur.execute("SELECT pg_relation_size('idx_plo_container');")
    index_size = cur.fetchone()[0]
    size_mb = index_size/(1024**2)
    size_gb = index_size/(1024**3)
    print(f"Size of idx_plo_container: {size_mb} MB")
    print(f"Size of idx_plo_container: {size_gb} GB")

    conn.commit()

Time to create idx_plo_container ON part_lineitem_order (p_container): 64.20518612861633 seconds
Size of idx_plo_container: 403.1484375 MB
Size of idx_plo_container: 0.39369964599609375 GB


In [21]:
# Switch off enable_seqscan so the planner prefers the freshly built indexes
# over sequential scans, then measure query 10 on part_lineitem_order.
with conn.cursor() as cur:

    cur.execute("SET enable_seqscan = off;")
    conn.commit()

explain_analyze(query_10_1)

Sort  (cost=1758138.74..1759547.31 rows=563430 width=279) (actual time=31916.069..31967.507 rows=381105 loops=1)
  Sort Key: (sum((part_lineitem_order.l_extendedprice * ('1'::numeric - part_lineitem_order.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=1387783.36..1557957.25 rows=563430 width=279) (actual time=30158.032..31479.786 rows=381105 loops=1)
        Group Key: c.c_custkey, n.n_name
        ->  Gather Merge  (cost=1387783.36..1546219.13 rows=469524 width=279) (actual time=30158.012..31098.325 rows=719453 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=1386783.34..1491024.39 rows=234762 width=279) (actual time=30143.852..31001.176 rows=239818 loops=3)
                    Group Key: c.c_custkey, n.n_name
                    ->  Incremental Sort  (cost=1386783.34..1485155.34 rows=234762 width=259) (actual time=30143.343..30741.911 rows=382361 loops=3)


In [22]:
explain_analyze(query_14_1)

Finalize Aggregate  (cost=1275875.34..1275875.35 rows=1 width=32) (actual time=21797.578..21809.007 rows=1 loops=1)
  ->  Gather  (cost=1275875.10..1275875.31 rows=2 width=64) (actual time=21796.586..21808.983 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1274875.10..1274875.11 rows=1 width=64) (actual time=21771.187..21771.188 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem_order  (cost=10276.50..1269383.28 rows=313818 width=18) (actual time=69.307..21375.595 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 9982884
                    Heap Blocks: exact=16183 lossy=155307
                    ->  Bitmap Index Scan on idx_plo_shipdate  (cost=0.00..10088.20 rows=753164 width=0) (actual time=83.657..83.657 rows=749223 loops=1)
           

In [23]:
explain_analyze(query_17_1)

Aggregate  (cost=241282.81..241282.83 rows=1 width=32) (actual time=1598.072..1598.078 rows=1 loops=1)
  ->  Bitmap Heap Scan on part_lineitem_order  (cost=43634.94..241230.24 rows=21028 width=8) (actual time=264.046..1597.398 rows=5526 loops=1)
        Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
        Rows Removed by Index Recheck: 304391
        Filter: (l_quantity < avg_quantity)
        Rows Removed by Filter: 55859
        Heap Blocks: exact=3130 lossy=4537
        ->  BitmapAnd  (cost=43634.94..43634.94 rows=63084 width=0) (actual time=262.502..262.505 rows=0 loops=1)
              ->  Bitmap Index Scan on idx_plo_container  (cost=0.00..17042.85 rows=1557638 width=0) (actual time=158.896..158.896 rows=1504601 loops=1)
                    Index Cond: (p_container = 'MED BOX'::bpchar)
              ->  Bitmap Index Scan on idx_plo_brand  (cost=0.00..26581.33 rows=2429435 width=0) (actual time=99.333..99.333 rows=2391264 loops=1)
          

## Materialisation + indexes

to optimize query 14 we could put an index on shipdate

to optimize query 17 we could put an index on l_partkey

To optimize query 10 we could put an index on `o_orderdate`.

In [70]:
# Build the l_shipdate index on part_lineitem and report the build time.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_shipdate ON part_lineitem (l_shipdate);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): {elapsed} seconds")

    conn.commit()

Time to create idx_pl_shipdate ON part_lineitem (l_shipdate): 41.588475942611694 seconds


In [71]:
# Build the p_brand index on part_lineitem and report the build time.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_brand ON part_lineitem (p_brand);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_pl_brand ON part_lineitem (p_brand): {elapsed} seconds")

    conn.commit()

Time to create idx_pl_brand ON part_lineitem (p_brand): 57.2357919216156 seconds


In [72]:
# Build the p_container index on part_lineitem and report the build time.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_pl_container ON part_lineitem (p_container);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_pl_container ON part_lineitem (p_container): {elapsed} seconds")

    conn.commit()

Time to create idx_pl_container ON part_lineitem (p_container): 59.95706009864807 seconds


In [73]:
# Disable sequential scans (index and bitmap scans stay enabled) and measure
# query 14 on the indexed part_lineitem view.
# NOTE(review): the recorded output below scans part_lineitem_order, not
# part_lineitem, so it appears to come from a kernel where query_14 had been
# rebound -- re-run on a fresh kernel to confirm the numbers.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_material", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()

explain_analyze(query_14)

Finalize Aggregate  (cost=1275875.34..1275875.35 rows=1 width=32) (actual time=22617.626..22627.679 rows=1 loops=1)
  ->  Gather  (cost=1275875.10..1275875.31 rows=2 width=64) (actual time=22616.526..22627.654 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1274875.10..1274875.11 rows=1 width=64) (actual time=22576.878..22576.880 rows=1 loops=3)
              ->  Parallel Bitmap Heap Scan on part_lineitem_order  (cost=10276.50..1269383.28 rows=313818 width=18) (actual time=103.201..22201.052 rows=249741 loops=3)
                    Recheck Cond: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                    Rows Removed by Index Recheck: 9982884
                    Heap Blocks: exact=16002 lossy=154642
                    ->  Bitmap Index Scan on idx_plo_shipdate  (cost=0.00..10088.20 rows=753164 width=0) (actual time=131.234..131.235 rows=749223 loops=1)
        

In [74]:
# Disable sequential scans (index and bitmap scans stay enabled) and measure
# query 17 on the indexed part_lineitem view.
# NOTE(review): the recorded output below scans part_lineitem_order, not
# part_lineitem, so it appears to come from a kernel where query_17 had been
# rebound -- re-run on a fresh kernel to confirm the numbers.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_material", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()

explain_analyze(query_17)

Aggregate  (cost=241282.81..241282.83 rows=1 width=32) (actual time=1665.278..1665.282 rows=1 loops=1)
  ->  Bitmap Heap Scan on part_lineitem_order  (cost=43634.94..241230.24 rows=21028 width=8) (actual time=139.719..1664.263 rows=5526 loops=1)
        Recheck Cond: ((p_container = 'MED BOX'::bpchar) AND (p_brand = 'Brand#23'::bpchar))
        Rows Removed by Index Recheck: 304391
        Filter: (l_quantity < avg_quantity)
        Rows Removed by Filter: 55859
        Heap Blocks: exact=3130 lossy=4537
        ->  BitmapAnd  (cost=43634.94..43634.94 rows=63084 width=0) (actual time=138.213..138.216 rows=0 loops=1)
              ->  Bitmap Index Scan on idx_plo_container  (cost=0.00..17042.85 rows=1557638 width=0) (actual time=57.710..57.711 rows=1504601 loops=1)
                    Index Cond: (p_container = 'MED BOX'::bpchar)
              ->  Bitmap Index Scan on idx_plo_brand  (cost=0.00..26581.33 rows=2429435 width=0) (actual time=76.879..76.880 rows=2391264 loops=1)
            

In [75]:
check_indexes('part_lineitem')

Index Name: idx_pl_shipdate
Index Definition: CREATE INDEX idx_pl_shipdate ON public.part_lineitem USING btree (l_shipdate)

Index Name: idx_pl_brand
Index Definition: CREATE INDEX idx_pl_brand ON public.part_lineitem USING btree (p_brand)

Index Name: idx_pl_container
Index Definition: CREATE INDEX idx_pl_container ON public.part_lineitem USING btree (p_container)



In [76]:
# Build the o_orderdate index on customer_order_lineitem_nation and report the
# build time.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): {elapsed} seconds")

    conn.commit()

Time to create idx_coln_orderdate ON customer_order_lineitem_nation (o_orderdate): 49.34825897216797 seconds


In [78]:
check_indexes('customer_order_lineitem_nation')

Index Name: idx_coln_orderdate
Index Definition: CREATE INDEX idx_coln_orderdate ON public.customer_order_lineitem_nation USING btree (o_orderdate)



In [6]:
# Build the c_custkey index on customer_order_lineitem_nation and report the
# build time.
with conn.cursor() as cur:
    start_time = time.time()
    cur.execute("CREATE INDEX idx_coln_customer ON customer_order_lineitem_nation (c_custkey);")
    elapsed = time.time() - start_time
    print(f"Time to create idx_coln_customer ON customer_order_lineitem_nation (c_custkey): {elapsed} seconds")

    conn.commit()

Time to create idx_coln_customer ON customer_order_lineitem_nation (c_custkey): 64.0774028301239 seconds


In [80]:
# Disable both sequential and bitmap scans, leaving plain index scans enabled,
# then measure query 10 on customer_order_lineitem_nation.
planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "off"),
    ("enable_material", "on"),
)

conn.rollback()
with conn.cursor() as cur:
    for setting, value in planner_settings:
        cur.execute(f"SET {setting} = {value};")
    conn.commit()
explain_analyze(query_10)

## Conclusion

We can see that the gain from using two separate materialized views is the same as the gain from using a single view over part, orders and lineitem. The only useful indexes on that single view are the ones on `p_brand` and `p_container`, which speed up query 17 considerably.

Since we save time and space by creating only one materialized view, we choose that option, also because it can be helpful for many other queries: the tables part, orders and lineitem are the "core" of the database.