In [1]:
import getpass
import os
import time

import psycopg2

In [2]:
# Establish a connection to the PostgreSQL database.
# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGHOST, PGPORT, PGPASSWORD), falling back to the
# notebook's previous defaults. The password is deliberately NOT stored in the
# notebook: when PGPASSWORD is unset it is requested interactively.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: "),
    port=int(os.environ.get("PGPORT", "5432")),
)

In [5]:
# List every index defined on the lineitem table together with its definition.
with conn.cursor() as cur:
    index_listing_sql = """
    SELECT
        indexname,
        indexdef
    FROM
        pg_indexes
    WHERE
        tablename = 'lineitem';
    """
    cur.execute(index_listing_sql)
    index_rows = cur.fetchall()

    # Each row is an (indexname, indexdef) pair, in the order selected above.
    for index_name, index_definition in index_rows:
        print(f"Index Name: {index_name}")
        print(f"Index Definition: {index_definition}\n")

## First step
Compute size and time for executing the queries without additional structure support. Record the size of the result set.


In each `cost=a..b` pair printed by EXPLAIN, the first value is the startup cost and the second is the total cost.

Startup Cost: This represents the amount of work the query planner estimates is required before the first row can be returned. For a sequential scan (Seq Scan), this value is typically very low or zero because the first row can be returned almost immediately.

Total Cost: This represents the total estimated cost to execute the entire query. It is the sum of the startup cost and the cost to process all rows.

## How to compute the size needed to execute the query?

## Query 1

In [3]:
# Query 1: for each (l_returnflag, l_linestatus) group, compute sums and averages
# of quantity, extended price, discounted price and charge, plus the row count,
# over lineitem rows shipped on or before DATE '1998-12-01' minus 90 days.
# Results are ordered by the two grouping columns.
# NOTE(review): looks like TPC-H Q1 (pricing summary report) — confirm.
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

In [4]:
# Planner estimate for query 1: plain EXPLAIN, so only the estimated plan is shown.
conn.rollback()  # start from a clean transaction state
with conn.cursor() as cur:
    cur.execute("EXPLAIN " + query_1)
    plan_rows = cur.fetchall()
    print("EXPLAIN result:")
    # Each returned row carries one line of the plan in its first column.
    for plan_row in plan_rows:
        print(plan_row[0])

In [None]:
# Measured plan for query 1 with every scan strategy available to the planner.
conn.rollback()
with conn.cursor() as cur:
    for scan_setting in ("enable_seqscan", "enable_indexscan", "enable_bitmapscan"):
        cur.execute(f"SET {scan_setting} = on;")

    # EXPLAIN ANALYZE runs the query and reports actual timings next to the estimates.
    cur.execute("EXPLAIN ANALYZE " + query_1)
    analyzed_plan = cur.fetchall()
    print("EXPLAIN ANALYZE result:")
    for plan_row in analyzed_plan:
        print(plan_row[0])

Validate and record the size of the table

In [6]:
# Materialise the query 1 result in a temporary table so its size can be measured.
with conn.cursor() as cur:
    cur.execute("CREATE TEMP TABLE temp_result AS " + query_1 + ";")

    # Sanity check: show one row of the result.
    cur.execute("SELECT * FROM temp_result;")
    first_row = cur.fetchone()
    print(f"First row: \n {first_row}")

    # pg_total_relation_size reports bytes; convert to MB for display.
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size_bytes = cur.fetchall()[0][0]
    size_mb = size_bytes / (1024**2)
    print(f"Size of query_1 result table: {size_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 ('A', 'F', Decimal('377518399'), Decimal('566065727797.25'), Decimal('537759104278.0656'), Decimal('559276670892.116819'), Decimal('25.5009751030070973'), Decimal('38237.151008958546'), Decimal('0.05000657454024320463'), 14804077)
Size of query_1 result table: 0.015625 MB


## Optimising with indexes query 1

In [11]:
# Index idea for query 1. Original estimate: l_shipdate has a selectivity of
# 90/(6*12*365) = 0.003424657534, so an index may be useful; but since the
# predicate uses <=, a hash index can't be used. We may use an inverted list.
# NOTE(review): query_1 filters on l_shipdate <= DATE '1998-12-01' - 90 days, so the
# 90-day window may be the part that is excluded rather than selected — verify
# the estimate against the actual l_shipdate distribution.

# Tried a GIN index through the btree_gin extension, but it gave worse results,
# so the experiment is kept below as a disabled (string-literal) block.

"""
conn.rollback()
with conn.cursor() as cur:
    cur.execute("CREATE EXTENSION IF NOT EXISTS btree_gin;")
    cur.execute("CREATE INDEX idx_lineitem_shipdate ON lineitem USING gin (l_shipdate);")

"""

# We may also try an index on (l_returnflag, l_linestatus), since these columns
# are used in the GROUP BY and ORDER BY clauses.

Time to create gin index on lineitem (l_shipdate): 70.45620393753052 seconds


In [4]:
# Report the on-disk size of the GIN index on l_shipdate in MB and GB.
with conn.cursor() as cur:
    cur.execute("SELECT pg_relation_size('idx_lineitem_shipdate');")
    (gin_index_bytes,) = cur.fetchone()
    for divisor, unit in ((1024**2, "MB"), (1024**3, "GB")):
        print(f"Size of gin index on lineitem (l_shipdate): {gin_index_bytes/divisor} {unit}")

Size of gin index on lineitem (l_shipdate): 192.046875 MB
Size of gin index on lineitem (l_shipdate): 0.1875457763671875 GB


In [5]:
# Drop the experimental GIN index on l_shipdate and make the change permanent.
# IF EXISTS keeps the cell re-runnable when the index was never created
# (its creation code is disabled) or has already been dropped.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("DROP INDEX IF EXISTS idx_lineitem_shipdate;")
    conn.commit()


In [14]:
# Re-measure the size of the query 1 result set through a temporary table.
with conn.cursor() as cur:
    materialise_sql = "".join(["CREATE TEMP TABLE temp_result AS ", query_1, ";"])
    cur.execute(materialise_sql)

    cur.execute("SELECT * FROM temp_result;")
    sample_row = cur.fetchone()
    print(f"First row: \n {sample_row}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size_rows = cur.fetchall()
    total_bytes = size_rows[0][0]  # single row, single column: size in bytes
    print(f"Size of query_1 result table: {total_bytes/(1024**2)} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 ('A', 'F', Decimal('377518399'), Decimal('566065727797.25'), Decimal('537759104278.0656'), Decimal('559276670892.116819'), Decimal('25.5009751030070973'), Decimal('38237.151008958546'), Decimal('0.05000657454024320463'), 14804077)
Size of query_1 result table: 0.015625 MB


## Query 10

In [17]:
# Query 10: revenue per customer, SUM(l_extendedprice * (1 - l_discount)), over
# line items with l_returnflag = 'R' belonging to orders placed in the three
# months starting 1993-10-01. Customer details and the nation name are returned
# alongside, sorted by revenue in descending order.
# NOTE(review): looks like TPC-H Q10 (returned item reporting) — confirm.
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""


In [18]:
# Execute query 10 under EXPLAIN ANALYZE and print the measured plan line by line.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("EXPLAIN ANALYZE " + query_10)
    measured_plan = cur.fetchall()
    print("EXPLAIN ANALYZE result:")
    for plan_row in measured_plan:
        print(plan_row[0])

EXPLAIN ANALYZE result:
Sort  (cost=2327540.51..2329002.51 rows=584800 width=280) (actual time=41666.724..42068.263 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1995877.47..2119609.74 rows=584800 width=280) (actual time=38977.089..41144.919 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=1995877.47..2104989.74 rows=584800 width=260) (actual time=38977.066..40379.966 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1995877.31..2078673.74 rows=584800 width=260) (actual time=38976.981..39925.248 rows=1147084 loops=1)
                    ->  Gather Merge  (cost=1995877.16..2063

Validate and record the size of the table

In [20]:
# Store the query 10 result in a temporary table, show one row, and report its size.
with conn.cursor() as cur:
    cur.execute("CREATE TEMP TABLE temp_result AS " + query_10 + ";")

    cur.execute("SELECT * FROM temp_result;")
    first_row = cur.fetchone()
    print(f"First row: \n {first_row}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size_bytes = cur.fetchall()[0][0]
    size_mb = size_bytes / (1024**2)  # bytes -> MB
    print(f"Size of query_10 result table: {size_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## Query 14

In [3]:
# Query 14: percentage of discounted revenue that comes from parts whose p_type
# starts with 'PROMO', over line items shipped in the month starting 1995-09-01
# (lineitem joined to part on the part key).
# NOTE(review): looks like TPC-H Q14 (promotion effect) — confirm.
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

In [None]:
# Force sequential scans: keep seq scans on, switch index and bitmap scans off.
# NOTE(review): these SETs are not committed, and SET is transactional in
# PostgreSQL, so a later conn.rollback() on this connection reverts them.
conn.rollback()
with conn.cursor() as cur:
    planner_switches = (
        ("enable_seqscan", "on"),
        ("enable_indexscan", "off"),
        ("enable_bitmapscan", "off"),
    )
    for setting_name, setting_value in planner_switches:
        cur.execute(f"SET {setting_name} = {setting_value};")

In [4]:
# Baseline plan for query 14 without index support.
# The planner switches are issued inside this cell, after the rollback: SET is
# transactional, so settings made earlier without a commit are discarded by
# conn.rollback() and would not affect this EXPLAIN.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = off;")
    cur.execute("SET enable_bitmapscan = off;")

    # Plain EXPLAIN: only the estimated plan is produced.
    cur.execute(f"EXPLAIN {query_14}")
    explain_result = cur.fetchall()
    print("EXPLAIN  result:")
    for row in explain_result:
        print(row[0])

Validate and record the size of the result table

In [23]:
# Materialise the query 14 result in a temporary table and measure it.
with conn.cursor() as cur:
    materialise_sql = "".join(["CREATE TEMP TABLE temp_result AS ", query_14, ";"])
    cur.execute(materialise_sql)

    cur.execute("SELECT * FROM temp_result;")
    sample_row = cur.fetchone()
    print(f"First row: \n {sample_row}")

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size_rows = cur.fetchall()
    total_bytes = size_rows[0][0]  # size in bytes
    print(f"Size of query_14 result table: {total_bytes/(1024**2)} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


Since we already put an index on l_partkey, we simply turn the use of indexes back on.
The condition on l_shipdate has a very low selectivity, so we don't put an index on it.

In [4]:
# Plan for query 14 with index and bitmap scans re-enabled.
conn.rollback()
with conn.cursor() as cur:
    # The switches are set in the same transaction as the EXPLAIN so they apply to it.
    cur.execute("SET enable_seqscan = on;")
    cur.execute("SET enable_indexscan = on;")
    cur.execute("SET enable_bitmapscan = on;")

    # Plain EXPLAIN (no ANALYZE): the label below now matches what is executed.
    cur.execute(f"EXPLAIN {query_14}")
    explain_result = cur.fetchall()
    print("EXPLAIN result:")
    for row in explain_result:
        print(row[0])

## Query 17

In [5]:
# Query 17: SUM(l_extendedprice) / 7.0 over line items of parts with
# p_brand = 'Brand#23' and p_container = 'MED BOX' whose l_quantity is below
# 20% of the average l_quantity for that same part (correlated subquery on lineitem).
# NOTE(review): looks like TPC-H Q17 (small-quantity-order revenue) — confirm.
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    p_partkey = l_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            l_partkey = p_partkey
    );
"""

In [25]:
# Estimated plan for query 17. Plain EXPLAIN is used, so the query is not run
# and the printed label says "EXPLAIN" rather than "EXPLAIN ANALYZE".
conn.rollback()
with conn.cursor() as cur:
    cur.execute(f"EXPLAIN {query_17}")
    explain_result = cur.fetchall()
    print("EXPLAIN result:")
    # Each row holds one line of the plan in its first column.
    for row in explain_result:
        print(row[0])

EXPLAIN ANALYZE result:
Aggregate  (cost=1938000.25..1938000.26 rows=1 width=32)
  ->  Hash Join  (cost=54678.31..1937952.31 rows=19175 width=8)
        Hash Cond: (lineitem.l_partkey = part.p_partkey)
        Join Filter: (lineitem.l_quantity < (SubPlan 1))
        ->  Seq Scan on lineitem  (cost=0.00..1724403.52 rows=59986052 width=17)
        ->  Hash  (cost=54654.34..54654.34 rows=1918 width=4)
              ->  Gather  (cost=1000.00..54654.34 rows=1918 width=4)
                    Workers Planned: 2
                    ->  Parallel Seq Scan on part  (cost=0.00..53462.54 rows=799 width=4)
                          Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
        SubPlan 1
          ->  Aggregate  (cost=140.68..140.69 rows=1 width=32)
                ->  Index Scan using idx_lineitem_partkey on lineitem lineitem_1  (cost=0.00..140.59 rows=34 width=5)
                      Index Cond: (l_partkey = part.p_partkey)


In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it does not run.
# If enabled, it would materialise query_17 into temp_result, print that table's
# size in MB, and drop the table again.
"""

with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

"""

## Part 2: Indexes

Some useful commands:

To create an index:

default is b+tree
CREATE INDEX idx_customer_name ON customer (c_name);

available indexes:

B-tree: The default and most common type of index.

Hash: Used for equality comparisons.

GIN (Generalized Inverted Index): Useful for indexing array values and full-text search.

GiST (Generalized Search Tree): Supports many types of queries, including full-text search.

SP-GiST (Space-Partitioned Generalized Search Tree): Useful for partitioning data.

BRIN (Block Range INdexes): Efficient for large tables where the column values are correlated with their physical location.

other possibilities:

Partial Indexes
Description: Indexes only a portion of a table, based on a condition.
Use Case: When you frequently query a subset of rows.

CREATE INDEX idx_active_customers ON customer (c_name) WHERE active = true;

Expression Indexes
Description: Indexes the result of an expression or function rather than a raw column.
Use Case: When queries involve expressions or function calls.

CREATE INDEX idx_lower_customer_name ON customer ((lower(c_name)));


To disable the indexscan

SET enable_seqscan = on;
SET enable_indexscan = off;
SET enable_bitmapscan = off;

In [None]:
# Create the hash index on lineitem.l_partkey and report how long the build took.
# The cell is re-runnable: if the index already exists, creation is skipped
# instead of raising, so a stale run does not abort the notebook.
with conn.cursor() as cur:
    cur.execute("SELECT 1 FROM pg_indexes WHERE indexname = 'idx_lineitem_partkey';")
    index_already_exists = cur.fetchone() is not None

    if index_already_exists:
        print("Index idx_lineitem_partkey already exists; drop it first to re-measure the creation time.")
    else:
        # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments.
        start_time = time.perf_counter()
        cur.execute("CREATE INDEX idx_lineitem_partkey ON lineitem USING hash (l_partkey);")
        end_time = time.perf_counter()
        print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    conn.commit()

It took almost 3 minutes. I re-ran the cell and it now raises an error (presumably because the index already exists); I should drop the index and recreate it to get the correct output.

In [3]:
# Report the on-disk size of the l_partkey index in MB and GB.
with conn.cursor() as cur:
    cur.execute("SELECT pg_relation_size('idx_lineitem_partkey');")
    (partkey_index_bytes,) = cur.fetchone()
    for divisor, unit in ((1024**2, "MB"), (1024**3, "GB")):
        print(f"Size of index on lineitem: {partkey_index_bytes/divisor} {unit}")


Size of index on lineitem: 1896.65625 MB
Size of index on lineitem: 1.852203369140625 GB


In [6]:
# Run query 17 under EXPLAIN ANALYZE to obtain the measured plan.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("EXPLAIN ANALYZE " + query_17)
    measured_plan = cur.fetchall()
    print("EXPLAIN ANALYZE result:")
    for plan_row in measured_plan:
        print(plan_row[0])

EXPLAIN ANALYZE result:
Aggregate  (cost=1938000.25..1938000.26 rows=1 width=32) (actual time=259573.678..259573.756 rows=1 loops=1)
  ->  Hash Join  (cost=54678.31..1937952.31 rows=19175 width=8) (actual time=799.986..259570.590 rows=5526 loops=1)
        Hash Cond: (lineitem.l_partkey = part.p_partkey)
        Join Filter: (lineitem.l_quantity < (SubPlan 1))
        Rows Removed by Join Filter: 55859
        ->  Seq Scan on lineitem  (cost=0.00..1724403.52 rows=59986052 width=17) (actual time=0.010..28771.108 rows=59986052 loops=1)
        ->  Hash  (cost=54654.34..54654.34 rows=1918 width=4) (actual time=708.781..708.857 rows=2044 loops=1)
              Buckets: 2048  Batches: 1  Memory Usage: 88kB
              ->  Gather  (cost=1000.00..54654.34 rows=1918 width=4) (actual time=2.209..708.543 rows=2044 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on part  (cost=0.00..53462.54 rows=799 width=4) (act

Very good! A query that could barely be executed before now completes in about 4 minutes! But the cost is huge: the index takes 1.85 GB ...

In [None]:
# Materialise the query 17 result, print all of its rows, and report its size.
with conn.cursor() as cur:
    cur.execute("CREATE TEMP TABLE temp_result AS " + query_17 + ";")

    cur.execute("SELECT * FROM temp_result;")
    result_rows = cur.fetchall()
    for result_row in result_rows:
        print(result_row)

    # pg_total_relation_size reports bytes; convert to MB for display.
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size_bytes = cur.fetchall()[0][0]
    size_mb = size_bytes / (1024**2)
    print(f"Size of query_17 result table: {size_mb} MB")

    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_17 result table: 0.015625 MB


## Materialised view

In [34]:
# Materialised view holding the part/lineitem join restricted to the columns
# selected below (part key, brand, container, quantity, extended price).
query_materialized = """
CREATE MATERIALIZED VIEW part_lineitem AS
SELECT part.p_partkey, part.p_brand, part.p_container, lineitem.l_quantity, lineitem.l_extendedprice
FROM part JOIN lineitem ON p_partkey = l_partkey;
"""

with conn.cursor() as cur:
    # EXPLAIN ANALYZE actually executes the statement, so the view is created and
    # the returned plan carries the measured build time. Print it instead of
    # discarding it, so the cost of building the view is recorded.
    cur.execute(f"EXPLAIN ANALYZE {query_materialized}")
    explain_result = cur.fetchall()
    print("EXPLAIN ANALYZE result:")
    for row in explain_result:
        print(row[0])
    conn.commit()

In [12]:
# Report the total on-disk size of the part_lineitem materialised view in MB.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('part_lineitem');")
    view_bytes = cur.fetchall()[0][0]
    view_mb = view_bytes / (1024**2)
    print(f"Size of materialised view: {view_mb} MB")


Size of materialised view: 3906.609375 MB


In [9]:
# Rewrite of query 17 on top of the part_lineitem relation. The string holds two
# statements: first it defines the average_quantity view (average l_quantity per
# p_partkey), then it selects SUM(l_extendedprice) / 7.0 per part for
# 'Brand#23' / 'MED BOX' rows whose l_quantity is below 20% of that part's average.
# NOTE(review): unlike query_17, this SELECT has GROUP BY p_partkey, so it returns
# one row per part instead of a single total — confirm this is intended.
# NOTE(review): CREATE VIEW is used without OR REPLACE, so it will presumably fail
# if average_quantity already exists — verify on re-run.
query_17_1 = """

CREATE VIEW average_quantity AS
SELECT
    p_partkey,
    AVG(l_quantity) AS avg_quantity
FROM
    part_lineitem
GROUP BY
    p_partkey;

SELECT
    p_partkey,
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    part_lineitem
WHERE
    p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * avg_quantity
        FROM
            average_quantity
        WHERE 
            average_quantity.p_partkey = part_lineitem.p_partkey
    )
GROUP BY
    p_partkey;
"""

In [10]:
# Execute the view-based rewrite of query 17 and commit, which persists the created view.
# NOTE(review): the rows produced by the final SELECT are not fetched here.
conn.rollback()
with conn.cursor() as cur:
    cur.execute(query_17_1)
    conn.commit()

This takes more time than the index-based version. Is something wrong? Maybe there is no index on p_partkey, and that is the problem?