In [2]:
import os
import time

import psycopg2

In [3]:
# Establish a connection to the PostgreSQL database.
# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGHOST, PGPASSWORD, PGPORT) so that credentials and hosts
# do not have to be edited inside the notebook; the fallbacks are the original
# local defaults.  To target another server (e.g. 172.30.160.1) set PGHOST
# before starting the kernel.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    port=int(os.environ.get("PGPORT", "5432")),
)

In [5]:
# cur.close()
# conn.close()

In [4]:
# function to check the indexes on a table

def check_indexes(table_name, connection=None):
    """Print the name and definition of every index defined on a table.

    The table is looked up in the ``pg_indexes`` view.  ``table_name`` is sent
    as a bound query parameter instead of being interpolated into the SQL
    text, so quotes or other special characters in it cannot alter the query.

    Args:
        table_name: Table name matched against ``pg_indexes.tablename``.
        connection: Optional DB-API connection to use; when omitted the
            notebook-level ``conn`` is used.

    Returns:
        None. The index names and definitions are printed.
    """
    db = conn if connection is None else connection
    query = """
        SELECT
            indexname,
            indexdef
        FROM
            pg_indexes
        WHERE
            tablename = %s;
    """

    with db.cursor() as cur:
        cur.execute(query, (table_name,))

        for index_name, index_def in cur.fetchall():
            print(f"Index Name: {index_name}")
            print(f"Index Definition: {index_def}\n")


# function to explain-analyze a query

def explain_analyze(query, analyze = True):
    """Print the PostgreSQL plan for ``query``, one plan line per output line.

    The current transaction on the shared connection is rolled back first.
    With ``analyze`` truthy the statement is prefixed with ``EXPLAIN ANALYZE``;
    otherwise with plain ``EXPLAIN``.

    Args:
        query: SQL text to be explained.
        analyze: Selects ``EXPLAIN ANALYZE`` (default) or ``EXPLAIN``.
    """
    conn.rollback()
    prefix = "EXPLAIN ANALYZE" if analyze else "EXPLAIN"
    with conn.cursor() as cur:
        cur.execute(f"{prefix} {query}")
        # Each fetched row holds a single column: one line of the plan text.
        for plan_row in cur.fetchall():
            print(plan_row[0])

## First step
Compute size and time for executing the queries without additional structure support. Record the size of the result set.


In the `cost=a..b` pair shown for each plan node, the first value is the startup cost and the second is the total cost.

Startup Cost: This represents the amount of work the query planner estimates is required before the first row can be returned. For a sequential scan (Seq Scan), this value is typically very low or zero because the first row can be returned almost immediately.

Total Cost: This represents the total estimated cost to execute the entire query. It is the sum of the startup cost and the cost to process all rows.

## Baseline queries

In [10]:
# Baseline planner configuration for this session: every (parameter, value)
# pair below is applied with a SET statement, in the listed order, and the
# transaction is then committed.
conn.rollback()
baseline_planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "off"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_hashagg", "on"),
    ("enable_sort", "on"),
    ("enable_partition_pruning", "off"),
    ("enable_partitionwise_join", "off"),
    ("enable_partitionwise_aggregate", "off"),
)
with conn.cursor() as cur:
    for parameter, value in baseline_planner_settings:
        cur.execute(f"SET {parameter} = {value};")
    conn.commit()

In [8]:
# Aggregation over lineitem: sums, averages and a row count per
# (l_returnflag, l_linestatus) group, restricted to rows whose l_shipdate is on
# or before 1998-12-01 minus 90 days, ordered by the grouping columns.
# (Looks like TPC-H Q1 — TODO confirm.)
query_1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

# Revenue per customer from line items with l_returnflag = 'R' on orders placed
# in the three months starting 1993-10-01; joins customer, orders, lineitem and
# nation, and sorts by revenue descending.
# (Looks like TPC-H Q10 — TODO confirm.)
query_10 = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

# Percentage of discounted revenue that comes from parts whose p_type starts
# with 'PROMO', over line items shipped in the month starting 1995-09-01
# (lineitem joined to part on the part key).
# (Looks like TPC-H Q14 — TODO confirm.)
query_14 = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

# Sum of l_extendedprice divided by 7.0 for line items of 'Brand#23' parts in
# 'MED BOX' containers whose quantity is below 20% of the average quantity for
# the same part (correlated subquery on lineitem).
# (Looks like TPC-H Q17 — TODO confirm.)
query_17 = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            p_partkey = l_partkey
    );
"""


In [8]:
# Plan only: analyze = False makes explain_analyze issue EXPLAIN without ANALYZE.
# NOTE(review): the output recorded below contains "actual time" figures, so it
# appears to come from an earlier analyze=True run — confirm before comparing.
explain_analyze(query_1, analyze = False)

Finalize GroupAggregate  (cost=2300104.98..2300106.93 rows=6 width=236) (actual time=59925.710..59938.396 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2300104.98..2300106.38 rows=12 width=236) (actual time=59925.686..59938.354 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2299104.95..2299104.97 rows=6 width=236) (actual time=59910.550..59910.551 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2299104.74..2299104.87 rows=6 width=236) (actual time=59910.509..59910.514 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

In [9]:
# Plan only: analyze = False makes explain_analyze issue EXPLAIN without ANALYZE.
# NOTE(review): the output recorded below contains "actual time" figures, so it
# appears to come from an earlier analyze=True run — confirm before comparing.
explain_analyze(query_10, analyze = False)

Sort  (cost=3344325.21..3345720.08 rows=557947 width=279) (actual time=50605.739..50650.388 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=2942891.02..3146127.82 rows=557947 width=279) (actual time=48503.664..50219.044 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=2942891.02..3132179.14 rows=557947 width=259) (actual time=48503.649..49758.638 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=2942890.72..3107071.53 rows=557947 width=259) (actual time=48503.588..49479.230 rows=1147084 loops=1)
                    ->  Merge Join  (cost=2942890.57..3093377.71 rows=557947 width=1

In [10]:
# Plan only: analyze = False makes explain_analyze issue EXPLAIN without ANALYZE.
# NOTE(review): the output recorded below contains "actual time" figures, so it
# appears to come from an earlier analyze=True run — confirm before comparing.
explain_analyze(query_14, analyze = False)

Aggregate  (cost=1800922.34..1800922.35 rows=1 width=32) (actual time=38352.426..38352.462 rows=1 loops=1)
  ->  Merge Join  (cost=1676444.04..1786617.50 rows=817419 width=33) (actual time=37591.652..38222.425 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92913.43 rows=2000000 width=25) (actual time=0.879..422.172 rows=1999994 loops=1)
        ->  Sort  (cost=1676443.20..1678486.75 rows=817419 width=16) (actual time=37590.732..37626.491 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24288kB
              ->  Gather  (cost=1000.00..1582197.72 rows=817419 width=16) (actual time=1.168..36369.465 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on lineitem  (cost=0.00..1499455.82 rows=340591 width=16) (actual time=1.262..36981.023 rows=249741 

We set hashjoin and hashaggregate off, we only analyze the situation with indexes.

## Query 1

Trying to put an index on (l_returnflag, l_linestatus). Since these columns have few distinct values (3 and 2 respectively), a bitmap index would be a natural choice, but PostgreSQL does not implement bitmap indexes.

Let's use a B-tree, since a hash index cannot be built on a pair of columns.

Maybe we can leverage bitmap scan.

I tried to create an index on the pair (l_returnflag, l_linestatus) but it was not used by the optimizer.

Let's see if with an index on shipdate we get better results.

In [13]:
# Build an index on lineitem.l_shipdate and report how long the build took.
# IF NOT EXISTS keeps the cell re-runnable: without it a second run raises
# "relation already exists" and leaves the transaction aborted.  When the
# index is already present the statement does nothing, so the printed time is
# only meaningful for the first (real) build.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter is a monotonic clock, so the interval cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX IF NOT EXISTS idx_l_shipdate ON lineitem (l_shipdate);")
    end_time = time.perf_counter()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    conn.commit()

Time to create index on lineitem: 49.96327495574951 seconds


In [14]:
# Report the size of idx_l_shipdate, once divided by 1024**2 (MB) and once by
# 1024**3 (GB).
with conn.cursor() as cur:
    cur.execute("SELECT pg_relation_size('idx_l_shipdate');")
    index_size = cur.fetchone()[0]
    for unit, bytes_per_unit in (("MB", 1024**2), ("GB", 1024**3)):
        print(f"Size of idx_l_shipdate ON lineitem (l_shipdate): {index_size/bytes_per_unit} {unit}")

Size of idx_l_shipdate ON lineitem (l_shipdate): 397.546875 MB
Size of idx_l_shipdate ON lineitem (l_shipdate): 0.3882293701171875 GB


In [15]:
# Planner configuration for the indexed run of query_1: the pairs are applied
# with SET in the listed order and committed, then the query is explained.
conn.rollback()
query_1_planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "off"),
    ("enable_mergejoin", "off"),
    ("enable_hashjoin", "on"),
    ("enable_sort", "on"),
)
with conn.cursor() as cur:
    for parameter, value in query_1_planner_settings:
        cur.execute(f"SET {parameter} = {value};")
    conn.commit()

explain_analyze(query_1)

Finalize GroupAggregate  (cost=2958736.61..2958738.57 rows=6 width=236) (actual time=35196.474..35199.506 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2958736.61..2958738.01 rows=12 width=236) (actual time=35196.439..35199.444 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2957736.59..2957736.60 rows=6 width=236) (actual time=35192.556..35192.557 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2957736.37..2957736.51 rows=6 width=236) (actual time=35192.515..35192.523 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Usage: 24kB
              

At the end of the day, the index on (l_returnflag, l_linestatus) is not used for sorting nor grouping, so I would not use it. 

On the contrary, we can see an improvement when the index on l_shipdate is used with a bitmap scan. If we use an index-only scan instead, we get worse results.

What I would suggest is to keep the index on l_shipdate since it may help us also in query 14, even if it has low selectivity and probably it won't help much.

Validate and record the size of the table

In [9]:
# Apply the planner configuration for query_1 (SET statements in the listed
# order, then commit).
conn.rollback()
with conn.cursor() as cur:
    for parameter, value in (
        ("enable_seqscan", "off"),
        ("enable_indexscan", "on"),
        ("enable_bitmapscan", "on"),
        ("enable_indexonlyscan", "off"),
        ("enable_tidscan", "off"),
        ("enable_material", "off"),
        ("enable_nestloop", "off"),
        ("enable_mergejoin", "off"),
        ("enable_hashjoin", "on"),
        ("enable_sort", "on"),
    ):
        cur.execute(f"SET {parameter} = {value};")
    conn.commit()

# Materialise query_1's result in a temporary table, show its first row and
# report the table's total size.
with conn.cursor() as cur:

    # Remove any temp_result left behind by an interrupted earlier run, so the
    # CREATE below cannot fail with "relation already exists".  The pg_temp
    # qualifier restricts the drop to this session's temporary schema.
    cur.execute("DROP TABLE IF EXISTS pg_temp.temp_result;")
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_1};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_1 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 ('A', 'F', Decimal('377518399'), Decimal('566065727797.25'), Decimal('537759104278.0656'), Decimal('559276670892.116819'), Decimal('25.5009751030070973'), Decimal('38237.151008958546'), Decimal('0.05000657454024320463'), 14804077)
Size of query_1 result table: 0.015625 MB


In [None]:
# Report the total size of the lineitem table (as returned by
# pg_total_relation_size).  The original cell ran the query but never fetched
# or displayed the value.
with conn.cursor() as cur:
    cur.execute("SELECT pg_total_relation_size('lineitem');")
    lineitem_size = cur.fetchone()[0]
    print(f"Size of lineitem table: {lineitem_size/(1024**2)} MB")
    print(f"Size of lineitem table: {lineitem_size/(1024**3)} GB")

In [11]:
# Original estimate: l_shipdate has selectivity 90/(6*12*365) = 0.003424657534,
# so an index may be useful.
# NOTE(review): query_1 filters l_shipdate <= DATE '1998-12-01' - 90 days, i.e.
# it keeps the rows on or before the cutoff; the 90-day window is what is
# excluded.  The selected fraction is therefore presumably close to 1 rather
# than 0.003 (and 6*12*365 multiplies years, months and days) — verify against
# the actual date range of lineitem.
# Since the predicate is a range comparison (<=), a hash index can't be used.
# We may use an inverted list.

# Tried with the btree_gin extension, but it gave worse results; the code is
# kept below, disabled inside a string literal.

"""
conn.rollback()
with conn.cursor() as cur:
    cur.execute("CREATE EXTENSION IF NOT EXISTS btree_gin;")
    cur.execute("CREATE INDEX idx_lineitem_shipdate ON lineitem USING gin (l_shipdate);")

"""
# We may also try an index on (l_returnflag, l_linestatus), since they are used in the GROUP BY and ORDER BY clauses.

Time to create gin index on lineitem (l_shipdate): 70.45620393753052 seconds


## Query 10

o_orderdate is quite selective (3 months out of 7 × 12 months), so an index on it will probably be beneficial.

ordering by revenue, which is computed in the query, can't be optimised.
what can be optimised is the join.

l_returnflag is not selective, so we don't put an index on it.

In [16]:
# Build an index on orders.o_orderdate and report how long the build took.
# IF NOT EXISTS keeps the cell re-runnable: without it a second run raises
# "relation already exists" and leaves the transaction aborted.  When the
# index is already present the statement does nothing, so the printed time is
# only meaningful for the first (real) build.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter is a monotonic clock suited to measuring intervals.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX IF NOT EXISTS idx_o_orderdate ON orders (o_orderdate);")
    end_time = time.perf_counter()
    print(f"Time to create idx_o_orderdate ON orders (o_orderdate): {end_time - start_time} seconds")

    conn.commit()

Time to create idx_o_orderdate ON orders (o_orderdate): 10.395975828170776 seconds


In [17]:
# Planner configuration for the indexed run of query_10: the pairs are applied
# with SET in the listed order and committed, then the query is explained.
query_10_planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_tidscan", "off"),
    ("enable_material", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
)
with conn.cursor() as cur:
    for parameter, value in query_10_planner_settings:
        cur.execute(f"SET {parameter} = {value};")
    conn.commit()

explain_analyze(query_10)

Sort  (cost=10003291048.47..10003292443.34 rows=557947 width=279) (actual time=38244.429..38295.235 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  Finalize GroupAggregate  (cost=10002923518.50..10003092851.07 rows=557947 width=279) (actual time=36721.926..37842.118 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Gather Merge  (cost=10002923518.50..10003081227.18 rows=464956 width=279) (actual time=36721.920..37559.885 rows=450558 loops=1)
              Workers Planned: 2
              Workers Launched: 2
              ->  Partial GroupAggregate  (cost=10002922518.47..10003026559.69 rows=232478 width=279) (actual time=36698.604..37499.219 rows=150186 loops=3)
                    Group Key: customer.c_custkey, nation.n_name
                    ->  Incremental Sort  (cost=10002922518.47..10003020747.74 rows=232478 width=259) (actual time=

It is interesting to note that the plan leverages an index on l_returnflag, because we have an index on (l_returnflag, l_linestatus). That index may be dropped, however, so this should not be relied upon.

Validate and record the size of the table

In [9]:
# Materialise query_10's result in a temporary table, show its first row and
# report the table's total size.
with conn.cursor() as cur:

    # Remove any temp_result left behind by an interrupted earlier run, so the
    # CREATE below cannot fail with "relation already exists".  The pg_temp
    # qualifier restricts the drop to this session's temporary schema.
    cur.execute("DROP TABLE IF EXISTS pg_temp.temp_result;")
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_10};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_10 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (1237537, 'Customer#001237537', Decimal('884989.6657'), Decimal('7840.17'), 'RUSSIA                   ', 'FNG6WgB1mopyyY,ajQTU qUPW5o', '32-367-120-4327', 'nag carefully about the regular packages. carefully reg')
Size of query_10 result table: 78.5078125 MB


## Query 14

shipdate is selective and we know that we have already an index on it, so we can leverage it 

we know we have a btree index in both l_partkey, p_partkey

In [19]:
# Planner configuration for the indexed run of query_14: the pairs are applied
# with SET in the listed order and committed, then the query is explained.
conn.rollback()
query_14_planner_settings = (
    ("enable_seqscan", "on"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
)
with conn.cursor() as cur:
    for parameter, value in query_14_planner_settings:
        cur.execute(f"SET {parameter} = {value};")

    conn.commit()

explain_analyze(query_14)

Aggregate  (cost=1752182.01..1752182.03 rows=1 width=32) (actual time=17888.858..17889.252 rows=1 loops=1)
  ->  Merge Join  (cost=1627703.71..1737877.17 rows=817419 width=33) (actual time=17072.889..17683.330 rows=749223 loops=1)
        Merge Cond: (part.p_partkey = lineitem.l_partkey)
        ->  Index Scan using part_pkey on part  (cost=0.43..92913.43 rows=2000000 width=25) (actual time=0.154..317.587 rows=1999994 loops=1)
        ->  Sort  (cost=1627702.87..1629746.42 rows=817419 width=16) (actual time=17072.726..17133.166 rows=749223 loops=1)
              Sort Key: lineitem.l_partkey
              Sort Method: external sort  Disk: 24288kB
              ->  Gather  (cost=12155.11..1533457.40 rows=817419 width=16) (actual time=69.304..16334.072 rows=749223 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Bitmap Heap Scan on lineitem  (cost=11155.11..1450715.50 rows=340591 width=16) (actual time=61.670..16657.2

Using sort seems to improve the result by 5 seconds.

The bitmap scan improves the time by 9 seconds.

as it is clear, the index on lineitem is not used.

Validate and record size result table

In [20]:
# Materialise query_14's result in a temporary table, show its first row and
# report the table's total size.
with conn.cursor() as cur:

    # Remove any temp_result left behind by an interrupted earlier run, so the
    # CREATE below cannot fail with "relation already exists".  The pg_temp
    # qualifier restricts the drop to this session's temporary schema.
    cur.execute("DROP TABLE IF EXISTS pg_temp.temp_result;")
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_14};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchone()
    print(f"First row: \n {result}")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_14 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('16.6475949416150953'),)
Size of query_14 result table: 0.015625 MB


## Query 17

This query takes a lot of time if we don't use indexes.

In [5]:
# List the indexes currently defined on lineitem before adding new ones.
check_indexes('lineitem')

Index Name: idx_l_shipdate
Index Definition: CREATE INDEX idx_l_shipdate ON public.lineitem USING btree (l_shipdate)



In [6]:
# Build an index on lineitem.l_partkey and report how long the build took.
# IF NOT EXISTS keeps the cell re-runnable: without it a second run raises
# "relation already exists" and leaves the transaction aborted.  When the
# index is already present the statement does nothing, so the printed time is
# only meaningful for the first (real) build.
conn.rollback()
with conn.cursor() as cur:

    # perf_counter is a monotonic clock suited to measuring intervals.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX IF NOT EXISTS idx_l_partkey ON lineitem (l_partkey);")
    end_time = time.perf_counter()
    print(f"Time to create idx_l_partkey ON lineitem (l_partkey): {end_time - start_time} seconds")
    conn.commit()

Time to create idx_l_partkey ON lineitem (l_partkey): 53.212462186813354 seconds


We can see that it is already a fast query, but we can also try indexes on container and brand to see whether they speed it up. These columns have 40 and 25 distinct values respectively, so they are not very selective, but it is worth a try.

In [14]:
conn.rollback()
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_p_brand ON part USING hash (p_brand);")    
    end_time = time.time()
    print(f"Time to create index on part: {end_time - start_time} seconds")
    conn.commit()

Time to create index on lineitem: 55.932111978530884 seconds


In [15]:
conn.rollback()
with conn.cursor() as cur:

    start_time = time.time()
    cur.execute("CREATE INDEX idx_p_container ON part USING hash (p_container);")    
    end_time = time.time()
    print(f"Time to create index on part: {end_time - start_time} seconds")
    conn.commit()

Time to create index on lineitem: 36.967782974243164 seconds


In [16]:
# List the indexes now defined on part to confirm the two hash indexes exist.
check_indexes("part")

Index Name: part_pkey
Index Definition: CREATE UNIQUE INDEX part_pkey ON public.part USING btree (p_partkey)

Index Name: idx_p_brand
Index Definition: CREATE INDEX idx_p_brand ON public.part USING hash (p_brand)

Index Name: idx_p_container
Index Definition: CREATE INDEX idx_p_container ON public.part USING hash (p_container)



In [9]:
# Planner configuration for the indexed run of query_17: the pairs are applied
# with SET in the listed order and committed, then the query is explained
# with ANALYZE.
conn.rollback()
query_17_planner_settings = (
    ("enable_seqscan", "off"),
    ("enable_indexscan", "on"),
    ("enable_bitmapscan", "on"),
    ("enable_indexonlyscan", "off"),
    ("enable_nestloop", "on"),
    ("enable_mergejoin", "on"),
    ("enable_hashjoin", "off"),
    ("enable_sort", "on"),
    ("enable_hashagg", "on"),
)
with conn.cursor() as cur:
    for parameter, value in query_17_planner_settings:
        cur.execute(f"SET {parameter} = {value};")
    conn.commit()

explain_analyze(query_17, analyze=True)

Aggregate  (cost=9428617.44..9428617.45 rows=1 width=32) (actual time=13964.683..13964.684 rows=1 loops=1)
  ->  Nested Loop  (cost=0.87..9428567.40 rows=20015 width=8) (actual time=16.833..13963.177 rows=5526 loops=1)
        ->  Index Scan using part_pkey on part  (cost=0.43..102913.43 rows=2002 width=4) (actual time=1.922..402.807 rows=2044 loops=1)
              Filter: ((p_brand = 'Brand#23'::bpchar) AND (p_container = 'MED BOX'::bpchar))
              Rows Removed by Filter: 1997956
        ->  Index Scan using idx_l_partkey on lineitem  (cost=0.44..4658.06 rows=11 width=17) (actual time=6.407..6.633 rows=3 loops=2044)
              Index Cond: (l_partkey = part.p_partkey)
              Filter: (l_quantity < (SubPlan 1))
              Rows Removed by Filter: 27
              SubPlan 1
                ->  Aggregate  (cost=137.10..137.11 rows=1 width=32) (actual time=0.198..0.198 rows=1 loops=61385)
                      ->  Index Scan using idx_l_partkey on lineitem lineitem_1  (c

We get only a small improvement, so I don't think it is worth keeping those indexes.

In [None]:
"""

with conn.cursor() as cur:
    
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

"""

## Part 2: Indexes

Some useful commands:

To create an index:

default is b+tree
CREATE INDEX idx_customer_name ON customer (c_name);

available indexes:

B-tree: The default and most common type of index.

Hash: Used for equality comparisons.

GIN (Generalized Inverted Index): Useful for indexing array values and full-text search.

GiST (Generalized Search Tree): Supports many types of queries, including full-text search.

SP-GiST (Space-Partitioned Generalized Search Tree): Useful for partitioning data.

BRIN (Block Range INdexes): Efficient for large tables where the column values are correlated with their physical location.

other possibilities:

Partial Indexes
Description: Indexes only a portion of a table, based on a condition.
Use Case: When you frequently query a subset of rows.

CREATE INDEX idx_active_customers ON customer (c_name) WHERE active = true;

Expression Indexes
Description: Indexes the result of an expression or function rather than a raw column.
Use Case: When queries involve expressions or function calls.

CREATE INDEX idx_lower_customer_name ON customer ((lower(c_name)));


To disable the indexscan

SET enable_seqscan = on;
SET enable_indexscan = off;
SET enable_bitmapscan = off;

In [9]:
# Build a hash index on lineitem.l_partkey and report how long the build took.
# IF NOT EXISTS keeps the cell re-runnable: without it a second run raises
# "relation already exists" and leaves the transaction aborted.  When the
# index is already present the statement does nothing, so the printed time is
# only meaningful for the first (real) build.
with conn.cursor() as cur:

    # perf_counter is a monotonic clock suited to measuring intervals.
    start_time = time.perf_counter()
    cur.execute("CREATE INDEX IF NOT EXISTS idx_lineitem_partkey ON lineitem USING hash (l_partkey);")
    end_time = time.perf_counter()
    print(f"Time to create index on lineitem: {end_time - start_time} seconds")

    conn.commit()

Time to create index on lineitem: 146.5280566215515 seconds


In [10]:
# Report the size of idx_lineitem_partkey, once divided by 1024**2 (MB) and
# once by 1024**3 (GB).
with conn.cursor() as cur:
    cur.execute("SELECT pg_relation_size('idx_lineitem_partkey');")
    index_size = cur.fetchone()[0]
    for unit, bytes_per_unit in (("MB", 1024**2), ("GB", 1024**3)):
        print(f"Size of index on lineitem: {index_size/bytes_per_unit} {unit}")


Size of index on lineitem: 1896.65625 MB
Size of index on lineitem: 1.852203369140625 GB


In [11]:
# Explain query_17 with ANALYZE through the shared helper instead of repeating
# its body inline; explain_analyze performs the rollback, runs
# EXPLAIN ANALYZE and prints one plan line per row.
print("EXPLAIN ANALYZE result:")
explain_analyze(query_17)

EXPLAIN ANALYZE result:
Aggregate  (cost=1937805.96..1937805.97 rows=1 width=32) (actual time=240896.545..240896.670 rows=1 loops=1)
  ->  Hash Join  (cost=54683.72..1937756.72 rows=19695 width=8) (actual time=1156.271..240892.848 rows=5526 loops=1)
        Hash Cond: (lineitem.l_partkey = part.p_partkey)
        Join Filter: (lineitem.l_quantity < (SubPlan 1))
        Rows Removed by Join Filter: 55859
        ->  Seq Scan on lineitem  (cost=0.00..1724403.52 rows=59986052 width=17) (actual time=0.335..23018.323 rows=59986052 loops=1)
        ->  Hash  (cost=54659.10..54659.10 rows=1970 width=4) (actual time=989.198..989.319 rows=2044 loops=1)
              Buckets: 2048  Batches: 1  Memory Usage: 88kB
              ->  Gather  (cost=1000.00..54659.10 rows=1970 width=4) (actual time=3.643..975.057 rows=2044 loops=1)
                    Workers Planned: 2
                    Workers Launched: 2
                    ->  Parallel Seq Scan on part  (cost=0.00..53462.10 rows=821 width=4) (ac

Very good! A query that could barely be executed before now completes in about 4 minutes. The price, however, is a huge 1.85 GB index.

In [None]:
# Materialise query_17's result in a temporary table, print every row and
# report the table's total size.
with conn.cursor() as cur:

    # Remove any temp_result left behind by an interrupted earlier run, so the
    # CREATE below cannot fail with "relation already exists".  The pg_temp
    # qualifier restricts the drop to this session's temporary schema.
    cur.execute("DROP TABLE IF EXISTS pg_temp.temp_result;")
    cur.execute(f"CREATE TEMP TABLE temp_result AS {query_17};")
    cur.execute("SELECT * FROM temp_result;")
    result = cur.fetchall()
    for row in result:
        print(row)

    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(f"Size of query_17 result table: {size[0][0]/(1024**2)} MB")
    cur.execute("DROP TABLE temp_result;")

First row: 
 (Decimal('3295493.512857142857'),)
Size of query_17 result table: 0.015625 MB
