In [1]:
import os

import psycopg2

In [2]:
# Establish a connection to the PostgreSQL database.
# Settings are read from the standard libpq environment variables so that no
# credential is stored in the notebook. The non-secret settings fall back to
# the local defaults used for this analysis.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    # Deliberately no fallback for the password: when PGPASSWORD is unset this
    # is None, psycopg2 drops the parameter, and libpq can still use ~/.pgpass.
    password=os.environ.get("PGPASSWORD"),
    port=int(os.environ.get("PGPORT", "5432")),
)

## First step
Compute size and time for executing the queries without additional structure support. Record the size of the result set.


In the `EXPLAIN` output, every plan node reports `cost=<startup>..<total>`: the first value is the startup cost, the second the total cost.

Startup Cost: This represents the amount of work the query planner estimates is required before the first row can be returned. For a sequential scan (Seq Scan), this value is typically very low or zero because the first row can be returned almost immediately.

Total Cost: This represents the total estimated cost to execute the entire query. It is the sum of the startup cost and the cost to process all rows.

## size of the result set

In [9]:
# Measure the on-disk size of the result set by materialising it into a
# temporary table and asking PostgreSQL for that table's total relation size.
# `with conn` commits when the block succeeds and rolls back when it raises, so
# a failure cannot leave the connection stuck in an aborted transaction, and
# the rollback also undoes the CREATE so temp_result is never left behind.
with conn, conn.cursor() as cur:
    cur.execute("CREATE TEMP TABLE temp_result AS SELECT * FROM partsupp WHERE ps_partkey > 1000000;")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(size)
    # Drop explicitly: a temp table would otherwise live until the session ends.
    cur.execute("DROP TABLE temp_result;")

[(714612736,)]


In [11]:
# Sum the per-row size reported by pg_column_size over the filtered partsupp
# rows, without materialising them into a table.
RESULT_SIZE_SQL = "SELECT SUM(pg_column_size(t)) FROM (SELECT * FROM partsupp WHERE ps_partkey > 1000000) AS t;"

with conn.cursor() as cur:
    cur.execute(RESULT_SIZE_SQL)
    # The aggregate returns exactly one row with one column.
    result_size = cur.fetchone()[0]

print(result_size)


677530499


## Open question: how do we compute the size needed to execute the query?

## Query 1

In [6]:
# Per (l_returnflag, l_linestatus) sums, averages and row count over lineitem
# rows shipped on or before 1998-12-01 minus 90 days.
query = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATE '1998-12-01' - INTERVAL '90' DAY
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""

# Discard whatever transaction state earlier cells left on the connection.
conn.rollback()
with conn.cursor() as cur:
    # EXPLAIN ANALYZE executes the statement and returns the plan annotated
    # with measured timings, one single-column row per line of text.
    cur.execute(f"EXPLAIN ANALYZE {query}")
    explain_result = cur.fetchall()

print("EXPLAIN ANALYZE result:")
for (plan_line,) in explain_result:
    print(plan_line)

EXPLAIN ANALYZE result:
Finalize GroupAggregate  (cost=2300099.53..2300101.49 rows=6 width=236) (actual time=43110.727..43112.328 rows=4 loops=1)
  Group Key: l_returnflag, l_linestatus
  ->  Gather Merge  (cost=2300099.53..2300100.93 rows=12 width=236) (actual time=43110.702..43112.286 rows=12 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Sort  (cost=2299099.51..2299099.53 rows=6 width=236) (actual time=43063.000..43063.003 rows=4 loops=3)
              Sort Key: l_returnflag, l_linestatus
              Sort Method: quicksort  Memory: 27kB
              Worker 0:  Sort Method: quicksort  Memory: 27kB
              Worker 1:  Sort Method: quicksort  Memory: 27kB
              ->  Partial HashAggregate  (cost=2299099.30..2299099.43 rows=6 width=236) (actual time=43062.698..43062.702 rows=4 loops=3)
                    Group Key: l_returnflag, l_linestatus
                    Batches: 1  Memory Usage: 24kB
                    Worker 0:  Batches: 1  Memory Us

## Query 10

In [20]:
# Revenue per customer from line items flagged as returned (l_returnflag = 'R')
# on orders placed in the three months starting 1993-10-01, highest revenue first.
query = """
SELECT
    c_custkey,
    c_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
FROM
    customer,
    orders,
    lineitem,
    nation
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate >= DATE '1993-10-01'
    AND o_orderdate < DATE '1993-10-01' + INTERVAL '3' MONTH
    AND l_returnflag = 'R'
    AND c_nationkey = n_nationkey
GROUP BY
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
ORDER BY
    revenue DESC;
"""

# Discard whatever transaction state earlier cells left on the connection.
conn.rollback()
with conn.cursor() as cur:
    # EXPLAIN ANALYZE executes the statement and returns the plan annotated
    # with measured timings, one single-column row per line of text.
    cur.execute(f"EXPLAIN ANALYZE {query}")
    explain_result = cur.fetchall()

print("EXPLAIN ANALYZE result:")
for (plan_line,) in explain_result:
    print(plan_line)


EXPLAIN ANALYZE result:
Sort  (cost=2327515.18..2328977.13 rows=584782 width=280) (actual time=39383.437..39580.375 rows=381105 loops=1)
  Sort Key: (sum((lineitem.l_extendedprice * ('1'::numeric - lineitem.l_discount)))) DESC
  Sort Method: external merge  Disk: 71032kB
  ->  GroupAggregate  (cost=1995864.79..2119593.26 rows=584782 width=280) (actual time=37591.399..39124.192 rows=381105 loops=1)
        Group Key: customer.c_custkey, nation.n_name
        ->  Incremental Sort  (cost=1995864.79..2104973.71 rows=584782 width=260) (actual time=37591.387..38571.652 rows=1147084 loops=1)
              Sort Key: customer.c_custkey, nation.n_name
              Presorted Key: customer.c_custkey
              Full-sort Groups: 34124  Sort Method: quicksort  Average Memory: 31kB  Peak Memory: 31kB
              ->  Nested Loop  (cost=1995864.64..2078658.52 rows=584782 width=260) (actual time=37591.323..38242.717 rows=1147084 loops=1)
                    ->  Gather Merge  (cost=1995864.48..2063

## Query 14

In [4]:
# Percentage of discounted revenue that comes from parts whose p_type starts
# with 'PROMO', over line items shipped in the month starting 1995-09-01.
query = """
SELECT
    100.00 * SUM(CASE
        WHEN p_type LIKE 'PROMO%'
        THEN l_extendedprice * (1 - l_discount)
        ELSE 0
    END) / SUM(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
    lineitem,
    part
WHERE
    l_partkey = p_partkey
    AND l_shipdate >= DATE '1995-09-01'
    AND l_shipdate < DATE '1995-09-01' + INTERVAL '1' MONTH;

"""

# Discard whatever transaction state earlier cells left on the connection.
conn.rollback()
with conn.cursor() as cur:
    # EXPLAIN ANALYZE executes the statement and returns the plan annotated
    # with measured timings, one single-column row per line of text.
    cur.execute(f"EXPLAIN ANALYZE {query}")
    explain_result = cur.fetchall()

print("EXPLAIN ANALYZE result:")
for (plan_line,) in explain_result:
    print(plan_line)

EXPLAIN ANALYZE result:
Finalize Aggregate  (cost=1580641.59..1580641.60 rows=1 width=32) (actual time=33384.811..33435.602 rows=1 loops=1)
  ->  Gather  (cost=1580641.35..1580641.56 rows=2 width=64) (actual time=33384.668..33435.579 rows=3 loops=1)
        Workers Planned: 2
        Workers Launched: 2
        ->  Partial Aggregate  (cost=1579641.35..1579641.36 rows=1 width=64) (actual time=33279.857..33279.861 rows=1 loops=3)
              ->  Parallel Hash Join  (cost=65409.80..1574319.61 rows=304099 width=33) (actual time=32853.884..33209.200 rows=249741 loops=3)
                    Hash Cond: (lineitem.l_partkey = part.p_partkey)
                    ->  Parallel Seq Scan on lineitem  (cost=0.00..1499444.55 rows=304099 width=16) (actual time=2.552..27717.177 rows=249741 loops=3)
                          Filter: ((l_shipdate >= '1995-09-01'::date) AND (l_shipdate < '1995-10-01 00:00:00'::timestamp without time zone))
                          Rows Removed by Filter: 19745610
      

## Query 17

In [6]:
# Sum of l_extendedprice divided by 7.0 for 'Brand#23' parts in 'MED BOX'
# containers, keeping only line items whose quantity is below 20% of the
# average quantity for the same part (correlated subquery on lineitem).
query = """
SELECT
    SUM(l_extendedprice) / 7.0 AS avg_yearly
FROM
    lineitem,
    part
WHERE
    p_partkey = l_partkey
    AND p_brand = 'Brand#23'
    AND p_container = 'MED BOX'
    AND l_quantity < (
        SELECT
            0.2 * AVG(l_quantity)
        FROM
            lineitem
        WHERE
            l_partkey = p_partkey
    );
"""

# Discard whatever transaction state earlier cells left on the connection.
conn.rollback()
with conn.cursor() as cur:
    # EXPLAIN ANALYZE executes the statement and returns the plan annotated
    # with measured timings, one single-column row per line of text.
    cur.execute(f"EXPLAIN ANALYZE {query}")
    explain_result = cur.fetchall()

print("EXPLAIN ANALYZE result:")
for (plan_line,) in explain_result:
    print(plan_line)