In [1]:
import os

import psycopg2

In [2]:
# Establish a connection to the PostgreSQL database.
# Connection settings are taken from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGHOST, PGPASSWORD, PGPORT) so that real credentials do
# not have to be stored in the notebook. When a variable is not set, the local
# development value this notebook has always used is the fallback.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD", "password"),
    # Environment values are strings; the port is passed as an integer as before.
    port=int(os.environ.get("PGPORT", "5432")),
)

## First step
Measure the size and the execution time of each query without any additional supporting structures, and record the size of the result set.


In [8]:
# Sample query whose execution plan and timing we want to inspect.

query = "SELECT * FROM partsupp WHERE ps_partkey > 1000000;"

with conn.cursor() as cur:

    # Prefix the query with EXPLAIN ANALYZE and fetch every row of the reply.
    explain_statement = "EXPLAIN ANALYZE " + query
    cur.execute(explain_statement)
    explain_result = cur.fetchall()

    # Print the first column of each returned row, one row per line.
    print("EXPLAIN ANALYZE result:")
    for row in explain_result:
        plan_line = row[0]
        print(plan_line)

EXPLAIN ANALYZE result:
Seq Scan on partsupp  (cost=0.00..274371.09 rows=3956580 width=143) (actual time=456.497..1024.624 rows=4000000 loops=1)
  Filter: (ps_partkey > 1000000)
  Rows Removed by Filter: 4000000
Planning Time: 0.083 ms
Execution Time: 1149.538 ms


In the `cost=0.00..274371.09` field of the plan above, the first value is the startup cost and the second is the total cost.

Startup Cost: This represents the amount of work the query planner estimates is required before the first row can be returned. For a sequential scan (Seq Scan), this value is typically very low or zero because the first row can be returned almost immediately.

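Total Cost: This represents the total estimated cost to execute the entire query, i.e. to run the plan to completion and return all rows. It is the sum of the startup cost and the cost to process all rows. Both costs are expressed in the planner's arbitrary cost units, not in milliseconds; the measured times are reported separately in the `actual time` field and in the `Execution Time` line.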

## Size of the result set

In [9]:
# Measure the size of the result set by materialising it into a temporary
# table and asking PostgreSQL for that table's total relation size.
with conn.cursor() as cur:

    # Remove a leftover temp table from an earlier, interrupted run so the cell
    # can be re-executed. The pg_temp qualifier restricts the drop to this
    # session's temporary schema, so a permanent table that happens to be
    # called temp_result can never be dropped by accident.
    cur.execute("DROP TABLE IF EXISTS pg_temp.temp_result;")
    cur.execute("CREATE TEMP TABLE temp_result AS SELECT * FROM partsupp WHERE ps_partkey > 1000000;")
    cur.execute("SELECT pg_total_relation_size('temp_result');")
    size = cur.fetchall()
    print(size)
    # Clean up so the temporary table does not linger for the rest of the session.
    cur.execute("DROP TABLE temp_result;")

[(714612736,)]


In [11]:
# Alternative size estimate: sum pg_column_size over every row of the result
# set, without materialising it into a table.
size_query = (
    "SELECT SUM(pg_column_size(t)) "
    "FROM (SELECT * FROM partsupp WHERE ps_partkey > 1000000) AS t;"
)

with conn.cursor() as cur:
    cur.execute(size_query)
    # The query returns a single row with a single column: the summed size.
    (result_size,) = cur.fetchone()
    print(result_size)


677530499


## How can we compute the size needed to execute the query?