In [1]:
import os

import psycopg2

In [2]:
# Establish a connection to the PostgreSQL database.
# The password is intentionally not stored in the notebook; it is read from the
# PGPASSWORD environment variable. If the variable is unset, None is passed
# (NOTE(review): psycopg2 is expected to drop None-valued keyword arguments so
# libpq can fall back to its own lookup such as ~/.pgpass; confirm for the
# installed version).
conn = psycopg2.connect(
    dbname="dw_cs",
    user="postgres",
    host="localhost",
    password=os.environ.get("PGPASSWORD"),
    port=5432,
)

In [3]:
def collect_stats(table : str, attributes : list) -> None:
    """Print row count, distinct count, minimum and maximum per column.

    Args:
        table: Name of the table to inspect. It is interpolated directly into
            the SQL text, so it must be a trusted identifier.
        attributes: Column names of ``table`` to profile. They are interpolated
            into the SQL text the same way as ``table``.

    Returns:
        None. One formatted report per attribute is written to stdout; each
        value is shown as the raw ``fetchall()`` result.

    Uses the module-level ``conn`` and rolls back whatever transaction is
    currently open on it before querying (presumably to recover from a
    previously failed statement; confirm).
    """
    conn.rollback()
    if not attributes:
        return

    with conn.cursor() as cur:
        # The total row count does not depend on the attribute, so it is
        # queried once up front instead of once per column.
        cur.execute(f"SELECT COUNT(*) FROM {table};")
        all_rows = cur.fetchall()

        for attribute in attributes:
            cur.execute(f"SELECT COUNT (DISTINCT {attribute}) FROM {table};")
            distinct_rows = cur.fetchall()

            # Named *_value so the built-in max()/min() are not shadowed.
            cur.execute(f"SELECT MAX ({attribute}) FROM {table};")
            max_value = cur.fetchall()

            cur.execute(f"SELECT MIN ({attribute}) FROM {table};")
            min_value = cur.fetchall()

            print(
                f">> Attribute      : {attribute}\n"
                f"   All Rows       : {all_rows}\n"
                f"   Distinct Rows  : {distinct_rows}\n"
                f"   Min            : {min_value}\n"
                f"   Max            : {max_value}\n"
            )
            

def collect_size(table : str) -> None:
    """Print the storage footprint of ``table`` in MB.

    Four figures are reported, each taken from the PostgreSQL function named
    in the query below: relation size, table size, index size and total
    relation size.

    Args:
        table: Name of the table to measure. It is passed as a bound query
            parameter and cast to ``regclass`` by the server.

    Returns:
        None. The four sizes are written to stdout.

    Uses the module-level ``conn`` and rolls back whatever transaction is
    currently open on it before querying.
    """
    conn.rollback()
    with conn.cursor() as cur:
        # One round trip for all four figures; the table name is bound as a
        # parameter rather than spliced into the SQL string.
        cur.execute(
            """
            SELECT pg_relation_size(%(table)s::regclass),
                   pg_table_size(%(table)s::regclass),
                   pg_indexes_size(%(table)s::regclass),
                   pg_total_relation_size(%(table)s::regclass);
            """,
            {"table": table},
        )
        relation_bytes, table_bytes, index_bytes, total_bytes = cur.fetchone()

    bytes_per_mb = 1024**2
    print(f"Relation size: {relation_bytes / bytes_per_mb:.2f} MB")
    print(f"Table (relation + TOAST) size: {table_bytes / bytes_per_mb:.2f} MB")
    print(f"Index size for table {table}: {index_bytes / bytes_per_mb:.2f} MB")
    print(f"Total size (relation + TOAST + index): {total_bytes / bytes_per_mb:.2f} MB \n")

In [4]:
# Report the total size of the database this connection is attached to.
conn.rollback()
with conn.cursor() as cur:

    # current_database() follows the active connection, so the database name
    # does not have to be repeated (and kept in sync) here.
    cur.execute("SELECT pg_database_size(current_database());")
    database_bytes = cur.fetchone()[0]
    print(f"Database size: {database_bytes / (1024**2):.2f} MB")
    print(f"Database size: {database_bytes / (1024**3):.2f} GB \n")


Database size: 14681.63 MB
Database size: 14.34 GB 



In [5]:
# Print the index size of every table and the total over all of them, in MB.
conn.rollback()
with conn.cursor() as cur:

    # Deliberately not called `sum`: a cell-level name would shadow the
    # built-in sum() for every cell executed afterwards.
    total_index_mb = 0
    for table in ["customer", "lineitem", "region", "nation", "part", "supplier", "partsupp", "orders"]:

        # The table name is bound as a parameter instead of being formatted
        # into the SQL string.
        cur.execute("SELECT pg_indexes_size(%s::regclass);", (table,))
        index_mb = cur.fetchone()[0] / (1024**2)
        print(f"Index size for table {table}: {index_mb:.2f} MB")
        total_index_mb += index_mb
    print(f"Total index size: {total_index_mb:.2f} MB \n")

Index size for table customer: 32.15 MB
Index size for table lineitem: 1285.72 MB
Index size for table region: 0.02 MB
Index size for table nation: 0.02 MB
Index size for table part: 42.86 MB
Index size for table supplier: 2.16 MB
Index size for table partsupp: 172.32 MB
Index size for table orders: 321.33 MB
Total index size: 1856.56 MB 



In [7]:
# Sizes first, then per-column statistics for the region table.
collect_size(table="region")
collect_stats(
    table="region",
    attributes=["r_regionkey", "r_name", "r_comment"],
)

Relation size: 0.01 MB
Table (relation + TOAST) size: 0.01 MB
Index size for table region: 0.02 MB
Total size (relation + TOAST + index): 0.02 MB 

>> Attribute      : r_regionkey
   All Rows       : [(5,)]
   Distinct Rows  : [(5,)]
   Min            : [(0,)]
   Max            : [(4,)]

>> Attribute      : r_name
   All Rows       : [(5,)]
   Distinct Rows  : [(5,)]
   Min            : [('AFRICA                   ',)]
   Max            : [('MIDDLE EAST              ',)]

>> Attribute      : r_comment
   All Rows       : [(5,)]
   Distinct Rows  : [(5,)]
   Min            : [('ges. thinly even pinto beans ca',)]
   Max            : [('uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl',)]



In [8]:
# Sizes first, then per-column statistics for the nation table.
collect_size(table="nation")
collect_stats(
    table="nation",
    attributes=["n_nationkey", "n_name", "n_regionkey", "n_comment"],
)

Relation size: 0.01 MB
Table (relation + TOAST) size: 0.01 MB
Index size for table nation: 0.02 MB
Total size (relation + TOAST + index): 0.02 MB 

>> Attribute      : n_nationkey
   All Rows       : [(25,)]
   Distinct Rows  : [(25,)]
   Min            : [(0,)]
   Max            : [(24,)]

>> Attribute      : n_name
   All Rows       : [(25,)]
   Distinct Rows  : [(25,)]
   Min            : [('ALGERIA                  ',)]
   Max            : [('VIETNAM                  ',)]

>> Attribute      : n_regionkey
   All Rows       : [(25,)]
   Distinct Rows  : [(5,)]
   Min            : [(0,)]
   Max            : [(4,)]

>> Attribute      : n_comment
   All Rows       : [(25,)]
   Distinct Rows  : [(25,)]
   Min            : [(' haggle. carefully final deposits detect slyly agai',)]
   Max            : [('y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be',)]



In [9]:
# Sizes first, then per-column statistics for the orders table.
# Column names are passed exactly as given, including their spelling.
collect_size(table="orders")
collect_stats(
    table="orders",
    attributes=[
        "o_orderkey",
        "o_custkey",
        "o_orederstatus",
        "o_totalprice",
        "o_orderdate",
        "o_orederpriority",
        "o_clerk",
        "o_shippriority",
        "o_comment",
    ],
)

Relation size: 2038.38 MB
Table (relation + TOAST) size: 2038.97 MB
Index size for table orders: 321.33 MB
Total size (relation + TOAST + index): 2360.30 MB 

>> Attribute      : o_orderkey
   All Rows       : [(15000000,)]
   Distinct Rows  : [(15000000,)]
   Min            : [(1,)]
   Max            : [(60000000,)]

>> Attribute      : o_custkey
   All Rows       : [(15000000,)]
   Distinct Rows  : [(999982,)]
   Min            : [(1,)]
   Max            : [(1499999,)]

>> Attribute      : o_orederstatus
   All Rows       : [(15000000,)]
   Distinct Rows  : [(3,)]
   Min            : [('F',)]
   Max            : [('P',)]

>> Attribute      : o_totalprice
   All Rows       : [(15000000,)]
   Distinct Rows  : [(11944103,)]
   Min            : [(Decimal('838.05'),)]
   Max            : [(Decimal('558822.56'),)]

>> Attribute      : o_orderdate
   All Rows       : [(15000000,)]
   Distinct Rows  : [(2406,)]
   Min            : [(datetime.date(1992, 1, 1),)]
   Max            : [(datetime

In [None]:
# Sizes first, then per-column statistics for the customer table.
collect_size(table="customer")
collect_stats(
    table="customer",
    attributes=[
        "c_custkey",
        "c_name",
        "c_address",
        "c_nationkey",
        "c_phone",
        "c_acctbal",
        "c_mktsegment",
        "c_comment",
    ],
)

Relation size: 290.05 MB
Table (relation + TOAST) size: 290.17 MB
Index size for table customer: 32.15 MB
Total size (relation + TOAST + index): 322.32 MB 

>> Attribute      : c_custkey
   All Rows       : [(1500000,)]
   Distinct Rows  : [(1500000,)]
   Min            : [(1,)]
   Max            : [(1500000,)]

>> Attribute      : c_name
   All Rows       : [(1500000,)]
   Distinct Rows  : [(1500000,)]
   Min            : [('Customer#000000001',)]
   Max            : [('Customer#001500000',)]

>> Attribute      : c_address
   All Rows       : [(1500000,)]
   Distinct Rows  : [(1500000,)]
   Min            : [('   ,qJqVsHDVWLs6mv6S7Hwh9H',)]
   Max            : [('zzzbtVPaB5eL7AFB07nVjHFMa51j2UMU',)]

>> Attribute      : c_nationkey
   All Rows       : [(1500000,)]
   Distinct Rows  : [(25,)]
   Min            : [(0,)]
   Max            : [(24,)]

>> Attribute      : c_phone
   All Rows       : [(1500000,)]
   Distinct Rows  : [(1499963,)]
   Min            : [('10-100-106-1617',)]
   

In [None]:
# Sizes first, then per-column statistics for the supplier table.
collect_size(table="supplier")
collect_stats(
    table="supplier",
    attributes=[
        "s_suppkey",
        "s_name",
        "s_address",
        "s_nationkey",
        "s_phone",
        "s_acctbal",
        "s_comment",
    ],
)

Relation size: 17.31 MB
Table (relation + TOAST) size: 17.35 MB
Index size for table supplier: 2.16 MB
Total size (relation + TOAST + index): 19.51 MB 

>> Attribute      : s_suppkey
   All Rows       : [(100000,)]
   Distinct Rows  : [(100000,)]
   Min            : [(1,)]
   Max            : [(100000,)]

>> Attribute      : s_name
   All Rows       : [(100000,)]
   Distinct Rows  : [(100000,)]
   Min            : [('Supplier#000000001       ',)]
   Max            : [('Supplier#000100000       ',)]

>> Attribute      : s_address
   All Rows       : [(100000,)]
   Distinct Rows  : [(100000,)]
   Min            : [('  , Jd6qNPDAgz',)]
   Max            : [('zzyu4VZw4LGgCMMJG8Yr',)]

>> Attribute      : s_nationkey
   All Rows       : [(100000,)]
   Distinct Rows  : [(25,)]
   Min            : [(0,)]
   Max            : [(24,)]

>> Attribute      : s_phone
   All Rows       : [(100000,)]
   Distinct Rows  : [(100000,)]
   Min            : [('10-100-166-6237',)]
   Max            : [('34-9

In [None]:
# Sizes first, then per-column statistics for the part table.
collect_size(table="part")
collect_stats(
    table="part",
    attributes=[
        "p_partkey",
        "p_name",
        "p_mfgr",
        "p_brand",
        "p_type",
        "p_size",
        "p_container",
        "p_retailprice",
        "p_comment",
    ],
)

Relation size: 320.02 MB
Table (relation + TOAST) size: 320.14 MB
Index size for table part: 42.86 MB
Total size (relation + TOAST + index): 363.00 MB 

>> Attribute      : p_partkey
   All Rows       : [(2000000,)]
   Distinct Rows  : [(2000000,)]
   Min            : [(1,)]
   Max            : [(2000000,)]

>> Attribute      : p_name
   All Rows       : [(2000000,)]
   Distinct Rows  : [(1999828,)]
   Min            : [('almond antique aquamarine blanched floral',)]
   Max            : [('yellow white wheat peach aquamarine',)]

>> Attribute      : p_mfgr
   All Rows       : [(2000000,)]
   Distinct Rows  : [(5,)]
   Min            : [('Manufacturer#1           ',)]
   Max            : [('Manufacturer#5           ',)]

>> Attribute      : p_brand
   All Rows       : [(2000000,)]
   Distinct Rows  : [(25,)]
   Min            : [('Brand#11  ',)]
   Max            : [('Brand#55  ',)]

>> Attribute      : p_type
   All Rows       : [(2000000,)]
   Distinct Rows  : [(150,)]
   Min         

In [None]:
# Sizes first, then per-column statistics for the partsupp table.
collect_size(table="partsupp")
collect_stats(
    table="partsupp",
    attributes=[
        "ps_partkey",
        "ps_suppkey",
        "ps_availqty",
        "ps_supplycost",
        "ps_comment",
    ],
)

Relation size: 1362.39 MB
Table (relation + TOAST) size: 1362.80 MB
Index size for table partsupp: 172.32 MB
Total size (relation + TOAST + index): 1535.12 MB 

>> Attribute      : ps_partkey
   All Rows       : [(8000000,)]
   Distinct Rows  : [(2000000,)]
   Min            : [(1,)]
   Max            : [(2000000,)]

>> Attribute      : ps_suppkey
   All Rows       : [(8000000,)]
   Distinct Rows  : [(100000,)]
   Min            : [(1,)]
   Max            : [(100000,)]

>> Attribute      : ps_availqty
   All Rows       : [(8000000,)]
   Distinct Rows  : [(9999,)]
   Min            : [(1,)]
   Max            : [(9999,)]

>> Attribute      : ps_supplycost
   All Rows       : [(8000000,)]
   Distinct Rows  : [(99901,)]
   Min            : [(Decimal('1.0'),)]
   Max            : [(Decimal('1000.0'),)]

>> Attribute      : ps_comment
   All Rows       : [(8000000,)]
   Distinct Rows  : [(7914164,)]
   Min            : [(' Tiresias about the deposits lose bold ideas. furiously final packages

In [4]:
# Sizes first, then per-column statistics for the lineitem table.
collect_size(table="lineitem")
collect_stats(
    table="lineitem",
    attributes=[
        "l_orderkey",
        "l_partkey",
        "l_suppkey",
        "l_linenumber",
        "l_quantity",
        "l_extendedprice",
        "l_discount",
        "l_tax",
        "l_returnflag",
        "l_linestatus",
        "l_shipdate",
        "l_commitdate",
        "l_receiptdate",
        "l_shipinstruct",
        "l_shipmode",
        "l_comment",
    ],
)

Relation size: 8785.49 MB
Table (relation + TOAST) size: 8787.95 MB
Index size for table lineitem: 1285.72 MB
Total size (relation + TOAST + index): 10073.67 MB 

>> Attribute      : l_orderkey
   All Rows       : [(59986052,)]
   Distinct Rows  : [(15000000,)]
   Min            : [(1,)]
   Max            : [(60000000,)]

>> Attribute      : l_partkey
   All Rows       : [(59986052,)]
   Distinct Rows  : [(2000000,)]
   Min            : [(1,)]
   Max            : [(2000000,)]

>> Attribute      : l_suppkey
   All Rows       : [(59986052,)]
   Distinct Rows  : [(100000,)]
   Min            : [(1,)]
   Max            : [(100000,)]

>> Attribute      : l_linenumber
   All Rows       : [(59986052,)]
   Distinct Rows  : [(7,)]
   Min            : [(1,)]
   Max            : [(7,)]

>> Attribute      : l_quantity
   All Rows       : [(59986052,)]
   Distinct Rows  : [(50,)]
   Min            : [(Decimal('1'),)]
   Max            : [(Decimal('50'),)]

>> Attribute      : l_extendedprice
   All