In [1]:
import os

import pandas as pd
import psycopg2
from psycopg2 import sql, extras

In [2]:
# Establish a connection to the PostgreSQL database.
# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGHOST, PGPASSWORD, PGPORT) so that credentials do not
# have to live in the notebook. The previous hard-coded values are kept only
# as fallbacks, so the cell still works unchanged on the original machine.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "dw_cs"),
    user=os.environ.get("PGUSER", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    password=os.environ.get("PGPASSWORD", "password"),
    port=int(os.environ.get("PGPORT", 5432)),
)

## List all tables

In [3]:
# Ask the catalog for every table that lives outside PostgreSQL's own schemas.
with conn.cursor() as cur:
    cur.execute("SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema'")
    tables = cur.fetchall()

# Each result row is a 1-tuple holding the table name.
for (table_name,) in tables:
    print(table_name)

partsupp
supplier
customer
region
nation
part
orders
lineitem


## PART 1 : **CREATE TABLES**

In [4]:
# DDL for the customer table. c_custkey is the primary key; c_nationkey is
# declared as a plain INT here and only becomes a foreign key through a later
# ALTER TABLE statement in this notebook.
query_create_customer = """
CREATE TABLE IF NOT EXISTS public.customer (
    c_custkey INT PRIMARY KEY, -- Identifier SF*150,000 are populated
    c_name VARCHAR(25),
    c_address VARCHAR(40),
    c_nationkey INT, -- Identifier Foreign Key to n_nationkey
    c_phone CHAR(15),
    c_acctbal DECIMAL,
    c_mktsegment CHAR(10),
    c_comment VARCHAR(117)
)
"""


# DDL for the lineitem table. No primary key is declared in this statement;
# foreign keys and CHECK constraints are attached afterwards with ALTER TABLE.
query_create_lineitem = """
CREATE TABLE IF NOT EXISTS public.lineitem (
    l_orderkey INT,
    l_partkey INT,
    l_suppkey INT,
    l_linenumber INT,
    l_quantity DECIMAL,
    l_extendedprice DECIMAL,
    l_discount DECIMAL,
    l_tax DECIMAL,
    l_returnflag CHAR(1),
    l_linestatus CHAR(1),
    l_shipdate DATE,
    l_commitdate DATE,
    l_receiptdate DATE,
    l_shipinstruct CHAR(25),
    l_shipmode CHAR(10),
    l_comment VARCHAR(44)
)
"""


# DDL for the region table, keyed by r_regionkey.
# r_name is CHAR(25), so stored names are blank-padded to 25 characters.
query_create_region = """
CREATE TABLE IF NOT EXISTS public.region (
    r_regionkey INT PRIMARY KEY, -- identifier 5 regions are populated
    r_name CHAR(25),
    r_comment VARCHAR(152)
)
"""


# DDL for the nation table, keyed by n_nationkey. n_regionkey is a plain INT
# here; its foreign key to region is added by a later ALTER TABLE statement.
query_create_nation = """
CREATE TABLE IF NOT EXISTS public.nation (
    n_nationkey INT PRIMARY KEY, -- identifier 25 nations are populated
    n_name CHAR(25),
    n_regionkey INT, -- Foreign Key to r_regionkey
    n_comment VARCHAR(152)
)
"""


# DDL for the part table, keyed by p_partkey.
query_create_part = """
CREATE TABLE IF NOT EXISTS public.part (
    p_partkey INT PRIMARY KEY, -- identifier SF*200,000 are populated
    p_name VARCHAR(55),
    p_mfgr CHAR(25),
    p_brand CHAR(10),
    p_type VARCHAR(25),
    p_size INT,
    p_container CHAR(10),
    p_retailprice DECIMAL,
    p_comment VARCHAR(23)
)
"""


# DDL for the supplier table, keyed by s_suppkey. s_nationkey is declared as a
# plain INT in this statement (no REFERENCES clause).
query_create_supplier = """
CREATE TABLE IF NOT EXISTS public.supplier (
    s_suppkey INT PRIMARY KEY, -- identifier SF*10,000 are populated
    s_name CHAR(25),
    s_address VARCHAR(40),
    s_nationkey INT, -- Identifier Foreign Key to n_nationkey
    s_phone CHAR(15),
    s_acctbal DECIMAL,
    s_comment VARCHAR(101)
)
"""


# DDL for the partsupp table (one row per part/supplier pair).
# The composite primary key is required, not optional: lineitem's foreign key
# on (l_partkey, l_suppkey) references exactly this column pair, and PostgreSQL
# rejects a foreign key whose referenced columns are not covered by a
# PRIMARY KEY or UNIQUE constraint.
query_create_partsupp = """
CREATE TABLE IF NOT EXISTS public.partsupp (
    ps_partkey INT,
    ps_suppkey INT,
    ps_availqty INT,
    ps_supplycost DECIMAL,
    ps_comment VARCHAR(199),
    PRIMARY KEY (ps_partkey, ps_suppkey)
)
"""


# DDL for the orders table, keyed by o_orderkey.
# NOTE(review): the column names o_orederstatus and o_orederpriority are
# misspelled ("oreder"). The INSERT statement that loads orders.csv uses the
# same spelling, so any rename has to be applied to both places together.
query_create_orders = """
CREATE TABLE IF NOT EXISTS public.orders (
    o_orderkey INT PRIMARY KEY,
    o_custkey INT,
    o_orederstatus CHAR(1),
    o_totalprice DECIMAL,
    o_orderdate DATE,
    o_orederpriority CHAR(15),
    o_clerk CHAR(15),
    o_shippriority INT,
    o_comment VARCHAR(79)
)
"""

In [5]:
# Run every CREATE TABLE statement in one transaction, in the same order as
# before, and commit once at the end.
create_queries = (
    query_create_customer,
    query_create_lineitem,
    query_create_region,
    query_create_nation,
    query_create_part,
    query_create_supplier,
    query_create_partsupp,
    query_create_orders,
)

with conn.cursor() as cur:
    for create_query in create_queries:
        cur.execute(create_query)

    conn.commit()

In [6]:
# Re-list the user tables to confirm the CREATE TABLE statements took effect.
with conn.cursor() as cur:
    cur.execute("SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema'")
    tables = cur.fetchall()

# Each result row is a 1-tuple holding the table name.
for (table_name,) in tables:
    print(table_name)

lineitem
region
nation
supplier
part
partsupp
customer
orders


## PART 2 : **ADD MISSING CONSTRAINTS**

### FOREIGN KEYS

In [7]:
# Foreign-key constraints, all declared with ON DELETE/UPDATE CASCADE so that
# changes to a referenced row propagate to the referencing rows.

# customer.c_nationkey -> nation.n_nationkey
query_const_foreign_01 = """
ALTER TABLE customer
ADD CONSTRAINT fk_c_nationkey
FOREIGN KEY (c_nationkey) REFERENCES nation(n_nationkey)
ON DELETE CASCADE ON UPDATE CASCADE
"""

# lineitem.l_orderkey -> orders.o_orderkey
query_const_foreign_02 = """
ALTER TABLE lineitem
ADD CONSTRAINT fk_l_orderkey
FOREIGN KEY (l_orderkey) REFERENCES orders(o_orderkey)
ON DELETE CASCADE ON UPDATE CASCADE
"""

# lineitem.(l_partkey, l_suppkey) -> partsupp.(ps_partkey, ps_suppkey)
# PostgreSQL only accepts this if the referenced column pair on partsupp is
# covered by a PRIMARY KEY or UNIQUE constraint.
query_const_foreign_03 = """
ALTER TABLE lineitem
ADD CONSTRAINT fk_l_partsupp
FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp(ps_partkey, ps_suppkey)
ON DELETE CASCADE ON UPDATE CASCADE
"""

# nation.n_regionkey -> region.r_regionkey
query_const_foreign_04 = """
ALTER TABLE nation
ADD CONSTRAINT fk_n_regionkey
FOREIGN KEY (n_regionkey) REFERENCES region(r_regionkey)
ON DELETE CASCADE ON UPDATE CASCADE
"""

In [8]:
# Apply the four named foreign-key constraints in order, then commit them
# together.
foreign_key_queries = (
    query_const_foreign_01,
    query_const_foreign_02,
    query_const_foreign_03,
    query_const_foreign_04,
)

with conn.cursor() as cur:
    for foreign_key_query in foreign_key_queries:
        cur.execute(foreign_key_query)

    conn.commit()

### CHECKS

In [9]:
# CHECK constraints. Statements 01-06 require the key columns to be
# non-negative; 07-10 do the same for sizes, prices, quantities and tax;
# 11 bounds the discount and orders the ship/receipt dates.

# Part table: p_partkey must be >= 0
query_const_check_01 = """
ALTER TABLE part
ADD CONSTRAINT part_partkey_check CHECK (p_partkey >= 0);
"""

# Supplier table: s_suppkey must be >= 0
query_const_check_02 = """
ALTER TABLE supplier
ADD CONSTRAINT supplier_suppkey_check CHECK (s_suppkey >= 0);
"""

# Customer table: c_custkey must be >= 0
query_const_check_03 = """
ALTER TABLE customer
ADD CONSTRAINT customer_custkey_check CHECK (c_custkey >= 0);
"""

# Partsupp table: ps_partkey must be >= 0
query_const_check_04 = """
ALTER TABLE partsupp
ADD CONSTRAINT partsupp_partkey_check CHECK (ps_partkey >= 0);
"""

# Region table: r_regionkey must be >= 0
query_const_check_05 = """
ALTER TABLE region
ADD CONSTRAINT region_regionkey_check CHECK (r_regionkey >= 0);
"""

# Nation table: n_nationkey must be >= 0
query_const_check_06 = """
ALTER TABLE nation
ADD CONSTRAINT nation_nationkey_check CHECK (n_nationkey >= 0);
"""

# Part table: size and retail price must be >= 0
query_const_check_07 = """
ALTER TABLE part
ADD CONSTRAINT part_size_check CHECK (p_size >= 0),
ADD CONSTRAINT part_retailprice_check CHECK (p_retailprice >= 0);
"""

# Partsupp table: available quantity and supply cost must be >= 0
query_const_check_08 = """
ALTER TABLE partsupp
ADD CONSTRAINT partsupp_availqty_check CHECK (ps_availqty >= 0),
ADD CONSTRAINT partsupp_supplycost_check CHECK (ps_supplycost >= 0);
"""

# Orders table: total price must be >= 0
query_const_check_09 = """
ALTER TABLE orders
ADD CONSTRAINT orders_totalprice_check CHECK (o_totalprice >= 0);
"""

# Lineitem table: quantity, extended price and tax must be >= 0
query_const_check_10 = """
ALTER TABLE lineitem
ADD CONSTRAINT lineitem_quantity_check CHECK (l_quantity >= 0),
ADD CONSTRAINT lineitem_extendedprice_check CHECK (l_extendedprice >= 0),
ADD CONSTRAINT lineitem_tax_check CHECK (l_tax >= 0);
"""

# Lineitem table: discount must lie in [0.00, 1.00] and an item cannot be
# received before it is shipped
query_const_check_11 = """
ALTER TABLE lineitem
ADD CONSTRAINT lineitem_discount_range_check CHECK (l_discount >= 0.00 AND l_discount <= 1.00),
ADD CONSTRAINT lineitem_shipdate_receiptdate_check CHECK (l_shipdate <= l_receiptdate);
"""

In [10]:
# Apply the eleven CHECK-constraint statements in numeric order and commit
# them as a single transaction.
check_queries = (
    query_const_check_01,
    query_const_check_02,
    query_const_check_03,
    query_const_check_04,
    query_const_check_05,
    query_const_check_06,
    query_const_check_07,
    query_const_check_08,
    query_const_check_09,
    query_const_check_10,
    query_const_check_11,
)

with conn.cursor() as cur:
    for check_query in check_queries:
        cur.execute(check_query)

    conn.commit()

# Populate Tables

In [11]:
# region.csv is pipe-separated and has no header row, so columns are positional.
df = pd.read_csv(
    "/Users/marcozamp/Desktop/TCP-H/region.csv",
    sep="|",
    header=None,
)

In [12]:
data = df.to_numpy()

# Parameterised statement; the three placeholders map to the first three
# positional columns of each CSV row.
insert_region = "INSERT INTO region (r_regionkey, r_name, r_comment) VALUES (%s, %s, %s)"

# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    for row in data:
        region_values = (row[0], row[1], row[2])
        cur.execute(insert_region, region_values)

    conn.commit()

In [4]:
# Start from a clean transaction, then read back everything stored in region.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT * FROM region;")
    rows = cur.fetchall()

# One tuple per line.
for row in rows:
    print(row)

(0, 'AFRICA                   ', 'lar deposits. blithely final packages cajole. regular waters are final requests. regular accounts are according to ')
(1, 'AMERICA                  ', 'hs use ironic, even requests. s')
(2, 'ASIA                     ', 'ges. thinly even pinto beans ca')
(3, 'EUROPE                   ', 'ly final courts cajole furiously final excuse')
(4, 'MIDDLE EAST              ', 'uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl')


In [14]:
# nation.csv is pipe-separated and has no header row, so columns are positional.
df = pd.read_csv(
    "/Users/marcozamp/Desktop/TCP-H/nation.csv",
    sep="|",
    header=None,
)

In [15]:
# Eyeball the parsed nation file before loading it. Columns are labelled
# 0..3 because the CSV was read with header=None.
print(df)

     0               1  2                                                  3
0    0         ALGERIA  0   haggle. carefully final deposits detect slyly...
1    1       ARGENTINA  1  al foxes promise slyly according to the regula...
2    2          BRAZIL  1  y alongside of the pending deposits. carefully...
3    3          CANADA  1  eas hang ironic, silent packages. slyly regula...
4    4           EGYPT  4  y above the carefully unusual theodolites. fin...
5    5        ETHIOPIA  0                    ven packages wake quickly. regu
6    6          FRANCE  3             refully final requests. regular, ironi
7    7         GERMANY  3  l platelets. regular accounts x-ray: unusual, ...
8    8           INDIA  2  ss excuses cajole slyly across the packages. d...
9    9       INDONESIA  2   slyly express asymptotes. regular deposits ha...
10  10            IRAN  4  efully alongside of the slyly final dependenci...
11  11            IRAQ  4  nic deposits boost atop the quickly final requ...

In [16]:
data = df.to_numpy()

# Parameterised statement; the four placeholders map to the first four
# positional columns of each CSV row.
insert_nation = "INSERT INTO nation (n_nationkey, n_name, n_regionkey, n_comment) VALUES (%s, %s, %s, %s)"

# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    for row in data:
        nation_values = (row[0], row[1], row[2], row[3])
        cur.execute(insert_nation, nation_values)
    conn.commit()

In [5]:
# Read back everything stored in nation.
with conn.cursor() as cur:
    cur.execute("SELECT * FROM nation;")
    rows = cur.fetchall()

# One tuple per line.
for row in rows:
    print(row)

(0, 'ALGERIA                  ', 0, ' haggle. carefully final deposits detect slyly agai')
(1, 'ARGENTINA                ', 1, 'al foxes promise slyly according to the regular accounts. bold requests alon')
(2, 'BRAZIL                   ', 1, 'y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special ')
(3, 'CANADA                   ', 1, 'eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold')
(4, 'EGYPT                    ', 4, 'y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d')
(5, 'ETHIOPIA                 ', 0, 'ven packages wake quickly. regu')
(6, 'FRANCE                   ', 3, 'refully final requests. regular, ironi')
(7, 'GERMANY                  ', 3, 'l platelets. regular accounts x-ray: unusual, regular acco')
(8, 'INDIA                    ', 2, 'ss excuses cajole slyly across the packages. deposits print aroun')
(9, 'INDONESIA

In [19]:
# Load customer.csv (pipe-separated, no header row) and bulk-insert it.
df = pd.read_csv("/Users/marcozamp/Desktop/TCP-H/customer.csv", delimiter="|", header=None)

data = df.to_numpy()
print(len(data))

# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    # execute_values folds many rows into each INSERT statement, replacing one
    # client/server round trip per row with one per page of rows.
    # The generator keeps memory flat: rows become tuples one page at a time.
    extras.execute_values(
        cur,
        "INSERT INTO customer (c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment) VALUES %s",
        (tuple(row[:8]) for row in data),
        page_size=1000,
    )
    conn.commit()

1500000


In [6]:
# Start from a clean transaction, then count the loaded customer rows.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM customer;")
    rows = cur.fetchall()

# COUNT(*) returns a single row, printed as a 1-tuple.
for row in rows:
    print(row)

(1500000,)


In [21]:
# Load orders.csv (pipe-separated, no header row) and bulk-insert it.
df = pd.read_csv("/Users/marcozamp/Desktop/TCP-H/orders.csv", delimiter="|", header=None)

data = df.to_numpy()
print(len(data))

# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    # execute_values folds many rows into each INSERT statement, replacing one
    # client/server round trip per row with one per page of rows.
    # The column names o_orederstatus / o_orederpriority are spelled exactly as
    # in the CREATE TABLE statement for orders; they must stay in sync.
    extras.execute_values(
        cur,
        "INSERT INTO orders (o_orderkey, o_custkey, o_orederstatus, o_totalprice, o_orderdate, o_orederpriority, o_clerk, o_shippriority, o_comment) VALUES %s",
        (tuple(row[:9]) for row in data),
        page_size=1000,
    )
    conn.commit()

15000000


In [7]:
# Count the loaded orders rows.
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM orders;")
    rows = cur.fetchall()

# fetchall() yields a list with a single 1-tuple holding the count.
print(rows)

[(15000000,)]


In [None]:
# Load supplier.csv (pipe-separated, no header row) and bulk-insert it.
df_supp = pd.read_csv("/Users/marcozamp/Desktop/TCP-H/supplier.csv", delimiter="|", header=None)

data = df_supp.to_numpy()
print(len(df_supp))

# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    # execute_values folds many rows into each INSERT statement, replacing one
    # client/server round trip per row with one per page of rows.
    extras.execute_values(
        cur,
        "INSERT INTO supplier (s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment) VALUES %s",
        (tuple(row[:7]) for row in data),
        page_size=1000,
    )
    conn.commit()

In [8]:
# Start from a clean transaction. The call parentheses matter: a bare
# `conn.rollback` only names the method and rolls nothing back.
conn.rollback()
with conn.cursor() as cur:
    # Count the loaded supplier rows.
    cur.execute("SELECT COUNT(*) FROM supplier;")
    rows = cur.fetchall()
    print(rows)

[(100000,)]


In [32]:
# Load part.csv (pipe-separated, no header row) and bulk-insert it.
df_part = pd.read_csv("/Users/marcozamp/Desktop/TCP-H/part.csv", delimiter="|", header=None)

data = df_part.to_numpy()
print(len(data))

# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    # execute_values folds many rows into each INSERT statement, replacing one
    # client/server round trip per row with one per page of rows.
    extras.execute_values(
        cur,
        "INSERT INTO part (p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment) VALUES %s",
        (tuple(row[:9]) for row in data),
        page_size=1000,
    )
    conn.commit()

2000000


In [9]:
# Start from a clean transaction, then count the loaded part rows.
conn.rollback()
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM part;")
    rows = cur.fetchall()

# fetchall() yields a list with a single 1-tuple holding the count.
print(rows)

[(2000000,)]


In [None]:
# partsupp.csv is pipe-separated and has no header row, so columns are positional.
df_partsupp = pd.read_csv(
    "/Users/marcozamp/Desktop/TCP-H/partsupp.csv",
    sep="|",
    header=None,
)

# Row-wise array consumed by the insert cell that follows; print its row count.
data = df_partsupp.to_numpy()
print(len(data))

In [None]:
# Bulk-insert the partsupp rows held in `data` (built by the preceding cell).
# Clear any transaction left open (or aborted) by a previous cell first.
conn.rollback()
with conn.cursor() as cur:
    # execute_values folds many rows into each INSERT statement, replacing one
    # client/server round trip per row with one per page of rows.
    extras.execute_values(
        cur,
        "INSERT INTO partsupp (ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment) VALUES %s",
        (tuple(row[:5]) for row in data),
        page_size=1000,
    )
    conn.commit()

In [13]:
# Count the loaded partsupp rows.
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM partsupp;")
    rows = cur.fetchall()

# fetchall() yields a list with a single 1-tuple holding the count.
print(rows)

[(8000000,)]


In [14]:
# lineitem is split across ten pipe-separated, header-less files:
# lineitem_1.csv .. lineitem_10.csv.
file_path = "/Users/marcozamp/Desktop/TCP-H/lineitem_{}.csv"

# Accumulate the row count under its own name instead of `sum`, so the
# built-in sum() is not shadowed for the rest of the kernel session.
total_rows = 0
for i in range(1, 11):
    frame = pd.read_csv(file_path.format(i), delimiter="|", header=None)

    # Each chunk is still published as the global df1 .. df10, because the
    # lineitem insert cell looks the frames up by those names.
    globals()[f"df{i}"] = frame
    total_rows += len(frame)

print(total_rows)

59986052


In [20]:
# Bulk-insert the ten lineitem chunks (globals df1 .. df10), committing after
# each chunk so a failure does not discard chunks that already loaded.
with conn.cursor() as cur:
    for i in range(1, 11):
        print(i)  # progress marker: index of the chunk being loaded
        # execute_values folds many rows into each INSERT statement, replacing
        # one client/server round trip per row with one per page of rows.
        # The generator keeps memory flat: rows become tuples one page at a time.
        extras.execute_values(
            cur,
            "INSERT INTO lineitem (l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment) VALUES %s",
            (tuple(row[:16]) for row in globals()[f"df{i}"].to_numpy()),
            page_size=1000,
        )
        conn.commit()

In [21]:
# Count the loaded lineitem rows.
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM lineitem;")
    rows = cur.fetchall()

# fetchall() yields a list with a single 1-tuple holding the count.
print(rows)

[(59986052,)]


## Add primary and foreign keys

In [None]:
# Primary-key columns per table, in the order the keys are added.
# lineitem is deliberately left out: a primary key on
# (l_orderkey, l_linenumber) is not needed there and is very expensive to
# build on a table of that size.
primary_key_columns = {
    "part": ("p_partkey",),
    "supplier": ("s_suppkey",),
    "partsupp": ("ps_partkey", "ps_suppkey"),
    "customer": ("c_custkey",),
    "orders": ("o_orderkey",),
    "nation": ("n_nationkey",),
    "region": ("r_regionkey",),
}

# Returns a row when the given table already has a primary-key constraint
# (contype 'p' in the pg_constraint catalog).
query_has_primary = """
SELECT 1
FROM pg_constraint
WHERE conrelid = %s::regclass AND contype = 'p'
"""

# Template for adding a primary key; identifiers are filled in via psycopg2.sql.
query_add_primary = sql.SQL("ALTER TABLE {table} ADD PRIMARY KEY ({columns})")

# NOTE(review): customer->nation, lineitem->orders and nation->region are also
# added, as named cascading constraints, by the FOREIGN KEYS cell earlier in
# this notebook. Re-adding them here does not appear to raise an error, but it
# leaves redundant constraints behind; consider keeping only one set.
query_add_foreign = """
ALTER TABLE PARTSUPP ADD FOREIGN KEY (ps_partkey)  REFERENCES PART (p_partkey);
ALTER TABLE PARTSUPP ADD FOREIGN KEY (ps_suppkey)  REFERENCES SUPPLIER (s_suppkey);
ALTER TABLE CUSTOMER ADD FOREIGN KEY (c_nationkey) REFERENCES NATION (n_nationkey);
ALTER TABLE ORDERS   ADD FOREIGN KEY (o_custkey)   REFERENCES CUSTOMER (c_custkey);
ALTER TABLE LINEITEM ADD FOREIGN KEY (l_orderkey)  REFERENCES ORDERS (o_orderkey);
ALTER TABLE LINEITEM ADD FOREIGN KEY (l_partkey)   REFERENCES PART (p_partkey);
ALTER TABLE LINEITEM ADD FOREIGN KEY (l_suppkey)   REFERENCES SUPPLIER (s_suppkey);
ALTER TABLE NATION   ADD FOREIGN KEY (n_regionkey) REFERENCES REGION (r_regionkey);
ALTER TABLE SUPPLIER ADD FOREIGN KEY (s_nationkey) REFERENCES NATION (n_nationkey);
"""

with conn.cursor() as cur:

    # A table can have only one primary key, and several of the CREATE TABLE
    # statements in this notebook already declare one, so add a key only
    # where none exists yet.
    for table, columns in primary_key_columns.items():
        cur.execute(query_has_primary, (table,))
        if cur.fetchone() is None:
            cur.execute(
                query_add_primary.format(
                    table=sql.Identifier(table),
                    columns=sql.SQL(", ").join(map(sql.Identifier, columns)),
                )
            )

    cur.execute(query_add_foreign)

    # Without an explicit commit the DDL above is discarded when the
    # connection is closed.
    conn.commit()

In [22]:
# `cur` is whatever cursor the most recently executed `with conn.cursor()`
# cell left bound at module level.
cur.close()
# Close the database connection opened at the top of the notebook.
conn.close()