# Imports

In [1]:
# imports

import psycopg2
import random
from psycopg2 import OperationalError
from faker import Faker
import os
from dotenv import load_dotenv

KEY TABLES

- [x] customer.customers
- [x] inventory.products
- [x] inventory.suppliers
- [x] mapping.states
- [x] employees.departments

SECONDARY TABLES

- [x] mapping.cities
- [x] customers.customer_addresses
- [x] inventory.shipments
- [x] employees.employees
- [] sales.orders
- [] sales.order_details
- [] sales.payments



# Functions

## 1. Primary Tables

**1.1 customers.customers**

cols: customer_id, first_name, last_name, email, phone

In [22]:
def generate_customers(cursor, num_customers=100):
    """Generates and inserts dummy data into the customers table."""
    
    fake = Faker()
    customers = []

    for _ in range(num_customers):
        first_name = fake.first_name()
        last_name = fake.last_name()
        email = fake.email()
        phone = fake.numerify('(###)###-####') # Define a pattern for a 10-digit phone number
        customers.append((first_name, last_name, email, phone))
    
    query = """
    INSERT INTO customers.customers (first_name, last_name, email, phone)
    VALUES (%s, %s, %s, %s)
    """

    cursor.executemany(query, customers)

    print("customers.customers populated successfully.")

**1.2 inventory.products**

cols: name, description, price, stock_quantity

In [23]:
def generate_products(cursor, num_products=50):

    fake = Faker()
    products = []

    for _ in range(num_products):
        name = fake.word().capitalize()  # Product name
        description = fake.sentence(nb_words=10)  # Short description
        price = round(random.uniform(10, 1000), 2)  # Price between 10 and 1000
        stock_quantity = fake.random_int(min=1, max=500)  # qty between 1 and 500
        products.append((name, description, price, stock_quantity))
    
    query = """
    INSERT INTO inventory.products (name, description, price, stock_quantity)
    VALUES (%s, %s, %s, %s)
    """

    cursor.executemany(query, products)
    
    print('inventory.products populated successfully.')


 **1.3 inventory.suppliers**

cols: supplier_id, name, contact_info

In [34]:
def generate_suppliers(cursor, num_suppliers=10):
    
    fake = Faker()
    suppliers = []
    
    for _ in range(1, num_suppliers + 1):
        name = fake.company()
        contact_info = fake.phone_number()
        suppliers.append((name, contact_info))
    
    query = """
        INSERT INTO inventory.suppliers(name, contact_info)
        VALUES (%s, %s)
        """
    
    cursor.executemany(query, suppliers)

    print(f'inventory.suppliers populated with {num_suppliers} suppliers successfully.')  

**1.4 mapping.states**

cols: state_name, state_abbr

In [39]:
def generate_US_states(cursor):
    
    us_states = [
            ('Alabama', 'AL'), ('Alaska', 'AK'), ('Arizona', 'AZ'), ('Arkansas', 'AR'),
            ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'),
            ('Florida', 'FL'), ('Georgia', 'GA'), ('Hawaii', 'HI'), ('Idaho', 'ID'),
            ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'), ('Kansas', 'KS'),
            ('Kentucky', 'KY'), ('Louisiana', 'LA'), ('Maine', 'ME'), ('Maryland', 'MD'),
            ('Massachusetts', 'MA'), ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'),
            ('Missouri', 'MO'), ('Montana', 'MT'), ('Nebraska', 'NE'), ('Nevada', 'NV'),
            ('New Hampshire', 'NH'), ('New Jersey', 'NJ'), ('New Mexico', 'NM'), ('New York', 'NY'),
            ('North Carolina', 'NC'), ('North Dakota', 'ND'), ('Ohio', 'OH'), ('Oklahoma', 'OK'),
            ('Oregon', 'OR'), ('Pennsylvania', 'PA'), ('Rhode Island', 'RI'), ('South Carolina', 'SC'),
            ('South Dakota', 'SD'), ('Tennessee', 'TN'), ('Texas', 'TX'), ('Utah', 'UT'),
            ('Vermont', 'VT'), ('Virginia', 'VA'), ('Washington', 'WA'), ('West Virginia', 'WV'),
            ('Wisconsin', 'WI'), ('Wyoming', 'WY'), ('District of Columbia', 'DC')
        ]
    
    query = """
    INSERT INTO mapping.states(state_name, state_abbr)
    VALUES (%s, %s)
    """
    
    cursor.executemany(query, us_states)

    print('mapping.states populated successfully.')

**1.5 employees.departments**

cols: name, manager_id

In [12]:
def generate_departments(cursor):
    departments_data = [
        ('HR', None), ('IT', None), ('Sales', None), ('Marketing', None), ('Finance', None)
    ]
    query = "INSERT INTO employees.departments(name, manager_id) VALUES (%s, %s)"
    
    cursor.executemany(query, departments_data)
    
    print("Departments data inserted successfully.")

## 2. Secondary Tables

**2.1 mapping.cities**

cols: city_state_id, city_name, state_id

In [55]:
def generate_cities(cursor, num_cities=100):
        
    fake = Faker()
    cities_data = []

    # state_ids
    query_state_ids = """
    SELECT state_id FROM mapping.states;
    """
    cursor.execute(query_state_ids)
    state_ids = [r[0] for r in cursor.fetchall()]
    if not state_ids:
        raise ValueError("mapping.states is empty")

    # generate data
    for _ in range(1, num_cities+1):
        city_name = fake.city()
        state_id = random.choice(state_ids)
        cities_data.append((city_name, state_id))

    # insert data
    query = """
    INSERT INTO mapping.cities (city_name, state_id)
    VALUES (%s, %s)
    """
    cursor.executemany(query, cities_data)

    # confirmation message
    print(f'mapping.cities populated with {num_cities} records successfully.')

**2.2 customers.customer_addresses**

cols: address_id, customer_id, street, city_state_id, zip_code, address_type

In [65]:
def generate_customer_addresses(cursor):
        
        fake = Faker()
        customer_addresses_data = []
        addy_st_types = ['Street', 'Avenue', 'Boulevard', 'Court', 'Drive', 'Lane', 'Place', 'Road', 'Terrace']

        # city_state_ids
        query_city_state_ids = """
        SELECT city_state_id FROM mapping.cities;
        """
        cursor.execute(query_city_state_ids)
        city_state_ids = [r[0] for r in cursor.fetchall()]
        if not city_state_ids:
            raise ValueError("mapping.cities is empty")

        # customer_ids
        query_customer_ids = """
        SELECT customer_id FROM customers.customers;
        """
        cursor.execute(query_customer_ids)
        customer_ids = [r[0] for r in cursor.fetchall()]
        if not customer_ids:
            raise ValueError("customers.customers is empty")

        # generate data
        for customer_id in customer_ids:
            customer_id = customer_id
            addy_st_number = fake.building_number()
            addy_st_name = fake.street_name()
            addy_st_type = random.choice(addy_st_types)
            street = f"{addy_st_number} {addy_st_name} {addy_st_type}"
            city_state_id = random.choice(city_state_ids)
            zip_code = fake.zipcode()

            # 1. Add billing address
            customer_addresses_data.append((customer_id, street, city_state_id, zip_code, 'billing'))

            # 2. Add shipping address: Randomly decide if shipping address is the same or different than billing address
            if random.choice([True, False]):
                customer_addresses_data.append((customer_id, street, city_state_id, zip_code, 'shipping'))
            else:
                # Generate different shipping address
                addy_st_number = fake.building_number()
                addy_st_name = fake.street_name()
                addy_st_type = random.choice(addy_st_types)
                street = f"{addy_st_number} {addy_st_name} {addy_st_type}"
                city_state_id = random.choice(city_state_ids)
                zip_code = fake.zipcode()
                customer_addresses_data.append((customer_id, street, city_state_id, zip_code, 'shipping'))

        # insert data
        query = f"""
        INSERT INTO customers.customer_addresses(customer_id, street, city_state_id, zip_code, address_type)
        VALUES (%s, %s, %s, %s, %s)
        """
        cursor.executemany(query, customer_addresses_data)

        # confirmation message
        print(f'customers.customer_addresses populated with {len(customer_addresses_data)} records successfully.')

**2.3 inventory.shipments**

cols: shipment_id, supplier_id, shipment_date, status (shipped, delivered, pending)

In [53]:
def generate_inventory_shipments(cursor, num_shipments=50):
    
    fake = Faker()
    shipments = []

    # supplier_ids
    query_supplier_ids = """
    SELECT supplier_id FROM inventory.suppliers
    """
    cursor.execute(query_supplier_ids)
    supplier_ids = [r[0] for r in cursor.fetchall()]
    if not supplier_ids:
        raise ValueError("inventory.suppliers is empty")
    
    # status options
    status_options = ['Shipped', 'Delivered', 'Pending']

    # generate data
    for _ in range(1, num_shipments + 1):
        supplier_id = random.choice(supplier_ids)
        shipment_date = fake.date_between(start_date='-1y', end_date='today')
        status = random.choice(status_options)
        shipments.append((supplier_id, shipment_date, status))
    
    # insert data
    query = """
    INSERT INTO inventory.shipments(supplier_id, shipment_date, status)
    VALUES (%s, %s, %s)
    """
    cursor.executemany(query, shipments)

    # confirmation message
    print(f'inventory.shipments populated with {num_shipments} shipments successfully.')

**2.4 employees.employees**

In [11]:
def generate_employees(cursor, num_employees=50):
    
    fake = Faker()
    employees_data = []
    
    for _ in range(num_employees):
        first_name = fake.first_name()
        last_name = fake.last_name()
        email = f"{first_name.lower()}.{last_name.lower()}@dummycompany.com"
        start_date = fake.date_between(start_date='-5y', end_date='today')
        dept_id = random.randint(1, 5)
        salary = random.randint(40000, 120000)
        employees_data.append((first_name, last_name, email, start_date, None, dept_id, salary))
        
    query = """
    INSERT INTO employees.employees(first_name, last_name, email, start_date, end_date, department_id, salary)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
    cursor.executemany(query, employees_data)

    print(f'{num_employees} employees inserted successfully.')

    # Assign employees.departments.managers_id
    print('Assigning department managers based on highest salary...')
    
    update_manager_id_query = """
    WITH ranked AS (
        SELECT employee_id, department_id,
        ROW_NUMBER() OVER (PARTITION BY department_id ORDER BY salary DESC, employee_id ASC) AS rn
        FROM employees.employees
    )
    UPDATE employees.departments d
    SET manager_id = r.employee_id
    FROM ranked r
    WHERE d.department_id = r.department_id AND r.rn = 1;
    """
    cursor.execute(update_manager_id_query)
    
    print('employees.departments.manager_id updated successfully.')

## Database Utilities

**Connect to Database**

In [4]:
def get_connection():

    print("--- CONNECTING with database using following settings ---")

    # load env variables
    dotenv_path = '../.env'
    load_dotenv(dotenv_path)

    # confirm env variables are loaded
    DB_NAME = os.getenv("DB_NAME")
    DB_USER = os.getenv("DB_USER")
    DB_PASSWORD = os.getenv("DB_PASSWORD")
    DB_HOST = os.getenv("DB_HOST")
    DB_PORT = os.getenv("DB_PORT")

    print(
        f"DB_NAME: {DB_NAME}", 
        f"DB_USER: {DB_USER}", 
        f"DB_PASSWORD: {DB_PASSWORD}", 
        f"DB_HOST: {DB_HOST}", 
        f"DB_PORT: {DB_PORT}", 
        sep="\n"
        )
    
    # connect to database
    try:
        connection = psycopg2.connect(
            dbname = os.environ.get('DB_NAME'),
            user = os.environ.get('DB_USER'),
            password = os.environ.get('DB_PASSWORD'),
            host = os.environ.get('DB_HOST'),
            port = os.environ.get('DB_PORT')
        )
        print("Connection successful!")
        return connection
    
    except OperationalError as e:
        print(f"Error connecting to database: {e}")
        return None


**Truncate User Tables**

In [5]:
def clear_db_schema(cursor):
    """Dynamically truncates all user-defined tables."""
    print("\n--- MAINTENANCE: Clearing all user tables ---")
    
    dynamic_truncate = """
    SELECT 'TRUNCATE TABLE ' || 
           string_agg(quote_ident(schemaname) || '.' || quote_ident(tablename), ', ') || 
           ' RESTART IDENTITY CASCADE;'
    FROM pg_catalog.pg_tables
    WHERE schemaname NOT IN ('pg_catalog', 'information_schema')
      AND schemaname NOT LIKE 'pg_toast%'
      AND schemaname NOT LIKE 'pg_temp%';
    """
    
    cursor.execute(dynamic_truncate)
    row = cursor.fetchone()
    
    if row and row[0]:
        cursor.execute(row[0])
        print("Success: Database is now a clean slate.")
    else:
        print("Notice: No tables found to clear.")

## Populate Database

In [62]:
def run_data_generation():
    connection = get_connection()
    if not connection:
        return

    try:
        with connection:
            with connection.cursor() as cursor:

                # 1. Clear existing data
                # clear_db_schema(cursor)

                # 2. Insert data
                print("\n--- INSERTING data into tables ---")

                # Primary Tables
                # generate_customers(cursor)
                # generate_products(cursor, num_products=100)
                # generate_suppliers(cursor)
                # generate_US_states(cursor)
                # generate_departments(cursor)

                # Secondary Tables
                generate_cities(cursor)
                generate_customer_addresses(cursor)
                generate_inventory_shipments(cursor)
                # generate_employees(cursor)
                
        print("\n--- SUCCESS! ---\nAll data generated and committed.")
        
    except Exception as e:
        print("\n--- ERROR ---")
        print(f"CRITICAL ERROR: {e}")
        print("UNROLLING changes or commits, and closing connection.")
    finally:
        connection.close()
        print("\n--- CONNECTION CLOSED SAFELY ---")

# Execute

In [63]:
# Execute
run_data_generation()

--- CONNECTING with database using following settings ---
DB_NAME: dummy_company_abc_test
DB_USER: dummy_user
DB_PASSWORD: test12345
DB_HOST: localhost
DB_PORT: 5432
Connection successful!

--- INSERTING data into tables ---
mapping.cities populated with 100 records successfully.
Data successfully inserted.
inventory.shipments populated with 50 shipments successfully.

--- SUCCESS! ---
All data generated and committed.

--- CONNECTION CLOSED SAFELY ---
