## Imports

In [1]:
# imports

import psycopg2
import random
from psycopg2 import OperationalError
from faker import Faker
import os
from dotenv import load_dotenv

# Load variables from the .env file one directory above the notebook so the
# os.environ lookups in create_connection() (DB_NAME, DB_USER, DB_PASSWORD,
# HOST, PORT) can find them -- presumably all five are defined there.
dotenv_path = '../.env'
load_dotenv(dotenv_path)



True

## Connect to Postgres Database

In [2]:
def create_connection():
    """Open a psycopg2 connection configured from environment variables.

    Reads DB_NAME, DB_USER, DB_PASSWORD, HOST and PORT from ``os.environ``.

    Returns:
        The open connection, or ``None`` when psycopg2 raises
        ``OperationalError`` (the error is printed, not re-raised).
    """
    settings = {
        'dbname': os.environ.get('DB_NAME'),
        'user': os.environ.get('DB_USER'),
        'password': os.environ.get('DB_PASSWORD'),
        'host': os.environ.get('HOST'),
        'port': os.environ.get('PORT'),
    }
    try:
        conn = psycopg2.connect(**settings)
    except OperationalError as err:
        print(f"Error connecting to database: {err}")
        return None

    print("Connection successful!")
    return conn

Run test Query:

In [3]:
# Smoke test: open a connection and ask the server for its version string.
connection = create_connection()

if connection:
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute("SELECT version();")
        db_version = cursor.fetchone()
        print(f"PostgreSQL version: {db_version}")
    finally:
        # Release the cursor and connection even if the query raises.
        if cursor is not None:
            cursor.close()
        connection.close()

Connection successful!
PostgreSQL version: ('PostgreSQL 16.6 (Postgres.app) on aarch64-apple-darwin21.6.0, compiled by Apple clang version 14.0.0 (clang-1400.0.29.102), 64-bit',)


## Generate Dummy Data

### KEY TABLES

- [x] customers.customers
- [x] inventory.products
- [x] inventory.suppliers
- [x] inventory.shipments
- [x] mapping.states
- [x] employees.departments

### customers.customers

In [16]:
# Module-level Faker instance; generate_customers() below reads it as a global.
fake = Faker()

# Build fake rows for the customers table
def generate_customers(n=100):
    """Return a list of ``n`` ``(first_name, last_name, email, phone)`` tuples.

    All values come from the module-level ``fake`` (Faker) instance; the
    phone number is generated from the pattern ``(###)###-####``.
    """
    return [
        (
            fake.first_name(),
            fake.last_name(),
            fake.email(),
            fake.numerify('(###)###-####'),  # pattern for a 10-digit phone number
        )
        for _ in range(n)  # iterate n times, one tuple per customer
    ]


# Insert customers into database
def insert_customers(customers):
    """Bulk-insert customer rows into ``customers.customers``.

    Args:
        customers: sequence of ``(first_name, last_name, email, phone)``
            tuples, e.g. the output of ``generate_customers()``.

    Opens its own connection through ``create_connection()``. If no
    connection can be established the function prints a message and returns
    without inserting. Database errors propagate to the caller, but the
    cursor and connection are always closed first.
    """
    query = """
    INSERT INTO customers.customers (first_name, last_name, email, phone)
    VALUES (%s, %s, %s, %s)
    """
    # connect to db; create_connection() returns None on failure
    connection = create_connection()
    if not connection:
        print('Exiting function as no connection could be established.')
        return

    cursor = None
    try:
        cursor = connection.cursor()

        # populate customers table
        cursor.executemany(query, customers)
        connection.commit()
        print('customers query ran and committed!')
    finally:
        # close cursor and connection whether or not the insert succeeded
        if cursor is not None:
            cursor.close()
        connection.close()


# Generate and insert 200 customers.
# insert_customers() only issues INSERTs, so re-running this cell adds another 200 rows.
customers = generate_customers(200)
insert_customers(customers)

Connection successful!
Query ran and committed!


### inventory.products

name, description, price, stock_quantity

In [19]:
# Module-level Faker instance; generate_products() below reads it as a global.
fake = Faker()

def generate_products(n=50):
    """Return a list of ``n`` ``(name, description, price, stock_quantity)`` tuples.

    Names and descriptions come from the module-level ``fake`` (Faker)
    instance; the price is a random float between 10 and 1000 rounded to two
    decimals, and the stock quantity is an integer between 1 and 500.
    """
    return [
        (
            fake.word().capitalize(),            # product name
            fake.sentence(nb_words=10),          # short description
            round(random.uniform(10, 1000), 2),  # price
            fake.random_int(min=1, max=500),     # stock quantity
        )
        for _ in range(n)
    ]


def insert_products(products):
    """Bulk-insert product rows into ``inventory.products``.

    Args:
        products: sequence of ``(name, description, price, stock_quantity)``
            tuples, e.g. the output of ``generate_products()``.

    Opens its own connection through ``create_connection()``. If no
    connection can be established the function prints a message and returns
    without inserting. Database errors propagate to the caller, but the
    cursor and connection are always closed first.
    """
    query = """
    INSERT INTO inventory.products (name, description, price, stock_quantity)
    VALUES (%s, %s, %s, %s)
    """

    # create_connection() returns None on failure
    connection = create_connection()
    if not connection:
        print('Exiting function as no connection could be established.')
        return

    cursor = None
    try:
        cursor = connection.cursor()
        cursor.executemany(query, products)
        connection.commit()
        print('inventory.products populated')
    finally:
        # close cursor and connection whether or not the insert succeeded
        if cursor is not None:
            cursor.close()
        connection.close()


# Generate 100 products (overriding the default of 50) and insert them.
products = generate_products(100)
insert_products(products)


Connection successful!
inventory.products populated


### inventory.suppliers

- supplier_id - Pkey, NOT auto-incremented, so need to generate this
- name
- contact_info

In [20]:
def insert_suppliers_data(n=10):
    """Generate ``n`` fake suppliers and insert them into ``inventory.suppliers``.

    Supplier ids are assigned explicitly as 1..n, with a Faker company name
    and phone number for each row.

    Args:
        n: number of suppliers to create (default 10).

    The connection is obtained from ``create_connection()``; if that fails
    the function returns without doing anything. Errors during generation or
    insertion are printed and the transaction is rolled back. The cursor and
    connection are always closed.
    """
    # Connect to database (create_connection() prints the outcome itself)
    connection = create_connection()
    if not connection:
        return  # Exit function if connection fails

    # Bound before the try block so the finally clause can always test it,
    # even when an error occurs before the cursor is created.
    cursor = None
    try:
        # Generate supplier data
        print('Generating supplier data...')
        fake = Faker()
        suppliers = [
            (supplier_id, fake.company(), fake.phone_number())
            for supplier_id in range(1, n + 1)
        ]

        # Insert supplier data
        print('Inserting supplier data...')
        query = """
            INSERT INTO inventory.suppliers(supplier_id, name, contact_info)
            VALUES (%s, %s, %s)
            """
        cursor = connection.cursor()
        cursor.executemany(query, suppliers)
        connection.commit()
        print(f'Successfully inserted {len(suppliers)} suppliers.')
    except Exception as e:
        print(f'Error inserting supplier data: {e}')
        connection.rollback()  # Rollback in case of error

    finally:
        # Close the cursor and connection
        if cursor is not None:
            cursor.close()
        connection.close()
        print('Connection closed.')

In [21]:
# Seed inventory.suppliers with 20 rows (supplier_id 1-20).
insert_suppliers_data(20)

Connection successful!
Generating supplier data...
Inserting supplier data...
Successfully inserted 20 suppliers.
Connection closed.


### inventory.shipments

- shipment_id, pkey, not auto-incremented
- supplier_id
- shipment_date
- status: shipped, delivered, pending

In [4]:
def insert_shipments_data(n=50, num_suppliers=20):
    """Generate ``n`` fake shipments and insert them into ``inventory.shipments``.

    Shipment ids are assigned explicitly as 1..n. Each shipment gets a random
    supplier id in 1..num_suppliers, a date within the last year, and a
    status of 'Shipped', 'Delivered' or 'Pending'.

    Args:
        n: number of shipments to create (default 50).
        num_suppliers: highest supplier_id to reference (default 20, the
            number of suppliers this notebook inserts).

    Errors during generation or insertion are printed and the transaction is
    rolled back. The cursor and connection are always closed.
    """
    # Connect to database
    connection = create_connection()

    if not connection:
        print('Exiting function as no connection could be established.')
        return

    # Bound before the try block so the finally clause can always test it,
    # even when an error occurs before the cursor is created.
    cursor = None

    # Generate and insert data
    try:
        # Generate data
        print('Generating shipments data...')
        fake = Faker()
        shipments = []
        status_options = ['Shipped', 'Delivered', 'Pending']
        for shipment_id in range(1, n + 1):
            supplier_id = random.randint(1, num_suppliers)
            shipment_date = fake.date_between(start_date='-1y', end_date='today')
            status = random.choice(status_options)
            shipments.append((shipment_id, supplier_id, shipment_date, status))

        # Insert data
        print('Inserting shipments data...')
        query = """
            INSERT INTO inventory.shipments(shipment_id, supplier_id, shipment_date, status)
            VALUES (%s, %s, %s, %s)
            """
        cursor = connection.cursor()
        cursor.executemany(query, shipments)
        connection.commit()
        print(f'Successfully inserted {len(shipments)} shipments.')
    except Exception as e:
        print(f'Error inserting shipments data: {e}')
        connection.rollback()  # Rollback in case of error

    finally:
        # Close cursor and connection
        if cursor is not None:
            cursor.close()
        connection.close()
        print('Connection closed.')

In [5]:
# Insert the default 50 shipments.
insert_shipments_data()

Connection successful!
Generating shipments data...
Inserting shipments data...
Successfully inserted 50 shipments.
Connection closed.


### mapping.states

- state_name
- state_abbr

In [None]:
def generate_states_data():
    """Insert the 50 US states plus the District of Columbia into ``mapping.states``.

    Each row is a ``(state_name, state_abbr)`` pair from a hard-coded list.
    The connection is obtained from ``create_connection()``; if that fails
    the function prints a message and returns. Errors are printed rather than
    raised, and the cursor and connection are always closed.
    """
    # Create connection
    connection = create_connection()

    if not connection:
        print('Exiting function as no connection could be established.')
        return

    # Bound before the try block so the finally clause can always test it,
    # even when connection.cursor() itself fails.
    cursor = None
    try:
        # Generate and insert data
        query = """
        INSERT INTO mapping.states(state_name, state_abbr)
        VALUES (%s, %s)
        """

        print('Creating list of states...')
        us_states = [
                ('Alabama', 'AL'), ('Alaska', 'AK'), ('Arizona', 'AZ'), ('Arkansas', 'AR'),
                ('California', 'CA'), ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'),
                ('Florida', 'FL'), ('Georgia', 'GA'), ('Hawaii', 'HI'), ('Idaho', 'ID'),
                ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'), ('Kansas', 'KS'),
                ('Kentucky', 'KY'), ('Louisiana', 'LA'), ('Maine', 'ME'), ('Maryland', 'MD'),
                ('Massachusetts', 'MA'), ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'),
                ('Missouri', 'MO'), ('Montana', 'MT'), ('Nebraska', 'NE'), ('Nevada', 'NV'),
                ('New Hampshire', 'NH'), ('New Jersey', 'NJ'), ('New Mexico', 'NM'), ('New York', 'NY'),
                ('North Carolina', 'NC'), ('North Dakota', 'ND'), ('Ohio', 'OH'), ('Oklahoma', 'OK'),
                ('Oregon', 'OR'), ('Pennsylvania', 'PA'), ('Rhode Island', 'RI'), ('South Carolina', 'SC'),
                ('South Dakota', 'SD'), ('Tennessee', 'TN'), ('Texas', 'TX'), ('Utah', 'UT'),
                ('Vermont', 'VT'), ('Virginia', 'VA'), ('Washington', 'WA'), ('West Virginia', 'WV'),
                ('Wisconsin', 'WI'), ('Wyoming', 'WY'), ('District of Columbia', 'DC')
            ]

        print('Inserting states data...')
        cursor = connection.cursor()
        cursor.executemany(query, us_states)
        connection.commit()

        print('States data inserted successfully.')

    except Exception as e:
        print(f'An error occurred: {e}')

    finally:
        # Ensure cursor and connection are closed
        if cursor is not None:
            cursor.close()
        connection.close()
        print('Connection closed.')

In [17]:
# Populate mapping.states from the hard-coded list (50 states plus DC).
generate_states_data()

Connection successful!
Creating list of states...
Inserting states data...
States data inserted successfully.
Connection closed.


### mapping.payment_methods

```sql
-- Create table
CREATE TABLE mapping.payment_methods (
	payment_method_id SERIAL PRIMARY KEY,
	method_name TEXT UNIQUE NOT NULL
);

-- Insert values
INSERT INTO MAPPING.payment_methods (method_name)
VALUES
	('Cash'),
	('Credit Card'),
	('Bank Transfer')
;
```

### employees.departments

- department_id
- name
- manager_id

In [25]:
def generate_departments_data(connection):
    """Insert the five fixed departments into ``employees.departments``.

    Rows are ``(name, manager_id)`` pairs for HR, IT, Sales, Marketing and
    Finance, each with ``manager_id`` set to ``None``.

    Args:
        connection: an open database connection, or a falsy value, in which
            case the function prints an error and returns immediately.

    Errors during insertion are printed rather than raised. The supplied
    connection is closed before the function returns.
    """
    print('Checking connection...')
    if not connection:
        print('Connection error. Please check connection. Exiting function.')
        return

    cursor = None
    try:
        print('Generating department data...')
        department_names = ('HR', 'IT', 'Sales', 'Marketing', 'Finance')
        departments_data = [(name, None) for name in department_names]

        insert_sql = """
        INSERT INTO employees.departments(name, manager_id)
        VALUES (%s, %s)
        """

        cursor = connection.cursor()
        cursor.executemany(insert_sql, departments_data)
        connection.commit()

        print('Departments data inserted successfully.')
    except Exception as err:
        print(f'An error occurred: {err}')
    finally:
        # Release the cursor (if one was created) and the caller's connection.
        if cursor is not None:
            cursor.close()
        if connection:
            connection.close()
        print('Connection closed.')


In [27]:
# Open a fresh connection; generate_departments_data() closes it when done.
connection = create_connection()

generate_departments_data(connection)

Connection successful!
Checking connection...
Generating department data...
Departments data inserted successfully.
Connection closed.


### SECONDARY TABLES

- [x] mapping.cities
- [x] customers.customer_addresses
- [x] employees.employees
- [x] sales.orders
- [x] sales.order_details
- [x] sales.payments

### mapping.cities

- city_name
- state_id

In [20]:
def generate_cities_data(n=100, num_states=50):
    """Generate ``n`` fake cities and insert them into ``mapping.cities``.

    Each row is ``(city_name, state_id)`` with a Faker city name and a random
    state id in 1..num_states.

    Args:
        n: number of cities to create (default 100).
        num_states: highest state_id to reference. The default of 50 keeps
            the original range. NOTE(review): the states list inserted by
            this notebook has 51 entries (DC last) -- confirm whether id 51
            should be included.

    The connection is obtained from ``create_connection()``; if that fails
    the function prints a message and returns. Errors are printed rather than
    raised, and the cursor and connection are always closed.
    """
    # create connection
    connection = create_connection()

    if not connection:
        print('Exiting as connection to db could not be established')
        return

    # Bound before the try block so the finally clause can always test it.
    cursor = None
    try:
        # generating and inserting cities data
        fake = Faker()
        cities_data = []
        print('Generating data...')
        for _ in range(n):
            city_name = fake.city()  # plain string (a trailing comma here would create a 1-tuple)
            state_id = random.randint(1, num_states)
            cities_data.append((city_name, state_id))

        print('Inserting data...')
        query = """
        INSERT INTO mapping.cities (city_name, state_id)
        VALUES (%s, %s)
        """
        cursor = connection.cursor()
        cursor.executemany(query, cities_data)
        connection.commit()

        print('Data SUCCESSFULLY inserted.')

    except Exception as e:
        print(f'An error occurred: {e}')

    finally:
        if cursor is not None:
            cursor.close()
        connection.close()
        print('Connection closed.')

In [19]:
# Insert the default 100 random cities into mapping.cities.
generate_cities_data()

Connection successful!
Generating data...
Inserting data...
Closing closed.


### customers.customer_addresses

- address_id
- customer_id
- street
- city_state_id
- zip_code
- address_type

In [22]:
def _random_address(fake, street_types, max_city_id):
    """Return one random ``(street, city_state_id, zip_code)`` triple."""
    addy_st_number = fake.building_number()
    addy_st_name = fake.street_name()
    addy_st_type = random.choice(street_types)
    street = f"{addy_st_number} {addy_st_name} {addy_st_type}"
    city_state_id = random.randint(1, max_city_id)
    zip_code = fake.zipcode()
    return street, city_state_id, zip_code


def generate_customer_addresses_data(connection, customer_ids=range(201, 401), max_city_id=100):
    """Insert a billing and a shipping address for every customer id.

    For each customer a random billing address is generated. With 50%
    probability the shipping address is identical to it; otherwise a second
    random address is generated. Rows inserted into
    ``customers.customer_addresses`` are
    ``(customer_id, street, city_state_id, zip_code, address_type)``.

    Args:
        connection: an open database connection, or a falsy value, in which
            case the function prints an error and returns immediately.
        customer_ids: iterable of customer ids to create addresses for
            (default 201..400, the range hard-coded in the original cell).
        max_city_id: highest ``city_state_id`` to reference (default 100).

    Errors are printed rather than raised. The supplied connection is closed
    before the function returns.
    """
    # check connection
    print('checking connection to database...')

    if not connection:
        print('Connection error. Please check connection. Exiting function.')
        return

    cursor = None

    # generating and inserting data
    try:
        print('Generating data...')

        fake = Faker()
        customer_addresses_data = []
        addy_st_types = ['Street', 'Avenue', 'Boulevard', 'Court', 'Drive', 'Lane', 'Place', 'Road', 'Terrace']

        for customer_id in customer_ids:
            # Add billing address
            billing = _random_address(fake, addy_st_types, max_city_id)
            customer_addresses_data.append((customer_id, *billing, 'billing'))

            # Randomly decide if shipping address is the same or different
            if random.choice([True, False]):
                shipping = billing
            else:
                shipping = _random_address(fake, addy_st_types, max_city_id)
            customer_addresses_data.append((customer_id, *shipping, 'shipping'))

        print('Inserting data...')

        query = """
        INSERT INTO customers.customer_addresses(customer_id, street, city_state_id, zip_code, address_type)
        VALUES (%s, %s, %s, %s, %s)
        """
        cursor = connection.cursor()
        cursor.executemany(query, customer_addresses_data)
        connection.commit()

        print('Data successfully inserted.')

    except Exception as e:
        print(f'An error occurred: {e}')

    # closing connection
    finally:
        if cursor is not None:
            cursor.close()

        if connection:
            connection.close()

        print('Connection closed.')

In [23]:
# Open a fresh connection; generate_customer_addresses_data() closes it when done.
connection = create_connection()

generate_customer_addresses_data(connection)

Connection successful!
checking connection to database...
Generating data...
Inserting data...
Data successfully inserted.
Connection closed.


### employees.employees

In [28]:
def generate_employees_data(connection, num_employees=50):
    """Generate fake employees and insert them into ``employees.employees``.

    Each row is ``(first_name, last_name, email, start_date, end_date,
    department_id, salary)``: the email is built from the lower-cased name at
    ``dummycompany.com``, the start date falls within the last five years,
    ``end_date`` is always ``None``, the department id is random in 1..5 and
    the salary is random in 40000..120000.

    Args:
        connection: an open database connection, or a falsy value, in which
            case the function prints an error and returns immediately.
        num_employees: number of employees to create (default 50).

    Errors are printed rather than raised. The supplied connection is closed
    before the function returns.
    """
    print('Checking connection...')
    if not connection:
        print('Connection error. Please check connection. Exiting function.')
        return

    cursor = None
    try:
        print('Generating employees data...')

        fake = Faker()
        min_salary, max_salary = 40000, 120000
        employees_data = []

        for _ in range(num_employees):
            first = fake.first_name()
            last = fake.last_name()
            row = (
                first,
                last,
                f"{first.lower()}.{last.lower()}@dummycompany.com",
                fake.date_between(start_date='-5y', end_date='today'),
                None,                                   # end_date: everyone is still employed
                random.randint(1, 5),                   # one of the 5 departments
                random.randint(min_salary, max_salary),
            )
            employees_data.append(row)

        insert_sql = """
        INSERT INTO employees.employees(first_name, last_name, email, start_date, end_date, department_id, salary)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        """

        cursor = connection.cursor()
        cursor.executemany(insert_sql, employees_data)
        connection.commit()

        print(f'{num_employees} employees inserted successfully.')
    except Exception as err:
        print(f'An error occurred: {err}')
    finally:
        # Release the cursor (if one was created) and the caller's connection.
        if cursor is not None:
            cursor.close()
        if connection:
            connection.close()
        print('Connection closed.')


In [29]:
# Open a fresh connection; generate_employees_data() closes it when done.
connection = create_connection()

generate_employees_data(connection)

Connection successful!
Checking connection...
Generating employees data...
50 employees inserted successfully.
Connection closed.


### sales.orders

- order_id - auto incrementing
- customer_id - fkey customers.customers.customer_id (201 - 400)
- order_date
- total_amount

In [8]:
def generate_orders_data(connection, n=500):
    """Generate ``n`` fake orders and insert them into ``sales.orders``.

    Each row is ``(customer_id, order_date, total_amount)`` with a random
    customer id in 201..400, a date within the last year and an amount
    between 10 and 1000 rounded to two decimals.

    Args:
        connection: an open database connection, or a falsy value, in which
            case the function prints an error and returns immediately.
        n: number of orders to create (default 500).

    Errors are printed rather than raised. The supplied connection is closed
    before the function returns.
    """
    if not connection:
        print('Connection error. Exiting function.')
        return

    cursor = None
    try:
        print('Generating orders data...')
        fake = Faker()
        orders_data = [
            (
                random.randint(201, 400),
                fake.date_between(start_date='-1y', end_date='today'),
                round(random.uniform(10, 1000), 2),  # random amount between 10 and 1000
            )
            for _ in range(n)
        ]

        print('Inserting orders data...')

        insert_sql = """
        INSERT INTO sales.orders (customer_id, order_date, total_amount)
        VALUES (%s, %s, %s)
        """
        cursor = connection.cursor()
        cursor.executemany(insert_sql, orders_data)
        connection.commit()

        print(f'Successfully inserted {n} orders.')
    except Exception as err:
        print(f'An error occurred: {err}')
    finally:
        # Release the cursor (if one was created) and the caller's connection.
        if cursor is not None:
            cursor.close()
        if connection:
            connection.close()
        print('Connection closed.')


In [9]:
# Open a fresh connection; generate_orders_data() closes it when done.
# Inserts the default 500 orders.
connection = create_connection()

generate_orders_data(connection=connection)

Connection successful!
Generating orders data...
Inserting orders data...
Successfully inserted 500 orders.
Connection closed.


### sales.order_details

- order_item_id
- order_id (Fkey: `sales.orders(order_id)` )
- product_id (Fkey: `inventory.products(product_id)`)
- quantity
- price

In [16]:
def generate_order_details_data(connection):
    """Create 1-5 line items for every order and insert them into ``sales.order_details``.

    Product ids and prices are read from ``inventory.products`` and order ids
    from ``sales.orders``. Each order gets a random number of distinct
    products (at most 5, and never more than the number of products
    available), each with a random quantity of 1-10 and the product's price.
    Inserted rows are ``(order_id, product_id, quantity, price)``.

    Args:
        connection: an open database connection, or a falsy value, in which
            case the function prints an error and returns immediately.

    If either table is empty the function prints a message and inserts
    nothing. Errors are printed rather than raised. The supplied connection
    is closed before the function returns.
    """
    # Check database connection
    if not connection:
        print('Connection error. Exiting function.')
        return

    cursor = None
    try:
        # Create cursor
        cursor = connection.cursor()

        # Get all product IDs from inventory.products
        print('Fetching product IDs...')

        cursor.execute("SELECT product_id, price FROM inventory.products;")
        products_dict = {row[0]: row[1] for row in cursor.fetchall()}  # dictionary of {product_id: price}

        if not products_dict:
            print('No products found. Exiting function.')
            return

        # Get all order IDs from sales.orders
        print('Fetching order IDs...')

        cursor.execute("SELECT order_id FROM sales.orders;")
        order_ids = [row[0] for row in cursor.fetchall()]

        if not order_ids:
            print('No orders found. Exiting function.')
            return

        # Generate & Insert order details data
        print('Generating order details data...')

        product_ids = list(products_dict)  # built once, reused for every order
        # random.sample() raises ValueError if asked for more items than exist,
        # so cap the per-order item count at the number of products.
        max_items = min(5, len(product_ids))

        order_details_data = []

        for order_id in order_ids:
            num_items = random.randint(1, max_items)  # Each order gets 1-5 items
            chosen_products = random.sample(product_ids, num_items)  # Unique products per order

            for product_id in chosen_products:
                quantity = random.randint(1, 10)  # Quantity between 1 and 10
                price = products_dict[product_id]  # Exact price from inventory.products

                order_details_data.append((order_id, product_id, quantity, price))

        print('Inserting order items data...')

        query = """
        INSERT INTO sales.order_details (order_id, product_id, quantity, price)
        VALUES (%s, %s, %s, %s)
        """

        cursor.executemany(query, order_details_data)
        connection.commit()

        print(f'Successfully inserted {len(order_details_data)} order details.')

    except Exception as e:
        print(f'An error occurred: {e}')

    finally:
        if cursor is not None:
            cursor.close()
        if connection:
            connection.close()
        print('Connection closed.')


In [17]:
# Open a fresh connection; generate_order_details_data() closes it when done.
connection = create_connection()

generate_order_details_data(connection=connection)

Connection successful!
Fetching product IDs...
Fetching order IDs...
Generating order details data...
Inserting order items data...
Successfully inserted 1530 order items.
Connection closed.


After running the function above, I modified the table using SQL as follows:

```sql
-- Change col 'price' -> 'item_price', for clarity
ALTER TABLE sales.order_details
RENAME COLUMN price TO item_price;

-- Add col 'line_item_total' to hold total cost of the line item
ALTER TABLE sales.order_details
ADD COLUMN line_item_total NUMERIC;

-- Update col 'line_item_total' to calculate each line item total
UPDATE sales.order_details
SET line_item_total = quantity * item_price;
```

Once the line item total was determined, `sales.orders.total_amount` was updated with the total cost to ensure consistency:


```sql
-- Update col 'total_amount' in table 'sales.orders' to reflect the correct order details totals
UPDATE sales.orders o
SET total_amount = (
    SELECT COALESCE(SUM(od.line_item_total), 0)
    FROM sales.order_details od
    WHERE od.order_id = o.order_id
);
```

### sales.payments

In [7]:
def populate_sales_payments(connection):
    """Insert one payment per order into ``sales.payments``.

    Reads ``(order_id, total_amount, order_date)`` from ``sales.orders`` and
    the available ids from ``mapping.payment_methods``, then inserts
    ``(order_id, payment_method_id, payment_amount, payment_date)`` rows in
    which the method is chosen at random and the amount and date are copied
    from the order.

    Args:
        connection: an open database connection, or a falsy value, in which
            case the function prints an error and returns immediately.

    If no payment methods exist the function prints a message and inserts
    nothing. Errors are printed rather than raised. The supplied connection
    is closed before the function returns.
    """
    # Check connection
    print("Checking connection to database...")
    if not connection:
        print("Connection error. Please check connection. Exiting function.")
        return

    cursor = None
    try:
        cursor = connection.cursor()

        # Retrieve order_id, total_amount and order_date for every order
        cursor.execute("SELECT order_id, total_amount, order_date FROM sales.orders;")
        orders = cursor.fetchall()  # List of (order_id, total_amount, order_date)

        # Retrieve available payment_method_ids from mapping.payment_methods
        cursor.execute("SELECT payment_method_id FROM mapping.payment_methods;")
        payment_methods = [row[0] for row in cursor.fetchall()]  # List of payment_method_ids

        # random.choice() raises IndexError on an empty list, so report the
        # real problem instead of a generic error.
        if not payment_methods:
            print("No payment methods found. Exiting function.")
            return

        # Generate payment records
        sales_payments_data = [
            (order_id, random.choice(payment_methods), total_amount, order_date)
            for order_id, total_amount, order_date in orders
        ]

        # Insert data into sales.payments
        print("Inserting data into sales.payments...")
        insert_query = """
        INSERT INTO sales.payments (order_id, payment_method_id, payment_amount, payment_date)
        VALUES (%s, %s, %s, %s);
        """
        cursor.executemany(insert_query, sales_payments_data)
        connection.commit()

        print("Data successfully inserted into sales.payments.")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        if cursor is not None:
            cursor.close()

        if connection:
            connection.close()

        print("Connection closed.")


In [12]:
# Open a fresh connection; populate_sales_payments() closes it when done.
connection = create_connection()

populate_sales_payments(connection)

Connection successful!
Checking connection to database...
Inserting data into sales.payments...
Data successfully inserted into sales.payments.
Connection closed.
