# Data Generation
- The dataset was generated using the `Faker` package to create realistic sample data for five tables.  
- The generated data was manually reviewed and revised for consistency and accuracy.  
- This notebook is a work in progress as I continue improving my Python skills.  
- Each cell contains the code to generate one of the five tables.

In [15]:
# Generating the Categories data
import random
import string
import csv
from datetime import datetime, timedelta
from faker import Faker

# Create a Faker instance.
# NOTE(review): nothing in this cell references `fake`.
fake = Faker()


def generate_category():
    """Return one category name picked at random from a fixed set of five."""
    options = ('Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books')
    picked = random.choice(options)
    return picked

#
def generate_categories_data(num_entries):
    """Write `num_entries` randomly chosen category names to categories.csv.

    The file gets a header row followed by a single ``category_name`` column.
    NOTE(review): each row is drawn independently via generate_category(),
    so the same name can appear more than once.
    """
    columns = ['category_name']
    rows = [{'category_name': generate_category()} for _ in range(num_entries)]

    with open('categories.csv', 'w', newline='') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for row in rows:
            csv_writer.writerow(row)

# Generate the Categories table: write 10 rows to categories.csv.
num_entries = 10
generate_categories_data(num_entries)



In [13]:
# Generate Data for the Products Table
import random
import string
import csv
from datetime import datetime, timedelta
from faker import Faker

# Create a Faker instance.
# NOTE(review): nothing in this cell references `fake`.
fake = Faker()

def generate_product_name():
    """Build a product name of the form "<adjective> <noun>" from fixed word lists."""
    descriptors = ('Awesome', 'Incredible', 'Amazing', 'Fantastic', 'Superb')
    items = ('Gadget', 'Device', 'Shirt', 'Jacket', 'Chair', 'Ball', 'Book')
    # Pick the adjective first, then the noun.
    first_word = random.choice(descriptors)
    second_word = random.choice(items)
    return first_word + " " + second_word

def generate_price():
    """Return a random price between 10 and 5000, rounded to two decimals."""
    lowest, highest = 10, 5000
    raw_price = random.uniform(lowest, highest)
    return round(raw_price, 2)

def generate_stock_quality():
    """Return a random stock-quality grade: 'A', 'B', 'C' or 'D'."""
    grades = ('A', 'B', 'C', 'D')
    grade = random.choice(grades)
    return grade



def generate_products_data(num_entries, num_categories=10):
    """Generate random product rows and write them to products.csv.

    Args:
        num_entries: Number of product rows to write; product_id runs
            from 1 to num_entries.
        num_categories: Upper bound of the category id range; every product
            gets a random category_id between 1 and num_categories
            (inclusive). Defaults to 10.

    Raises:
        ValueError: If num_categories is less than 1.
    """
    if num_categories < 1:
        raise ValueError('num_categories must be at least 1')

    fieldnames = ['product_id', 'product_name', 'price', 'stock_quality', 'category_id']
    category_ids = range(1, num_categories + 1)

    # A named loop variable replaces the original `_`, which was being used
    # as a real value (the row id) rather than as a throwaway.
    products_data = [
        {
            'product_id': product_id,
            'product_name': generate_product_name(),
            'price': generate_price(),
            'stock_quality': generate_stock_quality(),
            'category_id': random.choice(category_ids),
        }
        for product_id in range(1, num_entries + 1)
    ]

    with open('products.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(products_data)


# Generate the Products table: write 400 rows to products.csv.
# 400 matches the product id range (1-400) used by the Orders and Reviews generators.
num_entries = 400
generate_products_data(num_entries)


In [16]:
# Generate Customers data
import random
import string
import csv
from datetime import datetime, timedelta
from faker import Faker

# Faker instance used by the customer field helpers defined below.
fake = Faker()

def generate_name():
    """Return a person name produced by the module-level Faker instance."""
    full_name = fake.name()
    return full_name

def generate_email():
    """Return an email address produced by the module-level Faker instance."""
    email_address = fake.email()
    return email_address

def generate_address():
    """Return a postal address produced by the module-level Faker instance.

    NOTE(review): Faker addresses may span several lines -- confirm the
    database import copes with quoted multi-line CSV fields.
    """
    postal_address = fake.address()
    return postal_address
                                                                                                                                                                 
def generate_phone_number():
    """Return a phone number produced by the module-level Faker instance."""
    phone = fake.phone_number()
    return phone

# Generate mock data for each table
def generate_customers_data(num_entries):
    """Generate random customer rows and write them to customers.csv.

    Each row has a sequential customer_id (1 to num_entries) plus a name,
    email, address and phone number from the generate_* helpers. Columns are
    written in the order: customer_id, name, email, phone_number, address.

    Args:
        num_entries: Number of customer rows to write.
    """
    fieldnames = ['customer_id', 'name', 'email', 'phone_number', 'address']

    # A named loop variable replaces the original `_`, which was being used
    # as a real value (the row id) rather than as a throwaway.
    customers_data = [
        {
            'customer_id': customer_id,
            'name': generate_name(),
            'email': generate_email(),
            'address': generate_address(),
            'phone_number': generate_phone_number(),
        }
        for customer_id in range(1, num_entries + 1)
    ]

    # Explicit UTF-8 so the generated text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open('customers.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(customers_data)

# Generate the Customers table: write 1000 rows to customers.csv.
# 1000 matches the customer id range (1-1000) used by the Orders and Reviews generators.
num_entries = 1000
generate_customers_data(num_entries)


In [21]:
# Generating Orders data
import random
import string
import csv
from datetime import datetime, timedelta
from faker import Faker

# Create a Faker instance.
# NOTE(review): nothing in this cell references `fake`.
fake = Faker()



def generate_date():
    """Return a random date from 2000-01-01 to 2025-01-01 (inclusive) as 'YYYY-MM-DD'."""
    input_format = '%m/%d/%Y'
    earliest = datetime.strptime('1/1/2000', input_format)
    latest = datetime.strptime('1/1/2025', input_format)
    # randint is inclusive on both ends, so the last day can be returned too.
    span_days = (latest - earliest).days
    offset = timedelta(days=random.randint(0, span_days))
    return (earliest + offset).strftime('%Y-%m-%d')

def generate_quantity():
    """Return a random order quantity between 1 and 10 (inclusive)."""
    smallest, largest = 1, 10
    quantity = random.randint(smallest, largest)
    return quantity

def generate_order_status():
    """Return a random order status: 'pending', 'shipped' or 'delivered'."""
    statuses = ('pending', 'shipped', 'delivered')
    chosen = random.choice(statuses)
    return chosen


def generate_orders_data(num_entries, num_customers=1000, num_products=400):
    """Generate random order rows and write them to orders.csv.

    Each row has a sequential order_id (1 to num_entries), a random
    customer_id and product_id, an order date, a status and a quantity.
    Columns are written in the order: order_id, customer_id, product_id,
    order_date, quantity, order_status.

    Args:
        num_entries: Number of order rows to write.
        num_customers: Upper bound of the customer id range (ids 1 to
            num_customers). Defaults to 1000.
        num_products: Upper bound of the product id range (ids 1 to
            num_products). Defaults to 400.

    Raises:
        ValueError: If num_customers or num_products is less than 1.
    """
    if num_customers < 1 or num_products < 1:
        raise ValueError('num_customers and num_products must be at least 1')

    fieldnames = ['order_id', 'customer_id', 'product_id', 'order_date', 'quantity', 'order_status']
    customer_ids = range(1, num_customers + 1)
    product_ids = range(1, num_products + 1)

    # A named loop variable replaces the original `_`, which was being used
    # as a real value (the row id) rather than as a throwaway.
    orders_data = [
        {
            'order_id': order_id,
            'customer_id': random.choice(customer_ids),
            'product_id': random.choice(product_ids),
            'order_date': generate_date(),
            'order_status': generate_order_status(),
            'quantity': generate_quantity(),
        }
        for order_id in range(1, num_entries + 1)
    ]

    with open('orders.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(orders_data)



# Generate the Orders table: write 2000 rows to orders.csv.
num_entries = 2000
generate_orders_data(num_entries)



In [22]:
# Generating Reviews data
import random
import string
import csv
from datetime import datetime, timedelta
from faker import Faker

# Create a Faker instance.
# NOTE(review): nothing in this cell references `fake`.
fake = Faker()



def generate_date():
    """Pick a random day between 2000-01-01 and 2025-01-01 (both included).

    Returns the date formatted as 'YYYY-MM-DD'.
    """
    window_start = datetime.strptime('1/1/2000', '%m/%d/%Y')
    window_end = datetime.strptime('1/1/2025', '%m/%d/%Y')
    total_days = (window_end - window_start).days
    days_after_start = random.randint(0, total_days)
    chosen_day = window_start + timedelta(days=days_after_start)
    return chosen_day.strftime('%Y-%m-%d')

def generate_rate():
    """Return a random rating from '1' to '5', as a string."""
    ratings = ('1', '2', '3', '4', '5')
    rating = random.choice(ratings)
    return rating

def generate_review_text():
    """Return one review sentence picked at random from a fixed list."""
    phrases = (
        'very good product',
        'amazing',
        'good',
        'lovely',
        'I will buy more',
        'great service',
        'Useful',
        'woow',
    )
    phrase = random.choice(phrases)
    return phrase


def generate_review_data(num_entries, num_customers=1000, num_products=400):
    """Generate random review rows and write them to Reviews.csv.

    Each row has a sequential review_id (1 to num_entries), a random
    customer_id and product_id, a rating, a review text and a review date.
    The customer/product pair is drawn independently of any other file.

    Args:
        num_entries: Number of review rows to write.
        num_customers: Upper bound of the customer id range (ids 1 to
            num_customers). Defaults to 1000.
        num_products: Upper bound of the product id range (ids 1 to
            num_products). Defaults to 400.

    Raises:
        ValueError: If num_customers or num_products is less than 1.
    """
    if num_customers < 1 or num_products < 1:
        raise ValueError('num_customers and num_products must be at least 1')

    fieldnames = ['review_id', 'customer_id', 'product_id', 'rating', 'review_text', 'review_date']
    customer_ids = range(1, num_customers + 1)
    product_ids = range(1, num_products + 1)

    # Locals are named for reviews (the original reused "order_item" names),
    # and a named loop variable replaces `_`, which was used as the row id.
    reviews_data = [
        {
            'review_id': review_id,
            'customer_id': random.choice(customer_ids),
            'product_id': random.choice(product_ids),
            'rating': generate_rate(),
            'review_text': generate_review_text(),
            'review_date': generate_date(),
        }
        for review_id in range(1, num_entries + 1)
    ]

    # NOTE(review): this file name is capitalized, unlike the other CSV
    # outputs; kept as-is so existing import steps keep working.
    with open('Reviews.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(reviews_data)

# Generate the Reviews table: write 2000 rows to Reviews.csv.
num_entries = 2000
generate_review_data(num_entries)


 - The five CSV files were successfully saved in my folder of choice.
 - Several changes were then made to the data to make it more logical and feasible:
    i. Split the generated names in the customers CSV file into first name and last name, to avoid data inconsistencies during database import.
    ii. Altered the customer_id and product_id values in the Reviews table so that its customer_id/product_id pairs match the pairs found in the Orders table.
    