# In [11]:
import pandas as pd
from faker import Faker
import random
import uuid
from datetime import datetime, timedelta

# Initialize Faker
# `fake` is the shared generator used below for names, emails, addresses
# and order dates.
fake = Faker()

# Seed for reproducibility
# Both generators are seeded: Faker for the `fake.*` calls and the stdlib
# `random` module for the helper functions' probability draws.
# NOTE: IDs are produced with uuid.uuid4(), which is not driven by either
# seed, so the ID columns will differ from run to run.
Faker.seed(0)
random.seed(0)

# Configuration
# Number of rows to generate for each table.
NUM_CUSTOMERS = 10000
NUM_PRODUCTS = 500
NUM_ORDERS = 20000
NUM_ORDER_ITEMS = 50000
# Output location prefix. It is concatenated directly with each file name
# (e.g. f'{OUTPUT_DIR}customers.csv'), so it must end with a path separator.
OUTPUT_DIR = '<Location>'

# Helper Functions
def generate_invalid_email(email):
    """Return *email*, occasionally corrupted so that it fails validation.

    With probability 0.04 every '@' is stripped from the address; otherwise
    the input is handed back untouched.
    """
    keep_as_is = random.random() >= 0.04
    if keep_as_is:
        return email
    # Dropping the '@' leaves a string that is no longer a valid address.
    return ''.join(email.split('@'))


def generate_valid_phone():
    """Build a random phone number string shaped like (XXX) XXX-XXXX.

    Ten digits are drawn one at a time with random.randint(0, 9) and then
    grouped 3-3-4.
    """
    ten_digits = ''.join(str(random.randint(0, 9)) for _ in range(10))
    area, exchange, subscriber = ten_digits[:3], ten_digits[3:6], ten_digits[6:]
    return '({}) {}-{}'.format(area, exchange, subscriber)

def generate_incomplete_address(address):
    """Return *address*, occasionally cut down to its first line.

    With probability 0.03 everything from the first newline onward is
    dropped; an address without a newline is therefore never shortened.
    """
    if random.random() >= 0.03:
        return address
    # Text before the first '\n' (the whole string when there is none).
    first_line, _, _ = address.partition('\n')
    return first_line

def generate_price(price):
    """Round *price* to two decimals, turning about 2% of values into outliers.

    An outlier is the price multiplied by a factor chosen at random from
    0.1, 10 and 100 before rounding.
    """
    if random.random() >= 0.02:
        return round(price, 2)
    # Outlier path: scale first, then round.
    factor = random.choice([0.1, 10, 100])
    return round(price * factor, 2)

def generate_quantity(quantity, *, negative_rate=0.02, null_rate=0.02):
    """Return *quantity*, occasionally replaced by a negative or missing value.

    A single random draw decides the outcome, so the two defect bands are
    mutually exclusive.

    Args:
        quantity: The clean quantity value.
        negative_rate: Probability of returning ``-abs(quantity)``.
            Defaults to 0.02 (2%).
        null_rate: Probability of returning ``None``. Defaults to 0.02 (2%);
            together with the default ``negative_rate`` this is the
            0.02-0.04 band of the draw.

    Returns:
        ``-abs(quantity)``, ``None``, or *quantity* unchanged.
    """
    rand = random.random()
    if rand < negative_rate:
        return -abs(quantity)
    # The null band sits directly above the negative band.
    if rand < negative_rate + null_rate:
        return None
    return quantity

def generate_order_date():
    """Generate an order date, with about 2% deliberately set in the future.

    Returns:
        Normally the value of ``fake.date_between`` for the window from two
        years ago to today (presumably a ``datetime.date`` -- confirm against
        the Faker version in use). In about 2% of calls, a date 1-30 days
        after today instead.
    """
    # Always sample the past date first so the Faker and `random` call
    # sequence stays the same for every order.
    order_date = fake.date_between(start_date='-2y', end_date='today')
    if random.random() < 0.02:  # 2% future dates
        # Offset from today rather than from the sampled date: adding up to
        # 30 days to a date from the last two years almost always still lands
        # in the past, which defeats the purpose of this branch.
        return datetime.now().date() + timedelta(days=random.randint(1, 30))
    return order_date

def generate_unit_price(product_price):
    """Return the unit price for an order line, mismatched about 5% of the time.

    A mismatched price is *product_price* multiplied by a factor chosen at
    random from 0.5, 1.5 and 2, rounded to two decimals. Otherwise the
    product price is returned as given.
    """
    if random.random() >= 0.05:
        return product_price
    multiplier = random.choice([0.5, 1.5, 2])
    return round(product_price * multiplier, 2)

# 1. Generate Customers
# Each customer gets a UUID, a Faker name/email/address and a random phone
# number. A small share of emails and addresses is deliberately degraded by
# the helper functions.
customers = []
for _ in range(NUM_CUSTOMERS):
    email = fake.email()
    phone = generate_valid_phone()
    # Keep the raw address here: generate_incomplete_address() truncates at
    # the first newline, so newlines must still be present when it runs.
    address = fake.address()
    customers.append({
        'CustomerID': str(uuid.uuid4()),
        'Name': fake.name(),
        'Email': generate_invalid_email(email),
        'Phone': phone,
        # Sanitize only after the optional truncation so the CSV field stays
        # on a single line.
        'Address': generate_incomplete_address(address).replace('\n', ', '),
    })

df_customers = pd.DataFrame(customers)
df_customers.to_csv(f'{OUTPUT_DIR}customers.csv', index=False)
print(f'Generated {len(df_customers)} customers.')

# 2. Generate Products

# Catalogue: category name -> product names that may be drawn for it
categories = {
    'Electronics': [
        'Smartphone', 'Laptop', 'Tablet', 'Camera', 'Headphones',
        'Smartwatch', 'Gaming Console', 'Bluetooth Speaker', 'Drone', 'Wireless Charger',
    ],
    'Books': [
        'Fiction Novel', 'Science Textbook', 'Biography', "Children's Book",
        'Cookbook', 'Travel Guide', 'Mystery Thriller', 'Self-help Book', 'History Book', 'Comic Book',
    ],
    'Clothing': [
        'T-shirt', 'Jeans', 'Dress', 'Jacket', 'Sneakers',
        'Sweater', 'Hat', 'Scarf', 'Socks', 'Gloves',
    ],
    'Home': [
        'Coffee Maker', 'Vacuum Cleaner', 'Blender', 'Microwave', 'Air Purifier',
        'Dishwasher', 'Refrigerator', 'Lamp', 'Toaster', 'Air Conditioner',
    ],
    'Sports': [
        'Basketball', 'Tennis Racket', 'Yoga Mat', 'Football', 'Bicycle',
        'Running Shoes', 'Fitness Tracker', 'Dumbbells', 'Swim Goggles', 'Golf Clubs',
    ],
}

# Category names in insertion order, built once instead of on every iteration.
category_names = list(categories)

products = []
for _ in range(NUM_PRODUCTS):
    # Draw order matters for the seeded sequence: price, category, name.
    base_price = random.uniform(5, 500)
    chosen_category = random.choice(category_names)
    chosen_name = random.choice(categories[chosen_category])
    product_row = {
        'ProductID': str(uuid.uuid4()),
        'Category': chosen_category,
        'ProductName': chosen_name,
        'Price': generate_price(base_price),
        'StockQuantity': generate_quantity(random.randint(0, 1000)),
    }
    products.append(product_row)


df_products = pd.DataFrame(products)
df_products.to_csv(f'{OUTPUT_DIR}products.csv', index=False)
print(f'Generated {len(df_products)} products.')

# 3. Generate Orders
# Every order points at a randomly chosen existing customer, so the customer
# IDs are pulled out of the frame once up front.
customer_ids = df_customers['CustomerID'].tolist()

orders = [
    {
        'OrderID': str(uuid.uuid4()),
        'CustomerID': random.choice(customer_ids),
        'OrderDate': generate_order_date(),
        'TotalAmount': 0,  # Placeholder until the order items are generated
    }
    for _ in range(NUM_ORDERS)
]

df_orders = pd.DataFrame(orders)
df_orders.to_csv(f'{OUTPUT_DIR}orders.csv', index=False)
print(f'Generated {len(df_orders)} orders.')

# 4. Generate OrderItems
# Lookup tables: the IDs to sample from and each product's list price.
order_ids = df_orders['OrderID'].tolist()
product_ids = df_products['ProductID'].tolist()
product_price_map = df_products.set_index('ProductID')['Price'].to_dict()

order_items = []
# Running total per order; every order starts at 0.
order_total_map = dict.fromkeys(order_ids, 0)

for _ in range(NUM_ORDER_ITEMS):
    order_id = random.choice(order_ids)
    product_id = random.choice(product_ids)
    quantity = generate_quantity(random.randint(1, 10))
    unit_price = generate_unit_price(product_price_map[product_id])
    order_items.append({
        'OrderItemID': str(uuid.uuid4()),
        'OrderID': order_id,
        'ProductID': product_id,
        'Quantity': quantity,
        'UnitPrice': unit_price
    })
    # Only lines with a usable quantity and price count toward the order
    # total; a missing (None) or zero value contributes nothing.
    if quantity and unit_price:
        total_price = quantity * unit_price
        order_total_map[order_id] += total_price
    else:
        total_price = 0

df_order_items = pd.DataFrame(order_items)
df_order_items.to_csv(f'{OUTPUT_DIR}order_items.csv', index=False)
print(f'Generated {len(df_order_items)} order items.')

# 5. Update Orders with TotalAmount
# Replace the placeholder with the per-order sum accumulated in
# order_total_map. The sums are built from float multiplications and
# additions, so round to two decimals to keep artifacts such as
# 123.45000000000002 out of the CSV.
df_orders['TotalAmount'] = df_orders['OrderID'].map(order_total_map).round(2)
# Rewrites orders.csv, this time with the real totals.
df_orders.to_csv(f'{OUTPUT_DIR}orders.csv', index=False)
print('Updated orders with TotalAmount.')

print('Data generation complete.')

# Output:
# Generated 10000 customers.
# Generated 500 products.
# Generated 20000 orders.
# Generated 50000 order items.
# Updated orders with TotalAmount.
# Data generation complete.
