In [None]:
pip install faker pandas

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Collecting tzdata (from faker)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 1.9/1.9 MB 11.8 MB/s eta 0:00:00
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, faker
Successfully installed faker-37.1.0 tzdata-2025.2
Note: you may need to restart the kernel to use updated packages.


In [17]:
import random
import pandas as pd
import datetime
from faker import Faker
import os

# Shared Faker instance (locale "en_IN") used by generate_customers() and
# generate_suppliers() for names, company names and phone numbers.
fake = Faker("en_IN")
# Fixed seeds for both Faker and the stdlib `random` module; intended to make
# a fresh Restart & Run All produce the same synthetic data set.
Faker.seed(42)
random.seed(42)

# Configurable row counts, one per generated table
NUM_CITIES = 30  # generate_cities() lists exactly 30 cities; it slices with [:n], so larger values still yield 30 rows
NUM_CUSTOMERS = 1000
NUM_PRODUCTS = 100
NUM_SUPPLIERS = 50
NUM_RENTALS = 5000
NUM_TRANSACTIONS = 2000
NUM_INVENTORY_SNAPSHOTS = 1000
NUM_RETURNS = 500  # returns are drawn from distinct rentals (random.sample in generate_product_returns)

def generate_cities(n):
    """Return the first ``n`` rows of the city dimension.

    Args:
        n: Number of cities wanted. The built-in catalogue holds 30 cities,
            so any larger value still yields 30 rows.

    Returns:
        A list of ``(city_id, city_name, state, country)`` tuples. Ids start
        at 1 and follow catalogue order; the country is always "India".
    """
    catalogue = (
        ("Mumbai", "Maharashtra"),
        ("Delhi", "Delhi"),
        ("Bengaluru", "Karnataka"),
        ("Hyderabad", "Telangana"),
        ("Chennai", "Tamil Nadu"),
        ("Kolkata", "West Bengal"),
        ("Pune", "Maharashtra"),
        ("Ahmedabad", "Gujarat"),
        ("Jaipur", "Rajasthan"),
        ("Chandigarh", "Chandigarh"),
        ("Lucknow", "Uttar Pradesh"),
        ("Bhopal", "Madhya Pradesh"),
        ("Indore", "Madhya Pradesh"),
        ("Nagpur", "Maharashtra"),
        ("Surat", "Gujarat"),
        ("Patna", "Bihar"),
        ("Ranchi", "Jharkhand"),
        ("Guwahati", "Assam"),
        ("Visakhapatnam", "Andhra Pradesh"),
        ("Kochi", "Kerala"),
        ("Mysuru", "Karnataka"),
        ("Vadodara", "Gujarat"),
        ("Ludhiana", "Punjab"),
        ("Agra", "Uttar Pradesh"),
        ("Dehradun", "Uttarakhand"),
        ("Shimla", "Himachal Pradesh"),
        ("Coimbatore", "Tamil Nadu"),
        ("Varanasi", "Uttar Pradesh"),
        ("Jodhpur", "Rajasthan"),
        ("Raipur", "Chhattisgarh"),
    )
    rows = []
    for city_id, (name, state) in enumerate(catalogue[:n], start=1):
        rows.append((city_id, name, state, "India"))
    return rows

def generate_dates(years=3):
    """Build the date dimension: one row per calendar day from 2022-01-01.

    The span covers ``years`` full calendar years. Its length is measured on
    the calendar rather than assumed to be ``365 * years``, so leap days
    (2024-02-29 in the default range) do not push the last day of the final
    year out of the table.

    Festival flags are synthetic, not real holiday dates: each day in
    October/November has a 30% chance of "Diwali", each day in March a 20%
    chance of "Holi", and each day in December a 10% chance of "Christmas".
    The draws come from the module-level ``random`` generator.

    Args:
        years: Number of whole calendar years to cover, starting with 2022.
            Zero or a negative value yields an empty list.

    Returns:
        A list of tuples ``(date_id, full_date, day, month, quarter, year,
        festival, day_of_week, is_weekend)``. ``date_id`` starts at 1 and
        increases by one per day; ``festival`` is a name or ``None``;
        ``is_weekend`` is 1 for Saturday/Sunday, else 0.
    """
    start_date = datetime.date(2022, 1, 1)
    # Exclusive end: 1 January of the year after the last covered year.
    end_date = datetime.date(start_date.year + years, 1, 1)
    total_days = (end_date - start_date).days

    dates = []
    for offset in range(total_days):
        current = start_date + datetime.timedelta(days=offset)
        festival = None
        # random.random() is only consumed for days in a festival month.
        if current.month in [10, 11] and random.random() < 0.3:
            festival = "Diwali"
        elif current.month == 3 and random.random() < 0.2:
            festival = "Holi"
        elif current.month == 12 and random.random() < 0.1:
            festival = "Christmas"
        day_of_week = current.strftime("%A")
        is_weekend = 1 if current.weekday() >= 5 else 0
        quarter = (current.month - 1) // 3 + 1
        dates.append((offset + 1, current, current.day, current.month, quarter, current.year, festival, day_of_week, is_weekend))
    return dates

def generate_products(n):
    """Create ``n`` synthetic furniture products.

    For every product a category is drawn at random, then a name prefix from
    that category's list, then a rental price drawn uniformly from the
    category's price range and rounded to 2 decimals. All draws use the
    module-level ``random`` generator.

    Args:
        n: Number of products to create.

    Returns:
        A list of ``(product_id, product_name, category, rental_price)``
        tuples with ids numbered from 1; the name is "<prefix> <category>".
    """
    # category -> (name prefixes, (min rental price, max rental price))
    catalogue = {
        "Sofa": (["Leather", "Fabric", "Sectional", "Recliner", "Modern", "Classic"], (2000, 10000)),
        "Bed": (["King", "Queen", "Twin", "Bunk", "Platform", "Canopy"], (1500, 8000)),
        "Dining Table": (["Wooden", "Glass", "Marble", "Extendable", "Round", "Rectangular"], (1000, 6000)),
        "Chair": (["Office", "Dining", "Accent", "Folding", "Rocking", "Arm"], (500, 3000)),
        "Wardrobe": (["Single Door", "Double Door", "Sliding Door", "Walk-in", "Mirrored"], (1000, 5000)),
        "Bookshelf": (["Tall", "Short", "Corner", "Ladder", "Cube"], (500, 3000)),
        "TV Unit": (["Wall Mounted", "Floor Standing", "Entertainment Center", "Console"], (800, 4000)),
        "Recliner": (["Manual", "Electric", "Massage", "Lift"], (1500, 7000)),
        "Study Desk": (["Executive", "Computer", "Writing", "Standing", "L-shaped"], (800, 4000)),
        "Bar Stool": (["High", "Low", "Swivel", "Backless", "Upholstered"], (500, 2500)),
    }
    # Dict insertion order fixes the category order that random.choice sees.
    category_names = list(catalogue)

    rows = []
    for product_id in range(1, n + 1):
        category = random.choice(category_names)
        variants, (low, high) = catalogue[category]
        variant = random.choice(variants)
        price = round(random.uniform(low, high), 2)
        rows.append((product_id, f"{variant} {category}", category, price))
    return rows

def generate_customers(n, city_ids):
    """Create ``n`` synthetic customers.

    Names come from the module-level ``fake`` Faker instance; age (18-65),
    gender, home city and customer type are drawn with the module-level
    ``random`` generator, in that order for each customer.

    Args:
        n: Number of customers to create.
        city_ids: Non-empty sequence of city ids to pick the home city from.

    Returns:
        A list of ``(customer_id, customer_name, age, gender, city_id,
        customer_type)`` tuples with ids numbered from 1.
    """
    customer_types = ["Regular", "Premium", "One-time", "Corporate"]
    rows = []
    for customer_id in range(1, n + 1):
        name = fake.name()
        age = random.randint(18, 65)
        gender = random.choice(['M', 'F', 'O'])
        home_city = random.choice(city_ids)
        segment = random.choice(customer_types)
        rows.append((customer_id, name, age, gender, home_city, segment))
    return rows

def generate_suppliers(n, city_ids):
    """Create ``n`` synthetic suppliers.

    Company name and phone number come from the module-level ``fake`` Faker
    instance; the city is drawn with the module-level ``random`` generator.

    Args:
        n: Number of suppliers to create.
        city_ids: Non-empty sequence of city ids to pick the supplier's city from.

    Returns:
        A list of ``(supplier_id, supplier_name, supplier_contact, city_id)``
        tuples with ids numbered from 1.
    """
    rows = []
    for supplier_id in range(1, n + 1):
        company = fake.company()
        contact = fake.phone_number()
        base_city = random.choice(city_ids)
        rows.append((supplier_id, company, contact, base_city))
    return rows

def generate_rentals(n, product_ids, customer_ids, date_ids, product_id_to_rental_price, customer_id_to_city_id, weights):
    """Create ``n`` synthetic rental fact rows.

    Args:
        n: Number of rentals to create.
        product_ids: Sequence of product ids to choose from.
        customer_ids: Sequence of customer ids to choose from.
        date_ids: Sequence of date ids to choose from.
        product_id_to_rental_price: Mapping of product id to rental price.
        customer_id_to_city_id: Mapping of customer id to the customer's city id.
        weights: Relative weights, parallel to ``date_ids``, used when
            picking the rental date.

    Returns:
        A list of ``(rental_id, product_id, customer_id, date_id, city_id,
        quantity, total_amount, discount_amount, rating)`` tuples with ids
        numbered from 1. The city is the customer's city, the quantity is
        1-3, the total is price * quantity, the discount is 0-20% of the
        total, and the rating is 1.0-5.0 rounded to one decimal.
    """
    def _one_rental(rental_id):
        # The order of the random draws below is part of the seeded output.
        buyer = random.choice(customer_ids)
        buyer_city = customer_id_to_city_id[buyer]
        item = random.choice(product_ids)
        units = random.randint(1, 3)
        gross = round(product_id_to_rental_price[item] * units, 2)
        markdown = round(random.uniform(0, 0.2) * gross, 2)
        rented_on = random.choices(date_ids, weights=weights, k=1)[0]
        stars = round(random.uniform(1, 5), 1)
        return (rental_id, item, buyer, rented_on, buyer_city, units, gross, markdown, stars)

    return [_one_rental(rental_id) for rental_id in range(1, n + 1)]

def generate_supplier_transactions(n, supplier_ids, product_ids, date_ids, product_id_to_unit_cost, supplier_id_to_city_id):
    """Create ``n`` synthetic supplier-transaction fact rows.

    Args:
        n: Number of transactions to create.
        supplier_ids: Sequence of supplier ids to choose from.
        product_ids: Sequence of product ids to choose from.
        date_ids: Sequence of date ids to choose from (uniformly).
        product_id_to_unit_cost: Mapping of product id to unit cost.
        supplier_id_to_city_id: Mapping of supplier id to the supplier's city id.

    Returns:
        A list of ``(transaction_id, supplier_id, product_id, date_id,
        city_id, quantity_supplied, cost_amount)`` tuples with ids numbered
        from 1. The quantity is 10-500 and the cost is unit cost * quantity,
        rounded to 2 decimals.
    """
    def _rows():
        for transaction_id in range(1, n + 1):
            vendor = random.choice(supplier_ids)
            vendor_city = supplier_id_to_city_id[vendor]
            item = random.choice(product_ids)
            units = random.randint(10, 500)
            invoice = round(product_id_to_unit_cost[item] * units, 2)
            supplied_on = random.choice(date_ids)
            yield (transaction_id, vendor, item, supplied_on, vendor_city, units, invoice)

    return list(_rows())

def generate_inventory_snapshots(n, product_ids, date_ids, city_ids):
    """Create ``n`` synthetic inventory-snapshot fact rows.

    Args:
        n: Number of snapshots to create.
        product_ids: Sequence of product ids to choose from.
        date_ids: Sequence of date ids to choose from.
        city_ids: Sequence of city ids to choose from.

    Returns:
        A list of ``(snapshot_id, date_id, product_id, city_id,
        inventory_on_hand, inventory_reserved, inventory_damaged)`` tuples
        with ids numbered from 1. On-hand stock is 10-1000, reserved is
        0..on_hand, and damaged is 0..(on_hand - reserved).
    """
    rows = []
    for snapshot_id in range(1, n + 1):
        # Drawn in this order: date, product, city.
        taken_on, item, place = (random.choice(pool) for pool in (date_ids, product_ids, city_ids))
        on_hand = random.randint(10, 1000)
        reserved = random.randint(0, on_hand)
        # Damaged units come out of the unreserved stock, so reserved + damaged <= on_hand.
        damaged = random.randint(0, on_hand - reserved)
        rows.append((snapshot_id, taken_on, item, place, on_hand, reserved, damaged))
    return rows

def generate_product_returns(n, rentals, date_ids):
    """Create return fact rows for a random subset of the rentals.

    Each selected rental yields exactly one return, dated 1-30 date ids after
    the rental and clamped to the largest id in ``date_ids`` so the return
    never points past the end of the date dimension.

    Args:
        n: Number of returns wanted. Because a rental is returned at most
            once, this is capped at ``len(rentals)`` instead of raising
            when it is larger.
        rentals: Rental rows as produced by ``generate_rentals``; fields read
            are rental_id [0], product_id [1], customer_id [2], date_id [3],
            city_id [4] and quantity [5].
        date_ids: All valid date ids. Adding 1-30 to a rental's date id
            assumes consecutive ids; only the upper clamp is derived from
            this sequence.

    Returns:
        A list of ``(return_id, rental_id, product_id, customer_id, date_id,
        city_id, quantity_returned, reason_for_return)`` tuples with ids
        numbered from 1. ``quantity_returned`` is between 1 and the rented
        quantity.
    """
    reasons = ["Defective", "Not as Expected", "Late Return", "Size Issue", "Color Mismatch", "Delivery Delay"]
    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(n, len(rentals))
    # Clamp to the real maximum id; len(date_ids) is only correct when ids are exactly 1..N.
    last_date_id = max(date_ids, default=0)

    returns = []
    for return_id, rental in enumerate(random.sample(rentals, sample_size), start=1):
        rental_id, product_id, customer_id, rental_date_id, city_id, quantity = rental[:6]
        quantity_returned = random.randint(1, quantity)
        date_id = min(rental_date_id + random.randint(1, 30), last_date_id)
        reason = random.choice(reasons)
        returns.append((return_id, rental_id, product_id, customer_id, date_id, city_id, quantity_returned, reason))
    return returns

# Generate Data
# Dimension tables first. Statement order is kept fixed because the generators
# all draw from the same seeded random streams.
cities = generate_cities(NUM_CITIES)
dates = generate_dates()
products = generate_products(NUM_PRODUCTS)

city_ids = [city[0] for city in cities]
customers = generate_customers(NUM_CUSTOMERS, city_ids)
suppliers = generate_suppliers(NUM_SUPPLIERS, city_ids)

# Mappings
product_id_to_rental_price = {pid: price for pid, _name, _category, price in products}
customer_id_to_city_id = {cid: home for cid, _name, _age, _gender, home, _type in customers}
supplier_id_to_city_id = {sid: base for sid, _name, _contact, base in suppliers}
# Unit cost is a random 10-20x multiple of the product's rental price.
product_id_to_unit_cost = {pid: round(random.uniform(10, 20) * price, 2) for pid, _name, _category, price in products}
# Weekend days (index 8) and festival days (index 6) are twice as likely to be picked as rental dates.
weights = [1 if (d[8] != 1 and d[6] is None) else 2 for d in dates]
date_ids = [d[0] for d in dates]

# Id lists shared by the fact-table generators.
product_ids = [product[0] for product in products]
customer_ids = [customer[0] for customer in customers]
supplier_ids = [supplier[0] for supplier in suppliers]

rentals = generate_rentals(NUM_RENTALS, product_ids, customer_ids, date_ids, product_id_to_rental_price, customer_id_to_city_id, weights)
transactions = generate_supplier_transactions(NUM_TRANSACTIONS, supplier_ids, product_ids, date_ids, product_id_to_unit_cost, supplier_id_to_city_id)
inventory_snapshots = generate_inventory_snapshots(NUM_INVENTORY_SNAPSHOTS, product_ids, date_ids, city_ids)
returns = generate_product_returns(NUM_RETURNS, rentals, date_ids)

parent_dit = "Data/"

# Ensure the directory exists
os.makedirs(parent_dit, exist_ok=True)

# Convert to DataFrame and Save
# One (file name, rows, column headers) entry per table, listed in write order.
table_specs = [
    ("Dim_City.csv", cities, ["city_id", "city_name", "state", "country"]),
    ("Dim_Date.csv", dates, ["date_id", "full_date", "day", "month", "quarter", "year", "festival", "day_of_week", "is_weekend"]),
    ("Dim_Product.csv", products, ["product_id", "product_name", "category", "rental_price"]),
    ("Dim_Customer.csv", customers, ["customer_id", "customer_name", "age", "gender", "city_id", "customer_type"]),
    ("Dim_Supplier.csv", suppliers, ["supplier_id", "supplier_name", "supplier_contact", "city_id"]),
    ("Fact_Rentals.csv", rentals, ["rental_id", "product_id", "customer_id", "date_id", "city_id", "quantity", "total_amount", "discount_amount", "rating"]),
    ("Fact_Supplier_Transactions.csv", transactions, ["transaction_id", "supplier_id", "product_id", "date_id", "city_id", "quantity_supplied", "cost_amount"]),
    ("Fact_InventorySnapshot.csv", inventory_snapshots, ["snapshot_id", "date_id", "product_id", "city_id", "inventory_on_hand", "inventory_reserved", "inventory_damaged"]),
    ("Fact_ProductReturns.csv", returns, ["return_id", "rental_id", "product_id", "customer_id", "date_id", "city_id", "quantity_returned", "reason_for_return"]),
]
for file_name, table_rows, column_names in table_specs:
    frame = pd.DataFrame(table_rows, columns=column_names)
    frame.to_csv(os.path.join(parent_dit, file_name), index=False)

print("Synthetic data generated successfully!")

Synthetic data generated successfully!
