# In [2]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
import uuid

# Faker instance for the en_IN locale; it supplies the customer names and
# signup dates generated further down.
fake = Faker('en_IN')

# Seed BOTH random sources. Faker draws from its own generator, which
# random.seed() does not touch, so without Faker.seed() the fake names and
# signup dates would differ on every run even though the stdlib draws repeat.
random.seed(42)
Faker.seed(42)

# --- 1. Product Catalog ---
# Category name -> product names sold under that category.
# Insertion order matters: product IDs are assigned below by walking this
# mapping category by category, item by item.
category_products = {
    "Groceries": [
        "Wheat Flour", "Rice", "Salt", "Sugar", "Tur Dal", "Moong Dal",
        "Chana Dal", "Tea Powder", "Coffee Powder", "Cooking Oil",
        "Mustard Oil", "Sunflower Oil", "Jeera", "Turmeric Powder",
        "Chilli Powder", "Coriander Powder", "Garam Masala", "Pepper",
    ],
    "Beverages": [
        "Orange Juice", "Apple Juice", "Lemon Juice", "Fanta", "Sprite",
        "Maaza", "Mirinda", "Pepsi", "Coca Cola", "Thums Up",
    ],
    "Snacks": [
        "Chips", "Namkeen", "Biscuits", "Mixture", "Popcorn",
        "Salted Peanuts", "Murukku", "Chakli", "Khakra", "Bhujia",
    ],
    "Dairy": [
        "Milk", "Butter", "Curd", "Ghee", "Paneer", "Buttermilk", "Cheese",
        "Yogurt", "Cream",
    ],
    "Personal Care": [
        "Toothpaste", "Toothbrush", "Shampoo", "Hair Oil", "Face Wash",
        "Body Lotion", "Soap", "Face Cream", "Talcum Powder",
    ],
    "Home Care": [
        "Floor Cleaner", "Dish Cleaner", "Room Freshener", "Bulb", "Mop",
        "Detergent Powder", "Toilet Cleaner", "Matchbox",
    ],
    "Bakery": [
        "Bread", "Buns", "Cookies", "Rusk", "Cakes", "Pastries", "Donuts",
    ],
    "Fruits": [
        "Mango", "Banana", "Apple", "Orange", "Papaya", "Pineapple",
        "Watermelon", "Guava", "Pomegranate",
    ],
    "Vegetables": [
        "Potato", "Onion", "Tomato", "Carrot", "Cauliflower", "Cabbage",
        "Beans", "Brinjal", "Spinach",
    ],
    "Meat & Seafood": [
        "Chicken", "Mutton", "Eggs", "Fish", "Prawns", "Crab",
    ],
    "Baby Care": [
        "Diapers", "Baby Lotion", "Baby Soap", "Baby Powder", "Baby Shampoo",
        "Baby Wipes",
    ],
}

# Flatten the catalog into (category, product name) pairs, in catalog order.
catalog_pairs = (
    (category_name, product_name)
    for category_name, product_names in category_products.items()
    for product_name in product_names
)

# One record per product: a sequential ID (P0001, P0002, ...) and a random
# price drawn uniformly from 20 to 500, rounded to two decimals.
products = [
    {
        "product_id": f"P{sequence_no:04}",
        "product_name": product_name,
        "category": category_name,
        "price": round(random.uniform(20, 500), 2),
    }
    for sequence_no, (category_name, product_name) in enumerate(catalog_pairs, start=1)
]

# Next unused product sequence number.
product_id_counter = len(products) + 1

df_products = pd.DataFrame(products)

# --- 2. Customers ---
# Pool of cities; each customer is assigned one at random.
cities = ["Mumbai", "Delhi", "Bengaluru", "Hyderabad", "Chennai", "Kolkata", "Pune", "Ahmedabad", "Jaipur", "Lucknow", "Vizag", "Mangaluru", "Mohali"]

# 1,300 synthetic customers with sequential IDs C00001 .. C01300.
# Per record the draws happen in key order: fake name, random city, then a
# signup date from Faker's relative window '-3y' .. '-1y'
# (presumably three to one years before the day the script runs — see the
# Faker date_between documentation).
customers = [
    {
        "customer_id": f"C{customer_no:05}",
        "customer_name": fake.name(),
        "city": random.choice(cities),
        "signup_date": fake.date_between(start_date='-3y', end_date='-1y'),
    }
    for customer_no in range(1, 1301)
]

df_customers = pd.DataFrame(customers)

# --- 3. Orders (Limit ~10,000) ---
# Build synthetic orders and their line items by cycling over the customers
# until TARGET_ORDER_COUNT orders have been generated.
TARGET_ORDER_COUNT = 10000
# Candidate orders dated after this day are discarded.
LAST_ORDER_DATE = datetime(2024, 12, 31)
# An order is dated this many days (inclusive range) after the signup date.
MIN_ORDER_DELAY_DAYS = 10
MAX_ORDER_DELAY_DAYS = 900

# Fail fast instead of looping forever: if even the earliest possible order
# date of every customer falls after LAST_ORDER_DATE (or there are no
# customers at all), the generation loop below could never add an order.
if not any(
    pd.to_datetime(cust.signup_date) + timedelta(days=MIN_ORDER_DELAY_DAYS)
    <= LAST_ORDER_DATE
    for cust in df_customers.itertuples()
):
    raise ValueError(
        "No customer can place an order on or before "
        f"{LAST_ORDER_DATE:%Y-%m-%d}; adjust the signup dates or the cutoff."
    )

# The catalog does not change while orders are generated, so convert it to a
# list of row dicts once instead of once per order.
product_records = df_products.to_dict("records")

orders = []
order_items = []
order_id_counter = 1
order_item_id_counter = 1

# Keep making passes over the customer list until the target is reached.
while len(orders) < TARGET_ORDER_COUNT:
    for cust in df_customers.itertuples():
        if len(orders) >= TARGET_ORDER_COUNT:
            break

        # Each pass gives a customer between one and five order attempts.
        num_orders = random.randint(1, 5)
        signup_date = pd.to_datetime(cust.signup_date)

        for _ in range(num_orders):
            if len(orders) >= TARGET_ORDER_COUNT:
                break

            order_date = signup_date + timedelta(
                days=random.randint(MIN_ORDER_DELAY_DAYS, MAX_ORDER_DELAY_DAYS)
            )
            # An attempt landing after the cutoff is dropped, not retried,
            # and does not consume an order ID.
            if order_date > LAST_ORDER_DATE:
                continue

            order_id = f"O{order_id_counter:06}"
            order_amount = 0
            # One to five distinct products per order (sample = no repeats).
            num_items = random.randint(1, 5)
            chosen_items = random.sample(product_records, num_items)

            for item in chosen_items:
                qty = random.randint(1, 5)
                order_items.append({
                    "order_item_id": f"OD{order_item_id_counter:06}",
                    "order_id": order_id,
                    "product_id": item["product_id"],
                    "quantity": qty,
                    "unit_price": item["price"]
                })
                order_item_id_counter += 1
                order_amount += qty * item["price"]

            orders.append({
                "order_id": order_id,
                "customer_id": cust.customer_id,
                "order_date": order_date,
                # Rounded once at the end to hide float accumulation noise.
                "order_amount": round(order_amount, 2),
                "month_id": order_date.month,
                "year_id": order_date.year
            })

            order_id_counter += 1

df_orders = pd.DataFrame(orders)
df_order_items = pd.DataFrame(order_items)

# --- Save to CSV ---
# Output file name -> table to write. Files use relative paths, so they are
# created in the current working directory; the DataFrame index is omitted.
csv_outputs = {
    "zepto_product_catalog.csv": df_products,
    "zepto_customers.csv": df_customers,
    "zepto_orders.csv": df_orders,
    "zepto_order_items.csv": df_order_items,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)
