In [1]:
!pip install faker

Collecting faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 2.0/2.0 MB 32.3 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-38.2.0


In [2]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

fake = Faker()

# ---------------- Settings ----------------
NUM_CUSTOMERS = 50000
NUM_PRODUCTS = 500
NUM_ORDERS = 120000
RETURN_RATE = 0.08   # 8% of orders get returned
SEED = 42            # fixed seed for every random source used below
# ------------------------------------------

# Seed all three generators the notebook draws from so a fresh
# "Restart & Run All" reproduces the same random draws:
#   - `random`    -> random.choice / randint / uniform
#   - `np.random` -> DataFrame.sample() when no random_state is passed
#   - `Faker`     -> names, emails, cities, dates
# NOTE: dates are generated relative to 'today', so the generated dates
# still shift with the day on which the notebook is executed.
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# ---------------- Customers ----------------
acquisition_channels = ["Facebook", "Google Ads", "Instagram", "Organic", "Referral"]
customer_columns = [
    "customer_id", "name", "email", "city", "state",
    "acquisition_channel", "signup_date"
]

# One record per customer, ids running 1..NUM_CUSTOMERS. Each field is an
# independent fake value; the signup date falls within the last three years.
customers = [
    [
        customer_id,
        fake.name(),
        fake.email(),
        fake.city(),
        fake.state(),
        random.choice(acquisition_channels),
        fake.date_between(start_date='-3y', end_date='today'),
    ]
    for customer_id in range(1, NUM_CUSTOMERS + 1)
]

customers_df = pd.DataFrame(customers, columns=customer_columns)

# ---------------- Products ----------------
categories = ["Electronics", "Clothing", "Home", "Beauty", "Sports", "Toys"]
product_columns = ["product_id", "product_name", "category", "price"]

products = []
for product_id in range(1, NUM_PRODUCTS + 1):
    # The price is drawn before the category on purpose: both come from the
    # shared `random` generator, so the draw order determines the values.
    unit_price = round(random.uniform(10, 500), 2)
    product_name = fake.word().capitalize()
    category = random.choice(categories)
    products.append([product_id, product_name, category, unit_price])

products_df = pd.DataFrame(products, columns=product_columns)

# ---------------- Orders ----------------
# Orders are spread over (roughly) the last two years, but never earlier than
# the signup date of the customer who placed them. Customers sign up over the
# last three years, so drawing the order date independently would produce
# orders that predate the customer's signup.
ORDER_WINDOW_DAYS = 730
earliest_order_date = datetime.now().date() - timedelta(days=ORDER_WINDOW_DAYS)

# customer_id -> signup_date, built once so the per-order lookup is O(1).
signup_by_customer = dict(zip(customers_df["customer_id"], customers_df["signup_date"]))

orders = []
for i in range(NUM_ORDERS):
    customer = random.randint(1, NUM_CUSTOMERS)
    # Start of the valid range: the later of "window start" and "signup date".
    # signup_date is never after today, so the range is never empty.
    order_start = max(signup_by_customer[customer], earliest_order_date)
    order_date = fake.date_between(start_date=order_start, end_date='today')

    orders.append([
        i+1,
        customer,
        order_date,
        random.choice(["Credit Card", "PayPal", "Apple Pay"]),
        random.choice(["Delivered", "Shipped", "Processing"])
    ])

orders_df = pd.DataFrame(orders, columns=[
    "order_id", "customer_id", "order_date", 
    "payment_method", "status"
])

# ---------------- Order Items ----------------
# product_id -> unit price, built once. Looking the price up in this dict
# replaces a boolean-mask scan of products_df for every single line item.
# Product ids are unique in products_df, so the lookup is unambiguous.
price_by_product = dict(zip(products_df["product_id"], products_df["price"]))

order_items = []

for order_id in range(1, NUM_ORDERS+1):
    for _ in range(random.randint(1, 5)):  # 1–5 line items per order
        # NOTE: product ids are drawn independently, so one order can list
        # the same product on more than one line.
        product = random.randint(1, NUM_PRODUCTS)
        quantity = random.randint(1, 3)
        price = price_by_product[product]

        order_items.append([
            order_id,
            product,
            quantity,
            round(price * quantity, 2)  # line total = unit price x quantity
        ])

order_items_df = pd.DataFrame(order_items, columns=[
    "order_id", "product_id", "quantity", "total_price"
])

# ---------------- Returns ----------------
return_reasons = ["Damaged", "Wrong Item", "Not Needed", "Bad Quality"]

# Randomly pick the RETURN_RATE fraction of orders that get returned.
returned_orders = orders_df.sample(frac=RETURN_RATE)

# One return record per sampled order; the return date lies between the
# order date and today.
returns = [
    [
        order_id,
        fake.date_between(start_date=order_date, end_date='today'),
        random.choice(return_reasons),
    ]
    for order_id, order_date in zip(
        returned_orders["order_id"], returned_orders["order_date"]
    )
]

returns_df = pd.DataFrame(
    returns, columns=["order_id", "return_date", "return_reason"]
)

# ---------------- Save Files ----------------
# Output directory, relative to the notebook's working directory. Create it
# first: to_csv() does not create missing parent directories and would raise
# on a fresh checkout where ../data does not exist yet.
DATA_DIR = Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# File name -> table; every table is written as CSV without the index column.
tables_to_save = {
    "customers.csv": customers_df,
    "products.csv": products_df,
    "orders.csv": orders_df,
    "order_items.csv": order_items_df,
    "returns.csv": returns_df,
}
for file_name, table_df in tables_to_save.items():
    table_df.to_csv(DATA_DIR / file_name, index=False)

print("E-commerce Dataset Created Successfully!")


E-commerce Dataset Created Successfully!
