In [0]:
# Databricks notebook source
import random
from datetime import datetime, timedelta
from pyspark.sql import functions as F
from pyspark.sql.types import *
# Seed the stdlib RNG up front so every run draws the same random sequence.
random.seed(42)

# ---------------------------------------------------------------------------
# Configuration: dataset sizes, the simulated date window, and the output root
# ---------------------------------------------------------------------------
NUM_CUSTOMERS = 100_000
NUM_PRODUCTS = 10_000
NUM_TRANSACTIONS = 5_000_000
NUM_RATINGS = 1_000_000
START_DATE = datetime(year=2022, month=1, day=1)
END_DATE = datetime(year=2024, month=12, day=31)
OUTPUT_PATH = "/Volumes/workspace/default/ecommerce_project_volume/raw"

print(f"Generating: {NUM_CUSTOMERS:,} customers, {NUM_PRODUCTS:,} products")
print(f" {NUM_TRANSACTIONS:,} transactions, {NUM_RATINGS:,} ratings")

# ---------------------------------------------------------------------------
# Reference data shared by the generator cells below
# ---------------------------------------------------------------------------
# Maps each top-level product category to its subcategories.
CATEGORIES = {
    "Electronics": [
        "Smartphones", "Laptops", "Headphones", "Cameras",
    ],
    "Clothing": [
        "Men's Tops", "Women's Tops", "Shoes", "Accessories",
    ],
    "Home": [
        "Furniture", "Kitchen", "Bedding", "Decor",
    ],
    "Books": [
        "Fiction", "Non-Fiction", "Technical", "Children's",
    ],
    "Sports": [
        "Exercise", "Outdoor", "Team Sports", "Water Sports",
    ],
}
STATES = [
    "CA", "TX", "FL", "NY", "PA",
    "IL", "OH", "GA", "NC", "MI",
]
SEGMENTS = ["Budget", "Regular", "Premium", "VIP"]
BRANDS = [
    "TechPro", "HomeStyle", "FitLife", "StyleMax", "ValueChoice",
]

# Generate Customers
print("\n■ Generating customers...")

In [0]:
def generate_customers(n, active_rate=0.9):
    """Build ``n`` synthetic customer records.

    Each record is a plain dict so the list can be handed straight to
    ``spark.createDataFrame``. Values are drawn from the module-level
    ``random`` generator, so the output is reproducible for a given seed.

    Args:
        n: Number of customer records to generate.
        active_rate: Probability in [0, 1] that a customer is flagged
            ``is_active``. NOTE(review): the 0.9 default is an assumption;
            the previous code stored the raw ``random.random()`` float in
            this field, which gave no usable flag. Confirm the intended rate.

    Returns:
        A list of ``n`` dicts with the keys ``customer_id``, ``first_name``,
        ``last_name``, ``email``, ``phone``, ``state``, ``segment``,
        ``registration_date`` (``YYYY-MM-DD`` string), ``birth_year`` and
        ``is_active`` (bool).
    """
    customers = []
    first_names = ["James", "Mary", "John", "Patricia", "Robert", "Jennifer"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia"]
    # Relative likelihood of each entry in SEGMENTS, in the same order.
    segment_weights = [0.3, 0.45, 0.2, 0.05]

    for i in range(n):
        customers.append({
            # IDs are one-based and zero-padded to eight digits.
            "customer_id": f"CUST_{str(i+1).zfill(8)}",
            "first_name": random.choice(first_names),
            "last_name": random.choice(last_names),
            # The email uses the zero-based loop index, unlike customer_id.
            "email": f"user{i}@email.com",
            "phone": f"{random.randint(200,999)}-{random.randint(200,999)}-{random.randint(1000,9999)}",
            "state": random.choice(STATES),
            "segment": random.choices(SEGMENTS, weights=segment_weights)[0],
            # Registrations fall within 0-365 days after START_DATE.
            "registration_date": (START_DATE + timedelta(days=random.randint(0,365))).strftime("%Y-%m-%d"),
            "birth_year": random.randint(1950, 2005),
            # Still exactly one random draw per record, now mapped to a real
            # boolean so the column is usable as a flag.
            "is_active": random.random() < active_rate
        })
        # Progress heartbeat every 25,000 records.
        if (i+1) % 25000 == 0:
            print(f" {i+1:,} customers...")
    return customers
# Build the customer records in driver memory, then persist them as Parquet,
# replacing any output left by a previous run.
customers = generate_customers(NUM_CUSTOMERS)
customers_df = spark.createDataFrame(customers)
customers_path = f"{OUTPUT_PATH}/customers"
(
    customers_df.write
    .mode("overwrite")
    .parquet(customers_path)
)
print(f"■ Saved {len(customers):,} customers")

In [0]:
# Generate Products
print("\n■ Generating products...")
def generate_products(n):
    """Build ``n`` synthetic product records.

    Each product gets a random category/subcategory pair from ``CATEGORIES``,
    a price drawn uniformly from that category's price band, and a single
    brand from ``BRANDS`` that is used for both the ``brand`` field and the
    ``product_name`` prefix.

    The brand was previously drawn twice (once for the name, once for the
    field), so the two usually disagreed. Drawing it once consumes one fewer
    random number per product, so output for a given seed differs from
    earlier runs.

    Args:
        n: Number of product records to generate.

    Returns:
        A list of ``n`` dicts with the keys ``product_id``, ``product_name``,
        ``category``, ``subcategory``, ``brand``, ``price``, ``cost``,
        ``stock_quantity``, ``avg_rating`` and ``is_available``.
    """
    # Loop-invariant lookups, built once instead of on every iteration.
    category_names = list(CATEGORIES.keys())
    # (min, max) price band per category.
    base_prices = {"Electronics": (50,2000), "Clothing": (15,500),
                   "Home": (20,1000), "Books": (10,100), "Sports": (25,800)}

    products = []
    for i in range(n):
        category = random.choice(category_names)
        subcategory = random.choice(CATEGORIES[category])

        # Categories without a configured band fall back to (10, 500).
        min_p, max_p = base_prices.get(category, (10,500))
        price = round(random.uniform(min_p, max_p), 2)

        # One draw, so the name prefix and the brand column always agree.
        brand = random.choice(BRANDS)

        products.append({
            # IDs are one-based and zero-padded to seven digits.
            "product_id": f"PROD_{str(i+1).zfill(7)}",
            "product_name": f"{brand} {subcategory} {i}",
            "category": category,
            "subcategory": subcategory,
            "brand": brand,
            "price": price,
            # Cost is 30-70% of the list price.
            "cost": round(price * random.uniform(0.3, 0.7), 2),
            "stock_quantity": random.randint(0, 1000),
            "avg_rating": round(random.uniform(2.5, 5.0), 1),
            "is_available": True
        })
    return products
# Build the product catalogue in driver memory, then persist it as Parquet,
# replacing any output left by a previous run.
products = generate_products(NUM_PRODUCTS)
products_df = spark.createDataFrame(products)
products_path = f"{OUTPUT_PATH}/products"
(
    products_df.write
    .mode("overwrite")
    .parquet(products_path)
)
print(f"■ Saved {len(products):,} products")

In [0]:
# Generate Transactions
print("\n■ Generating transactions...")
# Lookup structures derived from the customer and product records built above.
customer_ids = [c["customer_id"] for c in customers]
product_ids = [p["product_id"] for p in products]
product_prices = {p["product_id"]: p["price"] for p in products}

# Generate and write transactions in batches to avoid OOM
BATCH_SIZE = 500_000
# Ceiling division: a final partial batch counts as one more batch.
num_batches = NUM_TRANSACTIONS // BATCH_SIZE + int(NUM_TRANSACTIONS % BATCH_SIZE > 0)
transactions_path = f"{OUTPUT_PATH}/transactions"
# Loop-invariant: number of whole days in the simulated window.
date_span_days = (END_DATE - START_DATE).days

for batch_num in range(num_batches):
    start_idx = batch_num * BATCH_SIZE
    end_idx = min((batch_num + 1) * BATCH_SIZE, NUM_TRANSACTIONS)
    batch_size = end_idx - start_idx

    transactions = []
    for i in range(batch_size):
        cid = random.choice(customer_ids)
        pid = random.choice(product_ids)
        qty = int(random.choices([1,2,3,4,5], weights=[0.6,0.25,0.1,0.03,0.02])[0])
        # Charged price varies +/-20% around the catalogue price.
        price = float(product_prices.get(pid, 50)) * random.uniform(0.8, 1.2)
        # The total is computed from the unrounded price; unit_price is rounded
        # separately below, so unit_price * quantity can differ by a cent.
        total = round(price * qty, 2)
        disc_pct = int(random.choices([0,5,10,15,20], weights=[0.5,0.2,0.15,0.1,0.05])[0])
        disc_amt = round(total * disc_pct / 100, 2)

        txn_time = START_DATE + timedelta(
            days=random.randint(0, date_span_days),
            hours=random.randint(0,23), minutes=random.randint(0,59)
        )

        # Numeric fields are cast to float explicitly - presumably so every row
        # presents the same Python type to Spark's schema inference.
        transactions.append({
            # IDs are globally sequential across batches (one-based).
            "transaction_id": f"TXN_{str(start_idx + i + 1).zfill(10)}",
            "customer_id": cid,
            "product_id": pid,
            "quantity": float(qty),
            "unit_price": round(float(price), 2),
            "total_amount": float(total),
            "discount_percent": float(disc_pct),
            "discount_amount": float(disc_amt),
            "final_amount": round(float(total - disc_amt), 2),
            "shipping_cost": float(random.choice([0, 5.99, 7.99, 9.99])),
            "payment_method": random.choice(["Credit Card", "PayPal", "Debit Card"]),
            "status": random.choices(
                ["Completed","Pending","Failed","Refunded"],
                weights=[0.85,0.05,0.05,0.05]
            )[0],
            "transaction_timestamp": txn_time.strftime("%Y-%m-%d %H:%M:%S")
        })

    transactions_df = spark.createDataFrame(transactions)
    # The first batch replaces any output from a previous run; later batches
    # append to it. Appending every batch made a re-run stack a second full
    # copy of the data (with duplicate transaction IDs) on top of the first.
    write_mode = "overwrite" if batch_num == 0 else "append"
    transactions_df.write.mode(write_mode).parquet(transactions_path)
    print(f"■ Saved batch {batch_num+1}/{num_batches}: {batch_size:,} transactions")

In [0]:
# Generate Ratings
print("\n■ Generating ratings...")
def generate_ratings(n, review_rate=0.3):
    """Build ``n`` synthetic product-rating records.

    Customer and product IDs are sampled from the module-level
    ``customer_ids`` and ``product_ids`` lists; rating dates fall between
    ``START_DATE`` and ``END_DATE`` inclusive.

    Args:
        n: Number of rating records to generate. ``0`` returns an empty list.
        review_rate: Probability in [0, 1] that a rating is flagged
            ``has_review``. NOTE(review): the 0.3 default is an assumption;
            the previous code stored the raw ``random.random()`` float in
            this field. Confirm the intended rate.

    Returns:
        A list of ``n`` dicts with the keys ``rating_id``, ``customer_id``,
        ``product_id``, ``rating`` (1-5), ``has_review`` (bool),
        ``rating_date`` (``YYYY-MM-DD`` string) and ``helpful_votes``.
    """
    # Loop-invariant: number of whole days in the simulated window.
    span_days = (END_DATE-START_DATE).days
    # Ratings skew positive: 4 and 5 stars carry most of the weight.
    star_weights = [0.05,0.08,0.15,0.35,0.37]

    ratings = []
    for i in range(n):
        ratings.append({
            # IDs are one-based and zero-padded to nine digits.
            "rating_id": f"RAT_{str(i+1).zfill(9)}",
            "customer_id": random.choice(customer_ids),
            "product_id": random.choice(product_ids),
            "rating": random.choices([1,2,3,4,5], weights=star_weights)[0],
            # Still exactly one random draw per record, now mapped to a real
            # boolean so the column is usable as a flag.
            "has_review": random.random() < review_rate,
            "rating_date": (START_DATE + timedelta(
                            days=random.randint(0, span_days))).strftime("%Y-%m-%d"),
            "helpful_votes": random.randint(0, 100)
        })
        # Progress heartbeat every 250,000 records. This check must sit inside
        # the loop: outside it, it reported only once after the loop and
        # raised on n == 0 because ``i`` was never bound.
        if (i+1) % 250000 == 0:
            print(f" {i+1:,} ratings...")
    return ratings
# Build the ratings in driver memory, persist them as Parquet (replacing any
# output left by a previous run), then print the closing banner.
ratings = generate_ratings(NUM_RATINGS)
ratings_df = spark.createDataFrame(ratings)
ratings_path = f"{OUTPUT_PATH}/ratings"
(
    ratings_df.write
    .mode("overwrite")
    .parquet(ratings_path)
)
print(f"■ Saved {len(ratings):,} ratings")

banner = "=" * 50
print("\n" + banner)
print("■ DATA GENERATION COMPLETE!")
print(banner)