In [2]:
import pandas as pd
import numpy as np
import uuid
import random
from datetime import datetime, timedelta
import os

np.random.seed(10)
random.seed(10)
os.makedirs("data", exist_ok=True)

NUM_USERS = 2000
START_DATE = datetime(2025, 1, 1)
MAX_EVENTS = 80

# Behavior categories and their purchase items
CATEGORY_MAP = {
    "fitness": {
        "keywords": [
            "bmi calculator", "track calories", "steps per day", "fitness tracker",
            "heart rate monitoring", "gym workout routine"
        ],
        "purchase_products": ["SMARTWATCH", "FITNESS_BAND"]
    },
    "smartphone": {
        "keywords": [
            "5G phone review", "android vs ios", "mobile camera test",
            "battery backup tricks", "gaming phone ranking"
        ],
        "purchase_products": ["SMARTPHONE"]
    },
    "gaming": {
        "keywords": [
            "ps5 gameplay", "console comparison", "best controller",
            "fps games list", "gpu for gamers"
        ],
        "purchase_products": ["GAMING_CONSOLE", "VR_SET"]
    },
    "home_entertainment": {
        "keywords": [
            "smart tv reviews", "4k hdr explained", "best sound system",
            "OTT app compare"
        ],
        "purchase_products": ["SMART_TV", "SMART_SPEAKER"]
    },
    "computer": {
        "keywords": [
            "cpu benchmark", "laptop vs desktop", "SSD vs HDD",
            "programming laptop", "gaming laptop"
        ],
        "purchase_products": ["LAPTOP", "DESKTOP_PC"]
    }
}

EVENT_TYPES = ["search", "watch_video", "read_article", "compare", "view_product"]

all_actions = []
all_decisions = []

for user in range(1, NUM_USERS+1):
    num_events = random.randint(15, MAX_EVENTS)
    last_time = START_DATE
    interest_scores = {cat: 0 for cat in CATEGORY_MAP.keys()}

    for _ in range(num_events):
        category = random.choice(list(CATEGORY_MAP.keys()))
        e_type = random.choice(EVENT_TYPES)
        query = random.choice(CATEGORY_MAP[category]["keywords"])
        product = (
            random.choice(CATEGORY_MAP[category]["purchase_products"])
            if e_type == "view_product"
            else ""
        )

        event_id = str(uuid.uuid4())[:8]
        last_time += timedelta(minutes=random.randint(30, 360))

        # Increase interest score for non-random browsing
        if e_type in ["search", "watch_video", "compare"]:
            interest_scores[category] += 1

        all_actions.append([
            user, event_id, last_time, e_type,
            product, category, query
        ])

    # Determine dominant interest → purchase category
    top_cat = max(interest_scores, key=interest_scores.get)
    if interest_scores[top_cat] >= 15:
        decision = str(uuid.uuid4())[:8]
        product = random.choice(CATEGORY_MAP[top_cat]["purchase_products"])
        timestamp = last_time + timedelta(minutes=200)

        all_decisions.append([
            user, decision, timestamp, product, top_cat
        ])

df_actions = pd.DataFrame(all_actions, columns=[
    "user_id", "event_id", "timestamp", "event_type",
    "product_id", "category", "query_text"
])

df_decisions = pd.DataFrame(all_decisions, columns=[
    "user_id", "event_id", "timestamp", "product_id", "category"
])

df_actions.to_csv("data/user_actions.csv", index=False)
df_decisions.to_csv("data/user_decisions.csv", index=False)

print("✓ Multi-category synthetic data generated")
print(df_actions.head(), "\n")
print(df_decisions.head())


✓ Multi-category synthetic data generated
   user_id  event_id           timestamp    event_type    product_id  \
0        1  60b41c33 2025-01-01 02:15:00       compare                 
1        1  978e90d4 2025-01-01 04:07:00       compare                 
2        1  5d6b9111 2025-01-01 05:15:00  view_product  FITNESS_BAND   
3        1  a03c522e 2025-01-01 09:20:00  read_article                 
4        1  0fde862f 2025-01-01 13:25:00  view_product    SMARTPHONE   

             category          query_text  
0  home_entertainment    smart tv reviews  
1  home_entertainment   best sound system  
2             fitness     fitness tracker  
3          smartphone     5G phone review  
4          smartphone  mobile camera test   

   user_id  event_id           timestamp      product_id    category
0       79  b615cb20 2025-01-12 02:41:00          VR_SET      gaming
1       99  e334b5b5 2025-01-11 16:32:00  GAMING_CONSOLE      gaming
2      124  11ca182d 2025-01-10 10:58:00    FITNESS_