In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [3]:
# Load the product catalog; its itemID column is the pool of items that
# simulated events are drawn from.
products_df = pd.read_csv(
    "Dataset/Synthetic_Dataset/products_catalog.csv",
)
item_ids = products_df.loc[:, 'itemID'].to_list()

In [4]:
# Simulation size: how many users to simulate, and over how many days.
num_users, num_days = 500, 30

# Event types and their sampling probabilities, declared pairwise so a type
# and its weight cannot drift apart, then split into the two parallel lists.
# The mix is view-heavy, with few carts and even fewer purchases.
event_types, event_probs = (
    list(column)
    for column in zip(('view', 0.80), ('cart', 0.15), ('purchase', 0.05))
)

In [5]:
# Simulate shopping events for every user.
#
# Output: `events`, a list of row dicts with the keys timestamp / user_id /
# event_type / itemID / price_seen (one dict per simulated event).
SEED = 42                  # fixed seed so a fresh kernel regenerates the same draws
SECONDS_PER_DAY = 86400
POPULAR_EVENT_SHARE = 0.2  # share of events redirected to the "popular" pool
POPULAR_ITEM_SHARE = 0.1   # leading share of the catalog used as that pool

# Dedicated generators: reproducible draws without reseeding the global RNGs.
rng = random.Random(SEED)
np_rng = np.random.default_rng(SEED)

# --- loop-invariant work, hoisted out of the per-event loop ---

# itemID -> Price lookup table. The first row wins for a repeated itemID,
# which matches what a per-event `products_df[mask]['Price'].values[0]` scan
# returns, but costs O(1) per event instead of a full pass over the catalog.
price_by_item = (
    products_df.drop_duplicates(subset='itemID', keep='first')
    .set_index('itemID')['Price']
    .to_dict()
)

# "Popular" pool = first 10% of catalog rows (presumably the catalog is ordered
# by popularity -- TODO confirm). max(1, ...) keeps the pool non-empty for
# catalogs with fewer than 10 items, where the bare slice would be empty and
# choosing from it raises IndexError.
popular_items = item_ids[:max(1, int(len(item_ids) * POPULAR_ITEM_SHARE))]

# One reference instant for the whole run, so every offset is measured from
# the same "now" regardless of how long the loop takes. Timestamps therefore
# still depend on when the cell is executed; only the offsets are seeded.
now = datetime.now()

# Offsets are drawn from [0, num_days * 86400) seconds, i.e. strictly inside
# the last `num_days` days. (Two inclusive randint draws for days and seconds
# would allow events up to num_days + 1 days old.)
window_seconds = max(num_days, 1) * SECONDS_PER_DAY

events = []

for user_id in range(1, num_users + 1):
    # Each user interacts with roughly 30 to 150 items over the period.
    num_interactions = rng.randint(30, 150)

    for _ in range(num_interactions):
        # Bias: a fixed share of events target the popular pool, the rest are
        # uniform over the whole catalog.
        if rng.random() < POPULAR_EVENT_SHARE:
            item_id = rng.choice(popular_items)
        else:
            item_id = rng.choice(item_ids)

        # Timestamp somewhere inside the simulated window.
        timestamp = now - timedelta(seconds=rng.randrange(window_seconds))
        timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")

        # Event type drawn according to the configured probabilities.
        event = np_rng.choice(event_types, p=event_probs)

        events.append({
            "timestamp": timestamp_str,
            "user_id": user_id,
            "event_type": event,
            "itemID": item_id,
            "price_seen": price_by_item[item_id]
        })

In [None]:
# Materialise the simulated event dicts as a table, one row per event.
events_df = pd.DataFrame(data=events)

# Preview the first five rows.
events_df.head(n=5)

Unnamed: 0,timestamp,user_id,event_type,itemID,price_seen
0,2025-06-14 03:57:42,1,view,445,46306
1,2025-06-19 16:10:32,1,view,529,4170
2,2025-06-15 14:08:08,1,view,3723,24831
3,2025-06-21 01:40:41,1,cart,74,35998
4,2025-06-20 00:31:33,1,view,555,25283


In [7]:
# (number of rows, number of columns) of the events table.
events_df.shape

(45624, 5)

In [8]:
# Shuffle all rows so events are no longer grouped by user, then renumber the
# index 0..n-1. random_state pins the permutation so the saved row order is
# the same on every run.
events_df = events_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Preview the first rows of the shuffled events table.
events_df.head()

Unnamed: 0,timestamp,user_id,event_type,itemID,price_seen
0,2025-06-27 14:24:50,139,view,148,5413
1,2025-06-19 04:54:59,442,view,4255,50151
2,2025-06-03 17:46:31,363,cart,1305,26641
3,2025-06-12 07:33:13,364,view,2296,117865
4,2025-06-22 17:58:37,424,view,222,42484


In [10]:
# Persist the events table as CSV in the working directory, without the
# positional index column.
events_df.to_csv(path_or_buf="user_events.csv", index=False)