In [4]:
import pandas as pd
import numpy as np
from datetime import timedelta
import random
from faker import Faker

In [5]:
# ------------------------------------------------
# Synthetic funnel event generation
# ------------------------------------------------
# Every user produces a "visit" event and then walks through the funnel
# stages in order; the first failed stage ends that user's journey.

# Reproducibility: seed every random source used below. Faker draws from its
# own random.Random instance, so seeding the stdlib `random` module and NumPy
# alone does not make the generated timestamps repeatable.
SEED = 42
fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

num_users = 10000

# Attribute pools sampled once per user (shared by all of that user's events).
DEVICE_TYPES = ["Mobile", "Desktop"]
TRAFFIC_SOURCES = ["Organic", "Ads", "Referral", "Social Media"]
PRODUCT_CATEGORIES = ["Electronics", "Fashion", "Home", "Beauty", "Sports"]

# (event_name, probability of reaching this stage from the previous one,
#  maximum delay in minutes since the previous event)
FUNNEL_STAGES = [
    ("product_view", 0.70, 5),
    ("add_to_cart", 0.55, 5),
    ("checkout", 0.60, 5),
    ("payment_success", 0.65, 3),
]

events = []

for user in range(1, num_users + 1):
    # NOTE(review): per the Faker docs this samples between the start of the
    # current year and "now", so timestamps still depend on the run date.
    event_time = fake.date_time_this_year()
    price = random.randint(500, 5000)
    device = random.choice(DEVICE_TYPES)
    traffic = random.choice(TRAFFIC_SOURCES)
    category = random.choice(PRODUCT_CATEGORIES)

    # Every user starts with a visit.
    events.append([user, event_time, "visit", device, traffic, category, price])

    for stage_name, pass_rate, max_delay_minutes in FUNNEL_STAGES:
        # One uniform draw decides whether the user advances; a miss means
        # the user dropped off and no later stage is attempted.
        if random.random() >= pass_rate:
            break
        event_time += timedelta(minutes=random.randint(1, max_delay_minutes))
        events.append([user, event_time, stage_name, device, traffic, category, price])

# Create DataFrame (one row per event)
df = pd.DataFrame(
    events,
    columns=[
        "user_id",
        "event_time",
        "event_name",
        "device_type",
        "traffic_source",
        "product_category",
        "price",
    ],
)

# ------------------------------------------------
# Funnel Success Flag (User-level)
# ------------------------------------------------

# A user counts as successful when at least one of their events is a
# "payment_success"; collect the ids of those users.
paid_mask = df["event_name"].eq("payment_success")
successful_users = df.loc[paid_mask, "user_id"].unique()

# Stamp the user-level outcome onto every event row belonging to that user,
# so earlier-stage rows of a converting user are also labelled "Yes".
is_successful_user = df["user_id"].isin(successful_users)
df["funnel_success"] = np.where(is_successful_user, "Yes", "No")

In [7]:
# Preview the first ten event rows to sanity-check the generated columns.
df.head(n=10)

Unnamed: 0,user_id,event_time,event_name,device_type,traffic_source,product_category,price,funnel_success
0,1,2026-01-01 06:30:57,visit,Mobile,Referral,Fashion,1412,No
1,1,2026-01-01 06:31:57,product_view,Mobile,Referral,Fashion,1412,No
2,2,2026-01-05 22:16:16,visit,Mobile,Social Media,Electronics,4967,No
3,2,2026-01-05 22:18:16,product_view,Mobile,Social Media,Electronics,4967,No
4,2,2026-01-05 22:23:16,add_to_cart,Mobile,Social Media,Electronics,4967,No
5,2,2026-01-05 22:25:16,checkout,Mobile,Social Media,Electronics,4967,No
6,3,2026-01-02 19:46:09,visit,Desktop,Ads,Beauty,4964,No
7,3,2026-01-02 19:47:09,product_view,Desktop,Ads,Beauty,4964,No
8,4,2026-01-06 01:01:30,visit,Desktop,Referral,Home,1807,Yes
9,4,2026-01-06 01:04:30,product_view,Desktop,Referral,Home,1807,Yes


In [8]:
# Persist the event log as CSV; index=False keeps the positional row index
# out of the file.
output_path = "funnel_events.csv"
df.to_csv(output_path, index=False)

print("Final funnel dataset created successfully!")

Final funnel dataset created successfully!
