In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed the legacy global NumPy RNG so every draw below is repeatable.
np.random.seed(42)

# Number of simulated users.
N = 50000

# Sequential user IDs 1..N.
user_id = np.arange(N) + 1

# Assign each user to one of the two arms (uniform choice, with replacement).
variant = np.random.choice(["control", "treatment"], size=N)
is_control = variant == "control"

# Per-user Bernoulli probabilities, selected by arm:
#   conversion: 0.10 (control) vs 0.115 (treatment)
#   bounce:     0.40 (control) vs 0.38  (treatment)
conversion_prob = np.where(is_control, 0.10, 0.115)
bounce_prob = np.where(is_control, 0.40, 0.38)

# NOTE: all draws below consume the same seeded stream, so their order
# (conversion -> revenue -> bounce) determines the generated data.

# Conversion outcome: one Bernoulli trial per user.
converted = np.random.binomial(n=1, p=conversion_prob)

# Revenue: a Gamma(shape=2, scale=50) amount is drawn for every user and
# then zeroed out for users who did not convert.
gamma_amount = np.random.gamma(shape=2.0, scale=50.0, size=N)
revenue = np.where(converted == 1, gamma_amount, 0)

# Bounce outcome: one Bernoulli trial per user.
bounced = np.random.binomial(n=1, p=bounce_prob)

# Assemble one row per user; keyword order fixes the column order.
df = pd.DataFrame(
    dict(
        user_id=user_id,
        variant=variant,
        converted=converted,
        revenue=revenue,
        bounced=bounced,
    )
)

df.head()

Unnamed: 0,user_id,variant,converted,revenue,bounced
0,1,control,0,0.0,1
1,2,treatment,0,0.0,1
2,3,control,0,0.0,1
3,4,control,1,69.96121,0
4,5,control,0,0.0,1


In [2]:
# Write the simulated A/B test frame to CSV (index column omitted).
# Create the target folder first: to_csv raises OSError when the parent
# directory does not exist (e.g. on a fresh checkout), which would break
# Restart & Run All. mkdir(..., exist_ok=True) is a no-op if it already exists.
raw_csv_path = Path("../data/raw/ab_test_raw.csv")
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_csv_path, index=False)


In [3]:
# Sanity checks on the generated data.
# Only a cell's last expression is auto-displayed, so the variant split has to
# be shown explicitly; as a bare statement its result was computed and discarded.
variant_share = df["variant"].value_counts(normalize=True)
display(variant_share)  # `display` is injected as a builtin by the IPython kernel

# Overall conversion rate across all users (last expression -> cell output).
df["converted"].mean()


np.float64(0.10488)