In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Seed NumPy's legacy global RNG so the np.random.* draws in the cells below
# are repeatable on a fresh Restart & Run All.
# NOTE(review): the DataFrame.sample calls further down pass no random_state;
# presumably they also draw from this global state -- confirm against the
# pandas documentation.
np.random.seed(42)

In [2]:

# --- Simulation scale -------------------------------------------------------
N_USERS = 100_000
N_ADS = 1_000
N_IMPRESSIONS = 2_000_000  # rows in the final log

# --- Output locations (relative to the working directory) -------------------
RAW_DIR = Path("data/raw")
PROCESSED_DIR = Path("data/processed")

# Create both output folders up front; this is a no-op when they already exist.
for _out_dir in (RAW_DIR, PROCESSED_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Categorical vocabularies used when sampling user attributes.
age_bins = ["18-24", "25-34", "35-44", "45+"]
locations = ["Tier-1", "Tier-2", "Tier-3"]
devices = ["android", "ios", "web"]


def simulate_users(n_users: int) -> pd.DataFrame:
    """Generate a synthetic user table with a per-user baseline click propensity.

    Parameters
    ----------
    n_users : int
        Number of users to create; ids run from 1 to ``n_users`` inclusive.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id``, ``age_bucket``, ``location``, ``device_pref`` and
        ``user_base_ctr`` (clipped to the range 0.001-0.15).

    Notes
    -----
    All randomness comes from NumPy's global ``np.random`` state, drawn in the
    order age, location, device, noise.
    """
    # Sample the categorical attributes first (age -> location -> device).
    sampled_age = np.random.choice(age_bins, size=n_users, p=[0.35, 0.35, 0.2, 0.1])
    sampled_location = np.random.choice(locations, size=n_users, p=[0.4, 0.35, 0.25])
    sampled_device = np.random.choice(devices, size=n_users, p=[0.6, 0.25, 0.15])

    users = pd.DataFrame({
        "user_id": np.arange(1, n_users + 1),
        "age_bucket": sampled_age,
        "location": sampled_location,
        "device_pref": sampled_device,
    })

    # Additive shifts on top of a 3% baseline; unmapped categories contribute 0.
    age_shift = users["age_bucket"].map(
        {"18-24": 0.01, "25-34": 0.015, "35-44": 0.005, "45+": -0.005}
    ).fillna(0)
    device_shift = users["device_pref"].map(
        {"android": 0.0, "ios": 0.01, "web": -0.01}
    ).fillna(0)
    jitter = np.random.normal(0, 0.005, size=n_users)

    propensity = 0.03 + age_shift + device_shift + jitter
    users["user_base_ctr"] = propensity.clip(0.001, 0.15)

    return users

# Build the user table at the configured scale (N_USERS rows) and preview
# the first few rows as the cell output.
users_df = simulate_users(N_USERS)
users_df.head()

Unnamed: 0,user_id,age_bucket,location,device_pref,user_base_ctr
0,1,25-34,Tier-2,android,0.044587
1,2,45+,Tier-2,android,0.019357
2,3,35-44,Tier-1,android,0.033264
3,4,25-34,Tier-2,android,0.048499
4,5,18-24,Tier-1,android,0.037543


In [4]:
# Ad verticals an ad can belong to.
ad_categories = ["fashion", "electronics", "grocery", "home", "beauty"]


def simulate_ads(n_ads: int) -> pd.DataFrame:
    """Generate a synthetic ad inventory with quality, baseline CTR and bid.

    Parameters
    ----------
    n_ads : int
        Number of ads to create; ids run from 1 to ``n_ads`` inclusive.

    Returns
    -------
    pd.DataFrame
        Columns ``ad_id``, ``category``, ``quality_score``, ``ad_base_ctr``
        (clipped to the range 0.002-0.25) and ``bid`` (rounded to 2 decimals).

    Notes
    -----
    All randomness comes from NumPy's global ``np.random`` state, drawn in the
    order category, quality, CTR noise, bid.
    """
    # Per-category additive lift on the baseline click-through rate.
    category_lift = {
        "fashion": 0.015,
        "electronics": 0.01,
        "grocery": 0.02,
        "home": 0.008,
        "beauty": 0.018,
    }

    ads = pd.DataFrame({
        "ad_id": np.arange(1, n_ads + 1),
        "category": np.random.choice(ad_categories, size=n_ads, p=[0.3, 0.25, 0.15, 0.2, 0.1]),
    })

    # Beta(4, 2) yields values between 0 and 1, skewed towards the high end.
    ads["quality_score"] = np.random.beta(a=4, b=2, size=n_ads)

    # Baseline CTR = 2% + category lift + quality tilt around 0.5 + noise.
    lift = ads["category"].map(category_lift).fillna(0)
    quality_tilt = 0.03 * (ads["quality_score"] - 0.5)
    jitter = np.random.normal(0, 0.005, size=n_ads)
    raw_ctr = 0.02 + lift + quality_tilt + jitter
    ads["ad_base_ctr"] = raw_ctr.clip(0.002, 0.25)

    # Bids scale with quality: a uniform 1-5 draw times (0.5 + quality).
    bid_multiplier = 0.5 + ads["quality_score"]
    raw_bid = np.random.uniform(1.0, 5.0, size=n_ads) * bid_multiplier
    ads["bid"] = np.round(raw_bid, 2)

    return ads

# Build the ad inventory at the configured scale (N_ADS rows) and preview
# the first few rows as the cell output.
ads_df = simulate_ads(N_ADS)
ads_df.head()

Unnamed: 0,ad_id,category,quality_score,ad_base_ctr,bid
0,1,electronics,0.742371,0.040708,4.03
1,2,fashion,0.941238,0.040861,5.15
2,3,fashion,0.71366,0.045281,2.36
3,4,fashion,0.917448,0.045069,3.08
4,5,beauty,0.876564,0.044783,5.88


In [5]:
# Time-of-day labels an impression can be assigned in simulate_impressions.
time_of_day_slots = ["morning", "afternoon", "evening", "night"]

# Additive contribution of each time slot to the click *logit* (it is summed
# into base_logit before the sigmoid, not added to the probability).
# NOTE(review): these magnitudes (<= 0.02) are tiny next to the other logit
# coefficients used there (2.5, 1.5, 0.8) -- confirm the intended scale.
time_of_day_effect = {
    "morning": 0.0,
    "afternoon": 0.01,
    "evening": 0.02,
    "night": -0.005,
}

# Additive contribution of the impression's user_device to the click logit,
# applied the same way as time_of_day_effect.
device_effect = {
    "android": 0.0,
    "ios": 0.01,
    "web": -0.008,
}

def sigmoid(x: np.ndarray) -> np.ndarray:
    """Numerically stable logistic function ``1 / (1 + exp(-x))``, elementwise.

    The naive form overflows in ``exp(-x)`` for large negative ``x`` (emitting a
    RuntimeWarning, or raising under ``np.errstate(over="raise")``). This version
    only ever exponentiates ``-|x|``, which lies in (0, 1], and picks the
    algebraically equivalent expression for each sign of ``x``.

    Parameters
    ----------
    x : np.ndarray
        Input values (any array-like or scalar convertible to float).

    Returns
    -------
    np.ndarray
        Values in [0, 1] with the same shape as ``x``; a NumPy scalar when ``x``
        is a scalar. NaN inputs propagate as NaN.
    """
    values = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(values))  # exp of a non-positive number: cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function, safe form.
    result = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Preserve scalar-in / scalar-out instead of returning a 0-d array.
    return result[()] if result.ndim == 0 else result

In [6]:
def simulate_impressions(
    n_impr: int,
    users: pd.DataFrame,
    ads: pd.DataFrame,
    *,
    logit_intercept: float = 0.0,
    ctr_floor: float = 0.001,
    ctr_cap: float = 0.6,
) -> pd.DataFrame:
    """Simulate an impression log by pairing random users with random ads.

    Parameters
    ----------
    n_impr : int
        Number of impression rows to generate.
    users : pd.DataFrame
        User table with columns ``user_id``, ``age_bucket``, ``location``,
        ``device_pref`` and ``user_base_ctr``.
    ads : pd.DataFrame
        Ad table with columns ``ad_id``, ``category``, ``quality_score``,
        ``ad_base_ctr`` and ``bid``.
    logit_intercept : float, keyword-only, default 0.0
        Constant added to the click logit before the sigmoid. The default keeps
        the historical behaviour; pass a negative value to lower the overall
        click rate (see Notes).
    ctr_floor, ctr_cap : float, keyword-only, defaults 0.001 and 0.6
        Bounds the final click probability is clipped to.

    Returns
    -------
    pd.DataFrame
        One row per impression with the user and ad attributes, ``time_of_day``,
        ``interest_match`` (0/1), ``true_ctr`` (the clipped click probability)
        and ``clicked`` (a 0/1 Bernoulli draw from ``true_ctr``).

    Raises
    ------
    ValueError
        If ``ctr_floor`` is greater than ``ctr_cap``.

    Notes
    -----
    With the default ``logit_intercept=0.0`` the dominant logit terms
    (quality, interest match, normalised bid) are all non-negative, so the
    sigmoid sits well above 0.5 and, after the user/ad multipliers, many rows
    are pinned at ``ctr_cap``. Supply a negative intercept if click rates
    closer to the ``user_base_ctr`` / ``ad_base_ctr`` scale are wanted.

    Randomness is drawn in a fixed order (user sample, ad sample, time slot,
    logit noise, click draw); no explicit random state is passed anywhere.
    """
    if ctr_floor > ctr_cap:
        raise ValueError(
            f"ctr_floor ({ctr_floor}) must not exceed ctr_cap ({ctr_cap})"
        )

    # Independent draws with replacement: any user can see any ad, repeatedly.
    user_sample = users.sample(n_impr, replace=True).reset_index(drop=True)
    ad_sample = ads.sample(n_impr, replace=True).reset_index(drop=True)

    df = pd.DataFrame({
        "user_id": user_sample["user_id"],
        "ad_id": ad_sample["ad_id"],
        "user_age_bucket": user_sample["age_bucket"],
        "user_location": user_sample["location"],
        "user_device": user_sample["device_pref"],
        "user_base_ctr": user_sample["user_base_ctr"],
        "ad_category": ad_sample["category"],
        "ad_quality_score": ad_sample["quality_score"],
        "ad_base_ctr": ad_sample["ad_base_ctr"],
        "bid": ad_sample["bid"],
    })

    df["time_of_day"] = np.random.choice(time_of_day_slots, size=n_impr, p=[0.25, 0.35, 0.3, 0.1])

    # Affinity flag: younger buckets with fashion/beauty, older with home/grocery.
    df["interest_match"] = (
        (df["user_age_bucket"].isin(["18-24", "25-34"]) & df["ad_category"].isin(["fashion", "beauty"])) |
        (df["user_age_bucket"].isin(["35-44", "45+"]) & df["ad_category"].isin(["home", "grocery"]))
    ).astype(int)

    # The bid is normalised by the largest bid among the sampled impressions.
    base_logit = (
        2.5 * df["ad_quality_score"].values
        + 1.5 * df["interest_match"].values
        + 0.8 * (df["bid"].values / df["bid"].max())
        + df["time_of_day"].map(time_of_day_effect).fillna(0).values
        + df["user_device"].map(device_effect).fillna(0).values
        + np.random.normal(0, 0.3, size=n_impr)
        + logit_intercept
    )

    base_probs = sigmoid(base_logit)

    # Scale by each side's baseline CTR relative to the sample mean (average 1.0).
    user_adj = df["user_base_ctr"].values / df["user_base_ctr"].mean()
    ad_adj = df["ad_base_ctr"].values / df["ad_base_ctr"].mean()

    ctr = base_probs * user_adj * ad_adj
    ctr = np.clip(ctr, ctr_floor, ctr_cap)

    df["true_ctr"] = ctr
    df["clicked"] = (np.random.rand(n_impr) < ctr).astype(int)

    return df

# Generate the full impression log (N_IMPRESSIONS rows) from the user and ad
# tables built above, then preview the first few rows.
events_df = simulate_impressions(N_IMPRESSIONS, users_df, ads_df)
events_df.head()

Unnamed: 0,user_id,ad_id,user_age_bucket,user_location,user_device,user_base_ctr,ad_category,ad_quality_score,ad_base_ctr,bid,time_of_day,interest_match,true_ctr,clicked
0,46993,646,35-44,Tier-3,android,0.03101,fashion,0.852224,0.037387,1.79,morning,0,0.6,1
1,15268,707,25-34,Tier-2,android,0.039462,electronics,0.588347,0.025422,4.95,evening,0,0.575525,1
2,65066,552,35-44,Tier-2,android,0.023422,fashion,0.81809,0.052376,3.24,morning,0,0.6,0
3,26740,640,25-34,Tier-2,ios,0.047996,electronics,0.838686,0.040108,2.28,evening,0,0.6,1
4,69272,643,25-34,Tier-2,ios,0.054098,electronics,0.711505,0.041477,4.7,evening,0,0.6,1


In [7]:
# Sanity check: clicks are Bernoulli draws from true_ctr, so the realised click
# rate and the mean simulated probability should agree closely.
events_df["clicked"].mean(), events_df["true_ctr"].mean()

(np.float64(0.5737585), np.float64(0.5743141942761232))

In [8]:
# Realised click rate per ad category, highest first.
events_df.groupby("ad_category")["clicked"].mean().sort_values(ascending=False)

ad_category
grocery        0.590649
beauty         0.587057
fashion        0.583704
electronics    0.563326
home           0.548227
Name: clicked, dtype: float64

In [9]:
# Destination files for the simulated impression log.
raw_path = RAW_DIR / "ad_events_raw.csv"
proc_path = PROCESSED_DIR / "ad_events_processed.csv"

# The same frame is written, without its index, to both locations.
# NOTE(review): the "processed" file is therefore byte-identical to the raw one
# (and still carries the ground-truth true_ctr column) -- confirm this is intended.
for _csv_path in (raw_path, proc_path):
    events_df.to_csv(_csv_path, index=False)

raw_path, proc_path

(PosixPath('data/raw/ad_events_raw.csv'),
 PosixPath('data/processed/ad_events_processed.csv'))