In [3]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl (8.6 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl (20.9 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.2 scikit-learn-1.7.2 scipy-1.16.3 threadpoolctl-3.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from pathlib import Path

# Reproducibility: one seed constant, also applied to NumPy's legacy global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Project-relative data locations; the processed directory is created on first
# run and left untouched if it already exists.
DATA_DIR = Path("data")
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

In [9]:
# Read the processed ad-events table from the processed-data directory and
# preview its first rows.
events_path = PROCESSED_DIR / "ad_events_processed.csv"
events_df = pd.read_csv(events_path)
events_df.head()

Unnamed: 0,user_id,ad_id,user_age_bucket,user_location,user_device,user_base_ctr,ad_category,ad_quality_score,ad_base_ctr,bid,time_of_day,interest_match,true_ctr,clicked
0,46993,646,35-44,Tier-3,android,0.03101,fashion,0.852224,0.037387,1.79,morning,0,0.6,1
1,15268,707,25-34,Tier-2,android,0.039462,electronics,0.588347,0.025422,4.95,evening,0,0.575525,1
2,65066,552,35-44,Tier-2,android,0.023422,fashion,0.81809,0.052376,3.24,morning,0,0.6,0
3,26740,640,25-34,Tier-2,ios,0.047996,electronics,0.838686,0.040108,2.28,evening,0,0.6,1
4,69272,643,25-34,Tier-2,ios,0.054098,electronics,0.711505,0.041477,4.7,evening,0,0.6,1


In [11]:
# (rows, columns) of the loaded events table.
events_df.shape

(2000000, 14)

In [12]:
# Mean of the `clicked` column; this is the overall click rate provided `clicked`
# is a 0/1 flag (the previewed rows contain only 0 and 1 — TODO confirm for all rows).
events_df["clicked"].mean()

np.float64(0.5737585)

In [13]:
# Share of rows per ad category (`normalize=True` returns proportions, not counts).
events_df["ad_category"].value_counts(normalize=True)

ad_category
fashion        0.304275
electronics    0.233265
home           0.189702
grocery        0.173550
beauty         0.099208
Name: proportion, dtype: float64

In [14]:
# Share of rows per user device (`normalize=True` returns proportions, not counts).
events_df["user_device"].value_counts(normalize=True)

user_device
android    0.597959
ios        0.250995
web        0.151045
Name: proportion, dtype: float64

In [18]:
# Column roles. Everything not named in one of these lists is treated as a feature.
target_col = ["clicked"]
id_cols = ["user_id", "ad_id"]
leak_cols = []
underlying_prob = ["true_ctr"]

# Build the exclusion set once, then keep the remaining columns in frame order.
excluded_cols = set(target_col + id_cols + leak_cols + underlying_prob)
feature_cols = [col for col in events_df.columns if col not in excluded_cols]

print(feature_cols)
len(feature_cols)

['user_age_bucket', 'user_location', 'user_device', 'user_base_ctr', 'ad_category', 'ad_quality_score', 'ad_base_ctr', 'bid', 'time_of_day', 'interest_match']


10

### Feature Engineering

In [19]:
# Derive two extra columns on a copy of the events table (`assign` returns a new
# frame, so `events_df` itself is not modified):
#   effective_bid   - bid multiplied by the ad quality score
#   is_evening_peak - 1 when time_of_day is "evening", otherwise 0
feat_df = events_df.assign(
    effective_bid=lambda frame: frame["bid"] * frame["ad_quality_score"],
    is_evening_peak=lambda frame: frame["time_of_day"].isin(["evening"]).astype(int),
)
feat_df.head()

Unnamed: 0,user_id,ad_id,user_age_bucket,user_location,user_device,user_base_ctr,ad_category,ad_quality_score,ad_base_ctr,bid,time_of_day,interest_match,true_ctr,clicked,effective_bid,is_evening_peak
0,46993,646,35-44,Tier-3,android,0.03101,fashion,0.852224,0.037387,1.79,morning,0,0.6,1,1.52548,0
1,15268,707,25-34,Tier-2,android,0.039462,electronics,0.588347,0.025422,4.95,evening,0,0.575525,1,2.912318,1
2,65066,552,35-44,Tier-2,android,0.023422,fashion,0.81809,0.052376,3.24,morning,0,0.6,0,2.650612,0
3,26740,640,25-34,Tier-2,ios,0.047996,electronics,0.838686,0.040108,2.28,evening,0,0.6,1,1.912203,1
4,69272,643,25-34,Tier-2,ios,0.054098,electronics,0.711505,0.041477,4.7,evening,0,0.6,1,3.344073,1


In [20]:
# Recompute the feature list so it also picks up the engineered columns; the
# target, id, leak and underlying-probability columns stay excluded.
non_feature_cols = set(target_col) | set(id_cols) | set(leak_cols) | set(underlying_prob)
feature_cols = [name for name in feat_df.columns if name not in non_feature_cols]

feature_cols

['user_age_bucket',
 'user_location',
 'user_device',
 'user_base_ctr',
 'ad_category',
 'ad_quality_score',
 'ad_base_ctr',
 'bid',
 'time_of_day',
 'interest_match',
 'effective_bid',
 'is_evening_peak']

In [21]:
# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    'user_age_bucket',
    'user_location',
    'user_device',
    'ad_category',
    'time_of_day'
]

# Columns that are standard-scaled by the preprocessor.
numerical_features = [
    'user_base_ctr',
    'ad_quality_score',
    'ad_base_ctr',
    'bid',
    'interest_match',
    'effective_bid',
    'is_evening_peak'
]

# The two lists above are hand-written, while `feature_cols` is derived from the
# frame. The ColumnTransformer built from these lists is created without a
# `remainder` argument, so a feature missing from both lists would be dropped
# silently. Fail loudly here if the lists and `feature_cols` ever disagree.
_listed_features = categorical_features + numerical_features
_unassigned = sorted(set(feature_cols) - set(_listed_features))
_unexpected = sorted(set(_listed_features) - set(feature_cols))
assert len(_listed_features) == len(set(_listed_features)), (
    "a feature is listed more than once across categorical/numerical features"
)
assert not _unassigned and not _unexpected, (
    f"feature lists out of sync with feature_cols: "
    f"unassigned={_unassigned}, unexpected={_unexpected}"
)

# Model inputs and target. `target_col` is a list, so `y` is a single-column
# DataFrame rather than a Series.
X = feat_df[feature_cols]
y = feat_df[target_col]

In [26]:
# Two-stage split. Both stages are stratified on the target and seeded with the
# notebook-wide RANDOM_STATE constant so the split is reproducible from one place.
# Stage 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
# Stage 2: take 20% of the remaining 80% as validation, i.e. 16% of all rows,
# leaving 64% for training (overall 64 / 16 / 20 train / val / test).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=RANDOM_STATE, stratify=y_temp
)
X_train.shape, X_val.shape, X_test.shape

((1280000, 12), (320000, 12), (400000, 12))

In [27]:
# Numeric columns: standardise by subtracting the mean and dividing by the
# standard deviation, so that no feature dominates purely because of its scale.
numeric_transformer = StandardScaler()

# Categorical columns: one-hot encode.
#   handle_unknown="ignore" - a category not seen during fit is encoded as all
#                             zeros instead of raising an error
#   sparse_output=False     - return a dense NumPy array rather than a sparse matrix
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Apply each transformer to its own column group; the output places the "cat"
# columns first, followed by the "num" columns.
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformer, categorical_features),
        ("num", numeric_transformer, numerical_features),
    ]
)

In [28]:
# Fit the preprocessor on the training split only, so encoder categories and
# scaler statistics come from training data, then apply the fitted preprocessor
# unchanged to the validation and test splits.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_val, X_test)
)

tuple(arr.shape for arr in (X_train_processed, X_val_processed, X_test_processed))

((1280000, 26), (320000, 26), (400000, 26))

In [29]:
# Names of the one-hot columns produced by the fitted "cat" step, in output order.
fitted_encoder = preprocessor.named_transformers_["cat"]
ohe_feature_names = fitted_encoder.get_feature_names_out(categorical_features)

len(ohe_feature_names), ohe_feature_names[:20]

(19,
 array(['user_age_bucket_18-24', 'user_age_bucket_25-34',
        'user_age_bucket_35-44', 'user_age_bucket_45+',
        'user_location_Tier-1', 'user_location_Tier-2',
        'user_location_Tier-3', 'user_device_android', 'user_device_ios',
        'user_device_web', 'ad_category_beauty', 'ad_category_electronics',
        'ad_category_fashion', 'ad_category_grocery', 'ad_category_home',
        'time_of_day_afternoon', 'time_of_day_evening',
        'time_of_day_morning', 'time_of_day_night'], dtype=object))

In [30]:
# Persist the transformed feature matrices for each split.
np.save(PROCESSED_DIR / "X_train.npy", X_train_processed)
np.save(PROCESSED_DIR / "X_val.npy", X_val_processed)
np.save(PROCESSED_DIR / "X_test.npy", X_test_processed)

# The y_* objects are single-column DataFrames (they were selected with the list
# `target_col`), so `.to_numpy()` alone would give shape (n, 1). Flatten to a 1-D
# (n,) label vector before saving: that is the shape scikit-learn estimators
# expect for `y`, and it avoids accidental (n,) vs (n, 1) broadcasting when the
# labels are later compared with 1-D predictions.
np.save(PROCESSED_DIR / "y_train.npy", y_train.to_numpy().ravel())
np.save(PROCESSED_DIR / "y_val.npy", y_val.to_numpy().ravel())
np.save(PROCESSED_DIR / "y_test.npy", y_test.to_numpy().ravel())

# Save the fitted preprocessor so the identical transformation can be re-applied
# later. Load it with the same scikit-learn version that created it.
joblib.dump(preprocessor, PROCESSED_DIR / "preprocessor.joblib")

['data/processed/preprocessor.joblib']

In [31]:
# Row counts of the three splits.
split_sizes = {
    "n_train": len(y_train),
    "n_val": len(y_val),
    "n_test": len(y_test),
}

# Feature-group lists plus split sizes, written next to the saved arrays as JSON.
meta = {
    "categorical_features": categorical_features,
    "numeric_features": numerical_features,
    **split_sizes,
}
pd.Series(meta).to_json(PROCESSED_DIR / "feature_meta.json")
meta

{'categorical_features': ['user_age_bucket',
  'user_location',
  'user_device',
  'ad_category',
  'time_of_day'],
 'numeric_features': ['user_base_ctr',
  'ad_quality_score',
  'ad_base_ctr',
  'bid',
  'interest_match',
  'effective_bid',
  'is_evening_peak'],
 'n_train': 1280000,
 'n_val': 320000,
 'n_test': 400000}