In [37]:
#Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix
)

In [40]:
#Load and Inspect Data
def load_data(path):
    """Read the advertising click dataset from a CSV source.

    Parameters
    ----------
    path : str, path-like or file-like
        Location of the CSV; handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``read_csv`` defaults.
    """
    # In production this reader could be swapped for a data-warehouse or
    # pipeline source without touching the callers.
    raw_frame = pd.read_csv(path)
    return raw_frame

# Relative path to the raw click log; kept in one place so it is easy to change.
DATA_PATH = "ad_click_dataset.csv"

df = load_data(DATA_PATH)
df.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0


In [41]:
# Print column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB


In [42]:
#Data Cleaning and Preprocessing
def preprocess_data(df, age_fill_value=None, expected_columns=None):
    """Clean a raw ad-click frame and one-hot encode its categorical columns.

    With the default arguments the result is the same as before: identifier
    columns are dropped, missing ages are filled with the median of
    ``df['age']``, missing categories become ``'Unknown'`` and the categorical
    columns are dummy-encoded with the first level of each dropped.

    The optional arguments make the function usable on data other than the
    frame it was "fitted" on (a held-out split, a scoring batch, a single
    request), where statistics and dummy columns derived from the input
    itself are unreliable.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing ``age`` and the categorical columns ``gender``,
        ``device_type``, ``ad_position``, ``browsing_history`` and
        ``time_of_day``. ``id`` / ``full_name`` are dropped when present.
        The input frame is not modified.
    age_fill_value : float, optional
        Replacement for missing ages. Defaults to the median age of ``df``.
        Pass the training-set median when transforming any other data so no
        statistic is computed from the rows being scored.
    expected_columns : sequence of str, optional
        Output columns to align to, typically the feature columns produced
        for the training data. When given, every category level is encoded
        (nothing is dropped up front) and the result is reindexed to exactly
        these columns: levels absent from the input are added as ``False``
        and columns not listed (including the training baseline levels) are
        removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned, encoded frame.

    Raises
    ------
    KeyError
        If ``age`` or any of the categorical columns is missing from ``df``.
    """
    categorical_cols = [
        'gender',
        'device_type',
        'ad_position',
        'browsing_history',
        'time_of_day'
    ]

    # Drop non-informative identifiers; tolerate their absence so inference
    # payloads that never carried them can go through the same function.
    cleaned = df.drop(columns=['id', 'full_name'], errors='ignore')

    # Handle missing numerical values. Falling back to the input's own median
    # keeps the historical behaviour when no fill value is supplied.
    if age_fill_value is None:
        age_fill_value = cleaned['age'].median()
    # assign() returns a fresh copy, so the caller's frame is never written to.
    cleaned = cleaned.assign(age=cleaned['age'].fillna(age_fill_value))

    # Handle missing categorical values with an explicit 'Unknown' level.
    for col in categorical_cols:
        cleaned[col] = cleaned[col].fillna('Unknown')

    # One-hot encode. When aligning to a known schema, keep every level here:
    # drop_first on a small batch would discard whichever level happens to
    # sort first in that batch, not the baseline chosen at training time.
    encoded = pd.get_dummies(
        cleaned,
        columns=categorical_cols,
        drop_first=expected_columns is None
    )

    if expected_columns is not None:
        encoded = encoded.reindex(columns=list(expected_columns), fill_value=False)

    return encoded

# NOTE(review): this rebinds `df` to the processed frame, so re-running the cell
# without reloading the data fails because the raw columns it expects are gone.
# NOTE(review): preprocessing runs on the full dataset before the train/test
# split, so the age median used for imputation also reflects rows that later
# end up in the test set.
df = preprocess_data(df)
df.head()

Unnamed: 0,age,click,gender_Male,gender_Non-Binary,gender_Unknown,device_type_Mobile,device_type_Tablet,device_type_Unknown,ad_position_Side,ad_position_Top,ad_position_Unknown,browsing_history_Entertainment,browsing_history_News,browsing_history_Shopping,browsing_history_Social Media,browsing_history_Unknown,time_of_day_Evening,time_of_day_Morning,time_of_day_Night,time_of_day_Unknown
0,22.0,1,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
1,39.5,1,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True
2,41.0,1,False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False
3,34.0,1,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,False
4,39.0,0,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False


In [43]:
#Train/Test Split
# Separate the target from the predictors.
y = df['click']
X = df.drop(columns=['click'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# click rate the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [27]:
#Encode Categorical Variables
#One-Hot Encoding (Best for Logistic Regression)
# The encoding itself is performed inside preprocess_data(). The extra
# pd.get_dummies call that used to live here referenced `cat_cols`, which is not
# defined in any visible cell, so it raised NameError on Restart & Run All (and
# would have re-encoded `df` after X/y had already been derived from it).
# Verify instead that no raw object-dtype categorical columns are left.
unencoded_cols = df.select_dtypes(include='object').columns.tolist()
assert not unencoded_cols, f"Unencoded categorical columns remain: {unencoded_cols}"

In [44]:
#Feature Scaling (applied to every feature column)
# NOTE(review): the scaler is fitted on all of X_train, so the one-hot indicator
# columns are standardized along with age, not just the numeric column.
scaler = StandardScaler()

# Fit on the training split only, then reuse those statistics for the test split
# so no test-set information enters the scaling.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [45]:
#Model Training
def train_logistic_regression(X_train, y_train):
    """Fit a logistic regression classifier and return the fitted estimator.

    The solver iteration cap is raised to 1000 (``max_iter=1000``); all other
    settings are the scikit-learn defaults.
    """
    classifier = LogisticRegression(max_iter=1000)
    return classifier.fit(X_train, y_train)

def train_random_forest(X_train, y_train):
    """Fit a random forest classifier and return the fitted estimator.

    Uses 100 trees and a fixed ``random_state`` of 42 so repeated runs build
    the same forest.
    """
    forest_params = {"n_estimators": 100, "random_state": 42}
    return RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Both models are fitted on the same standardized training matrix so their test
# metrics are computed on identical inputs.
lr_model = train_logistic_regression(X_train_scaled, y_train)
rf_model = train_random_forest(X_train_scaled, y_train)

In [46]:
#Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``
    X_test : feature matrix to score
    y_test : true labels for ``X_test``

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``roc_auc`` and
        ``confusion_matrix``. The first three and the confusion matrix use the
        hard predictions; ROC AUC uses the second ``predict_proba`` column
        (assumed to be the positive class).
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    label_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    metrics = {
        metric_name: scorer(y_test, predicted_labels)
        for metric_name, scorer in label_scorers
    }
    metrics["roc_auc"] = roc_auc_score(y_test, positive_scores)
    metrics["confusion_matrix"] = confusion_matrix(y_test, predicted_labels)
    return metrics

In [49]:
#Compare Models
# Score both models on the same standardized test split.
lr_metrics = evaluate_model(lr_model, X_test_scaled, y_test)
rf_metrics = evaluate_model(rf_model, X_test_scaled, y_test)

# NOTE(review): in the recorded output the logistic regression confusion matrix
# is [[4, 696], [0, 1300]], i.e. it predicts a click for all but 4 of the 2000
# test rows. Its accuracy (0.652) is therefore essentially the positive base
# rate (1300/2000), and its recall of 1.0 is not evidence of a good model.
print("Logistic Regression Metrics:", lr_metrics)
print("Random Forest Metrics:", rf_metrics)

Logistic Regression Metrics: {'accuracy': 0.652, 'precision': 0.6513026052104208, 'recall': 1.0, 'roc_auc': np.float64(0.5628379120879121), 'confusion_matrix': array([[   4,  696],
       [   0, 1300]])}
Random Forest Metrics: {'accuracy': 0.7095, 'precision': 0.7385534173855341, 'recall': 0.8561538461538462, 'roc_auc': np.float64(0.7458406593406594), 'confusion_matrix': array([[ 306,  394],
       [ 187, 1113]])}


In [50]:
#Feature Importance
# Label the forest's importances with the feature names and rank them.
# X.columns is used for the labels; this presumes the scaled training matrix
# kept the column order of X (verify if the scaling step changes).
feature_importance = pd.Series(
    rf_model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

# Show the ten highest-ranked features.
feature_importance.head(10)

Unnamed: 0,0
age,0.468601
gender_Unknown,0.04308
device_type_Tablet,0.040408
ad_position_Side,0.036508
device_type_Mobile,0.035885
ad_position_Top,0.035882
device_type_Unknown,0.035236
gender_Male,0.031135
gender_Non-Binary,0.03067
time_of_day_Morning,0.030254


**Scalability and Production Considerations**

1. The preprocessing and training logic has been modularized into functions to support batch inference and reuse in production systems.

2. For large-scale datasets, feature engineering and model training could be executed using distributed processing frameworks such as Spark.

3. In a real-time advertising system, the trained model could be deployed as a microservice to score ad impressions with low latency.

**Feature Importance Analysis:**

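Random Forest feature importance ranks user age as by far the most influential predictor of ad clicks (importance ≈ 0.47), followed at a distance by the missing-gender indicator (`gender_Unknown`), device type, and ad position, each contributing roughly 0.03–0.04. This suggests that demographic factors and ad placement play a role in user engagement, while the remaining temporal and gender-related features provide additional but smaller contributions. Two caveats apply: impurity-based importances tend to favor continuous features such as age over binary indicator columns, and nearly half of the age values were missing and imputed with the median, so the size of the age effect should be interpreted with care.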

**Logistic Regression vs Random Forest**

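Logistic Regression assumes a linear relationship between the features and the log-odds of a click, whereas Random Forest captures non-linear patterns and interactions. The dominance of age and the combined influence of device type and ad position suggest complex interactions that are better modeled by ensemble methods. The test metrics are consistent with this: Random Forest reaches a ROC AUC of about 0.75 versus about 0.56 for Logistic Regression, which predicted a click for almost every test row.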