In [20]:
#Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

import matplotlib.pyplot as plt

In [21]:
#Load and Inspect Data
# Read the ad-click CSV (path is relative to the working directory) and
# preview the first five rows.
csv_path = 'ad_click_dataset.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0


In [22]:
# Summarize column names, dtypes and non-null counts to see which columns
# contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB


In [23]:
#Drop useless Columns
# `id` and `full_name` are identifier-like columns that are not used as
# model features.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' lets this cell be re-run after the columns are already gone
# without raising KeyError.
df = df.drop(columns=['id', 'full_name'], errors='ignore')

In [26]:
#Handle Missing Values
#Categorical columns: Fill missing with Unknown
cat_cols = [
    'gender',
    'device_type',
    'ad_position',
    'browsing_history',
    'time_of_day'
]

# Build one {column: fill value} mapping and apply it with a single
# DataFrame-level fillna. Calling df[col].fillna(..., inplace=True) acts on an
# intermediate Series: pandas emits a FutureWarning for it and, from
# pandas 3.0, it no longer updates `df` at all.
#Age: fill with the median of the observed ages
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/test split that happens later in the notebook.
fill_values = {'age': df['age'].median()}
fill_values.update({col: 'Unknown' for col in cat_cols})

df = df.fillna(fill_values)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)


In [27]:
#Encode Categorical Variables
#One-Hot Encoding (Best for Logistic Regression)
# Every column listed in cat_cols is expanded into indicator columns;
# drop_first=True omits the first level of each column, leaving k-1
# indicators for k levels.
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

In [28]:
# Preview the first rows of the frame after one-hot encoding.
df.head()

Unnamed: 0,age,click,gender_Male,gender_Non-Binary,gender_Unknown,device_type_Mobile,device_type_Tablet,device_type_Unknown,ad_position_Side,ad_position_Top,ad_position_Unknown,browsing_history_Entertainment,browsing_history_News,browsing_history_Shopping,browsing_history_Social Media,browsing_history_Unknown,time_of_day_Evening,time_of_day_Morning,time_of_day_Night,time_of_day_Unknown
0,22.0,1,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
1,39.5,1,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True
2,41.0,1,False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False
3,34.0,1,True,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,False
4,39.0,0,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False


In [29]:
#Split Features & Target
from sklearn.model_selection import train_test_split

# `click` is the target; every remaining column is used as a feature.
y = df['click']
X = df.drop(columns='click')

#Train-test split:
# Hold out 20% of the rows for testing, stratified on the target so both
# partitions keep the same class ratio; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [30]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler

# NOTE: the whole feature matrix is handed to the scaler, so the one-hot
# indicator columns are standardized together with `age` (not only the
# numeric column).
# The scaling statistics are learned from the training split only and then
# reused unchanged on the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [31]:
#Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Only the iteration cap is overridden (1000); every other hyperparameter is
# left at the library default.
lr_params = {'max_iter': 1000}
model = LogisticRegression(**lr_params)
model.fit(X_train, y_train)

In [32]:
#Predictions
# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test)

# predict_proba returns one column per class; column 1 is kept as the score
# for the second class (click = 1).
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [34]:
#Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Label-based metrics use the predicted classes; ROC-AUC uses the predicted
# probabilities.
metric_report = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
    ("Recall:", recall_score(y_test, y_pred)),
    ("ROC-AUC:", roc_auc_score(y_test, y_prob)),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

#Confusion Matrix:
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.652
Precision: 0.6513026052104208
Recall: 1.0
ROC-AUC: 0.5628379120879121
Confusion Matrix:
 [[   4  696]
 [   0 1300]]


In [35]:
#Trying with Random Forest
from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed so the fitted forest is repeatable; trained on
# the same scaled training matrix as the logistic regression.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class labels and the probability column for the second class (click = 1).
rf_pred = rf.predict(X_test)
rf_class_probabilities = rf.predict_proba(X_test)
rf_prob = rf_class_probabilities[:, 1]

rf_auc = roc_auc_score(y_test, rf_prob)
rf_conf_matrix = confusion_matrix(y_test, rf_pred)
print("RF ROC-AUC:", rf_auc)
print("RF Confusion Matrix:\n", rf_conf_matrix)

RF ROC-AUC: 0.7458406593406594
RF Confusion Matrix:
 [[ 306  394]
 [ 187 1113]]


In [36]:
#Feature Importance
import pandas as pd

# The forest was fit on the scaler's output rather than on the labelled
# frame, so feature names are recovered from df (all columns except the
# target), which is in the same column order as the training matrix.
feature_names = df.drop(columns='click').columns

# Pair each importance with its feature name and rank from most to least
# important.
feature_importance = pd.Series(rf.feature_importances_, index=feature_names)
feature_importance = feature_importance.sort_values(ascending=False)

feature_importance.head(10)

Unnamed: 0,0
age,0.468601
gender_Unknown,0.04308
device_type_Tablet,0.040408
ad_position_Side,0.036508
device_type_Mobile,0.035885
ad_position_Top,0.035882
device_type_Unknown,0.035236
gender_Male,0.031135
gender_Non-Binary,0.03067
time_of_day_Morning,0.030254


**Feature Importance Analysis:**

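Random Forest feature importance indicates that user age is by far the most influential predictor of ad clicks (importance ≈ 0.47). The next-ranked features — `gender_Unknown`, followed by the device type and ad position indicators — each contribute only about 0.03–0.04, and temporal and other gender-related features provide similar or smaller contributions. This suggests that demographic factors and ad placement play a role in user engagement, but the ranking should be read with care: age is the only continuous feature, and impurity-based importances tend to favor features with many distinct values over binary indicator columns.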

**Logistic Regression vs Random Forest**

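Logistic Regression assumes a linear relationship between the features and the log-odds of a click, whereas Random Forest captures non-linear patterns and interactions. On the test set the Random Forest reaches a ROC-AUC of about 0.75 versus about 0.56 for Logistic Regression, whose predictions are almost all the positive class (only 4 of the 700 non-clicks are identified). The dominance of age and the combined influence of device type and ad position suggest non-linear effects and interactions that are better modeled by ensemble methods.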