In [1]:
import pandas as pd

df_all = pd.read_csv('ad_click_cleaned.csv')
df_no_null = pd.read_csv('ad_click_no_nulls_cleaned.csv')

First testing the ability of the whole dataset in predicting if the ad was clicked.

In [2]:
df_all

Unnamed: 0.1,Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,1,3044,User3044,,Male,Desktop,Top,,,1
2,2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...,...
9995,9995,8510,User8510,,,Mobile,Top,Education,,0
9996,9996,7843,User7843,,Female,Desktop,Bottom,Entertainment,,0
9997,9997,3914,User3914,,Male,Mobile,Side,,Morning,0
9998,9998,7924,User7924,,,Desktop,,Shopping,Morning,1


In [3]:
df_all['ad_position'].unique()

array(['Top', 'Side', nan, 'Bottom'], dtype=object)

In [4]:
# Select the specific columns we should be using based on the results from chi-square.
# And applying one-hot encoding as data science models can only interpret numerical inputs.

df_all = df_all[['ad_position','browsing_history','time_of_day','click']]

df_all_encoded = pd.get_dummies(df_all,columns=['ad_position',
                                        'browsing_history',
                                        'time_of_day'],drop_first=False)

df_all_encoded = df_all_encoded.replace({False: 0, True: 1})

  df_all_encoded = df_all_encoded.replace({False: 0, True: 1})


In [5]:
df_all_encoded

Unnamed: 0,click,ad_position_Bottom,ad_position_Side,ad_position_Top,browsing_history_Education,browsing_history_Entertainment,browsing_history_News,browsing_history_Shopping,browsing_history_Social Media,time_of_day_Afternoon,time_of_day_Evening,time_of_day_Morning,time_of_day_Night
0,1,0,0,1,0,0,0,1,0,1,0,0,0
1,1,0,0,1,0,0,0,0,0,0,0,0,0
2,1,0,1,0,1,0,0,0,0,0,0,0,1
3,1,0,0,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,1,1,0,0,0,0,0,0,0,0
9996,0,1,0,0,0,1,0,0,0,0,0,0,0
9997,0,0,1,0,0,0,0,0,0,0,0,1,0
9998,1,0,0,0,0,0,0,1,0,0,0,1,0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df_all_encoded.drop('click', axis=1)  # Features- we are using all the columns to predict the ad click for now.
y = df_all_encoded['click']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Using the model we just built, try to predict the values with the training data.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# AUC = 'Area under curve'. This summarizes the performance of the binary classifier.
# AUC of 1 would be the perfect model while 0.5 would be equivalent to random guessing.
print("AUC: ", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

              precision    recall  f1-score   support

           0       0.40      0.09      0.14      1055
           1       0.65      0.93      0.77      1945

    accuracy                           0.63      3000
   macro avg       0.53      0.51      0.45      3000
weighted avg       0.56      0.63      0.55      3000

AUC:  0.5324994212892458
Accuracy: 0.63


Now testing to see if the result is any different when we use the dataset with no null values.

In [7]:
df_no_null

Unnamed: 0.1,Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,17,188,User188,56.0,Female,Tablet,Bottom,News,Morning,1
1,25,4890,User4890,43.0,Male,Tablet,Bottom,Education,Afternoon,1
2,33,4985,User4985,37.0,Male,Mobile,Top,News,Evening,0
3,52,9888,User9888,49.0,Male,Mobile,Top,News,Morning,1
4,102,8201,User8201,59.0,Female,Desktop,Bottom,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...,...
811,9951,7268,User7268,28.0,Female,Desktop,Bottom,News,Evening,1
812,9952,5912,User5912,41.0,Non-Binary,Mobile,Side,Education,Night,1
813,9960,9638,User9638,64.0,Non-Binary,Desktop,Top,Entertainment,Morning,0
814,9986,5574,User5574,52.0,Female,Desktop,Bottom,Shopping,Afternoon,1


In [8]:
df_no_null = df_no_null[['ad_position','browsing_history','time_of_day','click']]

df_no_null_encoded = pd.get_dummies(df_no_null,columns=['ad_position',
                                        'browsing_history',
                                        'time_of_day'],drop_first=False)

df_no_null_encoded = df_no_null_encoded.replace({False: 0, True: 1})

  df_no_null_encoded = df_no_null_encoded.replace({False: 0, True: 1})


In [9]:
X = df_no_null_encoded.drop('click', axis=1)  # Features- we are using all the columns to predict the ad click for now.
y = df_no_null_encoded['click']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Using the model we just built, try to predict the values with the training data.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# AUC = 'Area under curve'. This summarizes the performance of the binary classifier.
# AUC of 1 would be the perfect model while 0.5 would be equivalent to random guessing.
print("AUC: ", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

              precision    recall  f1-score   support

           0       0.55      0.35      0.43        89
           1       0.69      0.84      0.76       156

    accuracy                           0.66       245
   macro avg       0.62      0.59      0.59       245
weighted avg       0.64      0.66      0.64       245

AUC:  0.6432584269662922
Accuracy: 0.66


The results are much better when we drop the rows containing null values. However, it turns out that 66% of the rows have atleast 1 null value. So by dropping all null values, we are getting rid of too much information.

It is only acceptable to get rid of null values when a very small proportion of the dataset actually contains missing values (<5%).

In [10]:
rows_with_missing = df_all.isna().any(axis=1)  # True if any NaN in the row
percentage_missing_rows = rows_with_missing.mean() * 100

percentage_missing_rows

np.float64(66.10000000000001)

Since such a high % of rows have missing values, it's better to use 'imputation'- fill the missing values.

In [11]:
df_all

Unnamed: 0,ad_position,browsing_history,time_of_day,click
0,Top,Shopping,Afternoon,1
1,Top,,,1
2,Side,Education,Night,1
3,,Entertainment,Evening,1
4,,Social Media,Morning,0
...,...,...,...,...
9995,Top,Education,,0
9996,Bottom,Entertainment,,0
9997,Side,,Morning,0
9998,,Shopping,Morning,1


In [12]:
df_imputed = df_all.fillna('Unknown')

In [13]:
df_imputed

Unnamed: 0,ad_position,browsing_history,time_of_day,click
0,Top,Shopping,Afternoon,1
1,Top,Unknown,Unknown,1
2,Side,Education,Night,1
3,Unknown,Entertainment,Evening,1
4,Unknown,Social Media,Morning,0
...,...,...,...,...
9995,Top,Education,Unknown,0
9996,Bottom,Entertainment,Unknown,0
9997,Side,Unknown,Morning,0
9998,Unknown,Shopping,Morning,1


In [14]:
df_imputed_encoded = pd.get_dummies(df_imputed,columns=['ad_position',
                                        'browsing_history',
                                        'time_of_day'],drop_first=False)

df_imputed_encoded = df_imputed_encoded.replace({False: 0, True: 1})

  df_imputed_encoded = df_imputed_encoded.replace({False: 0, True: 1})


In [15]:
df_imputed_encoded

Unnamed: 0,click,ad_position_Bottom,ad_position_Side,ad_position_Top,ad_position_Unknown,browsing_history_Education,browsing_history_Entertainment,browsing_history_News,browsing_history_Shopping,browsing_history_Social Media,browsing_history_Unknown,time_of_day_Afternoon,time_of_day_Evening,time_of_day_Morning,time_of_day_Night,time_of_day_Unknown
0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1
9996,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
9997,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
9998,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0


In [16]:
X = df_imputed_encoded.drop('click', axis=1)  # Features- we are using all the columns to predict the ad click for now.
y = df_imputed_encoded['click']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Using the model we just built, try to predict the values with the training data.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# AUC = 'Area under curve'. This summarizes the performance of the binary classifier.
# AUC of 1 would be the perfect model while 0.5 would be equivalent to random guessing.
print("AUC: ", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

              precision    recall  f1-score   support

           0       0.38      0.09      0.14      1055
           1       0.65      0.92      0.76      1945

    accuracy                           0.63      3000
   macro avg       0.52      0.51      0.45      3000
weighted avg       0.56      0.63      0.55      3000

AUC:  0.5328079045797341
Accuracy: 0.63


With imputation, the model gives similar results to when we just leave the missing values in. The best results so far is when get rid of all null values (even though there will only be 816 rows).

It's interesting that your model performs better when removing all rows with missing values, even though the dataset size is reduced to 816 rows. This suggests that the rows with missing values might be introducing noise or inconsistencies into the model, which could be negatively impacting its performance.

Possible Reasons:

Quality vs. Quantity: The remaining 816 rows might be of higher quality, leading to better model performance even with fewer data points.

Imputation Quality: Sometimes, imputation introduces bias or noise, especially if the missing data is not random or if the imputed values do not truly reflect the underlying patterns.

Data Distribution: The rows with missing values could have characteristics that make them more difficult for the model to generalize on, or they might belong to specific segments of the dataset where the relationships between features and the target variable are weaker.

Next Steps:

Assess the Nature of Missing Data: It might be useful to analyze whether there’s a pattern to the missing data. Are certain ad positions or times of day missing more frequently than others? If the missingness is non-random, it could be worth modeling why those values are missing.

Feature Importance Analysis: Perform feature importance analysis to see which features are driving model performance. This might help identify whether the missing values are concentrated in certain critical features.

Data Augmentation: If removing missing rows improves performance but drastically reduces the dataset size, you could explore synthetic data generation or data augmentation to increase the number of rows without reintroducing noise.

## Assessing patterns within the missing data

In [17]:
missing = df_all[df_all.isnull().any(axis=1)]

In [18]:
missing

Unnamed: 0,ad_position,browsing_history,time_of_day,click
1,Top,,,1
3,,Entertainment,Evening,1
4,,Social Media,Morning,0
6,Top,,,1
7,Side,,Evening,0
...,...,...,...,...
9994,,News,Morning,1
9995,Top,Education,,0
9996,Bottom,Entertainment,,0
9997,Side,,Morning,0


In [19]:
missing['browsing_history'].value_counts(normalize=True)*100

browsing_history
Entertainment    22.702407
Social Media     21.498906
Education        19.693654
Shopping         18.326039
News             17.778993
Name: proportion, dtype: float64

In [20]:
missing['ad_position'].value_counts(normalize=True)*100

ad_position
Bottom    34.707158
Top       33.188720
Side      32.104121
Name: proportion, dtype: float64

In [21]:
missing['time_of_day'].value_counts(normalize=True)*100

time_of_day
Morning      26.334056
Afternoon    25.509761
Evening      24.772234
Night        23.383948
Name: proportion, dtype: float64

In [22]:
missing['click'].value_counts(normalize=True)*100

click
1    65.189107
0    34.810893
Name: proportion, dtype: float64

Within all the rows that have a missing value, there doesn't seem to be a specific value that occurs more frequently; the missingness is quite random. Next, since there aren't many rows left when we get rid of the missing values, I will attempt to introduce synthetic data generation to increase the number of rows and give the model more data to learn from- this is known as data augmentation.

Since the dataset is imbalanced (65% clicking the ad and 35% not clicking it), I will apply SMOTE by interpolating minority cases (ad not clicked).

In [23]:
df_no_null['click'].value_counts(normalize=True)*100

click
1    63.235294
0    36.764706
Name: proportion, dtype: float64

In [24]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X = df_no_null_encoded.drop('click', axis=1)  # Features- we are using all the columns to predict the ad click for now.
y = df_no_null_encoded['click']  # Target

# Assume X is your feature set and y is your target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print(f"Original dataset shape: {X_train.shape}")
print(f"Resampled dataset shape: {X_smote.shape}")

Original dataset shape: (571, 12)
Resampled dataset shape: (720, 12)


In [25]:
model = RandomForestClassifier()
# model = RandomForestClassifier(class_weight={0: 2, 1: 1}, random_state=42)

model.fit(X_smote, y_smote)

# Using the model we just built, try to predict the values with the training data.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# AUC = 'Area under curve'. This summarizes the performance of the binary classifier.
# AUC of 1 would be the perfect model while 0.5 would be equivalent to random guessing.
print("AUC: ", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

              precision    recall  f1-score   support

           0       0.51      0.48      0.49        89
           1       0.71      0.73      0.72       156

    accuracy                           0.64       245
   macro avg       0.61      0.61      0.61       245
weighted avg       0.64      0.64      0.64       245

AUC:  0.6124675885911841
Accuracy: 0.64


So we get the best result when we apply SMOTE but the recall and precision for ad not cliked is still below 50%. Maybe if I increased the weighting for the minority group (not clicked), it'll improve the model.

In [26]:
model = RandomForestClassifier()
model.fit(X_smote, y_smote)

# Using the model we just built, try to predict the values with the training data.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# AUC = 'Area under curve'. This summarizes the performance of the binary classifier.
# AUC of 1 would be the perfect model while 0.5 would be equivalent to random guessing.
print("AUC: ", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

              precision    recall  f1-score   support

           0       0.51      0.49      0.50        89
           1       0.72      0.72      0.72       156

    accuracy                           0.64       245
   macro avg       0.61      0.61      0.61       245
weighted avg       0.64      0.64      0.64       245

AUC:  0.6031763180639584
Accuracy: 0.64


To further improve the model, I shall add grid search.

In [29]:
from sklearn.model_selection import GridSearchCV

# Looping through different parameters to identify the one that offers the highest accuracy.

param_grid = {'max_depth': [3, 5, 7, 10, 15, 20], 'n_estimators': [100, 200, 300, 400, 500]}
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid.fit(X_smote, y_smote)

# Get the best model from the grid search
best_model = grid.best_estimator_
print(f"Best model: {best_model}")

best_params = grid.best_params_
print(f"Best parameters found: {best_params}")

# Get the best score (cross-validated performance of the best model)
best_score = grid.best_score_
print(f"Best cross-validated score: {best_score:.4f}")

best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)

print(' ')

print(classification_report(y_test, y_pred_best))

print("AUC: ", roc_auc_score(y_test, y_pred_best))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

Best model: RandomForestClassifier(max_depth=7, n_estimators=400)
Best parameters found: {'max_depth': 7, 'n_estimators': 400}
Best cross-validated score: 0.6639
 
              precision    recall  f1-score   support

           0       0.55      0.35      0.43        89
           1       0.69      0.84      0.76       156

    accuracy                           0.66       245
   macro avg       0.62      0.59      0.59       245
weighted avg       0.64      0.66      0.64       245

AUC:  0.5940290982425814
Accuracy: 0.64


Strangely, applying grid search has not actually improved the overall model by much. The recall of people clicking ads the increased by 12% however it's precision decreased. The recall of those who didn't click the ad has decreased by 14%. The overall accuracy has stayed the same.

In [32]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Calculate probabilities for ROC AUC score
y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC score: {roc_auc:.4f}")

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

              precision    recall  f1-score   support

           0       0.55      0.35      0.43        89
           1       0.69      0.84      0.76       156

    accuracy                           0.66       245
   macro avg       0.62      0.59      0.59       245
weighted avg       0.64      0.66      0.64       245

ROC AUC score: 0.6439
Accuracy: 0.66


Parameters: { "use_label_encoder" } are not used.



In [33]:
param_grid = {'max_depth': [3, 5, 7, 10, 15, 20], 'n_estimators': [100, 200, 300, 400, 500]}
grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='roc_auc')
grid.fit(X_smote, y_smote)

# Get the best model from the grid search
best_model = grid.best_estimator_
print(f"Best model: {best_model}")

best_params = grid.best_params_
print(f"Best parameters found: {best_params}")

# Get the best score (cross-validated performance of the best model)
best_score = grid.best_score_
print(f"Best cross-validated score: {best_score:.4f}")

best_model.fit(X_train, y_train)

y_pred_best = best_model.predict(X_test)

print(' ')

print(classification_report(y_test, y_pred_best))

print("AUC: ", roc_auc_score(y_test, y_pred_best))

accuracy = accuracy_score(y_test, y_pred)
# The accuracy is measured as the number of correct predictions over the total number of predictions.
print(f'Accuracy: {accuracy:.2f}')

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=500,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)
Best parameters found: {'max_depth': 3, 'n_estimators': 500}
Best cross-validated score: 0.7164
 
              precision    recall  f1-score   support

           0       0.55      0.35      0.43        89
           1       0.6