<a href="https://colab.research.google.com/github/Ron-levi1/Social-Media-Advertisement-Performance/blob/main/part_6_Model_Selection_and_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Importing essential libraries for data processing, model training, evaluation metrics, and resampling. Includes XGBoost, SMOTE for class balancing, and various classification algorithms for comparison.

In [12]:
import warnings; warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    roc_auc_score, log_loss, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb

from imblearn.over_sampling import SMOTE

In [13]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from scipy.stats import randint as sp_randint

#### Mounting Google Drive and loading the preprocessed dataset (df_after_lasso.csv) that contains selected features after LASSO feature selection.

In [14]:
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/df_after_lasso.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Creating a reusable function to calculate Accuracy, Precision, Recall, F1-score, Log-loss, and AUC for consistent model comparison.

In [15]:
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

#### Splitting the dataset into train, development (dev), and test sets using stratified sampling to maintain class balance.Balancing only the training set using SMOTE to handle class imbalance without affecting validation integrity.

In [16]:
X = df.drop(columns=['event_type'])
y = df['event_type']

In [17]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=222, stratify=y
)


X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=222, stratify=y_temp
)

smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

#### Re-defining the metrics function to ensure it is accessible for all subsequent cells and model evaluations.

In [18]:
from sklearn import metrics
def classificationMetrics(y, yhat):
    prf1 = metrics.precision_recall_fscore_support(y, yhat)
    res = {
        'Accuracy': metrics.accuracy_score(y, yhat),
        'Precision': prf1[0][1],
        'Recall': prf1[1][1],
        'f1-score': prf1[2][1],
        'Log-loss': metrics.log_loss(y, yhat),
        'AUC': metrics.roc_auc_score(y, yhat)
    }
    return res

#### Creating a dictionary of models: Decision Tree, Random Forest, XGBoost, Gradient Boosting, AdaBoost, Logistic Regression, and SVM. with tuned parameters for faster computation and stable results.

In [20]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42, n_jobs=-1),
    "XGB": xgb.XGBClassifier(
        random_state=42,
        eval_metric="logloss",
        use_label_encoder=False,
        tree_method="hist",
        n_jobs=-1
    ),
    "GBM": GradientBoostingClassifier(random_state=42, subsample=0.8),
    "ADAboost": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=42,
        solver="saga",
        n_jobs=-1
    ),
    "SVM": SVC(probability=True, random_state=42, max_iter=2000, tol=1e-2, cache_size=2000)
}

#### Training all models on the balanced training data (after SMOTE) and evaluating their performance on the same set to obtain initial performance metrics.

In [21]:
results = []
for name, model in models.items():
    model.fit(X_train_sm, y_train_sm)
    y_train_pred = model.predict(X_train_sm)
    m = classificationMetrics(y_train_sm, y_train_pred)
    results.append({"Model": name, **m})

###### results table

In [22]:
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Model,Accuracy,Precision,Recall,f1-score,Log-loss,AUC
0,Decision Tree,0.999416,0.999937,0.998894,0.999415,0.021062,0.999416
1,RandomForest,0.999397,0.999588,0.999205,0.999397,0.021744,0.999397
2,XGB,0.834049,0.878748,0.775039,0.823642,5.981491,0.834049
3,GBM,0.806666,0.815846,0.792133,0.803815,6.968468,0.806666
4,ADAboost,0.763457,0.736298,0.820926,0.776312,8.525872,0.763457
5,Logistic Regression,0.550421,0.548916,0.565801,0.557231,16.204476,0.550421
6,SVM,0.499626,0.499744,0.73112,0.593685,18.035313,0.499626


###### Tree-based models (Decision Tree, RandomForest) achieved near-perfect scores - indicating overfitting. XGBoost (XGB) and GBM achieved strong, realistic results with high F1 and AUC values.Logistic Regression and SVM performed poorly on this dataset.
###### Chosen models for next stage: XGB (primary) and GBM (backup).

#### Performing 5-Fold Stratified Cross-Validation to assess the robustness and consistency of both candidate models across multiple splits.

In [23]:
from sklearn.model_selection import StratifiedKFold, cross_validate
import pandas as pd

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"f1": "f1", "auc": "roc_auc", "precision": "precision", "recall": "recall"}

xgb_cv = xgb.XGBClassifier(
    random_state=42, eval_metric="logloss", tree_method="hist", n_jobs=-1
)
gbm_cv = GradientBoostingClassifier(random_state=42, subsample=0.8)

cv_models = {"XGB": xgb_cv, "GBM": gbm_cv}

rows = []
for name, mdl in cv_models.items():
    res = cross_validate(mdl, X_train_sm, y_train_sm, cv=cv, scoring=scoring, n_jobs=-1)
    rows.append({
        "Model": name,
        "F1_mean": res["test_f1"].mean(),      "F1_std": res["test_f1"].std(),
        "AUC_mean": res["test_auc"].mean(),    "AUC_std": res["test_auc"].std(),
        "Prec_mean": res["test_precision"].mean(), "Prec_std": res["test_precision"].std(),
        "Rec_mean": res["test_recall"].mean(),     "Rec_std": res["test_recall"].std(),
    })

df_cv = pd.DataFrame(rows).sort_values(["F1_mean","AUC_mean"], ascending=False)
df_cv


Unnamed: 0,Model,F1_mean,F1_std,AUC_mean,AUC_std,Prec_mean,Prec_std,Rec_mean,Rec_std
0,XGB,0.819409,0.00158,0.885493,0.001012,0.874031,0.001576,0.771218,0.002616
1,GBM,0.803746,0.001556,0.869758,0.001567,0.815915,0.002358,0.791939,0.001937


###### XGB achieved higher mean F1 (0.819) and AUC (0.885), with low standard deviations → highly stable and reliable. GBM also performed well but slightly below XGB.


#### Selected model for Fine-Tuning: XGB.

###### Running a Randomized Search to optimize XGB hyperparameters (e.g., n_estimators, max_depth, learning_rate, etc.) using 5-fold CV.After finding the best parameters, evaluating the tuned model on the unbalanced DEV set to check real-world performance.

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

xgb_base = xgb.XGBClassifier(
    random_state=42, eval_metric="logloss", tree_method="hist", n_jobs=-1
)

xgb_param_dist = {
    "n_estimators": randint(100, 400),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.7, 0.3),
    "colsample_bytree": uniform(0.7, 0.3),
    "min_child_weight": randint(1, 6)
}

xgb_rs = RandomizedSearchCV(
    xgb_base,
    param_distributions=xgb_param_dist,
    n_iter=20,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
xgb_rs.fit(X_train_sm, y_train_sm)

print("XGB best params:", xgb_rs.best_params_)
print("XGB best CV F1:", xgb_rs.best_score_)


y_dev_pred = xgb_rs.best_estimator_.predict(X_dev)
classificationMetrics(y_dev, y_dev_pred)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
XGB best params: {'colsample_bytree': np.float64(0.8683831592708489), 'learning_rate': np.float64(0.1641934359909122), 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 306, 'subsample': np.float64(0.8282623055075649)}
XGB best CV F1: 0.846053829241054


{'Accuracy': 0.7902,
 'Precision': np.float64(0.15166340508806261),
 'Recall': np.float64(0.08584404076207355),
 'f1-score': np.float64(0.10963361154335832),
 'Log-loss': 7.561958481036778,
 'AUC': np.float64(0.5003986742302089)}

###### Best parameters:
{'colsample_bytree': 0.868, 'learning_rate': 0.164, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 306, 'subsample': 0.828}
Best CV F1 ≈ 0.846 → indicates strong improvement after tuning.
DEV results show low recall because the test is unbalanced and uses a fixed 0.5 threshold.
The next step is to adjust the decision threshold using probabilities (predict_proba) and then evaluate on the final test set.

#### Final chosen model: XGB (XGBoost) after Random Search Fine-Tuning.

In [27]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

bp = xgb_rs.best_params_
grid_small = {
    "n_estimators": [max(50, int(bp["n_estimators"])-50), int(bp["n_estimators"]), int(bp["n_estimators"])+50],
    "max_depth":    [max(3, int(bp["max_depth"])-1), int(bp["max_depth"]), int(bp["max_depth"])+1],
    "learning_rate":[float(bp["learning_rate"])*0.7, float(bp["learning_rate"]), float(bp["learning_rate"])*1.3],
}

xgb_grid = xgb.XGBClassifier(random_state=42, eval_metric="logloss", tree_method="hist", n_jobs=-1)

gs = GridSearchCV(
    estimator=xgb_grid,
    param_grid=grid_small,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=1
)

gs.fit(X_train_sm, y_train_sm)
print("XGB (grid) best params:", gs.best_params_)
print("XGB (grid) best CV F1:", gs.best_score_)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
XGB (grid) best params: {'learning_rate': 0.21345146678818588, 'max_depth': 10, 'n_estimators': 356}
XGB (grid) best CV F1: 0.8323654158107304


In [31]:
import joblib
joblib.dump(gs.best_estimator_, '/content/drive/MyDrive/xgb_best_model.pkl')



['/content/drive/MyDrive/xgb_best_model.pkl']

In [32]:
pd.DataFrame(X_dev).to_csv('/content/drive/MyDrive/X_dev.csv', index=False)
pd.DataFrame(y_dev).to_csv('/content/drive/MyDrive/y_dev.csv', index=False)
pd.DataFrame(X_test).to_csv('/content/drive/MyDrive/X_test.csv', index=False)
pd.DataFrame(y_test).to_csv('/content/drive/MyDrive/y_test.csv', index=False)