# Machine Learning Model Development

##  Goal: Predict Conversion & Optimize Marketing Strategies

### 📖 Overview:
In this notebook, we will develop machine learning models to predict **conversion rates** and optimize **marketing strategies**. 


### Import Libraries & Load Data
We will load the dataset (../data/feature_engineered.csv) and check its structure.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


df = pd.read_csv("../data/raw_data.csv")


df.head()


Unnamed: 0,CustomerID,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,PagesPerVisit,TimeOnSite,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,AdvertisingPlatform,AdvertisingTool,Conversion
0,8000,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,2.399017,7.396803,19,6,9,4,688,IsConfid,ToolConfid,1
1,8001,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,2.917138,5.352549,5,2,7,2,3459,IsConfid,ToolConfid,1
2,8002,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,8.223619,13.794901,0,11,2,8,2337,IsConfid,ToolConfid,1
3,8003,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,4.540939,14.688363,89,2,2,0,2463,IsConfid,ToolConfid,1
4,8004,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,2.046847,13.99337,6,6,6,8,4345,IsConfid,ToolConfid,1


In [None]:

X = df.drop(columns=['Conversion'])  
y = df['Conversion']  


print(y.value_counts(normalize=True))
X = pd.get_dummies(X)

Conversion
1    0.8765
0    0.1235
Name: proportion, dtype: float64


### Train-Test Split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")


Training samples: 6400, Test samples: 1600


### Handle Class Imbalance

In [None]:
# Check class imbalance
class_counts = np.bincount(y_train)
min_samples = np.min(class_counts)
max_samples = np.max(class_counts)
imbalance_ratio = min_samples / max_samples
    
if imbalance_ratio < 0.5:  
        print(f"Class imbalance detected. Imbalance ratio: {imbalance_ratio:.2f}")
        if imbalance_ratio < 0.1:
            print("Severe imbalance. Applying SMOTE...")
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        else:
            print("Moderate imbalance. Consider adjusting class weights in models.")
    

Class imbalance detected. Imbalance ratio: 0.14
Moderate imbalance. Consider adjusting class weights in models.


### Train Baseline Model (Logistic Regression)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Evaluation
print("\n**Logistic Regression Performance:**")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



**Logistic Regression Performance:**
Accuracy: 0.876875
F1 Score: 0.9341357405549984
Confusion Matrix:
 [[   6  192]
 [   5 1397]]
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.03      0.06       198
           1       0.88      1.00      0.93      1402

    accuracy                           0.88      1600
   macro avg       0.71      0.51      0.50      1600
weighted avg       0.84      0.88      0.83      1600



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Train Advanced Models

In [None]:
models = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoost': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

model_performance = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    
    model.fit(X_train, y_train)
    
    
    y_pred = model.predict(X_test)
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    model_performance[model_name] = (accuracy, f1)
    
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Sort top 3 models
top_3_models = sorted(model_performance.items(), key=lambda x: x[1][1], reverse=True)[:3]
print("\nTop 3 Models based on F1 Score:", top_3_models)



Training RandomForest...
Accuracy: 0.8925, F1 Score: 0.9417

Training GradientBoost...
Accuracy: 0.9025, F1 Score: 0.9459

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9306, F1 Score: 0.9610

Training LightGBM...
[LightGBM] [Info] Number of positive: 5610, number of negative: 790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2305
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.876563 -> initscore=1.960273
[LightGBM] [Info] Start training from score 1.960273
Accuracy: 0.9275, F1 Score: 0.9594

Training CatBoost...
Accuracy: 0.9394, F1 Score: 0.9658

Top 3 Models based on F1 Score: [('CatBoost', (0.939375, 0.9657606777267914)), ('XGBoost', (0.930625, 0.9610115911485775)), ('LightGBM', (0.9275, 0.9593552908199019))]


### Voting Classifier with Top Models
Combine the top 3 models into a Voting Classifier.

In [None]:
top_3_model_names = [model[0] for model in top_3_models]
top_3_classifiers = [models[model_name] for model_name in top_3_model_names]

voting_clf = VotingClassifier(estimators=list(zip(top_3_model_names, top_3_classifiers)), voting='hard')


voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

# Evaluation
print("\n**Voting Classifier Performance:**")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 5610, number of negative: 790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2305
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.876563 -> initscore=1.960273
[LightGBM] [Info] Start training from score 1.960273

**Voting Classifier Performance:**
Accuracy: 0.93375
F1 Score: 0.9627808988764045
Confusion Matrix:
 [[ 123   75]
 [  31 1371]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.62      0.70       198
           1       0.95      0.98      0.96      1402

    accuracy                           0.93      1600
   macro avg       0.87      0.80      0.83      1600
weighted avg       0.93      0.93      0.93      1600



### Stacking Classifier
Combine top 3 models into a Stacking Classifier.

In [None]:
stacking_clf = StackingClassifier(estimators=list(zip(top_3_model_names, top_3_classifiers)))


stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 5610, number of negative: 790
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2305
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.876563 -> initscore=1.960273
[LightGBM] [Info] Start training from score 1.960273


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 4488, number of negative: 632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2305
[LightGBM] [Info] Number of data points in the train set: 5120, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.876563 -> initscore=1.960273
[LightGBM] [Info] Start training from score 1.960273
[LightGBM] [Info] Number of positive: 4488, number of negative: 632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000978 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2305
[LightGBM] [Info] Number of data points in the train set: 5120, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.876563 -> initscore=1.960273
[LightGBM] [Info] Start training from score 1.960273
[LightGBM] [Info] Number

In [57]:
# Evaluation
print("\n**Stacking Classifier Performance:**")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


**Stacking Classifier Performance:**
Accuracy: 0.94125
F1 Score: 0.9667844522968198
Confusion Matrix:
 [[ 138   60]
 [  34 1368]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.70      0.75       198
           1       0.96      0.98      0.97      1402

    accuracy                           0.94      1600
   macro avg       0.88      0.84      0.86      1600
weighted avg       0.94      0.94      0.94      1600



In [None]:
import joblib
import os

os.makedirs("../models", exist_ok=True)


for model_name, model in models.items():
    model_path = f"../models/{model_name}.pkl"
    joblib.dump(model, model_path)
    print(f"✅ Saved {model_name} to {model_path}")


voting_clf_path = "../models/VotingClassifier.pkl"
joblib.dump(voting_clf, voting_clf_path)
print(f"✅ Saved Voting Classifier to {voting_clf_path}")


stacking_clf_path = "../models/StackingClassifier.pkl"
joblib.dump(stacking_clf, stacking_clf_path)
print(f"✅ Saved Stacking Classifier to {stacking_clf_path}")


✅ Saved RandomForest to ../models/RandomForest.pkl
✅ Saved GradientBoost to ../models/GradientBoost.pkl
✅ Saved XGBoost to ../models/XGBoost.pkl
✅ Saved LightGBM to ../models/LightGBM.pkl
✅ Saved CatBoost to ../models/CatBoost.pkl
✅ Saved Voting Classifier to ../models/VotingClassifier.pkl
✅ Saved Stacking Classifier to ../models/StackingClassifier.pkl


In [None]:
import joblib

joblib.dump(model_performance, "../models/model_performance.pkl")

print("\n✅ Model performance saved as model_performance.pkl!")



✅ Model performance saved as model_performance.pkl!
