In [None]:
# 03 — Advanced ML Models

Train more advanced models for improved performance:
- L1-regularized Logistic Regression
- Tuned Random Forest
- Gradient Boosting

Also includes early hyperparameter tuning.


In [7]:
# =========================
# Advanced Models
# =========================

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
sns.set(style="whitegrid")

In [8]:
# === Read the preprocessed train/test splits from data/processed ===
# Feature matrices are stored as scipy sparse .npz files.
X_train, X_test = (
    sparse.load_npz(npz_path)
    for npz_path in (
        "data/processed/X_train.npz",
        "data/processed/X_test.npz",
    )
)

# Label vectors are stored as NumPy .npy files.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the file is not trusted. If the labels were saved
# with a plain numeric dtype this flag can be dropped — confirm against the
# notebook that writes these files.
y_train, y_test = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in (
        "data/processed/y_train.npy",
        "data/processed/y_test.npy",
    )
)

In [10]:
# Materialise dense ndarray copies of the sparse feature matrices; the model
# cells below fit and predict on these dense copies.
# NOTE(review): the scikit-learn estimators used below document sparse-matrix
# support in fit()/predict(), so this conversion may be unnecessary and it
# multiplies memory use — confirm before relying on it for larger data.
X_train_dense, X_test_dense = X_train.toarray(), X_test.toarray()


In [11]:
# -------------------------------------------------
#  Logistic Regression with an L1 penalty
# -------------------------------------------------
# The liblinear solver is paired with penalty="l1"; the seed is fixed so the
# fit is repeatable.

lr_l1 = LogisticRegression(
    random_state=42,
    max_iter=500,
    solver="liblinear",
    penalty="l1",
)
lr_l1.fit(X_train_dense, y_train)

# Hard label predictions, plus the predicted probability taken from column 1
# of predict_proba (used as the score for ROC-AUC).
pred_lr_l1 = lr_l1.predict(X_test_dense)
prob_lr_l1 = lr_l1.predict_proba(X_test_dense)[:, 1]

# Evaluate on the held-out test split.
acc_lr_l1 = accuracy_score(y_test, pred_lr_l1)
auc_lr_l1 = roc_auc_score(y_test, prob_lr_l1)
report_lr_l1 = classification_report(y_test, pred_lr_l1)

print("L1 Logistic Regression Results:")
print("Accuracy:", acc_lr_l1)
print("ROC-AUC:", auc_lr_l1)
print(report_lr_l1)





L1 Logistic Regression Results:
Accuracy: 0.8882283580622974
ROC-AUC: 0.6524423938513572
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.48      0.02      0.03      2271

    accuracy                           0.89     20354
   macro avg       0.68      0.51      0.49     20354
weighted avg       0.84      0.89      0.84     20354



In [12]:
# =========================
#  Tuned Random Forest
# =========================
# 300 trees, each limited to depth 12 with minimum split/leaf sizes of 20/10.
# n_jobs=-1 trains on all available cores; random_state fixes the seed.
#
# NOTE(review): in the recorded run this model predicted no class-1 samples at
# all (precision/recall 0.00 for class 1), so accuracy simply mirrors the
# majority-class rate. Consider evaluating class_weight="balanced" or a lower
# decision threshold on prob_rf_adv before treating this as a usable model.

rf_adv = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=20,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42
)

rf_adv.fit(X_train_dense, y_train)

# Hard label predictions, plus the predicted probability from column 1 of
# predict_proba (used as the score for ROC-AUC).
pred_rf_adv = rf_adv.predict(X_test_dense)
prob_rf_adv = rf_adv.predict_proba(X_test_dense)[:, 1]

print("Tuned Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, pred_rf_adv))
print("ROC-AUC:", roc_auc_score(y_test, prob_rf_adv))
# zero_division=0 reports undefined precision/F1 (no predicted positives) as 0
# explicitly instead of emitting repeated undefined-metric warnings; the
# numbers in the report are unchanged.
print(classification_report(y_test, pred_rf_adv, zero_division=0))


Tuned Random Forest Results:
Accuracy: 0.8884248796305394
ROC-AUC: 0.6626251723028797
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.00      0.00      0.00      2271

    accuracy                           0.89     20354
   macro avg       0.44      0.50      0.47     20354
weighted avg       0.79      0.89      0.84     20354



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# -------------------------------------------------
# Phase 4.3: Gradient Boosting
# -------------------------------------------------
# 80 depth-3 trees with a 0.05 learning rate; subsample=0.8 fits each stage on
# 80% of the training rows. The seed is fixed so the fit is repeatable.

gb_adv = GradientBoostingClassifier(
    random_state=42,
    subsample=0.8,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=80,
)
gb_adv.fit(X_train_dense, y_train)

# Hard label predictions, plus the predicted probability from column 1 of
# predict_proba (used as the score for ROC-AUC).
pred_gb_adv = gb_adv.predict(X_test_dense)
prob_gb_adv = gb_adv.predict_proba(X_test_dense)[:, 1]

# Evaluate on the held-out test split.
acc_gb_adv = accuracy_score(y_test, pred_gb_adv)
auc_gb_adv = roc_auc_score(y_test, prob_gb_adv)
report_gb_adv = classification_report(y_test, pred_gb_adv)

print("Gradient Boosting Results:")
print("Accuracy:", acc_gb_adv)
print("ROC-AUC:", auc_gb_adv)
print(report_gb_adv)


In [None]:
# -------------------------------------------------
# Phase 4.4: Model Comparison
# -------------------------------------------------
# One row per model: test-set accuracy (from hard predictions) and ROC-AUC
# (from the column-1 probabilities), in the order the models were trained.

phase4_runs = [
    ("L1 Logistic Regression", pred_lr_l1, prob_lr_l1),
    ("Tuned Random Forest", pred_rf_adv, prob_rf_adv),
    ("Gradient Boosting", pred_gb_adv, prob_gb_adv),
]

results_phase4 = pd.DataFrame(
    [
        {
            "Model": model_label,
            "Accuracy": accuracy_score(y_test, hard_preds),
            "ROC-AUC": roc_auc_score(y_test, pos_scores),
        }
        for model_label, hard_preds, pos_scores in phase4_runs
    ]
)

results_phase4
