In [None]:
# 02 — Baseline Machine Learning Models
Train baseline classifiers to establish reference performance:
- Logistic Regression
- Random Forest
- Gradient Boosting


In [None]:
# === Imports ===
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse

# Apply seaborn's "whitegrid" style so figures drawn later in the notebook share one look.
sns.set(style="whitegrid")

In [3]:
# === Load the preprocessed train/test split from disk ===
# Feature matrices are stored as scipy sparse .npz files.
X_train, X_test = (
    sparse.load_npz(npz_path)
    for npz_path in ("data/processed/X_train.npz", "data/processed/X_test.npz")
)

# Label vectors are stored as NumPy .npy files.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code if the file is not trusted. It is presumably only needed if
# the labels were saved with object dtype — confirm, and drop it if they are
# plain numeric arrays.
y_train, y_test = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ("data/processed/y_train.npy", "data/processed/y_test.npy")
)


In [4]:
# Dense copies of the feature matrices for the tree-based models below.
# scikit-learn's RandomForest / GradientBoosting estimators also accept sparse
# input; the dense arrays are kept because later cells reference X_*_dense.
#
# Cast to float32 *before* densifying: the tree estimators convert X to float32
# internally, so this should not change the fitted models, but it halves the
# size of the dense arrays if the stored matrices are float64 and avoids a
# second full-size copy inside fit/predict.
X_train_dense = X_train.astype(np.float32).toarray()
X_test_dense = X_test.astype(np.float32).toarray()


In [5]:
# Logistic Regression baseline.
# The sparse matrices are passed straight in; fit() returns the fitted estimator.
lr = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Hard labels for accuracy / the report, and the probability of the second
# entry of lr.classes_ (column 1) for ROC-AUC.
pred_lr = lr.predict(X_test)
prob_lr = lr.predict_proba(X_test)[:, 1]

print("Logistic Regression Results:")
for metric_label, metric_fn, model_output in (
    ("Accuracy:", accuracy_score, pred_lr),
    ("ROC-AUC:", roc_auc_score, prob_lr),
):
    print(metric_label, metric_fn(y_test, model_output))
print(classification_report(y_test, pred_lr))


Logistic Regression Results:
Accuracy: 0.8883266188464184
ROC-AUC: 0.6469640102942319
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.49      0.02      0.04      2271

    accuracy                           0.89     20354
   macro avg       0.69      0.51      0.49     20354
weighted avg       0.85      0.89      0.84     20354



In [6]:
# Sanity check: container type and shape of every array handed to the dense models.
for checked_array in (X_train_dense, X_test_dense, y_train, y_test):
    print(type(checked_array), checked_array.shape)


<class 'numpy.ndarray'> (81412, 2470)
<class 'numpy.ndarray'> (20354, 2470)
<class 'numpy.ndarray'> (81412,)
<class 'numpy.ndarray'> (20354,)


In [7]:
# Random Forest baseline: a deliberately small, shallow forest.

rf = RandomForestClassifier(
    n_estimators=30,        # fewer trees
    max_depth=10,           # limit tree depth
    n_jobs=-1,              # use all CPU cores
    random_state=42         # fixed seed so the run is reproducible
)

rf.fit(X_train_dense, y_train)

# Hard labels for accuracy / the report, and the probability of the second
# entry of rf.classes_ (column 1) for ROC-AUC.
pred_rf = rf.predict(X_test_dense)
prob_rf = rf.predict_proba(X_test_dense)[:, 1]

print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, prob_rf))
# When the forest predicts no samples of a class, that class's precision is
# 0/0. zero_division=0 reports it as 0.0 — the same value the default "warn"
# setting produces — without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, pred_rf, zero_division=0))



Random Forest Results:
Accuracy: 0.8884248796305394
ROC-AUC: 0.6405966659972644
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     18083
           1       0.00      0.00      0.00      2271

    accuracy                           0.89     20354
   macro avg       0.44      0.50      0.47     20354
weighted avg       0.79      0.89      0.84     20354



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Gradient Boosting baseline: 100 boosting stages of depth-3 trees, fixed seed.

gb = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
gb.fit(X_train_dense, y_train)

# Hard labels for accuracy / the report, and the probability of the second
# entry of gb.classes_ (column 1) for ROC-AUC.
pred_gb = gb.predict(X_test_dense)
prob_gb = gb.predict_proba(X_test_dense)[:, 1]

print("Gradient Boosting Results:")
for metric_label, metric_fn, model_output in (
    ("Accuracy:", accuracy_score, pred_gb),
    ("ROC-AUC:", roc_auc_score, prob_gb),
):
    print(metric_label, metric_fn(y_test, model_output))
print(classification_report(y_test, pred_gb))



In [None]:
# Compare Models: one row per baseline with its test accuracy and ROC-AUC.

# (display name, hard predictions, positive-class probabilities) for each model
model_outputs = [
    ("Logistic Regression", pred_lr, prob_lr),
    ("Random Forest", pred_rf, prob_rf),
    ("Gradient Boosting", pred_gb, prob_gb),
]

results = pd.DataFrame({
    "Model": [name for name, _, _ in model_outputs],
    "Accuracy": [accuracy_score(y_test, pred) for _, pred, _ in model_outputs],
    "ROC-AUC": [roc_auc_score(y_test, prob) for _, _, prob in model_outputs],
})

results
