In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
import joblib

from sklearn.model_selection import StratifiedKFold


In [17]:
X_train = np.load('../notebooks/new_artifacts/X_train_new.npz', allow_pickle=True)['arr_0']
Y_train = np.load('../notebooks/new_artifacts/Y_train_new.npz', allow_pickle=True)['arr_0']
X_test  = np.load('../notebooks/new_artifacts/X_test_new.npz',  allow_pickle=True)['arr_0']
Y_test  = np.load('../notebooks/new_artifacts/Y_test_new.npz',  allow_pickle=True)['arr_0']


### 1. Logistic Regression

In [21]:
print("BASELINE MODELS")
print("=" * 30)

# Logistic Regression
print("Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, Y_train)

# Predictions
lr_pred = lr_model.predict(X_test)
lr_pred_proba = lr_model.predict_proba(X_test)[:, 1]

# Performance
lr_auc = roc_auc_score(Y_test, lr_pred_proba)
print(f"Logistic Regression AUC: {lr_auc:.3f}")

# Store results
baseline_results = {
    'Logistic Regression': {
        'model': lr_model,
        'predictions': lr_pred,
        'probabilities': lr_pred_proba,
        'auc': lr_auc
    }
}

BASELINE MODELS
Training Logistic Regression...
Logistic Regression AUC: 0.836


### 2. Decison Tree Classifier

In [22]:
# Decision Tree
print("Training Decision Tree...")
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)
dt_model.fit(X_train, Y_train)

# Predictions
dt_pred = dt_model.predict(X_test)
dt_pred_proba = dt_model.predict_proba(X_test)[:, 1]

# Performance
dt_auc = roc_auc_score(Y_test, dt_pred_proba)
print(f"Decision Tree AUC: {dt_auc:.3f}")

# Store results
baseline_results['Decision Tree'] = {
    'model': dt_model,
    'predictions': dt_pred,
    'probabilities': dt_pred_proba,
    'auc': dt_auc
}

print("✅ Baseline models trained!")

Training Decision Tree...
Decision Tree AUC: 0.742
✅ Baseline models trained!


### K-Fold Validation

In [20]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def kfold_evaluation(model, X, Y):
    acc, f1s, recalls, precisions, roc_aucs = [], [], [], [], []
    
    for train_idx, val_idx in kfold.split(X, Y):
        X_tr, X_val = X[train_idx], X[val_idx]   # NumPy indexing
        Y_tr, Y_val = Y[train_idx], Y[val_idx]
        
        model.fit(X_tr, Y_tr)
        Y_pred = model.predict(X_val)
        acc.append(accuracy_score(Y_val, Y_pred))
        f1s.append(f1_score(Y_val, Y_pred))
        recalls.append(recall_score(Y_val, Y_pred))
        precisions.append(precision_score(Y_val, Y_pred))
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X_val)[:,1]
            roc_aucs.append(roc_auc_score(Y_val, probs))
    
    print(f"Accuracy : {np.mean(acc):.4f}")
    print(f"F1-Score : {np.mean(f1s):.4f}")
    print(f"Recall   : {np.mean(recalls):.4f}")
    print(f"Precision: {np.mean(precisions):.4f}")
    if roc_aucs:
        print(f"ROC-AUC  : {np.mean(roc_aucs):.4f}")

print("Logistic Regression Validation Metrics: \n")
lr = LogisticRegression(random_state=42)
kfold_evaluation(lr, X_train, Y_train)

print("\nDecision Tree Classifier Validation: \n")
dt = DecisionTreeClassifier(random_state=42)
kfold_evaluation(dt, X_train, Y_train)



Logistic Regression Validation Metrics: 



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy : 0.9297
F1-Score : 0.9377
Recall   : 0.9420
Precision: 0.9336
ROC-AUC  : 0.9806

Decision Tree Classifier Validation: 

Accuracy : 0.9304
F1-Score : 0.9384
Recall   : 0.9423
Precision: 0.9345
ROC-AUC  : 0.9287
