In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Read the pre-processed arrays from disk: the training pair comes from the
# "*_smote" files, the test pair from the plain test files.
X_train, y_train = (
    np.load('data/processed/X_train_smote.npy'),
    np.load('data/processed/y_train_smote.npy'),
)
X_test, y_test = (
    np.load('data/processed/X_test.npy'),
    np.load('data/processed/y_test.npy'),
)

# The pickled preprocessor is loaded alongside the data; it is kept so that
# feature names can be looked up later on.
preprocessor = joblib.load('data/processed/preprocessor.pkl')

print("✅ Data loaded for modeling.")

In [None]:
# Baseline classifier: logistic regression fitted on the training arrays.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Score the test split: column 1 of predict_proba is the probability of the
# second class in lr.classes_; predict() gives the hard labels.
y_proba_lr = lr.predict_proba(X_test)[:, 1]
y_pred_lr = lr.predict(X_test)

# F1 on the hard labels, followed by the full per-class report.
f1_lr = f1_score(y_true=y_test, y_pred=y_pred_lr)
print(
    "=== Logistic Regression (Baseline) ===",
    f"F1-Score: {f1_lr:.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)

# Area under the precision-recall curve (trapezoidal rule over recall).
precision, recall, _ = precision_recall_curve(y_test, y_proba_lr)
auc_pr_lr = auc(x=recall, y=precision)
print(f"AUC-PR: {auc_pr_lr:.4f}")

In [None]:
# Gradient-boosted tree ensemble. Hyper-parameters are collected in a dict so
# they are easy to read and tweak in one place.
xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Score the test split: column 1 of predict_proba is the probability of the
# second class; predict() gives the hard labels.
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb.predict(X_test)

# F1 on the hard labels, followed by the full per-class report.
f1_xgb = f1_score(y_true=y_test, y_pred=y_pred_xgb)
print(
    "=== XGBoost (Ensemble) ===",
    f"F1-Score: {f1_xgb:.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred_xgb),
    sep="\n",
)

# Area under the precision-recall curve (trapezoidal rule over recall).
precision, recall, _ = precision_recall_curve(y_test, y_proba_xgb)
auc_pr_xgb = auc(x=recall, y=precision)
print(f"AUC-PR: {auc_pr_xgb:.4f}")

In [None]:
# Shared 5-fold stratified splitter. Shuffling is seeded, so the fold
# assignment is the same on every run.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def evaluate_model_cv(model, X, y, cv=None):
    """Cross-validate ``model`` and summarise its F1 and AUC-PR across folds.

    A fresh, unfitted copy of ``model`` (``sklearn.base.clone``) is trained on
    each fold. The estimator passed in is therefore never refitted: calling
    ``model.fit`` directly would leave the caller's object trained on the last
    fold's training subset only.

    Parameters
    ----------
    model : estimator
        Must be clonable and provide ``fit``, ``predict`` and
        ``predict_proba``.
    X, y : array-like
        Features and labels. Both are indexed with the integer index arrays
        produced by the splitter, so they must support that kind of indexing
        (e.g. NumPy arrays).
    cv : splitter, optional
        Any object with a ``split(X, y)`` method yielding
        ``(train_idx, val_idx)`` pairs. Defaults to the notebook-level ``skf``.

    Returns
    -------
    tuple
        ``(f1_mean, f1_std, auc_pr_mean, auc_pr_std)`` computed over the folds.
    """
    splitter = skf if cv is None else cv

    f1_scores = []
    auc_pr_scores = []
    for train_idx, val_idx in splitter.split(X, y):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # Clone so each fold starts from the same hyper-parameters with no
        # fitted state, and so the caller's estimator is left untouched.
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)

        y_pred = fold_model.predict(X_val)
        # Column 1 is the predicted probability of the second class.
        y_proba = fold_model.predict_proba(X_val)[:, 1]

        f1_scores.append(f1_score(y_val, y_pred))
        p, r, _ = precision_recall_curve(y_val, y_proba)
        auc_pr_scores.append(auc(r, p))

    return np.mean(f1_scores), np.std(f1_scores), np.mean(auc_pr_scores), np.std(auc_pr_scores)

# Run the CV helper once per candidate and unpack mean/std for each metric.
# NOTE(review): X_train / y_train are loaded from "*_smote" files, so they are
# presumably already oversampled. If so, the validation folds contain synthetic
# samples and these scores are likely optimistic. Confirm before relying on them.
(f1_lr_mean, f1_lr_std,
 auc_pr_lr_mean, auc_pr_lr_std) = evaluate_model_cv(model=lr, X=X_train, y=y_train)
(f1_xgb_mean, f1_xgb_std,
 auc_pr_xgb_mean, auc_pr_xgb_std) = evaluate_model_cv(model=xgb, X=X_train, y=y_train)

# One summary line per model, printed under a common header.
print(
    "=== Cross-Validation Results (Mean ± Std) ===",
    f"Logistic Regression → F1: {f1_lr_mean:.4f} ± {f1_lr_std:.4f} | AUC-PR: {auc_pr_lr_mean:.4f} ± {auc_pr_lr_std:.4f}",
    f"XGBoost             → F1: {f1_xgb_mean:.4f} ± {f1_xgb_std:.4f} | AUC-PR: {auc_pr_xgb_mean:.4f} ± {auc_pr_xgb_std:.4f}",
    sep="\n",
)

In [None]:
print("=== MODEL SELECTION ===")
# NOTE(review): the winner is chosen by AUC-PR measured on the test split
# (auc_pr_xgb / auc_pr_lr), so the test metrics reported for the selected model
# are no longer a fully independent estimate. Consider selecting on
# validation/CV scores instead.
if auc_pr_xgb > auc_pr_lr:
    print("✅ SELECTED MODEL: XGBoost")
    print("- Higher AUC-PR → better at ranking fraud cases")
    print("- F1-Score balanced → controls false positives/negatives")
    best_model = xgb
else:
    # Ties also fall through to the simpler model.
    print("✅ SELECTED MODEL: Logistic Regression")
    print("- Interpretable (meets business trust needs)")
    best_model = lr

# Refit on the full training arrays before persisting. `lr` and `xgb` are
# shared notebook objects, and any earlier cell that called .fit() on them with
# a subset (e.g. a cross-validation fold) would otherwise leave the saved model
# trained on that subset rather than on all of X_train.
best_model.fit(X_train, y_train)

# Save best model, creating the target directory first so the dump does not
# fail on a fresh checkout where it does not exist yet.
model_path = 'src/models/best_fraud_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"\n✅ Best model saved to {model_path}")