# Model Building: Fraud Detection Classifiers
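Train a baseline model (logistic regression, LR) and an ensemble model (XGBoost) on the engineered e-commerce data. Handle class imbalance with SMOTE and evaluate with AUC-PR and F1. Use stratified splits and 5-fold cross-validation. Select the best model based on performance and business requirements (few false positives).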

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier  # Alt to XGB
from sklearn.linear_model import LogisticRegression
# sklearn.metrics has no `precision_recall_auc_score`; `average_precision_score`
# is its summary of the precision-recall curve, kept under the `auc_pr` alias.
from sklearn.metrics import average_precision_score as auc_pr, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from xgboost import XGBClassifier
%matplotlib inline
sns.set_palette("husl")

# Read the engineered e-commerce frame, then separate the 'class' label
# column (presumably a 0/1 fraud flag - confirm) from the feature columns.
df = pd.read_parquet('../data/processed/fraud_engineered.parquet')
y = df['class']
X = df.drop(columns='class')
print(f"Loaded: {X.shape}, fraud rate: {y.mean():.2%}")


# Load the engineered credit-card frame. Its label column is 'Class'
# (capitalised), unlike the lowercase 'class' column of the e-commerce frame.
df_cc = pd.read_parquet('../data/processed/creditcard_engineered.parquet')
# These two statements previously carried a stray leading space, which made
# the cell fail with "IndentationError: unexpected indent".
X_cc = df_cc.drop('Class', axis=1)
y_cc = df_cc['Class']

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions roughly equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train: {X_train.shape}, fraud {y_train.mean():.2%}")
print(f"Test: {X_test.shape}, fraud {y_test.mean():.2%}")

# Oversample with SMOTE on the training part only, so the held-out test
# part contains no synthetic rows.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print(f"Resampled train: {X_train_res.shape}, fraud {y_train_res.mean():.2%}")

## Baseline Model - Logistic Regression

In [None]:
# Baseline: logistic regression (interpretable, fast), fitted on the
# SMOTE-resampled training data. `fit` returns the estimator itself.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_res, y_train_res)

# Score the original (non-resampled) test split: second-column probabilities
# (presumably the fraud class - confirm via lr.classes_) and hard labels.
y_prob_lr = lr.predict_proba(X_test)[:, 1]
y_pred_lr = lr.predict(X_test)

# Metrics: the PR summary uses the probabilities, F1 and the confusion
# matrix use the hard labels.
cm_lr = confusion_matrix(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_pr_lr = auc_pr(y_test, y_prob_lr)

print(f"LR AUC-PR: {auc_pr_lr:.3f}")
print(f"LR F1 (fraud): {f1_lr:.3f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

# Draw the logistic-regression confusion matrix as an annotated heatmap
# (integer cell counts, no colour bar) on an explicitly created axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix: Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()