# Model Building & Training for Fraud Detection

In [1]:
# Imports
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    f1_score
)
from sklearn.model_selection import train_test_split

## 1. Load Processed Data

In [2]:
# Load the processed train/test splits written by the earlier pipeline stage.
train = pd.read_csv("../data/processed/train_ready.csv")
test = pd.read_csv("../data/processed/test_ready.csv")

# The target is taken positionally (last column) below, so both files must
# share the same column layout; otherwise a feature would silently be used as
# the label (or the label would leak into the features).
if list(train.columns) != list(test.columns):
    raise ValueError(
        "train_ready.csv and test_ready.csv do not have the same columns in "
        "the same order; the positional feature/target split would be misaligned."
    )

# Separate X and y: every column except the last is a feature, the last is the target.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (191744, 18), Test shape: (45334, 18)


## 2. Define Models

In [3]:
# Linear baseline: logistic regression with an iteration cap of 1000 and a
# fixed seed so repeated runs give the same fit.
lr = LogisticRegression(
    random_state=42,
    max_iter=1000,
)

# Tree ensemble: random forest of 100 trees, seeded with the same value.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

## 3. Train Models

In [4]:
# Fit both estimators on the same training split; each fit is announced first
# so progress is visible while the cell runs.
print("\n🔹 Training Logistic Regression...")
lr.fit(X=X_train, y=y_train)

print("🔹 Training Random Forest...")
rf.fit(X=X_train, y=y_train)


🔹 Training Logistic Regression...
🔹 Training Random Forest...


## 4. Evaluate Models

In [5]:
def evaluate_model(name, model, X, y):
    """Print the evaluation summary for a fitted binary classifier.

    Parameters
    ----------
    name : str
        Human-readable model name shown in the results banner.
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X : array-like
        Features to score.
    y : array-like
        True labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(y_pred, y_prob)``: the hard predictions and the second column of
        ``predict_proba`` (the score for class 1 with 0/1 labels).
    """
    y_pred = model.predict(X)
    # Threshold-free metrics (ROC AUC, AUC-PR) need scores, not hard labels.
    y_prob = model.predict_proba(X)[:, 1]

    print(f"\n✅ {name} Results")
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))
    print("\nClassification Report:")
    print(classification_report(y, y_pred))
    print("ROC AUC:", roc_auc_score(y, y_prob))
    print("AUC-PR:", average_precision_score(y, y_prob))
    print("F1 Score:", f1_score(y, y_pred))

    return y_pred, y_prob


# Evaluate both models with the same routine so their reports are directly
# comparable; the per-model prediction variables are kept for later use.
y_pred_lr, y_prob_lr = evaluate_model("Logistic Regression", lr, X_test, y_test)
y_pred_rf, y_prob_rf = evaluate_model("Random Forest", rf, X_test, y_test)


✅ Logistic Regression Results
Confusion Matrix:
[[40983   106]
 [ 1957  2288]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     41089
           1       0.96      0.54      0.69      4245

    accuracy                           0.95     45334
   macro avg       0.96      0.77      0.83     45334
weighted avg       0.95      0.95      0.95     45334

ROC AUC: 0.7723829404073626
AUC-PR: 0.6324285292739573
F1 Score: 0.6892604307877692

✅ Random Forest Results
Confusion Matrix:
[[39893  1196]
 [ 1904  2341]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     41089
           1       0.66      0.55      0.60      4245

    accuracy                           0.93     45334
   macro avg       0.81      0.76      0.78     45334
weighted avg       0.93      0.93      0.93     45334

ROC AUC: 0.7699075072207444
AUC-PR: 0.6278875374702841
F1 Score: 0.6016448

## 5. Save Best Model

In [6]:
# Logistic Regression is persisted as the best model: on the test split it
# scores slightly higher than Random Forest on ROC AUC, AUC-PR and F1.
model_path = Path("../data/models/logistic_regression_model.pkl")

# joblib.dump does not create missing directories, so make sure the target
# folder exists (no-op when it is already there).
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(lr, model_path)
print("✅ Logistic Regression model saved")

✅ Logistic Regression model saved
