# Model Building & Training for Fraud Detection

In [1]:
# Imports
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    f1_score
)

## 1. Load Processed Data

In [2]:
def _features_and_target(frame):
    """Split a frame by position: all columns except the last, and the last column."""
    return frame.iloc[:, :-1], frame.iloc[:, -1]


# Read the already-prepared train/test CSVs (paths are relative to the
# notebook's working directory).
train = pd.read_csv("../data/processed/creditcard_train_ready.csv")
test = pd.read_csv("../data/processed/creditcard_test_ready.csv")

# The split is purely positional; presumably the label is stored as the final
# column of both files -- verify against the preprocessing step.
X_train, y_train = _features_and_target(train)
X_test, y_test = _features_and_target(test)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (398040, 30), Test shape: (85443, 30)


## 2. Define Models

In [3]:
# Two candidate classifiers; both are pinned to random_state=42 so that
# repeated runs use the same seed.

# Logistic regression, allowing the solver up to 1000 iterations.
lr = LogisticRegression(
    random_state=42,
    max_iter=1000,
)

# Random forest made of 100 trees.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

## 3. Train Models

In [None]:
# Fit both candidate models on the same training split (X_train, y_train).
print("\n🔹 Training Logistic Regression...")
lr.fit(X_train, y_train)

print("🔹 Training Random Forest...")
# Final expression of the cell: besides fitting in place, the notebook will
# echo the returned estimator as the cell's output.
rf.fit(X_train, y_train)


🔹 Training Logistic Regression...
🔹 Training Random Forest...


## 4. Evaluate Models

In [None]:
def evaluate_model(label, model, X, y):
    """Print the evaluation report for one fitted classifier.

    The report consists of the confusion matrix, the classification report,
    ROC AUC, average precision (printed as "AUC-PR") and the F1 score.

    Parameters
    ----------
    label : str
        Human-readable model name used in the report header.
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X : array-like
        Features to score.
    y : array-like
        True labels matching ``X``.

    Returns
    -------
    tuple
        ``(y_pred, y_prob)``: the hard predictions and column 1 of
        ``predict_proba``.
    """
    y_pred = model.predict(X)
    # Column 1 of predict_proba; the probability-based metrics below
    # (ROC AUC, average precision) are computed from this score.
    y_prob = model.predict_proba(X)[:, 1]

    print(f"\n✅ {label} Results")
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))
    print("\nClassification Report:")
    print(classification_report(y, y_pred))
    # Ranking metrics use the scores; F1 uses the thresholded predictions.
    print("ROC AUC:", roc_auc_score(y, y_prob))
    print("AUC-PR:", average_precision_score(y, y_prob))
    print("F1 Score:", f1_score(y, y_pred))

    return y_pred, y_prob


# Same report for both models; the per-model prediction variables are kept
# so that later cells can still reference them.
y_pred_lr, y_prob_lr = evaluate_model("Logistic Regression", lr, X_test, y_test)
y_pred_rf, y_prob_rf = evaluate_model("Random Forest", rf, X_test, y_test)


✅ Logistic Regression Results
Confusion Matrix:
[[83410  1885]
 [   18   130]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85295
           1       0.06      0.88      0.12       148

    accuracy                           0.98     85443
   macro avg       0.53      0.93      0.55     85443
weighted avg       1.00      0.98      0.99     85443

ROC AUC: 0.967180437369194
AUC-PR: 0.7062135707831341
F1 Score: 0.12020342117429496

✅ Random Forest Results
Confusion Matrix:
[[85281    14]
 [   32   116]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.89      0.78      0.83       148

    accuracy                           1.00     85443
   macro avg       0.95      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443

ROC AUC: 0.9689994027088815
AUC-PR: 0.8281333489231498
F1 Score: 0.8345

## 5. Save Best Model

In [None]:
# Persist the fitted random forest with joblib.
# The path is relative to the notebook's working directory and follows the
# same "../data/..." layout used when loading the processed CSVs (the previous
# ",,/" prefix was a typo that pointed at a directory literally named ",,").
# joblib.dump does not create missing directories, so ../data/models must
# already exist.
joblib.dump(rf, "../data/models/random_forest_model.pkl")
print("✅ Random Forest model saved")

✅ Random Forest model saved
