In [3]:
import numpy as np
import joblib
import sys
sys.path.append("..") 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load the persisted preprocessor object. It is not used by any cell visible in
# this notebook section. NOTE: joblib.load unpickles the file, so only load
# artifacts produced by this project.
preprocessor = joblib.load('../models/preprocessor.pkl')

# np.load on an .npz archive returns an NpzFile that holds an open file handle.
# Reading the arrays inside `with` blocks closes each archive as soon as the
# array has been materialised in memory.
with np.load('../data/processed/X_balanced.npz') as features_npz:
    X_balanced = features_npz['arr_0']
with np.load('../data/processed/y_balanced.npz') as labels_npz:
    y_balanced = labels_npz['arr_0']


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for evaluation. The split is stratified on
# the labels and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    stratify=y_balanced,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Train a logistic regression model (fixed seed, up to 1000 solver iterations).
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X=X_train, y=y_train)

In [6]:
# Train a random forest model: 100 trees, each capped at depth 10, fixed seed.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=10,
)
rf_model.fit(X=X_train, y=y_train)

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(model, X_test, y_test):
    """Print a confusion matrix, classification report and ROC AUC for ``model``.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test
        Feature data passed to both prediction calls.
    y_test
        True labels the predictions are compared against.

    Returns
    -------
    None
        The metrics are only printed.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    class1_scores = model.predict_proba(X_test)[:, 1]

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, predicted_labels)
    print(matrix)

    print("Classification Report:")
    report = classification_report(y_test, predicted_labels)
    print(report)

    auc = roc_auc_score(y_test, class1_scores)
    print("ROC AUC Score:", auc)

# Report held-out metrics for each fitted model. The second heading keeps its
# leading newline so the two reports stay separated in the output.
for heading, fitted_model in (
    ("Logistic Regression:", lr_model),
    ("\nRandom Forest:", rf_model),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)


Logistic Regression:
Confusion Matrix:
[[26032  1361]
 [ 9775 17617]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.95      0.82     27393
           1       0.93      0.64      0.76     27392

    accuracy                           0.80     54785
   macro avg       0.83      0.80      0.79     54785
weighted avg       0.83      0.80      0.79     54785

ROC AUC Score: 0.8731452512149225

Random Forest:
Confusion Matrix:
[[25727  1666]
 [ 5677 21715]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.94      0.88     27393
           1       0.93      0.79      0.86     27392

    accuracy                           0.87     54785
   macro avg       0.87      0.87      0.87     54785
weighted avg       0.87      0.87      0.87     54785

ROC AUC Score: 0.9419867491643783


In [8]:
import joblib
import os

# Ensure the output directory exists; this is a no-op when it already does.
os.makedirs("../models", exist_ok=True)

# Persist both fitted estimators with joblib, one file per model.
for estimator, destination in (
    (lr_model, "../models/logistic_regression_model.pkl"),
    (rf_model, "../models/random_forest_model.pkl"),
):
    joblib.dump(estimator, destination)

print("Models saved successfully in ../models/")


Models saved successfully in ../models/


| Metric                | Logistic Regression | Random Forest |
| --------------------- | ------------------- | ------------- |
| **Accuracy**          | 80%                 | **87%**       |
| **F1-Score (Fraud)**  | 0.76                | **0.86**      |
| **Recall (Fraud)**    | 0.64                | **0.79**      |
| **Precision (Fraud)** | 0.93                | **0.93**      |
| **ROC AUC**           | 0.87                | **0.94**      |


Key Findings:
Random Forest matches or outperforms Logistic Regression on every fraud-class metric in the table above (precision is tied at 0.93).

Recall improved from 64% to 79%, meaning Random Forest detects significantly more fraudulent transactions.

ROC AUC of 0.94 shows strong discriminative ability between fraud and legitimate transactions.

Precision is equally high for both models (0.93), meaning that relatively few of either model's fraud alerts are false alarms.

Business Insights:
High Recall is critical in fraud detection — missing a fraudulent transaction may lead to financial loss or regulatory issues.

Random Forest strikes a balance between catching fraud and minimizing false positives (which could annoy genuine customers).

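The improved detection rate (79% recall) with acceptable precision makes Random Forest a strong candidate for operational deployment. Note that these figures were measured on a class-balanced test set (about 27,400 samples per class), so precision on real, heavily imbalanced traffic is likely to be lower and should be re-checked before deployment.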

Logistic Regression remains useful as a baseline or fallback model due to its simplicity and interpretability.

In [9]:

import pandas as pd
import sys
# Add the parent directory to the import path; the later
# `from src.transformers import ...` presumably relies on this.
sys.path.append("../")

# Read the cleaned credit-card CSV, print its (rows, columns) shape, and show
# the first five rows as the cell output.
credit_df = pd.read_csv(
    filepath_or_buffer="../data/processed/csv/creditcard_data_cleaned.csv"
)
print(credit_df.shape)
credit_df.head(n=5)


(283726, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [10]:
# Target is the "Class" column; every remaining column is used as a feature.
y_cc = credit_df["Class"]
X_cc = credit_df.drop(columns="Class")


In [11]:
# scale features
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. The scaler is fitted and then applied to
# the same data in two explicit steps.
# NOTE(review): the scaler is fitted on the full dataset, before any hold-out
# split, so test-row statistics influence the scaling.
scaler = StandardScaler()
scaler.fit(X_cc)
X_cc_scaled = scaler.transform(X_cc)


In [12]:
# Balance the classes with SMOTE.
# apply_balancing is a project helper from src.transformers; here it receives the
# scaled features, the labels and strategy="smote", and returns a
# (features, labels) pair.
# NOTE(review): this resamples the full dataset before any hold-out split, so
# synthetic minority samples can be derived from rows that later land in a test
# set. Resampling only the training partition is generally preferred.
from src.transformers import apply_balancing

X_cc_bal, y_cc_bal = apply_balancing(X_cc_scaled, y_cc, strategy="smote")


INFO:root:Original class distribution: {0: 283253, 1: 473}
INFO:root:Balanced class distribution: {0: 283253, 1: 283253}


In [13]:
# Train-test split, followed by class balancing of the training partition only.
from sklearn.model_selection import train_test_split

# Split the scaled but still-imbalanced data first, so the 30% test partition
# contains only real transactions at their natural class ratio. Splitting data
# that was already SMOTE-balanced (X_cc_bal / y_cc_bal) would put synthetic
# samples in the test set and let points interpolated from test rows'
# neighbours reach training, which inflates the reported metrics.
Xcc_train_imb, Xcc_test, ycc_train_imb, ycc_test = train_test_split(
    X_cc_scaled, y_cc, test_size=0.3, random_state=42, stratify=y_cc
)

# Oversample the minority class using the training rows only.
Xcc_train, ycc_train = apply_balancing(Xcc_train_imb, ycc_train_imb, strategy="smote")


In [None]:
# train models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Random forest on the credit-card training data: 100 trees, depth capped at 10.
ccrf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=10,
)
ccrf_model.fit(X=Xcc_train, y=ycc_train)

# Logistic regression on the same training data. It is fitted last, so its
# repr is what the cell displays.
cclr_model = LogisticRegression(random_state=42, max_iter=1000)
cclr_model.fit(X=Xcc_train, y=ycc_train)


In [None]:
# evaluate the credit-card models (logistic regression and random forest)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(model, Xcc_test, ycc_test):
    """Print the confusion matrix, classification report and ROC AUC of ``model``.

    This re-declares (and therefore shadows) the ``evaluate_model`` defined
    earlier in this notebook; the printed output format is the same.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    Xcc_test
        Feature data passed to both prediction calls.
    ycc_test
        True labels the predictions are compared against.

    Returns
    -------
    None
        The metrics are only printed.
    """
    hard_predictions = model.predict(Xcc_test)
    probabilities = model.predict_proba(Xcc_test)
    # Only the second probability column is scored for ROC AUC.
    class1_scores = probabilities[:, 1]

    print("Confusion Matrix:")
    print(confusion_matrix(ycc_test, hard_predictions))

    print("Classification Report:")
    print(classification_report(ycc_test, hard_predictions))

    auc_value = roc_auc_score(ycc_test, class1_scores)
    print("ROC AUC Score:", auc_value)

# Print held-out metrics for both credit-card models. The second heading keeps
# its leading newline so the two reports stay separated in the output.
for heading, fitted_model in (
    ("Logistic Regression-credit card:", cclr_model),
    ("\nRandom Forest-credit card:", ccrf_model),
):
    print(heading)
    evaluate_model(fitted_model, Xcc_test, ycc_test)
