In [8]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Install a catch-all "ignore" filter so no warning is shown for the rest of
# the session, then confirm that the import cell ran to completion.
# NOTE(review): a blanket filter would also hide any solver/convergence
# warnings raised further down — consider narrowing it to specific categories.
warnings.simplefilter("ignore")
print("✅ Imports OK")


✅ Imports OK


In [10]:
# Read both preprocessed datasets (paths are relative to the notebook's
# working directory) and report their dimensions as a quick sanity check.
fraud, credit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/fraud_processed.csv",
        "../data/processed/creditcard_processed.csv",
    )
)

print("Fraud shape:", fraud.shape)
print("Credit shape:", credit.shape)


Fraud shape: (151112, 5)
Credit shape: (284807, 31)


In [11]:
# Separate the label column from the predictors. The two datasets spell the
# label differently: lowercase "class" for fraud, capitalised "Class" for credit.
y_fraud = fraud["class"]
X_fraud = fraud.drop(columns=["class"])

y_credit = credit["Class"]
X_credit = credit.drop(columns=["Class"])

# Show the per-class row counts of each target.
print(
    "Fraud target distribution:\n",
    y_fraud.value_counts(),
)
print(
    "\nCredit target distribution:\n",
    y_credit.value_counts(),
)


Fraud target distribution:
 class
0    136961
1     14151
Name: count, dtype: int64

Credit target distribution:
 Class
0    284315
1       492
Name: count, dtype: int64


In [12]:
# Hold out 20% of each dataset for testing. Splits are stratified on the label
# so train and test keep the same class ratio, and seeded for repeatability.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_credit,
    y_credit,
    stratify=y_credit,
    test_size=0.2,
    random_state=42,
)

# Class counts of the training portions.
print(
    "Fraud train distribution:\n",
    yf_train.value_counts(),
)
print(
    "\nCredit train distribution:\n",
    yc_train.value_counts(),
)


Fraud train distribution:
 class
0    109568
1     11321
Name: count, dtype: int64

Credit train distribution:
 Class
0    227451
1       394
Name: count, dtype: int64


In [13]:
# One seeded SMOTE instance is reused for both datasets; each fit_resample
# call works on the training split it is given. The test splits are never
# resampled.
smote = SMOTE(random_state=42)

Xf_train_res, yf_train_res = smote.fit_resample(X=Xf_train, y=yf_train)
Xc_train_res, yc_train_res = smote.fit_resample(X=Xc_train, y=yc_train)

# Class counts after oversampling.
print(
    "Fraud after SMOTE:\n",
    yf_train_res.value_counts(),
)
print(
    "\nCredit after SMOTE:\n",
    yc_train_res.value_counts(),
)


Fraud after SMOTE:
 class
0    109568
1    109568
Name: count, dtype: int64

Credit after SMOTE:
 Class
0    227451
1    227451
Name: count, dtype: int64


In [14]:
# Baseline models: one logistic regression per dataset, fitted on the
# SMOTE-resampled training data. `fit` returns the estimator itself, so
# construction and fitting are chained.
lr_fraud = LogisticRegression(max_iter=1000).fit(Xf_train_res, yf_train_res)
lr_credit = LogisticRegression(max_iter=1000).fit(Xc_train_res, yc_train_res)

print("✅ Logistic Regression trained")


✅ Logistic Regression trained


In [15]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test, y_test
        Evaluation features and their true labels.

    Returns
    -------
    dict
        ``"AUC_PR"``: average precision computed from column 1 of
        ``predict_proba`` (presumably the positive/fraud class — confirm
        against ``model.classes_``);
        ``"F1"``: F1 score of the labels returned by ``predict``;
        ``"Confusion_Matrix"``: confusion matrix of those same labels.
    """
    hard_labels = model.predict(X_test)
    class1_scores = model.predict_proba(X_test)[:, 1]

    metrics = {}
    metrics["AUC_PR"] = average_precision_score(y_test, class1_scores)
    metrics["F1"] = f1_score(y_test, hard_labels)
    metrics["Confusion_Matrix"] = confusion_matrix(y_test, hard_labels)
    return metrics


In [16]:
# Score both baselines on their untouched (non-resampled) test splits.
print(
    "Fraud LR Metrics:",
    evaluate_model(model=lr_fraud, X_test=Xf_test, y_test=yf_test),
)
print(
    "Credit LR Metrics:",
    evaluate_model(model=lr_credit, X_test=Xc_test, y_test=yc_test),
)


Fraud LR Metrics: {'AUC_PR': 0.5214748991979973, 'F1': 0.27227993439037723, 'Confusion_Matrix': array([[17583,  9810],
       [  838,  1992]])}
Credit LR Metrics: {'AUC_PR': 0.7244601740257455, 'F1': 0.1088929219600726, 'Confusion_Matrix': array([[55399,  1465],
       [    8,    90]])}


In [17]:
def grid_search_rf(X_train, y_train, param_grid):
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)

    grid = GridSearchCV(
        rf,
        param_grid,
        scoring="average_precision",
        cv=5,
        n_jobs=-1
    )

    grid.fit(X_train, y_train)
    return grid.best_estimator_, grid.best_params_


In [18]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, None]
}

rf_fraud, best_params = grid_search_rf(Xf_train_res, yf_train_res, param_grid)
print("Best RF params:", best_params)


Best RF params: {'max_depth': None, 'n_estimators': 200}


In [19]:
# Score the tuned forest on the untouched fraud test split.
print(
    "Random Forest Fraud Metrics:",
    evaluate_model(model=rf_fraud, X_test=Xf_test, y_test=yf_test),
)


Random Forest Fraud Metrics: {'AUC_PR': 0.6153667782336316, 'F1': 0.613112100364225, 'Confusion_Matrix': array([[26796,   597],
       [ 1315,  1515]])}


In [20]:
# Persist the tuned fraud model. The target directory is created first so the
# cell also succeeds on a fresh checkout where ../models does not exist yet.
model_path = Path("../models/rf_fraud_final.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_fraud, model_path)
print("✅ Final model saved")


✅ Final model saved
