In [38]:
import sys
import os

# Add project root to Python path so src/ can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Core libraries
import pandas as pd
import joblib
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any emitted while models are being fitted. Consider narrowing the
# filter to specific categories once the noisy warnings are identified.
warnings.filterwarnings("ignore")

# Import reusable functions explicitly (instead of a wildcard import) so each
# helper used in this notebook is traceable to src.modeling_utils and nothing
# else is pulled into the notebook namespace.
from src.modeling_utils import (
    apply_smote,
    cross_validate_model,
    evaluate_model,
    grid_search_rf,
    load_data,
    separate_features_target,
    stratified_split,
    train_logistic_regression,
)





In [39]:
# Locations of the two processed CSV files, relative to the notebook's
# working directory.
fraud_path = "../data/processed/fraud_processed.csv"
credit_path = "../data/processed/creditcard_processed.csv"

# load_data (from src.modeling_utils) returns both datasets as a pair:
# the fraud data first, the credit-card data second.
fraud, credit = load_data(fraud_path, credit_path)

# Sanity check: show (rows, columns) for each loaded dataset.
print("Fraud dataset shape:", fraud.shape)
print("Credit dataset shape:", credit.shape)


Data loaded successfully!
Fraud dataset shape: (151112, 5)
Credit dataset shape: (284807, 31)


In [40]:
# Fraud dataset: split into feature matrix and target.
# Note the target column name is lowercase "class" here.
X_fraud, y_fraud = separate_features_target(fraud, "class")

# Credit card dataset: same split, but the target column is capitalised "Class".
X_credit, y_credit = separate_features_target(credit, "Class")

# Each feature matrix should have one column fewer than its source dataset.
print("Fraud features shape:", X_fraud.shape)
print("Credit features shape:", X_credit.shape)


Fraud features shape: (151112, 4)
Credit features shape: (284807, 30)


In [41]:
# Split each dataset into train and test partitions. No split ratio or seed is
# passed here, so whatever defaults stratified_split defines are used.
# Results are unpacked as (X_train, X_test, y_train, y_test).
Xf_train, Xf_test, yf_train, yf_test = stratified_split(X_fraud, y_fraud)
Xc_train, Xc_test, yc_train, yc_test = stratified_split(X_credit, y_credit)

# Report the size of the training partitions only.
print("Fraud training set:", Xf_train.shape)
print("Credit training set:", Xc_train.shape)


Fraud training set: (120889, 4)
Credit training set: (227845, 30)


In [42]:
# Resample the training partitions only; the *_test partitions are not passed
# to apply_smote and therefore keep their original class distribution.
Xf_train_res, yf_train_res = apply_smote(Xf_train, yf_train)
Xc_train_res, yc_train_res = apply_smote(Xc_train, yc_train)

# Per-class counts of the resampled training targets, to confirm the classes
# are now balanced.
print("Fraud class distribution after SMOTE:\n", yf_train_res.value_counts())
print("Credit class distribution after SMOTE:\n", yc_train_res.value_counts())


Fraud class distribution after SMOTE:
 class
0    109568
1    109568
Name: count, dtype: int64
Credit class distribution after SMOTE:
 Class
0    227451
1    227451
Name: count, dtype: int64


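SMOTE is applied only to the training data to handle class imbalance; the test sets keep their original class distribution.
Balancing the training classes is intended to improve model performance on the minority (fraud) class.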

In [43]:
# Train Logistic Regression on the SMOTE-resampled training data, one model
# per dataset.
lr_fraud = train_logistic_regression(Xf_train_res, yf_train_res)
lr_credit = train_logistic_regression(Xc_train_res, yc_train_res)

# Evaluate on the untouched (non-resampled) test partitions.
metrics_lr_fraud = evaluate_model(lr_fraud, Xf_test, yf_test)
metrics_lr_credit = evaluate_model(lr_credit, Xc_test, yc_test)

# Each metrics object is printed whole; the captured output shows the keys
# 'AUC-PR', 'F1' and 'Confusion Matrix'.
print("Logistic Regression Fraud Metrics:", metrics_lr_fraud)
print("Logistic Regression Credit Metrics:", metrics_lr_credit)


Logistic Regression Fraud Metrics: {'AUC-PR': 0.5214748991979973, 'F1': 0.27227993439037723, 'Confusion Matrix': array([[17583,  9810],
       [  838,  1992]])}
Logistic Regression Credit Metrics: {'AUC-PR': 0.7244601740257455, 'F1': 0.1088929219600726, 'Confusion Matrix': array([[55399,  1465],
       [    8,    90]])}


Train a baseline Logistic Regression model for both datasets.
Evaluation metrics: AUC-PR, F1-score, Confusion Matrix.

In [44]:
# Hyperparameter grid: 2 values of n_estimators x 3 values of max_depth
# = 6 candidate configurations. None is passed through as-is
# (for scikit-learn forests this means no depth limit).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None]
}

# Tune and fit a Random Forest on the fraud data only (no credit-card model is
# trained here). Returns the selected model and its parameters.
# NOTE(review): the search receives data that was already SMOTE-resampled, so
# any internal validation folds may contain synthetic samples derived from
# training-fold rows, which can inflate the search scores. Confirm against
# grid_search_rf; resampling inside each fold avoids this.
rf_fraud, best_params = grid_search_rf(Xf_train_res, yf_train_res, param_grid=param_grid)
# Evaluate the selected model on the untouched fraud test partition.
metrics_rf_fraud = evaluate_model(rf_fraud, Xf_test, yf_test)

print("Random Forest Fraud Metrics:", metrics_rf_fraud)
print("Best RF Parameters:", best_params)


Random Forest Fraud Metrics: {'AUC-PR': 0.6153667782336316, 'F1': 0.613112100364225, 'Confusion Matrix': array([[26796,   597],
       [ 1315,  1515]])}
Best RF Parameters: {'max_depth': None, 'n_estimators': 200}


A Random Forest is trained on the fraud dataset, using GridSearchCV to tune `n_estimators` and `max_depth`.
The best parameters and the test-set evaluation metrics are reported above.

In [45]:
# Cross-validate the tuned Random Forest; the helper returns the mean and
# standard deviation of the score across folds.
# NOTE(review): this passes the full, un-resampled X_fraud / y_fraud, which
# includes the rows held out as Xf_test and skips the SMOTE step used for
# training. Confirm this is intended, or use the training partition instead.
mean_cv, std_cv = cross_validate_model(rf_fraud, X_fraud, y_fraud)
# Report as "mean ± std", both rounded to four decimals.
print(f"Random Forest CV AUC-PR: {mean_cv:.4f} ± {std_cv:.4f}")


Random Forest CV AUC-PR: 0.6251 ± 0.0045


Perform stratified 5-fold cross-validation to assess model generalization.
Mean and standard deviation of AUC-PR reported.

In [46]:
# Persist the fitted fraud models so they can be reloaded later without
# retraining. joblib.dump does not create missing parent directories, so make
# sure the target folder exists first (a fresh checkout may not have it).
os.makedirs("../models", exist_ok=True)

joblib.dump(lr_fraud, "../models/lr_fraud_final.pkl")
joblib.dump(rf_fraud, "../models/rf_fraud_final.pkl")

print("Models saved to ../models/")


Models saved to ../models/


In [None]:
Final models are saved for reproducibility and future use in Task 3.

### Model Comparison & Selection

**Logistic Regression**:
- Pros: Interpretable, simple
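- Cons: Much lower precision on fraud cases (many false positives, even though its recall on the fraud test set is higher than Random Forest's), limited ability to capture non-linear patterns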

**Random Forest**:
- Pros: Higher AUC-PR & F1-score, captures non-linear interactions
- Cons: Slightly less interpretable

**Decision**: Random Forest is selected as the final model due to improved fraud detection metrics, validated by stratified cross-validation. Logistic Regression retained for baseline comparison and interpretability.
