# In [None]:
# 04 – Fraud Detection Modeling
#
# Steps:
# - Load processed features and labels
# - Apply SMOTE
# - Train Logistic Regression and Random Forest
# - Evaluate (Accuracy, Precision, Recall, F1, ROC-AUC)
# - Export fraud_predictions.csv
# - Save Random Forest model (for later feature importance / dashboards)

import os
import joblib
import numpy as np
import pandas as pd

from src.fraud_utils import (
    apply_smote,
    train_logistic_regression,
    train_random_forest,
    evaluate_fraud_model,
    get_feature_importances,
)

# -------------------------------------------------------------------
# Directory layout (all paths are relative to the working directory)
# -------------------------------------------------------------------
DATA_DIR = os.path.join(os.pardir, "data")
RESULTS_DIR = os.path.join(os.pardir, "results")

# Processed artifacts live in a sub-folder of the data directory.
processed_dir = os.path.join(DATA_DIR, "processed")

# Create the results folder up front; a no-op when it already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

# -------------------------------------------------------------------
# Load the processed feature matrices and label vectors
# -------------------------------------------------------------------
# Each artifact is stored as "<name>.joblib" inside processed_dir; they are
# loaded in the same order as the names on the left-hand side.
X_train_processed, X_test_processed, y_train, y_test = (
    joblib.load(os.path.join(processed_dir, f"{stem}.joblib"))
    for stem in ("X_train_processed", "X_test_processed", "y_train", "y_test")
)

# Quick sanity check of what was loaded.
print(
    "Shapes:",
    f"X_train_processed: {X_train_processed.shape}",
    f"X_test_processed: {X_test_processed.shape}",
    f"y_train: {y_train.shape}",
    f"y_test: {y_test.shape}",
    sep="\n",
)

# -------------------------------------------------------------------
# Resample the training split with SMOTE
# -------------------------------------------------------------------
# Only the training data is passed to the project's apply_smote helper;
# the test split is left untouched. A fixed seed is supplied.
X_train_res, y_train_res = apply_smote(
    X_train_processed, y_train.values, random_state=42
)

# Per-class sample counts before and after resampling.
print(
    f"\nClass distribution before SMOTE: {np.bincount(y_train)}",
    f"Class distribution after SMOTE: {np.bincount(y_train_res)}",
    sep="\n",
)

# -------------------------------------------------------------------
# Fit Logistic Regression on the resampled training data
# -------------------------------------------------------------------
# Hyperparameters are forwarded as keyword arguments to the project helper.
log_reg_params = {
    "C": 1.0,
    "max_iter": 2000,
    "class_weight": "balanced",
}
log_reg_model = train_logistic_regression(
    X_train_res, y_train_res, **log_reg_params
)

print("\nTrained Logistic Regression model:", log_reg_model)

# -------------------------------------------------------------------
# Fit Random Forest on the resampled training data
# -------------------------------------------------------------------
# Hyperparameters are forwarded as keyword arguments to the project helper;
# max_depth=None is passed explicitly, exactly as configured here.
rf_params = {
    "n_estimators": 300,
    "max_depth": None,
    "class_weight": "balanced",
    "random_state": 42,
}
rf_model = train_random_forest(X_train_res, y_train_res, **rf_params)

print("\nTrained Random Forest model:", rf_model)

# -------------------------------------------------------------------
# Score both models on the held-out test split
# -------------------------------------------------------------------
# Logistic Regression is evaluated first, then Random Forest.
log_reg_metrics, rf_metrics = (
    evaluate_fraud_model(model, X_test_processed, y_test.values)
    for model in (log_reg_model, rf_model)
)

# Each metrics object is iterated as a name -> value mapping and every
# value is printed with four decimal places.
for heading, metrics in (
    ("\nLogistic Regression metrics:", log_reg_metrics),
    ("\nRandom Forest metrics:", rf_metrics),
):
    print(heading)
    for metric_name, score in metrics.items():
        print(f"{metric_name}: {score:.4f}")

# -------------------------------------------------------------------
# Random Forest feature importances (placeholder feature names)
# -------------------------------------------------------------------
# Generic names "feature_0", "feature_1", ... are generated, one per
# column of the training matrix.
n_features = X_train_processed.shape[1]
feature_names = ["feature_%d" % col for col in range(n_features)]

rf_importances_df = get_feature_importances(rf_model, feature_names)

print("\nTop 15 Random Forest feature importances:")
# `display` is the notebook rich-output helper; it is not imported here.
display(rf_importances_df.head(15))

# -------------------------------------------------------------------
# Write test-set predictions to CSV (Random Forest chosen as best model)
# -------------------------------------------------------------------
# The choice of best model is hard-coded rather than derived from metrics.
best_model = rf_model

y_pred = best_model.predict(X_test_processed)
# Column 1 of predict_proba is used as the fraud score.
# NOTE(review): presumably this is the positive/fraud class - confirm
# against best_model.classes_.
y_proba = best_model.predict_proba(X_test_processed)[:, 1]

# One row per test sample; the original y_test index is kept as a regular
# column so rows can be traced back to the source data.
fraud_predictions_df = pd.DataFrame(
    dict(
        index=y_test.index,
        true_label=y_test.values,
        predicted_label=y_pred,
        fraud_probability=y_proba,
    )
)

fraud_predictions_path = os.path.join(RESULTS_DIR, "fraud_predictions.csv")
fraud_predictions_df.to_csv(fraud_predictions_path, index=False)

print("\nSaved fraud predictions to:", fraud_predictions_path)
display(fraud_predictions_df.head())

# -------------------------------------------------------------------
# Persist the Random Forest model (reused by visualizations/dashboards)
# -------------------------------------------------------------------
# The model is written next to the processed data artifacts.
rf_model_path = os.path.join(processed_dir, "rf_model.joblib")
joblib.dump(value=rf_model, filename=rf_model_path)

print(
    f"\nSaved Random Forest model to: {rf_model_path}",
    "\n=== 04_fraud_detection_model completed ===",
    sep="\n",
)
