In [None]:
# FraudDetectPro: Full Model Pipeline for Raw Data

# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import shap
import matplotlib.pyplot as plt
import joblib
import os

# Ensure directories exist
# Everything this script persists is written under models/.
os.makedirs("models", exist_ok=True)

# Load processed data
# NOTE(review): these four arrays and the scaler loaded below are reassigned
# further down in this script (StandardScaler fit and train_test_split) before
# they are ever read, so these loads currently have no effect on the results.
# Confirm whether the pipeline is meant to start from the processed artifacts
# or from the raw data.
X_train = np.load("data/processed/X_train.npy")
y_train = np.load("data/processed/y_train.npy")
X_test  = np.load("data/processed/X_test.npy")
y_test  = np.load("data/processed/y_test.npy")

# Load the saved scaler (if needed later for new data)
scaler = joblib.load("data/processed/scaler.pkl")

# Load the raw dataset. The rest of the script reads `df` (it must contain a
# "Class" target column), but nothing defined it, so the first print below
# raised NameError.
# NOTE(review): the path is an assumption based on the data/processed/ layout
# used above -- confirm the actual location and file name of the raw CSV.
RAW_DATA_PATH = "data/raw/creditcard.csv"
df = pd.read_csv(RAW_DATA_PATH)

print(f"Raw dataset shape: {df.shape}")
print(df.head())

# ---------------------
# Basic Data Preparation
# ---------------------
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())


# Features and target
X = df.drop(columns=["Class"])
y = df["Class"]

# ---------------------
# Train/Test Split
# ---------------------
# Split FIRST, on the original rows. Fitting the scaler or SMOTE on the full
# dataset before splitting leaks information from the held-out rows into
# training (SMOTE interpolates between neighbours, so synthetic training
# points would be built from test rows) and turns the test set into a
# balanced, partly synthetic sample, which inflates every reported metric.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features (all numerical in this dataset)
# The scaler is fit on the training rows only and then applied to the test
# rows. X_scaled now holds the scaled *training* features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# ---------------------
# Handle Class Imbalance
# ---------------------
# Oversample the training portion only; the test set keeps its original
# class ratio so the evaluation reflects real data.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)

print(f"Resampled dataset shape: {X_resampled.shape}, Fraud ratio: {np.mean(y_resampled)}")

# The models are trained on the balanced, scaled training set.
X_train, y_train = X_resampled, y_resampled

# ---------------------
# Model Definition
# ---------------------
# Three base classifiers whose positive-class probabilities are averaged
# later: a random forest, a gradient-boosted tree model and a logistic
# regression. Every estimator uses the same seed (42).
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb = XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)
lr = LogisticRegression(random_state=42, max_iter=1000)

# Train each base model on the same training split, in the order rf, xgb, lr.
for estimator in (rf, xgb, lr):
    estimator.fit(X_train, y_train)

# ---------------------
# Ensemble Predictions (Averaging Probabilities)
# ---------------------
def ensemble_predict(models, X, threshold=0.5):
    """Average the positive-class probabilities of several classifiers.

    Args:
        models: Iterable of fitted classifiers. Each must provide
            ``predict_proba(X)`` returning a 2-D array; column 1 is used.
        X: Feature matrix with a ``shape`` attribute; one row per sample.
        threshold: Averaged probability at or above which a sample is
            labelled 1. Defaults to 0.5.

    Returns:
        A tuple ``(labels, probs)``: ``labels`` is an int array of 0/1
        predictions and ``probs`` is the averaged probability per sample.

    Raises:
        ValueError: If ``models`` is empty. Without this check the average
            is a division by zero that yields NaN probabilities and
            all-zero labels instead of an error.
    """
    # Materialise once so generators are accepted and len() is reliable.
    models = list(models)
    if not models:
        raise ValueError("ensemble_predict requires at least one model")

    probs = np.zeros(X.shape[0])
    for model in models:
        probs += model.predict_proba(X)[:, 1]
    probs /= len(models)
    return (probs >= threshold).astype(int), probs

models = [rf, xgb, lr]
y_pred, y_prob = ensemble_predict(models, X_test)

# ---------------------
# Evaluation
# ---------------------
# Label-based metrics use the thresholded predictions (y_pred); ROC-AUC is
# computed from the averaged probabilities (y_prob).
cls_report = classification_report(y_test, y_pred)
print("Classification Report:\n", cls_report)

conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_mat)

f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", auc)

# ---------------------
# Explainability with SHAP
# ---------------------
# Only the XGBoost member of the ensemble is explained here, using a tree
# explainer evaluated on the test features.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)

# Summary plot, labelled with the feature column names (every column of df
# except the "Class" target).
feature_names = df.drop(columns=['Class']).columns
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# ---------------------
# Save Models & Scaler
# ---------------------
# Create the output directory right before writing: the dumps below fail
# with FileNotFoundError when models/ does not exist, and exist_ok=True makes
# this a no-op when it already does.
os.makedirs("models", exist_ok=True)

# Persist the three fitted base models and the scaler used on the features.
joblib.dump(rf, "models/rf_model.pkl")
joblib.dump(xgb, "models/xgb_model.pkl")
joblib.dump(lr, "models/lr_model.pkl")
joblib.dump(scaler, "models/scaler.pkl")

print("Models and scaler saved in 'models/' directory.")
