In [None]:
# ======================================================
# FraudDetectPro: Two-Stage Hybrid Model Pipeline
# ======================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# TensorFlow / Keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# =====================
# Setup
# =====================
# Make sure the output folders for trained models and figures exist
# before anything tries to write into them.
for output_dir in ("../models", "../visualizations"):
    os.makedirs(output_dir, exist_ok=True)

# Console banner announcing the start of the run.
banner_rule = "=" * 65
print(banner_rule)
print("FRAUDDETECTPRO - TWO-STAGE HYBRID MODEL TRAINING")
print(banner_rule)

# =====================
# Load Processed Data
# =====================
print("\nüìÇ Loading preprocessed data...")
X_train = np.load("../data/processed/X_train.npy")
y_train = np.load("../data/processed/y_train.npy")
X_test = np.load("../data/processed/X_test.npy")
y_test = np.load("../data/processed/y_test.npy")

# Load feature names
try:
    feature_names = np.load("../data/processed/feature_names.npy", allow_pickle=True)
    print(f"‚úì Loaded {len(feature_names)} feature names")
except FileNotFoundError:
    print("‚ö†Ô∏è  feature_names.npy not found. Using generic names.")
    feature_names = [f"Feature_{i}" for i in range(X_train.shape[1])]

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Fraud ratio in training: {y_train.mean() * 100:.2f}%")
print(f"Fraud ratio in test: {y_test.mean() * 100:.2f}%")

# ======================================================
# STAGE 1 — Neural Network Feature Extractor
# ======================================================
print("\nüß† Training Neural Network Feature Extractor...")

input_dim = X_train.shape[1]

nn_model = Sequential([
    Dense(64, activation='relu', input_dim=input_dim),
    Dropout(0.3),
    Dense(32, activation='relu', name="feature_layer"),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = nn_model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=256,
    validation_split=0.2,
    callbacks=[es],
    verbose=1
)

# Extract learned features
feature_extractor = Model(inputs=nn_model.input,
                          outputs=nn_model.get_layer("feature_layer").output)

X_train_nn = feature_extractor.predict(X_train)
X_test_nn = feature_extractor.predict(X_test)

print(f"Original features: {X_train.shape[1]}, Extracted NN features: {X_train_nn.shape[1]}")

# Combine original + extracted features
X_train_hybrid = np.hstack((X_train, X_train_nn))
X_test_hybrid = np.hstack((X_test, X_test_nn))

print(f"Hybrid feature set shape: {X_train_hybrid.shape}")

# ======================================================
# STAGE 2 — Ensemble Models on Hybrid Features
# ======================================================
print("\nü§ñ Training ensemble models on hybrid features...")

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=(len(y_train) - y_train.sum()) / y_train.sum(),
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False
)

lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Train models
rf.fit(X_train_hybrid, y_train)
xgb.fit(X_train_hybrid, y_train)
lr.fit(X_train_hybrid, y_train)

# ======================================================
# Ensemble Predictions
# ======================================================
def ensemble_predict(models, X, threshold=0.5):
    """Average the positive-class probabilities of several classifiers.

    Args:
        models: Non-empty iterable of fitted classifiers. Each must expose
            ``predict_proba(X)`` returning a 2-D array whose column 1 is
            read as the positive-class probability.
        X: Feature matrix with a ``shape`` attribute; passed unchanged to
            every model.
        threshold: Averaged probabilities greater than or equal to this
            value are labelled 1, all others 0.

    Returns:
        Tuple ``(preds, probs)``: ``preds`` is an integer array of 0/1
        labels and ``probs`` is a float64 array of averaged probabilities,
        both with one entry per row of ``X``.

    Raises:
        ValueError: If ``models`` is empty. Previously this divided by zero
            and silently returned NaN probabilities with all-zero labels.
    """
    # Materialise once so generators work and the length is known.
    models = list(models)
    if not models:
        raise ValueError("ensemble_predict requires at least one model")

    # Accumulate in float64 regardless of each model's output dtype.
    probs = np.zeros(X.shape[0])
    for model in models:
        probs += model.predict_proba(X)[:, 1]
    probs /= len(models)

    preds = (probs >= threshold).astype(int)
    return preds, probs

# Find optimal threshold: the cut-off on the averaged ensemble probability
# that maximises F1.
# NOTE(review): the threshold is tuned on the same rows the ensemble was
# fitted on, so the F1 printed here is optimistic; consider tuning on a
# held-out validation split instead.
print("\nüéØ Finding optimal threshold...")
models = [rf, xgb, lr]
_, y_prob_train = ensemble_predict(models, X_train_hybrid)

precision, recall, thresholds = precision_recall_curve(y_train, y_prob_train)
# `precision` and `recall` have one more element than `thresholds`: the
# final (precision=1, recall=0) point has no associated threshold. Drop it
# before the argmax so every index is valid for `thresholds`.
# The 1e-10 term guards against 0/0 when precision and recall are both zero.
f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-10)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold:.3f} (F1-score: {f1_scores[optimal_idx]:.3f})")

# ======================================================
# Test Set Evaluation
# ======================================================
print("\nüìä Evaluating on test set...")
y_pred, y_prob = ensemble_predict(models, X_test_hybrid, threshold=optimal_threshold)

print("\n" + "="*60)
print("ENSEMBLE MODEL EVALUATION (Hybrid Features)")
print("="*60)
print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud']))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")

# ======================================================
# SHAP Explainability (XGBoost)
# ======================================================
print("\nüîç Generating SHAP explanations...")
sample_size = min(1000, X_test_hybrid.shape[0])
X_test_sample = X_test_hybrid[:sample_size]

explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test_sample)

plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test_sample, feature_names=[*feature_names, *[f"NN_Feature_{i}" for i in range(X_train_nn.shape[1])]], show=False)
plt.tight_layout()
plt.savefig("../visualizations/shap_summary_plot_hybrid.png", dpi=300, bbox_inches='tight')
plt.close()
print("‚úì SHAP summary plot saved")

# ======================================================
# Save All Models and Metadata
# ======================================================
print("\nüíæ Saving models and metadata...")
nn_model.save("../models/nn_feature_extractor.h5")
joblib.dump(rf, "../models/rf_model.pkl")
joblib.dump(xgb, "../models/xgb_model.pkl")
joblib.dump(lr, "../models/lr_model.pkl")

metadata = {
    'feature_names': feature_names.tolist(),
    'optimal_threshold': float(optimal_threshold),
    'hybrid_features': X_train_hybrid.shape[1],
    'test_metrics': {
        'f1_score': float(f1_score(y_test, y_pred)),
        'roc_auc': float(roc_auc_score(y_test, y_prob)),
        'confusion_matrix': cm.tolist()
    }
}
joblib.dump(metadata, "../models/model_metadata.pkl")

print("\n" + "="*60)
print("‚úÖ TWO-STAGE HYBRID MODEL TRAINING COMPLETE")
print("="*60)
print("\nSaved:")
print("  - ../models/nn_feature_extractor.h5")
print("  - ../models/rf_model.pkl")
print("  - ../models/xgb_model.pkl")
print("  - ../models/lr_model.pkl")
print("  - ../models/model_metadata.pkl")
print("  - ../visualizations/shap_summary_plot_hybrid.png")


FRAUDDETECTPRO - MODEL TRAINING

📂 Loading preprocessed data...
✓ Loaded 30 feature names

Training set: (454902, 30)
Test set: (56962, 30)
Fraud ratio in training: 50.00%
Fraud ratio in test: 0.17%

🤖 Training ensemble models...
Training Random Forest...
Training XGBoost...
Training Logistic Regression...

🎯 Finding optimal threshold...
Optimal threshold: 0.431 (F1-score: 0.997)

📊 Evaluating on test set...

ENSEMBLE MODEL EVALUATION

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     56864
       Fraud       0.30      0.91      0.46        98

    accuracy                           1.00     56962
   macro avg       0.65      0.95      0.73     56962
weighted avg       1.00      1.00      1.00     56962


Confusion Matrix:
[[56661   203]
 [    9    89]]

True Negatives: 56661, False Positives: 203
False Negatives: 9, True Positives: 89

F1-Score: 0.4564
ROC-AUC: 0.9773

INDIVIDUAL MODEL PERFORMANC