 ## Proper Data Handling with SMOTE

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import xgboost as xgb
import joblib
import shap


 ### Load and Preprocess Data

In [None]:
# Load the transaction log, then discard the two account-name columns so they
# are not carried into the feature matrix.
df = pd.read_csv("PS_20174392719_1491204439457_log.csv")
df = df.drop(columns=['nameOrig', 'nameDest'])

# Encode the 'type' column as integer labels. The fitted encoder stays in a
# module-level variable because it is persisted alongside the models later.
encoder = LabelEncoder()
encoder.fit(df['type'])
df['type'] = encoder.transform(df['type'])


 ### Data Splitting and SMOTE Application

In [None]:
# Split the frame into the target ('isFraud') and everything else as features.
y = df['isFraud']
X = df.drop(columns='isFraud')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training partition only, so the test partition is left
# exactly as it was split.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


 ### Model Training

In [None]:
# LightGBM: fitted on the SMOTE-resampled training data.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 5,
    'n_estimators': 200,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
}
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train_res, y_train_res)

# XGBoost: fitted on the original (non-resampled) training data. The class
# imbalance is handled through scale_pos_weight = negatives / positives.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])
scale_pos_weight = n_negative / n_positive
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.05,
    'max_depth': 5,
    'n_estimators': 200,
    'reg_alpha': 0.1,
    'reg_lambda': 0.5,
    'scale_pos_weight': scale_pos_weight,
    'eval_metric': 'auc',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)


 ### Model Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Print a classification report and show ROC / precision-recall curves.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix passed to the model for scoring.
        y_test: True labels matching ``X_test``.
        model_name: Label used in the printed header and the plot titles.

    Returns:
        None. Output is the printed report and the displayed figure.
    """
    # Column 1 of predict_proba is used as the score for both curves.
    scores = model.predict_proba(X_test)[:, 1]
    labels_pred = model.predict(X_test)

    roc_auc = roc_auc_score(y_test, scores)
    precision, recall, _ = precision_recall_curve(y_test, scores)
    # Area under the PR curve, with recall on the x axis.
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test, scores)

    print(f"\nEvaluation for {model_name}:")
    print(classification_report(y_test, labels_pred))

    # One figure, two panels: ROC on the left, precision-recall on the right.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    roc_ax, pr_ax = axes[0], axes[1]

    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC AUC = {roc_auc:.2f}')
    # Dashed diagonal as a visual reference line.
    roc_ax.plot([0, 1], [0, 1], linestyle='--', color='navy')
    roc_ax.set(
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title=f'ROC Curve - {model_name}',
    )
    roc_ax.legend()

    pr_ax.plot(recall, precision, color='blue', lw=2, label=f'PR AUC = {pr_auc:.2f}')
    pr_ax.set(
        xlabel='Recall',
        ylabel='Precision',
        title=f'Precision-Recall Curve - {model_name}',
    )
    pr_ax.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Evaluate both fitted models on the same held-out split, LightGBM first.
for fitted_model, display_name in ((lgb_model, "LightGBM"), (xgb_model, "XGBoost")):
    evaluate_model(fitted_model, X_test, y_test, display_name)


 ### Feature Importance Plot

In [None]:
def plot_feature_importance(model, X, model_name):
    """Show a SHAP bar summary plot for ``model`` computed over ``X``.

    Args:
        model: Fitted model handed to ``shap.Explainer``.
        X: Feature data the SHAP values are computed on and plotted against.
        model_name: Label appended to the plot title.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Build the explainer and apply it to X in one step.
    explanation = shap.Explainer(model)(X)

    # show=False defers rendering so the title can be set before display.
    shap.summary_plot(explanation, X, plot_type="bar", show=False)
    plt.title(f"Feature Importance - {model_name}")
    plt.show()


In [None]:
# Plot SHAP-based feature importance for each model on the test features.
for fitted_model, display_name in ((lgb_model, "LightGBM"), (xgb_model, "XGBoost")):
    plot_feature_importance(fitted_model, X_test, display_name)


 ### Save Models

In [None]:
# Persist both models and the 'type' label encoder, each to its own file.
artifacts = (
    (lgb_model, "lightgbm_fraud_detection.pkl"),
    (xgb_model, "xgboost_fraud_detection.pkl"),
    (encoder, "label_encoder.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)
print("Models and encoder saved successfully!")
