In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier

df = pd.read_csv("../cleaned_data.csv")

# Assuming 'target' is the fraud label column
X = df.drop(columns=['is_fraud'])  
y = df['is_fraud']  

# Apply 3x Undersampling
undersample = RandomUnderSampler(sampling_strategy=1/3, random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Train XGBoost Model
xgb_model = XGBClassifier(
    n_estimators=200, 
    learning_rate=0.05, 
    max_depth=6, 
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Predictions
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96     56884
           1       0.90      0.83      0.87     18961

    accuracy                           0.94     75845
   macro avg       0.92      0.90      0.91     75845
weighted avg       0.94      0.94      0.93     75845

ROC AUC Score: 0.9792834001239581
Confusion Matrix:
 [[55217  1667]
 [ 3198 15763]]
