In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# ----------------------------
# 1. Load the Dataset (Data is already cleaned)
# ----------------------------
file_path = "main4.csv"  # Adjust the path if needed
df = pd.read_csv(file_path)

# ----------------------------
# 2. Define Features and Target
# ----------------------------
target_column = "Is Fraud?"
X = df.drop(columns=[target_column])
# Convert target labels from 1 and 2 to 0 and 1 for binary classification
y = df[target_column].astype(int) - 1

# ----------------------------
# 3. Setup 10-Fold Stratified Cross-Validation with TQDM
# ----------------------------
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
fold = 1

for train_index, test_index in tqdm(skf.split(X, y), total=10, desc="CV Folds"):
    # Split the data into training and testing sets for this fold
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    
    # ----------------------------
    # 4. Apply SMOTE for Class Balancing on Training Data
    # ----------------------------
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
    
    # ----------------------------
    # 5. Initialize and Train XGBoost with GPU Acceleration
    # ----------------------------
    xgb_clf = XGBClassifier(
        tree_method='gpu_hist',      # Use GPU-accelerated tree building
        predictor='gpu_predictor',   # Use GPU for predictions
        random_state=42,
        use_label_encoder=False,      # Avoid warning regarding label encoder
        eval_metric='logloss'         # Evaluation metric for binary classification
    )
    
    xgb_clf.fit(X_train_bal, y_train_bal)
    
    # ----------------------------
    # 6. Make Predictions on the Test Set
    # ----------------------------
    y_pred = xgb_clf.predict(X_test)
    
    # ----------------------------
    # 7. Evaluate: Classification Report and Confusion Matrix
    # ----------------------------
    print(f"\n--- Classification Report for Fold {fold} ---")
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for Fold {fold}:")
    print(cm)
    
    # Plot confusion matrix as a heatmap
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - Fold {fold}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()
    
    fold += 1


CV Folds:   0%|          | 0/10 [02:22<?, ?it/s]


XGBoostError: [15:53:42] D:\bld\xgboost-split_1727635034975\work\src\common\common.h:174: XGBoost version not compiled with GPU support.