In [2]:
%pip install xgboost

Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.2
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, f1_score, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Start timer
start_time = time.time()

# Load datasets
try:
    fraud_data = pd.read_csv('Fraud_Data_transformed.csv')
    fraud_labels = pd.read_csv('Fraud_Data_labels.csv')
    creditcard_data = pd.read_csv('creditcard_scaled.csv')
    creditcard_labels = pd.read_csv('creditcard_labels.csv')
except FileNotFoundError as e:
    # Report which file is missing, then re-raise so execution stops here.
    # Calling exit() is not a clean abort inside a notebook: IPython rebinds
    # `exit` to a helper that requests kernel shutdown instead of raising, so
    # the statements below could still run against frames that were never
    # loaded.
    print(f"Error: {e}")
    raise

# Subsample creditcard data if too large (more than 100000 rows)
if len(creditcard_data) > 100000:
    sample_frac = 0.5
    # Draw the feature rows with a fixed seed, then select the label rows
    # carrying the same index values so features and labels stay paired.
    creditcard_data = creditcard_data.sample(
        frac=sample_frac,
        random_state=42,
    )
    sampled_index = creditcard_data.index
    creditcard_labels = creditcard_labels.loc[sampled_index]
    print(f"Subsampled creditcard data to {len(creditcard_data)} rows.")

# Prepare Fraud_Data
# Features and labels are paired by row position, so differing row counts mean
# the trailing rows of the longer frame have no partner. The best-effort
# truncation is kept, but the mismatch is reported instead of being hidden.
if len(fraud_data) != len(fraud_labels):
    print(
        f"Warning: Fraud_Data has {len(fraud_data)} feature rows but "
        f"{len(fraud_labels)} label rows; truncating both to the shorter length."
    )
min_len = min(len(fraud_data), len(fraud_labels))
X_fraud = fraud_data.iloc[:min_len].reset_index(drop=True)
# Labels are read from the first column of the label frame.
y_fraud = fraud_labels.iloc[:min_len, 0].reset_index(drop=True)
# 80/20 split, stratified on the label so both splits keep the class ratio.
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud,
    y_fraud,
    test_size=0.2,
    random_state=42,
    stratify=y_fraud,
)

# Prepare creditcard
# Features are used as loaded (or subsampled); labels come from the first
# column of the label frame.
X_credit, y_credit = creditcard_data, creditcard_labels.iloc[:, 0]
# 80/20 split, stratified on the label.
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(
    X_credit,
    y_credit,
    test_size=0.2,
    random_state=42,
    stratify=y_credit,
)

# Initialize models (both seeded with 42)
lr = LogisticRegression(max_iter=500, random_state=42)
xgb_model = xgb.XGBClassifier(
    n_estimators=50,
    max_depth=3,
    n_jobs=-1,
    random_state=42,
)

# Evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, dataset_name, plot_cm=False):
    """Fit ``model`` on the training split and report metrics on the test split.

    The estimator passed in is fitted in place. The area under the
    precision-recall curve, the F1 score and the confusion matrix are printed.
    When ``plot_cm`` is true the confusion matrix is also written as a heatmap
    to ``'<dataset_name>_best_cm.png'``.

    Args:
        model: Estimator exposing ``fit``, ``predict_proba`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        dataset_name: Label used in the printed header, the plot title and
            the saved file name.
        plot_cm: Whether to save the confusion-matrix heatmap.

    Returns:
        Tuple ``(auc_pr, f1)``.
    """
    model.fit(X_train, y_train)

    # Column 1 of predict_proba feeds the PR curve; the hard predictions feed
    # F1 and the confusion matrix.
    positive_scores = model.predict_proba(X_test)[:, 1]
    predicted_labels = model.predict(X_test)

    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    auc_pr = auc(pr_recall, pr_precision)
    f1 = f1_score(y_test, predicted_labels)
    conf_mat = confusion_matrix(y_test, predicted_labels)

    print(f"\n{dataset_name} - {model.__class__.__name__}")
    print(f"AUC-PR: {auc_pr:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("Confusion Matrix:\n", conf_mat)

    if not plot_cm:
        return auc_pr, f1

    # Draw on an explicit figure/axes pair and close it after saving so no
    # figure stays open.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{dataset_name} - {model.__class__.__name__} Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(f'{dataset_name}_best_cm.png')
    plt.close(fig)

    return auc_pr, f1

# Train and evaluate
# Each evaluate_model call refits the estimator it is given, so the same `lr`
# and `xgb_model` objects are reused for both datasets.
print("Training on Fraud_Data...")
lr_fraud_auc_pr, lr_fraud_f1 = evaluate_model(lr, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, 'Fraud_Data')
xgb_fraud_auc_pr, xgb_fraud_f1 = evaluate_model(xgb_model, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, 'Fraud_Data', plot_cm=True)

print("\nTraining on creditcard...")
lr_credit_auc_pr, lr_credit_f1 = evaluate_model(lr, X_credit_train, X_credit_test, y_credit_train, y_credit_test, 'creditcard')
xgb_credit_auc_pr, xgb_credit_f1 = evaluate_model(xgb_model, X_credit_train, X_credit_test, y_credit_train, y_credit_test, 'creditcard', plot_cm=True)

# Model justification
print("\nModel Selection:")
print(f"Fraud_Data: LR AUC-PR={lr_fraud_auc_pr:.4f}, XGBoost AUC-PR={xgb_fraud_auc_pr:.4f}")
print(f"creditcard: LR AUC-PR={lr_credit_auc_pr:.4f}, XGBoost AUC-PR={xgb_credit_auc_pr:.4f}")
# Strict '>' means a tie is resolved in favour of Logistic Regression.
best_model_fraud = 'XGBoost' if xgb_fraud_auc_pr > lr_fraud_auc_pr else 'Logistic Regression'
best_model_credit = 'XGBoost' if xgb_credit_auc_pr > lr_credit_auc_pr else 'Logistic Regression'
print(f"Best for Fraud_Data: {best_model_fraud} (higher AUC-PR)")
print(f"Best for creditcard: {best_model_credit} (higher AUC-PR)")

# The '<dataset>_best_cm.png' files above were written while evaluating
# XGBoost, i.e. before the comparison was made. If Logistic Regression is the
# selected model, re-run its evaluation with plotting enabled so the saved
# file shows the model that was actually chosen.
if best_model_fraud == 'Logistic Regression':
    print("\nRe-plotting Fraud_Data confusion matrix for Logistic Regression...")
    evaluate_model(lr, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, 'Fraud_Data', plot_cm=True)
if best_model_credit == 'Logistic Regression':
    print("\nRe-plotting creditcard confusion matrix for Logistic Regression...")
    evaluate_model(lr, X_credit_train, X_credit_test, y_credit_train, y_credit_test, 'creditcard', plot_cm=True)

print(f"\nExecution time: {(time.time() - start_time) / 60:.2f} minutes")

Subsampled creditcard data to 283253 rows.
Training on Fraud_Data...

Fraud_Data - LogisticRegression
AUC-PR: 0.8165
F1-Score: 0.7018
Confusion Matrix:
 [[18127  9266]
 [ 7576 19816]]

Fraud_Data - XGBClassifier
AUC-PR: 0.9817
F1-Score: 0.9680
Confusion Matrix:
 [[27291   102]
 [ 1603 25789]]

Training on creditcard...

creditcard - LogisticRegression
AUC-PR: 0.9973
F1-Score: 0.9774
Confusion Matrix:
 [[28065   252]
 [ 1011 27323]]

creditcard - XGBClassifier
AUC-PR: 0.9997
F1-Score: 0.9927
Confusion Matrix:
 [[28198   119]
 [  293 28041]]

Model Selection:
Fraud_Data: LR AUC-PR=0.8165, XGBoost AUC-PR=0.9817
creditcard: LR AUC-PR=0.9973, XGBoost AUC-PR=0.9997
Best for Fraud_Data: XGBoost (higher AUC-PR)
Best for creditcard: XGBoost (higher AUC-PR)

Execution time: 0.60 minutes
