In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    auc,
    roc_auc_score
)

import warnings
# NOTE(review): called with no category or message filter, so every warning
# raised after this cell runs is hidden -- including library deprecation and
# solver-convergence notices. Consider narrowing this before trusting results.
warnings.filterwarnings("ignore")


In [4]:
# Read the processed fraud dataset from disk and preview the first rows
FRAUD_DATA_PATH = "../data/processed/cleaned_fraud_data.csv"
df = pd.read_csv(FRAUD_DATA_PATH)
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,country,transaction_hour,transaction_dayofweek,time_since_signup,transactions_per_device,user_transaction_count
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,,Unknown,2,5,1251.856111,1,1
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,,Unknown,1,0,4.984444,1,1
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,,Unknown,18,3,0.000278,12,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,Unknown,13,0,136.690278,1,1
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,,Unknown,18,2,1211.516944,1,1


In [1]:
# Build the feature matrix X and the target y.
#
# Columns removed before encoding:
#   - 'class'                       : the target itself
#   - user_id, device_id,
#     ip_address, ip_int            : identifiers / near-unique values
#   - signup_time, purchase_time    : raw timestamps that read_csv loads as
#     plain strings; one-hot encoding them would create roughly one column per
#     distinct timestamp. The derived columns transaction_hour,
#     transaction_dayofweek and time_since_signup stay in X.
#   - 'Unnamed: 0'                  : leftover row-index column, present only
#     if the CSV was written with its index (TODO confirm); errors='ignore'
#     makes this a no-op when the column does not exist.
drop_cols = [
    'class',
    'user_id', 'device_id', 'ip_address', 'ip_int',
    'signup_time', 'purchase_time',
    'Unnamed: 0',
]
X = df.drop(columns=drop_cols, errors='ignore')
y = df['class']

# One-hot encode the remaining categorical columns; drop_first avoids a
# redundant (perfectly collinear) dummy per category.
X = pd.get_dummies(X, drop_first=True)


NameError: name 'df' is not defined

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the train and test partitions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise features (logistic regression is sensitive to feature scale).
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Oversample with SMOTE on the training split only; the test split is left
# untouched.
sm = SMOTE(random_state=42)
resampled_pair = sm.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Class counts after resampling
pd.Series(y_train_resampled).value_counts()

In [None]:
# Logistic regression trained on the resampled training data
lr_model = LogisticRegression(max_iter=1000).fit(
    X_train_resampled, y_train_resampled
)

# Hard labels and the second predict_proba column (index 1) for the test split
y_pred_lr = lr_model.predict(X_test_scaled)
lr_test_proba = lr_model.predict_proba(X_test_scaled)
y_proba_lr = lr_test_proba[:, 1]

In [None]:
# XGBoost classifier trained on the same resampled training matrix as the
# logistic regression model.
#
# `use_label_encoder` is intentionally not passed: it is deprecated and no
# longer recognised by current XGBoost releases, where it only triggers an
# "unused parameter" warning. The target must already be integer-coded.
xgb_model = XGBClassifier(
    scale_pos_weight=1,      # no extra positive-class weight; SMOTE already resampled the training set
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(X_train_resampled, y_train_resampled)

# Hard labels and the second predict_proba column (index 1) for the test split
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_proba_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]


In [None]:
def evaluate_model(name, y_true, y_pred, y_score):
    """Print evaluation metrics for one model and plot its precision-recall curve.

    Parameters
    ----------
    name : str
        Label used in the printed header, the legend entry and the plot title.
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, used for the classification report and the
        confusion matrix.
    y_score
        Scores passed to ``precision_recall_curve`` and ``roc_auc_score``.

    Returns
    -------
    None
        Results are printed and shown as a matplotlib figure.
    """
    divider = "-" * 40
    print(f"{name} Evaluation")
    print(divider)
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    # Area under the precision-recall curve via the trapezoidal rule
    prec, rec, _ = precision_recall_curve(y_true, y_score)
    pr_auc = auc(rec, prec)
    print(f"AUC-PR: {pr_auc:.4f}")
    print(f"ROC AUC: {roc_auc_score(y_true, y_score):.4f}")

    # Precision-recall curve on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(rec, prec, label=f'{name} (AUC-PR = {pr_auc:.4f})')
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(f"Precision-Recall Curve: {name}")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()


In [None]:
# Evaluate both models against the same held-out labels, in a fixed order
model_outputs = (
    ("Logistic Regression", y_pred_lr, y_proba_lr),
    ("XGBoost", y_pred_xgb, y_proba_xgb),
)
for model_label, predicted_labels, predicted_scores in model_outputs:
    evaluate_model(model_label, y_test, predicted_labels, predicted_scores)