In [3]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# 1. Load Dataset
# Expected columns:
#   ['order_id', 'user_id', 'order_amount', 'payment_method', 'device', 'country', 'is_fraud']
# The CSV location can be overridden with the FRAUD_DATA_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    "C:/Users/OM/Downloads/CSV Data/Fraudulent Order Data.csv",
)
df = pd.read_csv(DATA_PATH)

# 2. Data Preprocessing
# Drop the ID columns so they are not used as features.
df = df.drop(columns=["order_id", "user_id"])

# Encode every object-dtype column as integer codes.
# Each column gets its own LabelEncoder, stored in `encoders`, so the fitted
# category -> code mapping of every column is still available after the loop
# (e.g. for inverse_transform, or to encode new rows the same way).
# Reusing one encoder instance would only keep the mapping of the last column.
categorical_cols = df.select_dtypes(include=['object']).columns
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Features & Target: everything except "is_fraud" is a feature.
X = df.drop(columns="is_fraud")
y = df["is_fraud"]

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training partition
# only and then applied to both partitions, so test statistics never reach it.
# NOTE(review): the tree ensembles trained on these features should not need
# scaling, so this step is likely optional - confirm before relying on it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 3. Random Forest Classifier
# 200 trees capped at depth 10; class_weight="balanced" is used to counter
# the class imbalance.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions plus the predicted probability of the second class
# (column 1 of predict_proba), which is what ROC-AUC is scored on.
rf_preds = rf.predict(X_test)
rf_pos_proba = rf.predict_proba(X_test)[:, 1]

print("\n Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print("ROC-AUC:", roc_auc_score(y_test, rf_pos_proba))
print(classification_report(y_test, rf_preds))

# 4. XGBoost Classifier
# scale_pos_weight is the count of label 0 divided by the count of label 1 in
# the training labels, used here to handle the class imbalance.
train_class_counts = y_train.value_counts()
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=train_class_counts[0] / train_class_counts[1],
    random_state=42,
)
xgb.fit(X_train, y_train)

# Hard class predictions plus the predicted probability of the second class
# (column 1 of predict_proba), which is what ROC-AUC is scored on.
xgb_preds = xgb.predict(X_test)
xgb_pos_proba = xgb.predict_proba(X_test)[:, 1]

print("\n XGBoost Results:")
print("Accuracy:", accuracy_score(y_test, xgb_preds))
print("ROC-AUC:", roc_auc_score(y_test, xgb_pos_proba))
print(classification_report(y_test, xgb_preds))

# 5. Confusion Matrix (Comparison)
# Build both matrices from the same held-out labels, then print them one
# after the other so the two models can be compared directly.
rf_cm = confusion_matrix(y_test, rf_preds)
xgb_cm = confusion_matrix(y_test, xgb_preds)
print("\nConfusion Matrix - Random Forest:\n", rf_cm)
print("\nConfusion Matrix - XGBoost:\n", xgb_cm)


📌 Random Forest Results:
Accuracy: 0.8666666666666667
ROC-AUC: 0.3746630727762803
              precision    recall  f1-score   support

           0       0.88      0.98      0.93        53
           1       0.00      0.00      0.00         7

    accuracy                           0.87        60
   macro avg       0.44      0.49      0.46        60
weighted avg       0.78      0.87      0.82        60


📌 XGBoost Results:
Accuracy: 0.8333333333333334
ROC-AUC: 0.41778975741239893
              precision    recall  f1-score   support

           0       0.89      0.92      0.91        53
           1       0.20      0.14      0.17         7

    accuracy                           0.83        60
   macro avg       0.55      0.53      0.54        60
weighted avg       0.81      0.83      0.82        60


Confusion Matrix - Random Forest:
 [[52  1]
 [ 7  0]]

Confusion Matrix - XGBoost:
 [[49  4]
 [ 6  1]]
