# Fraud Detection on Transaction Data

## Imports

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns


## Set Working Directory

In [44]:
import os

# Re-assert the process's current directory as the working directory.
# To run against data stored elsewhere, pass that folder's path to os.chdir
# instead of the current directory.
_cwd = os.getcwd()
os.chdir(_cwd)

## Data

In [45]:
### Data
# Load the transactions file and work on a copy so train_df keeps the raw rows.
train_df = pd.read_csv("transaction.csv")
data = train_df.copy()

# Target is the "isFraud" column; every other column is treated as a feature.
y = data["isFraud"]
X = data.drop("isFraud", axis=1)

## split
# 80/20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7406,
    stratify=y,
    test_size=0.2,
)


### Encoding

In [46]:
# Column groups are derived from the training split only.
cat_cols = X_train.select_dtypes(include=["object"]).columns
num_cols = X_train.select_dtypes(exclude=["object"]).columns

# One-hot encode the training split, dropping the first level of each
# categorical column so that level acts as the baseline.
X_train_enc = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)

# Encode the test split WITHOUT drop_first. Dropping independently on the test
# split would remove whichever level happens to sort first *in the test data*;
# if that differs from the training baseline, rows with that level would end up
# all-zero after the reindex and be read as the training baseline.
# Reindexing to the training columns removes the baseline and any unseen
# levels, and adds zero-filled columns for levels missing from the test split.
X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)
X_test_enc = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0)

## Export Data

## Models

In [47]:
### Clean columns
# Replace every run of characters other than letters, digits and underscore in
# the column names with a single underscore, for both encoded frames.
for _frame in (X_train_enc, X_test_enc):
    _frame.columns = _frame.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Sanitising can make distinct names collide; keep only the first column for
# each name.
_train_keep = ~X_train_enc.columns.duplicated(keep="first")
X_train_enc = X_train_enc.loc[:, _train_keep]

_test_keep = ~X_test_enc.columns.duplicated(keep="first")
X_test_enc = X_test_enc.loc[:, _test_keep]

# Re-align the test frame to the training columns (same names, same order),
# filling any column it lacks with zeros.
X_test_enc = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0)

### Random Forest

In [None]:
### Random Forest
# 200-tree forest with a fixed seed, trained on all available cores.
rf = RandomForestClassifier(
    random_state=7406,
    n_jobs=-1,
    n_estimators=200,
)
rf.fit(X_train_enc, y_train)

# Score the held-out split using the second probability column.
pred_rf = rf.predict_proba(X_test_enc)[:, 1]
auc_rf = roc_auc_score(y_true=y_test, y_score=pred_rf)
print(f"Random Forest AUC: {auc_rf:.4f}")

### LightGBM

In [None]:
### LightGBM
# Early stopping needs a validation set. Using the test split for it would let
# the test data pick the number of boosting rounds and inflate the reported
# AUC, so a stratified validation split is carved out of the training data and
# the test split is used only for the final score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_enc,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=7406,
)
lgb_train = lgb.Dataset(X_fit, y_fit)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "verbose": -1,
}

# Train for at most 200 rounds, stopping once validation AUC has not improved
# for 10 consecutive rounds.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_eval],
    num_boost_round=200,
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Score the untouched test split with the best iteration found above.
pred_lgb = gbm.predict(X_test_enc, num_iteration=gbm.best_iteration)
auc_lgb = roc_auc_score(y_test, pred_lgb)
print(f"LightGBM AUC: {auc_lgb:.4f}")

## Cross-Validation

In [None]:
### Cross-Validation
# Candidate estimators to compare under cross-validation, keyed by the display
# name used in printed output and plots.
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        random_state=7406,
    ),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=64,
        subsample=0.8,
        # Row subsampling is only applied when a bagging frequency is set;
        # with the default subsample_freq=0 the subsample value is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=7406,
        # Suppress LightGBM's training log output.
        verbose=-1,
    ),
}

In [None]:
# Per-model array of fold AUCs, keyed by model display name.
cv_results = {}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=7406)

# Draw the 25% subsample once: it does not depend on the model, and sharing it
# guarantees every model is scored on exactly the same rows.
X_sample = X_train_enc.sample(frac=0.25, random_state=42)
y_sample = y_train.loc[X_sample.index]

for name, model in models.items():
    # Report the fold count from the splitter so the message cannot drift from
    # the configuration above.
    print(f"\n{name} Cross-Validation ({cv.get_n_splits()}-fold, sampled):")
    aucs = cross_val_score(
        model, X_sample, y_sample, cv=cv, scoring="roc_auc", n_jobs=-1
    )
    cv_results[name] = aucs
    print(f"AUCs: {np.round(aucs, 4)}")
    print(f"Mean AUC: {np.mean(aucs):.4f} ± {np.std(aucs):.4f}")

### Summary Graphs

In [None]:
### Summary graphs
# Long-format frame with one row per (model, fold) AUC, as the box plot expects.
cv_df = pd.DataFrame(cv_results).melt(var_name="Model", value_name="AUC")

# Distribution of fold AUCs per model.
sns.boxplot(data=cv_df, x="Model", y="AUC", palette="pastel")
plt.title("Cross-Validation AUC Comparison")
plt.xlabel("Model")
plt.ylabel("AUC")
plt.show()

# Label the bar chart with the number of folds actually stored in cv_results
# rather than a hard-coded count; fall back to a generic label if models were
# scored with differing fold counts or no results are present.
fold_counts = {len(scores) for scores in cv_results.values()}
cv_label = f"{fold_counts.pop()}-Fold CV" if len(fold_counts) == 1 else "CV"

# Mean fold AUC per model.
mean_aucs = {model_name: np.mean(scores) for model_name, scores in cv_results.items()}
sns.barplot(x=list(mean_aucs.keys()), y=list(mean_aucs.values()), palette="coolwarm")
plt.title(f"Mean AUC by Model ({cv_label})")
plt.xlabel("Model")
plt.ylabel("Mean AUC")
plt.show()
