In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [12]:
import warnings
warnings.filterwarnings("ignore")

In [13]:
df = pd.read_csv("Fraud_Analysis_Dataset.csv")

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1
4,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1


Based on EDA and Basic Domain knowledge we will be dropping certain columns 
["step","type","nameOrig","nameDest"]

In [14]:
df = df.drop(columns=["step","type","nameOrig","nameDest"])

df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,181.0,181.0,0.0,0.0,0.0,1
1,181.0,181.0,0.0,21182.0,0.0,1
2,2806.0,2806.0,0.0,0.0,0.0,1
3,2806.0,2806.0,0.0,26202.0,0.0,1
4,20128.0,20128.0,0.0,0.0,0.0,1


In [15]:
X = df.drop(columns=["isFraud"])
y = df["isFraud"]


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# 1. Split your data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 2. Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    # "Decision Tree": DecisionTreeClassifier(),
    # "SVM": SVC(),
    "LightGBM": LGBMClassifier()
}

# 3. Loop through and evaluate each model
for name, model in models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print(f"\n📌 Model: {name}")
    print(classification_report(y_test, y_pred))



📌 Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2001
           1       0.99      0.48      0.64       228

    accuracy                           0.95      2229
   macro avg       0.97      0.74      0.81      2229
weighted avg       0.95      0.95      0.94      2229


📌 Model: Random Forest
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2001
           1       0.96      0.93      0.94       228

    accuracy                           0.99      2229
   macro avg       0.98      0.96      0.97      2229
weighted avg       0.99      0.99      0.99      2229

[LightGBM] [Info] Number of positive: 914, number of negative: 7999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`

Based on the evaluation metrics, the LightGBM model outperforms other models by achieving the highest precision, recall, and F1-score, making it the best choice for fraud detection in this dataset.

Grid Search CV for LightGBM Model

In [19]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define pipeline with scaler + LightGBM
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(random_state=42))
])

# Define a wide parameter grid for LightGBM
param_grid = {
    "model__n_estimators": [50, 100, 200],
    "model__num_leaves": [31, 50, 100],
    "model__max_depth": [-1, 10, 20, 30],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__min_child_samples": [10, 20, 30],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.6, 0.8, 1.0]
}

# Setup GridSearchCV with 5-fold CV
grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Best params and score
print("✅ Best Parameters:", grid_search.best_params_)
print("🔍 Best CV F1 Score:", grid_search.best_score_)

# Evaluate on test data
y_pred = grid_search.predict(X_test)
from sklearn.metrics import classification_report
print("📊 Test Set Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 2916 candidates, totalling 14580 fits
[LightGBM] [Info] Number of positive: 914, number of negative: 7999
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 8913, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.102547 -> initscore=-2.169241
[LightGBM] [Info] Start training from score -2.169241
✅ Best Parameters: {'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__max_depth': 20, 'model__min_child_samples': 10, 'model__n_estimators': 200, 'model__num_leaves': 50, 'model__subsample': 0.6}
🔍 Best CV F1 Score: 0.9665593183268836
📊 Test Set Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2001
           1       0.96      0.96      0.96    