In [28]:
# üì¶ Libraries
import pandas as pd                      # For data handling
import numpy as np                       # For numeric operations
import matplotlib.pyplot as plt          # Plotting
import seaborn as sns                    # Styling for plots

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")


In [29]:
# üéØ Load the cleaned dataset
df = pd.read_csv("Fraud_Analysis_Dataset.csv")
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1
4,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1


üßπ Drop Columns (Based on EDA Insights)

In [30]:
# ‚ùå Drop columns that are data leaks or IDs
df = df.drop(columns=["step", "type", "nameOrig", "nameDest"])
df.head()


Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,181.0,181.0,0.0,0.0,0.0,1
1,181.0,181.0,0.0,21182.0,0.0,1
2,2806.0,2806.0,0.0,0.0,0.0,1
3,2806.0,2806.0,0.0,26202.0,0.0,1
4,20128.0,20128.0,0.0,0.0,0.0,1


 üõ†Ô∏è Apply Feature Engineering

In [None]:
from utils import feature_engineering

df = feature_engineering(df)
df 

Unnamed: 0,amount,isFraud,balance_diff_orig,balance_diff_dest,error_balance_orig,error_balance_dest
0,181.00,1,181.00,0.00,0.000000e+00,181.00
1,181.00,1,181.00,-21182.00,0.000000e+00,21363.00
2,2806.00,1,2806.00,0.00,0.000000e+00,2806.00
3,2806.00,1,2806.00,-26202.00,0.000000e+00,29008.00
4,20128.00,1,20128.00,0.00,0.000000e+00,20128.00
...,...,...,...,...,...,...
11137,6393.71,0,6393.70,-267885.75,-1.000000e-02,274279.46
11138,298799.54,0,298799.54,4566008.83,1.455192e-11,-4267209.29
11139,423159.63,0,127891.14,139142.53,-2.952685e+05,284017.10
11140,1094.03,0,1094.03,0.00,0.000000e+00,1094.03


üéØ Split Features & Target

In [33]:
# Define features and target
X = df.drop(columns=["isFraud"])
y = df["isFraud"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


# ‚è© Apply SMOTE only to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

ü§ñ Train Multiple Models (Baseline Comparison)

In [34]:
# Define models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "LightGBM": LGBMClassifier()
}

# Train and evaluate each
for name, model in models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])
    
    pipe.fit(X_train_resampled, y_train_resampled)
    y_pred = pipe.predict(X_test)
    
    print(f"\nüìå Model: {name}")
    print(classification_report(y_test, y_pred))



üìå Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2001
           1       0.85      0.76      0.80       228

    accuracy                           0.96      2229
   macro avg       0.91      0.87      0.89      2229
weighted avg       0.96      0.96      0.96      2229


üìå Model: Random Forest
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2001
           1       0.61      0.85      0.71       228

    accuracy                           0.93      2229
   macro avg       0.80      0.89      0.84      2229
weighted avg       0.94      0.93      0.93      2229

[LightGBM] [Info] Number of positive: 7999, number of negative: 7999
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Num

üìù Observations:

Logistic Regression struggles with low recall for fraud.

Random Forest and LightGBM perform much better, especially LightGBM with excellent F1 scores.

We'll tune LightGBM next using GridSearchCV for optimization.

üîß Hyperparameter Tuning with GridSearchCV (LightGBM)

In [36]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define pipeline with scaler + LightGBM
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LGBMClassifier(random_state=42))
])

# Define a wide parameter grid for LightGBM
param_grid = {
    "model__n_estimators": [50, 100, 200],
    "model__num_leaves": [31, 50, 100],
    "model__max_depth": [-1, 10, 20, 30],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__min_child_samples": [10, 20, 30],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.6, 0.8, 1.0]
}

# Setup GridSearchCV with 5-fold CV
grid_search = GridSearchCV(
    pipe,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Best params and score
print("‚úÖ Best Parameters:", grid_search.best_params_)
print("üîç Best CV F1 Score:", grid_search.best_score_)

# Evaluate on test data
y_pred = grid_search.predict(X_test)
from sklearn.metrics import classification_report
print("üìä Test Set Classification Report:\n", classification_report(y_test, y_pred))


Fitting 3 folds for each of 2916 candidates, totalling 8748 fits
[LightGBM] [Info] Number of positive: 914, number of negative: 7999
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 8913, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.102547 -> initscore=-2.169241
[LightGBM] [Info] Start training from score -2.169241
‚úÖ Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 10, 'model__min_child_samples': 30, 'model__n_estimators': 100, 'model__num_leaves': 31, 'model__subsample': 0.6}
üîç Best CV F1 Score: 0.8892314509463063
üìä Test Set Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      2001
           1       0.94      0.83      