In [1]:
# Install required packages (run once)
# %pip install pandas numpy scikit-learn xgboost lightgbm imbalanced-learn joblib matplotlib seaborn tensorflow shap

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib


In [2]:
# Load features and target
features = pd.read_csv(r'C:\Users\gunar\OneDrive\Desktop\fraud detection\Features.csv')
target = pd.read_csv(r'C:\Users\gunar\OneDrive\Desktop\fraud detection\Target.csv')


# Merge for convenience
df = pd.concat([features, target], axis=1)
df.head()

# Missing values
df.isnull().sum()[lambda x: x>0]

# Class balance
df['PotentialFraud'].value_counts(), 
df['PotentialFraud'].value_counts(normalize=True)


PotentialFraud
0    0.90647
1    0.09353
Name: proportion, dtype: float64

In [3]:
X = df.drop(columns=['PotentialFraud'])
y = df['PotentialFraud']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)


In [4]:
def evaluate_model(name, model, X_test, y_test):
    preds = model.predict(X_test)
    print(f"\n{name}")
    print(classification_report(y_test, preds, digits=4))
    cm = confusion_matrix(y_test, preds)
    print("Confusion Matrix:\n", cm)
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_test, preds),
        'Precision': precision_score(y_test, preds),
        'Recall': recall_score(y_test, preds),
        'F1-Score': f1_score(y_test, preds)
    }


In [5]:
results = []

# 1. Logistic Regression (SMOTE + Scaling)
pipe_lr = ImbPipeline([
    ('scale', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
pipe_lr.fit(X_train, y_train)
results.append(evaluate_model("Logistic Regression", pipe_lr, X_test, y_test))

# 2. Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
results.append(evaluate_model("Random Forest", rf, X_test, y_test))

# 3. XGBoost
scale_pos = (y_train==0).sum()/(y_train==1).sum()
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos, random_state=42)
xgb.fit(X_train.values, y_train.values)
results.append(evaluate_model("XGBoost", xgb, X_test, y_test))

# 4. LightGBM
lgbm = LGBMClassifier(scale_pos_weight=scale_pos, random_state=42)
lgbm.fit(X_train, y_train)
results.append(evaluate_model("LightGBM", lgbm, X_test, y_test))

# 5. Decision Tree
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt.fit(X_train, y_train)
results.append(evaluate_model("Decision Tree", dt, X_test, y_test))

# 6. Voting Ensemble
voting = VotingClassifier(estimators=[
    ('xgb', xgb),
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', rf)
], voting='soft')
voting.fit(X_train, y_train)
results.append(evaluate_model("Voting Ensemble", voting, X_test, y_test))

# 7. Stacking Ensemble
stacking = StackingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb)
], final_estimator=LogisticRegression(max_iter=1000))
stacking.fit(X_train, y_train)
results.append(evaluate_model("Stacking Ensemble", stacking, X_test, y_test))

# Summary
pd.DataFrame(results).sort_values('F1-Score', ascending=False)



Logistic Regression
              precision    recall  f1-score   support

           0     0.9859    0.9246    0.9542       981
           1     0.5432    0.8713    0.6692       101

    accuracy                         0.9196      1082
   macro avg     0.7645    0.8979    0.8117      1082
weighted avg     0.9445    0.9196    0.9276      1082

Confusion Matrix:
 [[907  74]
 [ 13  88]]

Random Forest
              precision    recall  f1-score   support

           0     0.9536    0.9847    0.9689       981
           1     0.7826    0.5347    0.6353       101

    accuracy                         0.9427      1082
   macro avg     0.8681    0.7597    0.8021      1082
weighted avg     0.9376    0.9427    0.9378      1082

Confusion Matrix:
 [[966  15]
 [ 47  54]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost
              precision    recall  f1-score   support

           0     0.9622    0.9602    0.9612       981
           1     0.6214    0.6337    0.6275       101

    accuracy                         0.9298      1082
   macro avg     0.7918    0.7970    0.7943      1082
weighted avg     0.9304    0.9298    0.9301      1082

Confusion Matrix:
 [[942  39]
 [ 37  64]]
[LightGBM] [Info] Number of positive: 405, number of negative: 3923
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57371
[LightGBM] [Info] Number of data points in the train set: 4328, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093577 -> initscore=-2.270725
[LightGBM] [Info] Start training from score -2.270725

LightGBM
              precision    recall  f1-score   support

           0     0.9669    0.9531    0.9600       981
           

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Voting Ensemble
              precision    recall  f1-score   support

           0     0.9583    0.9837    0.9708       981
           1     0.7867    0.5842    0.6705       101

    accuracy                         0.9464      1082
   macro avg     0.8725    0.7839    0.8206      1082
weighted avg     0.9423    0.9464    0.9428      1082

Confusion Matrix:
 [[965  16]
 [ 42  59]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Stacking Ensemble
              precision    recall  f1-score   support

           0     0.9564    0.9837    0.9698       981
           1     0.7808    0.5644    0.6552       101

    accuracy                         0.9445      1082
   macro avg     0.8686    0.7740    0.8125      1082
weighted avg     0.9400    0.9445    0.9405      1082

Confusion Matrix:
 [[965  16]
 [ 44  57]]


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
5,Voting Ensemble,0.946396,0.786667,0.584158,0.670455
0,Logistic Regression,0.919593,0.54321,0.871287,0.669202
6,Stacking Ensemble,0.944547,0.780822,0.564356,0.655172
3,LightGBM,0.927911,0.6,0.683168,0.638889
1,Random Forest,0.942699,0.782609,0.534653,0.635294
2,XGBoost,0.92976,0.621359,0.633663,0.627451
4,Decision Tree,0.907579,0.504274,0.584158,0.541284


In [6]:
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced']
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                       scoring='f1', cv=3, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
results.append(evaluate_model("Random Forest Tuned", best_rf, X_test, y_test))


Fitting 3 folds for each of 8 candidates, totalling 24 fits

Random Forest Tuned
              precision    recall  f1-score   support

           0     0.9698    0.9480    0.9588       981
           1     0.5854    0.7129    0.6429       101

    accuracy                         0.9261      1082
   macro avg     0.7776    0.8304    0.8008      1082
weighted avg     0.9339    0.9261    0.9293      1082

Confusion Matrix:
 [[930  51]
 [ 29  72]]


In [8]:
param_grid = {
    'n_estimators': [200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'class_weight': [None, {0:1, 1:10}, {0:1, 1:20}],
    'scale_pos_weight': [scale_pos, scale_pos*2]
}
lgbm = LGBMClassifier(random_state=42)
grid = GridSearchCV(lgbm, param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
best_lgbm = grid.best_estimator_
results.append(evaluate_model("LightGBM Tuned", best_lgbm, X_test, y_test))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Number of positive: 405, number of negative: 3923
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57371
[LightGBM] [Info] Number of data points in the train set: 4328, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.673709 -> initscore=0.725007
[LightGBM] [Info] Start training from score 0.725007
Best params: {'class_weight': {0: 1, 1: 20}, 'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 500, 'scale_pos_weight': np.float64(19.372839506172838)}

LightGBM Tuned
              precision    recall  f1-score   support

           0     0.9727    0.9450    0.9586       981
           1     0.5814    0.7426    0.6522       101

    accuracy                         0.9261      1082
   macro avg     0.7771    0.8438    0.8054      1082
w

In [9]:
output_dir = os.path.join('models', 'models')
os.makedirs(output_dir, exist_ok=True)

# Save LightGBM
joblib.dump(lgbm, os.path.join(output_dir, 'model1.pkl'))
# Save tuned RF
joblib.dump(best_rf, os.path.join(output_dir, 'model2.pkl'))

joblib.dump(pipe_lr, os.path.join(output_dir, 'model3.pkl'))

print("Models saved to:", output_dir)


Models saved to: models\models
