In [None]:
# Mounting the google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Notebook")

In [None]:
!pwd
!ls

/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Notebook
catboost_info  preprocess.py		      Visualization.ipynb
EDA.ipynb      __pycache__
model.ipynb    Synthetic_Data_Analysis.ipynb


In [None]:
import sys
sys.path.append("/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Notebook")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import preprocess

import warnings
warnings.filterwarnings('ignore')

In [None]:
original_data_path = "/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Data/Bank_Transaction.csv"
augmented_data_path = "/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Data/Augmented_data.csv"

# Preprocessing Data

### Preprocessing of Original Data

In [1]:
original_data = pd.read_csv(original_data_path)

NameError: name 'pd' is not defined

In [None]:
# Displaying Original Data
original_data.head()

NameError: name 'original_data' is not defined

In [None]:
# Shape of original Data
original_data.shape

In [None]:
# Target Label
original_label = original_data['Is_Fraud'].values

original_label.shape

In [None]:
# Applying preprocessing on original data
original_data_preprocessed, pipeline = preprocess.fit_preprocess(original_data)

In [None]:
# Shape of processed data
original_data_preprocessed.shape

NameError: name 'original_data_preprocessed' is not defined

In [None]:
temp_df = pd.DataFrame(original_data_preprocessed)

temp_df.head()

### Preprocessing of Augmentated Data

In [None]:
augmented_data = pd.read_csv(augmented_data_path)

augmented_data.tail()

In [None]:
# Shape of the Augmented Data
augmented_data.shape

NameError: name 'augmented_data' is not defined

In [None]:
aug_label = augmented_data['Is_Fraud'].values

aug_label.shape

In [None]:
augmented_data.duplicated().sum()

In [None]:
#Shuffling the Dataset
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Applying preprocessing on augmented data
augmented_data_preprocess = pipeline.transform(augmented_data)

In [None]:
augmented_data_preprocess.shape

NameError: name 'augmented_data_preprocess' is not defined

# **Model Development**

In [None]:
!pip install catboost



In [None]:
# -------------------- Imports --------------------
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# -------------------- Define classifiers --------------------
clfs = {
    # Linear Models
    'LR': LogisticRegression(solver='liblinear', penalty='l1', class_weight='balanced', max_iter=1000),
    'SGD': SGDClassifier(loss='log_loss', class_weight='balanced', max_iter=1000, tol=1e-3, n_jobs=-1),

    # Tree-based models
    'DT': DecisionTreeClassifier(max_depth=5, class_weight='balanced'),
    'RF': RandomForestClassifier(n_estimators=50, random_state=2, class_weight='balanced', n_jobs=-1),
    'BRF': BalancedRandomForestClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=2),

    # Gradient Boosting
    'XGB': XGBClassifier(n_estimators=50, random_state=2, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=6),  # approximate imbalance ratio
    'LGBM': lgb.LGBMClassifier(n_estimators=50, random_state=2, class_weight='balanced'),
    'CatBoost': CatBoostClassifier(iterations=50, random_state=2, verbose=0, class_weights=[1,6])
}

In [None]:
# ---- Training function ----
def train_classifier(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    return accuracy, precision, recall

In [None]:
def evaluate_models(X, y, dataset_name="Dataset"):
    results = {"Model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": [], "Estimator": []}

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )

    print("Model Training Started ......")
    for name, clf in clfs.items():

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f'{name} Model training completed.')
        results["Model"].append(name),
        results["Accuracy"].append(accuracy_score(y_test, y_pred))
        results["Precision"].append(precision_score(y_test, y_pred, zero_division=0))
        results["Recall"].append(recall_score(y_test, y_pred, zero_division=0))
        results["F1"].append(f1_score(y_test, y_pred, zero_division=0))
        results["Estimator"].append(clf)

    results_df = pd.DataFrame(results)
    print(f"\n📊 Model Performance on {dataset_name}:")
    display(results_df)


    return results_df

In [None]:
def get_best_model(results_df, metric="F1"):
    best_idx = results_df[metric].idxmax()
    best_model_name = results_df.loc[best_idx, "Model"]
    best_model = results_df.loc[best_idx, "Estimator"]
    print(f"\n✅ Best model: {best_model_name} ({metric} = {results_df.loc[best_idx, metric]:.4f})")
    return best_model

### Training Model on Original Data

In [None]:
# ---- Run on Original Data ----
y_original = original_label
X_original = original_data_preprocessed

results_original = evaluate_models(X_original, y_original, dataset_name="Original Data")

Model Training Started ......
LR Model training completed.
SGD Model training completed.
DT Model training completed.
RF Model training completed.
BRF Model training completed.
AdaBoost Model training completed.
XGB Model training completed.
[LightGBM] [Info] Number of positive: 8070, number of negative: 151930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1972
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 521
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LGBM Model training completed.
CatBoost Model training completed.

📊 Model Performance on Original Data:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Estimator
0,LR,0.5009,0.050991,0.504955,0.092628,"LogisticRegression(class_weight='balanced', ma..."
1,SGD,0.53995,0.050828,0.459366,0.091528,"SGDClassifier(class_weight='balanced', loss='l..."
2,DT,0.91875,0.053623,0.03667,0.043555,DecisionTreeClassifier(class_weight='balanced'...
3,RF,0.94955,0.0,0.0,0.0,"(DecisionTreeClassifier(max_features='sqrt', r..."
4,BRF,0.883375,0.051508,0.075322,0.061179,"(DecisionTreeClassifier(max_features='sqrt', r..."
5,AdaBoost,0.94955,0.0,0.0,0.0,"(DecisionTreeClassifier(max_depth=1, random_st..."
6,XGB,0.9477,0.079545,0.003469,0.006648,"XGBClassifier(base_score=None, booster=None, c..."
7,LGBM,0.6256,0.047998,0.340932,0.084149,"LGBMClassifier(class_weight='balanced', n_esti..."
8,CatBoost,0.946775,0.056,0.003469,0.006533,<catboost.core.CatBoostClassifier object at 0x...


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Metrics to plot
metrics = ["Accuracy", "Precision", "Recall", "F1"]
models = results_original["Model"]

# Create subplot figure: 1 row, 4 columns (one per metric)
fig = make_subplots(rows=1, cols=4, subplot_titles=metrics)

for i, metric in enumerate(metrics):
    fig.add_trace(
        go.Bar(
            x=models,
            y=results_original[metric],
            text=results_original[metric].round(3),
            textposition='auto',
            name=metric
        ),
        row=1, col=i+1
    )

fig.update_layout(
    title_text="Original Data - Model Metrics Comparison",
    showlegend=False,
    height=500, width=1200,
    margin=dict(t=50, b=50)
)

fig.show()


### Training Model on Augmented Data

In [None]:
# ---- Run on Augmented Data ----

y_augmented = aug_label
X_augmented = augmented_data_preprocess

results_augmented = evaluate_models(X_augmented, y_augmented, dataset_name="Augmented Data")

Model Training Started ......
LR Model training completed.
SGD Model training completed.
DT Model training completed.
RF Model training completed.
BRF Model training completed.
AdaBoost Model training completed.
XGB Model training completed.
[LightGBM] [Info] Number of positive: 24070, number of negative: 151930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1706
[LightGBM] [Info] Number of data points in the train set: 176000, number of used features: 524
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LGBM Model training completed.
CatBoost Model training completed.

📊 Model Performance on Augmented Data:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Estimator
0,LR,0.504,0.134222,0.481888,0.209962,"LogisticRegression(class_weight='balanced', ma..."
1,SGD,0.7025,0.132967,0.212861,0.163685,"SGDClassifier(class_weight='balanced', loss='l..."
2,DT,0.313068,0.136586,0.755899,0.231366,DecisionTreeClassifier(class_weight='balanced'...
3,RF,0.863227,0.0,0.0,0.0,"(DecisionTreeClassifier(max_features='sqrt', r..."
4,BRF,0.795091,0.13687,0.093885,0.111374,"(DecisionTreeClassifier(max_features='sqrt', r..."
5,AdaBoost,0.863159,0.0,0.0,0.0,"(DecisionTreeClassifier(max_depth=1, random_st..."
6,XGB,0.629091,0.13592,0.319541,0.190717,"XGBClassifier(base_score=None, booster=None, c..."
7,LGBM,0.457114,0.136942,0.559987,0.220067,"LGBMClassifier(class_weight='balanced', n_esti..."
8,CatBoost,0.614045,0.138152,0.34779,0.197751,<catboost.core.CatBoostClassifier object at 0x...


In [None]:
# Metrics to plot
metrics = ["Accuracy", "Precision", "Recall", "F1"]
models = results_augmented["Model"]

# Create subplot figure: 1 row, 4 columns (one per metric)
fig = make_subplots(rows=1, cols=4, subplot_titles=metrics)

for i, metric in enumerate(metrics):
    fig.add_trace(
        go.Bar(
            x=models,
            y=results_augmented[metric],
            text=results_augmented[metric].round(3),
            textposition='auto',
            name=metric
        ),
        row=1, col=i+1
    )

fig.update_layout(
    title_text="Augmented Data - Model Metrics Comparison",
    showlegend=False,
    height=500, width=1200,
    margin=dict(t=50, b=50)
)

fig.show()


# **Model Comparison and Insights**
📊 Ranking by F1 (Augmented Data)

1. Decision Tree (DT) → F1 = 0.23 (Recall = 0.85, but low Precision = 0.14)

2. SGD → F1 = 0.22 (Recall = 0.77, Precision = 0.14)

3. Logistic Regression (LR) → F1 = 0.21 (Recall = 0.51, Precision = 0.13)

4. LightGBM (LGBM) → F1 = 0.21 (Recall = 0.59, Precision = 0.14)

5. CatBoost → F1 = 0.19 (Recall = 0.34, Precision = 0.12)

6. XGBoost (XGB) → F1 = 0.19 (Recall = 0.31, Precision = 0.12)

7. Random Forest (RF) → F1 ≈ 0.00 (predicts majority)

8. AdaBoost → F1 ≈ 0.00 (predicts majority)

9. Balanced RF (BRF) → F1 ≈ 0.00 (didn’t improve here)


✅ Insights

* DT, SGD, LR, LGBM are your best performers so far.

* DT gives the highest Recall (0.85), which is critical in fraud detection (catching most fraud cases, even if false positives increase).

* SGD & LGBM balance Recall + Precision slightly better.

* RF, AdaBoost, BRF, KNN are unsuitable with current settings.

## **Selecting the Best Model for Hypertuning**
-------------------------------------------------
### ***Hyperparameter Tuning***
* **Definition**: The process of optimizing model hyperparameters that are set before training and control how the learning process works.

* **Difference**: Unlike model parameters (learned during training), hyperparameters must be chosen manually or through search strategies.


* **Why important**: Correct hyperparameter choices improve accuracy, generalization, and efficiency while preventing overfitting or underfitting.

* **Selected models**: Logistic Regression, Random Forest, XGBoost, LightGBM, and CatBoost — chosen as they are highly sensitive to hyperparameter settings.

* **Key parameters tuned**:

  * Tree-based models → depth, number of estimators, minimum samples per split

  * Boosting models → learning rate, regularization terms, scale for class imbalance

  * Logistic Regression/SGD → penalty type, solver, and regularization strength

* **Purpose in project**: To maximize performance metrics (especially Recall and F1-score) for fraud detection while handling class imbalance effectively.


# **Applying Hyperparameter Tunning to Improve Performance**

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# ---------------- Select models ----------------
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from catboost import CatBoostClassifier

In [None]:
# ---------------- Hyperparameter grids ----------------
param_grid = {
    "DT": {
        "max_depth": [3, 5, 7, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5]
    },
    "LR": {
        "C": [0.01, 0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear"],
        "class_weight": ["balanced"]
    },
    "LGBM": {
        "n_estimators": [100, 200],
        "max_depth": [-1, 5, 10],
        "learning_rate": [0.01, 0.05, 0.1],
        "num_leaves": [31, 50, 100],
        "class_weight": ["balanced"]
    },
    "CatBoost": {
        "iterations": [100, 200],
        "depth": [4, 6, 8, 10],
        "learning_rate": [0.01, 0.05, 0.1],
        "l2_leaf_reg": [1, 3, 5, 7],
        "verbose": [0],
        "class_weights": [[1, 6]]  # imbalance handling
    }
}

In [None]:
models = {
    "DT": DecisionTreeClassifier(random_state=2),
    "LR": LogisticRegression(max_iter=1000, random_state=2),
    "LGBM": lgb.LGBMClassifier(random_state=2),
    "CatBoost": CatBoostClassifier(random_state=2)
}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_augmented,y_augmented, test_size=0.2, random_state=42)

In [None]:
# Number of random parameter combinations to try
best_models = {}
n_iter_search = 20  # You can increase/decrease based on dataset size

# ---------------- Run RandomizedSearchCV ----------------
for name, model in models.items():
    print(f"\n🔍 Tuning {name} with RandomizedSearchCV...")

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid[name],
        n_iter=n_iter_search,
        scoring="f1",
        cv=3,
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_

    print(f"✅ Best {name}: {search.best_params_}")


🔍 Tuning DT with RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
✅ Best DT: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}

🔍 Tuning LR with RandomizedSearchCV...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
✅ Best LR: {'solver': 'liblinear', 'penalty': 'l2', 'class_weight': 'balanced', 'C': 0.1}

🔍 Tuning LGBM with RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 24080, number of negative: 151920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1706
[LightGBM] [Info] Number of data points in the train set: 176000, number of used features: 524
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Sta

KeyboardInterrupt: 

## Retraining Decision Tree on Tuned Hyperparameters

In [None]:
from sklearn.metrics import classification_report, f1_score
from lightgbm import LGBMClassifier


print("Retraining Model Start ...")
# ---------------- Retrain with best params ----------------
dt_best = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42
)
dt_best.fit(X_train, y_train)
print("DT Model done training !")

lr_best = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    class_weight="balanced",
    C=0.1,
    max_iter=5000,
    random_state=42
)
lr_best.fit(X_train, y_train)
print("LR Model done training !")

lgbm_best = LGBMClassifier(
    num_leaves=31,
    n_estimators=100,
    max_depth=10,
    learning_rate=0.01,
    class_weight="balanced",
    random_state=42
)
lgbm_best.fit(X_train, y_train)
print("LGBM Model done training !")

# ---------------- Evaluate Models ----------------
models = {
    "Decision Tree": dt_best,
    "Logistic Regression": lr_best,
    "LightGBM": lgbm_best
}

for name, model in models.items():
    print(f"\n🔹 {name} Performance:")
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))
    print("F1 Score:", f1_score(y_test, y_pred))


Retraining Model Start ...
DT Model done training !
LR Model done training !
[LightGBM] [Info] Number of positive: 24080, number of negative: 151920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1706
[LightGBM] [Info] Number of data points in the train set: 176000, number of used features: 524
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LGBM Model done training !

🔹 Decision Tree Performance:
              precision    recall  f1-score   support

           0     0.8628    0.8903    0.8764     37992
           1     0.1312    0.1047    0.1164      6008

    accuracy                         0.7831     44000
   macro avg     0.4970    0.4975    0.4964     44000
weighted avg     0.7629    0.7831

# Building a Deep Learning Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import f1_score, precision_recall_curve
import tensorflow as tf

In [None]:
# ---------------- 1️⃣ Split data ----------------
X_train, X_test, y_train, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, random_state=42, stratify=y_augmented
)

# ---------------- 2️⃣ Compute class weights ----------------
# Balances the loss to give more importance to minority class
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)

Class weights: {0: np.float64(0.5792141117619957), 1: np.float64(3.656003323639385)}


In [None]:
# ---------------- 3️⃣ Define Focal Loss ----------------
def focal_loss(gamma=2., alpha=0.25):
    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        bce = tf.keras.losses.binary_crossentropy(y_true, y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
        modulating_factor = tf.pow(1.0 - p_t, gamma)
        return tf.reduce_mean(alpha_factor * modulating_factor * bce)
    return loss

In [None]:
# ---------------- 4️⃣ Build the model ----------------
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss=focal_loss(gamma=2, alpha=0.25),
    metrics=[Precision(name='precision'), Recall(name='recall')]
)

In [None]:
# ---------------- 5️⃣ Early stopping ----------------
early_stop = EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

In [None]:
# ---------------- 6️⃣ Train the model ----------------
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=1024,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=2
)

Epoch 1/50
138/138 - 6s - 43ms/step - loss: 0.0195 - precision: 0.1209 - recall: 0.0074 - val_loss: 0.0137 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/50
138/138 - 3s - 21ms/step - loss: 0.0146 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.0137 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/50
138/138 - 3s - 21ms/step - loss: 0.0145 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.0137 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/50
138/138 - 4s - 32ms/step - loss: 0.0144 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.0137 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 5/50
138/138 - 3s - 22ms/step - loss: 0.0142 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.0138 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 6/50
138/138 - 3s - 20ms/step - loss: 0.0142 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.0138 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00


In [None]:
# ---------------- 7️⃣ Predict probabilities ----------------
y_probs = model.predict(X_test)

[1m1375/1375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


In [None]:
# ---------------- 8️⃣ Optimize threshold ----------------
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Best threshold for F1-score: {best_threshold:.3f}")


Best threshold for F1-score: 0.145


In [None]:
# ---------------- 9️⃣ Apply threshold ----------------
y_pred = (y_probs > best_threshold).astype(int)

# ---------------- 🔟 Evaluate ----------------
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8632    0.9999    0.9266     37982
           1     0.0000    0.0000    0.0000      6018

    accuracy                         0.8632     44000
   macro avg     0.4316    0.5000    0.4633     44000
weighted avg     0.7452    0.8632    0.7998     44000



## **Using SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:

# ---------------- 1️⃣ Apply SMOTE ----------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.shape)
print("After SMOTE:", y_train_res.shape)

Before SMOTE: (176000,)
After SMOTE: (303860,)


In [None]:
# ---------------- 3️⃣ Train models ----------------
print("Starting Model Training .....")
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt_model.fit(X_train_res, y_train_res)
print("DT Model training completed ! \n")

# Logistic Regression
lr_model = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
lr_model.fit(X_train_res, y_train_res)
print("Logistic Regression training completed! \n")

# SGD Classifier
sgd_model = SGDClassifier(loss='log_loss', class_weight='balanced', random_state=42)
sgd_model.fit(X_train_res, y_train_res)
print("SGD Model training completed ! \n")

#lgbm Classifier
lgbm_best = LGBMClassifier(
    num_leaves=31,
    n_estimators=100,
    max_depth=10,
    learning_rate=0.01,
    class_weight="balanced",
    random_state=42
)
lgbm_best.fit(X_train_res, y_train_res)
print("LGBM Model done training !")

print("Model Training Ended....")

Starting Model Training .....
DT Model training completed ! 

Logistic Regression training completed! 

SGD Model training completed ! 

[LightGBM] [Info] Number of positive: 151930, number of negative: 151930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.805832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 132981
[LightGBM] [Info] Number of data points in the train set: 303860, number of used features: 524
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LGBM Model done training !
Model Training Ended....


In [None]:
# ---------------- 4️⃣ Evaluate models ----------------
for name, model in zip(['Decision Tree', 'Logistic Regression', 'SGD', 'LGBM'],
                       [dt_model, lr_model, sgd_model, lgbm_best]):
    y_pred = model.predict(X_test)
    print(f"\n{name} Performance:")
    print(classification_report(y_test, y_pred, digits=4))
    print("F1 Score:", f1_score(y_test, y_pred))


Decision Tree Performance:
              precision    recall  f1-score   support

           0     0.8628    0.8629    0.8628     37982
           1     0.1339    0.1338    0.1338      6018

    accuracy                         0.7632     44000
   macro avg     0.4983    0.4983    0.4983     44000
weighted avg     0.7631    0.7632    0.7631     44000

F1 Score: 0.13382096251350678

Logistic Regression Performance:
              precision    recall  f1-score   support

           0     0.8620    0.5088    0.6399     37982
           1     0.1355    0.4860    0.2120      6018

    accuracy                         0.5057     44000
   macro avg     0.4988    0.4974    0.4259     44000
weighted avg     0.7627    0.5057    0.5814     44000

F1 Score: 0.21195652173913043

SGD Performance:
              precision    recall  f1-score   support

           0     0.8647    0.3406    0.4887     37982
           1     0.1375    0.6635    0.2278      6018

    accuracy                         0.384

# Saving Model with Best Result

In [None]:
import joblib

# ✅ Save best model (on augmented data)
best_model_augmented = get_best_model(results_augmented, metric="F1")
joblib.dump(best_model_augmented, "/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Model/best_model_augmented.pkl")
print("✅ Best augmented model saved as best_model_augmented.pkl")

# ✅ Save preprocessing pipeline
joblib.dump(pipeline, "/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Model/preprocess_pipeline.pkl")
print("✅ Preprocessing pipeline saved as preprocess_pipeline.pkl")



✅ Best model: DT (F1 = 0.2314)
✅ Best augmented model saved as best_model_augmented.pkl
✅ Preprocessing pipeline saved as preprocess_pipeline.pkl
