# Handling Imbalanced Data

In [3]:
import dagshub
# Point this session at the DagsHub repository that hosts the experiments;
# mlflow=True requests DagsHub's MLflow integration for the runs logged below.
dagshub.init(
    repo_owner='Pranay5519',
    repo_name='fraud_detection',
    mlflow=True,
)


In [None]:
# ==============================
# FRAUD MODEL TRAINING WITH MULTIPLE IMBALANCE TECHNIQUES (MLFLOW)
# ==============================

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score , classification_report
)

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# ------------------------------
# 1. LOAD DATA
# ------------------------------

# Cleaned transactions CSV (absolute path on the author's machine).
DATA_PATH = r"D:\accredian\data\cleaned_fraud.csv"
TARGET_COLUMN = 'isFraud'

df = pd.read_csv(DATA_PATH)

# ------------------------------
# 2. ENCODE CATEGORICAL COLUMN
# ------------------------------

# One-hot encode `type`; drop_first omits the first category's indicator column.
df = pd.get_dummies(df, columns=['type'], drop_first=True)

# ------------------------------
# 3. DEFINE FEATURES AND TARGET
# ------------------------------

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# ------------------------------
# 4. TRAIN-TEST SPLIT
# ------------------------------

# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------------------
# 5. SCALE FEATURES
# ------------------------------

# The scaler is fitted on the training split only and then reused for the
# test split, so no test-set statistics influence the transformation.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------
# 6. EXPERIMENT CONFIG
# ------------------------------

# Names understood by run_imbalanced_experiment(); one MLflow run per entry.
imbalance_methods = ["none", "oversampling", "adasyn", "undersampling", "smote_enn"]

mlflow.set_tracking_uri("https://dagshub.com/Pranay5519/fraud_detection.mlflow")
mlflow.set_experiment("Fraud_Imbalance_Comparison")

# ------------------------------
# 7. TRAINING FUNCTION
# ------------------------------

def run_imbalanced_experiment(imbalance_method):
    """Train and evaluate a logistic-regression model under one imbalance strategy.

    The scaled training split (module-level ``X_train_scaled`` / ``y_train``)
    is optionally resampled, a ``LogisticRegression`` is fitted on the result,
    and precision, recall, F1, ROC-AUC and the flattened classification report
    computed on ``X_test_scaled`` / ``y_test`` are logged to a new MLflow run
    named ``imbalance_<imbalance_method>``. The fitted model is logged too.

    Args:
        imbalance_method: One of ``"none"`` (no resampling), ``"oversampling"``
            (SMOTE), ``"adasyn"``, ``"undersampling"`` (RandomUnderSampler) or
            ``"smote_enn"``.

    Raises:
        ValueError: If ``imbalance_method`` is not one of the names above.
            Previously an unknown name silently trained without resampling
            and was logged under the misspelled name.
    """
    # Factories (not instances) so only the requested sampler is constructed.
    sampler_factories = {
        "none": None,
        "oversampling": lambda: SMOTE(random_state=42),
        "adasyn": lambda: ADASYN(random_state=42),
        "undersampling": lambda: RandomUnderSampler(random_state=42),
        "smote_enn": lambda: SMOTEENN(random_state=42),
    }
    if imbalance_method not in sampler_factories:
        raise ValueError(
            f"Unknown imbalance_method {imbalance_method!r}; "
            f"expected one of {sorted(sampler_factories)}"
        )

    # ---------- Resampling ----------
    # No defensive copy: nothing below assigns into these arrays, and the
    # resampler hands back its own outputs which rebind the local names.
    X_train_vec, y_train_vec = X_train_scaled, y_train
    make_sampler = sampler_factories[imbalance_method]
    if make_sampler is not None:
        X_train_vec, y_train_vec = make_sampler().fit_resample(
            X_train_vec, y_train_vec
        )

    # ---------- MLflow Run ----------
    with mlflow.start_run(run_name=f"imbalance_{imbalance_method}"):

        mlflow.log_param("imbalance_method", imbalance_method)
        mlflow.log_param("train_samples", len(y_train_vec))

        # ---------- Model ----------
        # NOTE(review): class_weight="balanced" is applied for every method,
        # including "none" and the already-resampled ones, so the comparison
        # is "resampling + class weighting" rather than resampling alone.
        model = LogisticRegression(
            max_iter=1000,
            solver="lbfgs",
            class_weight="balanced"
        )

        model.fit(X_train_vec, y_train_vec)

        # ---------- Predictions ----------
        y_test_proba = model.predict_proba(X_test_scaled)[:, 1]
        y_test_pred = (y_test_proba >= 0.5).astype(int)

        # ---------- Metrics ----------
        precision = precision_score(y_test, y_test_pred, zero_division=0)
        recall = recall_score(y_test, y_test_pred, zero_division=0)
        f1 = f1_score(y_test, y_test_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_test_proba)

        metrics_to_log = {
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "roc_auc": roc_auc,
        }

        # Flatten the per-class / averaged report into "<label>_<metric>" keys.
        # zero_division=0 matches the scalar metrics above, so a class with no
        # predicted samples yields 0 instead of a warning.
        classification_rep = classification_report(
            y_test, y_test_pred, output_dict=True, zero_division=0
        )
        for label, label_metrics in classification_rep.items():
            # The "accuracy" entry is a bare float, not a dict; skip it.
            if isinstance(label_metrics, dict):
                for metric, value in label_metrics.items():
                    metrics_to_log[f"{label}_{metric}"] = value

        # ---------- Log Metrics ----------
        # One batched call instead of one request per metric to the remote
        # tracking server.
        mlflow.log_metrics(metrics_to_log)

        # ---------- Log Model ----------
        mlflow.sklearn.log_model(model, "model")

        print(
            f"[{imbalance_method.upper()}] "
            f"Precision={precision:.4f} | Recall={recall:.4f} | ROC-AUC={roc_auc:.4f}"
        )

# ------------------------------
# 8. RUN ALL EXPERIMENTS
# ------------------------------

# Run the techniques one after another; each call opens its own MLflow run,
# so every technique is recorded as a separate run.
for method in imbalance_methods:
    run_imbalanced_experiment(method)


2026/01/16 22:20:16 INFO mlflow.tracking.fluent: Experiment with name 'Fraud_Imbalance_Comparison' does not exist. Creating a new experiment.


[NONE] Precision=0.0138 | Recall=0.9446 | ROC-AUC=0.9749
🏃 View run imbalance_none at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2/runs/b622e8a950204f4daf33cad2788b098f
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2




[OVERSAMPLING] Precision=0.0140 | Recall=0.9422 | ROC-AUC=0.9755
🏃 View run imbalance_oversampling at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2/runs/7d7820924e9149b680a73d0f8659c160
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2




[ADASYN] Precision=0.0073 | Recall=0.9897 | ROC-AUC=0.9794
🏃 View run imbalance_adasyn at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2/runs/9d3f185c71ac4af88660c5b48d969b5e
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2




[UNDERSAMPLING] Precision=0.0045 | Recall=0.8990 | ROC-AUC=0.9166
🏃 View run imbalance_undersampling at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2/runs/70787e92cba845d495183d47dbcc2852
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2




[SMOTE_ENN] Precision=0.0134 | Recall=0.9440 | ROC-AUC=0.9751
🏃 View run imbalance_smote_enn at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2/runs/7cfbeae49e5f4e81b834756f63b4aa76
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/2


# Decision Tree

In [5]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# ------------------------------
# 1. LOAD DATA
# ------------------------------

# Cleaned transactions CSV (absolute path on the author's machine).
DATA_PATH = r"D:\accredian\data\cleaned_fraud.csv"
TARGET_COLUMN = 'isFraud'

df = pd.read_csv(DATA_PATH)

# ------------------------------
# 2. ENCODE CATEGORICAL COLUMN
# ------------------------------

# One-hot encode `type`; drop_first omits the first category's indicator column.
df = pd.get_dummies(df, columns=['type'], drop_first=True)

# ------------------------------
# 3. DEFINE X AND y
# ------------------------------

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# ------------------------------
# 4. TRAIN-TEST SPLIT
# ------------------------------

# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------------------
# 5. SCALE FEATURES
# ------------------------------
# (Not required for trees, but kept for consistency)

# Fitted on the training split only, then reused for the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------
# 6. SMOTE (TRAIN ONLY)
# ------------------------------

# Only the training split is resampled; the test split keeps its original
# class distribution.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# ------------------------------
# 7. MLFLOW + OPTUNA SETUP
# ------------------------------

mlflow.set_tracking_uri("https://dagshub.com/Pranay5519/fraud_detection.mlflow")
mlflow.set_experiment("Fraud_DecisionTree_Optuna")

# Enable MLflow's scikit-learn autologging for the estimators fitted below.
mlflow.sklearn.autolog()

# ------------------------------
# 8. OPTUNA OBJECTIVE FUNCTION
# ------------------------------

def objective(trial):
    """Optuna objective: fit one decision tree and return its fraud-class recall.

    Hyperparameters are drawn from ``trial``, the tree is fitted on the
    SMOTE-balanced training data (``X_train_bal`` / ``y_train_bal``) inside a
    nested MLflow run, and every entry of the classification report on the
    test split is logged as ``<label>_<metric>``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Recall of class ``1`` on the test split, which the study maximizes.
    """
    # Sampling order is kept fixed: max_depth, min_samples_split,
    # min_samples_leaf, criterion, class_weight.
    depth = trial.suggest_int("max_depth", 3, 30)
    split_min = trial.suggest_int("min_samples_split", 2, 50)
    leaf_min = trial.suggest_int("min_samples_leaf", 1, 20)
    split_criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    weights = trial.suggest_categorical(
        "class_weight",
        [{0: 1, 1: 1}, {0: 2, 1: 1}, {0: 5, 1: 1}],
    )

    with mlflow.start_run(nested=True):

        tree = DecisionTreeClassifier(
            max_depth=depth,
            min_samples_split=split_min,
            min_samples_leaf=leaf_min,
            criterion=split_criterion,
            class_weight=weights,
            random_state=42,
        )
        tree.fit(X_train_bal, y_train_bal)

        # NOTE(review): trials are scored on the test split, so the selected
        # hyperparameters are tuned against the data later used for reporting.
        predictions = tree.predict(X_test_scaled)

        # ---------- Classification Report ----------
        report = classification_report(
            y_test, predictions, output_dict=True, zero_division=0
        )

        for label, scores in report.items():
            # Scalar entries (e.g. overall accuracy) are skipped; only the
            # per-label / averaged dicts are flattened.
            if not isinstance(scores, dict):
                continue
            for score_name, score_value in scores.items():
                mlflow.log_metric(f"{label}_{score_name}", score_value)

        # Optuna needs ONE value -> use fraud-class recall
        fraud_recall = report["1"]["recall"]

    return fraud_recall

# ------------------------------
# 9. RUN OPTUNA STUDY
# ------------------------------

# Parent MLflow run: holds the best hyperparameters and the final model's
# metrics; each Optuna trial is recorded as a nested run by objective().
with mlflow.start_run(run_name="DecisionTree_Run"):

    # Seed the sampler so re-running the cell explores the same trials and
    # selects the same best parameters, matching the fixed random_state=42
    # used for the split, SMOTE and the trees. TPE is Optuna's default
    # sampler, so only the seed changes.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=10)

    best_params = study.best_params

    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})

    # --------------------------
    # 10. TRAIN FINAL MODEL
    # --------------------------

    # best_params only contains the sampled hyperparameters, so random_state
    # has to be supplied again here.
    final_model = DecisionTreeClassifier(**best_params, random_state=42)
    final_model.fit(X_train_bal, y_train_bal)

    y_test_pred = final_model.predict(X_test_scaled)

    classification_rep = classification_report(
        y_test, y_test_pred, output_dict=True, zero_division=0
    )

    # Flatten the report into "final_<label>_<metric>" metrics; scalar
    # entries such as overall accuracy are skipped.
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"final_{label}_{metric}", value)

    print("Best Params:", best_params)
 

  from .autonotebook import tqdm as notebook_tqdm
2026/01/16 22:58:56 INFO mlflow.tracking.fluent: Experiment with name 'Fraud_DecisionTree_Optuna' does not exist. Creating a new experiment.
[I 2026-01-16 22:58:57,865] A new study created in memory with name: no-name-a71b3b27-f763-48e2-9f03-b5809e0f6f1d


🏃 View run caring-gnu-131 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/e1b21b28f5374d2fbb1267146a3080ab
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:00:41,256] Trial 0 finished with value: 0.9007912355447353 and parameters: {'max_depth': 29, 'min_samples_split': 15, 'min_samples_leaf': 16, 'criterion': 'entropy', 'class_weight': {0: 1, 1: 1}}. Best is trial 0 with value: 0.9007912355447353.


🏃 View run sassy-bird-345 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/9309a7617f664f65bf5b632d248ba030
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:02:11,442] Trial 1 finished with value: 0.8630553864881315 and parameters: {'max_depth': 18, 'min_samples_split': 35, 'min_samples_leaf': 11, 'criterion': 'gini', 'class_weight': {0: 5, 1: 1}}. Best is trial 0 with value: 0.9007912355447353.


🏃 View run unleashed-swan-160 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/783ffcacb1b84b1e84d14e17f7591fde
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:03:33,893] Trial 2 finished with value: 0.9111381618989653 and parameters: {'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 11, 'criterion': 'gini', 'class_weight': {0: 1, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run unruly-frog-859 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/03b53a9c120d41f9bdcce38daf2526aa
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:04:52,789] Trial 3 finished with value: 0.8831405964698722 and parameters: {'max_depth': 9, 'min_samples_split': 50, 'min_samples_leaf': 7, 'criterion': 'gini', 'class_weight': {0: 5, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run incongruous-worm-53 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/fe225bdde1db4d3da03dc83d0567728d
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:06:35,705] Trial 4 finished with value: 0.8490566037735849 and parameters: {'max_depth': 27, 'min_samples_split': 25, 'min_samples_leaf': 6, 'criterion': 'entropy', 'class_weight': {0: 5, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run serious-lark-861 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/c94935ce54704cdb9f843a3906afec6c
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:08:15,810] Trial 5 finished with value: 0.8861838101034692 and parameters: {'max_depth': 19, 'min_samples_split': 20, 'min_samples_leaf': 10, 'criterion': 'entropy', 'class_weight': {0: 1, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run bittersweet-hawk-48 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/921389f309184eaf89196a3b77b1c838
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:10:00,848] Trial 6 finished with value: 0.9007912355447353 and parameters: {'max_depth': 29, 'min_samples_split': 32, 'min_samples_leaf': 15, 'criterion': 'entropy', 'class_weight': {0: 1, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run dashing-mule-981 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/0d060e1404284d0b9ddd72adfcda71b9
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:11:42,194] Trial 7 finished with value: 0.8581862446743761 and parameters: {'max_depth': 30, 'min_samples_split': 43, 'min_samples_leaf': 19, 'criterion': 'entropy', 'class_weight': {0: 5, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run kindly-stork-272 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/2de16b8f7e444634a2de4e935821b811
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:12:36,820] Trial 8 finished with value: 0.7334144856968959 and parameters: {'max_depth': 3, 'min_samples_split': 48, 'min_samples_leaf': 4, 'criterion': 'entropy', 'class_weight': {0: 2, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


🏃 View run lyrical-swan-849 at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/fe332257ec59466e938c661ff3344038
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3


[I 2026-01-16 23:14:05,750] Trial 9 finished with value: 0.877054169202678 and parameters: {'max_depth': 15, 'min_samples_split': 20, 'min_samples_leaf': 17, 'criterion': 'gini', 'class_weight': {0: 5, 1: 1}}. Best is trial 2 with value: 0.9111381618989653.


Best Params: {'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 11, 'criterion': 'gini', 'class_weight': {0: 1, 1: 1}}
🏃 View run DecisionTree_Run at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3/runs/0c3598119cbe4b94a2eb68dc2636062f
🧪 View experiment at: https://dagshub.com/Pranay5519/fraud_detection.mlflow/#/experiments/3
