In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
path = '/content/drive/MyDrive/Colab_Notebooks/Handling Unbalanced Datasets/data/creditcard.csv'

def load_and_split_data(path):
  df = pd.read_csv(path)
  X = df.drop('Class', axis=1)
  y = df['Class']

  # Standardize 'Amount'
  X['Amount'] = StandardScaler().fit_transform(X[['Amount']])

  return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
# from preprocessing import load_and_split_data

def train_with_smote(path):
    X_train, X_test, y_train, y_test = load_and_split_data(path)
    X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # Param grid
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200],
        'subsample': [0.8, 1.0]
    }

    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    grid = GridSearchCV(model, param_grid, scoring='f1', cv=3, verbose=1)
    grid.fit(X_res, y_res)

    print("Best params (SMOTE):", grid.best_params_)
    return grid.best_estimator_, X_test, y_test


In [4]:
import xgboost as xgb
# from preprocessing import load_and_split_data

def train_with_weight(path):
    X_train, X_test, y_train, y_test = load_and_split_data(path)
    scale = (y_train == 0).sum() / (y_train == 1).sum()
    # Param grid
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200],
        'subsample': [0.8, 1.0]
    }
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale)
    grid = GridSearchCV(model, param_grid, scoring='f1', cv=3, verbose=1)
    grid.fit(X_train, y_train)
    print("Best params (scale_pos_weight):", grid.best_params_)
    return grid.best_estimator_, X_test, y_test


In [5]:
from sklearn.metrics import classification_report,confusion_matrix

def evaluate(model, X_test, y_test, label):
    y_pred = model.predict(X_test)
    print(f"\n{label} Results:\n")
    print(classification_report(y_test, y_pred, digits=4))
    print('Confusion Matrix:\n\n', confusion_matrix(y_test, y_pred))



In [6]:
# from smote_pipeline import train_with_smote
# from weight_pipeline import train_with_weight
# from evaluate import evaluate

model_smote, X_test, y_test = train_with_smote(path)
evaluate(model_smote, X_test, y_test, "SMOTE")

model_weight, X_test, y_test = train_with_weight(path)
evaluate(model_weight, X_test, y_test, "scale_pos_weight")


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best params (SMOTE): {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}

SMOTE Results:

              precision    recall  f1-score   support

           0     0.9998    0.9995    0.9996     56864
           1     0.7589    0.8673    0.8095        98

    accuracy                         0.9993     56962
   macro avg     0.8793    0.9334    0.9046     56962
weighted avg     0.9994    0.9993    0.9993     56962

Confusion Matrix:

 [[56837    27]
 [   13    85]]
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best params (scale_pos_weight): {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}

scale_pos_weight Results:

              precision    recall  f1-score   support

           0     0.9997    0.9997    0.9997     56864
           1     0.8438    0.8265    0.8351        98

    accuracy                         0.9994     56962
   macro avg     0.9217    0