In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

path = '/content/drive/MyDrive/Colab_Notebooks/Handling Unbalanced Datasets/data/creditcard.csv'

def load_and_split_data(path):
  df = pd.read_csv(path)
  X = df.drop('Class', axis=1)
  y = df['Class']

  # Standardize 'Amount'
  X['Amount'] = StandardScaler().fit_transform(X[['Amount']])

  return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

from imblearn.over_sampling import SMOTE
import xgboost as xgb
# from preprocessing import load_and_split_data

def train_with_smote():
    X_train, X_test, y_train, y_test = load_and_split_data(path)
    X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)

    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_res, y_res)
    return model, X_test, y_test

import xgboost as xgb
# from preprocessing import load_and_split_data

def train_with_weight():
    X_train, X_test, y_train, y_test = load_and_split_data(path)
    scale = (y_train == 0).sum() / (y_train == 1).sum()

    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale)
    model.fit(X_train, y_train)
    return model, X_test, y_test


from sklearn.metrics import classification_report,confusion_matrix

def evaluate(model, X_test, y_test, label):
    y_pred = model.predict(X_test)
    print(f"\n{label} Results:\n")
    print(classification_report(y_test, y_pred, digits=4))
    print(confusion_matrix(y_test, y_pred))



# from smote_pipeline import train_with_smote
# from weight_pipeline import train_with_weight
# from evaluate import evaluate

model_smote, X_test, y_test = train_with_smote()
evaluate(model_smote, X_test, y_test, "SMOTE")

model_weight, X_test, y_test = train_with_weight()
evaluate(model_weight, X_test, y_test, "scale_pos_weight")


Parameters: { "use_label_encoder" } are not used.




SMOTE Results:

              precision    recall  f1-score   support

           0     0.9998    0.9996    0.9997     56864
           1     0.7727    0.8673    0.8173        98

    accuracy                         0.9993     56962
   macro avg     0.8862    0.9335    0.9085     56962
weighted avg     0.9994    0.9993    0.9994     56962

[[56839    25]
 [   13    85]]


Parameters: { "use_label_encoder" } are not used.




scale_pos_weight Results:

              precision    recall  f1-score   support

           0     0.9997    0.9998    0.9998     56864
           1     0.8817    0.8367    0.8586        98

    accuracy                         0.9995     56962
   macro avg     0.9407    0.9183    0.9292     56962
weighted avg     0.9995    0.9995    0.9995     56962

[[56853    11]
 [   16    82]]
