<a href="https://colab.research.google.com/github/PrettyCharity/Machine_Learning_Practice/blob/main/LightGBM_Classification_and_SMOTE_for_Credit_Card_Fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Installing and upgrading packages
!pip install lightgbm --upgrade
!pip install optuna

In [39]:
#@title Preparing the data
# Importing libraries
import pandas as pd
import numpy as np
pd.set_option("display.precision", 4)
# Metrics and tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# Optuna packages
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

# Loading the data
# [https://www.kaggle.com/mlg-ulb/creditcardfraud]
df = pd.read_csv('creditcard.csv') 
df.drop('Time', axis = 1, inplace = True)

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# Scaling the 'Amount' column
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train['Amount'] = sc.fit_transform(X_train['Amount'].to_numpy().reshape(-1, 1))
X_test['Amount'] = sc.transform(X_test['Amount'].to_numpy().reshape(-1, 1))

In [40]:
X_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
83225,-1.6486,1.2281,1.3702,-1.7355,-0.0295,-0.4841,0.9186,-0.4387,0.9821,1.2416,...,0.3842,-0.2181,-0.2035,-0.213,0.0114,-0.3045,0.6321,-0.263,-0.0999,-0.196
52800,-0.2348,-0.4933,1.2367,-2.3388,-1.1767,0.8857,-1.961,-2.3634,-2.6948,0.3602,...,0.3647,-1.4954,-0.0831,0.0746,-0.3473,0.5419,-0.4333,0.0893,0.212,-0.1072
21293,1.1346,-0.7745,-0.1634,-0.5334,-0.6046,-0.2445,-0.2127,0.0408,-1.1366,0.792,...,-0.3965,-0.6845,-1.8553,0.172,-0.3878,-0.063,0.2451,-0.0612,0.0122,0.0867
133600,0.0695,1.0178,1.0331,1.3844,0.2232,-0.3108,0.5973,-0.1277,-0.7015,0.0707,...,0.1488,0.097,0.37,-0.2193,-0.1249,-0.0497,-0.1129,0.1144,0.0661,-0.3068
38225,-0.1994,0.6101,-0.1144,0.2566,2.2908,4.0085,-0.1235,1.0384,-0.0758,0.0305,...,0.293,-0.0197,0.1655,-0.081,1.0207,-0.3007,-0.2696,0.4818,0.2541,-0.26


In [41]:
#@title Applying SMOTE to balance the data 
from collections import Counter

print('Frequency of each class before SMOTE')
print('Training set: {}'.format(Counter(y_train)))
print('Testing set: {}'.format(Counter(y_test)))

from imblearn.over_sampling import SMOTE 

# add artificial samples to training data only
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print('\nFrequency of each class after SMOTE')
print('Training set: {}'.format(Counter(y_train_sm)))
print('Testing set: {}'.format(Counter(y_test)))

Frequency of each class before SMOTE
Training set: Counter({0: 213226, 1: 379})
Testing set: Counter({0: 71089, 1: 113})

Frequency of each class after SMOTE
Training set: Counter({0: 213226, 1: 213226})
Testing set: Counter({0: 71089, 1: 113})


In [42]:
#@title Optuna Lightgbm tuner
def tuner(X_train, X_test, y_train, y_test):
  # Setting up the data for the model
  dtrain = lgb.Dataset(X_train, label = y_train)
  dtest = lgb.Dataset(X_test, label = y_test)

  # Parameters for classification
  params = {
      'objective' : 'binary',
      'metric' : 'binary_logloss',
      'verbose' : -1,
      'boosting_type' : 'gbdt'
  }

  # Training the model
  model = lgb.train(
      params,
      dtrain,
      valid_sets = [dtrain, dtest],
      callbacks = [early_stopping(100), log_evaluation(100)]
  )

  # Results
  y_pred = np.rint(model.predict(X_test, 
                                     num_iteration = model.best_iteration))
  score = f1_score(y_test, y_pred)
  return score

In [None]:
f1_score_no_smote = tuner(X_train.values, X_test.values, y_train.values, y_test.values)

In [None]:
f1_score_with_smote = tuner(X_train_sm.values, X_test.values, y_train_sm.values, y_test.values)

In [58]:
#@title Results
results = pd.DataFrame(np.array([f1_score_no_smote, f1_score_with_smote]),
                       columns = ['LightGBM'],
                       index = ['f1 score no SMOTE', 'f1 score with SMOTE'])
results.style

Unnamed: 0,LightGBM
f1 score no SMOTE,0.8558
f1 score with SMOTE,0.8398
