In [31]:
#Import required libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from collections import Counter

In [5]:
#import data
url = 'https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv'
data = pd.read_csv(url)

In [6]:
#Assigning dependent and independent variables
x = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [12]:
#split the data for training and testing
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=0)

In [13]:
#Normalise the data
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_train = std.fit_transform(xtrain)
x_test = std.transform(xtest)

In [15]:
print('Before Sampling',Counter(ytrain))

Before Sampling Counter({0: 27953, 1: 3694})


In [16]:
#Define under sampling
undersampler = RandomUnderSampler(sampling_strategy='majority')

In [17]:
#Resampling
x_train_under,y_train_under = undersampler.fit_resample(x_train,ytrain)

In [18]:
print('Before Sampling',Counter(y_train_under))

Before Sampling Counter({0: 3694, 1: 3694})


In [23]:
#Import SVM libraries
from sklearn.svm import SVC
model = SVC()
clf_SVM_under = model.fit(x_train_under,y_train_under)

In [24]:
#Prediction
pred_under = clf_SVM_under.predict(x_test)

In [27]:
#Model evaluation
from sklearn.metrics import roc_auc_score
print("ROC AUC Score for undersampled data: ",roc_auc_score(ytest,pred_under));print()

from sklearn.metrics import confusion_matrix,accuracy_score
print('Accuracy of the model after under sampling:',accuracy_score(ytest,pred_under))
print('Confusion Matrix after under sampling:\n',confusion_matrix(ytest,pred_under))

ROC AUC Score for undersampled data:  0.8250282404047447

Accuracy of the model after under sampling: 0.8058094957239752
Confusion Matrix after under sampling:
 [[9574 2395]
 [ 239 1356]]


# Oversampling

In [36]:
#Defining oversampling and resampling

from imblearn.over_sampling import SMOTE
over_sampler = SMOTE(sampling_strategy='minority')
x_train_over,y_train_over = over_sampler.fit_resample(x_train,ytrain)

In [37]:
#Import SVM libraries

from sklearn.svm import SVC
model = SVC()
clf_SVM_over = model.fit(x_train_over,y_train_over)

In [40]:
#Prediction
pred_over = clf_SVM_over.predict(x_test)

In [41]:
#Model evaluation
from sklearn.metrics import roc_auc_score
print("ROC AUC Score for oversampling data: ",roc_auc_score(ytest,pred_over));print()

from sklearn.metrics import confusion_matrix,accuracy_score
print('Accuracy of the model after over sampling:',accuracy_score(ytest,pred_over))
print('Confusion Matrix after over sampling:\n',confusion_matrix(ytest,pred_over))

ROC AUC Score for oversampling data:  0.8157216487420088

Accuracy of the model after over sampling: 0.8224712474196402
Confusion Matrix after over sampling:
 [[9869 2100]
 [ 308 1287]]


# Cross Validation

In [46]:
#normalise the data
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_norm = std.fit_transform(x)

In [66]:
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score
from sklearn.tree import DecisionTreeClassifier
kfold_cv = KFold(10)
strat_cv = StratifiedKFold(n_splits=10)
model = DecisionTreeClassifier()
result_Kfold = cross_val_score(model,x_norm,y,cv = kfold_cv)
result_strat_fold = cross_val_score(model,x_norm,y,cv=strat_cv)

In [67]:
print('K-FOLD ACCURACY',np.mean(result_Kfold));print()
print('STRATIFIED-CV',np.mean(result_strat_fold))

K-FOLD ACCURACY 0.7973912101773619

STRATIFIED-CV 0.620396325330677
