In [1]:
#Import libraries
import warnings
warnings.simplefilter('ignore')

In [2]:
#Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import train_test_split

In [3]:
#Import data
url = 'https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv'
df = pd.read_csv(url)

In [4]:
#Separating independent and dependent variables
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [5]:
#split train and test
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3)

In [6]:
#Normalaise the data
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_train = std.fit_transform(xtrain)
x_test = std.transform(xtest)

In [7]:
#Summarise the class distribution
print('Before sampling:',Counter(ytrain))

Before sampling: Counter({0: 27968, 1: 3679})


In [8]:
#Define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority')

In [9]:
#Fit and apply the transform
x_train_under,y_train_under = undersample.fit_resample(x_train,ytrain)

In [10]:
#Summarise the class distribution
print('After sampling:',Counter(y_train_under))

After sampling: Counter({0: 3679, 1: 3679})


In [11]:
#Import SVM libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score

model = SVC()
clf_under = model.fit(x_train_under,y_train_under)
pred_under = clf_under.predict(x_test)

print("ROC AUC Score for undersampled data: ",roc_auc_score(ytest,pred_under))

ROC AUC Score for undersampled data:  0.8294225691236696


In [19]:
from sklearn.metrics import confusion_matrix,accuracy_score
print('Accuracy of the model after under sampling:',accuracy_score(ytest,pred_under))
print('Confusion Matrix after under sampling:\n',confusion_matrix(ytest,pred_under))

Accuracy of the model after under sampling: 0.8054408728988499
Confusion Matrix after under sampling:
 [[9539 2415]
 [ 224 1386]]


# Over Sampling

In [22]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')

x_train_smote,y_train_smote = smote.fit_resample(x_train,ytrain)

In [13]:
#Import SVM libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score

model = SVC()
clf_smote = model.fit(x_train_smote,y_train_smote)
pred_smote = clf_smote.predict(x_test)

print("ROC AUC Score for oversampled data: ",roc_auc_score(ytest,pred_smote))

ROC AUC Score for oversampled data:  0.8235714129837254


In [15]:
ytest.value_counts()

0    11954
1     1610
Name: y, dtype: int64

In [20]:
from sklearn.metrics import confusion_matrix,accuracy_score
print('Accuracy of the model after over sampling:',accuracy_score(ytest,pred_smote))
print('Confusion Matrix after over sampling:\n',confusion_matrix(ytest,pred_smote))

Accuracy of the model after over sampling: 0.8273370687112946
Confusion Matrix after over sampling:
 [[9904 2050]
 [ 292 1318]]


# Both Oversampling and undersampling by using pipeline

In [33]:
#Part 1
#Import necessary libraries

import pandas as pd
import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score

In [34]:
#Import data
url = 'https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv'
df = pd.read_csv(url)

In [35]:
#Separating the independent and dependent variables
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [39]:
# Define pipeline
model = SVC()
over = SMOTE(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o',over),('u',under),('model',model)]
pipeline = Pipeline()