In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import sqrt
import random

In [2]:
# Read the CSV file into a DataFrame.
data = pd.read_csv('creditcard.csv')
# Preview the first three rows (rendered as the cell's output).
data.head(n=3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [3]:
from sklearn.preprocessing import StandardScaler

# Standardise the raw 'Amount' column so that it is on a scale similar to the
# other feature columns (standardisation or normalisation both achieve this).
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split done in a later cell — confirm that this is acceptable.
amount_as_column = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = StandardScaler().fit_transform(amount_as_column)

# The unscaled 'Amount' column and the 'Time' column are dropped from the frame.
data = data.drop(columns=['Time', 'Amount'])
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,normAmount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.244964
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.342475
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.160686
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.140534
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.073403


In [4]:
## Split the frame into a feature matrix X (every column except 'Class')
## and a single-column label matrix y (the 'Class' column only).
is_label_column = data.columns == 'Class'
X = np.array(data.iloc[:, ~is_label_column])
y = np.array(data.iloc[:, is_label_column])

# Report the resulting shapes.
print('Shape of X: {}'.format(X.shape))
print('Shape of y: {}'.format(y.shape))

Shape of X: (284807, 29)
Shape of y: (284807, 1)


In [5]:
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

# Split the dataset into a training part (70%) and a test part (30%);
# random_state is fixed so the split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each of the four resulting arrays.
for caption, part in (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
):
    print(caption, part.shape)

Number transactions X_train dataset:  (199364, 29)
Number transactions y_train dataset:  (199364, 1)
Number transactions X_test dataset:  (85443, 29)
Number transactions y_test dataset:  (85443, 1)


In [6]:
# Class balance of the training labels before any oversampling.
# y_train is a single-column matrix, so a column-wise sum of the boolean mask
# yields a one-element array holding the count.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum(axis=0)))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum(axis=0)))

Before OverSampling, counts of label '1': [345]
Before OverSampling, counts of label '0': [199019] 



In [7]:
def nearest_neighbour(X, x):
    """Return the randomly weighted difference vector from ``x`` to its nearest neighbour.

    The nearest neighbour is the row of ``X`` with the smallest Euclidean
    distance to ``x`` among the rows that are not identical to ``x``.  The
    difference vector ``neighbour - x`` is multiplied by a random weight drawn
    from the open interval (0, 1), so that ``x + result[0]`` lies on the line
    segment between ``x`` and that neighbour.

    Earlier revisions returned the scaled *distance* (a single scalar) instead
    of the scaled difference vector, which shifted every feature by the same
    amount, and treated never-filled buffer slots (value 1.0) as distances
    whenever ``X`` contained duplicates of ``x``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Candidate instances.
    x : array-like of shape (n_features,)
        The instance whose nearest neighbour is looked up.

    Returns
    -------
    numpy.ndarray of shape (1, n_features)
        The weighted difference vector (one row, because one neighbour is used).

    Raises
    ------
    ValueError
        If ``X`` contains no row that differs from ``x``.
    """
    X = np.asarray(X, dtype=float)
    x = np.asarray(x, dtype=float)

    # Rows identical to x (x itself and any duplicates) are not neighbours.
    differs_from_x = (X != x).any(axis=1)
    differences = X[differs_from_x] - x
    if differences.shape[0] == 0:
        raise ValueError("X contains no instance that differs from x")

    # Euclidean distance of every remaining candidate to x, computed at once.
    distances = np.sqrt((differences ** 2).sum(axis=1))
    nearest_difference = differences[np.argmin(distances)]

    # random.random() draws from [0, 1); redraw on an exact 0 so the synthetic
    # point never coincides with x.
    weight = random.random()
    while weight == 0:
        weight = random.random()

    return (weight * nearest_difference).reshape(1, -1)
# Now, the difference vectors, after being multiplied by random numbers, are added to the feature vector of the considered instance (original minority instance) at each iteration. 
def SMOTE_100(X):
    """Create one synthetic sample per row of ``X`` (100% oversampling).

    For every instance, ``nearest_neighbour`` supplies a one-row array holding
    the randomly weighted offset towards the instance's nearest neighbour; the
    synthetic sample is the instance plus that offset.

    The result is allocated as a float array.  It used to be built from a list
    of ``None`` values, which made it (and anything concatenated with it) an
    ``object``-dtype array.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The minority-class instances.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features), dtype float
        The synthetic samples; row ``i`` is derived from ``X[i]``.
    """
    X = np.asarray(X, dtype=float)
    synthetic = np.empty(X.shape, dtype=float)
    for row, instance in enumerate(X):
        additive = nearest_neighbour(X, instance)
        # Only the first (nearest) neighbour is used: one sample per instance.
        synthetic[row] = instance + additive[0]
    return synthetic

In [8]:
# Count how often each label occurs in the training set and keep the count
# for label 1 (the minority class).
unique, counts = np.unique(y_train, return_counts=True)
# Map every distinct label to its number of occurrences.
label_counts = {label: count for label, count in zip(unique, counts)}
minority_shape = label_counts[1]
print(minority_shape)

345


In [9]:
# Store the minority-class (label 1) instances of the training set separately.
# The rows have to come from X_train: y_train[i] is the label of X_train[i],
# whereas X is the full matrix from before the shuffled train/test split, so
# X[i] is a different transaction.
is_minority = y_train.ravel() == 1
x1 = X_train[is_minority]  # shape: (number of 1-labels, number of columns)
print(x1.shape)

(345, 29)


In [10]:
# Apply the hand-written SMOTE to the minority instances.
# Seed Python's RNG first: the interpolation weights are drawn with
# random.random(), so without a seed every run produces different samples.
random.seed(0)
sampled_instances = SMOTE_100(x1)

In [11]:
# Keep the artificial instances and the original training instances together.
print(X_train.shape)
print(sampled_instances.shape)
# SMOTE_100 may hand back an object-dtype array; cast it to X_train's dtype so
# the stacked feature matrix stays numeric instead of being promoted to object.
X_new = np.concatenate((X_train, sampled_instances.astype(X_train.dtype)), axis=0)
print(X_new.shape)

# Every synthetic instance is labelled 1. One label row per synthetic row, in
# y_train's dtype so the stacked labels are not promoted to float.
y_sampled_instances = np.ones(
    (sampled_instances.shape[0], y_train.shape[1]), dtype=y_train.dtype
)

print(y_sampled_instances.shape)
print(y_train.shape)
y_new = np.concatenate((y_train, y_sampled_instances), axis=0)
print(y_new.shape)
# X_new and y_new are the training-set features and labels respectively.

(199364, 29)
(345, 29)
(199709, 29)
(345, 1)
(199364, 1)
(199709, 1)


In [12]:
# Shapes of the oversampled training set.
print('After OverSampling, the shape of train_X: {}'.format(X_new.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_new.shape))
# Class balance after oversampling; the column-wise sum of each boolean mask
# yields a one-element array holding the count.
print("After OverSampling, counts of label '1': {}".format((y_new == 1).sum(axis=0)))
print("After OverSampling, counts of label '0': {}".format((y_new == 0).sum(axis=0)))

After OverSampling, the shape of train_X: (199709, 29)
After OverSampling, the shape of train_y: (199709, 1) 

After OverSampling, counts of label '1': [690]
After OverSampling, counts of label '0': [199019]
