# Import

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.preprocessing import (StandardScaler, OneHotEncoder)
from sklearn.decomposition import PCA
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler 

from imblearn.ensemble import EasyEnsemble
import math
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from imblearn.over_sampling import SMOTE
#from imblearn.datasets import make_imbalance



# Data Preprocessing

In [0]:
##------------------------------------------------------------------------------------------##

from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Machine Learning For Data Mining/Credit Card Data/creditcard.csv')

print("Raw Data: ",data.head)

data = data.values

print("Raw Data Shape: ",data.shape)
print()

fraud = data[np.where(data[:,30] == 1)]
non_fraud = data[np.where(data[:,30] == 0)]
print("number of fraud occured: ",fraud.shape)
print("number of fraud not occured: ",non_fraud.shape)

print()

np.random.shuffle(fraud)
np.random.shuffle(non_fraud)

non_fraud_under_samp = non_fraud[:(fraud.shape[0]*9),:]
print("Under sampled non fraud: ", non_fraud_under_samp.shape)

print()

fraud_test = fraud[:int(fraud.shape[0]/10),:]
fraud_train = fraud[int(fraud.shape[0]/10):,:]
non_fraud_test = non_fraud_under_samp[:int(non_fraud_under_samp.shape[0]/10),:]
non_fraud_train = non_fraud_under_samp[int(non_fraud_under_samp.shape[0]/10):,:]

print()

print("Fraud_test shape: ",fraud_test.shape)
print("Fraud_train shape: ",fraud_train.shape)
print("Non Fraud test shape: ",non_fraud_test.shape)
print("Non Fraud train shape: ",non_fraud_train.shape)

print()

total_imb_train_data = np.concatenate((fraud_train, non_fraud_train), axis=0)
print("Total imbalanced training data: ",total_imb_train_data.shape)

total_imb_test_data = np.concatenate((fraud_test, non_fraud_test), axis=0)
print("Total imbalanced testing data: ",total_imb_test_data.shape)

print()

np.random.shuffle(total_imb_train_data)
np.random.shuffle(total_imb_test_data)

imb_train_data_labels = total_imb_train_data[:,(total_imb_train_data.shape[1]-1)]
total_imb_train_data = np.delete(total_imb_train_data,30,1)
imb_test_data_labels = total_imb_test_data[:,(total_imb_test_data.shape[1]-1)]
total_imb_test_data = np.delete(total_imb_test_data,30,1)

print("Imb Test data shape: ",total_imb_test_data.shape)
print("Imb Test data labels: ", imb_test_data_labels.shape)
print("Imb Train data shape: ",total_imb_train_data.shape)
print("Imb Train data labels: ", imb_train_data_labels.shape)

##------------------------------------------------------------------------------------------##

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Raw Data:  <bound method NDFrame.head of             Time         V1         V2  ...       V28  Amount  Class
0            0.0  -1.359807  -0.072781  ... -0.021053  149.62      0
1            0.0   1.191857   0.266151  ...  0.014724    2.69      0
2            1.0  -1.358354  -1.340163  ... -0.059752  378.66      0
3            1.0  -0.966272  -0.185226  ...  0.061458  123.50      0
4            2.0  -1.158233   0.877737  ...  0.215153   69.99      0
...          ...        ...        ...  ...       ...     ...    ...
284802  172786.0 -11.881118  10.071785  ...  0.823731    0.77      0
284803  172787.0  -0.732789  -0.055080  ... -0.053527   24.79      0
284804  172788.0   1.919565  -0.301254  ... -0.026561   67.88      0
284805  172788.0  -0.240440   0.530483  ...  0.104533   10.00      0
284806  172792.0  -0.533413  -0.189733  ...  0.013649  217.00      0

[

# Model Trianing with imbalanced data

In [0]:
rbfsvmc = svm.SVC(kernel = 'rbf', gamma = 0.7)
rbf_svmc = rbfsvmc.fit(total_imb_train_data, imb_train_data_labels.ravel())
rbf_y_pred = rbfsvmc.predict(total_imb_test_data)
precision = precision_score(imb_test_data_labels, rbf_y_pred, average="macro")
recall = recall_score(imb_test_data_labels, rbf_y_pred, average="macro")
print("Score :: SVM -> Kernel = rbf: ",rbfsvmc.score(total_imb_test_data,imb_test_data_labels))
print("Precision :: SVM -> Kernel = rbf: ",precision)
print("Recall :: SVM -> Kernel = rbf: ",recall)

Score :: SVM -> Kernel = rbf:  0.9022403258655805
Precision :: SVM -> Kernel = rbf:  0.9510204081632654
Recall :: SVM -> Kernel = rbf:  0.5102040816326531


# Model Training with Undersampled data

In [0]:
np.random.shuffle(non_fraud)

us_nf_data = non_fraud[:(fraud.shape[0]),:]
print("Undersampled non fraud data: ",us_nf_data.shape)
print("Fraud data: ",fraud.shape)

fraud_test = fraud[:int(fraud.shape[0]/10),:]
fraud_train = fraud[int(fraud.shape[0]/10):,:]
us_nf_data_test = us_nf_data[:int(us_nf_data.shape[0]/10),:]
us_nf_data_train = us_nf_data[int(us_nf_data.shape[0]/10):,:]

total_train_data = np.concatenate((fraud_train, us_nf_data_train), axis=0)
total_test_data = np.concatenate((fraud_test, us_nf_data_test), axis=0)

np.random.shuffle(total_train_data)
np.random.shuffle(total_test_data)

train_labels = total_train_data[:,(total_train_data.shape[1]-1)]
train_data = np.delete(total_train_data,30,1)
test_labels = total_test_data[:,(total_test_data.shape[1]-1)]
test_data = np.delete(total_test_data,30,1)


print("Test data shape: ", test_data.shape)
print("Test labels shape: ", test_labels.shape)
print("Train data shape: ", train_data.shape)
print("Train labels shape: ", train_labels.shape)

rbfsvmc = svm.SVC(kernel = 'rbf', gamma = 0.7)
rbf_svmc = rbfsvmc.fit(train_data, train_labels.ravel())
rbf_y_pred = rbfsvmc.predict(test_data)
precision = precision_score(test_labels, rbf_y_pred, average="macro")
recall = recall_score(test_labels, rbf_y_pred, average="macro")
print("Score :: SVM -> Kernel = rbf: ",rbfsvmc.score(test_data,test_labels))
print("Precision :: SVM -> Kernel = rbf: ",precision)
print("Recall :: SVM -> Kernel = rbf: ",recall)

Undersampled non fraud data:  (492, 31)
Fraud data:  (492, 31)
Test data shape:  (98, 30)
Test labels shape:  (98,)
Train data shape:  (886, 30)
Train labels shape:  (886,)
Score :: SVM -> Kernel = rbf:  0.5102040816326531
Precision :: SVM -> Kernel = rbf:  0.7525773195876289
Recall :: SVM -> Kernel = rbf:  0.5102040816326531


# Ensemble Without Replacement

In [0]:
np.random.shuffle(fraud)
np.random.shuffle(non_fraud)

fraud_test = fraud[:int(fraud.shape[0]/10),:]
fraud_train = fraud[int(fraud.shape[0]/10):,:]
non_fraud_test = non_fraud[:int(non_fraud.shape[0]/10),:]
non_fraud_train = non_fraud[int(non_fraud.shape[0]/10):,:]

train_data = np.concatenate((fraud_train, non_fraud_train), axis=0)
test_data = np.concatenate((fraud_test, non_fraud_test), axis=0)

train_labels = train_data[:,(train_data.shape[1]-1)]
train_data = np.delete(train_data,30,1)
test_labels = test_data[:,(test_data.shape[1]-1)]
test_data = np.delete(test_data,30,1)

print("Test data shape: ", test_data.shape)
print("Test labels shape: ", test_labels.shape)
print("Train data shape: ", train_data.shape)
print("Train labels shape: ", train_labels.shape)

print(Counter(train_labels))
print(Counter(test_labels))

print()

ensembles = EasyEnsemble( random_state=177, replacement=False, n_subsets=10 )
train_data_resample, train_label_resample = ensembles.fit_resample(train_data, train_labels)
print("Train data shape: ", train_data_resample.shape)
print("Train labels shape: ", train_label_resample.shape)

print()

prediction = np.zeros((10,test_labels.shape[0]))

for i in range(train_data_resample.shape[0]):
  
  rbfsvmc = svm.SVC(kernel = 'rbf', gamma = 0.5)
  rbf_svmc = rbfsvmc.fit(train_data_resample[i], train_label_resample[i].ravel())
  prediction[i,:] = np.array(rbfsvmc.predict(test_data))
  
print(prediction.shape)
print(prediction)

pred = []
for i in range(prediction.shape[1]):
  pred.append(Counter(prediction[:,i]).most_common(1)[0][0])

accuracy = rbfsvmc.score(test_data,test_labels)
precision = precision_score(test_labels, pred, average="macro")
recall = recall_score(test_labels, pred, average="macro")

print("Score :: SVM -> Kernel = rbf: ",accuracy)
print("Precision :: SVM -> Kernel = rbf: ",precision)
print("Recall :: SVM -> Kernel = rbf: ",recall)

Test data shape:  (28480, 30)
Test labels shape:  (28480,)
Train data shape:  (256327, 30)
Train labels shape:  (256327,)
Counter({0.0: 255884, 1.0: 443})
Counter({0.0: 28431, 1.0: 49})





Train data shape:  (10, 886, 30)
Train labels shape:  (10, 886)

(10, 28480)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Score :: SVM -> Kernel = rbf:  0.9984199438202247
Precision :: SVM -> Kernel = rbf:  0.9992098609355247
Recall :: SVM -> Kernel = rbf:  0.5408163265306123


# Ensemble With Replacement

In [0]:
np.random.shuffle(fraud)
np.random.shuffle(non_fraud)

fraud_test = fraud[:int(fraud.shape[0]/10),:]
fraud_train = fraud[int(fraud.shape[0]/10):,:]
non_fraud_test = non_fraud[:int(non_fraud.shape[0]/10),:]
non_fraud_train = non_fraud[int(non_fraud.shape[0]/10):,:]

train_data = np.concatenate((fraud_train, non_fraud_train), axis=0)
test_data = np.concatenate((fraud_test, non_fraud_test), axis=0)

train_labels = train_data[:,(train_data.shape[1]-1)]
train_data = np.delete(train_data,30,1)
test_labels = test_data[:,(test_data.shape[1]-1)]
test_data = np.delete(test_data,30,1)

print("Test data shape: ", test_data.shape)
print("Test labels shape: ", test_labels.shape)
print("Train data shape: ", train_data.shape)
print("Train labels shape: ", train_labels.shape)

print(Counter(train_labels))
print(Counter(test_labels))

print()

ensembles = EasyEnsemble( random_state=177, replacement=True, n_subsets=10 )
train_data_resample, train_label_resample = ensembles.fit_resample(train_data, train_labels)
print("Train data shape: ", train_data_resample.shape)
print("Train labels shape: ", train_label_resample.shape)

print()

prediction = np.zeros((10,test_labels.shape[0]))

for i in range(train_data_resample.shape[0]):
  
  rbfsvmc = svm.SVC(kernel = 'rbf', gamma = 0.5)
  rbf_svmc = rbfsvmc.fit(train_data_resample[i], train_label_resample[i].ravel())
  prediction[i,:] = np.array(rbfsvmc.predict(test_data))
  
print(prediction.shape)
print(prediction)

pred = []
for i in range(prediction.shape[1]):
  pred.append(Counter(prediction[:,i]).most_common(1)[0][0])

accuracy = rbfsvmc.score(test_data,test_labels)
precision = precision_score(test_labels, pred, average="macro")
recall = recall_score(test_labels, pred, average="macro")

print("Score :: SVM -> Kernel = rbf: ",accuracy)
print("Precision :: SVM -> Kernel = rbf: ",precision)
print("Recall :: SVM -> Kernel = rbf: ",recall)

Test data shape:  (28480, 30)
Test labels shape:  (28480,)
Train data shape:  (256327, 30)
Train labels shape:  (256327,)
Counter({0.0: 255884, 1.0: 443})
Counter({0.0: 28431, 1.0: 49})





Train data shape:  (10, 886, 30)
Train labels shape:  (10, 886)

(10, 28480)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Score :: SVM -> Kernel = rbf:  0.9984550561797753
Precision :: SVM -> Kernel = rbf:  0.9992273924495172
Recall :: SVM -> Kernel = rbf:  0.5510204081632653


# Oversampling with smote

In [0]:
np.random.shuffle(fraud)
np.random.shuffle(non_fraud)

fraud_test = fraud[:int(fraud.shape[0]/10),:]
fraud_train = fraud[int(fraud.shape[0]/10):,:]
non_fraud_test = non_fraud[:int(non_fraud.shape[0]/10),:]
non_fraud_train = non_fraud[int(non_fraud.shape[0]/10):,:]

train_data = np.concatenate((fraud_train, non_fraud_train), axis=0)
test_data = np.concatenate((fraud_test, non_fraud_test), axis=0)

train_labels = train_data[:,(train_data.shape[1]-1)]
train_data = np.delete(train_data,30,1)
test_labels = test_data[:,(test_data.shape[1]-1)]
test_data = np.delete(test_data,30,1)

print("Test data shape: ", test_data.shape)
print("Test labels shape: ", test_labels.shape)
print("Train data shape: ", train_data.shape)
print("Train labels shape: ", train_labels.shape)

print(Counter(train_labels))
print(Counter(test_labels))

print()

smote = SMOTE(random_state=177, sampling_strategy='auto', k_neighbors=7)

train_data_resample, train_label_resample = smote.fit_resample(train_data, train_labels)
print(train_data_resample.shape)
print(train_label_resample.shape)

train_data_resample = train_data_resample[:10000,:]
train_label_resample = train_label_resample[:10000]

print(train_data_resample.shape)
print(train_label_resample.shape)

Test data shape:  (28480, 30)
Test labels shape:  (28480,)
Train data shape:  (256327, 30)
Train labels shape:  (256327,)
Counter({0.0: 255884, 1.0: 443})
Counter({0.0: 28431, 1.0: 49})

(511768, 30)
(511768,)
(10000, 30)
(10000,)


In [0]:
rbfsvmc = svm.SVC(kernel = 'rbf', gamma = 0.5)
rbf_svmc = rbfsvmc.fit(train_data_resample, train_label_resample.ravel())
print("Predicting")
prediction = np.array(rbfsvmc.predict(test_data))
print("Calculating")
accuracy = rbfsvmc.score(test_data,test_labels)
precision = precision_score(test_labels, prediction, average="macro")
recall = recall_score(test_labels, prediction, average="macro")

print("Score :: SVM -> Kernel = rbf: ",accuracy)
print("Precision :: SVM -> Kernel = rbf: ",precision)
print("Recall :: SVM -> Kernel = rbf: ",recall)


Predicting
Calculating
Score :: SVM -> Kernel = rbf:  0.9983497191011236
Precision :: SVM -> Kernel = rbf:  0.9991748016012361
Recall :: SVM -> Kernel = rbf:  0.5204081632653061
