In [1]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt 
import os 
import sys 
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_curve,confusion_matrix,roc_auc_score,auc

# Reference - https://towardsdatascience.com/having-an-imbalanced-dataset-here-is-how-you-can-solve-it-1640568947eb

In [2]:
data = '../data/creditcard.csv'
creditcard_df = pd.read_csv(data)
creditcard_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# percentage of missing values in each column
creditcard_df.isnull().sum()/len(creditcard_df.index)*100

Time      0.0
V1        0.0
V2        0.0
V3        0.0
V4        0.0
V5        0.0
V6        0.0
V7        0.0
V8        0.0
V9        0.0
V10       0.0
V11       0.0
V12       0.0
V13       0.0
V14       0.0
V15       0.0
V16       0.0
V17       0.0
V18       0.0
V19       0.0
V20       0.0
V21       0.0
V22       0.0
V23       0.0
V24       0.0
V25       0.0
V26       0.0
V27       0.0
V28       0.0
Amount    0.0
Class     0.0
dtype: float64

In [4]:
creditcard_df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
# Shuffle the Dataset.
creditcard_df = creditcard_df.sample(frac=1,random_state=4)
creditcard_df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
135406,81240.0,0.999672,-0.034679,0.446984,1.37476,-0.272838,0.033625,0.025702,0.118362,0.052625,...,-0.00606,0.042805,-0.095535,0.227518,0.575778,-0.347302,0.024111,0.016893,64.0,0
137826,82347.0,-0.844413,1.032424,1.090921,-0.671593,-0.006061,-0.621923,0.322604,0.513167,-0.705964,...,-0.080023,-0.53509,0.005324,-0.042269,-0.453002,-0.127804,-0.057711,0.053107,0.89,0
70830,54035.0,-0.474271,1.027526,1.546229,-0.082036,0.180465,-0.407305,0.69501,0.025371,-0.522539,...,-0.1864,-0.415345,0.014862,0.076099,-0.236694,0.097987,0.284462,0.120016,8.99,0
194993,130854.0,-1.619583,-0.460686,0.219034,-0.418723,0.933105,-0.477342,0.902804,-0.120123,0.138692,...,-0.009905,0.886662,0.620723,-0.317571,0.048672,0.61777,0.383019,0.054503,75.98,0
87575,61745.0,-1.159349,0.816687,1.743063,-0.724069,-0.39859,-0.796834,0.275232,0.405158,-0.238336,...,-0.21597,-0.746148,-0.141145,0.423557,0.225527,0.749267,-0.169812,-0.036058,12.1,0
58922,48609.0,-0.527161,0.200911,1.821059,-2.06802,-0.025181,0.093388,0.035732,0.028121,-1.269504,...,0.101647,0.146677,-0.410605,-0.983874,0.50004,-0.289184,0.048504,0.057803,19.86,0
14329,25411.0,-1.271485,1.468437,-0.636989,-0.394338,2.138492,3.535837,-0.25918,1.170031,-0.461973,...,0.082528,-0.134647,-0.064862,0.982685,-0.050696,-0.444604,-0.401793,0.149573,6.48,0
110440,71826.0,1.096225,-0.854262,0.408811,-0.613771,-0.984417,-0.502653,-0.377324,-0.145042,-0.936799,...,0.026978,-0.279569,0.031661,0.092783,0.13773,-0.469841,0.006471,0.045504,147.6,0
89431,62576.0,0.9747,-0.956996,1.490097,0.474495,-1.290798,1.248123,-1.284833,0.583325,1.697556,...,-0.028082,0.270264,-0.106748,-0.181975,0.175866,1.156613,0.005694,0.010555,67.31,0
129035,78930.0,-1.144008,0.766896,2.381739,-0.024527,-0.40428,0.509322,0.491448,0.054743,0.056054,...,0.246352,0.730906,-0.492254,-0.428331,0.487274,-0.287019,-0.096385,0.054411,108.45,0


In [14]:
#Create an object of the classifier.
clf_tree = DecisionTreeClassifier(max_depth=10,max_leaf_nodes=2,min_impurity_split=2)
bbc = BalancedBaggingClassifier(base_estimator=clf_tree,
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
X = creditcard_df.drop('Class',axis=1)
y = creditcard_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.90,test_size=0.10,)
print "Trainnig data",X_train.shape
print "Train label",y_train.shape
print "Test data",X_test.shape
print "Test Label",y_test.shape

Trainnig data (256326, 30)
Train label (256326,)
Test data (28481, 30)
Test Label (28481,)


In [15]:
#Train the classifier.
bbc.fit(X_train, y_train)




BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=2, min_impurity_decrease=0.0,
            min_impurity_split=2, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
             bootstrap=True, bootstrap_features=False, max_features=1.0,
             max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
             random_state=0, ratio=None, replacement=False,
             sampling_strategy='auto', verbose=0, warm_start=False)

In [16]:
predictions = bbc.predict(X_test)
print classification_report(y_true=y_test,y_pred=predictions)
print ("AUC score{:2.2}".format(roc_auc_score(y_test, predictions)))
cnf_matix = confusion_matrix(y_true=y_test,y_pred=predictions)
TP = cnf_matix[1,1]
TN = cnf_matix[0,0]
FP = cnf_matix[0,1]
FN = cnf_matix[1,0]
print("TP",TP) 
print("TN",TN) 
print("FP",FP) 
print("FN",FN)
print ("Recall= {}".format(TP/float(TP+FN)))
print ("Precison= {}".format(TP/float(TP+FP)))
print ("specificity= {}".format(TN / float(TN+FP)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28443
           1       0.00      0.00      0.00        38

   micro avg       1.00      1.00      1.00     28481
   macro avg       0.50      0.50      0.50     28481
weighted avg       1.00      1.00      1.00     28481

AUC score0.5
('TP', 0)
('TN', 28443)
('FP', 0)
('FN', 38)
Recall= 0.0
Precison= nan
specificity= 1.0


  
