In [206]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split # to split the data
from sklearn.cross_validation import KFold            # For cross vbalidation
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
import itertools

%matplotlib inline


In [207]:
data = pd.read_csv("/Users/zahrakhambaty/Downloads/creditcard.csv")
data.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [208]:
## Pre-processing the data
## Normalizing the amount column

from sklearn.preprocessing import StandardScaler
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))
data = data.drop(['Time','Amount'],axis=1)
data.head()


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,normAmount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.244964
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.342475
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.160686
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.140534
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.073403


In [209]:
## Since the data is largely imbalanced we need to resample the data such that the proportion/ratio between fraudulent and normal transactions are relativeley similar.

x = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']


In [210]:
## Splitting the data into Training,Validation and Test Set##
## Test Set needs to be unused till the mere end##
X_train, X_test, y_train, y_test = train_test_split(data,y, test_size=0.25, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size=0.25, random_state=42)
# #Figuring out the ratio of normal transction and fraudelent transaction from training data# #
normal_tdata = X_train[X_train["Class"]==0]
print("train data: length of normal data",len(normal_tdata))
fraud_tdata = X_train[X_train["Class"]==1]
print("train data: length of fraud data",len(fraud_tdata))
## dataset for validation set ##
normal_vdata = X_val[X_val["Class"]==0]
print("For Validation Set :length of normal data",len(normal_vdata))
fraud_vdata = X_val[X_val["Class"]==1]
print("For Validation Set :length of fraud data",len(fraud_vdata))

train data: length of normal data 159900
train data: length of fraud data 303
For Validation Set :length of normal data 53320
For Validation Set :length of fraud data 82


In [211]:
##Since the data is highly imbalanced we use the sklearn package to balance out the data by introducing more fraudulent data ##
## basically oversampling of data ##
sm = SMOTE(random_state=12, ratio = 0.6)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train.values.ravel())

In [212]:

a = x_train_res[:,28]
b= np.count_nonzero(a == 1)
c= np.count_nonzero(a == 0)
print("length of oversampled data is ",len(x_train_res))
print("Number of normal transcation in oversampled data",b)
print("No.of fraud transcation",c)
print("Proportion of Normal data in oversampled data is ",c/len(x_train_res))
print("Proportion of fraud data in oversampled data is ",b/len(x_train_res))

length of oversampled data is  255840
Number of normal transcation in oversampled data 95940
No.of fraud transcation 159900
Proportion of Normal data in oversampled data is  0.625
Proportion of fraud data in oversampled data is  0.375


In [215]:
lrn = LogisticRegression()
lrn.fit(x_train_res, y_train_res)
y_pred = lrn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print (metrics.confusion_matrix(y_test, y_pred))
print (metrics.classification_report(y_test, y_pred))


1.0
[[71095     0]
 [    0   107]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71095
          1       1.00      1.00      1.00       107

avg / total       1.00      1.00      1.00     71202

