In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn import linear_model, datasets
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("creditcard.csv")

In [3]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26  

In [13]:
'''We need to do some additional manipulation with data 
   since class zero (normal) is significantly more than fraud data
'''
#we can calculate the ratios first
bad_data = data[data['Class'] == 1]
good_data = data[data['Class'] == 0]
ratio = float(len(bad_data)/(len(good_data)))
ratio

# The below ratio indicated that the fraud data is 0.17% of normal data,
#This will skewed the algorithms, therefore we need to rebalance the data set

0.0017304750013189597

In [14]:
good_data = good_data.sample(frac=ratio)
good_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
61778,49982.0,1.202752,0.217415,0.623026,0.551504,-0.506489,-0.780844,-0.024316,-0.081785,-0.255674,...,-0.190141,-0.579809,0.146990,0.535245,0.166038,0.063848,-0.031477,0.011329,1.98,0
148328,89718.0,-0.151769,0.773317,0.217510,-0.990222,1.273956,-0.133331,1.220424,-0.233007,0.057706,...,-0.328350,-0.668780,-0.049670,-0.043039,-0.537554,0.123571,0.180082,0.013355,11.99,0
201461,133894.0,-0.815881,1.386916,-0.857170,-0.607506,0.044927,-0.690336,-0.096024,-0.148382,-0.215492,...,0.587049,-1.036307,0.177484,-0.684492,-0.376773,0.175217,-0.361760,-0.155877,4.95,0
2080,1606.0,1.239456,0.225786,0.404418,0.422563,-0.214612,-0.464966,-0.011435,-0.067165,-0.271190,...,-0.221057,-0.650838,0.075829,0.006808,0.249130,0.095922,-0.030060,0.006747,2.69,0
216515,140511.0,2.150996,-0.070509,-2.571993,-0.338700,1.002009,-0.596891,0.500969,-0.201456,0.256296,...,0.125413,0.366467,-0.085480,-0.105760,0.458333,0.247255,-0.103833,-0.094870,0.76,0
160127,113129.0,2.040578,-0.146368,-2.955721,-0.578510,2.609546,3.142573,-0.417135,0.784442,0.359925,...,-0.352449,-0.996836,0.363652,0.604817,-0.264490,0.219691,-0.039174,-0.042820,1.79,0
5302,5182.0,1.116107,0.344326,1.226401,1.526809,-0.588444,-0.394763,-0.329882,-0.101085,1.366724,...,0.049521,0.508152,0.018567,0.571935,0.376743,-0.335803,0.039034,0.030295,10.00,0
169502,119735.0,-0.362899,-1.290946,1.429818,-2.942627,-1.106790,0.125321,-0.418336,-0.113141,-0.940727,...,-0.092248,0.271268,0.111174,-0.845774,-0.846847,-0.391760,-0.036463,-0.013396,114.63,0
64275,51095.0,1.272827,-0.592725,-0.237013,-0.688364,-0.486976,-0.177794,-0.628693,0.016287,-0.624489,...,0.057091,-0.046896,-0.131150,-0.847854,0.365341,-0.207164,0.028935,0.045092,79.90,0
215992,140311.0,-0.123587,-0.105834,-0.174089,-1.959005,0.743929,-1.027379,0.223034,-0.265627,-1.126629,...,0.570474,1.600096,-0.224828,0.671403,-0.705032,-0.139411,0.131017,0.211375,1.00,0


In [15]:
balanced_data = bad_data.append(good_data)
balanced_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
4920,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1
6329,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1
6331,7526.0,0.008430,4.137837,-6.240697,6.675732,0.768307,-3.353060,-1.631735,0.154612,-2.795892,...,0.364514,-0.608057,-0.539528,0.128940,1.488481,0.507963,0.735822,0.513574,1.00,1
6334,7535.0,0.026779,4.132464,-6.560600,6.348557,1.329666,-2.513479,-1.689102,0.303253,-3.139409,...,0.370509,-0.576752,-0.669605,-0.759908,1.605056,0.540675,0.737040,0.496699,1.00,1
6336,7543.0,0.329594,3.712889,-5.775935,6.078266,1.667359,-2.420168,-0.812891,0.133080,-2.214311,...,0.156617,-0.652450,-0.551572,-0.716522,1.415717,0.555265,0.530507,0.404474,1.00,1
6338,7551.0,0.316459,3.809076,-5.615159,6.047445,1.554026,-2.651353,-0.746579,0.055586,-2.678679,...,0.208828,-0.511747,-0.583813,-0.219845,1.474753,0.491192,0.518868,0.402528,1.00,1
6427,7610.0,0.725646,2.300894,-5.329976,4.007683,-1.730411,-1.732193,-3.968593,1.063728,-0.486097,...,0.589669,0.109541,0.601045,-0.364700,-1.843078,0.351909,0.594550,0.099372,1.00,1


In [17]:
new_ratio = float(len(bad_data)/len(good_data))
new_ratio


1.0

In [18]:
#Now the classes of data are balanced, we shall proceed the training
X = balanced_data.iloc[:, :-1].values
y = balanced_data.iloc[:, -1].values


In [19]:
#now we split the data sets into train & test
seed = 100
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed)


In [21]:
#Normalize the datasets now, although it seems unnecessary to do it, except the amount
from sklearn.preprocessing import  StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [23]:
#Now let's train a random forest model
random_forest_model = RandomForestClassifier(criterion='entropy', n_estimators=100)


In [26]:
random_forest_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [28]:
predicted = random_forest_model.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

             precision    recall  f1-score   support

          0       0.87      0.98      0.92       133
          1       0.98      0.88      0.93       163

avg / total       0.93      0.92      0.92       296

[[130   3]
 [ 20 143]]


In [33]:
#Try to use Support vector michine

from sklearn.svm import SVC

svm_model = SVC()

In [34]:
svm_model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
# see the prediction

predicted = random_forest_model.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

             precision    recall  f1-score   support

          0       0.87      0.98      0.92       133
          1       0.98      0.88      0.93       163

avg / total       0.93      0.92      0.92       296

[[130   3]
 [ 20 143]]


In [52]:
#Achieved 93 percent accuracy