# Credit Card Fraud Detection By: Mridul Gulati

## Import Dependencies

In [121]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Load Dataset

In [122]:
credit_card_data = pd.read_csv("creditcard.csv")

In [123]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [124]:
credit_card_data.shape

(284807, 31)

## Exploratory Data Analysis

In [125]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

#### Distribution of Transactions

In [126]:
credit_card_data["Class"].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [127]:
legit = credit_card_data[credit_card_data["Class"] == 0]
fraud = credit_card_data[credit_card_data["Class"] == 1]

#### Dataset is Highly unbalanced
0 -> Legit transaction
1 -> Fraud transaction

In [128]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


#### Statistical Analysis

In [129]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [130]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [131]:
credit_card_data.groupby("Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


### Under Sampling
Build a sample Dataset of similar legit and Fraud Transactions i.e 492 Each

In [132]:
legit_sample = legit.sample(n = 492)

In [133]:
new_dataset = pd.concat([legit_sample,fraud], axis = 0)

In [134]:
new_dataset["Amount"].value_counts()

Amount
1.00      136
0.00       29
99.99      28
0.76       25
0.77       13
         ... 
349.08      1
390.00      1
77.89       1
245.00      1
219.60      1
Name: count, Length: 612, dtype: int64

In [135]:
new_dataset.groupby("Amount").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
Amount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.00,98410.206897,-2.943774,3.031207,-5.410115,5.153284,-0.669105,-1.580530,-4.443334,-0.142459,-3.231292,...,0.640321,-0.201008,-0.095622,-0.000497,-0.071233,0.087762,0.199779,0.206844,0.095215,0.931034
0.01,108844.600000,-8.106526,7.137760,-10.283143,5.867211,-5.602184,0.008782,-12.859083,-7.018536,-5.009814,...,2.502583,-3.400279,2.129745,1.097661,0.077938,-0.534605,0.469754,-0.986191,0.463167,1.000000
0.11,54503.000000,1.225934,-0.293803,0.987563,0.344647,-0.753643,0.470828,-0.994604,0.331381,0.993288,...,-0.162097,0.173752,0.553657,-0.083083,-0.746717,0.187834,0.663603,0.022412,0.015130,0.000000
0.20,58858.500000,0.658029,0.159026,1.174556,1.191993,-0.648217,1.123783,-1.325568,-1.036417,-0.778309,...,0.445638,-0.495461,0.813002,-0.207394,-0.339527,0.768579,0.227297,0.087328,0.108201,0.500000
0.38,161154.000000,-3.387601,3.977881,-6.978585,1.657766,-1.100500,-3.599487,-3.686651,1.942252,-3.065089,...,-0.004301,1.043587,0.262189,-0.479224,-0.326638,-0.156939,0.113807,0.354124,0.287592,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504.93,154278.000000,-1.600211,-3.488130,-6.459303,3.246816,-1.614608,-1.260375,0.288223,-0.048964,-0.734975,...,3.189355,1.191175,-0.967141,-1.463421,-0.624231,-0.176462,0.400348,0.152947,0.477775,1.000000
1523.28,55889.000000,-1.675111,-5.596279,-0.919990,1.674960,-2.240046,1.928280,0.768237,0.146112,-0.244513,...,2.448117,0.361293,-1.731707,-1.343117,-1.149003,-0.231820,-0.511189,-0.187350,0.276094,0.000000
1809.68,9064.000000,-3.499108,0.258555,-4.489558,4.853894,-6.974522,3.628382,5.431271,-1.946734,-0.775680,...,-3.042626,-1.052368,0.204817,-2.119007,0.170279,-0.393844,0.296367,1.985913,-0.900452,1.000000
1836.08,36532.000000,-3.539749,-4.790511,-0.133445,1.426135,-5.561810,3.588378,5.994770,-0.403618,-1.699656,...,3.763112,0.562122,-0.874208,4.033518,-0.687981,1.062177,-0.330916,-0.478503,0.224217,0.000000


### Features(X) and Targets(Y)

In [136]:
X = new_dataset.drop(columns = ["Class"], axis = 1)
Y = new_dataset.Class

In [137]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
81124,58796.0,1.148305,-0.448005,1.126584,0.192769,-1.267567,-0.451308,-0.647589,0.036833,1.095492,...,0.005389,-0.097208,-0.169873,0.030086,0.474609,0.097411,0.986223,-0.041176,0.022546,44.22
151963,96575.0,1.663205,-1.136898,-0.748404,-0.012352,-0.691445,-0.198093,-0.543873,-0.125848,2.502062,...,0.247972,0.099857,0.131079,0.048951,0.497096,-0.542047,1.194749,-0.158092,-0.024513,219.60
264844,161619.0,-0.187381,0.001970,0.248577,-1.884929,0.685755,-1.121612,0.552552,-0.295092,-1.371605,...,0.130025,0.140466,0.332435,-0.122618,-0.393347,-0.290883,-0.329400,0.134421,0.171772,6.48
29879,35676.0,-2.417392,0.175638,1.141489,-1.180951,-1.576122,0.026440,1.033571,-0.156980,0.423550,...,-0.957012,-0.055105,0.292624,0.083060,0.403880,-0.515826,0.655373,-1.010350,0.047846,242.00
139572,83222.0,0.030229,-2.893902,0.476432,-0.061910,-1.774603,1.148457,-0.554761,0.308564,-0.226900,...,1.248454,0.482028,0.217861,-0.485924,-0.222191,-0.014200,-0.267874,-0.031079,0.114969,620.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [138]:
Y

81124     0
151963    0
264844    0
29879     0
139572    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64

### Split into Training & Testing Data

In [139]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, stratify = Y, random_state = 2)

In [140]:
X_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
283005,171302.0,2.004315,-0.135459,-1.018216,0.316560,-0.129972,-0.861653,0.042725,-0.145494,0.333027,...,-0.247091,-0.230586,-0.579104,0.349559,0.007354,-0.367878,0.179667,-0.073792,-0.070650,1.79
206651,136304.0,0.000523,1.062069,-0.225834,-0.594526,0.930894,-0.785105,1.223470,-0.427531,0.282164,...,0.218236,0.280363,1.296170,-0.139227,0.773525,-0.734823,-0.277863,0.449639,0.165540,1.79
5730,6043.0,-0.503236,0.561927,2.733588,0.988518,-0.657709,-0.422318,-0.119873,-0.014734,1.361409,...,0.116130,-0.060576,0.243101,0.054266,0.921507,-0.578781,0.264582,0.125829,0.157241,20.21
116404,74262.0,-2.250535,2.365755,-2.955491,0.089791,-2.830745,-0.844462,-0.174062,-0.407138,0.174216,...,-0.486537,0.256560,-0.466245,0.291105,0.242567,-1.279094,-1.123534,-0.630977,0.326839,311.28
240222,150494.0,1.852889,1.069593,-1.776101,4.617410,0.770413,-0.400859,-0.040970,0.089510,-0.217705,...,-0.288392,-0.157869,-0.176244,0.027437,-0.468006,0.058063,0.148263,0.042278,0.040573,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120000,75674.0,1.372810,-0.744940,0.787858,-0.532910,-1.327106,-0.262728,-0.979279,-0.015468,-0.207853,...,-0.440707,-0.288066,-0.195521,-0.056590,-0.054028,0.265750,1.195509,-0.018605,0.011686,11.66
57470,47923.0,0.364377,1.443523,-2.220907,2.036985,-1.237055,-1.728161,-2.058582,0.358895,-1.393306,...,0.310980,0.402730,-0.132129,-0.032977,0.460861,0.560404,0.409366,0.539668,0.296918,0.76
41395,40662.0,-4.446847,-0.014793,-5.126307,6.945130,5.269255,-4.297177,-2.591242,0.342671,-3.880663,...,-0.108006,0.247913,-0.049586,-0.226017,-0.401236,0.856124,0.661272,0.492560,0.971834,1.00
230476,146344.0,-0.099724,2.795414,-6.423856,3.247513,-1.632290,-2.766665,-2.312223,0.961014,-1.896001,...,0.340898,0.647714,0.126576,0.203953,0.008495,-0.174501,0.575295,0.152876,-0.098173,94.82


In [141]:
Y_train

283005    0
206651    0
5730      0
116404    1
240222    1
         ..
120000    0
57470     1
41395     1
230476    1
207342    0
Name: Class, Length: 738, dtype: int64

In [142]:
X_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
151011,94364.0,-15.192064,10.432528,-19.629515,8.046075,-12.838167,-1.875859,-21.359738,-3.717850,-5.969782,...,1.657476,-3.474097,1.765446,1.701257,0.381587,-1.413417,-1.023078,-2.634761,-0.463931,1.00
84549,60357.0,-4.200744,-4.572094,-1.044589,0.360988,1.914340,-2.619056,-0.109648,0.484437,-0.466980,...,1.925648,0.767058,-0.102921,0.951632,0.035243,-0.782943,0.693705,0.146925,-0.445929,409.10
30333,35874.0,-0.245922,-0.083172,2.519350,1.054689,-0.839533,0.621702,-0.189094,0.161858,1.098618,...,0.145407,0.192453,1.139811,0.025868,0.492117,-0.852915,0.670039,0.065122,-0.024539,48.00
4324,3760.0,1.264567,-0.578895,0.527456,-0.605065,-0.882020,-0.459940,-0.657957,-0.128530,0.252333,...,0.184274,-0.175532,-0.558979,0.036158,-0.066199,0.199406,-0.524522,-0.032618,0.014319,73.81
42756,41233.0,-10.645800,5.918307,-11.671043,8.807369,-7.975501,-3.586806,-13.616797,6.428169,-7.368451,...,-0.046170,2.571970,0.206809,-1.667801,0.558419,-0.027898,0.354254,0.273329,-0.152908,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51758,45106.0,1.263534,-0.325411,-0.357391,-1.457818,-0.350907,-0.983849,0.161081,-0.219189,0.992228,...,-0.007860,0.007493,0.184711,-0.248983,0.095029,0.832238,0.094233,-0.021482,-0.005980,33.78
42528,41138.0,-4.595617,5.083690,-7.581015,7.546033,-6.949165,-1.729185,-8.190192,2.714670,-7.083169,...,1.682160,2.248971,0.566844,0.033744,0.591783,0.334229,0.386801,2.163898,0.983104,340.11
112840,72824.0,-1.111495,-0.257575,2.250210,1.152671,0.432904,1.254126,-0.584163,-0.609682,1.014602,...,-0.510614,0.862913,0.927825,-0.343058,-0.256268,-0.600742,-0.180331,0.026762,-0.358335,45.03
120232,75754.0,-0.945490,0.683041,1.862034,-0.550037,0.405870,-0.973198,1.406939,-0.381647,-0.598760,...,0.250161,-0.215910,-0.743565,-0.010258,0.339670,0.298960,-0.066560,-0.210500,-0.141904,89.99


In [143]:
Y_test

151011    1
84549     0
30333     0
4324      0
42756     1
         ..
51758     0
42528     1
112840    1
120232    0
198223    0
Name: Class, Length: 246, dtype: int64

## Model Training
Logistic Regression because Binary Classification

In [144]:
model = LogisticRegression()

In [145]:
model = model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Evaluation
### Accuracy Score

In [146]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
training_data_accuracy

0.9512195121951219

In [147]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
test_data_accuracy

0.926829268292683

## Deploy
### Creating Pickle File

In [148]:
import pickle

In [149]:
pickle.dump(model, open("model.pkl", 'wb'))

### Creating Flask API

In [150]:
import numpy as np
from flask import Flask, request, jsonify, render_template

In [151]:
app = Flask(__name__)
calc = pickle.load(open("model.pkl", 'rb'))