In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

In [2]:
df=pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Checking for null values

In [4]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# Distribution of legit transaction and fraudlant transaction
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

The dataset above is highly unbalanced.

0 ---->  Normal transaction
1 -----> Fraudlant transaction

In [6]:
# Seperation the data for anlysis purpose

legit_transaction = df[df.Class==0]
fraud_transaction = df[df.Class==1]

In [7]:
# Checking for the amount column how it diffrentiate

In [8]:
legit_transaction.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [9]:
fraud_transaction.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [10]:
df.groupby('Class').count()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,284315,284315,284315,284315,284315,284315,284315,284315,284315,284315,...,284315,284315,284315,284315,284315,284315,284315,284315,284315,284315
1,492,492,492,492,492,492,492,492,492,492,...,492,492,492,492,492,492,492,492,492,492


In [11]:
# Applying under sampling to the legit transaction

In [12]:
sampled_transaction = legit_transaction.sample(n=492)
sampled_transaction

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
265804,162056.0,-0.055693,0.792946,-3.217905,-1.383573,3.166645,2.857917,0.829036,1.019802,-0.785333,...,0.245084,0.482323,0.109856,0.663649,-0.319433,0.563211,-0.091563,0.032340,85.25,0
37318,38905.0,1.207533,-0.502016,-0.222169,0.330074,-0.309177,-0.022250,-0.115285,-0.075422,-1.133517,...,-0.348886,-0.595661,-0.255672,-0.521394,0.781503,-0.249623,0.015311,0.014276,87.94,0
13560,24050.0,1.216761,0.698963,-0.137686,2.527629,0.618533,-0.314776,0.486486,-0.303207,0.330209,...,-0.200891,-0.448693,-0.226691,-0.523911,0.793878,0.079119,-0.076812,0.001141,41.51,0
283409,171571.0,2.105061,-0.147111,-1.377191,0.265695,0.104095,-0.840445,0.108064,-0.236636,0.734680,...,-0.344164,-0.906802,0.263997,-0.681016,-0.239797,0.241736,-0.078456,-0.068548,4.49,0
162760,115346.0,0.153802,0.214506,0.843605,-0.814451,-0.078554,-0.062051,0.183226,0.103653,0.846100,...,0.247535,0.817880,-0.025091,0.695084,-0.339963,0.022540,0.043392,0.017801,32.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67590,52610.0,1.272545,0.214337,0.197844,0.306654,0.030933,-0.180431,-0.016456,-0.039388,-0.247114,...,-0.252251,-0.737040,0.013215,-0.486059,0.316874,0.126240,-0.030460,0.002032,3.59,0
163908,116309.0,-1.421616,0.473430,1.189947,0.721422,0.222399,0.460909,-0.243492,0.910755,-0.239177,...,-0.187857,-0.928745,-0.181448,-1.174822,0.379792,-0.611088,0.204308,-0.021156,49.01,0
845,646.0,-0.916848,1.471812,0.742995,-0.131084,0.518566,0.772582,-0.342258,-2.248702,-0.574626,...,2.114718,-1.311987,0.155953,-1.070769,-0.454872,0.186908,0.227996,0.152385,1.79,0
137012,81980.0,-0.534817,0.450906,1.870975,1.302771,0.506673,0.106984,0.431860,0.072194,-0.384349,...,0.022404,0.307295,-0.166736,0.207879,-0.109964,-0.267092,-0.018224,-0.103156,4.99,0


In [13]:
# Concatination two dataframes, fraudent dataframe+sampled datframe

In [14]:
new_dataframe=pd.concat([sampled_transaction,fraud_transaction],axis=0)
new_dataframe

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
265804,162056.0,-0.055693,0.792946,-3.217905,-1.383573,3.166645,2.857917,0.829036,1.019802,-0.785333,...,0.245084,0.482323,0.109856,0.663649,-0.319433,0.563211,-0.091563,0.032340,85.25,0
37318,38905.0,1.207533,-0.502016,-0.222169,0.330074,-0.309177,-0.022250,-0.115285,-0.075422,-1.133517,...,-0.348886,-0.595661,-0.255672,-0.521394,0.781503,-0.249623,0.015311,0.014276,87.94,0
13560,24050.0,1.216761,0.698963,-0.137686,2.527629,0.618533,-0.314776,0.486486,-0.303207,0.330209,...,-0.200891,-0.448693,-0.226691,-0.523911,0.793878,0.079119,-0.076812,0.001141,41.51,0
283409,171571.0,2.105061,-0.147111,-1.377191,0.265695,0.104095,-0.840445,0.108064,-0.236636,0.734680,...,-0.344164,-0.906802,0.263997,-0.681016,-0.239797,0.241736,-0.078456,-0.068548,4.49,0
162760,115346.0,0.153802,0.214506,0.843605,-0.814451,-0.078554,-0.062051,0.183226,0.103653,0.846100,...,0.247535,0.817880,-0.025091,0.695084,-0.339963,0.022540,0.043392,0.017801,32.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00,1
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00,1


In [15]:
new_dataframe.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,95984.977642,0.067654,-0.068874,0.057266,-0.030719,-0.055557,-0.154495,0.069919,-0.016155,0.022188,...,0.006688,-0.053843,0.009319,0.048811,-0.037316,-0.003337,0.015833,0.016284,0.000697,87.810915
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [16]:
# Spliting the data into features and target
# target --- our Class
#feature- all the independent variable

In [17]:
x=new_dataframe.drop('Class',axis=1)
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
265804,162056.0,-0.055693,0.792946,-3.217905,-1.383573,3.166645,2.857917,0.829036,1.019802,-0.785333,...,-0.071076,0.245084,0.482323,0.109856,0.663649,-0.319433,0.563211,-0.091563,0.032340,85.25
37318,38905.0,1.207533,-0.502016,-0.222169,0.330074,-0.309177,-0.022250,-0.115285,-0.075422,-1.133517,...,-0.368788,-0.348886,-0.595661,-0.255672,-0.521394,0.781503,-0.249623,0.015311,0.014276,87.94
13560,24050.0,1.216761,0.698963,-0.137686,2.527629,0.618533,-0.314776,0.486486,-0.303207,0.330209,...,-0.123921,-0.200891,-0.448693,-0.226691,-0.523911,0.793878,0.079119,-0.076812,0.001141,41.51
283409,171571.0,2.105061,-0.147111,-1.377191,0.265695,0.104095,-0.840445,0.108064,-0.236636,0.734680,...,-0.258351,-0.344164,-0.906802,0.263997,-0.681016,-0.239797,0.241736,-0.078456,-0.068548,4.49
162760,115346.0,0.153802,0.214506,0.843605,-0.814451,-0.078554,-0.062051,0.183226,0.103653,0.846100,...,-0.085901,0.247535,0.817880,-0.025091,0.695084,-0.339963,0.022540,0.043392,0.017801,32.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [18]:
y=new_dataframe.Class
y

265804    0
37318     0
13560     0
283409    0
162760    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64

# Train test split


In [82]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)


In [20]:
# Logistic regression

In [83]:
model_lr=LogisticRegression(max_iter=1000)

In [84]:
model_lr.fit(x_train,y_train)


LogisticRegression(max_iter=1000)

In [85]:
model_lr.score(x_test,y_test)

0.8934010152284264

In [25]:
# Support vector machine

In [86]:
model_svc = SVC()

In [87]:
model_svc.fit(x_train,y_train)

SVC()

In [88]:
model_svc.score(x_test,y_test)

0.5532994923857868

In [29]:
# Random forest calssifier

In [30]:
model_rfc = RandomForestClassifier()

In [31]:
model_rfc.fit(x_train,y_train)

RandomForestClassifier()

In [32]:
model_rfc.score(x_test,y_test)

0.9390862944162437

In [33]:
# Decision tree classifier

In [34]:
model_dc = DecisionTreeClassifier()

In [35]:
model_dc.fit(x_train,y_train)

DecisionTreeClassifier()

In [36]:
model_dc.score(x_test,y_test)

0.9238578680203046

# Cross val score

In [37]:
# Logistic regression

In [69]:
cross_score_lr= cross_val_score(LogisticRegression(max_iter=1000),x,y,cv=5)

In [70]:
cross_score_lr

array([0.96954315, 0.92893401, 0.91370558, 0.94923858, 0.87244898])

In [71]:
np.mean(cross_score_lr)

0.9267740598777582

In [41]:
# Support vector machine

In [72]:
cross_score_svc= cross_val_score(SVC(),x,y,cv=5)

In [73]:
cross_score_svc

array([0.19796954, 0.40609137, 0.39593909, 0.30456853, 0.20408163])

In [74]:
np.mean(cross_score_svc)

0.3017300321143686

In [45]:
# Random forest calssifier

In [75]:
cross_score_rfc=cross_val_score(RandomForestClassifier(),x,y,cv=5)

In [76]:
cross_score_rfc

array([0.96954315, 0.92385787, 0.92385787, 0.95431472, 0.92346939])

In [77]:
np.mean(cross_score_rfc)

0.939008598363203

In [49]:
# Decision tree classifier

In [50]:
cross_score_dtc=cross_val_score( DecisionTreeClassifier(),x,y,cv=5)

In [51]:
cross_score_dtc

array([0.9035533 , 0.89340102, 0.9035533 , 0.9035533 , 0.5       ])

In [52]:
np.mean(cross_score_dtc)

0.8208121827411168

# Best Score By Random forest classifier 0.9631306976414817(96.31%)

In [123]:
cross_score_rfc=cross_val_score(RandomForestClassifier(),x,y,cv=5,scoring='precision')
cross_score_rfc

array([0.9893617 , 0.94565217, 0.97701149, 0.95918367, 0.94444444])

In [124]:
np.mean(cross_score_rfc)

0.9631306976414817