In [1]:
# Importing all the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Loading the dataset into Pandas Dataframe
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [4]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29799 entries, 0 to 29798
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    29799 non-null  int64  
 1   V1      29799 non-null  float64
 2   V2      29799 non-null  float64
 3   V3      29799 non-null  float64
 4   V4      29799 non-null  float64
 5   V5      29799 non-null  float64
 6   V6      29798 non-null  float64
 7   V7      29798 non-null  float64
 8   V8      29798 non-null  float64
 9   V9      29798 non-null  float64
 10  V10     29798 non-null  float64
 11  V11     29798 non-null  float64
 12  V12     29798 non-null  float64
 13  V13     29798 non-null  float64
 14  V14     29798 non-null  float64
 15  V15     29798 non-null  float64
 16  V16     29798 non-null  float64
 17  V17     29798 non-null  float64
 18  V18     29798 non-null  float64
 19  V19     29798 non-null  float64
 20  V20     29798 non-null  float64
 21  V21     29798 non-null  float64
 22

In [6]:
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [7]:
# Since the last row contains all the missing values lets drop the last row
last_row = len(credit_card_data)-1
credit_card_data = credit_card_data.drop(credit_card_data.index[last_row])

In [8]:
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [9]:
credit_card_data['Class'].value_counts()

0.0    29704
1.0       94
Name: Class, dtype: int64

This Upper dataset is highly unbalanced and hence we cannot use it to train our data. Thus we will make a balanced dataset using 2*94 Legit transactions to train our data well.


In [10]:
# Fraud Transactions --> 94
# Legit Transactions ---> 29704
# So our data will have 94*2 = 188 Real Transactions.

In [11]:
# Separating Data for Analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [12]:
print(legit.shape)
print(fraud.shape)

(29704, 31)
(94, 31)


In [13]:
legit.Amount.describe() # Analysing the data

count    29704.000000
mean        79.570030
std        221.991154
min          0.000000
25%          6.637500
50%         20.000000
75%         70.652500
max       7879.420000
Name: Amount, dtype: float64

In [14]:
fraud.Amount.describe() # Analysing the data

count      94.000000
mean       95.590000
std       257.920621
min         0.000000
25%         1.000000
50%         1.050000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [15]:
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0.0,21422.566422,-0.184395,0.106639,0.759413,0.19469,-0.186422,0.09679,-0.096841,0.018203,0.361273,-0.094014,0.496304,-0.653937,0.393733,0.370569,0.056312,0.017991,0.225218,-0.072521,-0.041748,0.044,-0.035795,-0.122933,-0.041017,0.010051,0.131937,0.022275,0.009773,0.004475,79.57003
1.0,19007.702128,-8.099702,6.084984,-11.565958,6.014185,-5.681925,-2.370349,-7.912202,4.043743,-2.891421,-6.781438,5.689416,-8.522727,0.34964,-8.396927,-0.087243,-4.782071,-7.672807,-2.707301,0.296538,0.679513,0.573983,-0.380899,-0.338752,-0.258204,0.352443,0.183181,0.830627,0.101053,95.59


Sub-Sampling to Extract 188 datasets from legit dataset

In [16]:
legit_sample = legit.sample(n=188)

In [17]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
21272,31567,1.192521,0.19854,0.147221,0.45695,-0.025364,-0.105565,-0.145185,0.140084,-0.109294,-0.138314,1.448225,0.552219,-0.475585,0.048025,0.741297,0.509069,-0.059707,-0.065168,-0.139044,-0.132328,-0.237295,-0.717292,0.113491,-0.366292,0.134795,0.129203,-0.015154,0.012529,1.98,0.0
26690,34208,-5.608118,3.511658,-4.307411,1.230513,-2.985626,-1.834037,-2.160156,3.618523,-0.378909,0.621724,-1.749006,1.236016,0.065338,2.93738,0.607764,0.526425,1.323178,0.149092,0.066585,0.071061,0.243561,0.117257,-0.083839,0.401397,-0.252792,-0.393853,0.175738,0.021282,1.0,0.0
16668,28033,-0.333875,1.126726,1.146031,-0.019499,0.240197,-0.766541,0.74664,-0.077241,-0.441546,-0.528497,-0.454163,0.047163,0.643434,-0.541498,0.870339,0.466485,-0.14866,-0.075455,0.039865,0.17077,-0.277675,-0.718734,-0.048942,-0.014143,-0.094162,0.092438,0.24575,0.096311,8.91,0.0
28224,34908,1.277366,0.202672,0.19503,0.30722,0.008903,-0.187924,-0.038604,-0.023057,-0.20508,0.095579,0.796616,0.950829,0.686624,0.324117,0.35755,0.725567,-0.992093,0.184339,0.480308,-0.03707,-0.256995,-0.758238,0.018815,-0.493738,0.308504,0.127441,-0.03177,0.000988,0.99,0.0
22605,32335,-2.168388,-0.662924,2.708657,0.706417,-0.988196,1.894088,0.115434,-0.037827,0.878056,2.307951,0.342246,0.482775,0.002926,-2.020925,-1.556832,-2.790536,0.518394,1.304433,0.647817,0.151221,-0.867401,-0.120097,0.079978,-0.315882,0.512795,-0.090372,0.350325,0.167913,150.0,0.0


In [19]:
new_dataset.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0,282.0
mean,21407.687943,-2.919526,2.170727,-3.434315,2.169884,-1.979723,-0.65236,-2.572736,1.313017,-0.769512,-2.2875,2.21218,-3.254457,0.356036,-2.577538,0.023843,-1.561702,-2.42598,-0.85782,0.108182,0.286806,0.205383,-0.240822,-0.119382,-0.06486,0.207448,0.076863,0.311804,0.0301,103.170745,0.333333
std,10999.100808,6.861019,4.154179,8.186578,3.409086,5.031321,1.995767,5.806627,4.3258,2.204024,4.115283,3.184348,4.69583,1.075833,5.009404,0.89894,3.159471,5.285347,2.135976,1.036332,0.72,1.124131,0.774506,1.505013,0.586216,0.815412,0.475629,0.79113,0.423072,268.819239,0.472243
min,50.0,-30.55238,-13.449684,-31.103685,-3.599023,-22.105532,-4.977692,-21.922811,-11.07798,-7.175097,-14.166795,-1.851277,-17.769143,-2.759266,-19.214325,-2.491353,-12.227189,-18.587366,-8.061208,-3.28615,-3.042626,-2.475962,-2.858266,-19.254328,-1.824841,-4.781606,-1.069201,-3.90808,-1.86929,0.0,0.0
25%,11144.5,-2.169544,-0.017767,-3.905338,-0.059942,-1.550404,-1.700779,-1.707327,-0.144903,-1.749776,-3.634272,0.005439,-5.660584,-0.33952,-5.307791,-0.434072,-2.305393,-2.81632,-1.235838,-0.466534,-0.121714,-0.237356,-0.64772,-0.274741,-0.370735,-0.170301,-0.249072,-0.050011,-0.010062,1.4975,0.0
50%,25242.5,-0.725148,0.722582,0.19453,1.153264,-0.39337,-0.488264,-0.19399,0.125055,-0.247669,-0.451439,1.122282,-1.678229,0.344535,-0.182249,-0.005188,-0.240025,-0.210023,-0.152908,0.177342,0.108464,-0.030672,-0.199457,-0.05846,0.013747,0.177113,-0.008,0.05663,0.035287,15.0,0.0
75%,30822.25,1.024974,2.795343,1.082451,3.710932,0.32121,0.301972,0.422631,0.674678,0.706679,0.107691,3.737438,0.176955,1.112912,0.579769,0.576855,0.393836,0.407708,0.394095,0.728658,0.512361,0.479427,0.254883,0.153815,0.346538,0.58999,0.418949,0.529865,0.209549,84.4825,1.0
max,35585.0,1.616392,16.713389,4.029428,11.927512,4.362777,10.033923,8.571717,20.007208,4.065266,8.453344,12.018913,1.662002,3.018041,2.93738,2.40646,2.798983,6.739384,3.042493,3.166999,2.829562,8.852711,1.91527,10.47577,1.074964,2.419792,1.491184,3.052358,1.108933,2126.13,1.0


In [20]:
new_dataset['Class'].value_counts()

0.0    188
1.0     94
Name: Class, dtype: int64

In [21]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0.0,22607.680851,-0.329438,0.213598,0.631506,0.247733,-0.128621,0.206634,0.096997,-0.052346,0.291443,-0.040531,0.473562,-0.620322,0.359234,0.332156,0.079386,0.048482,0.197434,0.066921,0.014004,0.090453,0.021083,-0.170783,-0.009697,0.031813,0.13495,0.023704,0.052393,-0.005376,106.961117
1.0,19007.702128,-8.099702,6.084984,-11.565958,6.014185,-5.681925,-2.370349,-7.912202,4.043743,-2.891421,-6.781438,5.689416,-8.522727,0.34964,-8.396927,-0.087243,-4.782071,-7.672807,-2.707301,0.296538,0.679513,0.573983,-0.380899,-0.338752,-0.258204,0.352443,0.183181,0.830627,0.101053,95.59


In [22]:
# Splitting data into Features and Targets
X = new_dataset.drop(columns = 'Class', axis = 1)
Y = new_dataset['Class']


In [23]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
21272,31567,1.192521,0.198540,0.147221,0.456950,-0.025364,-0.105565,-0.145185,0.140084,-0.109294,-0.138314,1.448225,0.552219,-0.475585,0.048025,0.741297,0.509069,-0.059707,-0.065168,-0.139044,-0.132328,-0.237295,-0.717292,0.113491,-0.366292,0.134795,0.129203,-0.015154,0.012529,1.98
26690,34208,-5.608118,3.511658,-4.307411,1.230513,-2.985626,-1.834037,-2.160156,3.618523,-0.378909,0.621724,-1.749006,1.236016,0.065338,2.937380,0.607764,0.526425,1.323178,0.149092,0.066585,0.071061,0.243561,0.117257,-0.083839,0.401397,-0.252792,-0.393853,0.175738,0.021282,1.00
16668,28033,-0.333875,1.126726,1.146031,-0.019499,0.240197,-0.766541,0.746640,-0.077241,-0.441546,-0.528497,-0.454163,0.047163,0.643434,-0.541498,0.870339,0.466485,-0.148660,-0.075455,0.039865,0.170770,-0.277675,-0.718734,-0.048942,-0.014143,-0.094162,0.092438,0.245750,0.096311,8.91
28224,34908,1.277366,0.202672,0.195030,0.307220,0.008903,-0.187924,-0.038604,-0.023057,-0.205080,0.095579,0.796616,0.950829,0.686624,0.324117,0.357550,0.725567,-0.992093,0.184339,0.480308,-0.037070,-0.256995,-0.758238,0.018815,-0.493738,0.308504,0.127441,-0.031770,0.000988,0.99
22605,32335,-2.168388,-0.662924,2.708657,0.706417,-0.988196,1.894088,0.115434,-0.037827,0.878056,2.307951,0.342246,0.482775,0.002926,-2.020925,-1.556832,-2.790536,0.518394,1.304433,0.647817,0.151221,-0.867401,-0.120097,0.079978,-0.315882,0.512795,-0.090372,0.350325,0.167913,150.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27362,34521,1.081234,0.416414,0.862919,2.520863,-0.005021,0.563341,-0.123372,0.223122,-0.673598,0.644550,1.120505,1.237773,0.342362,0.038372,-1.194503,0.085406,-0.219143,-0.694259,-0.655270,-0.165249,-0.159387,-0.305154,0.053620,0.011761,0.375146,-0.106299,0.021008,0.010559,1.52
27627,34634,0.333499,1.699873,-2.596561,3.643945,-0.585068,-0.654659,-2.275789,0.675229,-2.042416,-2.834871,2.802005,-4.392732,-1.369671,-5.327287,0.163159,-1.359389,-5.095183,-0.338013,-0.124138,0.329342,0.469212,-0.144363,-0.317981,-0.769644,0.807855,0.228164,0.551002,0.305473,18.96
27738,34684,-2.439237,2.591458,-2.840126,1.286244,-1.777016,-1.436139,-2.206056,-2.282725,-0.292885,-3.717450,3.907399,-7.220004,-1.211739,-9.657627,0.927518,-4.738662,-9.276636,-3.081961,0.177746,0.513530,1.774460,-0.771390,0.065727,0.103916,-0.057578,0.242652,-0.268649,-0.743713,125.30
27749,34687,-0.860827,3.131790,-5.052968,5.420941,-2.494141,-1.811287,-5.479117,1.189472,-3.908206,-7.060746,4.729974,-8.629054,1.178798,-11.182063,0.445243,-6.532982,-13.389251,-4.480413,0.432054,1.085760,1.192694,0.090356,-0.341881,-0.215924,1.053032,0.271139,1.373300,0.691195,19.02


In [24]:
Y

21272    0.0
26690    0.0
16668    0.0
28224    0.0
22605    0.0
        ... 
27362    1.0
27627    1.0
27738    1.0
27749    1.0
29687    1.0
Name: Class, Length: 282, dtype: float64

In [25]:
# Test Train Splitting 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y , random_state=2)

In [26]:
print(X.shape, X_train.shape, X_test.shape)

(282, 30) (225, 30) (57, 30)


Training The Data : MODEL TRAINING


In [27]:
model = LogisticRegression()

In [28]:
model.fit(X_train, Y_train) # This is training our model with test data


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Model Evaluation : ACCURACY SCORE

In [29]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
traing_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [30]:
print("The Training Data Accuracy is :",traing_data_accuracy)

The Training Data Accuracy is : 0.9733333333333334


In [31]:
# Accuracy Check on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [32]:
print("The Test Data Accuracy is :",test_data_accuracy)

The Test Data Accuracy is : 0.9649122807017544
