Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [None]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
95157,65189,1.142364,-1.090177,0.511068,-0.792356,-1.160044,0.134166,-0.997184,0.236721,-0.583865,...,0.399798,0.815983,-0.172587,-0.298962,0.310993,-0.077793,0.011223,0.019004,106.0,0.0
95158,65190,-2.145283,1.280406,0.014577,-2.003358,1.479294,4.673049,-2.008023,-2.933663,0.036717,...,0.276433,-0.038006,0.085076,1.04987,0.48957,1.045371,-0.363337,-0.222526,34.61,0.0
95159,65190,-3.715715,3.870511,-1.525809,0.082535,-0.244009,-0.901579,0.70883,0.070491,2.349423,...,-0.32718,0.573451,0.266379,0.040564,-0.175983,-0.49422,0.257349,-0.309196,0.89,0.0
95160,65190,-5.164795,4.510526,-0.994499,-1.110853,-0.913228,-0.889076,0.373572,0.361552,3.841062,...,-0.908623,-1.15421,0.300341,-0.102776,0.8178,0.201861,2.384092,1.576142,7.18,0.0
95161,65191,-1.430966,1.19267,1.237388,1.074059,-0.997949,0.687186,-1.04557,1.012203,0.095426,...,,,,,,,,,,


In [None]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95162 entries, 0 to 95161
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    95162 non-null  int64  
 1   V1      95162 non-null  float64
 2   V2      95162 non-null  float64
 3   V3      95162 non-null  float64
 4   V4      95162 non-null  float64
 5   V5      95162 non-null  float64
 6   V6      95162 non-null  float64
 7   V7      95162 non-null  float64
 8   V8      95162 non-null  float64
 9   V9      95162 non-null  float64
 10  V10     95162 non-null  float64
 11  V11     95162 non-null  float64
 12  V12     95162 non-null  float64
 13  V13     95161 non-null  float64
 14  V14     95161 non-null  float64
 15  V15     95161 non-null  float64
 16  V16     95161 non-null  float64
 17  V17     95161 non-null  float64
 18  V18     95161 non-null  float64
 19  V19     95161 non-null  float64
 20  V20     95161 non-null  float64
 21  V21     95161 non-null  float64
 22

In [None]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [None]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Class
0.0    94944
1.0      217
Name: count, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [None]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [None]:
print(legit.shape)
print(fraud.shape)

(94944, 31)
(217, 31)


In [None]:
# statistical measures of the data
legit.Amount.describe()

count    94944.000000
mean        98.685374
std        267.432993
min          0.000000
25%          7.600000
50%         26.720000
75%         89.752500
max      19656.530000
Name: Amount, dtype: float64

In [None]:
fraud.Amount.describe()

count     217.000000
mean      109.784286
std       243.927116
min         0.000000
25%         1.000000
50%         7.580000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [None]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,41232.78362,-0.25064,-0.047145,0.696491,0.15125,-0.270069,0.097914,-0.093292,0.048787,-0.032333,...,0.043401,-0.031938,-0.10734,-0.037071,0.009933,0.1319,0.026607,-0.000819,0.001332,98.685374
1.0,35870.354839,-6.16618,4.217965,-8.087819,4.99603,-4.444345,-1.845983,-6.41759,2.796291,-2.953535,...,0.354728,0.731128,-0.127758,-0.247689,-0.102596,0.20696,0.097699,0.532822,0.035472,109.784286


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [None]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [None]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [None]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
84540,60352,-1.569016,0.040286,1.021276,0.651293,0.844435,-0.406717,0.017999,-0.108295,-0.101639,...,-0.057581,0.132836,-0.715664,-0.334369,-0.61915,0.342485,-0.48042,-0.64079,3.68,0.0
63287,50647,1.255123,0.237755,0.270336,0.707936,-0.51219,-1.108213,-0.02167,-0.112626,0.275174,...,-0.306242,-0.962084,0.147729,0.296005,0.162588,0.100089,-0.033579,0.027365,1.98,0.0
12497,21889,-0.691914,0.885946,0.014271,-1.009578,2.540823,3.398745,0.021169,0.776846,0.906629,...,-0.411404,-1.079221,0.072916,0.919513,-0.053887,0.026872,-0.016976,0.104806,7.92,0.0
12379,21693,1.228059,-0.282108,1.296592,0.272377,-1.199034,-0.317642,-1.015511,0.050034,2.301372,...,0.018483,0.277405,-0.034361,0.018924,0.055892,1.098973,-0.063459,0.012085,17.95,0.0
78202,57418,-2.282505,-0.941511,0.687949,0.764339,1.445198,-1.179341,-0.294364,0.519555,-1.288334,...,-0.067209,-1.230794,-0.053944,-0.345393,0.144982,0.324801,-0.111275,-0.324734,60.9,0.0


In [None]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
92777,64093,-6.133987,2.941499,-5.593986,3.258845,-5.315512,-0.637328,-4.476488,1.695994,-1.606743,...,0.86834,0.793736,0.217347,-0.021985,0.145882,0.665088,-1.684186,0.310195,294.9,1.0
93424,64412,-1.348042,2.522821,-0.782432,4.083047,-0.66228,-0.598776,-1.943552,-0.329579,-1.853274,...,1.079871,-0.352026,-0.218358,0.125866,-0.07418,0.179116,0.61258,0.234206,1.0,1.0
93486,64443,1.079524,0.872988,-0.30385,2.755369,0.301688,-0.350284,-0.042848,0.246625,-0.779176,...,-0.023255,-0.158601,-0.038806,-0.060327,0.358339,0.076984,0.018936,0.060574,0.0,1.0
93788,64585,1.080433,0.962831,-0.278065,2.743318,0.412364,-0.320778,0.04129,0.17617,-0.966952,...,-0.008996,-0.057036,-0.053692,-0.026373,0.4003,0.072828,0.027043,0.063238,0.0,1.0
94218,64785,-8.744415,-3.420468,-4.850575,6.606846,-2.800546,0.105512,-3.269801,0.940378,-2.558691,...,0.102913,0.311626,-4.129195,0.034639,-1.133631,0.272265,1.841307,-1.796363,720.38,1.0


In [None]:
new_dataset['Class'].value_counts()

Class
0.0    492
1.0    217
Name: count, dtype: int64

In [None]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,41770.855691,-0.301809,0.000704,0.726625,0.154477,-0.31828,0.008547,-0.148499,-0.016585,-0.036082,...,0.047183,0.0336,-0.137914,-0.035166,0.007561,0.119621,0.032558,-0.010492,-0.019533,91.376545
1.0,35870.354839,-6.16618,4.217965,-8.087819,4.99603,-4.444345,-1.845983,-6.41759,2.796291,-2.953535,...,0.354728,0.731128,-0.127758,-0.247689,-0.102596,0.20696,0.097699,0.532822,0.035472,109.784286


Splitting the data into Features & Targets

In [None]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [None]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
84540  60352 -1.569016  0.040286  1.021276  0.651293  0.844435 -0.406717   
63287  50647  1.255123  0.237755  0.270336  0.707936 -0.512190 -1.108213   
12497  21889 -0.691914  0.885946  0.014271 -1.009578  2.540823  3.398745   
12379  21693  1.228059 -0.282108  1.296592  0.272377 -1.199034 -0.317642   
78202  57418 -2.282505 -0.941511  0.687949  0.764339  1.445198 -1.179341   
...      ...       ...       ...       ...       ...       ...       ...   
92777  64093 -6.133987  2.941499 -5.593986  3.258845 -5.315512 -0.637328   
93424  64412 -1.348042  2.522821 -0.782432  4.083047 -0.662280 -0.598776   
93486  64443  1.079524  0.872988 -0.303850  2.755369  0.301688 -0.350284   
93788  64585  1.080433  0.962831 -0.278065  2.743318  0.412364 -0.320778   
94218  64785 -8.744415 -3.420468 -4.850575  6.606846 -2.800546  0.105512   

             V7        V8        V9  ...       V20       V21       V22  \
84540  0.0179

In [None]:
print(Y)

84540    0.0
63287    0.0
12497    0.0
12379    0.0
78202    0.0
        ... 
92777    1.0
93424    1.0
93486    1.0
93788    1.0
94218    1.0
Name: Class, Length: 709, dtype: float64


Split the data into Training data & Testing Data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(709, 30) (567, 30) (142, 30)


Model Training

Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9611992945326279


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.98989898989899
