In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/credit_card_fraud_sample.csv')

In [None]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Time,Class
0,-4.025171,-1.789863,2.427788,0.173762,-1.901035,0.717276,-0.618354,0.44684,-0.471796,0.013293,...,1.110627,-2.484538,-1.714656,-0.771708,0.074941,0.180279,-9.369892,1.396911,0.361585,0
1,-3.463944,0.083512,1.930428,-0.219564,1.086545,-4.577378,-0.407571,-1.306154,1.065232,-0.061674,...,1.413997,-1.268562,1.058417,-0.888749,0.397237,-0.054604,2.340949,0.243641,-1.478434,0
2,0.225846,-0.541326,0.861232,-1.603556,-2.157069,-0.277504,-0.935832,2.062546,-0.832419,0.202897,...,0.232709,-3.336243,1.996796,0.094667,-0.685501,1.104848,1.266388,0.159934,1.776206,0
3,-2.810813,0.760931,2.427816,0.35927,-0.185584,-4.597746,-1.211505,-2.333815,3.173247,0.582476,...,-0.907497,-0.093771,3.112598,-0.172112,1.698012,-0.957238,8.606992,0.977369,-0.792913,0
4,-7.068379,-2.019129,8.473799,-0.050852,-5.15866,0.551342,-1.214344,-3.608522,-1.742245,-1.028028,...,-1.211488,-2.766472,-4.282507,-0.311129,0.321671,-2.067151,-3.054716,0.246565,-0.613012,0


In [None]:
credit_card_data.tail()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Time,Class
995,-5.18866,1.379388,0.661978,1.118036,-0.88638,-4.751515,1.008076,1.134145,-2.821727,1.298147,...,-1.131687,-3.097334,0.975415,-1.010093,-2.434994,-0.280795,0.813311,-0.601503,1.654653,0
996,-1.953855,-2.318438,0.318608,1.386975,0.764438,2.783485,-1.212583,0.767413,-1.959928,-2.033919,...,-0.972431,-0.152123,-1.849324,-2.052834,0.694765,-0.173118,-6.890609,-1.588985,0.205321,0
997,0.050191,0.384805,-1.974514,-1.15997,0.815158,0.561425,0.995922,0.60323,1.043986,-0.39336,...,1.060505,0.404062,-1.312732,0.03886,0.005088,0.300038,-8.683407,0.036225,-0.759805,0
998,6.87772,0.436903,-0.642282,0.152688,-1.330156,2.39042,-1.100935,2.020367,3.232675,0.755927,...,0.332456,-1.46162,1.729016,0.449057,1.55121,0.813397,0.804508,-0.758064,1.882146,0
999,0.632724,-1.507493,-5.198782,2.843156,7.743735,-1.717012,1.042827,0.199417,-3.434194,-0.432969,...,-0.96273,2.158822,-1.830422,-1.231303,-0.981398,-1.732651,-5.430321,0.006024,-8.031682,0


In [None]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      1000 non-null   float64
 1   V2      1000 non-null   float64
 2   V3      1000 non-null   float64
 3   V4      1000 non-null   float64
 4   V5      1000 non-null   float64
 5   V6      1000 non-null   float64
 6   V7      1000 non-null   float64
 7   V8      1000 non-null   float64
 8   V9      1000 non-null   float64
 9   V10     1000 non-null   float64
 10  V11     1000 non-null   float64
 11  V12     1000 non-null   float64
 12  V13     1000 non-null   float64
 13  V14     1000 non-null   float64
 14  V15     1000 non-null   float64
 15  V16     1000 non-null   float64
 16  V17     1000 non-null   float64
 17  V18     1000 non-null   float64
 18  V19     1000 non-null   float64
 19  V20     1000 non-null   float64
 20  V21     1000 non-null   float64
 21  V22     1000 non-null   float64
 22  V

In [None]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0
V10,0


In [None]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,944
1,56


In [None]:
# Separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [None]:
print(legit.shape)
print(fraud.shape)

(944, 31)
(56, 31)


In [None]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,944.0
mean,0.073679
std,0.997348
min,-3.310843
25%,-0.609895
50%,0.076044
75%,0.781907
max,3.336025


In [None]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,56.0
mean,-0.132474
std,0.982211
min,-1.885053
25%,-0.907385
50%,-0.056689
75%,0.55598
max,2.433999


In [None]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Time
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.968774,-1.061957,0.530724,0.047286,0.324307,-0.601488,-0.077787,-0.04176,-0.206678,0.011428,...,0.000836,-0.021886,-0.919474,-0.02466,0.006799,0.068877,0.015894,-1.846844,0.073679,-1.668256
1,-0.192738,-0.657719,0.305786,0.067369,0.578711,0.179242,-0.108511,-0.885237,0.009459,0.185347,...,0.362114,0.068714,-0.052465,0.863252,-0.059624,0.011665,-0.026501,2.251361,-0.132474,-0.622025


In [None]:
legit_sample = legit.sample(n=492)

In [None]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [None]:
new_dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Time,Class
336,-1.717748,0.280937,1.283948,0.712021,-0.824677,-1.031527,0.522621,1.028578,-1.219653,0.291609,...,1.180734,-1.913181,-1.642135,-0.567731,0.498852,-0.900204,-4.048626,-0.283375,0.274192,0
96,-1.362297,-0.67762,0.219617,-0.714556,1.833315,0.820354,0.611577,-2.192311,2.75915,-3.097827,...,0.277358,2.120319,0.777857,0.917092,2.098681,-0.575238,0.089663,0.473769,-0.099111,0
91,-4.0082,-1.033859,0.434986,1.322416,-1.091756,-0.536556,1.284261,0.748625,-3.205959,0.582184,...,1.059558,-1.597062,-0.603746,0.555694,-0.476307,0.005811,-3.603176,-0.41714,-1.509114,0
654,-3.14483,-1.263134,-4.793897,0.049754,8.46384,-1.863417,-1.106,-2.225338,-5.589509,0.297727,...,0.350121,2.892945,-1.830993,-2.181132,-2.489589,0.510381,-2.374879,-0.478317,-7.561176,0
317,-3.664361,-0.197562,1.058922,1.605324,2.243974,-4.472077,-1.696089,-0.710521,-0.532973,-0.028185,...,0.438655,-1.100614,1.194592,-0.17677,-0.212787,0.597662,3.11168,0.818865,-1.100448,0


In [None]:
new_dataset.tail()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Time,Class
873,3.384335,1.534913,-3.881675,-0.906778,-2.628545,5.590773,-0.766526,3.139153,-4.857473,0.88361,...,-0.780214,2.561063,-1.741461,-2.308451,-1.408255,-0.226786,0.056449,-0.305258,3.502655,1
875,-2.076337,-2.880864,1.521386,1.078884,3.041401,0.855927,0.017792,-1.834056,2.275413,0.969913,...,-0.447655,1.137225,1.618287,-0.557489,3.481549,0.029117,1.337929,-0.391386,-0.83195,1
904,-0.169626,-3.726348,3.364342,0.287565,-2.616847,2.67592,0.104104,1.019265,4.259941,0.878658,...,1.325872,-1.96982,1.492764,-0.245901,2.471934,-2.166559,-5.379905,-1.117821,1.749828,1
935,8.603419,-1.161886,-1.048958,0.653328,4.301263,2.919052,-0.524714,-0.547777,2.590773,1.377065,...,0.611234,-1.280769,2.796256,1.007487,1.968785,-0.470848,6.207414,-0.670642,-0.652638,1
956,-1.857365,0.391969,-4.117848,1.042696,0.949185,-0.351123,-1.437924,0.015652,-4.89523,0.897331,...,-1.168556,1.536946,0.056087,-0.352114,-5.250347,-0.464351,0.269494,0.594365,-1.86608,1


In [None]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,56


In [None]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Time
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.972671,-1.056736,0.801805,0.02506,0.116227,-0.458887,-0.128399,-0.132885,0.029674,0.051329,...,-0.097315,-0.01969,-0.903889,-0.123404,0.029068,0.249341,0.06084,-1.923944,0.115155,-1.464324
1,-0.192738,-0.657719,0.305786,0.067369,0.578711,0.179242,-0.108511,-0.885237,0.009459,0.185347,...,0.362114,0.068714,-0.052465,0.863252,-0.059624,0.011665,-0.026501,2.251361,-0.132474,-0.622025


Splitting the data into Features & Targets




In [None]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [None]:
print(X)

           V1        V2        V3        V4        V5        V6        V7  \
336 -1.717748  0.280937  1.283948  0.712021 -0.824677 -1.031527  0.522621   
96  -1.362297 -0.677620  0.219617 -0.714556  1.833315  0.820354  0.611577   
91  -4.008200 -1.033859  0.434986  1.322416 -1.091756 -0.536556  1.284261   
654 -3.144830 -1.263134 -4.793897  0.049754  8.463840 -1.863417 -1.106000   
317 -3.664361 -0.197562  1.058922  1.605324  2.243974 -4.472077 -1.696089   
..        ...       ...       ...       ...       ...       ...       ...   
873  3.384335  1.534913 -3.881675 -0.906778 -2.628545  5.590773 -0.766526   
875 -2.076337 -2.880864  1.521386  1.078884  3.041401  0.855927  0.017792   
904 -0.169626 -3.726348  3.364342  0.287565 -2.616847  2.675920  0.104104   
935  8.603419 -1.161886 -1.048958  0.653328  4.301263  2.919052 -0.524714   
956 -1.857365  0.391969 -4.117848  1.042696  0.949185 -0.351123 -1.437924   

           V8        V9       V10  ...       V21       V22       V23  \
336

In [None]:
print(Y)

336    0
96     0
91     0
654    0
317    0
      ..
873    1
875    1
904    1
935    1
956    1
Name: Class, Length: 548, dtype: int64


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(548, 30) (438, 30) (110, 30)


In [None]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.952054794520548


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9454545454545454
