In [67]:
#importing the dependencies

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score

In [68]:
#loading the dataset

credit_card_data = pd.read_csv('creditcard.csv')
credit_card_data.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
126312,77981.0,-0.830637,0.306038,2.468645,-2.109545,-0.340487,0.028122,0.231455,0.23495,1.309931,...,0.003024,0.396253,-0.37187,-0.000803,0.406142,-0.769502,0.196109,-0.101067,1.0,0
274047,165829.0,2.064018,0.145645,-1.696072,0.416422,0.430365,-0.811077,0.16815,-0.209255,0.49891,...,-0.357668,-0.921165,0.349904,0.507099,-0.278218,0.17547,-0.059883,-0.030848,1.98,0
195341,131010.0,1.663602,-0.165802,-3.118227,0.90095,0.439905,-1.892352,0.754065,-0.422537,0.461059,...,-0.026306,-0.481687,-0.153125,-0.371457,-0.03466,0.687134,-0.115967,0.029973,220.0,0
128905,78883.0,-0.302235,-3.649805,-0.338804,-0.135715,-1.522761,1.547469,-0.168734,0.182435,-0.184581,...,0.514354,-0.264735,-0.90263,-1.259872,0.158173,-0.23979,-0.094647,0.147521,865.58,0
10039,15160.0,-1.330549,0.245955,0.700337,-2.558681,1.103394,4.562194,-1.817631,-1.664204,0.175746,...,-1.120054,0.9335,-0.086147,0.965919,0.559777,-0.176092,-0.073279,0.071905,79.0,0


In [69]:
#information about the credit card

credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [70]:
#checking the number of missing values in each column

credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


Here we can see that no missing value is present in the dataset

In [71]:
#checking the distribution of legit and fraudulent transactions

credit_card_data.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


Here we can see that the data is highly imbalanced.
Class=0 represents normal transaction.
Class=1 represents fraudulent transaction.

In [72]:
#separating the data for further analysis

legit = credit_card_data[credit_card_data.Class==0]
fraud = credit_card_data[credit_card_data.Class==1]


In [73]:
#getting the shape of legit and fraud

print(legit.shape)
print(fraud.shape)
#

(284315, 31)
(492, 31)


In [74]:
#getting the statistical measure of the legit data

legit.Amount.describe()

Unnamed: 0,Amount
count,284315.0
mean,88.291022
std,250.105092
min,0.0
25%,5.65
50%,22.0
75%,77.05
max,25691.16


In [75]:
#getting the statistical measure of the fraud data

fraud.Amount.describe()

Unnamed: 0,Amount
count,492.0
mean,122.211321
std,256.683288
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


In [76]:
#compare the values for both transactions

credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


**Under_Sampling**

we will build a dataset containing similar distribution of normal and fraudulent transactions.

since number of fraudelent tranactions is 492 so to make the data relatively balanced we will take a sample of 492 from legit data.

In [77]:
legit_sample = legit.sample(n=492)
legit_sample

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
23555,32819.0,1.162978,-1.097519,0.905584,-0.651494,-1.455128,0.051269,-1.066579,0.002200,-0.322047,...,-0.136549,-0.086390,-0.143488,-0.381944,0.064101,1.151312,-0.017963,0.041565,125.90,0
202180,134205.0,2.005550,-1.897794,-0.858712,-1.337728,-1.716429,-0.895968,-0.986008,-0.166685,-0.876317,...,-0.587851,-1.631566,0.427098,-0.247566,-0.723566,-0.665722,-0.018225,-0.019060,159.99,0
179722,124213.0,-1.449516,0.592550,1.593578,-3.506733,-0.482206,-0.158185,-0.202943,0.770412,1.230895,...,0.097237,0.200519,-0.323754,0.698879,0.612312,-0.432227,0.016662,0.047445,3.59,0
103489,68681.0,0.942666,0.289313,0.229888,2.341587,0.373488,0.560804,0.211232,0.207515,-1.145602,...,0.099610,0.115948,-0.063724,-0.304112,0.393170,0.042133,-0.013933,0.010544,75.66,0
140663,83856.0,1.454575,-0.410600,0.138114,-1.070725,-0.382569,-0.032418,-0.596144,-0.043026,-1.139737,...,-0.108758,-0.396151,-0.032819,-0.831219,0.398099,-0.429745,0.013618,0.002530,4.78,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244983,152557.0,-0.091093,-0.005060,0.394352,-0.729028,0.107341,-1.060980,0.418767,-0.279560,-1.560870,...,0.000083,0.445276,-0.055729,0.022249,-0.556564,0.888555,0.130420,0.184503,30.00,0
16772,28132.0,-0.342052,1.086670,1.162506,-0.276683,0.498421,-0.298662,0.750366,-0.055371,-0.467933,...,-0.283012,-0.703561,-0.155844,-0.616638,-0.028060,0.130448,0.269842,0.112834,3.59,0
230469,146343.0,-0.559083,0.507117,0.566686,-0.487982,0.920591,-1.015388,0.616024,-0.098308,-0.196572,...,0.106033,0.256620,0.053738,1.106678,-0.535106,0.417503,0.108978,0.220924,27.17,0
236275,148740.0,2.001936,-0.262784,-0.900215,0.030312,-0.219944,-0.639003,-0.202200,-0.013555,0.467454,...,-0.430138,-1.353787,0.512674,-0.476655,-0.764250,0.029383,-0.077611,-0.062928,12.99,0


In [78]:
#concatinatimg legit_sample data and Fraud data

new_dataset=pd.concat([legit_sample,fraud],axis=0)
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
23555,32819.0,1.162978,-1.097519,0.905584,-0.651494,-1.455128,0.051269,-1.066579,0.0022,-0.322047,...,-0.136549,-0.08639,-0.143488,-0.381944,0.064101,1.151312,-0.017963,0.041565,125.9,0
202180,134205.0,2.00555,-1.897794,-0.858712,-1.337728,-1.716429,-0.895968,-0.986008,-0.166685,-0.876317,...,-0.587851,-1.631566,0.427098,-0.247566,-0.723566,-0.665722,-0.018225,-0.01906,159.99,0
179722,124213.0,-1.449516,0.59255,1.593578,-3.506733,-0.482206,-0.158185,-0.202943,0.770412,1.230895,...,0.097237,0.200519,-0.323754,0.698879,0.612312,-0.432227,0.016662,0.047445,3.59,0
103489,68681.0,0.942666,0.289313,0.229888,2.341587,0.373488,0.560804,0.211232,0.207515,-1.145602,...,0.09961,0.115948,-0.063724,-0.304112,0.39317,0.042133,-0.013933,0.010544,75.66,0
140663,83856.0,1.454575,-0.4106,0.138114,-1.070725,-0.382569,-0.032418,-0.596144,-0.043026,-1.139737,...,-0.108758,-0.396151,-0.032819,-0.831219,0.398099,-0.429745,0.013618,0.00253,4.78,0


In [79]:
new_dataset.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [80]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,96756.522358,0.110279,-0.061551,0.008847,0.082338,-0.008927,0.076361,0.010227,0.041754,0.015212,...,-0.026795,-0.028264,-0.005968,-0.05123,-0.002401,0.013901,0.000259,0.004789,-0.026978,89.824939
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Splitting the data into Features and Targets

In [81]:
X=new_dataset.drop(columns='Class',axis=1)
Y=new_dataset['Class']

In [82]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
23555    32819.0  1.162978 -1.097519  0.905584 -0.651494 -1.455128  0.051269   
202180  134205.0  2.005550 -1.897794 -0.858712 -1.337728 -1.716429 -0.895968   
179722  124213.0 -1.449516  0.592550  1.593578 -3.506733 -0.482206 -0.158185   
103489   68681.0  0.942666  0.289313  0.229888  2.341587  0.373488  0.560804   
140663   83856.0  1.454575 -0.410600  0.138114 -1.070725 -0.382569 -0.032418   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [83]:
print(Y)

23555     0
202180    0
179722    0
103489    0
140663    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


In [84]:
#spliting the data into Training data and testing data

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=10)

In [85]:
print(X.shape,X_train.shape,X_test.shape)

(984, 30) (787, 30) (197, 30)


In [86]:
#training the model

lr=LogisticRegression()

In [87]:
#training the logistic regression model with training data

lr.fit(X_train,Y_train)


**Model Evaluation**

In [88]:
#checking the accuracy score on training data

X_train_prediction=lr.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)
print('the accuracy score for training data is:',training_data_accuracy)



the accuracy score for training data is: 0.9466327827191868


In [89]:
#checking the accuracy score on testing data

X_test_prediction=lr.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)
print('the accuracy score for testing data is:',testing_data_accuracy)

the accuracy score for testing data is: 0.9289340101522843


In [90]:
  #checking the precision score on training data

  X_train_prediction=lr.predict(X_train)
  training_data_precision=precision_score(X_train_prediction,Y_train)
  print('the precision score for training data is:',training_data_precision)

  #checking the precision score on testing data

  X_test_prediction=lr.predict(X_test)
  testing_data_precision=precision_score(X_test_prediction,Y_test)
  print('the precision score for testing data is:',testing_data_precision)


the precision score for training data is: 0.9185750636132316
the precision score for testing data is: 0.898989898989899


**Conclusion**

By applying Logistic Regression on the Credit card Dataset we have achieved a significant accuracy score of 0.94 on both training data and testing data and have also achieved precision score of 0.92 on both training data and testing data which means our model has performed very well on both set of data and providing significantly good results.