# Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

In [2]:
# Read the credit-card transactions CSV into a pandas DataFrame.
credit_card_data = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [3]:
# Preview the first five rows.
# The "Class" column is the label: 0 marks a legit transaction, 1 a fraud transaction.
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Getting dataset information: index range, column names, non-null counts and dtypes
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
# Count the missing values in each column of the dataset.
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
# How many legit (0) and fraud (1) transactions the dataset contains.
credit_card_data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

 # This dataset is highly unbalanced
     0 --> Normal Transaction
     1 --> Fraudulent Transaction

In [7]:
# Separate the transactions by label so each group can be analysed on its own.
legit = credit_card_data.loc[credit_card_data["Class"] == 0]
fraud = credit_card_data.loc[credit_card_data["Class"] == 1]

# Show the (rows, columns) shape of each subset, one per line.
print(legit.shape, fraud.shape, sep="\n")

(284315, 31)
(492, 31)


In [8]:
# Summary statistics of Amount for legit transactions
# ("count" is the number of data points in the column).
legit["Amount"].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [9]:
# Summary statistics of Amount for fraud transactions
# ("count" is the number of data points in the column).
fraud["Amount"].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [10]:
# Compare the two kinds of transaction:
# average every column separately for each value of "Class".
credit_card_data.groupby(by="Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Under-Sampling 
    Build a sample dataset containing a similar number of legit and fraud transactions.
    Number of fraud transactions is 492
    Number of legit transactions is 284315
    
    So, out of the 284315 legit transactions we will randomly choose 492 and join them with the fraud transactions, so that the result is a balanced dataset.

In [11]:
# Under-sample the legit transactions down to the number of fraud rows so that
# the two classes are balanced. The sample size is derived from `fraud` instead
# of being hard-coded, and random_state pins the draw so that a fresh
# Restart & Run All reproduces the same sample (and the same downstream scores).
legit_sample = legit.sample(n=len(fraud), random_state=2)

# Concatenating two DataFrames(fraud and legit_sample)

In [12]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concatenation).
new_dataset = pd.concat(objs=[legit_sample, fraud], axis="index")

In [13]:
# Preview the first five rows of the combined dataset.
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
207018,136486.0,-1.31568,1.481884,0.630002,-0.332323,0.894362,-0.258516,1.188969,0.301585,-1.245917,...,-0.048552,-0.334699,-0.452824,-0.489793,1.023682,-0.738499,-0.237366,-0.038201,31.5,0
254409,156720.0,-6.129419,3.451606,-2.297169,-3.239551,-4.830324,-0.506925,-4.517534,4.751357,-1.598064,...,-0.00036,-0.339857,0.206725,-0.512745,1.044491,-0.108628,0.00298,0.014316,10.0,0
8469,11349.0,-0.471906,0.411932,1.951918,1.482344,0.610217,0.02127,0.261656,-0.113203,1.408417,...,-0.078171,0.355041,-0.097925,0.036685,-0.326912,-0.313997,0.006723,-0.060625,9.99,0
108861,71111.0,-0.604466,0.967455,0.214171,-0.379811,2.635379,3.598706,0.274036,0.840084,-0.921064,...,-0.129367,-0.598811,-0.285599,0.977462,0.481176,-0.390036,-0.060853,0.078228,10.21,0
36794,38688.0,-2.566992,-3.029462,1.792735,-1.002348,1.418198,-1.337037,-0.420691,-0.004191,-1.151686,...,0.347768,0.009942,0.599463,-0.000988,0.75804,-0.3516,-0.304903,-0.116112,249.7,0


In [14]:
# Check the class balance of the combined dataset.
new_dataset.Class.value_counts()

0    492
1    492
Name: Class, dtype: int64

In [15]:
# Per-class column means of the combined dataset.
new_dataset.groupby(by="Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,92996.378049,-0.04742,0.082826,0.047576,-0.04397,0.036492,-0.127514,-0.032602,0.101253,0.007013,...,0.011381,-0.007879,-0.078645,0.020124,0.012153,-0.013973,0.013505,0.010032,0.001665,73.517012
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Splitting the data into features (x) and target (y); the target is either 0 or 1

In [16]:
# Features: every column except the label.
x = new_dataset.drop(columns=["Class"])
# Target: the 0/1 label column.
y = new_dataset.loc[:, "Class"]

In [17]:
# Plain-text dump of the feature frame (pandas truncates the middle rows/columns).
print(x)

            Time        V1        V2        V3        V4        V5        V6  \
207018  136486.0 -1.315680  1.481884  0.630002 -0.332323  0.894362 -0.258516   
254409  156720.0 -6.129419  3.451606 -2.297169 -3.239551 -4.830324 -0.506925   
8469     11349.0 -0.471906  0.411932  1.951918  1.482344  0.610217  0.021270   
108861   71111.0 -0.604466  0.967455  0.214171 -0.379811  2.635379  3.598706   
36794    38688.0 -2.566992 -3.029462  1.792735 -1.002348  1.418198 -1.337037   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [18]:
# Plain-text dump of the target labels, indexed by the original row numbers.
print(y)

207018    0
254409    0
8469      0
108861    0
36794     0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


# Splitting data into Training data and Testing data 

In [19]:
# Hold out 20% of the rows as test data (test_size=0.2). stratify=y is passed so
# the split is stratified on the label; random_state=2 fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [20]:
# Shapes of the full feature frame, the training part and the test part.
print(*(part.shape for part in (x, X_train, X_test)))

(984, 30) (787, 30) (197, 30)


# Model Training 
    LogisticRegression

In [21]:
# Logistic-regression classifier using the lbfgs solver, capped at 10000 iterations.
model = LogisticRegression(
    max_iter=10000,
    solver="lbfgs",
)

In [22]:
# Fit the logistic-regression model on the training split.
model.fit(X=X_train, y=Y_train)

# Model Evaluation
    Evaluating the model on the basis of accuracy score

In [23]:
# Accuracy on training data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): the ground-truth
# labels go first and the predictions second. The value is the same either way
# for plain accuracy, but keeping the documented order avoids a silent bug if
# this call is later swapped for an asymmetric metric (precision, recall, ...).
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [24]:
# Report the fraction of training rows that were classified correctly.
print(f"Accuracy of training data: {training_data_accuracy}")

Accuracy of training data: 0.9199491740787802


In [25]:
# Accuracy on test data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): the ground-truth
# labels go first and the predictions second. The value is the same either way
# for plain accuracy, but keeping the documented order avoids a silent bug if
# this call is later swapped for an asymmetric metric (precision, recall, ...).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [26]:
# Report the fraction of held-out test rows that were classified correctly.
print(f"Accuracy of test data: {test_data_accuracy}")

Accuracy of test data: 0.8984771573604061
