In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
from sklearn.linear_model import LogisticRegression

In [2]:
credit_card_df = pd.read_csv(r"datasets/creditcard.csv")

In [3]:
credit_card_df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [4]:
credit_card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
## checking the number of missin values in each column

credit_card_df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
## distribution of legit vs fraudalent 

credit_card_df['Class'].value_counts()

# we can see that the data is very unbalanced 

Class
0    284315
1       492
Name: count, dtype: int64

In [8]:
# separating the data for analysis 

legit = credit_card_df[credit_card_df['Class'] == 0]
fraud = credit_card_df[credit_card_df['Class'] == 1]

In [9]:
print(legit.shape , fraud.shape )

(284315, 31) (492, 31)


In [11]:
# statistical meaures about the data 

legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [12]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [13]:
# compare the values for both the transactions 

credit_card_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [14]:
## unsampling : Build a sample dataset from original dataset that contains similar distribution of normal and fraudlant txs


legit_sample = legit.sample(n = 492)

In [17]:
# concat two datasets 

new_df = pd.concat([legit_sample, fraud], axis = 0)

In [18]:
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
206339,136171.0,-1.284339,1.296447,-1.696099,-0.155749,3.180868,3.577883,0.349306,1.013207,-0.840606,...,0.227255,0.366359,-0.348902,0.72441,0.407965,-0.360741,-0.68004,0.142719,2.0,0
167514,118733.0,1.798773,-0.462379,-2.189639,0.261443,0.362789,-0.412364,0.10844,-0.051859,0.978738,...,-0.203813,-0.69354,-0.054896,-1.187571,-0.041865,-0.054998,-0.041393,-0.01839,138.7,0
213258,139180.0,2.067937,-0.189262,-1.649063,0.285905,0.470421,-0.27907,0.062121,-0.195786,0.555707,...,0.254354,0.840107,-0.082439,0.103036,0.275313,0.589988,-0.066516,-0.0625,28.0,0
121044,76052.0,1.082993,-0.00689,0.711442,0.938848,-0.59204,-0.521089,0.01672,-0.0896,0.294218,...,-0.138192,-0.222806,0.066565,0.662879,0.324867,0.259481,-0.006393,0.022817,42.46,0
5880,6426.0,1.121627,0.63258,0.54568,2.657725,0.323969,0.303616,0.061599,0.02552,0.334984,...,-0.123836,-0.002049,-0.078897,-0.030357,0.605469,0.061771,-0.03315,-0.007532,0.76,0


In [19]:
new_df['Class'].value_counts() # now we have a uniformly distributed data

Class
0    492
1    492
Name: count, dtype: int64

In [20]:
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93926.052846,0.046266,0.008524,0.013004,0.084472,-0.001691,-0.039763,0.045553,0.019477,0.133096,...,0.008361,0.018133,0.022146,-0.014216,-0.001111,0.020122,0.008777,-0.036959,-0.000195,85.211037
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [21]:
## Splitting the data into features and targets 

X = new_df.drop(columns = 'Class', axis = 1)
Y = new_df['Class']

In [24]:
## split data into training and testing data


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify =  Y,  random_state = 23)



In [26]:
model = LogisticRegression()

In [27]:
## training the model 

model.fit(X_train, Y_train)

In [30]:
## Accuracy Score 

X_train_pred = model.predict(X_train)
training_accuracy = accuracy_score(X_train_pred, Y_train)

In [31]:
training_accuracy

0.9148665819567979

In [33]:
X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(X_test_pred, Y_test)

In [34]:
test_accuracy 

0.9238578680203046