### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

 ### Data collection and pre-processing

In [3]:
#loading the dataset to a pandas dataframe
credit_card_data = pd.read_csv("creditcard.csv")

In [7]:
# print first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [8]:
# checking the number of datapoints and features (rows & columns respectively)
credit_card_data.shape

(284807, 31)

In [9]:
# getting information about the dataset
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [10]:
# check for the missing values
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [12]:
# distribution of the legit and fraudulent transaction
credit_card_data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

##### NOTE: The dataset is highly unbalanced. There are more legit transaction than fraudulent transaction.

0 --> Legit transaction

1 --> fraudulent transaction

In [13]:
# seperate the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [14]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [15]:
# statistical measures of the legit data
legit['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [16]:
# statistical measures of the fraud data
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [17]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


### How to deal with the unbalanced data?
1. Undersampling: Randomly delete examples in the majority class.

In our case, legit transactions are more than the fraudulent transactions. So, we randomly delete the legit transactions to match the number of samples corresponding to fraudulent transactions.

2. Oversampling: Randomly duplicate examples in the minority class.

In [49]:
# Randomly sample 492 transaction from legit transaction data
legit_samples = legit.sample(n=492)
legit_samples.shape

(492, 31)

In [50]:
# concatenate the sampled legit data with the fraud data
processed_dataset = pd.concat([legit_samples,fraud], axis=0)

In [51]:
processed_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
251173,155255.0,0.288629,-3.655393,-3.75857,0.290916,-0.555971,-0.940947,1.8822,-0.849359,-1.226717,...,0.49744,-0.315563,-0.984879,0.693082,0.160126,0.887279,-0.323956,0.091809,1040.0,0
159740,112878.0,-0.496533,0.045882,0.290437,-4.075579,0.671,0.01242,1.099471,-0.239286,1.88342,...,-0.29519,-0.482571,0.065452,-0.147445,-0.565586,-1.603645,-0.129572,-0.066546,60.0,0
243320,151885.0,-1.72918,0.642367,-2.857555,-0.461637,3.751187,3.129151,1.630456,0.13845,-0.52497,...,0.218651,1.366823,-0.63567,0.806121,0.089734,-0.394987,-1.272378,-0.907188,109.0,0
104486,69086.0,1.167896,-0.115484,0.682163,-0.175638,-0.623745,-0.342395,-0.347984,0.082311,0.061092,...,-0.032155,-0.081365,0.127616,0.284625,-0.00327,0.902508,-0.056768,-4.1e-05,9.99,0
183248,125714.0,-0.409334,-0.17496,1.009366,-2.611642,0.473814,0.545173,-0.065346,0.204629,-1.134263,...,0.41379,1.087772,-0.33473,-0.150029,0.119089,-0.180995,0.108043,0.126291,24.99,0


In [52]:
processed_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [53]:
# distribution of the legit and fraudulent transaction in new processed dataset
processed_dataset.Class.value_counts()

# NOW THE DATA IS BALANCED

0    492
1    492
Name: Class, dtype: int64

In [54]:
# compare the values for both transactions
processed_dataset.groupby('Class').mean()

# The nature of the dataset id not changed as the mean values are not very different.

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93153.737805,0.051978,0.040383,0.038317,-0.050475,0.064463,0.057485,0.028046,-0.034074,0.014045,...,-0.040448,0.038968,0.017131,0.014588,-0.02803,-0.021216,-0.016383,0.024358,-0.010567,75.865264
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [55]:
# seperating features and target
x = processed_dataset.drop(['Class'], axis=1) #features
y = processed_dataset['Class'] #target

#Note: axis = 0 represents row and axis = 1 reprsents column

In [56]:
# Split the data to training and testing set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state = 2)

### Model Training - Logistic Regression

In [57]:
# define the model
model = LogisticRegression()

# training the model
model.fit(x_train, y_train)

LogisticRegression()

### Evaluate the model

In [58]:
#Accuracy on train data
train_data_prediction = model.predict(x_train)
train_data_accuracy = accuracy_score(train_data_prediction, y_train)
print("Accuracy on train data: ",train_data_accuracy)

Accuracy on train data:  0.9301143583227446


In [59]:
#Accuracy on test data
test_data_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(test_data_prediction, y_test)
print("Accuracy on test data: ",test_data_accuracy)

Accuracy on test data:  0.934010152284264
