# Credit card fraud predictive system based on a trained model

### Import dependencies

In [3]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

https://www.youtube.com/watch?v=NCgjcHLFNDg&ab_channel=SiddhardhanSiddhardhan

In [5]:
# Read the credit-card transactions from creditcard.csv
# (the path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [6]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# Print each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [7]:
# Count the missing values in each column of the data frame
# (isna is the canonical name; isnull is an alias for it).
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [8]:
# Check the distribution of legit and fraudulent transactions.
# The dataset is highly unbalanced: well over 99% of the rows are legit.
# This is a problem for our ML model because it may not learn to recognise
# the fraudulent transactions.
# 0 represents legit
# 1 represents fraud
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [9]:
# Separate the rows by class so each group can be analysed on its own:
# Class == 0 -> legit transactions, Class == 1 -> fraudulent transactions.
legit = df.loc[df['Class'] == 0]
fraud = df.loc[df['Class'] == 1]

In [10]:
# Show (rows, columns) for the legit frame, then the fraud frame, one per line
print(legit.shape, fraud.shape, sep='\n')

(284315, 31)
(492, 31)


In [11]:
# Summary statistics of the transaction amount for legit transactions
legit['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [12]:
# Summary statistics of the transaction amount for fraudulent transactions
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

#### Groupby

Note that the groupby function has calculated the mean of all the columns and grouped them according to the two classes 0 and 1 

In [13]:
# Average every column separately for legit (0) and fraudulent (1)
# transactions so the two classes can be compared side by side.
df.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


#### The per-class means above differ clearly for many features, which suggests an ML algorithm has enough signal to tell the two classes apart


## Under-Sampling 


#### Because the data is unbalanced, we apply a technique called under-sampling: we build a sample dataset that contains an equal number of normal and fraudulent transactions
 
     

The number of Fraudulent transactions --> 492

In [14]:
# Under-sample the majority class: randomly draw as many legit transactions
# as there are fraudulent ones, so that joining the two frames gives an even
# sample of both classes.
# The sample size is taken from `fraud` instead of being hard-coded, and
# random_state pins the draw so Restart & Run All reproduces the same rows
# (and therefore the same model and accuracy scores).
legit_sample = legit.sample(n=len(fraud), random_state=2)

### Concatenation 
##### Now we Concatenate the two data frames

In [17]:
# Stack the two frames on top of each other. axis='index' is the same as
# axis=0 (row-wise); axis=1 would place them side by side as columns.
new_df = pd.concat(objs=[legit_sample, fraud], axis='index')
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
83472,59863.0,1.008995,-0.003185,0.336046,1.516047,-0.345745,-0.58666,0.349926,-0.23611,0.133956,...,-0.055749,-0.145907,-0.195176,0.432981,0.731987,-0.356518,0.010249,0.039538,111.93,0
99808,67292.0,-0.704882,1.392789,0.929001,0.104884,-0.240335,-1.009869,0.427845,0.312941,-0.670552,...,-0.206651,-0.646405,0.055496,0.345627,-0.108272,0.071876,0.121535,0.035601,13.99,0
206643,136299.0,-5.434203,-0.171769,-2.436557,5.132118,-2.078971,1.551315,-0.060757,2.117342,-2.137798,...,0.088735,-0.120353,-1.293509,0.500618,-0.449118,0.26003,0.502811,-0.310661,295.09,0
267362,162742.0,-0.580996,0.752792,2.058373,-0.546555,0.07563,0.08811,0.472456,0.078355,0.280372,...,-0.19534,-0.302636,-0.362136,-0.411728,0.323544,-0.667471,0.181384,-0.094765,1.0,0
49952,44278.0,0.842261,-0.537864,1.194348,0.773702,-0.460228,1.583202,-0.830344,0.60565,0.805995,...,0.095858,0.491654,0.134035,-0.570319,-0.111249,0.415909,0.079341,0.028686,76.19,0


In [18]:
# Preview the last five rows of the combined frame
new_df.tail(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [19]:
# Count the rows per class in the under-sampled frame
new_df.loc[:, 'Class'].value_counts()

1    492
0    492
Name: Class, dtype: int64

In [20]:
 # Per-class column means of the under-sampled frame
 new_df.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93809.910569,-0.059495,-0.039173,0.005784,-0.059895,0.033138,-0.035779,0.024579,0.019751,-0.061992,...,-0.026035,-0.008644,0.007927,0.052776,0.025971,-0.016265,-0.025424,0.002158,-0.008466,82.567785
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


The per-class means of the sample are close to those of the full dataset, so the nature of the data has not changed much and this is a good sample. If the values were very different, it would be a bad sample.

### Split the data into Features and Targets

Features = all columns except Class (Time, V1–V28 and Amount)
Target = Class (0 = legit, 1 = fraud)


In [21]:
# Features: every column except the label. Target: the Class label itself.
# (columns= already selects the column axis, so no axis argument is needed.)
X = new_df.drop(columns=['Class'])
Y = new_df['Class']

### Split the data into training data and testing data

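We create 4 variables (arrays): X_train, X_test, Y_train and Y_test.
We split the data randomly into training data and testing data.
test_size is the fraction of the dataset held out for testing. In our case it is 20% (0.2).

stratify = Y means the split keeps the same proportion of each class (0 and 1) in both the training and the testing data.

random_state is the seed for the random shuffling; using the same value gives the same split every time the notebook is run.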

In [22]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class ratio the
# same in both splits, and random_state=2 fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [23]:
# Shapes of the full feature frame, the training split and the test split
print(*(part.shape for part in (X, X_train, X_test)))

(984, 30) (787, 30) (197, 30)


### Model Training 

We will be using the Logistic Regression model which is used for binary classification problems 

In [25]:
# Create an instance of the LogisticRegression model with its default
# settings and store it in the variable called model.
# NOTE(review): the features are not scaled before training — confirm the
# default solver converges (no ConvergenceWarning) on this data.
model = LogisticRegression()

Now we train the Logistic Regression model with the training data

In [26]:
# The fit function fits our training data (features and labels) to the ML model
model.fit(X_train, Y_train)

LogisticRegression()

### Model Evaluation

#### Accuracy Score

First we find the accuracy score on the training data

Our model has now learned from the training data (model.fit).

We give only the X_train values to the model and it
tries to predict the class (0 or 1) for each row of X_train.

Once this is done we compare the predictions with the true values
stored in Y_train.

In [30]:
# Accuracy on training data

# Predict the class (0 or 1) for every row the model was trained on.
X_train_prediction = model.predict(X_train)

# Fraction of predictions that match the true labels. accuracy_score is
# documented as accuracy_score(y_true, y_pred), so the true labels go first.
# The value is the same either way for accuracy, but the order matters for
# asymmetric metrics (precision, recall, ...) that might be swapped in later.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [31]:
# Report the accuracy measured on the training split
print('Accuracy on training data:',training_data_accuracy )

Accuracy on training data: 0.9415501905972046


This means the accuracy score is about 94.2%, so most of our model's predictions on the training data are correct. As a rough rule of thumb, an accuracy score above 70% means the predictions are good.

In other words, out of every 100 training transactions the model classifies about 94 correctly.

### Evaluation on Test data



In [34]:
# Predict the class for the held-out test rows, which the model has not seen.
X_test_prediction = model.predict(X_test)

# Compare with the true labels, passed in the documented
# accuracy_score(y_true, y_pred) order.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [36]:
# Report the accuracy measured on the held-out test split
print('Accuracy score on Test data:',test_data_accuracy)

Accuracy score on Test data: 0.9289340101522843


If the accuracy score on the training data is very different from the accuracy score on the test data, the model is probably underfitted or overfitted. Here the two scores are close (about 94% vs. 93%), so there is no sign of either.