In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Statistical Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('creditcard.csv')

# EDA

In [3]:
df.shape

(284807, 31)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

### Its clear that the class column is categorical data transformed, Amount another important column, time doesn't seem to have any value here considering the data, all the other float variables are very important

In [5]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
df.drop(['Time'], axis=1, inplace=True)

In [7]:
df.shape

(284807, 30)

In [8]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
V1,284807.0,1.168375e-15,1.958696,-56.40751,-0.920373,0.018109,1.315642,2.45493
V2,284807.0,3.416908e-16,1.651309,-72.715728,-0.59855,0.065486,0.803724,22.057729
V3,284807.0,-1.379537e-15,1.516255,-48.325589,-0.890365,0.179846,1.027196,9.382558
V4,284807.0,2.074095e-15,1.415869,-5.683171,-0.84864,-0.019847,0.743341,16.875344
V5,284807.0,9.604066e-16,1.380247,-113.743307,-0.691597,-0.054336,0.611926,34.801666
V6,284807.0,1.487313e-15,1.332271,-26.160506,-0.768296,-0.274187,0.398565,73.301626
V7,284807.0,-5.556467e-16,1.237094,-43.557242,-0.554076,0.040103,0.570436,120.589494
V8,284807.0,1.213481e-16,1.194353,-73.216718,-0.20863,0.022358,0.327346,20.007208
V9,284807.0,-2.406331e-15,1.098632,-13.434066,-0.643098,-0.051429,0.597139,15.594995
V10,284807.0,2.239053e-15,1.08885,-24.588262,-0.535426,-0.092917,0.453923,23.745136


In [9]:
total_transactions = len(df)
normal = len(df[df.Class == 0])
fraud = len(df[df.Class == 1])
fraud_percentage = round(fraud/normal*100,2)
print('The total number of transactions in the dataset is', total_transactions)
print('The total number of normal transactions in the dataset is', normal)
print('The total number of fraud transactions in the dataset is', fraud)
print('The % of fraud transaction is', fraud_percentage)

The total number of transactions in the dataset is 284807
The total number of normal transactions in the dataset is 284315
The total number of fraud transactions in the dataset is 492
The % of fraud transaction is 0.17


In [10]:
min(df.Amount), max(df.Amount)

(0.0, 25691.16)

### I have observed in our data 28 features are transformed versions of PCA but the Amount is the original one. And, while checking the minimum and maximum is in the amount — I found the difference is huge that can deviate our result. Lets see if there are any duplicates, if yes then drop it and then scale the amount variable. 


In [11]:
df.drop_duplicates(inplace=True)

In [12]:
df.shape

(275663, 30)

In [13]:
min(df.Amount), max(df.Amount)

(0.0, 25691.16)

In [14]:
sc = StandardScaler()
amount = df['Amount'].values
df['Amount'] = sc.fit_transform(amount.reshape(-1,1))

# Algorithms

### Lets first split the data

In [15]:
X = df.drop('Class', axis=1).values
Y = df['Class'].values

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25, random_state=1)

## Logistic Regression

In [17]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
lr_predict = lr.predict(X_test)

In [18]:
print('The accuracy score of the Logistic Regression model is', accuracy_score(Y_test, lr_predict));
print('The f1 score of the Logistic Regression model is {}'.format(f1_score(Y_test, lr_predict)))

The accuracy score of the Logistic Regression model is 0.9989552498694062
The f1 score of the Logistic Regression model is 0.6666666666666666


## K Nearest Neighbour

In [19]:
n = 5
knn = KNeighborsClassifier(n_neighbors = n)
knn.fit(X_train, Y_train)
knn_predict = knn.predict(X_test)

In [20]:
print('The accuracy score of the KNN model is', accuracy_score(Y_test, knn_predict));
print('The f1 score of the KNN model is {}'.format(f1_score(Y_test, knn_predict)))

The accuracy score of the KNN model is 0.9993325207498984
The f1 score of the KNN model is 0.8034188034188035


## Support Vector Machine

In [21]:
svm = SVC()
svm.fit(X_train, Y_train)
svm_predict = svm.predict(X_test)

In [22]:
print('The accuracy score of the SVM model is', accuracy_score(Y_test, svm_predict));
print('The f1 score of the SVM model is {}'.format(f1_score(Y_test, svm_predict)))

The accuracy score of the SVM model is 0.999318010331418
The f1 score of the SVM model is 0.7813953488372093


## XGBoost

In [23]:
xgb = XGBClassifier(max_depth = 4)
xgb.fit(X_train, Y_train)
xgb_predict = xgb.predict(X_test)

In [24]:
print('The accuracy score of the XGB model is', accuracy_score(Y_test, xgb_predict));
print('The f1 score of the XGB model is {}'.format(f1_score(Y_test, xgb_predict)))

The accuracy score of the XGB model is 0.999506645771664
The f1 score of the XGB model is 0.8495575221238937


# Its clear that the XG Boost algorithm outperformed with 99.95% accuracy in detecting the fraudulent transactions