<a href="https://colab.research.google.com/github/RajeevBhadola/myrepo/blob/master/Credit_Card_Fraud_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd # data processing
import numpy as np # working with arrays
import matplotlib.pyplot as plt # visualization
from termcolor import colored as cl # text customization
import itertools # advanced tools

In [None]:
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.model_selection import train_test_split # data split
from sklearn.tree import DecisionTreeClassifier # Decision tree algorithm
from sklearn.neighbors import KNeighborsClassifier # KNN algorithm
from sklearn.linear_model import LogisticRegression # Logistic regression algorithm
from sklearn.svm import SVC # SVM algorithm
from sklearn.ensemble import RandomForestClassifier # Random forest tree algorithm
from xgboost import XGBClassifier # XGBoost algorithm

In [None]:
from sklearn.metrics import confusion_matrix # evaluation metric
from sklearn.metrics import accuracy_score # evaluation metric
from sklearn.metrics import f1_score # evaluation metric

In [None]:
# Ask the user to upload the dataset through Colab's file helper.
# NOTE(review): `google.colab` is only importable inside Google Colab, so this
# cell fails in a plain Jupyter kernel. The next cell reads 'creditcard.csv'
# from the working directory — presumably the upload is expected to place the
# file there; confirm against the Colab documentation.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the transactions CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')
df.head(n=5)

In [None]:
# Remove the 'Time' column, which is not used as a feature below.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after 'Time' is already gone no
# longer raises KeyError.
df = df.drop(columns='Time', errors='ignore')
df.head()

In [None]:
# Class balance: total rows, rows per class, and the fraud share in percent
# (rounded to two decimals).
cases = df.shape[0]
nonfraud_count = int((df['Class'] == 0).sum())
fraud_count = int((df['Class'] == 1).sum())
fraud_percentage = round(fraud_count / (nonfraud_count + fraud_count) * 100, 2)

In [None]:
# Print the class-balance summary in bold, one line per entry.
case_count_lines = [
    'CASE COUNT',
    '-----------------------------',
    "Number of Cases are {}".format(cases),
    "Number of Non Fraud Cases {}".format(nonfraud_count),
    "Number of Fraud Cases {}".format(fraud_count),
    "Percentage of Fraud Cases are {}".format(fraud_percentage),
]
for report_line in case_count_lines:
    print(cl(report_line, attrs=['bold']))

In [None]:
# Split the frame by class and print descriptive statistics of 'Amount'
# for each group, framed by bold headings and dividers.
nonfraud = df[df['Class'] == 0]
fraud = df[df['Class'] == 1]

stats_divider = '---------------------------'
print(cl('Case Count', attrs=['bold']))
print(cl(stats_divider, attrs=['bold']))
for stats_heading, class_rows in (
    ("Non Fraud Cases Amounts Stats", nonfraud),
    ("Fraud Cases Amounts Stats", fraud),
):
    print(cl(stats_heading, attrs=['bold']))
    print(class_rows['Amount'].describe())
print(cl(stats_divider, attrs=['bold']))

In [None]:
# Standardize the 'Amount' column with StandardScaler and show the first
# 15 scaled values.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split performed in a later cell — consider fitting on the
# training split only to avoid leaking test-set statistics.
sc = StandardScaler()
amount = df['Amount'].to_numpy()
# The scaler expects a 2-D array, so present the values as a single column.
amount_scaled = sc.fit_transform(amount[:, np.newaxis])
df['Amount'] = amount_scaled
print(cl(df['Amount'].head(15), attrs=['bold']))

In [None]:
# Build the feature matrix (every column except 'Class') and the label vector,
# then hold out 20% of the rows for testing.
x = df.drop('Class', axis = 1).values
y= df['Class'].values
# `stratify=y` keeps the fraud/non-fraud ratio the same in both splits. Fraud
# is a very small share of the rows, so an unstratified split can leave the
# test set with an unrepresentative number of fraud cases.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size =0.2, random_state = 0, stratify = y
)
# Show a small sample of each split as a sanity check.
print(cl('X_train samples : ', attrs =['bold']), X_train[:1])
print(cl('X_test samples : ', attrs =['bold']), X_test[0:20])
print(cl('y_train samples : ', attrs =['bold']), y_train[0:20])
print(cl('y_test samples : ', attrs =['bold']), y_test[0:20])

In [None]:
# Fit six classifiers on the training split and store each model's
# predictions for the test split in `<name>_yhat`.
# `random_state=0` (matching the train/test split) is set on the estimators
# that use randomness so the scores are reproducible across runs.

# Decision tree
tree_model = DecisionTreeClassifier(max_depth= 4, criterion='entropy', random_state= 0)
tree_model.fit(X_train, y_train)
tree_yhat = tree_model.predict(X_test)

# K-nearest neighbours; `n` is the neighbour count and is now actually used
# instead of a second hard-coded 5.
n=5
knn = KNeighborsClassifier(n_neighbors= n)
knn.fit(X_train, y_train)
knn_yhat = knn.predict(X_test)

# Logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

# Support vector classifier
svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

# Random forest
rf = RandomForestClassifier(max_depth = 4, random_state = 0)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

# XGBoost
xgb = XGBClassifier(max_depth= 4, random_state= 0)
xgb.fit(X_train, y_train)
xgb_yhat = xgb.predict(X_test)

In [None]:
# Report test-set accuracy for every fitted model. Each entry is
# (message template, predictions, text colour); a bold divider is printed
# before each model's line.
accuracy_rows = [
    ("Accuracy of decision tree model is : {}", tree_yhat, 'green'),
    ("Accuracy of Nearest Neighbour model is : {}", knn_yhat, 'blue'),
    ("Accuracy of Logistic Regression model is : {}", lr_yhat, 'red'),
    ("Accuracy of SVC model is : {}", svm_yhat, 'green'),
    ("Accuracy of Random Forest model is : {}", rf_yhat, 'blue'),
    ("Accuracy of xgb model is : {}", xgb_yhat, 'red'),
]

print(cl('ACCURACY SCORE', attrs=['bold']))
for message_template, predictions, text_colour in accuracy_rows:
    print(cl('---------------------------', attrs=['bold']))
    model_accuracy = accuracy_score(y_test, predictions)
    print(cl(message_template.format(model_accuracy), attrs=['bold'], color=text_colour))

In [None]:
from sklearn.metrics import confusion_matrix
# Compute and print the confusion matrix of every fitted model on the test
# split. The random forest matrix was previously missing even though the
# model is trained and scored alongside the others.
tree_matrix = confusion_matrix(y_test, tree_yhat)
print(tree_matrix)

knn_matrix = confusion_matrix(y_test, knn_yhat)
print(knn_matrix)

lr_matrix = confusion_matrix(y_test, lr_yhat)
print(lr_matrix)

svm_matrix = confusion_matrix(y_test, svm_yhat)
print(svm_matrix)

rf_matrix = confusion_matrix(y_test, rf_yhat)
print(rf_matrix)

xgb_matrix = confusion_matrix(y_test, xgb_yhat)
print(xgb_matrix)



In [None]:
import seaborn as sns

# Draw the decision-tree and KNN confusion matrices side by side.
# Each heatmap gets its own axes: calling sns.heatmap twice without `ax`
# paints both onto the same axes, so the second hides the first.
fig, (tree_ax, knn_ax) = plt.subplots(1, 2, figsize=(12, 5))

# fmt='d' prints the integer counts in full instead of the default compact
# format, which switches to scientific notation for large cells.
sns.heatmap(tree_matrix, annot=True, fmt='d', ax=tree_ax)
tree_ax.set(title='Decision Tree', xlabel='Predicted label', ylabel='True label')

sns.heatmap(knn_matrix, annot=True, fmt='d', ax=knn_ax)
knn_ax.set(title='K-Nearest Neighbours', xlabel='Predicted label', ylabel='True label')

fig.tight_layout()
plt.show()