CodeClause : Data Science Intern
Batch-September 2022
Name : Riham Essam
Project Name : Fraud Detection

In [None]:
#importing libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
import sklearn
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import make_scorer, accuracy_score
from sklearn import metrics
from sklearn.ensemble import  RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

# NOTE(review): this silences every warning category for the rest of the session,
# so any deprecation or convergence warning raised by later cells is hidden.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from a CSV file located in the working directory.
csv_path = "fraud-detection.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# List the available column names.
df.columns

In [None]:
# Class distribution: 0 = not fraudulent, 1 = fraudulent.
df['Class'].value_counts()

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows (the first occurrence is kept). Rebinding the name
# instead of mutating in place follows the pandas recommendation and keeps the
# statement chainable.
df = df.drop_duplicates()

In [None]:
# Sanity check after de-duplication: this count should now be 0.
df.duplicated().sum()

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Summary statistics, transposed so that each column is shown as a row.
df.describe().T

In [None]:
# Bar chart of the class balance. The column is passed by keyword because
# current seaborn releases do not treat a positional data vector as `x`.
sns.countplot(x=df['Class'])

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Same class-balance chart with custom styling. The column is passed by keyword
# because current seaborn releases do not treat a positional data vector as `x`.
sns.countplot(x=df['Class'], color="salmon", saturation=0.5)

In [None]:
# Class counts again, now computed on the de-duplicated frame.
df['Class'].value_counts()

In [None]:
# Pairwise correlation table of the columns.
df.corr()

In [None]:
# Feature scaling: rescale 'Amount' into the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test
# split further down — confirm this is acceptable for the evaluation.
stand = MinMaxScaler()
amount_scaled = stand.fit_transform(df[['Amount']])
df[['Amount']] = amount_scaled

df.head()

In [None]:
# Heatmap of the pairwise correlations between the columns.
fig, ax = plt.subplots(figsize=(12, 12))
hm = sns.heatmap(df.corr(), square=True, cmap="Greens", ax=ax)
hm.set(title="Correlation matrix of fraud detection data\n")

plt.show()

In [None]:
# The non-fraud class is much larger than the fraud class, so under-sample it:
# keep every fraud row and draw a fixed-size random sample of non-fraud rows.
frauded = df[df.Class == 1]
unfrauded = df[df.Class == 0]

# A fixed seed makes the sample, and every metric computed from it, repeatable.
# Capping n avoids a ValueError when fewer than 2000 non-fraud rows exist.
sampleOfUnfrauded = unfrauded.sample(n=min(2000, len(unfrauded)), random_state=10)
newdf = pd.concat([sampleOfUnfrauded, frauded], axis=0)
newdf

In [None]:
# Class counts after under-sampling; newdf is the frame the models below are applied to.
newdf['Class'].value_counts()

In [None]:
# Scatter of the scaled amount against the class label.
newdf.plot.scatter(x = "Amount", y = "Class", alpha = 0.5)

In [None]:
# Box plot of the columns of the sampled frame, drawn on one wide figure.
newdf.plot.box(figsize=(18,6))

In [None]:
# Andrews curves visualize high-dimensional data by mapping each observation to a function.
# The curves are colored by the 'Class' column.
pd.plotting.andrews_curves(newdf, 'Class', color = ["purple", "salmon"])

In [None]:
# newdf was built by stacking the non-fraud sample on top of the fraud rows, so
# shuffle the row order. Seeded so the preview below is the same on every run.
shuffled = shuffle(newdf, random_state=10)
print(shuffled.head())

In [None]:
# Andrews curves on a random 1000-row subset of the sampled frame.
# Seeded so the figure is identical on every run.
pd.plotting.andrews_curves(newdf.sample(n=1000, random_state=10), 'Class', colormap="Accent")

Models

In [None]:
# Separate features and target by column name, so the result does not depend on
# 'Class' happening to be the last column of the frame.
X = newdf.drop(columns='Class')
Y = newdf['Class']

In [None]:
# Report the dimensions of the feature matrix, then of the target vector.
for part in (X, Y):
    print(part.shape)

In [None]:
# Split the data into 75% train and 25% test. Stratifying on the label keeps
# the fraud ratio the same in both parts, which matters because the classes
# are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, shuffle=True, random_state=10, stratify=Y
)

In [None]:
# Collects one {'model', 'accuracy'} record per classifier for the final comparison.
scores = list()

In [None]:
# Random forest
# Seeded so the fitted forest, and therefore the reported metrics, are repeatable.
classifier = RandomForestClassifier(n_estimators=100, random_state=10)
# train the model
classifier.fit(X_train, y_train)
# predict with the test data
y_pred = classifier.predict(X_test)

# compute the accuracy once and reuse it for both the printout and the record
rf_accuracy = accuracy_score(y_test, y_pred)
print("random forest accuracy:", rf_accuracy)
print("random forest precision ", metrics.precision_score(y_test, y_pred))
print("random forest recall ", metrics.recall_score(y_test, y_pred))

# add this model's accuracy to the scores list
scores.append({
        'model': 'Random forest',
        'accuracy': rf_accuracy})

# Confusion matrix, drawn with the same display helper the other model cells use
# (rows are the actual classes, columns the predicted classes).
confusionMatrix = metrics.confusion_matrix(y_test, y_pred)
cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=[False, True])
cm.plot()

plt.show()

In [None]:
# Decision tree
# Seeded so the fitted tree, and therefore the reported metrics, are repeatable.
model = DecisionTreeClassifier(random_state=10)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# compute the accuracy once and reuse it for both the printout and the record
dt_accuracy = accuracy_score(y_test, predictions)
print("desicion tree accuracy:", dt_accuracy)
print("desicion tree precision: ", metrics.precision_score(y_test, predictions))
print("desicion tree recall: ", metrics.recall_score(y_test, predictions))

# add this model's accuracy to the scores list
scores.append({
        'model': 'Decision tree',
        'accuracy': dt_accuracy})

# Confusion matrix of the test predictions.
confusionMatrix = metrics.confusion_matrix(y_test, predictions)
cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=[False, True])
cm.plot()

plt.show()

In [None]:
# Logistic regression
# The default iteration cap (100) may be too low for the solver to converge on
# this data, and the notebook silences warnings globally, so a non-converged fit
# would go unnoticed. Raise the cap to give the solver room to finish.
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)
pred = log.predict(X_test)

# compute the accuracy once and reuse it for both the printout and the record
lr_accuracy = accuracy_score(y_test, pred)
print("logistic regression accuracy: ", lr_accuracy)
print("logistic regression precision ", metrics.precision_score(y_test, pred))
print("logistic regression recall ", metrics.recall_score(y_test, pred))

# add this model's accuracy to the scores list
scores.append({
        'model': 'logistic regression',
        'accuracy': lr_accuracy})

# Confusion matrix of the test predictions.
confusionMatrix = metrics.confusion_matrix(y_test, pred)
cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=[False, True])
cm.plot()

plt.show()

In [None]:
# SVM
# A grid search picks the best C / gamma for an RBF kernel.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# 25 parameter combinations are each cross-validated; n_jobs=-1 runs the fits
# on all available cores without changing which parameters are selected.
grid_SVM = GridSearchCV(SVC(), param_grid, n_jobs=-1)
grid_SVM.fit(X_train, y_train)
# predict() uses the estimator refitted with the best parameters found
pred = grid_SVM.predict(X_test)

# compute the accuracy once and reuse it for both the printout and the record
svm_accuracy = accuracy_score(y_test, pred)
print("SVM accuracy: ", svm_accuracy)
print("SVM precision ", metrics.precision_score(y_test, pred))
print("SVM recall ", metrics.recall_score(y_test, pred))
print("best parameters: ", grid_SVM.best_params_)

# add this model's accuracy to the scores list
scores.append({
        'model': 'SVM',
        'accuracy': svm_accuracy})

# Confusion matrix of the test predictions.
confusionMatrix = metrics.confusion_matrix(y_test, pred)
cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=[False, True])
cm.plot()

plt.show()

In [None]:
# Gaussian naive Bayes
GNB = GaussianNB()
GNB.fit(X_train, y_train)
pred = GNB.predict(X_test)

# evaluate on the held-out test set
gnb_accuracy = accuracy_score(y_test, pred)
gnb_precision = metrics.precision_score(y_test, pred)
gnb_recall = metrics.recall_score(y_test, pred)
print("GaussianNB accuracy: ", gnb_accuracy)
print("GaussianNB precision ", gnb_precision)
print("GaussianNB recall ", gnb_recall)

# add this model's accuracy to the scores list
gnb_record = {'model': 'GaussianNB', 'accuracy': gnb_accuracy}
scores.append(gnb_record)

# Confusion matrix of the test predictions.
confusionMatrix = metrics.confusion_matrix(y_test, pred)
cm = metrics.ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=[False, True])
cm.plot()

plt.show()

In [None]:
# Tabulate the collected accuracies, one row per model.
score_columns = ['model', 'accuracy']
df_score = pd.DataFrame(scores, columns=score_columns)
df_score

In [None]:
# Bar chart comparing the test accuracy of each model.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x="model", y="accuracy", data=df_score, ax=ax)
ax.set_ylim(0, 1)
plt.show()

# Conclusion: read the best and worst model from the scores of this run rather
# than hard-coding them. (In the author's run, Random forest had the highest
# accuracy and SVM the lowest.)
if not df_score.empty:
    best_row = df_score.loc[df_score['accuracy'].idxmax()]
    worst_row = df_score.loc[df_score['accuracy'].idxmin()]
    print("highest accuracy:", best_row['model'], "| lowest accuracy:", worst_row['model'])

In [None]:
# Classification report for the random forest. The original call reused y_pred
# from the random-forest cell several cells earlier; predicting again from the
# fitted `classifier` makes it explicit which model is reported, and the header
# line labels the output accordingly.
print("classification report (random forest):")
print(classification_report(y_test, classifier.predict(X_test)))