## Import libraries

In [4]:
import os
import numpy as np
import pandas as pd


## Loading Datasets

In [5]:
# Location of the credit-card transactions CSV, relative to the notebook.
path = "../input/creditcardfraud/creditcard.csv"

In [6]:
# Read the CSV at `path` into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=5)

### Data preprocessing

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt


In [8]:
# Row count for each value of the `Class` column.
data["Class"].value_counts()

In [9]:
# Bar chart of the row count per `Class` value.
# The column is passed by keyword: newer seaborn releases no longer accept the
# plotted variable as the first positional argument.
sns.countplot(x="Class", data=data);

In [10]:
# Drop the `Time` column. Rebinding `data` (instead of `inplace=True`) together
# with `errors="ignore"` lets this cell be re-run without raising a KeyError
# once the column is already gone.
data = data.drop(columns=["Time"], errors="ignore")

In [11]:
# Column labels remaining in `data` after the drop above.
data.columns

In [12]:
# Per-column summary statistics of `data`.
data.describe()

In [13]:
# Print the column dtypes and non-null counts of `data`.
data.info()

In [14]:
from sklearn.utils import resample

In [15]:
# Class counts again, right before the two classes are separated.
data["Class"].value_counts()

In [16]:
# Separate the rows by label: `Class == 1` -> df_min, `Class == 0` -> df_max.
df_min = data.loc[data["Class"].eq(1)]
df_max = data.loc[data["Class"].eq(0)]

In [17]:
# Row counts of the two subsets, displayed as (max_value, min_value).
max_value = df_max.shape[0]
min_value = df_min.shape[0]
(max_value, min_value)

In [18]:
# Sample df_min with replacement until it has as many rows as df_max, then
# stack the two subsets into one frame with equal class counts.
# NOTE(review): `df` holds repeated copies of the same df_min rows, so any
# train/test split of `df` itself can place copies of one row on both sides.
df_min_upsample = resample(
    df_min,
    replace=True,
    n_samples=max_value,
    random_state=42,
)
df = pd.concat(objs=[df_min_upsample, df_max])

In [19]:
# First five rows of the combined frame.
df.head()

In [20]:
# Row count per `Class` value in the combined frame.
df["Class"].value_counts()

In [21]:
# Bar chart of the row count per `Class` value in the combined frame.
# The column is passed by keyword: newer seaborn releases no longer accept the
# plotted variable as the first positional argument.
sns.countplot(x="Class", data=df);

In [22]:
# Features are every column except the last; the target is the last column.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
(y.shape, x.shape)

## Train test split

In [23]:
from sklearn.model_selection import train_test_split


In [24]:
# Split the original, imbalanced `data` FIRST and balance only the training
# part. Splitting a frame that was already up-sampled puts copies of the same
# `Class == 1` row into both partitions, so test scores would be measured on
# rows the model has effectively seen during training.
features = data.iloc[:, :-1]   # every column except the last
labels = data.iloc[:, -1]      # last column is the target
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, random_state=42
)

# Up-sample the minority class (label 1) inside the training partition only,
# to the size of the training majority class (label 0).
minority_mask = y_train == 1
majority_mask = y_train == 0
x_minority_up, y_minority_up = resample(
    x_train[minority_mask],
    y_train[minority_mask],
    replace=True,
    n_samples=int(majority_mask.sum()),
    random_state=42,
)
x_train = pd.concat([x_minority_up, x_train[majority_mask]])
y_train = pd.concat([y_minority_up, y_train[majority_mask]])

In [25]:
# Label counts of the training and test targets, shown side by side.
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
(train_counts, test_counts)

### Data Normalization

In [26]:
from sklearn.preprocessing import StandardScaler
# Scaler instance with default settings; it is fitted in the next cell.
ss = StandardScaler()

In [27]:
# Learn the scaling statistics from the training features only, then apply
# that same transformation to both partitions.
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

## Applying ML Algorithms

## XGBoost

In [28]:
from xgboost import XGBClassifier
# XGBoost classifier created with the library's default parameters.
xgbc = XGBClassifier()

In [30]:
# Train the XGBoost classifier on the scaled training partition.
xgbc.fit(x_train, y_train)

In [32]:
# Score of the XGBoost classifier on the data it was trained on.
xgbc.score(x_train, y_train)


In [33]:
# Score of the XGBoost classifier on the held-out test partition.
xgbc.score(x_test, y_test)


## Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters. The forest is stochastic, so
# the seed is fixed (same value the resampling step uses) to make the scores
# below reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=42)

In [35]:
# Train the random forest on the scaled training partition.
rfc.fit(x_train, y_train)

In [36]:
# Score of the random forest on the data it was trained on.
rfc.score(x_train, y_train)

In [37]:
# Score of the random forest on the held-out test partition.
rfc.score(x_test, y_test)

## Classification report and Confusion matrix

In [38]:
from sklearn.metrics import classification_report, confusion_matrix

## XGBC

In [39]:
# Predicted class labels of the XGBoost classifier for the test partition;
# reused by the confusion-matrix, report and metric cells.
xgbc_y_pred = xgbc.predict(x_test)


In [40]:
# Confusion matrix of true test labels versus XGBoost predictions.
confusion_matrix(y_test, xgbc_y_pred)

In [41]:
# Display names passed as `target_names` to the classification reports.
target_class = ["not_fraud", "fraud"]

In [42]:
# Text report of the per-class metrics for the XGBoost predictions.
xgbc_report = classification_report(
    y_test,
    xgbc_y_pred,
    target_names=target_class,
)
print(xgbc_report)

## RFC

In [43]:
# Predicted class labels of the random forest for the test partition;
# reused by the confusion-matrix, report and metric cells.
rfc_y_pred = rfc.predict(x_test)

In [44]:
# Confusion matrix of true test labels versus random-forest predictions.
confusion_matrix(y_test, rfc_y_pred)

In [45]:
# Text report of the per-class metrics for the random-forest predictions.
rfc_report = classification_report(
    y_test,
    rfc_y_pred,
    target_names=target_class,
)
print(rfc_report)

## Precision, Recall, F1 score and log loss

In [46]:
from sklearn.metrics import precision_score,recall_score,f1_score,log_loss

## XGBC

In [47]:
# Precision of the XGBoost test predictions.
precision_score(y_test, xgbc_y_pred)

In [48]:
# Recall of the XGBoost test predictions.
recall_score(y_test, xgbc_y_pred)

In [49]:
# F1 score of the XGBoost test predictions.
f1_score(y_test, xgbc_y_pred)

In [50]:
# Log loss of the XGBoost classifier on the test partition.
# log_loss is defined on predicted probabilities; the hard 0/1 labels in
# `xgbc_y_pred` would be treated as (clipped) certainties and give a
# meaningless value, so the class probabilities are used instead.
log_loss(y_test, xgbc.predict_proba(x_test))

## RFC

In [51]:
# Precision of the random-forest test predictions.
precision_score(y_test, rfc_y_pred)

In [52]:
# Recall of the random-forest test predictions.
recall_score(y_test, rfc_y_pred)

In [53]:
# F1 score of the random-forest test predictions.
f1_score(y_test, rfc_y_pred)

In [54]:
# Log loss of the random forest on the test partition.
# log_loss is defined on predicted probabilities; the hard 0/1 labels in
# `rfc_y_pred` would be treated as (clipped) certainties and give a
# meaningless value, so the class probabilities are used instead.
log_loss(y_test, rfc.predict_proba(x_test))

## AUC and ROC curve

In [55]:
from sklearn.metrics import roc_curve,roc_auc_score

## XGBC

In [56]:
# ROC curve (fpr, tpr, thresholds) of the XGBoost classifier.
# The curve is traced by sweeping a threshold over continuous scores; hard
# 0/1 predictions collapse it to a single operating point, so the predicted
# probability of class 1 is used as the score.
xgbc_rc = roc_curve(y_test, xgbc.predict_proba(x_test)[:, 1])

## RFC

In [57]:
# ROC curve (fpr, tpr, thresholds) of the random forest.
# The curve is traced by sweeping a threshold over continuous scores; hard
# 0/1 predictions collapse it to a single operating point, so the predicted
# probability of class 1 is used as the score.
rfc_rc = roc_curve(y_test, rfc.predict_proba(x_test)[:, 1])

In [58]:
# A constant score of 0 for every test row (despite the name, nothing here is
# random); its ROC curve serves as the reference line in the plots.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs)

In [61]:
# ROC curve of the XGBoost classifier drawn against the constant-score
# reference line, using the explicit Axes interface.
fig, ax = plt.subplots()
xgbc_fpr, xgbc_tpr = xgbc_rc[0], xgbc_rc[1]
ax.plot(xgbc_fpr, xgbc_tpr, linestyle='--', color='black', label='XGBClassifier')
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')
ax.set_title('ROC Curve')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='best')
plt.show()

In [62]:
# ROC curve of the random forest drawn against the constant-score reference
# line, using the explicit Axes interface.
fig, ax = plt.subplots()
rfc_fpr, rfc_tpr = rfc_rc[0], rfc_rc[1]
ax.plot(rfc_fpr, rfc_tpr, linestyle='--', color='yellow', label='RandomForestClassifier')
ax.plot(p_fpr, p_tpr, linestyle='--', color='blue')
ax.set_title('ROC Curve')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='best')
plt.show()