In [None]:

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


### Task 1: Loading libraries and dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the credit-card transactions CSV from the Kaggle input directory.
csv_path = "/kaggle/input/creditcardfraud/creditcard.csv"
data = pd.read_csv(csv_path)

### Task 2: Exploring the data

In [None]:
# Dimensions of the loaded frame, shown as (rows, columns).

data.shape

In [None]:
# Preview the first few rows to see the column names and typical values.

data.head()

In [None]:
# Column summary from DataFrame.info(); the per-column non-null counts it
# prints are what reveal whether any values are missing.

data.info()

In [None]:
# Number of rows for each value of the target column 'Class'.
class_counts = data['Class'].value_counts()
class_counts

In [None]:
# Bar chart of the number of rows per target class.
sns.countplot(data=data, x='Class')

This shows that the dataset is highly imbalanced.

In [None]:
# Scatter plots of selected component pairs, coloured by class, to look for
# any visible pattern separating the two classes.
# One (x, y) column pair per panel, in row-major order of the 2x2 grid.
component_pairs = [('V1', 'V2'), ('V3', 'V4'), ('V1', 'V3'), ('V2', 'V4')]

fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, (x_col, y_col) in zip(axes.flat, component_pairs):
    sns.scatterplot(x=x_col, y=y_col, data=data, hue='Class', ax=ax)

# Render explicitly so the cell does not echo the repr of the last Axes.
plt.show()

In [None]:
# Second grid of class-coloured scatter plots over further component pairs.
# One (x, y) column pair per panel, in row-major order of the 2x2 grid.
component_pairs = [('V4', 'V6'), ('V5', 'V7'), ('V10', 'V13'), ('V9', 'V12')]

fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, (x_col, y_col) in zip(axes.flat, component_pairs):
    sns.scatterplot(x=x_col, y=y_col, data=data, hue='Class', ax=ax)

# Render explicitly so the cell does not echo the repr of the last Axes.
plt.show()

### Task 3: Selecting evaluation metrics

In [None]:
# Share of rows in each class, expressed as a percentage with two decimals.
class_share = data['Class'].value_counts(normalize=True)
(class_share * 100).round(2)

This dataset is highly imbalanced, so accuracy is a misleading evaluation metric: a model will always show an accuracy of around 99%, simply because most of the data belongs to class '0'. We therefore need to consider other evaluation metrics, namely precision and recall.

In this problem, recall is more important than precision (think about why!).


### Task 4: Creating Baseline model

In [None]:
# Keep only the PCA components and the target: drop the 'Time' and 'Amount' columns.
# errors='ignore' keeps this cell safe to re-run: after the first run the
# columns are already gone, and a plain drop would raise KeyError.
drop_cols = ['Time', 'Amount']
data = data.drop(columns=drop_cols, errors='ignore')

In [None]:
# Separate the label column from the predictors.
target = data['Class']
features = data.drop(columns=['Class'])

In [None]:
# Split the dataset into a 70% training set and a 30% test set.

from sklearn.model_selection import train_test_split

# Stratify on the target so both splits keep the same class ratio; with so few
# rows in the minority class, an unstratified random split can leave the test
# set with a noticeably different class balance.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42, stratify=target
)

In [None]:
# Baseline model: a random forest fitted on the original training split.

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42, verbose=1).fit(x_train, y_train)
# fit() returns the estimator itself, so echoing `clf` shows the same output.
clf

In [None]:
# Class predictions of the baseline forest for every row of the test set.
y_predict = clf.predict(x_test)

In [None]:
# Heatmap of the baseline model's confusion matrix on the test set.

from sklearn.metrics import confusion_matrix

baseline_cm = confusion_matrix(y_test, y_predict)
sns.heatmap(baseline_cm, annot=True, fmt='0.0f')

In [None]:
# Per-class precision / recall / F1 of the baseline model on the test set.

from sklearn.metrics import classification_report

baseline_report = classification_report(y_test, y_predict)
print(baseline_report)

### Task 5: Resampling techniques for imbalanced datasets

Two methods: undersampling and oversampling.

**Undersampling:** Samples are drawn from the majority class until their number equals the number of data points in the minority class.

**Oversampling:** 

Minority oversampling: Samples of the minority class are duplicated until the number of data points in the minority class equals that of the majority class.

SMOTE (Synthetic Minority Oversampling Technique): Here we create new observations for the minority class based on those that already exist. It randomly picks a point from the minority class and finds the k nearest neighbours of this point. The synthetic points are then added between the chosen point and its neighbours.

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
resampled = smote.fit_resample(x_train, y_train)
x_smote, y_smote = resampled

In [None]:
# Second random forest, fitted on the SMOTE-resampled training data.
clf_smote = RandomForestClassifier(random_state=42, verbose=1).fit(x_smote, y_smote)
# fit() returns the estimator itself, so echoing `clf_smote` shows the same output.
clf_smote

In [None]:
# Evaluate the SMOTE-trained forest on the untouched test set.
y_predict_smote = clf_smote.predict(x_test)

# Confusion matrix as an annotated heatmap.
smote_cm = confusion_matrix(y_test, y_predict_smote)
sns.heatmap(smote_cm, annot=True, fmt='0.0f')

# Per-class precision / recall / F1.
smote_report = classification_report(y_test, y_predict_smote)
print(smote_report)

### Task 6: Plot the ROC curve and compute the AUC

In [None]:
!pip install plot_metric==0.0.6

In [None]:
# Plot the ROC curve of the SMOTE-trained model on the test set.

from plot_metric.functions import BinaryClassification

# Second column of predict_proba: the score for class 1.
smote_scores = clf_smote.predict_proba(x_test)[:, 1]
bc = BinaryClassification(y_test, smote_scores, labels=[0, 1])

plt.figure(figsize=(16, 10))
bc.plot_roc_curve()
plt.show()

In [None]:
# Same ROC plot, this time with the decision threshold set to 0.03.
smote_scores = clf_smote.predict_proba(x_test)[:, 1]
bc = BinaryClassification(y_test, smote_scores, labels=[0, 1], threshold=0.03)

plt.figure(figsize=(16, 10))
bc.plot_roc_curve()
plt.show()

### Task 7: Adjusting probability threshold

In [None]:
# Score for class 1 (second column of predict_proba) for every test row.
y_predict_prob = clf_smote.predict_proba(x_test)[:, 1]

# Flag a row as class 1 whenever its score exceeds the cut-off.
# NOTE(review): 0.03 matches the threshold passed to the ROC plot; it looks
# like it was chosen from test-set results - confirm, and consider tuning it
# on a separate validation split instead.
cutoff = 0.03
y_pred_labels = y_predict_prob > cutoff

In [None]:
# Confusion matrix of the thresholded predictions, drawn as an annotated heatmap.
threshold_cm = confusion_matrix(y_test, y_pred_labels)
sns.heatmap(threshold_cm, annot=True, fmt='0.0f')



In [None]:
# Per-class precision / recall / F1 for the thresholded predictions.
threshold_report = classification_report(y_test, y_pred_labels)
print(threshold_report)