In [None]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm, skew
from subprocess import check_output
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# Keep a handle on the active seaborn palette, then switch to the dark grid theme.
color = sns.color_palette()
sns.set_style(style='darkgrid')

# Show every column when a DataFrame is displayed (no truncation).
# Uses the public pd.set_option rather than the accidental pd.pandas attribute.
pd.set_option('display.max_columns', None)
import pandas.util.testing as tm

from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

In [None]:
# Load the transactions workbook (.xlsb) into a DataFrame using the pyxlsb engine.
data = pd.read_excel(io='creditcard.xlsb', engine='pyxlsb')

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
#data has zero missing values

#### EDA

In [None]:
# Compare how many transactions are normal (Class == 0) versus fraudulent (Class == 1).
print("Distribuition of Normal(0) and Frauds(1): ")
print(data["Class"].value_counts())
plt.figure(figsize=(7,5))
# Pass the column by keyword: recent seaborn releases bind the first positional
# argument to `data`, so a bare positional Series is no longer used as the x variable.
sns.countplot(x=data["Class"], palette='ch:.25')
plt.title("Class Count", fontsize=18)
plt.xlabel("Is fraud?", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

#### Time Features and some Feature Engineering

As our Time feature is in seconds, we will transform it into minute-of-hour and hour-of-day components to get a better understanding of the patterns.

In [None]:
# Interpret the 'Time' column as elapsed seconds, then keep the minutes and hours
# components of each timedelta as integer feature columns.
timedelta = pd.to_timedelta(data['Time'], unit='s')
data['Time_min'] = timedelta.dt.components['minutes'].astype(int)
data['Time_hr'] = timedelta.dt.components['hours'].astype(int)

In [None]:
# Overlay the minute distribution of normal and fraudulent transactions.
plt.figure(figsize = (12,6))

# Each curve gets a label so that plt.legend() below has entries to show;
# without labels the legend is empty.
sns.distplot(data[data["Class"] == 0]['Time_min'], color = 'g', label = 'Normal')
sns.distplot(data[data["Class"] == 1]['Time_min'], color = 'r', label = 'Fraud')
plt.title('Fraud(red) vs normal transaction(green)) by minutes', fontsize = 17)
plt.legend()

# Pad the x-axis slightly beyond the 0-59 minute range.
plt.xlim(-1,61)
plt.show()

In [None]:
# Overlay the hour distribution of normal (green) and fraudulent (red) transactions.
plt.figure(figsize=(12, 5))

sns.distplot(data.loc[data["Class"] == 0, 'Time_hr'], color='g')
sns.distplot(data.loc[data["Class"] == 1, 'Time_hr'], color='r')
plt.title("Fraud(red) vs normal transaction(green)) by hour")

# Pad the x-axis slightly beyond the 0-23 hour range.
plt.xlim(left=-1, right=25)
plt.show()

We can see that fraudulent transactions do not depend much on the time period.

In [None]:
# Pairwise correlations between all columns, drawn as a square-celled heatmap
# with the colour scale capped at 0.8.
fig = plt.figure(figsize=(12, 9))
correlation_matrix = data.corr()
sns.heatmap(data=correlation_matrix, vmax=0.8, square=True)
plt.show()

The correlation matrix shows that the PCA components V1 to V28 are essentially uncorrelated with one another.

In [None]:
# Split the transactions by label.
Fraud = data[data['Class']==1]
Valid = data[data['Class']==0]
# Share of fraudulent rows among ALL rows. This value is passed to the detectors as
# `contamination`, which is defined as the proportion of outliers in the data set,
# so the denominator is the total row count rather than the number of valid rows.
outlier_fraction = len(Fraud)/float(len(data))

In [None]:
# Feature matrix: every column except the label; target: the label column.
Y = data["Class"]
X = data.drop(columns=["Class"])

# Seeded random generator, shared with the detectors defined later.
state = np.random.RandomState(42)
# Uniform [0, 1) samples with the same shape as X.
X_outliers = state.uniform(low=0, high=1, size=X.shape)

The algorithms we are going to use for anomaly detection on this dataset are as follows:

1. Isolation Forest
2. Local Outlier Factor
3. One-Class Support Vector Machine (SVM)

In [None]:
# Outlier detectors to compare, keyed by display name.
# Isolation Forest and Local Outlier Factor use the fraud share of the data set
# (outlier_fraction) as their expected contamination.
classifiers = {
    "Isolation Forest":IsolationForest(n_estimators=100, max_samples=len(X), 
                                       contamination=outlier_fraction,random_state=state, verbose=0),
    "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto', 
                                              leaf_size=30, metric='minkowski',
                                              p=2, metric_params=None, contamination=outlier_fraction),
    # OneClassSVM does not accept a `random_state` argument in current scikit-learn
    # releases, so passing one raises a TypeError; it is omitted here.
    "Support Vector Machine":OneClassSVM(kernel='rbf', degree=3, gamma=0.1,nu=0.05, 
                                         max_iter=-1)
   
}

In [None]:
# Fit every detector on the full feature matrix and score it against the true labels.
n_outliers = len(Fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    if clf_name != "Local Outlier Factor":
        # Fit first, then read the scores and the predicted tags from the fitted model.
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)
    else:
        # This model is fitted and tagged in one call; its scores come from the fitted attribute.
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    # Remap the tags: 1 becomes 0 (Valid) and -1 becomes 1 (Fraud).
    y_pred = np.where(y_pred == -1, 1, 0)
    n_errors = (Y != y_pred).sum()
    # Report the error count, accuracy and per-class metrics.
    print(f"{clf_name}: {n_errors}")
    print("Accuracy Score :")
    print(accuracy_score(Y, y_pred))
    print("Classification Report :")
    print(classification_report(Y, y_pred))