# Credit Card Fraud Detection

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Load the transactions table from "creditcard.csv" (path is relative to the
# notebook's working directory).
df = pd.read_csv("creditcard.csv")
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Visualization + Preprocess

In [None]:
# Row count per value of the 'Class' label (the cells below treat Class == 1
# as fraud and Class == 0 as non-fraud).
df['Class'].value_counts()

In [None]:
# Bar chart of the number of rows per class label.
# The column is passed by keyword: seaborn deprecated positional data vectors
# (0.11) and its first positional parameter is now `data`, so a Series passed
# positionally is no longer interpreted as the x variable.
sns.countplot(x='Class', data=df, label='Count')

In [None]:
# Side-by-side distributions of the 'Amount' and 'Time' columns.
fig, ax = plt.subplots(1, 2, figsize=(18, 4))

amount_val = df['Amount'].values
time_val = df['Time'].values

# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# documented replacement (density-normalised histogram plus a KDE curve).
# A fixed, modest bin count keeps the histogram cheap to draw on a large
# column whose automatic binning could otherwise produce very many bars.
sns.histplot(amount_val, ax=ax[0], color='r', kde=True, stat='density', bins=50)
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
# ndarray.min()/max() are vectorised; the builtins iterate element by element.
ax[0].set_xlim([amount_val.min(), amount_val.max()])

sns.histplot(time_val, ax=ax[1], color='b', kde=True, stat='density', bins=50)
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([time_val.min(), time_val.max()])

plt.show()

In [None]:
# Rescale 'Amount' and 'Time' in place with RobustScaler.
# A single fit over both columns replaces two successive fits of the same
# object: RobustScaler scales each column independently, so the transformed
# values are the same, but `rs` now retains the fitted statistics for both
# features instead of having the 'Amount' fit overwritten by the 'Time' fit.
rs = RobustScaler()

scaled_cols = ['Amount', 'Time']
df[scaled_cols] = rs.fit_transform(df[scaled_cols])

In [None]:
# Build a class-balanced subsample by randomly undersampling the non-fraud rows.

# Shuffle first so the non-fraud rows kept below are a random subset.
# The shuffle is seeded so that re-running the notebook selects the same rows
# (the second shuffle below was already seeded).
df = df.sample(frac=1, random_state=42)

fraud_df = df.loc[df['Class'] == 1]
# Keep exactly as many non-fraud rows as there are fraud rows, instead of
# hard-coding the fraud count of one particular copy of the dataset.
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]

new_df = pd.concat([fraud_df, non_fraud_df])
# Shuffle again so the two classes are interleaved.
new_df = new_df.sample(frac=1, random_state=42)

new_df.head()

In [None]:
# Check that the subsample is balanced.
# A notebook cell only displays its final expression, so the counts are
# printed explicitly; otherwise they are computed and discarded.
print(new_df['Class'].value_counts())
# Column passed by keyword: recent seaborn releases no longer interpret a
# positional Series as the x variable.
sns.countplot(x='Class', data=new_df, label='Count')

In [None]:
# Heatmap of the pairwise column correlations over every row of `df`.
full_corr = df.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(full_corr, cmap='coolwarm_r')

In [None]:
# Heatmap of the pairwise column correlations over the subsample `new_df`.
balanced_corr = new_df.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(balanced_corr, cmap='coolwarm_r')

In [None]:
# Per-class box plots (on the subsample) of the four features that the panel
# titles describe as negatively correlated with 'Class'.
f, ax = plt.subplots(ncols=4, figsize=(20, 4))

negative_features = ("V17", "V14", "V12", "V10")
for panel, feature in zip(ax, negative_features):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=panel)
    panel.set_title('{} vs Class Negative Correlation'.format(feature))

plt.show()

In [None]:
# Per-class box plots (on the subsample) of the four features that the panel
# titles describe as positively correlated with 'Class'.
f, ax = plt.subplots(ncols=4, figsize=(20, 4))

positive_features = ("V11", "V4", "V2", "V19")
for panel, feature in zip(ax, positive_features):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=panel)
    panel.set_title('{} vs Class Positive Correlation'.format(feature))

plt.show()

In [None]:
# Separate the label from the features, then hold out 20% of the subsample
# as a test set (fixed seed for a repeatable split).
y = new_df['Class']
X = new_df.drop(columns=['Class'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2
)

In [None]:
# Replace each pandas split with its underlying NumPy array.
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.values, y_test.values

### Models Training + Evaluation

In [None]:
# Fit four scikit-learn classifiers with their default settings and, for each,
# print the test-set accuracy followed by the per-class classification report.
models = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
]

for clf in models:
    name = type(clf).__name__
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of {}: {}".format(name, acc))
    print(classification_report(y_test, y_pred))

In [None]:
# Small fully connected network: one hidden layer as wide as the input, one
# hidden layer of 32 units, and a 2-unit softmax output (one unit per class).
n_features = X_train.shape[1]

model = Sequential([
    Dense(n_features, input_shape=(n_features,), activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])
# Integer class labels with a softmax output call for the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# 20% of the training rows are set aside by Keras for per-epoch validation.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=25,
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Score the trained network on the held-out test split; the result is the loss
# followed by the metrics given to compile() (accuracy).
model.evaluate(X_test, y_test)

In [None]:
'''
Inspiration
1. https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
'''