In [None]:
# Imported Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Other Libraries
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

In [None]:
# Alternative location of the CSV, kept disabled (presumably a mounted Google Drive path):
# path = "/content/drive/MyDrive/Datasets/creditcard.csv"
# df = pd.read_csv(path)
# Load the dataset from creditcard.csv in the working directory and preview the first rows.
df = pd.read_csv("creditcard.csv")
df.head()

In [None]:
# Show column names
# (the bare expression on the last line is rendered as the cell's output)
df.columns

## Data Cleaning

We start by checking for missing values.

In [None]:
# Largest per-column count of missing entries; a result of 0 means no column has any nulls.
df.isna().sum().max()

In [None]:
# Bar chart of how many transactions fall into each class
color = ["blue", "red"]
class_ax = sns.countplot(data=df, x='Class', palette=color)
class_ax.set_ylabel('Frequency')
class_ax.set_title('Class Distributions', fontsize=14)

This plot shows that the dataset is heavily imbalanced, which will bias the model towards class 0 (non-fraudulent transactions).

In [None]:
# Percentage of rows that belong to each class (0 = non-fraudulent, 1 = fraudulent)
class_pct = df['Class'].value_counts() / len(df) * 100
print('Non-Fraudulent:', round(class_pct[0], 2), '% of the dataset')
print('Fraudulent:', round(class_pct[1], 2), '% of the dataset')

Next, we scale the Amount and Time column since those are the only columns that aren't scaled in our dataset. We can see this from the describe table below.
Another reason we need to scale is because we will be using some distance based algorithms to perform classification.

In [None]:
# Extract statistical details of the data
# (summary statistics per column; used here to compare the value ranges of the columns)
df.describe()

We use RobustScaler to perform the scaling which is less prone to outliers because it uses the median and Inter Quartile Range (IQR) to scale the values.

In [None]:
# Amount and Time are the only features still on their raw scale, so they are scaled here.
# RobustScaler works from the median and the inter-quartile range, which makes it
# less sensitive to outliers.
# NOTE: the scaler is fitted on the full dataset, i.e. before the train/test split made later on.
from sklearn.preprocessing import RobustScaler

rob_scaler = RobustScaler()

# Each raw column is reshaped to a single-feature 2-D array, scaled, and stored under a new name.
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df[raw_col].values.reshape(-1, 1)
    df[scaled_col] = rob_scaler.fit_transform(column_2d)

# The unscaled originals are no longer needed.
df.drop(columns=['Time', 'Amount'], inplace=True)

In [None]:
# Every column is Scaled!
# Preview the frame after Time/Amount were replaced by scaled_time/scaled_amount.
df.head()

Now we use a stratified cross-validation split on the main (unbalanced) dataset to set aside test values that won't be affected by the undersampling.

StratifiedKFold ensures that each fold maintains the original class proportion (99.83 : 0.17).

In [None]:
from sklearn.model_selection import StratifiedKFold

# Class balance of the full dataset, for comparison with the fold distributions printed below
for label, class_id in (('Non-Fraudulent:', 0), ('Fraudulent:', 1)):
    print(label, round(df['Class'].value_counts()[class_id]/len(df) * 100, 2), '% of the dataset')

X = df.drop('Class', axis=1)
y = df['Class']

# Five stratified folds, taken in file order (no shuffling, hence no seed needed)
ss = StratifiedKFold(n_splits=5)

# Print the row indices of every fold.
for train_index, test_index in ss.split(X, y):
    print("Train:", train_index, "Test:", test_index)

# Only the indices of the final fold survive the loop, so that fold alone defines the
# "original" train/test data. The `original_` prefix distinguishes these arrays from the
# X_train / y_train that are created later from the undersampled data.
original_Xtrain = X.iloc[train_index].values
original_Xtest = X.iloc[test_index].values
original_ytrain = y.iloc[train_index].values
original_ytest = y.iloc[test_index].values

# Compare how the labels are distributed in the train and the test part
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print("Train Distribution:", train_counts_label/ len(original_ytrain))
print("Test Distribution:", test_counts_label/ len(original_ytest))

### UnderSampling

In [None]:
# Absolute number of rows per class in the full dataset
df['Class'].value_counts()

Since the classes are highly skewed towards the non-fraudulent transactions, we use undersampling to make them equal.

In [None]:
# Random undersampling: keep every fraud row plus an equally sized set of non-fraud
# rows, so that both classes are equally represented in the subsample.
# NOTE: the subsample is drawn from the whole dataset, so it can contain rows that are
# also part of original_Xtest.

# Shuffle first so the non-fraud rows kept below are a random selection. The shuffle is
# seeded so that the subsample, and every model trained on it, is the same on each run.
df = df.sample(frac=1, random_state=42)

fraud = df.loc[df['Class'] == 1]
# Take exactly as many non-fraud rows as there are fraud rows (instead of hard-coding 492)
non_fraud = df.loc[df['Class'] == 0].iloc[:len(fraud)]

sub_sampled_df = pd.concat([fraud, non_fraud])

# Shuffle dataframe rows so the two classes are interleaved
new_df = sub_sampled_df.sample(frac=1, random_state=4)

new_df.head()

Using frac=1 in df.sample returns the shuffled dataset completely.

In [None]:
# frac=1 samples 100% of the rows, i.e. returns the complete dataset in a new random
# order; the seed keeps that order reproducible across runs.
df = df.sample(frac=1, random_state=42)
df.shape

In [None]:
# Relative share of each class in the undersampled frame, followed by the same
# information as a bar chart
print('Distribution of the Classes in the subsample dataset')
class_share = new_df['Class'].value_counts() / len(new_df)
print(class_share)

balanced_ax = sns.countplot(data=new_df, x='Class', palette=color)
balanced_ax.set_ylabel('Frequency')
balanced_ax.set_title('Equally Distributed Classes', fontsize=14)
plt.show()

Now we see that the dataset (new_df) is equally distributed amongst the classes.

Next, we check if any variable is strongly correlated with the Class column 

In [None]:
# Pairwise correlations between all columns of the undersampled frame, drawn as a heatmap
subsample_corr = new_df.corr()
corr_ax = sns.heatmap(subsample_corr, cmap='coolwarm_r', annot_kws={'size': 20})
corr_ax.set_title('UnderSample Correlation Matrix', fontsize=14)
plt.show()

Negative Correlations: V16, V17, V14, V12 and V10 are negatively correlated. Notice how the lower these values are, the more likely the end result will be a fraud transaction.

Positive Correlations: V2, V4 and V11 are positively correlated. Notice how the higher these values are, the more likely the end result will be a fraud transaction.

## Data Modeling For Undersampling 

In [None]:
# Features and target of the undersampled frame (undersampling happens before cross validating)
X = new_df.drop('Class', axis=1)
y = new_df['Class']

# The data is already scaled, so it only needs to be split: 70% train / 30% test.
# The split is made on the underlying arrays, so the four results are plain NumPy
# arrays ready to be fed to the classification algorithms.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.3, random_state=42
)

In [None]:
# We implement four classifiers, stored under a display label.
# RandomForestClassifier is randomised, so it gets a fixed seed to keep its scores
# reproducible between runs.

classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42)
}

In [None]:
# Now, we calculate the cross validation score with each classifier.
from sklearn.model_selection import cross_val_score

for classifier in classifiers.values():
    # Fit on the full training split so the estimator stored in `classifiers` is left fitted.
    classifier.fit(X_train, y_train)
    # 5-fold cross-validated accuracy on the training split
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # Convert to percent *before* rounding: rounding first and multiplying by 100
    # afterwards re-introduces floating-point noise into the printed value.
    print("Classifiers: ", classifier.__class__.__name__, "has a training score of", round(training_score.mean() * 100, 2), "% accuracy")

We can see that Logistic Regression has the best cross_val_score. Next, we use GridSearchCV to find the best parameters for each classifier.

In [None]:
# Tune each classifier with GridSearchCV, then calculate the cross validation scores
# using the best parameters.
# This cell defines log_reg, knears_neighbors, svc and rf_clf.
from sklearn.model_selection import GridSearchCV, cross_val_score


def _tune_on_train(estimator, param_grid):
    """Grid-search `param_grid` with 5-fold CV on the training split.

    Returns the best estimator, which GridSearchCV has refitted on all of X_train.
    """
    search = GridSearchCV(estimator, param_grid, cv=5)
    search.fit(X_train, y_train)
    return search.best_estimator_


# max_iter is raised above the default so the solver has room to converge for every C.
log_reg = _tune_on_train(
    LogisticRegression(max_iter=1000),
    {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
)
knears_neighbors = _tune_on_train(
    KNeighborsClassifier(),
    {"n_neighbors": [2, 3, 4], "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]},
)
svc = _tune_on_train(
    SVC(),
    {"C": [0.5, 0.7, 0.9, 1], "kernel": ["rbf", "poly", "sigmoid", "linear"]},
)
# Seeded so the search picks the same forest on every run.
rf_clf = _tune_on_train(
    RandomForestClassifier(random_state=42),
    {"n_estimators": [100, 200], "max_depth": [None, 4, 8]},
)

# 5-fold cross-validated accuracy of each tuned model, all printed in the same format.
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
print(f'Logistic Regression Cross Validation Score: {log_reg_score.mean() * 100:.2f}%')

knears_score = cross_val_score(knears_neighbors, X_train, y_train, cv=5)
print(f'Knears Neighbors Cross Validation Score: {knears_score.mean() * 100:.2f}%')

svc_score = cross_val_score(svc, X_train, y_train, cv=5)
print(f'Support Vector Classifier Cross Validation Score: {svc_score.mean() * 100:.2f}%')

rf_score = cross_val_score(rf_clf, X_train, y_train, cv=5)
print(f'RandomForest Classifier Cross Validation Score: {rf_score.mean() * 100:.2f}%')

After using the best parameters, we still see that Logistic regression has the best cross_val_score.

In [None]:
# GridSearchCV is used to determine the parameters that give the best predictive score for each classifier.
# Logistic Regression has the best Receiver Operating Characteristic (ROC) score, meaning that it separates fraud and non-fraud transactions quite accurately.

In [None]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold scores of each classifier on the training split, for the ROC AUC computation.
# ROC AUC ranks samples by a continuous score, so every model contributes either its
# decision-function value or its predicted probability of the fraud class, never hard labels.

log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
                             method="decision_function")

# KNeighborsClassifier has no decision_function; column 1 of predict_proba is the
# probability of class 1 (fraud).
knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5,
                             method="predict_proba")[:, 1]

svc_pred = cross_val_predict(svc, X_train, y_train, cv=5,
                             method="decision_function")

# Same for the random forest: use the fraud-class probability.
rf_pred = cross_val_predict(rf_clf, X_train, y_train, cv=5,
                             method="predict_proba")[:, 1]

In [None]:
# ROC AUC of every classifier, computed from its cross-validated predictions on the training split
from sklearn.metrics import roc_auc_score

for label, fold_pred in (
    ('Logistic Regression: ', log_reg_pred),
    ('KNears Neighbors: ', knears_pred),
    ('Support Vector Classifier: ', svc_pred),
    ('Random Forest Classifier: ', rf_pred),
):
    print(label, roc_auc_score(y_train, fold_pred))

We observe that Logistic Regression has the best cross_val_score and a very high Receiver Operating Characteristic (ROC) score, meaning that it separates fraud and non-fraud transactions quite accurately.

So we would use the Logistic Regression Model when oversampling.

## SMOTE Technique (Over-Sampling)

Synthetic Minority Oversampling Technique (SMOTE) synthesizes new examples from the minority class to make the dataset balanced.

We are most interested in the recall score, because that is the metric that tells us how many of the fraudulent transactions we capture. The formulas for Accuracy, Precision and Recall are given below:
*   Accuracy = (TP+TN)/total
*   Precision = TP/(TP+FP)
*   Recall = TP/(TP+FN)

In [None]:
# SMOTE oversampling, applied after splitting: only the training part of the original
# data is resampled, the test part is left as it is.
sm = SMOTE(random_state=42, sampling_strategy='minority')

# Xsm_train / ysm_train are the resampled training data that the SMOTE models are fitted on.
Xsm_train, ysm_train = sm.fit_resample(original_Xtrain, original_ytrain)

In [None]:
# Logistic regression fitted on the SMOTE-resampled training data
# (fit returns the estimator itself, so it can be assigned directly)
log_reg_sm = LogisticRegression().fit(Xsm_train, ysm_train)
log_reg_sm

### SMOTE Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Logistic Regression fitted using SMOTE technique
y_pred_log_reg = log_reg_sm.predict(X_test)

# Other models fitted with UnderSampling: these are the estimators that were
# cross-validated on the undersampled training split earlier in the notebook.
# (No `*_sm` variants of these three models are created anywhere in the notebook.)
y_pred_knear = knears_neighbors.predict(X_test)
y_pred_svc = svc.predict(X_test)
y_pred_rf = rf_clf.predict(X_test)


log_reg_cf = confusion_matrix(y_test, y_pred_log_reg)
kneighbors_cf = confusion_matrix(y_test, y_pred_knear)
svc_cf = confusion_matrix(y_test, y_pred_svc)
rf_cf = confusion_matrix(y_test, y_pred_rf)


fig, ax = plt.subplots(2, 2,figsize=(22,10))

# One (matrix, title) pair per panel, in row-major order of the 2x2 grid
panels = [
    (log_reg_cf, "Logistic Regression \n Confusion Matrix"),
    (kneighbors_cf, "KNearsNeighbors \n Confusion Matrix"),
    (svc_cf, "Suppor Vector Classifier \n Confusion Matrix"),
    (rf_cf, "Random Forest Classifier \n Confusion Matrix"),
]

for panel_ax, (matrix, title) in zip(ax.flat, panels):
    # fmt='d' prints the cell counts as plain integers; the default annotation format
    # switches to scientific notation for larger counts.
    sns.heatmap(matrix, ax=panel_ax, annot=True, fmt='d', cmap=plt.cm.Blues)
    panel_ax.set_title(title, fontsize=14)
    # Blank out the tick labels on both axes
    panel_ax.set_xticklabels(['', ''], fontsize=14, rotation=90)
    panel_ax.set_yticklabels(['', ''], fontsize=14, rotation=360)


plt.show()

In [None]:
from sklearn.metrics import classification_report

# Precision / recall / F1 report of every model on the undersampled test split,
# each preceded by a heading line
for heading, predictions in (
    ('Logistic Regression:', y_pred_log_reg),
    ('KNears Neighbors:', y_pred_knear),
    ('Support Vector Classifier:', y_pred_svc),
    ('Random Forest Classifier:', y_pred_rf),
):
    print(heading)
    print(classification_report(y_test, predictions))

Observe that the logistic regression model trained with SMOTE has a higher accuracy than the other classifiers.

In [None]:
# Final accuracy of the two logistic-regression variants.
# NOTE: the two scores are measured on different test sets (X_test is the hold-out of
# the undersampled data, original_Xtest is the test fold of the original data), so
# they are not a like-for-like comparison.
from sklearn.metrics import accuracy_score

# Logistic Regression with Under-Sampling
y_pred = log_reg.predict(X_test)
undersample_score = accuracy_score(y_test, y_pred)

# Logistic Regression with SMOTE Technique
y_pred_sm = log_reg_sm.predict(original_Xtest)
oversample_score = accuracy_score(original_ytest, y_pred_sm)

# The columns are declared in display order (Technique, Score), so the frame needs no
# reordering after construction.
final_df = pd.DataFrame({
    'Technique': ['Random UnderSampling', 'Oversampling (SMOTE)'],
    'Score': [undersample_score, oversample_score],
})

final_df

## Neural Network (UnderSampling vs. OverSampling)

### UnderSampling

In [None]:
# Number of input features = number of columns of the undersampled training data
n_inputs = X_train.shape[1]

# Small fully connected network, assembled layer by layer:
# a hidden layer as wide as the input, a 32-unit hidden layer, and a 2-unit softmax output.
undersample_model = tf.keras.Sequential()
undersample_model.add(tf.keras.layers.Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'))
undersample_model.add(tf.keras.layers.Dense(32, activation='relu'))
undersample_model.add(tf.keras.layers.Dense(2, activation='softmax'))

In [None]:
# Print the layer-by-layer overview of the undersample network
undersample_model.summary()

In [None]:
# Adam optimiser with a learning rate of 0.001; sparse categorical cross-entropy as the
# loss, i.e. integer class labels against the 2-unit softmax output.
# NOTE(review): 'Recall' is a binary-style metric; with a 2-unit softmax output and
# integer labels it may raise a shape error or report a misleading value. Confirm
# against the Keras metrics documentation.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
undersample_model.compile(opt, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['Recall'])

In [None]:
# Train on the undersampled training split for 20 epochs in batches of 25,
# holding out 20% of that split for validation.
undersample_model.fit(X_train, y_train, validation_split=0.2, batch_size=25, epochs=20, shuffle=True, verbose=2)

#### Model Evaluation

In [None]:
# Softmax output of the undersample network for the test fold of the original data.
# (The next cell issues the same predict call again before taking the argmax.)
undersample_predictions = undersample_model.predict(original_Xtest, batch_size=200, verbose=0)

In [None]:
# Network output for the original test fold: one row per sample, one column per class
predict_x = undersample_model.predict(original_Xtest, batch_size=200, verbose=0)
# The predicted class is the index of the column with the highest score
undersample_fraud_predictions = predict_x.argmax(axis=1)

In [None]:
# Confusion matrix of the undersample network on the original test fold ...
undersample_cm = confusion_matrix(original_ytest, undersample_fraud_predictions)
# ... and, as a reference, the matrix obtained by comparing the true labels with themselves
actual_cm = confusion_matrix(original_ytest, original_ytest)

f, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 8))

# fmt='d' prints the counts as plain integers; the default annotation format would show
# the large counts of the original test fold in scientific notation.
sns.heatmap(undersample_cm, ax=ax1, annot=True, fmt='d', cmap=plt.cm.Reds)
ax1.set_title("Random UnderSample \n Confusion Matrix", fontsize=14)

sns.heatmap(actual_cm, ax=ax2, annot=True, fmt='d', cmap=plt.cm.Blues)
ax2.set_title("Confusion Matrix \n Of Original Test Set", fontsize=14)

### OverSampling

In [None]:
# Number of input features = number of columns of the SMOTE-resampled training data
n_inputs = Xsm_train.shape[1]

# Small fully connected network, assembled layer by layer:
# a hidden layer as wide as the input, a 32-unit hidden layer, and a 2-unit softmax output.
oversample_model = tf.keras.Sequential()
oversample_model.add(tf.keras.layers.Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'))
oversample_model.add(tf.keras.layers.Dense(32, activation='relu'))
oversample_model.add(tf.keras.layers.Dense(2, activation='softmax'))

In [None]:
# Adam optimiser with a learning rate of 0.001; sparse categorical cross-entropy as the
# loss, i.e. integer class labels against the 2-unit softmax output.
# NOTE(review): 'Recall' is a binary-style metric; with a 2-unit softmax output and
# integer labels it may raise a shape error or report a misleading value. Confirm
# against the Keras metrics documentation.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
oversample_model.compile(opt, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['Recall'])

In [None]:
# Train on the SMOTE-resampled training data for 20 epochs in batches of 300,
# holding out 20% of it for validation.
oversample_model.fit(Xsm_train, ysm_train, validation_split=0.2, batch_size=300, epochs=20, shuffle=True, verbose=2)

#### Model Evaluation

In [None]:
# Softmax output of the oversample network for the test fold of the original data.
# (The next cell issues the same predict call again before taking the argmax.)
oversample_predictions = oversample_model.predict(original_Xtest, batch_size=200, verbose=0)

In [None]:
# Network output for the original test fold: one row per sample, one column per class
predict_x2 = oversample_model.predict(original_Xtest, batch_size=200, verbose=0)
# The predicted class is the index of the column with the highest score
oversample_fraud_predictions = predict_x2.argmax(axis=1)

In [None]:
# Confusion matrix of the oversample (SMOTE) network on the original test fold ...
oversample_smote = confusion_matrix(original_ytest, oversample_fraud_predictions)
# ... and, as a reference, the matrix obtained by comparing the true labels with themselves
actual_cm = confusion_matrix(original_ytest, original_ytest)

f, (ax1, ax2) = plt.subplots(1,2, figsize=(16, 8))

# fmt='d' prints the counts as plain integers; the default annotation format would show
# the large counts of the original test fold in scientific notation.
sns.heatmap(oversample_smote, ax=ax1, annot=True, fmt='d', cmap=plt.cm.Oranges)
ax1.set_title("OverSample (SMOTE) \n Confusion Matrix", fontsize=14)

sns.heatmap(actual_cm, ax=ax2, annot=True, fmt='d', cmap=plt.cm.Greens)
ax2.set_title("Confusion Matrix \n Of Original Test Set", fontsize=14)

We can see that the Neural Network Model of the OverSampled Data did much better than the one applied to the UnderSampled Data. This could be because when undersampling there is loss of information which could have been used in model building.