# Credit Fraud Machine Learning Dashboard

Precision: Higher precision means lower false positives  
Recall: Higher recall means lower false negatives  
F1-Score: Balance of both, F1 Score = 2 * (Precision * Recall) / (Precision + Recall)  
Base Data: All data points, keeping the original class imbalance  
Balanced Data: The legitimate transactions are undersampled to give a 50/50 split of legitimate vs. fraudulent transactions. This usually improves the detection of fraudulent transactions at the cost of lower accuracy on legitimate ones.

In [66]:
import pandas as pd
# Read the transactions and discard every row that has a missing value.
df = pd.read_csv('creditcard.csv').dropna()

# Never truncate the column list when a frame is displayed.
pd.options.display.max_columns = None

In [67]:
#Descriptive analysis
import matplotlib.pyplot as plt
# Snapshot the data for the histograms. A later cell rescales `Amount` and
# `Time` on `df` in place; a plain alias (`df_hist = df`) would pick those
# changes up and the "descriptive" histograms would show scaled values
# instead of the original ones.
df_hist = df.copy()
plt.close()

def generateGraphOne(column_name):
    """Display a 30-bin histogram of one column of ``df_hist``.

    Args:
        column_name: Label of the ``df_hist`` column to plot.
    """
    figure, axes = plt.subplots(figsize=(2, 1))
    df_hist[column_name].hist(ax=axes, bins=30)
    axes.set(
        xlabel='Value',
        ylabel='Frequency',
        title=f'Histogram of {column_name}',
    )
    plt.show()
#generateGraphOne('Amount')

In [68]:
import seaborn as sns
df_a = df[['class', 'Amount']].copy()


def generateGraphTwo():
    """Plot transaction counts per amount range, one panel per class.

    The left panel shows legitimate transactions (``class == 0``) and the
    right panel fraudulent ones (``class == 1``). Each bar is annotated with
    its count.

    Side effects:
        Writes the ``Amount_Category`` and ``Amount_Category_Fraud`` columns
        on the module-level ``df_a`` frame and shows a matplotlib figure.
    """
    amount_bins = [0, 10, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275,
                   300, 325, 350, 375, 400, 425, 450, 475, 500, float('inf')]
    labels = ['0-10', '11-25', '26-50', '51-75', '76-100', '101-125', '126-150',
              '151-175', '176-200', '201-225', '226-250', '251-275', '276-300',
              '301-325', '326-350', '351-375', '376-400', '401-425', '426-450',
              '451-475', '476-500', 'Above 500']

    # Create two subplots side by side for legitimate and fraudulent transactions
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # (class value, category column written on df_a, title suffix, target axes)
    panels = [
        (0, 'Amount_Category', 'Legitimate', ax1),
        (1, 'Amount_Category_Fraud', 'Fraudulent', ax2),
    ]

    for class_value, category_column, class_label, ax in panels:
        is_class = df_a['class'] == class_value

        # Bin only this class's amounts; rows of the other class stay NaN.
        # include_lowest=True keeps zero-amount transactions in the '0-10'
        # bin; with right-closed bins alone they would fall outside (0, 10]
        # and be dropped from the chart.
        df_a[category_column] = pd.cut(
            df_a.loc[is_class, 'Amount'],
            bins=amount_bins,
            labels=labels,
            right=True,
            include_lowest=True,
        )
        class_rows = df_a[is_class]

        # value_counts() orders by frequency, but the bars are drawn in
        # `labels` order, so the counts are re-aligned to `labels` before
        # they are used as bar annotations.
        category_counts = class_rows[category_column].value_counts().reindex(labels, fill_value=0)

        sns.countplot(x=category_column, data=class_rows, order=labels, palette='viridis', ax=ax)
        ax.set_xlabel('Transaction Amount Range')
        ax.set_ylabel('Number of Transactions')
        ax.set_title(f'Number of Transactions for Different Transaction Amount Ranges ({class_label})')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        for position, count in enumerate(category_counts):
            ax.text(position, count, str(count), ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()


In [69]:
#Nondescriptive methods
#Shrinking the outlier in the amount column
from sklearn.preprocessing import RobustScaler
# NOTE: `new_df` starts out as an alias of `df`, so the two scaling steps
# below also rewrite `Amount` and `Time` on `df` itself. Missing values were
# already dropped when `df` was loaded, so no second dropna is needed here.
# NOTE(review): both scalers are fitted on the full data set, before any
# train/test split - confirm that this is acceptable for the evaluation.
new_df = df

# Shrink the influence of outliers in the amount column with RobustScaler.
new_df['Amount'] = RobustScaler().fit_transform(new_df['Amount'].to_numpy().reshape(-1, 1)).ravel()

# Min-max scale time into [0, 1]
time = new_df['Time']
new_df['Time'] = (time - time.min()) / (time.max() - time.min())

# Randomize the row order. The fixed seed makes the shuffle (and everything
# derived from `new_df`) reproducible across kernel restarts. From here on
# `new_df` is a new, shuffled frame and no longer an alias of `df`.
new_df = new_df.sample(frac=1, random_state=42)

In [70]:
#convert to numpy arrays and modifying to fit with sklearn formats
#train.dropna(inplace= True)

# Split the scaled *and shuffled* frame. Slicing `df` instead would ignore
# the shuffle and split the rows in their original file order.
# The boundaries are contiguous: the previous 200001 / 240001 starts silently
# dropped rows 200000 and 240000 from every set.
# NOTE(review): the first 32 rows are still left out of every split, as
# before - confirm whether that is intentional.
train = new_df[32:200000]
test = new_df[200000:240000]
val = new_df[240000:]

train_np = train.to_numpy()
test_np = test.to_numpy()
val_np = val.to_numpy()

# The last column is the label; every column before it is a feature.
x_train, y_train = train_np[:, :-1], train_np[:, -1]
x_test, y_test = test_np[:, :-1], test_np[:, -1]
x_val, y_val = val_np[:, :-1], val_np[:, -1]

#x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic regression with base data.
# With the default iteration budget the solver stopped at the iteration limit
# before converging (see the warning this cell used to emit), so the budget
# is raised explicitly.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(x_train, y_train)

training_accuracy = logistic_model.score(x_train, y_train)
test_accuracy = logistic_model.score(x_test, y_test)

# Text report (precision / recall / F1 per class) consumed later by the
# model-comparison charts.
base_log = classification_report(
    y_test,
    logistic_model.predict(x_test),
    target_names=['Legitimate', 'Fraudulent'],
)
#print(base_log)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [72]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import sys
import os

# Shallow neural network with base data: one hidden ReLU layer and a sigmoid
# output unit for the binary label.
model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Keep the training/evaluation chatter out of the dashboard by pointing
# stdout at the null device - but only for this section. Previously stdout
# was never restored and the devnull handle never closed, so every later
# print in the notebook was silently discarded.
stdout_original = sys.stdout
with open(os.devnull, 'w') as devnull:
    sys.stdout = devnull
    try:
        history = model.fit(x_train, y_train, batch_size=2048, epochs=30, validation_data=(x_val, y_val))

        test_loss, test_accuracy = model.evaluate(x_test, y_test)
        # These two lines are swallowed as well, exactly as before.
        print("Test Loss:", test_loss)
        print("Test Accuracy:", test_accuracy)

        y_prob = model.predict(x_test)
    finally:
        # Always give stdout back, even if training raises.
        sys.stdout = stdout_original

# Threshold the sigmoid output at 0.5 to get hard class predictions.
y_pred = (y_prob > 0.5).astype(int)

# Same class names as the logistic-regression report so the comparison
# charts label both models identically.
base_snn = classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraudulent'])
#print(base_snn)

In [73]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Build a class-balanced data set from the scaled data by random undersampling.
x = new_df.drop(['class'], axis=1)
y = new_df['class']

# Fixed seeds keep the sampled rows - and every metric computed from them -
# reproducible across kernel restarts.
under_sampler = RandomUnderSampler(random_state=42)
x_resampled, y_resampled = under_sampler.fit_resample(x, y)

# Re-attach the label as the last column.
resampled_df = pd.DataFrame(data=x_resampled, columns=x.columns)
resampled_df['class'] = y_resampled

# Shuffle the resampled rows before they are split.
resampled_df = resampled_df.sample(frac=1, random_state=42)

#resampled_df


In [74]:
# Contiguous train / test / validation slices of the balanced data. The
# previous 701 / 851 starts silently dropped rows 700 and 850, and the
# hard-coded end (984) tied the split to one exact row count.
# NOTE(review): the first 32 rows are still left out of every split, as
# before - confirm whether that is intentional.
resampled_train = resampled_df[32:700]
resampled_test = resampled_df[700:850]
resampled_val = resampled_df[850:]


train_np_b = resampled_train.to_numpy()
test_np_b = resampled_test.to_numpy()
val_np_b = resampled_val.to_numpy()


# The last column is the label; every column before it is a feature.
x_train_b, y_train_b = train_np_b[:, :-1], train_np_b[:, -1]
x_test_b, y_test_b = test_np_b[:, :-1], test_np_b[:, -1]
x_val_b, y_val_b = val_np_b[:, :-1], val_np_b[:, -1]

In [75]:
# Logistic regression fitted on the balanced (undersampled) split.
# Note: this rebinds `logistic_model`, `training_accuracy` and `test_accuracy`.
logistic_model = LogisticRegression().fit(x_train_b, y_train_b)

training_accuracy = logistic_model.score(x_train_b, y_train_b)
test_accuracy = logistic_model.score(x_test_b, y_test_b)

# Text report used later by the model-comparison charts.
balanced_log = classification_report(
    y_test_b,
    logistic_model.predict(x_test_b),
    target_names=['Legitimate', 'Fraudulent'],
)
#print(balanced_log)

In [76]:
# Shallow neural network with balanced data
model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train_b.shape[1],)),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Validate on the *balanced* validation split. The unbalanced (x_val, y_val)
# was used here before, so the validation metrics described a different
# class distribution than the one this model is trained and tested on.
# verbose=0 keeps the per-epoch logs out of the dashboard, matching the
# silenced training of the base-data network.
history = model.fit(
    x_train_b,
    y_train_b,
    batch_size=1024,
    epochs=40,
    validation_data=(x_val_b, y_val_b),
    verbose=0,
)

test_loss, test_accuracy = model.evaluate(x_test_b, y_test_b, verbose=0)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Threshold the sigmoid output at 0.5 to get hard class predictions.
y_prob = model.predict(x_test_b, verbose=0)
y_pred = (y_prob > 0.5).astype(int)

# Same class names as the logistic-regression reports so the comparison
# charts label every model identically.
balanced_snn = classification_report(y_test_b, y_pred, target_names=['Legitimate', 'Fraudulent'])
#print(balanced_snn)

In [77]:
def _parse_classification_report(report):
    """Extract the per-class rows of a text classification report.

    Args:
        report: Report text whose first two lines are the column header and
            a blank line, followed by one row per class
            (``name precision recall f1-score support``) and then a blank
            line before the summary rows.

    Returns:
        Tuple ``(class_names, precision, recall, f1_score)`` of equally long
        lists.

    Raises:
        ValueError: If a per-class row has fewer than five fields or a score
            is not a number.
    """
    class_names, precision, recall, f1_score = [], [], [], []

    for line in report.split('\n')[2:]:
        fields = line.split()
        if not fields:
            if class_names:
                # The blank line after the class rows starts the summary section.
                break
            continue
        if len(fields) < 5:
            raise ValueError(f'Unrecognised classification report row: {line!r}')

        # The last four fields are the numbers; everything before them is the
        # class name, which may itself contain spaces (e.g. "Not Fraud").
        *name_parts, row_precision, row_recall, row_f1, _support = fields
        class_names.append(' '.join(name_parts))
        precision.append(float(row_precision))
        recall.append(float(row_recall))
        f1_score.append(float(row_f1))

    return class_names, precision, recall, f1_score


def plot_classification_report(report, title):
    """Draw a grouped bar chart of precision, recall and F1-score per class.

    Args:
        report: Text classification report (see
            ``_parse_classification_report`` for the expected layout).
        title: Title shown above the chart.
    """
    class_names, precision, recall, f1_score = _parse_classification_report(report)

    fig, ax = plt.subplots(figsize=(6, 4))
    bar_width = 0.2
    index = range(len(class_names))

    series = [
        (precision, 'Precision', 'b'),
        (recall, 'Recall', 'g'),
        (f1_score, 'F1-Score', 'r'),
    ]
    # One bar per metric, shifted by one bar width each so they sit side by side.
    for offset, (scores, label, color) in enumerate(series):
        ax.bar([i + offset * bar_width for i in index], scores, width=bar_width, label=label, color=color)

    ax.set_xlabel('Class')
    ax.set_ylabel('Score')
    ax.set_title(title)

    # Zoom in on 0.5-1 when every score fits; otherwise start at 0 so that a
    # score below 0.5 is still drawn instead of vanishing below the axis.
    lowest_score = min(precision + recall + f1_score, default=1.0)
    ax.set_ylim(0.5 if lowest_score >= 0.5 else 0.0, 1)

    ax.set_xticks([i + bar_width for i in index])
    ax.set_xticklabels(class_names)
    ax.legend()

    plt.tight_layout()
    plt.show()

# Short aliases for the four classification reports built above:
# base-data models first, then balanced-data models.
report1, report2 = base_log, base_snn
report3, report4 = balanced_log, balanced_snn

def generateGraphThree():
    """Plot one precision/recall/F1 chart per classification report."""
    titled_reports = (
        (report1, "Base Data Logistic Regression"),
        (report2, "Base Data Shallow NN"),
        (report3, "Balanced Data Logistic Regression"),
        (report4, "Balanced Data Shallow NN"),
    )
    for model_report, chart_title in titled_reports:
        plot_classification_report(model_report, chart_title)


## Select a Graph to View

In [78]:
import ipywidgets as widgets
from IPython.display import display

# Selector for the group of graphs to show; the first option is preselected.
dropdown = widgets.Dropdown(
    description='Select Graph:',
    value='All Data Histograms',
    options=[
        'All Data Histograms',
        'Amounts of Legitimate and Fraudulent Transactions',
        'Machine Learning Model Comparisons',
    ],
)
display(dropdown)

# Output area placed below the dropdown; the graphs are rendered into it.
output = widgets.Output()
display(output)

import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def update_graph(change):
    """Redraw the output area for the graph currently selected in the dropdown.

    Args:
        change: Change notification passed by the widget observer. It is not
            inspected; the selection is read from ``dropdown.value``.
    """
    def show_all_histograms():
        for column_name in df.columns:
            generateGraphOne(column_name)

    # Lambdas defer the name lookup of each graph function until it is chosen.
    renderers = {
        'All Data Histograms': show_all_histograms,
        'Amounts of Legitimate and Fraudulent Transactions': lambda: generateGraphTwo(),
        'Machine Learning Model Comparisons': lambda: generateGraphThree(),
    }

    with output:
        clear_output(wait=True)
        render = renderers.get(dropdown.value)
        if render is not None:
            render()

# Draw the preselected graph once so the output area is not empty before the
# first selection is made.
update_graph(None)
# Redraw whenever the dropdown's `value` changes.
dropdown.observe(update_graph, names='value')




Dropdown(description='Select Graph:', options=('All Data Histograms', 'Amounts of Legitimate and Fraudulent Tr…

Output()