# Deep Neural Network

## 1. Importing libraries

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, classification_report, roc_auc_score, roc_curve, auc
from imblearn.over_sampling import RandomOverSampler, SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.utils import shuffle

## 2. Import and pre-process data

In [4]:
data = pd.read_csv('./Data/onlinefraud.csv')
# Convert the 'type' attribute to a numerical one
data["type"] = data["type"].map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5})
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
data = data.drop(["nameOrig", "nameDest"], axis=1)

In [6]:
# Shuffle the data randomly
data = data.sample(frac=1, random_state=42)

# Set the target variable (y) and input features (X)
target_variable = 'isFraud'
input_features = [col for col in data.columns if col != target_variable]

# Determine the proportions for train, test, and validation sets
train_ratio = 0.8
test_ratio = 0.1
validation_ratio = 0.1

# Split the data into train, test, and validation sets
train_data, remaining_data = train_test_split(data, test_size=1 - train_ratio, random_state=42)
test_data, validation_data = train_test_split(remaining_data, test_size=validation_ratio / (test_ratio + validation_ratio), random_state=42)

# Set the y and X values for train, test, and validation sets
y_train = train_data[target_variable]
X_train = train_data[input_features]

y_test = test_data[target_variable]
X_test = test_data[input_features]

y_validation = validation_data[target_variable]
X_validation = validation_data[input_features]

# Scale the input features
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_validation_scaled = scaler.transform(X_validation)

# Verify the shapes of the resulting sets
print("Train set shapes - X:", X_train_scaled.shape, "y:", y_train.shape)
print("Test set shapes - X:", X_test_scaled.shape, "y:", y_test.shape)
print("Validation set shapes - X:", X_validation_scaled.shape, "y:", y_validation.shape)

Train set shapes - X: (5090096, 8) y: (5090096,)
Test set shapes - X: (636262, 8) y: (636262,)
Validation set shapes - X: (636262, 8) y: (636262,)


## 3. Helper Function

In [7]:
# Function to train a DNN and return the metrics
def train_dnn(X_train, y_train, X_val, y_val, X_test, y_test, epochs, batch_size):
    model = Sequential()
    model.add(Dense(15, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense(24, activation='relu'))
    model.add(Dropout(0.5))
    #model.add(Dense(20, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['Precision','Recall'])

    # Train the model
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1)

    # Make predictions on validation data
    y_val_pred = model.predict(X_val)
    y_val_pred = np.round(y_val_pred)
    
    # Make predictions on test data
    y_test_pred = model.predict(X_test)
    y_test_pred = np.round(y_test_pred)

    # Calculate metrics for validation data
    val_cm = confusion_matrix(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred)
    val_recall = recall_score(y_val, y_val_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    # Calculate metrics for test data
    test_cm = confusion_matrix(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    loss = history.history['loss']
    weight_matrix = model.get_weights()

    return (val_cm, val_f1, val_precision, val_recall, val_accuracy,
            test_cm, test_f1, test_precision, test_recall, test_accuracy,
            loss, weight_matrix)

## 4. Models

### 4A. Testing to find the best number of epochs on normal data

In [12]:
# Specify hyperparameters
epochs = [1,2,5,8,10]  # Number of epochs to train

batch_size = 32  # Batch size for training

# Initialize variables to store results
loss_per_epoch = []
val_metrics = []
test_metrics = []

# Iterate over different numbers of hidden layers
for epoch in epochs:
    # Train the DNN and obtain metrics
    val_cm, val_f1, val_precision, val_recall, val_accuracy, \
    test_cm, test_f1, test_precision, test_recall, test_accuracy, \
    loss, weight_matrix = train_dnn(X_train_scaled, y_train, X_validation_scaled, y_validation, X_test_scaled, y_test, epoch, batch_size)
    # Store the loss per epoch
    loss_per_epoch.append(loss)

    # Store the metrics
    val_metrics.append((val_cm, val_f1, val_precision, val_recall, val_accuracy))
    test_metrics.append((test_cm, test_f1, test_precision, test_recall, test_accuracy))

# Print and plot the metrics
for i, epoch in enumerate(epochs):
    print(f"Metrics for {epoch} epochs:")
    print("Validation Metrics:")
    val_cm, val_f1, val_precision, val_recall, val_accuracy = val_metrics[i]
    print("Confusion Matrix:")
    print(val_cm)
    print("F1-score:", val_f1)
    print("Precision:", val_precision)
    print("Recall:", val_recall)
    print("Accuracy:", val_accuracy)
    print()

    print("Test Metrics:")
    test_cm, test_f1, test_precision, test_recall, test_accuracy = test_metrics[i]
    print("Confusion Matrix:")
    print(test_cm)
    print("F1-score:", test_f1)
    print("Precision:", test_precision)
    print("Recall:", test_recall)
    print("Accuracy:", test_accuracy)
    print()

Epoch 1/2
Epoch 2/2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics for 1 epochs:
Validation Metrics:
Confusion Matrix:
[[635422      7]
 [   414    419]]
F1-score: 0.6656076250992852
Precision: 0.9835680751173709
Recall: 0.503001200480192
Accuracy: 0.9993383228921419

Test Metrics:
Confusion Matrix:
[[635415      5]
 [   445    397]]
F1-score: 0.6382636655948553
Precision: 0.9875621890547264
Recall: 0.47149643705463185
Accuracy: 0.9992927441839997

Metrics for 2 epochs:
Validation Metrics:
Confusion Matrix:
[[635428      1]
 [   522    311]]
F1-score: 0.5432314410480349
Precision: 0.9967948717948718
Recall: 0.37334933973589435
Accuracy: 0.9991780115738486

Test Metrics:
Confusion Matrix:
[[635420      0]
 [   542    300]]
F1-score: 0.5253940455341506
Precision: 1.0
Recall: 0.356294536817

#### Interpretation of the above data

On the normal set of data, applying more epochs increased the precision and decreased the recall value. We think this may be due to the number of actual fraudulent in the set being too low and the model being trained to better identify "not fraud" than "fraud".  

### 4B. Model using the most epoch with different sampling ratios

In [11]:
# Specify hyperparameters
epochs = 10  # Number of epochs to train
batch_size = 32  # Batch size for training
sampling_ratios = [0.05,0.1, 0.2, 0.3, 0.4]  # Sampling ratios to test
#,0.1, 0.2, 0.3, 0.4
# Initialize variables to store results
loss_per_epoch = []
val_metrics = []
test_metrics = []

# Iterate over different sampling ratios
for ratio in sampling_ratios:
    # Apply SMOTE to augment the training data
    smote = SMOTE(sampling_strategy=ratio, random_state=42)
    X_train_aug, y_train_aug = smote.fit_resample(X_train_scaled, y_train)

    # Train the DNN and obtain metrics
    val_cm, val_f1, val_precision, val_recall, val_accuracy, \
    test_cm, test_f1, test_precision, test_recall, test_accuracy, \
    loss, weight_matrix = train_dnn(X_train_aug, y_train_aug, X_validation_scaled, y_validation, X_test_scaled, y_test, epochs, batch_size)

    # Store the loss per epoch
    loss_per_epoch.append(loss)

    # Store the metrics
    val_metrics.append((val_cm, val_f1, val_precision, val_recall, val_accuracy))
    test_metrics.append((test_cm, test_f1, test_precision, test_recall, test_accuracy))

# Print the results
for i, ratio in enumerate(sampling_ratios):
    print(f"Metrics for sampling ratio {ratio}:")
    print("Validation Metrics:")
    print("Confusion Matrix:")
    print(val_metrics[i][0])
    print("F1-score:", val_metrics[i][1])
    print("Precision:", val_metrics[i][2])
    print("Recall:", val_metrics[i][3])
    print("Accuracy:", val_metrics[i][4])

    print("Test Metrics:")
    print("Confusion Matrix:")
    print(test_metrics[i][0])
    print("F1-score:", test_metrics[i][1])
    print("Precision:", test_metrics[i][2])
    print("Recall:", test_metrics[i][3])
    print("Accuracy:", test_metrics[i][4])
    print()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics for sampling ratio 0.05:
Validation Metrics:
Confusion Matrix:
[[635083    346]
 [   145    688]]
F1-score: 0.7370112479914301
Precision: 0.6653771760154739
Recall: 0.8259303721488596
Accuracy: 0.9992283053207641
Test Metrics:
Confusion Matrix:
[[635131    289]
 [   125    717]]
F1-score: 0.775974025974026
Precision: 0.7127236580516899
Recall: 0.8515439429928741
Accuracy: 0.9993493246492797

Metrics for sampling ratio 0.1:
Validation

#### Interpretation of the above data

It seems that the redistribution of data increases the recall value but also decreases the precision value. We think 0.1 is the best ratio to use for resampling the data because the recall rate was close to 80% on the test data but also had an 86% on the precision. 