In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import hashlib
from imblearn.over_sampling import RandomOverSampler, SMOTE

### Pre-Processing

In [None]:
# Encode the transaction `type` strings as integer category codes.
# NOTE(review): `data` is not created in any of the visible cells — confirm
# that a loading cell (e.g. a pd.read_csv call) runs before this one.
TYPE_CODES = {"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}

# Only map while the column still holds the raw labels. Mapping an
# already-encoded column would match none of the string keys and silently
# replace every value with NaN, so this guard keeps the cell re-runnable.
if not pd.api.types.is_numeric_dtype(data["type"]):
    data["type"] = data["type"].map(TYPE_CODES)
data.head()

In [None]:
# Remove the nameOrig / nameDest columns so they are not used as model
# features (every remaining non-target column becomes a feature later on).
# errors="ignore" makes the cell safe to re-run: `data` is overwritten, so a
# second execution would otherwise raise KeyError for the missing columns.
data = data.drop(columns=["nameOrig", "nameDest"], errors="ignore")

In [None]:
# Separate features from the target, hold out 20% for testing, then standardize
target_column = 'isFraud'
y = data[target_column]
X = data.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# The scaler is fitted on the training split only and then reused as-is on
# the test split, so no test-set statistics enter the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Models

#### DNN

In [None]:
# Dense Neural Network (DNN): two ReLU hidden layers (64 and 32 units)
# followed by a single sigmoid unit for the binary target.
n_features = X_train_scaled.shape[1]
dnn_model = Sequential([
    Dense(64, activation='relu', input_dim=n_features),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
dnn_model.compile(loss='binary_crossentropy', optimizer='adam')

# Train on the standardized training split
dnn_model.fit(X_train_scaled, y_train, batch_size=32, epochs=10)

# Round the sigmoid outputs to turn them into hard 0/1 predictions
dnn_predictions = np.round(dnn_model.predict(X_test_scaled))

In [None]:
# Score the rounded DNN predictions against the held-out labels
dnn_confusion_matrix = confusion_matrix(y_test, dnn_predictions)
dnn_accuracy, dnn_precision, dnn_recall, dnn_f1_score = (
    metric(y_test, dnn_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report: confusion matrix first, then one line per scalar metric
print("DNN Confusion Matrix:")
print(dnn_confusion_matrix)
for metric_label, metric_value in (
    ("DNN Accuracy:", dnn_accuracy),
    ("DNN Precision:", dnn_precision),
    ("DNN Recall:", dnn_recall),
    ("DNN F1-Score:", dnn_f1_score),
):
    print(metric_label, metric_value)

#### Decision Tree

In [None]:
# Baseline Decision Tree: trained WITHOUT class weights so it can be compared
# against the weighted variants.
# No `class_weight` argument is passed on purpose: this cell is labelled
# "without balancing weights", and `class_weights` is not defined at this
# point in the notebook, so referencing it here raises NameError on a fresh
# kernel (or silently applies weights left over from an out-of-order run).
dt_classifier = DecisionTreeClassifier()

# Train the Decision Tree Classifier on the standardized training split
dt_classifier.fit(X_train_scaled, y_train)

# Make predictions with the Decision Tree Classifier on the held-out split
dt_predictions = dt_classifier.predict(X_test_scaled)

# Calculate evaluation metrics for Decision Tree Classifier
dt_confusion_matrix = confusion_matrix(y_test, dt_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_precision = precision_score(y_test, dt_predictions)
dt_recall = recall_score(y_test, dt_predictions)
dt_f1_score = f1_score(y_test, dt_predictions)

# Print the results for Decision Tree Classifier
print("Decision Tree Classifier (without balancing weights) Confusion Matrix:")
print(dt_confusion_matrix)
print("Decision Tree Classifier Accuracy:", dt_accuracy)
print("Decision Tree Classifier Precision:", dt_precision)
print("Decision Tree Classifier Recall:", dt_recall)
print("Decision Tree Classifier F1-Score:", dt_f1_score)

In [None]:
# Weighted Decision Tree: class 1 counts four times as much as class 0
class_weights = {0: 1, 1: 4}  # Adjust the weights

# Build and train in one step (fit returns the fitted estimator itself)
dt_classifier = DecisionTreeClassifier(class_weight=class_weights).fit(
    X_train_scaled, y_train
)

# Predict on the standardized held-out split
dt_predictions = dt_classifier.predict(X_test_scaled)

# Evaluation metrics for the weighted tree
dt_confusion_matrix = confusion_matrix(y_test, dt_predictions)
dt_accuracy, dt_precision, dt_recall, dt_f1_score = (
    metric(y_test, dt_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report: confusion matrix first, then one line per scalar metric
print("Decision Tree Classifier Confusion Matrix:")
print(dt_confusion_matrix)
for metric_label, metric_value in (
    ("Decision Tree Classifier Accuracy:", dt_accuracy),
    ("Decision Tree Classifier Precision:", dt_precision),
    ("Decision Tree Classifier Recall:", dt_recall),
    ("Decision Tree Classifier F1-Score:", dt_f1_score),
):
    print(metric_label, metric_value)

#### Using Grid Search to find the best class weights for the Decision Tree


In [None]:
# Candidate weights for class 1; class 0 is always kept at weight 1
weight_ratios = [1.5, 2, 2.5, 3, 3.5]

# One class_weight dict per candidate ratio
param_grid = dict(class_weight=[{0: 1, 1: ratio} for ratio in weight_ratios])

# Decision tree whose class_weight is tuned by the search
dt_model = DecisionTreeClassifier()

# Cross-validated search (cv=5) that ranks candidates by recall
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Winning class_weight setting plus the full cross-validation table
best_weight_ratio = grid_search.best_params_['class_weight']
best_results = grid_search.cv_results_

# Per-candidate report; scores are read by position, in the same order as
# weight_ratios
for position, ratio in enumerate(weight_ratios):
    mean_score = best_results['mean_test_score'][position]
    std_score = best_results['std_test_score'][position]
    print("Weight Ratio:", ratio)
    print("Mean Recall:", mean_score)
    print("Standard Deviation of Recall:", std_score)
    print()

# Score the best estimator found by the search on the held-out split
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)

accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print the evaluation metrics for the best model
for metric_label, metric_value in (
    ("Best Weight Ratio:", best_weight_ratio),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(metric_label, metric_value)

#### Random Forest

In [None]:
# 2 ESTIMATORS
# Random Forest with two trees, trained on the unscaled training split
rf_model = RandomForestClassifier(random_state=42, n_estimators=2).fit(
    X_train, y_train
)

# Predict on the held-out split
rf_predictions = rf_model.predict(X_test)

# Evaluation metrics
rf_confusion_matrix = confusion_matrix(y_test, rf_predictions)
rf_accuracy, rf_precision, rf_recall, rf_f1_score = (
    metric(y_test, rf_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report: confusion matrix first, then one line per scalar metric
print("Random Forest Confusion Matrix:")
print(rf_confusion_matrix)
for metric_label, metric_value in (
    ("Random Forest Accuracy:", rf_accuracy),
    ("Random Forest Precision:", rf_precision),
    ("Random Forest Recall:", rf_recall),
    ("Random Forest F1-Score:", rf_f1_score),
):
    print(metric_label, metric_value)

In [None]:
# 4 ESTIMATORS
# Same Random Forest setup as the 2-tree run, only the tree count differs
forest_settings = {"n_estimators": 4, "random_state": 42}
rf_model = RandomForestClassifier(**forest_settings)
rf_model.fit(X_train, y_train)

# Predict on the held-out split
rf_predictions = rf_model.predict(X_test)

# Evaluation metrics
rf_confusion_matrix = confusion_matrix(y_test, rf_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1_score = f1_score(y_test, rf_predictions)

# Report: confusion matrix first, then one line per scalar metric
print("Random Forest Confusion Matrix:")
print(rf_confusion_matrix)
rf_report = [
    ("Random Forest Accuracy:", rf_accuracy),
    ("Random Forest Precision:", rf_precision),
    ("Random Forest Recall:", rf_recall),
    ("Random Forest F1-Score:", rf_f1_score),
]
for metric_label, metric_value in rf_report:
    print(metric_label, metric_value)

### Other Improvements?

#### Random Oversampling


#### Other Oversampling techniques?

#### Generating Synthetic data for class imbalance?