In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, accuracy_score, classification_report, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, average_precision_score

In [2]:
# Load the data
X_train = pd.read_csv('../Resources/Datasets/X_train.csv', index_col=0)
y_train = pd.read_csv('../Resources/Datasets/y_train.csv', index_col=0)
X_test = pd.read_csv('../Resources/Datasets/X_test.csv', index_col=0)
y_test = pd.read_csv('../Resources/Datasets/y_test.csv', index_col=0)

In [3]:
# Create the parameter grid based on the results of random search
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

# Create a base model
nn = MLPClassifier(random_state=42, early_stopping=True, max_iter=400, n_iter_no_change=20, verbose=True)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=nn, param_grid=param_grid, scoring='f1_weighted', verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train.values.ravel())

# Print the best parameters found
print(grid_search.best_params_)
print(grid_search.best_score_)

ValueError: Found input variables with inconsistent numbers of samples: [201445, 208480]

In [None]:
# train the model with the best parameters
nn = grid_search.best_estimator_
nn.fit(X_train, y_train.values.ravel())

# predict the test set
y_pred = nn.predict(X_test)

Iteration 1, loss = 5.03877758
Validation score: 0.349338
Iteration 2, loss = 2.08335459
Validation score: 0.558663
Iteration 3, loss = 1.63697481
Validation score: 0.429106
Iteration 4, loss = 1.24433394
Validation score: 0.575163
Iteration 5, loss = 1.05132229
Validation score: 0.535495
Iteration 6, loss = 1.03672321
Validation score: 0.471556
Iteration 7, loss = 0.92755423
Validation score: 0.574156
Iteration 8, loss = 0.89267799
Validation score: 0.558711
Iteration 9, loss = 0.89194056
Validation score: 0.567920
Iteration 10, loss = 0.86828826
Validation score: 0.564323
Iteration 11, loss = 0.85231873
Validation score: 0.526046
Iteration 12, loss = 0.83132464
Validation score: 0.572189
Iteration 13, loss = 0.82936215
Validation score: 0.572333
Iteration 14, loss = 0.82445594
Validation score: 0.572093
Iteration 15, loss = 0.81777744
Validation score: 0.551756
Iteration 16, loss = 0.81757443
Validation score: 0.570079
Iteration 17, loss = 0.81303046
Validation score: 0.561733
Iterat

In [None]:
result_df = pd.read_csv('../Resources/Datasets/results.csv', index_col=0)
result_df

Unnamed: 0_level_0,accuracy,precision,recall,weighted_f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Decision Tree,0.0,0.0,0.0,0.0,0.0,0.0
Random Forest,0.0,0.0,0.0,0.0,0.0,0.0
XGB,0.0,0.0,0.0,0.0,0.0,0.0
SVM,0.0,0.0,0.0,0.0,0.0,0.0
KNN,0.0,0.0,0.0,0.0,0.0,0.0
Logistic Regression,0.0,0.0,0.0,0.0,0.0,0.0
Naive Bayes,0.0,0.0,0.0,0.0,0.0,0.0
Neural Network,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("F1 Score (Weighted):", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score (Weighted):", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score (Weighted):", recall_score(y_test, y_pred, average='weighted'))

y_pred_proba = grid_search.best_estimator_.predict_proba(X_test)
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted'))

              precision    recall  f1-score   support

           1       0.63      0.38      0.47      5025
           2       0.66      0.84      0.74     29652
           3       0.66      0.43      0.52     17444

    accuracy                           0.66     52121
   macro avg       0.65      0.55      0.58     52121
weighted avg       0.66      0.66      0.64     52121

[[ 1887  3034   104]
 [ 1055 24828  3769]
 [   66  9886  7492]]
Accuracy Score: 0.6562997640106675
F1 Score (Weighted): 0.6385024992101994
Precision Score (Weighted): 0.6552966084251843
Recall Score (Weighted): 0.6562997640106675
ROC AUC Score: 0.7355116343837474


In [None]:
# update the results dataframe
result_df.loc['Neural Network'] = [accuracy_score(y_test, y_pred), 
                                   precision_score(y_test, y_pred, average='weighted'), 
                                   recall_score(y_test, y_pred, average='weighted'), 
                                   f1_score(y_test, y_pred, average='weighted'), 
                                   roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted'), 
                                   average_precision_score(y_test, y_pred_proba, average='weighted')]

In [None]:
result_df

Unnamed: 0_level_0,accuracy,precision,recall,weighted_f1_score,AUC,AP
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Decision Tree,0.0,0.0,0.0,0.0,0.0,0.0
Random Forest,0.0,0.0,0.0,0.0,0.0,0.0
XGB,0.0,0.0,0.0,0.0,0.0,0.0
SVM,0.0,0.0,0.0,0.0,0.0,0.0
KNN,0.0,0.0,0.0,0.0,0.0,0.0
Logistic Regression,0.0,0.0,0.0,0.0,0.0,0.0
Naive Bayes,0.0,0.0,0.0,0.0,0.0,0.0
Neural Network,0.6563,0.655297,0.6563,0.638502,0.735512,0.669619


In [None]:
result_df.to_csv('../Resources/Datasets/results.csv')

In [None]:
# save the model using pickle
import pickle
filename = '../Models/nn_model.pkl'
pickle.dump(nn, open(filename, 'wb'))