In [1]:
from numpy import random
random.seed(2814)

In [2]:
from numpy import load
X_train = load('../../data/interim//Design/X_train.npy')
X_test = load('../../data/interim/Design/X_test.npy')
y_train = load('../../data/interim/Design/y_train.npy')
y_test = load('../../data/interim/Design/y_test.npy')

In [3]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
_, X_strat, _, y_strat = train_test_split(
X_train, y_train, test_size=0.013, random_state=2814, stratify=y_train)

In [4]:
len(X_strat)

17014

## 1. KNN

In [5]:
from sklearn.neighbors import KNeighborsClassifier
parameters_KNN = {
    'n_neighbors': range(1,50,2),
    'leaf_size': range(20,40,5),
    'weights': ('uniform', 'distance'),
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'metric': ['minkowski']}

knn = KNeighborsClassifier()

from sklearn.model_selection import StratifiedKFold, GridSearchCV
grid_knn = GridSearchCV(knn, parameters_KNN, cv=StratifiedKFold(10), scoring='accuracy')

In [6]:
grid_knn.fit(X_strat, y_strat)

In [7]:
grid_knn.best_params_

{'algorithm': 'auto',
 'leaf_size': 20,
 'metric': 'minkowski',
 'n_neighbors': 13,
 'weights': 'distance'}

In [8]:
grid_knn.best_score_

0.3706944004045453

In [9]:
y_pred_grid_knn = grid_knn.predict(X_test)

In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_knn))

              precision    recall  f1-score   support

           0       0.04      0.72      0.08       196
           1       0.66      0.56      0.61      5195
           2       0.70      0.54      0.61      6857
           3       0.16      0.42      0.23      2773
           4       0.15      0.29      0.20      6231
           5       0.24      0.25      0.24     15040
           6       0.26      0.20      0.22     22077
           7       0.26      0.17      0.21     24325
           8       0.24      0.18      0.20     23215
           9       0.24      0.16      0.19     22520
          10       0.21      0.18      0.19     19384
          11       0.13      0.19      0.15     11198
          12       0.13      0.22      0.16      8873
          13       0.16      0.27      0.20      9191
          14       0.56      0.59      0.58     13290

    accuracy                           0.25    190365
   macro avg       0.28      0.33      0.27    190365
weighted avg       0.27   

In [17]:
knn_optimal = KNeighborsClassifier(
    algorithm = 'auto',
    leaf_size = 20,
    metric = 'minkowski',
    n_neighbors = 13,
    weights = 'distance'
)

knn_optimal.fit(X_train, y_train)
y_pred_knn_optimal = knn_optimal.predict(X_test)

print(classification_report(y_test, y_pred_knn_optimal))

              precision    recall  f1-score   support

           0       0.11      0.40      0.17       196
           1       0.74      0.76      0.75      5195
           2       0.72      0.68      0.70      6857
           3       0.20      0.45      0.28      2773
           4       0.22      0.42      0.29      6231
           5       0.30      0.35      0.32     15040
           6       0.35      0.28      0.31     22077
           7       0.34      0.22      0.27     24325
           8       0.31      0.23      0.27     23215
           9       0.30      0.22      0.26     22520
          10       0.28      0.24      0.26     19384
          11       0.18      0.27      0.21     11198
          12       0.17      0.28      0.21      8873
          13       0.24      0.34      0.28      9191
          14       0.67      0.63      0.65     13290

    accuracy                           0.32    190365
   macro avg       0.34      0.39      0.35    190365
weighted avg       0.34   

---

## 2. Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
parameters_dt = {
    'criterion' : ("gini", "entropy", "log_loss"),
    'splitter' : ("best", "random"),
    'max_depth': range(10,100,10),
    'min_samples_split': range(2,10,2),
    'min_samples_leaf': range(2,10,2)
}

dt = DecisionTreeClassifier()

from sklearn.model_selection import StratifiedKFold, GridSearchCV
grid_dt = GridSearchCV(dt, parameters_dt, cv=StratifiedKFold(10), scoring='accuracy')

In [12]:
grid_dt.fit(X_strat, y_strat)

In [20]:
grid_dt.best_params_

{'criterion': 'log_loss',
 'max_depth': 30,
 'min_samples_leaf': 6,
 'min_samples_split': 8,
 'splitter': 'best'}

In [14]:
grid_dt.best_score_

0.6628055591823708

In [15]:
y_pred_grid_dt = grid_dt.predict(X_test)

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_dt))

              precision    recall  f1-score   support

           0       0.08      0.76      0.14       196
           1       0.73      0.64      0.68      5195
           2       0.77      0.63      0.69      6857
           3       0.33      0.62      0.43      2773
           4       0.46      0.60      0.52      6231
           5       0.60      0.60      0.60     15040
           6       0.63      0.60      0.61     22077
           7       0.67      0.61      0.64     24325
           8       0.60      0.62      0.61     23215
           9       0.64      0.54      0.58     22520
          10       0.66      0.64      0.65     19384
          11       0.52      0.61      0.56     11198
          12       0.57      0.65      0.61      8873
          13       0.67      0.68      0.67      9191
          14       0.91      0.86      0.88     13290

    accuracy                           0.63    190365
   macro avg       0.59      0.64      0.59    190365
weighted avg       0.64   

In [19]:
dt_optimal = DecisionTreeClassifier(
    criterion = 'log_loss',
    max_depth = 30,
    min_samples_leaf = 6,
    min_samples_split = 8,
    splitter = 'best'
)

dt_optimal.fit(X_train, y_train)
y_pred_dt_optimal = dt_optimal.predict(X_test)

print(classification_report(y_test, y_pred_dt_optimal))

              precision    recall  f1-score   support

           0       0.26      0.50      0.34       196
           1       0.84      0.86      0.85      5195
           2       0.85      0.81      0.83      6857
           3       0.51      0.70      0.59      2773
           4       0.63      0.73      0.67      6231
           5       0.77      0.77      0.77     15040
           6       0.80      0.77      0.78     22077
           7       0.78      0.75      0.77     24325
           8       0.75      0.74      0.74     23215
           9       0.75      0.72      0.74     22520
          10       0.76      0.72      0.74     19384
          11       0.65      0.71      0.68     11198
          12       0.67      0.72      0.70      8873
          13       0.76      0.79      0.78      9191
          14       0.94      0.91      0.93     13290

    accuracy                           0.76    190365
   macro avg       0.71      0.75      0.73    190365
weighted avg       0.76   

---

## 3. Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
parameters_rf = {
    'criterion' : ("gini", "entropy", "log_loss"),
    'n_estimators': (20,200,20),
    'max_depth': range(10,100,10),
    'min_samples_split': range(2,10,2),
    'min_samples_leaf': range(2,10,2)
}

rf = RandomForestClassifier()

from sklearn.model_selection import StratifiedKFold, GridSearchCV
grid_rf = GridSearchCV(rf, parameters_rf, cv=StratifiedKFold(10), scoring='accuracy')

In [22]:
grid_rf.fit(X_strat, y_strat)

In [None]:
grid_rf.best_params_

{'criterion': 'log_loss',
 'max_depth': 30,
 'min_samples_leaf': 6,
 'min_samples_split': 8,
 'splitter': 'best'}

In [None]:
grid_rf.best_score_

0.6628055591823708

In [None]:
y_pred_grid_rf = grid_rf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_rf))

              precision    recall  f1-score   support

           0       0.08      0.76      0.14       196
           1       0.73      0.64      0.68      5195
           2       0.77      0.63      0.69      6857
           3       0.33      0.62      0.43      2773
           4       0.46      0.60      0.52      6231
           5       0.60      0.60      0.60     15040
           6       0.63      0.60      0.61     22077
           7       0.67      0.61      0.64     24325
           8       0.60      0.62      0.61     23215
           9       0.64      0.54      0.58     22520
          10       0.66      0.64      0.65     19384
          11       0.52      0.61      0.56     11198
          12       0.57      0.65      0.61      8873
          13       0.67      0.68      0.67      9191
          14       0.91      0.86      0.88     13290

    accuracy                           0.63    190365
   macro avg       0.59      0.64      0.59    190365
weighted avg       0.64   

In [None]:
rf_optimal = RandomForestClassifier(
    criterion = 'log_loss',
    max_depth = 30,
    min_samples_leaf = 6,
    min_samples_split = 8,
    splitter = 'best'
)

rf_optimal.fit(X_train, y_train)
y_pred_rf_optimal = rf_optimal.predict(X_test)

print(classification_report(y_test, y_pred_rf_optimal))

              precision    recall  f1-score   support

           0       0.26      0.50      0.34       196
           1       0.84      0.86      0.85      5195
           2       0.85      0.81      0.83      6857
           3       0.51      0.70      0.59      2773
           4       0.63      0.73      0.67      6231
           5       0.77      0.77      0.77     15040
           6       0.80      0.77      0.78     22077
           7       0.78      0.75      0.77     24325
           8       0.75      0.74      0.74     23215
           9       0.75      0.72      0.74     22520
          10       0.76      0.72      0.74     19384
          11       0.65      0.71      0.68     11198
          12       0.67      0.72      0.70      8873
          13       0.76      0.79      0.78      9191
          14       0.94      0.91      0.93     13290

    accuracy                           0.76    190365
   macro avg       0.71      0.75      0.73    190365
weighted avg       0.76   

---

## 4. Neural Network