In [1]:
from numpy import random

# Seed NumPy's global (legacy) random generator once, up front, so that
# anything drawing from it later in the notebook is repeatable.
RANDOM_SEED = 2814
random.seed(RANDOM_SEED)

In [8]:
import pickle
from pathlib import Path

from numpy import load

# Folder containing the pre-split feature matrices and label vectors.
design_dir = Path('../../data/interim/Design')

X_train = load(design_dir / 'X_train.npy')
X_test = load(design_dir / 'X_test.npy')
y_train = load(design_dir / 'y_train.npy')
y_test = load(design_dir / 'y_test.npy')

# NOTE(security): unpickling executes arbitrary code stored in the file, so
# only load model artefacts that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('../../models/final_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: score the persisted model on the held-out test split.
result = loaded_model.score(X_test, y_test)
print(result)

0.8087305964856986


In [9]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split

# Carve a small, class-stratified subsample out of the training data by
# keeping only the "test" side (1.3 %) of a stratified split; the large
# remainder is discarded. The subsample is what the grid searches are fit on.
_, X_strat, _, y_strat = train_test_split(
    X_train,
    y_train,
    test_size=0.013,
    stratify=y_train,
    random_state=2814,
)

In [10]:
# Number of rows in the stratified subsample used for hyper-parameter search.
len(X_strat)

17028

## 1. KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Metric used to rank candidates during the search.
scoring = 'f1_macro'

# Search space: odd neighbour counts 1..49, leaf sizes 20..35 in steps of 5,
# both weighting schemes and every neighbour-search algorithm.
parameters_KNN = dict(
    n_neighbors=range(1, 50, 2),
    leaf_size=range(20, 40, 5),
    weights=('uniform', 'distance'),
    algorithm=('auto', 'ball_tree', 'kd_tree', 'brute'),
    metric=['minkowski'],
)

knn = KNeighborsClassifier()

# Exhaustive grid search with 10-fold stratified cross-validation.
grid_knn = GridSearchCV(
    estimator=knn,
    param_grid=parameters_KNN,
    scoring=scoring,
    cv=StratifiedKFold(n_splits=10),
)

In [12]:
# Run the KNN grid search on the stratified subsample (X_strat, y_strat),
# not on the full training set.
grid_knn.fit(X_strat, y_strat)

In [None]:
# Hyper-parameter combination selected by the KNN grid search.
grid_knn.best_params_

{'algorithm': 'auto',
 'leaf_size': 20,
 'metric': 'minkowski',
 'n_neighbors': 13,
 'weights': 'distance'}

In [None]:
# Cross-validated score (scoring='f1_macro') of the selected KNN candidate.
grid_knn.best_score_

0.349833045690889

In [None]:
# Predict the held-out test set with the searched KNN model; note grid_knn
# was fitted on the X_strat subsample only.
y_pred_grid_knn = grid_knn.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the grid-searched KNN on the test set.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_knn))

              precision    recall  f1-score   support

           0       0.04      0.72      0.08       196
           1       0.66      0.56      0.61      5195
           2       0.70      0.54      0.61      6857
           3       0.16      0.42      0.23      2773
           4       0.15      0.29      0.20      6231
           5       0.24      0.25      0.24     15040
           6       0.26      0.20      0.22     22077
           7       0.26      0.17      0.21     24325
           8       0.24      0.18      0.20     23215
           9       0.24      0.16      0.19     22520
          10       0.21      0.18      0.19     19384
          11       0.13      0.19      0.15     11198
          12       0.13      0.22      0.16      8873
          13       0.16      0.27      0.20      9191
          14       0.56      0.59      0.58     13290

    accuracy                           0.25    190365
   macro avg       0.28      0.33      0.27    190365
weighted avg       0.27   

In [None]:
# Rebuild KNN with the hyper-parameters reported by the grid search and train
# it on the complete training set instead of the small subsample.
knn_optimal = KNeighborsClassifier(
    n_neighbors=13,
    weights='distance',
    algorithm='auto',
    leaf_size=20,
    metric='minkowski',
)

# fit() returns the estimator itself, so training and inference can be chained.
y_pred_knn_optimal = knn_optimal.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred_knn_optimal))

              precision    recall  f1-score   support

           0       0.11      0.40      0.17       196
           1       0.74      0.76      0.75      5195
           2       0.72      0.68      0.70      6857
           3       0.20      0.45      0.28      2773
           4       0.22      0.42      0.29      6231
           5       0.30      0.35      0.32     15040
           6       0.35      0.28      0.31     22077
           7       0.34      0.22      0.27     24325
           8       0.31      0.23      0.27     23215
           9       0.30      0.22      0.26     22520
          10       0.28      0.24      0.26     19384
          11       0.18      0.27      0.21     11198
          12       0.17      0.28      0.21      8873
          13       0.24      0.34      0.28      9191
          14       0.67      0.63      0.65     13290

    accuracy                           0.32    190365
   macro avg       0.34      0.39      0.35    190365
weighted avg       0.34   

---

## 2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Search space for the decision tree.
parameters_dt = {
    # "log_loss" is omitted on purpose: scikit-learn treats it as the same
    # Shannon-entropy criterion as "entropy", so searching both only repeats
    # identical candidates.
    'criterion': ("gini", "entropy"),
    'splitter': ("best", "random"),
    'max_depth': range(10, 100, 10),
    'min_samples_split': range(2, 10, 2),
    'min_samples_leaf': range(2, 10, 2),
}

# Metric used to rank candidates during the search.
scoring = 'f1_macro'

# Pin the estimator's RNG so candidate scores (and the winner) do not depend
# on global RNG state or on the order in which cells were executed.
dt = DecisionTreeClassifier(random_state=2814)

# Exhaustive grid search with 10-fold stratified cross-validation.
grid_dt = GridSearchCV(dt, parameters_dt, cv=StratifiedKFold(10), scoring=scoring)

In [None]:
# Run the decision-tree grid search on the stratified subsample
# (X_strat, y_strat), not on the full training set.
grid_dt.fit(X_strat, y_strat)

In [None]:
# Hyper-parameter combination selected by the decision-tree grid search.
grid_dt.best_params_

{'criterion': 'entropy',
 'max_depth': 80,
 'min_samples_leaf': 6,
 'min_samples_split': 8,
 'splitter': 'best'}

In [None]:
# Cross-validated score (scoring='f1_macro') of the selected tree candidate.
grid_dt.best_score_

0.6576124791029911

In [None]:
# Predict the held-out test set with the searched tree; note grid_dt was
# fitted on the X_strat subsample only.
y_pred_grid_dt = grid_dt.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the grid-searched tree on the test set.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_dt))

              precision    recall  f1-score   support

           0       0.08      0.76      0.14       196
           1       0.73      0.64      0.68      5195
           2       0.77      0.63      0.69      6857
           3       0.33      0.61      0.43      2773
           4       0.46      0.60      0.52      6231
           5       0.60      0.61      0.60     15040
           6       0.63      0.59      0.61     22077
           7       0.66      0.61      0.64     24325
           8       0.60      0.62      0.61     23215
           9       0.65      0.54      0.59     22520
          10       0.66      0.64      0.65     19384
          11       0.52      0.61      0.56     11198
          12       0.57      0.66      0.61      8873
          13       0.67      0.67      0.67      9191
          14       0.91      0.85      0.88     13290

    accuracy                           0.63    190365
   macro avg       0.59      0.64      0.59    190365
weighted avg       0.64   

In [None]:
# Decision tree trained on the complete training set.
# NOTE(review): the grid search output above reports criterion='entropy' and
# max_depth=80, while this cell uses 'log_loss' (same criterion in
# scikit-learn) and max_depth=30 - confirm the shallower depth is intentional.
dt_optimal = DecisionTreeClassifier(
    criterion = 'log_loss',
    max_depth = 30,
    min_samples_leaf = 6,
    min_samples_split = 8,
    splitter = 'best',
    # Fixed seed (same value as the forest model) so the fitted tree does not
    # depend on global RNG state / cell execution order.
    random_state = 2814
)

dt_optimal.fit(X_train, y_train)
y_pred_dt_optimal = dt_optimal.predict(X_test)

# Per-class metrics of the full-data tree on the held-out test set.
print(classification_report(y_test, y_pred_dt_optimal))

              precision    recall  f1-score   support

           0       0.26      0.50      0.34       196
           1       0.84      0.86      0.85      5195
           2       0.85      0.81      0.83      6857
           3       0.51      0.70      0.59      2773
           4       0.63      0.73      0.67      6231
           5       0.77      0.77      0.77     15040
           6       0.80      0.77      0.78     22077
           7       0.78      0.75      0.77     24325
           8       0.75      0.74      0.74     23215
           9       0.75      0.72      0.74     22520
          10       0.76      0.72      0.74     19384
          11       0.65      0.71      0.68     11198
          12       0.67      0.72      0.70      8873
          13       0.76      0.79      0.78      9191
          14       0.94      0.91      0.93     13290

    accuracy                           0.76    190365
   macro avg       0.71      0.75      0.73    190365
weighted avg       0.76   

---

## 3. Random Forest

In [None]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
# The experimental flag must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV

# Search space for the random forest.
parameters_rf = {
    # "log_loss" is omitted on purpose: scikit-learn treats it as the same
    # Shannon-entropy criterion as "entropy", so searching both only repeats
    # identical candidates.
    'criterion' : ("entropy",),
    'n_estimators': (100,200,500,1000),
    'max_depth': [40,50],
    'min_samples_split': [2,3,4],
    'min_samples_leaf': [2,3,4]
}

# Collection of scorers kept for reference.
# NOTE: this dict is NOT passed to the search below, which is given the single
# string 'f1_macro'; it also rebinds the `scoring` name used (as a string) by
# the earlier KNN / decision-tree cells.
scoring = {'accuracy': make_scorer(sklearn.metrics.accuracy_score),
           'precision': make_scorer(sklearn.metrics.precision_score, average = 'macro'),
           'recall': make_scorer(sklearn.metrics.recall_score, average = 'macro'),
           'f1_macro': make_scorer(sklearn.metrics.f1_score, average = 'macro'),
           'f1_weighted': make_scorer(sklearn.metrics.f1_score, average = 'weighted')}

# Pin the forest's RNG so candidates are compared on equal footing.
rf = RandomForestClassifier(random_state=2814)

# Successive-halving search with 10-fold stratified CV; random_state fixes the
# sample subsets drawn at each halving iteration so the search is repeatable.
grid_rf = HalvingGridSearchCV(
    rf,
    parameters_rf,
    cv=StratifiedKFold(10),
    scoring='f1_macro',
    factor = 3,
    verbose=1,
    random_state=2814,
)

In [None]:
# Run the successive-halving search on the stratified subsample.
# NOTE(review): the saved log of this cell lists criterion=gini / max_depth=20
# candidates and 5 folds, which the grid and cv defined in the setup cell do
# not contain - the stored output looks stale; re-run to refresh it.
grid_rf.fit(X_strat, y_strat)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 150
max_resources_: 17014
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 576
n_resources: 150
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   0.5s
[CV] END criterion=gini, max_de

In [None]:
# Hyper-parameter combination selected by the random-forest halving search.
grid_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 40,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
# Cross-validated score (scoring='f1_macro') of the selected forest candidate.
grid_rf.best_score_

0.6343387465117967

In [None]:
# Predict the held-out test set with the searched forest; note grid_rf was
# fitted on the X_strat subsample only. The report cell below needs this
# variable to exist.
y_pred_grid_rf = grid_rf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the searched forest on the test set.
# NOTE(review): the saved output of this cell is a NameError for
# y_pred_grid_rf, i.e. the prediction cell had not been executed in that
# kernel session - run the cells in order before this one.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_rf))

NameError: name 'y_pred_grid_rf' is not defined

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest trained on the complete training set with a fixed seed.
# NOTE(review): these settings (150 trees, default depth / leaf limits) are not
# the best_params_ printed by the halving search above - confirm that choice.
rf_optimal = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    random_state=2814,
)

# fit() returns the estimator itself, so training and inference can be chained.
y_pred_rf_optimal = rf_optimal.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred_rf_optimal))

              precision    recall  f1-score   support

           0       0.50      0.49      0.49       196
           1       0.90      0.89      0.89      5195
           2       0.88      0.88      0.88      6857
           3       0.65      0.75      0.69      2773
           4       0.75      0.78      0.77      6231
           5       0.82      0.84      0.83     15040
           6       0.85      0.83      0.84     22077
           7       0.84      0.80      0.82     24325
           8       0.81      0.78      0.79     23215
           9       0.81      0.77      0.79     22520
          10       0.79      0.79      0.79     19384
          11       0.69      0.77      0.73     11198
          12       0.71      0.77      0.74      8873
          13       0.77      0.84      0.80      9191
          14       0.94      0.93      0.94     13290

    accuracy                           0.81    190365
   macro avg       0.78      0.79      0.79    190365
weighted avg       0.81   

In [19]:
# Importance score of each input feature in the fitted forest
# (one entry per feature column of X_train).
rf_optimal.feature_importances_

array([0.02813963, 0.04013652, 0.15156458, 0.2492943 , 0.05221669,
       0.04431258, 0.05123072, 0.01861447, 0.02186074, 0.04118196,
       0.03129756, 0.15355793, 0.11659231])

---

## 4. Neural Network

In [None]:
# LabelEncoder was used without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Convert the integer class labels into one-hot vectors (15 classes) for the
# softmax / categorical cross-entropy network below.
# The cell overwrites y_train / y_test, so guard on the array rank: running it
# a second time would otherwise try to label-encode already one-hot data.
# NOTE: after this cell y_train / y_test are 2-D one-hot arrays and no longer
# suit the scikit-learn cells above (reload the .npy files to re-run those).
if y_train.ndim == 1:
    yle = LabelEncoder()
    y_train = yle.fit_transform(y_train)
    y_test = yle.transform(y_test)
    y_train = to_categorical(y_train, 15)
    y_test = to_categorical(y_test, 15)

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense
from keras.models import Sequential
from scikeras.wrappers import KerasClassifier

def baseline_model():
    """Build and compile the dense baseline network.

    Architecture: 13 input features -> ReLU layers of width
    1024, 512, 256, 128, 64, 32 -> 15-way softmax output.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer, and accuracy / precision / recall metrics.
    """
    hidden_widths = (512, 256, 128, 64, 32)

    network = Sequential()
    # First layer also declares the input size (13 features).
    network.add(Dense(1024, input_dim = 13, activation = 'relu'))
    # Remaining hidden layers halve in width each time.
    for width in hidden_widths:
        network.add(Dense(width, activation = 'relu'))
    # One softmax unit per class for multi-class classification.
    network.add(Dense(15, activation = 'softmax'))

    tracked_metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=tracked_metrics,
    )
    return network

---

## 5. Boxplot of Outliers