In [1]:
from numpy import random
# `random` here is numpy.random (not the stdlib module); seeding it fixes
# NumPy's global RNG state so NumPy-driven randomness repeats across runs.
random.seed(2814)

In [2]:
from numpy import load
import pickle

# Load the train/test feature matrices and label vectors from the interim data directory.
X_train = load('../../data/interim//Design/X_train.npy')
X_test = load('../../data/interim/Design/X_test.npy')
y_train = load('../../data/interim/Design/y_train.npy')
y_test = load('../../data/interim/Design/y_test.npy')

# Open the model file in a context manager so the handle is closed even if
# unpickling fails (the bare `pickle.load(open(...))` form leaks it).
# NOTE(security): unpickling executes arbitrary code; only load model files
# produced by this project or another trusted source.
with open('../../models/final_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: score the previously saved model on the held-out test set.
result = loaded_model.score(X_test, y_test)
print(result)

0.8087305964856986


In [8]:
import pandas as pd

# Tabulate how many training labels fall in each class, listed in ascending
# class order, to check how balanced the training set is.
(
    pd.DataFrame(y_train)
    .value_counts()
    .sort_index()
)

0     97294
1     96900
2     96627
3     96464
4     94818
5     86912
6     75265
7     68420
8     69375
9     73321
10    81233
11    90512
12    93190
13    94137
14    95307
dtype: int64

In [9]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split

# Keep only the "test" side of a class-stratified split: a 1.3% subsample of
# the training data, small enough for the hyper-parameter searches that follow.
# The larger "train" side of the split is discarded.
_, X_strat, _, y_strat = train_test_split(
    X_train,
    y_train,
    test_size=0.013,
    random_state=2814,
    stratify=y_train,
)

In [10]:
len(X_strat)

17028

## 1. KNN

In [11]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidates are ranked by macro-averaged F1.
scoring = 'f1_macro'

# Exhaustive search space for the KNN classifier.
# NOTE(review): per the scikit-learn docs, `algorithm` and `leaf_size` govern
# how neighbours are looked up (speed/memory), so these two axes probably
# multiply the number of fits by 16 without changing scores - confirm before
# re-running the search.
parameters_KNN = {
    'n_neighbors': range(1, 50, 2),
    'leaf_size': range(20, 40, 5),
    'weights': ('uniform', 'distance'),
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'metric': ['minkowski'],
}

knn = KNeighborsClassifier()

# 10-fold stratified cross-validated grid search over the space above.
grid_knn = GridSearchCV(
    estimator=knn,
    param_grid=parameters_KNN,
    scoring=scoring,
    cv=StratifiedKFold(n_splits=10),
)

In [12]:
grid_knn.fit(X_strat, y_strat)

In [13]:
grid_knn.best_params_

{'algorithm': 'auto',
 'leaf_size': 20,
 'metric': 'minkowski',
 'n_neighbors': 27,
 'weights': 'distance'}

In [14]:
grid_knn.best_score_

0.350528806577099

In [15]:
y_pred_grid_knn = grid_knn.predict(X_test)

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_knn))

              precision    recall  f1-score   support

           0       0.03      0.68      0.06       196
           1       0.67      0.54      0.60      5195
           2       0.69      0.55      0.61      6857
           3       0.15      0.39      0.21      2773
           4       0.16      0.30      0.21      6231
           5       0.22      0.25      0.24     15040
           6       0.27      0.19      0.22     22077
           7       0.27      0.15      0.19     24325
           8       0.24      0.15      0.18     23215
           9       0.23      0.16      0.19     22520
          10       0.21      0.19      0.20     19384
          11       0.13      0.20      0.16     11198
          12       0.13      0.23      0.17      8873
          13       0.17      0.30      0.22      9191
          14       0.55      0.61      0.58     13290

    accuracy                           0.25    190365
   macro avg       0.28      0.33      0.27    190365
weighted avg       0.27   

In [17]:
# KNN with hand-entered hyper-parameters, trained on the full training set
# (the grid search itself only saw the stratified subsample).
# NOTE(review): n_neighbors=13 differs from the grid_knn.best_params_ output
# recorded in this notebook (27) - confirm which value is intended.
knn_optimal = KNeighborsClassifier(
    n_neighbors=13,
    weights='distance',
    metric='minkowski',
    algorithm='auto',
    leaf_size=20,
)
knn_optimal.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out test set.
y_pred_knn_optimal = knn_optimal.predict(X_test)
print(classification_report(y_test, y_pred_knn_optimal))

              precision    recall  f1-score   support

           0       0.10      0.35      0.15       196
           1       0.74      0.75      0.74      5195
           2       0.72      0.67      0.70      6857
           3       0.20      0.47      0.28      2773
           4       0.22      0.41      0.29      6231
           5       0.29      0.35      0.32     15040
           6       0.34      0.28      0.31     22077
           7       0.34      0.22      0.27     24325
           8       0.31      0.23      0.27     23215
           9       0.30      0.23      0.26     22520
          10       0.29      0.25      0.26     19384
          11       0.19      0.29      0.23     11198
          12       0.18      0.30      0.22      8873
          13       0.25      0.35      0.29      9191
          14       0.69      0.64      0.66     13290

    accuracy                           0.33    190365
   macro avg       0.34      0.39      0.35    190365
weighted avg       0.34   

---

## 2. Decision Tree

In [18]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidates are ranked by macro-averaged F1.
scoring = 'f1_macro'

# Exhaustive search space for the decision tree.
# NOTE(review): the scikit-learn docs describe "entropy" and "log_loss" as the
# same Shannon information-gain criterion, so searching both may duplicate
# work - confirm for the installed version.
parameters_dt = {
    'criterion': ("gini", "entropy", "log_loss"),
    'splitter': ("best", "random"),
    'max_depth': range(10, 100, 10),
    'min_samples_split': range(2, 10, 2),
    'min_samples_leaf': range(2, 10, 2),
}

dt = DecisionTreeClassifier()

# 10-fold stratified cross-validated grid search over the space above.
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=parameters_dt,
    scoring=scoring,
    cv=StratifiedKFold(n_splits=10),
)

In [19]:
grid_dt.fit(X_strat, y_strat)

In [20]:
grid_dt.best_params_

{'criterion': 'entropy',
 'max_depth': 80,
 'min_samples_leaf': 6,
 'min_samples_split': 4,
 'splitter': 'best'}

In [21]:
grid_dt.best_score_

0.6713513050006431

In [22]:
y_pred_grid_dt = grid_dt.predict(X_test)

In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_dt))

              precision    recall  f1-score   support

           0       0.07      0.69      0.13       196
           1       0.76      0.70      0.73      5195
           2       0.77      0.65      0.70      6857
           3       0.33      0.61      0.42      2773
           4       0.39      0.59      0.47      6231
           5       0.65      0.57      0.61     15040
           6       0.67      0.62      0.64     22077
           7       0.63      0.62      0.62     24325
           8       0.64      0.62      0.63     23215
           9       0.67      0.61      0.64     22520
          10       0.68      0.61      0.64     19384
          11       0.56      0.65      0.60     11198
          12       0.51      0.64      0.57      8873
          13       0.65      0.70      0.67      9191
          14       0.93      0.83      0.88     13290

    accuracy                           0.64    190365
   macro avg       0.59      0.65      0.60    190365
weighted avg       0.66   

In [24]:
# Decision tree with hand-entered hyper-parameters, trained on the full
# training set (the grid search itself only saw the stratified subsample).
# NOTE(review): criterion / max_depth / min_samples_split differ from the
# grid_dt.best_params_ output recorded in this notebook
# ('entropy' / 80 / 4) - confirm which values are intended.
dt_optimal = DecisionTreeClassifier(
    criterion='log_loss',
    splitter='best',
    max_depth=30,
    min_samples_split=8,
    min_samples_leaf=6,
)
dt_optimal.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out test set.
y_pred_dt_optimal = dt_optimal.predict(X_test)
print(classification_report(y_test, y_pred_dt_optimal))

              precision    recall  f1-score   support

           0       0.22      0.38      0.28       196
           1       0.82      0.85      0.83      5195
           2       0.84      0.80      0.82      6857
           3       0.51      0.70      0.59      2773
           4       0.62      0.73      0.67      6231
           5       0.77      0.76      0.77     15040
           6       0.79      0.78      0.78     22077
           7       0.79      0.76      0.77     24325
           8       0.75      0.73      0.74     23215
           9       0.75      0.72      0.74     22520
          10       0.76      0.73      0.74     19384
          11       0.66      0.72      0.69     11198
          12       0.70      0.73      0.71      8873
          13       0.76      0.80      0.78      9191
          14       0.94      0.91      0.93     13290

    accuracy                           0.76    190365
   macro avg       0.71      0.74      0.72    190365
weighted avg       0.76   

---

## 3. Random Forest

In [25]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
# The experimental flag must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, HalvingGridSearchCV

# Search space for the random forest.
parameters_rf = {
    'criterion': ("entropy", "log_loss"),
    'n_estimators': (100, 200, 500, 1000),
    'max_depth': [40, 50],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [2, 3, 4],
}

# Named scorers for several metrics.
# NOTE(review): this dict rebinds `scoring` but is not passed to the search
# below, which uses the literal 'f1_macro' - confirm whether it is still needed.
scoring = {
    'accuracy': make_scorer(sklearn.metrics.accuracy_score),
    'precision': make_scorer(sklearn.metrics.precision_score, average='macro'),
    'recall': make_scorer(sklearn.metrics.recall_score, average='macro'),
    'f1_macro': make_scorer(sklearn.metrics.f1_score, average='macro'),
    'f1_weighted': make_scorer(sklearn.metrics.f1_score, average='weighted'),
}

rf = RandomForestClassifier()

# Successive-halving search with 10-fold stratified CV, ranked by macro F1.
grid_rf = HalvingGridSearchCV(
    estimator=rf,
    param_grid=parameters_rf,
    factor=3,
    scoring='f1_macro',
    cv=StratifiedKFold(n_splits=10),
    verbose=1,
)

In [26]:
grid_rf.fit(X_strat, y_strat)

n_iterations: 4
n_required_iterations: 5
n_possible_iterations: 4
min_resources_: 300
max_resources_: 17028
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 144
n_resources: 300
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
----------
iter: 1
n_candidates: 48
n_resources: 900
Fitting 10 folds for each of 48 candidates, totalling 480 fits
----------
iter: 2
n_candidates: 16
n_resources: 2700
Fitting 10 folds for each of 16 candidates, totalling 160 fits
----------
iter: 3
n_candidates: 6
n_resources: 8100
Fitting 10 folds for each of 6 candidates, totalling 60 fits


In [27]:
grid_rf.best_params_

{'criterion': 'log_loss',
 'max_depth': 50,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1000}

In [28]:
grid_rf.best_score_

0.6113277949656102

In [29]:
y_pred_grid_rf = grid_rf.predict(X_test)

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_grid_rf))

              precision    recall  f1-score   support

           0       0.10      0.65      0.17       196
           1       0.79      0.76      0.78      5195
           2       0.80      0.70      0.75      6857
           3       0.44      0.66      0.53      2773
           4       0.53      0.65      0.59      6231
           5       0.70      0.69      0.69     15040
           6       0.70      0.67      0.69     22077
           7       0.68      0.60      0.64     24325
           8       0.61      0.59      0.60     23215
           9       0.66      0.55      0.60     22520
          10       0.62      0.59      0.60     19384
          11       0.43      0.58      0.50     11198
          12       0.53      0.61      0.56      8873
          13       0.60      0.67      0.64      9191
          14       0.85      0.86      0.86     13290

    accuracy                           0.64    190365
   macro avg       0.60      0.66      0.61    190365
weighted avg       0.65   

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest with hand-entered hyper-parameters, trained on the full
# training set with a fixed seed.
# NOTE(review): criterion and n_estimators differ from the grid_rf.best_params_
# output recorded in this notebook ('log_loss' / 1000) - confirm this is a
# deliberate choice. The recorded run of this cell ended in a
# KeyboardInterrupt, so no report was produced for it.
rf_optimal = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=50,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=2814,
)
rf_optimal.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out test set.
y_pred_rf_optimal = rf_optimal.predict(X_test)
print(classification_report(y_test, y_pred_rf_optimal))

KeyboardInterrupt: 

In [19]:
# Per-feature importance scores of the fitted forest.
# NOTE(review): this cell's execution count (19) precedes that of the cell
# building rf_optimal (33), whose recorded output is a KeyboardInterrupt - the
# displayed array likely comes from an earlier run; re-execute in order to confirm.
rf_optimal.feature_importances_

array([0.02813963, 0.04013652, 0.15156458, 0.2492943 , 0.05221669,
       0.04431258, 0.05123072, 0.01861447, 0.02186074, 0.04118196,
       0.03129756, 0.15355793, 0.11659231])

---

## 4. Neural Network

In [None]:
# LabelEncoder was used without ever being imported in this notebook, so this
# cell raised NameError on a fresh kernel; import it here alongside the Keras helper.
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Map the class labels to consecutive integers, fitting on the training labels
# only and reusing that mapping for the test labels.
yle = LabelEncoder()
y_train = yle.fit_transform(y_train)
y_test = yle.transform(y_test)

# One-hot encode into 15 columns for the softmax / categorical cross-entropy model.
# NOTE: this rebinds y_train / y_test, so the cell is not safe to run twice and
# the sklearn cells above need the labels reloaded before they are re-run.
y_train = to_categorical(y_train, 15)
y_test = to_categorical(y_test, 15)

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import Dense
# Import Sequential from tensorflow.keras like the other Keras names in this
# cell; mixing the standalone `keras` package with `tensorflow.keras` can pair
# objects from two different Keras installations.
from tensorflow.keras.models import Sequential
from scikeras.wrappers import KerasClassifier


def baseline_model(input_dim=13, n_classes=15):
    """Build and compile the fully connected baseline classifier.

    The network is a stack of ReLU layers of width 1024, 512, 256, 128, 64
    and 32, followed by a softmax output layer.

    Parameters
    ----------
    input_dim : int, default 13
        Number of input features per sample.
    n_classes : int, default 15
        Number of output classes (width of the softmax layer).

    Returns
    -------
    A compiled Sequential model using categorical cross-entropy loss, the
    Adam optimizer, and accuracy / precision / recall metrics.
    """
    model = Sequential()
    # First hidden layer also declares the input width.
    model.add(Dense(1024, input_dim=input_dim, activation='relu'))
    # Remaining hidden layers halve in width at each step.
    for units in (512, 256, 128, 64, 32):
        model.add(Dense(units, activation='relu'))
    # Softmax output for multi-class classification.
    model.add(Dense(n_classes, activation='softmax'))
    # categorical_crossentropy expects one-hot encoded targets.
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()],
    )
    return model

---

## 5. Boxplot of Outliers