<a href="https://colab.research.google.com/github/SohilaOsama/ChatBot/blob/main/code_5b_knn_search_oversample_neighbours_weights_algorithm_p.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Section 1: Import

In [1]:
import pandas as pd
import numpy as np
import copy

from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import Counter

from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [2]:
# ---------- read ----------

df_1 = pd.read_csv('/content/code_3_train.csv')

# ---------- drop ----------

print('Before drop :', df_1.shape)
df_1.drop(columns=['index',
                   'pco2', 'ph', 'basophils', 'lactic_acid', 'bmi',
                   'creatine_kinase', 'lymphocyte', 'neutrophils'], inplace=True)
print('After drop:', df_1.shape)
print('')

# ---------- form X ----------

X_train = df_1.drop(columns=['outcome'])
print('X_train :', X_train.shape)

# ---------- form y ----------

y_train = df_1['outcome']
print('y_train :', y_train.shape)
print('')
print('y_train :', np.unique(y_train, return_counts=True))
print('y_train :', Counter(y_train))
print(y_train.value_counts(normalize=True))

Before drop : (882, 51)
After drop: (882, 42)

X_train : (882, 41)
y_train : (882,)

y_train : (array([0., 1.]), array([763, 119]))
y_train : Counter({0.0: 763, 1.0: 119})
0.0    0.865079
1.0    0.134921
Name: outcome, dtype: float64


Section 3: Set up 15-fold cross validation

In [3]:
kfold_cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

Section 4: Explore K Nearest Neighbours Classifier with oversample, neighbours, weights, algorithm, p

In [5]:
# ---------- make pipeline ----------

pipe_line_kn = Pipeline([('knn_impute', KNNImputer(n_neighbors=5)),
                         ('oversample_SMOTE', SMOTE(random_state=42)),
                         ('ss_scale', StandardScaler()),
                         ('kn_class', KNeighborsClassifier())])

In [6]:
# ---------- tuning of hyperparameters ----------

pipe_line_params_kn = {'oversample_SMOTE__sampling_strategy': [1.00, 0.85, 0.70],
                       'kn_class__n_neighbors': [100, 150, 200],
                       'kn_class__weights': ['uniform', 'distance'],
                       'kn_class__algorithm': ['auto', 'brute'],
                       'kn_class__p': [1, 2]}

In [7]:
# ---------- instantiate gridsearchcv for recall in accordance to aim of modelling ----------

search_kn = GridSearchCV(pipe_line_kn,
                         param_grid=pipe_line_params_kn,
                         cv=kfold_cv,
                         return_train_score=True,
                         scoring='recall')

search_kn.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=15, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('knn_impute', KNNImputer()),
                                       ('oversample_SMOTE',
                                        SMOTE(random_state=42)),
                                       ('ss_scale', StandardScaler()),
                                       ('kn_class', KNeighborsClassifier())]),
             param_grid={'kn_class__algorithm': ['auto', 'brute'],
                         'kn_class__n_neighbors': [100, 150, 200],
                         'kn_class__p': [1, 2],
                         'kn_class__weights': ['uniform', 'distance'],
                         'oversample_SMOTE__sampling_strategy': [1.0, 0.85,
                                                                 0.7]},
             return_train_score=True, scoring='recall')

In [8]:
# ---------- quick overview of recall train score, test score and overfit ----------

print('Training :', search_kn.cv_results_['mean_train_score'])
print('Validation :', search_kn.cv_results_['mean_test_score'])
print('Overfit % :', (search_kn.cv_results_['mean_test_score']-search_kn.cv_results_['mean_train_score'])/search_kn.cv_results_['mean_train_score']*100)
print('')


Training : [0.81751931 0.74551158 0.63807379 1.         1.         1.
 0.93576791 0.90995817 0.84455169 1.         1.         1.
 0.82771879 0.7521021  0.61107894 1.         1.         1.
 0.93876019 0.90875697 0.84754397 1.         1.         1.
 0.82951523 0.75151223 0.59484663 1.         1.         1.
 0.94837516 0.9099689  0.82532711 1.         1.         1.
 0.81751931 0.74551158 0.63807379 1.         1.         1.
 0.93576791 0.90995817 0.84455169 1.         1.         1.
 0.82771879 0.7521021  0.61107894 1.         1.         1.
 0.93876019 0.90875697 0.84754397 1.         1.         1.
 0.82951523 0.75151223 0.59484663 1.         1.         1.
 0.94837516 0.9099689  0.82532711 1.         1.         1.        ]
Validation : [0.78214286 0.66309524 0.57142857 0.78214286 0.69642857 0.57142857
 0.9        0.89166667 0.81666667 0.9        0.89166667 0.81666667
 0.79047619 0.70714286 0.5797619  0.79880952 0.70714286 0.5797619
 0.9        0.88333333 0.81666667 0.9        0.88333333 0.8

In [10]:
# ---------- details of recall train score and test score ----------

temp_df_kn = []
for j in range(len(search_kn.cv_results_['params'])):
  for i in range(kfold_cv.n_splits):
    over_fit = (search_kn.cv_results_['split'+str(i)+'_test_score'][j]-search_kn.cv_results_['split'+str(i)+'_train_score'][j])/search_kn.cv_results_['split'+str(i)+'_train_score'][j]*100
    temp_df_kn.append([search_kn.cv_results_['params'][j],
                           search_kn.cv_results_['params'][j]['oversample_SMOTE__sampling_strategy'],
                           search_kn.cv_results_['params'][j]['kn_class__n_neighbors'],
                           search_kn.cv_results_['params'][j]['kn_class__weights'],
                           search_kn.cv_results_['params'][j]['kn_class__algorithm'],
                           search_kn.cv_results_['params'][j]['kn_class__p'],
                           search_kn.cv_results_['split'+str(i)+'_train_score'][j],
                           search_kn.cv_results_['split'+str(i)+'_test_score'][j],
                           over_fit])
    # ---------- save results ----------

temp_df_kn = pd.DataFrame(temp_df_kn, columns=['parameters', 'oversample_SMOTE__sampling_strategy',
                                               'kn_class__n_neighbors', 'kn_class__weights',
                                               'kn_class__algorithm', 'kn_class__p', 'training', 'validation', 'overfit_%'])
print(temp_df_kn)
temp_df_kn.to_csv('/content/code_5b_kn_train_validate_recall.csv', na_rep='NaN', index_label='index')

                                             parameters  \
0     {'kn_class__algorithm': 'auto', 'kn_class__n_n...   
1     {'kn_class__algorithm': 'auto', 'kn_class__n_n...   
2     {'kn_class__algorithm': 'auto', 'kn_class__n_n...   
3     {'kn_class__algorithm': 'auto', 'kn_class__n_n...   
4     {'kn_class__algorithm': 'auto', 'kn_class__n_n...   
...                                                 ...   
1075  {'kn_class__algorithm': 'brute', 'kn_class__n_...   
1076  {'kn_class__algorithm': 'brute', 'kn_class__n_...   
1077  {'kn_class__algorithm': 'brute', 'kn_class__n_...   
1078  {'kn_class__algorithm': 'brute', 'kn_class__n_...   
1079  {'kn_class__algorithm': 'brute', 'kn_class__n_...   

      oversample_SMOTE__sampling_strategy  kn_class__n_neighbors  \
0                                     1.0                    100   
1                                     1.0                    100   
2                                     1.0                    100   
3                  