Agora, aplicaremos todos os conceitos que vimos de otimização. Usaremos ParemeterGrid, ParameterSampler, GridSearch e RandomSearch. Use o datase diabetes.csv

# Instruções

In [None]:
# biblioteca
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from scipy.stats import randint
from sklearn.model_selection import StratifiedKFold, ParameterGrid, ParameterSampler, GridSearchCV, RandomizedSearchCV, train_test_split

In [None]:
# dados 
df_diabetes = pd.read_csv('https://raw.githubusercontent.com/abnr/ml-data/main/diabetes.csv')

In [None]:
df_diabetes.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


# Treino, Teste e Validação

Primeiramente, teste a estratégia de dividir em treino, teste e validação. Ou seja, teremos 3 datasets. Siga o passo a passo abaixo.

1. Defina um Pipeline com Knn e MinMaxScaler. Defina também a validação StratifiedKFold
2. Divida o X em X_train, X_test e X_val. Sugestão de percentuais: 70, 15, 15
3. Crie a grid de parâmetros para o grid search (varie métrica de distância, número de vizinhos e pesos de votação)
4. Crie a grid de parâmetros para o random search (varie métrica de distância, número de vizinhos e pesos de votação). Use 50 iterações
5. Execute um loop para o grid search
6. Execute um loop para o random search
7. Avalie a performance e os conjuntos de hiperparametros associados (lembre-se de que o objetivo é pegar o conjunto, cujo erro de validação é o menor)

Dica: Divida primeiro em X_train e X_test e depois use o X_train para dividr em X_train e X_Val.
Lembre de printar os parâmetros escolhidos e o score. Use sempre 'neg_log_loss' como métrica.m

In [None]:
# Pipeline 
knn = Pipeline(steps=[('pre_processing', MinMaxScaler()), ('model', KNeighborsClassifier())])

In [None]:
# Divisão em X_train, X_test, X_val
df_train, df_test = train_test_split(df_diabetes, stratify=df_diabetes['class'], test_size=0.15, random_state=123)

In [None]:
n_rows_validation = int(0.15*df_diabetes.shape[0])
percent_validation = round((n_rows_validation/df_train.shape[0]), 2)

In [None]:
df_train, df_validation = train_test_split(df_train, stratify=df_train['class'], test_size=percent_validation, random_state=123)

In [None]:
# Definição da grid para o grid search
param_grid_search = {'model__metric':['manhattan','euclidean'],
                     'model__n_neighbors':list(range(1,52,2)),
                     'model__weights':['uniform','distance']}

In [None]:
# Definição da grid para o random search
param_random_search = {'model__metric':['manhattan', 'euclidean'],
                       'model__n_neighbors': randint(1,51),
                       'model__weights':['uniform','distance']}

In [None]:
X_train, y_train = df_train.drop('class', axis=1), df_train['class']
X_test, y_test = df_test.drop('class', axis=1), df_test['class']
X_validation, y_validation = df_validation.drop('class', axis=1), df_validation['class']

In [None]:
# Loop grid search
validation_score_grid_search = []
train_score_grid_search = []
list_grid_search_params = list(ParameterGrid(param_grid_search))

for combinacao in list_grid_search_params:
    knn.set_params(**combinacao)
    knn.fit(X_train, y_train)
    y_validation_predict = knn.predict_proba(X_validation)
    y_train_predict = knn.predict_proba(X_train)
    validation_score_grid_search.append(log_loss(y_validation, y_validation_predict))
    train_score_grid_search.append(log_loss(y_train, y_train_predict))

In [None]:
list_grid_search_params

[{'model__metric': 'manhattan',
  'model__n_neighbors': 1,
  'model__weights': 'uniform'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 1,
  'model__weights': 'distance'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 3,
  'model__weights': 'uniform'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 3,
  'model__weights': 'distance'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 5,
  'model__weights': 'uniform'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 5,
  'model__weights': 'distance'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 7,
  'model__weights': 'uniform'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 7,
  'model__weights': 'distance'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 9,
  'model__weights': 'uniform'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 9,
  'model__weights': 'distance'},
 {'model__metric': 'manhattan',
  'model__n_neighbors': 11,
  'model__weights': 'unif

In [None]:
np.min(validation_score_grid_search)

0.4842443612174514

In [None]:
np.argmin(validation_score_grid_search)

79

In [None]:
best_grid_search_params = list_grid_search_params[np.argmin(validation_score_grid_search)]

In [None]:
knn_grid_search = knn.set_params(**best_grid_search_params)
knn_grid_search.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('pre_processing',
                 MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('model',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='euclidean', metric_params=None,
                                      n_jobs=None, n_neighbors=27, p=2,
                                      weights='distance'))],
         verbose=False)

In [None]:
print(f'Performance no treino: {log_loss(y_train, knn_grid_search.predict_proba(X_train))}')

Performance no treino: 9.992007221626413e-16


In [None]:
print(f'Performance no test: {log_loss(y_test, knn_grid_search.predict_proba(X_test))}')

Performance no test: 0.4617165016364378


In [None]:
df_metrics_grid_search = pd.DataFrame({'params':list_grid_search_params, 'train_score':train_score_grid_search, 'validation_score':validation_score_grid_search})

In [None]:
df_metrics_grid_search.sort_values(by='validation_score', inplace=True)

In [None]:
df_metrics_grid_search

Unnamed: 0,params,train_score,validation_score
79,"{'model__metric': 'euclidean', 'model__n_neigh...",9.992007e-16,0.484244
77,"{'model__metric': 'euclidean', 'model__n_neigh...",9.992007e-16,0.484616
76,"{'model__metric': 'euclidean', 'model__n_neigh...",4.795928e-01,0.484912
78,"{'model__metric': 'euclidean', 'model__n_neigh...",4.804441e-01,0.485690
75,"{'model__metric': 'euclidean', 'model__n_neigh...",9.992007e-16,0.487739
...,...,...,...
3,"{'model__metric': 'manhattan', 'model__n_neigh...",9.992007e-16,4.693654
53,"{'model__metric': 'euclidean', 'model__n_neigh...",9.992007e-16,10.537254
52,"{'model__metric': 'euclidean', 'model__n_neigh...",9.992007e-16,10.537254
1,"{'model__metric': 'manhattan', 'model__n_neigh...",9.992007e-16,11.415358


In [None]:
len(list_grid_search_params)

104

In [None]:
# Loop Random Search
validation_score_random_search = []
train_score_random_search = []
list_random_search_params = list(ParameterSampler(param_random_search, n_iter=50, random_state=123))

for combinacao in list_random_search_params:
    knn.set_params(**combinacao)
    knn.fit(X_train, y_train)
    y_validation_predict = knn.predict_proba(X_validation)
    y_train_predict = knn.predict_proba(X_train)
    validation_score_random_search.append(log_loss(y_validation, y_validation_predict))
    train_score_random_search.append(log_loss(y_train, y_train_predict))

In [None]:
# Avalie os melhores parametros
np.min(validation_score_random_search)

0.48330469099035106

In [None]:
np.min(validation_score_grid_search)

0.4842443612174514

In [None]:
best_random_search_params = list_random_search_params[np.argmin(validation_score_random_search)]
knn_random_search = knn.set_params(**best_random_search_params)
knn_random_search.fit(X_train, y_train)
print(f'Performance no treino: {log_loss(y_train, knn_random_search.predict_proba(X_train))}')
print(f'Performance no test: {log_loss(y_test, knn_random_search.predict_proba(X_test))}')

Performance no treino: 0.47970810348364473
Performance no test: 0.4717777420818704


In [None]:
best_random_search_params

{'model__metric': 'euclidean',
 'model__n_neighbors': 24,
 'model__weights': 'uniform'}

In [None]:
knn_grid_search

Pipeline(memory=None,
         steps=[('pre_processing',
                 MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('model',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='euclidean', metric_params=None,
                                      n_jobs=None, n_neighbors=24, p=2,
                                      weights='uniform'))],
         verbose=False)

# Cross Validation

Reaproveitando os passos 1, 2, 3, 4 do tópico 2, realize as validações cruzadas com Grid Search e Random Search. Lembre de printar os melhores parâmetros e o score obtido.

In [None]:
# Divisão em X_train, X_test
df_train, df_test = train_test_split(df_diabetes, stratify=df_diabetes['label'], test_size=0.15, random_state=123)

In [None]:
# cross validation
skf = StratifiedKFold(n_splits=10)

In [None]:
# Grid Search
grid_search = GridSearchCV(knn, param_grid_search, cv=skf, return_train_score=True, scoring='neg_log_loss')

In [None]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('pre_processing', MinMaxScaler()),
                                       ('model',
                                        KNeighborsClassifier(metric='euclidean',
                                                             n_neighbors=24))]),
             param_grid={'model__metric': ['manhattan', 'euclidean'],
                         'model__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17,
                                                19, 21, 23, 25, 27, 29, 31, 33,
                                                35, 37, 39, 41, 43, 45, 47, 49,
                                                51],
                         'model__weights': ['uniform', 'distance']},
             return_train_score=True, scoring='neg_log_loss')

In [None]:
#pd.DataFrame(grid_search.cv_results_)

In [None]:
grid_search.best_score_

-0.5058822204803055

In [None]:
grid_search.best_params_

{'model__metric': 'manhattan',
 'model__n_neighbors': 23,
 'model__weights': 'uniform'}

In [None]:
knn_grid_search_cv = grid_search.best_estimator_

In [None]:
grid_search.cv_results_['mean_train_score'][grid_search.best_index_]

-9.992007221626413e-16

In [None]:
grid_search.best_index_

39

In [None]:
grid_search.cv_results_['mean_train_score']

array([-9.99200722e-16, -9.99200722e-16, -2.92673114e-01, -9.99200722e-16,
       -3.65345127e-01, -9.99200722e-16, -4.04283730e-01, -9.99200722e-16,
       -4.22536310e-01, -9.99200722e-16, -4.32305998e-01, -9.99200722e-16,
       -4.43747546e-01, -9.99200722e-16, -4.50856255e-01, -9.99200722e-16,
       -4.59610115e-01, -9.99200722e-16, -4.64864102e-01, -9.99200722e-16,
       -4.68498607e-01, -9.99200722e-16, -4.69995936e-01, -9.99200722e-16,
       -4.72150038e-01, -9.99200722e-16, -4.74207814e-01, -9.99200722e-16,
       -4.75898446e-01, -9.99200722e-16, -4.77716483e-01, -9.99200722e-16,
       -4.78867240e-01, -9.99200722e-16, -4.80992624e-01, -9.99200722e-16,
       -4.83466143e-01, -9.99200722e-16, -4.85483373e-01, -9.99200722e-16,
       -4.87592237e-01, -9.99200722e-16, -4.89596601e-01, -9.99200722e-16,
       -4.91473525e-01, -9.99200722e-16, -4.92577671e-01, -9.99200722e-16,
       -4.94372487e-01, -9.99200722e-16, -4.95084230e-01, -9.99200722e-16,
       -9.99200722e-16, -

In [None]:
log_loss(y_train, knn_grid_search.predict_proba(X_train))

0.47970810348364473

In [None]:
log_loss(y_test, knn_grid_search.predict_proba(X_test))

0.4717777420818704

In [None]:
# Random Search

In [None]:
random_search = RandomizedSearchCV()

In [None]:
# Avalie a performance no treino e na validação

OBS: Não estamos preocupado com performance. A ideia é exercitar a construção desses steps. O dataset não é tão grande e podemos sofrer com isso.