# Grid Search
Tuning the hyperparameters 
`GridSearchCV` performs an exhaustive search over the parameter space evaluating each combination using X-val. 

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
import pandas as pd
import numpy as np

In [2]:
data = load_digits()
X = data.data
y = data.target

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=1/2,
                                                    random_state=42)
X_train.shape, X_test.shape

((898, 64), (899, 64))

## Basic *k*-NN Classifier
A basic *out-of-the-box* scalar and classifier combination. 

In [4]:
bScal = StandardScaler().fit(X_train)
X_trainS = bScal.transform(X_train)
X_testS = bScal.transform(X_test)

In [5]:
knn = KNeighborsClassifier()
knn.fit(X_trainS,y_train)
y_pred = knn.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred)))
confusion_matrix(y_test, y_pred)

Accuracy: 0.96


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  3, 80,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1, 84,  0,  0,  0,  3,  5,  0],
       [ 0,  0,  0,  0, 91,  0,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  0, 94,  1,  0,  0,  4],
       [ 0,  0,  0,  0,  0,  0, 98,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0, 85,  1,  0],
       [ 0,  3,  0,  2,  0,  0,  0,  0, 78,  0],
       [ 0,  0,  0,  1,  1,  2,  1,  2,  2, 83]])

In [6]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## Grid Search  
First define the combinations of parameters to be considered. 

In [7]:
param_grid = {'n_neighbors':[1,3,5,10], 
              'metric':['manhattan','euclidean'],
             'weights':['uniform','distance']}

Run the grid search.

In [8]:
knn_gs = GridSearchCV(knn,param_grid,cv=10,  
                      verbose = 1, n_jobs = -1)
knn_gs = knn_gs.fit(X_trainS,y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


The grid search object will now work as a classifier with the 'optimal' parameters. 

In [9]:
y_pred_gs = knn_gs.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred_gs)))
confusion_matrix(y_test, y_pred_gs)

Accuracy: 0.97


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 81,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 87,  0,  0,  1,  2,  2,  1],
       [ 0,  2,  0,  0, 91,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 95,  1,  0,  0,  2],
       [ 1,  0,  0,  0,  0,  0, 97,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 86,  0,  1],
       [ 1,  3,  1,  1,  0,  0,  0,  0, 77,  0],
       [ 0,  0,  0,  2,  1,  1,  0,  1,  1, 86]])

In [10]:
knn_gs

We can 'manually' provide the best parmameters to the *k*-NN object. 

In [11]:
knn2 = KNeighborsClassifier(metric= 'manhattan', 
                           n_neighbors = 1, weights = 'uniform')
knn2.fit(X_trainS,y_train)
y_pred_gs = knn2.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred_gs)))
confusion_matrix(y_test, y_pred_gs)


Accuracy: 0.97


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 81,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 87,  0,  0,  1,  2,  2,  1],
       [ 0,  2,  0,  0, 91,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 95,  1,  0,  0,  2],
       [ 1,  0,  0,  0,  0,  0, 97,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 86,  0,  1],
       [ 1,  3,  1,  1,  0,  0,  0,  0, 77,  0],
       [ 0,  0,  0,  2,  1,  1,  0,  1,  1, 86]])

We can unpack the best parameters directly.

In [12]:
knn3 = KNeighborsClassifier(**knn_gs.best_params_)
knn3.fit(X_trainS,y_train)
y_pred_gs = knn3.predict(X_testS)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred_gs)))
confusion_matrix(y_test, y_pred_gs)


Accuracy: 0.97


array([[82,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 89,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 81,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0, 87,  0,  0,  1,  2,  2,  1],
       [ 0,  2,  0,  0, 91,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 95,  1,  0,  0,  2],
       [ 1,  0,  0,  0,  0,  0, 97,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 86,  0,  1],
       [ 1,  3,  1,  1,  0,  0,  0,  0, 77,  0],
       [ 0,  0,  0,  2,  1,  1,  0,  1,  1, 86]])

In [13]:
# presumably the reason weights = 'uniform' is not mentioned is because that is the default anyway.
knn3

## Pipelines & Grid Search

In [14]:
mam_mass = pd.read_csv('MamMass.csv',na_values='?')
mam_mass.pop('BI-RADS')
y = mam_mass.pop('Severity').values
X = mam_mass.values
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=1)
X_train.shape, X_test.shape

((768, 4), (193, 4))

The pipeline

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

kNNpipe  = Pipeline(steps=[
    ('imputer', KNNImputer(missing_values = np.nan)),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())])

*k*-NN hyperparameters to be set

In [16]:
param_grid = {'scaler':[StandardScaler(), MinMaxScaler(),'passthrough'],
              'classifier__n_neighbors':[1,3,5,10], 
              'classifier__metric':['manhattan','euclidean'],
              'classifier__weights':['uniform','distance']}

In [17]:
pipe_gs = GridSearchCV(kNNpipe,param_grid,cv=10, 
                      verbose = 1, n_jobs = -1)

In [18]:
pipe_gs = pipe_gs.fit(X_train, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [19]:
pipe_gs.best_params_

{'classifier__metric': 'manhattan',
 'classifier__n_neighbors': 10,
 'classifier__weights': 'uniform',
 'scaler': StandardScaler()}

In [20]:
y_pred_gs = pipe_gs.predict(X_test)
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test,y_pred_gs)))
confusion_matrix(y_test, y_pred_gs)

Accuracy: 0.79


array([[89, 21],
       [20, 63]])

### All results

In [21]:
scores_df = pd.DataFrame(pipe_gs.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score','param_scaler', 
            'param_classifier__n_neighbors']].head()

Unnamed: 0,rank_test_score,mean_test_score,param_scaler,param_classifier__n_neighbors
0,1,0.78785,StandardScaler(),10
1,2,0.782741,MinMaxScaler(),10
2,3,0.781408,MinMaxScaler(),10
3,4,0.777444,passthrough,10
4,5,0.774812,StandardScaler(),10


In [22]:
pipe_gs.cv_results_

{'mean_fit_time': array([0.22384877, 0.04478543, 0.00330417, 0.00312314, 0.00361629,
        0.00415382, 0.0032825 , 0.0034713 , 0.00305648, 0.00309052,
        0.00324485, 0.0035239 , 0.00376101, 0.0030498 , 0.00318809,
        0.00304649, 0.00402708, 0.00326991, 0.00258462, 0.00413558,
        0.00283976, 0.00308506, 0.00764675, 0.00375023, 0.0077678 ,
        0.0060535 , 0.00370796, 0.00444589, 0.00550508, 0.00496271,
        0.00530343, 0.00456035, 0.00403957, 0.00418909, 0.00286255,
        0.00319152, 0.00338821, 0.00326102, 0.004632  , 0.00315878,
        0.0039382 , 0.00340612, 0.00381951, 0.00406492, 0.00375149,
        0.00332735, 0.00354211, 0.00303831]),
 'std_fit_time': array([0.01727479, 0.08155619, 0.00113381, 0.00092295, 0.00077993,
        0.00170792, 0.00117672, 0.00116128, 0.00117666, 0.00109321,
        0.00172992, 0.00197424, 0.00133245, 0.00102959, 0.00106294,
        0.00113871, 0.00141052, 0.00120888, 0.00099766, 0.0016558 ,
        0.00094328, 0.00139893, 0.007