In [1]:
# What is a Hyperparameter?
# The parameters that we set in the model is called Hyperparameters 
# Every Algorithm has different hyperparameters.

# What is Hyperparamater Tuning?
# Its all about identifying which paramter will improve the quality of the model

# Is Hyperparamter Tuning mandatory?
# If you achieve your quality model with default config, then this step is not required

In [2]:
import numpy as np
import pandas as pd

In [3]:
irisData = pd.read_csv('https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv')

In [4]:
features = irisData.iloc[:,0:4].values
label = irisData.iloc[:,4].values

In [5]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier() #n_neighbours = 5

In [6]:
# 1. Cross Validation Score
#
# Getting an idea of what can be the best possible score I can achieve for the given dataset
# with respect to the selected algo
#
#
from sklearn.model_selection import cross_val_score
cvResult = cross_val_score(model,
               features,
               label,
               cv = 10)

In [7]:
cvResult.mean()

0.9666666666666668

In [22]:
# Technique 1.  GridSearchCV technique
#
# Useful to define Hyperparameters and understand which parameter value has given best quality

kValues = np.arange(1,31)
weightParameter = ["uniform","distance"]
pgrid = dict(n_neighbors = kValues , weights=weightParameter)
print(pgrid)

{'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'weights': ['uniform', 'distance']}


In [23]:
from sklearn.grid_search import GridSearchCV
grid = GridSearchCV(model, 
                   param_grid= pgrid,
                   cv=10)

In [24]:
grid.fit(features,label)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [25]:
grid.best_params_

{'n_neighbors': 13, 'weights': 'uniform'}

In [26]:
grid.best_score_

0.98

In [27]:
grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')

In [21]:
finalModel = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')

In [28]:
?KNeighborsClassifier

In [31]:
# For Large Datasets, I recommend to use
# RandomizedGridSearchCV
kValues = np.arange(1,31)
weightParameter = ["uniform","distance"]
pgrid = dict(n_neighbors = kValues , weights=weightParameter)
print(pgrid)

{'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'weights': ['uniform', 'distance']}


In [36]:
from sklearn.grid_search import RandomizedSearchCV
grid2 = RandomizedSearchCV(model, 
                   param_distributions= pgrid,
                   cv=10,
                    n_iter = 10)

In [37]:
grid2.fit(features,label)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'weights': ['uniform', 'distance']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)

In [38]:
grid2.best_params_

{'n_neighbors': 13, 'weights': 'uniform'}

In [39]:
grid2.best_score_

0.98

In [41]:
grid2.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')