* Model Selection - Selecting the best hyperparameters to optimize the model
* It is enabled by cross validation 
* 2 Model Selection techniques:
* 1. Grid Search 
* 2. Randomized Grid Search

In [8]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from time import time 

In [2]:
df= pd.read_csv('C:/Users/Palla Anuraag Sharma/Downloads/Datacamp/Datasets/Iris DataSet/Iris.csv')

In [3]:
df = df.set_index('Id')

In [4]:
df.sample(5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56,5.7,2.8,4.5,1.3,Iris-versicolor
98,6.2,2.9,4.3,1.3,Iris-versicolor
61,5.0,2.0,3.5,1.0,Iris-versicolor
74,6.1,2.8,4.7,1.2,Iris-versicolor
45,5.1,3.8,1.9,0.4,Iris-setosa


In [5]:
x = df[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()
df['Species_cat'] = le.fit_transform(df.Species)
y = df['Species_cat']

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=23)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

start = time()
skf = StratifiedKFold(n_splits=10)

neighbors = [1, 3, 5, 11, 17, 23, 31, 53]
for n in neighbors:
    knc = KNeighborsClassifier(n_neighbors=n)
    score = cross_val_score(knc, x_train, y_train, cv=skf)
    print(f'neighbors={n}, score={np.mean(score)*100:4.1f}%')
# Display compute time
print(f'Compute time = {time() - start:4.2f} seconds.')

neighbors=1, score=94.5%
neighbors=3, score=93.5%
neighbors=5, score=94.5%
neighbors=11, score=96.3%
neighbors=17, score=95.3%
neighbors=23, score=93.5%
neighbors=31, score=95.3%
neighbors=53, score=87.8%
Compute time = 0.25 seconds.


### USING GRID SEARCH CV

In [13]:
from sklearn.model_selection import GridSearchCV

# Start clock
start = time()

skf = StratifiedKFold(n_splits=10)

knc = KNeighborsClassifier()

# Create a dictionary of hyperparameters and values
neighbors = [1, 3, 5, 11, 17, 23, 31, 53]
params = {'n_neighbors':neighbors}

# Create grid search cross validator
gse = GridSearchCV(estimator=knc, param_grid=params, cv=skf)

# Fit estimator
gse.fit(x_train, y_train)

# Display time and best estimator results.
print(f'Compute time = {time() - start:4.2f} seconds.\n')

print(f'Best n_neighbors={gse.best_estimator_.get_params()["n_neighbors"]:5.4f}')
print(f'Best CV Score = {gse.best_score_:4.3f}')

Compute time = 0.25 seconds.

Best n_neighbors=11.0000
Best CV Score = 0.963


### Using Randomized Search CV

In [19]:
from sklearn.model_selection import RandomizedSearchCV

# Start clock
start = time()

knc = KNeighborsClassifier()
skf = StratifiedKFold(n_splits=10)

neighbors = range(1, 51)
weights = ['uniform', 'distance']
# Create a dictionary of hyperparameters and values
params = {'n_neighbors':neighbors, 'weights':weights}

# Number of random parameter samples
num_samples = 20

# Run randomized search
rscv = RandomizedSearchCV(knc, param_distributions=params, n_iter=num_samples, random_state=23)

# Fit grid search estimator and display results
rscv.fit(x_train, y_train)

print(f'Compute time = {time() - start:4.2f} seconds', end='')
print(f' for {num_samples} parameter combinations')

Compute time = 0.30 seconds for 20 parameter combinations


In [20]:
# Get best esimtator
be = rscv.best_estimator_

# Display parameter values
print(f'Best n_neighbors={be.get_params()["n_neighbors"]:5.4f}')
print(f'Best weights={be.get_params()["weights"]}')

# Display best score
print(f'Best CV Score = {rscv.best_score_:4.3f}')

Best n_neighbors=39.0000
Best weights=distance
Best CV Score = 0.962
