In [13]:
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import KFold

### Initialise iris datasets

In [26]:
# Load the bundled iris dataset, then pull out the feature matrix and labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X.shape, y.shape)

(150, 4) (150,)


### Candidate values of K: every odd number from 1 to 15

In [27]:
# Candidate neighbour counts: 1, 3, ..., 15 (step of 2, upper bound 17 is exclusive).
ks = list(range(1, 17, 2))
print(ks)

[1, 3, 5, 7, 9, 11, 13, 15]


### Set up 5-fold cross-validation (KFold with n_splits = 5)

#### So each fold uses the following split:

<table border="1" align="left">
    <tr>
        <th>Training Part</th>
        <td>80%</td>
    </tr>
    <tr>
        <th>Validation Part</th>
        <td>20%</td>
    </tr>
</table>

In [28]:
# 5-fold splitter. shuffle=True randomises which samples land in each fold,
# and the fixed random_state keeps that shuffle reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=2001)



### Which is the best K?

### Method 1: Evaluate each K manually with KFold

In [29]:
# Manual grid search: score every candidate k on the folds produced by `kf`
# and keep the k with the highest mean validation score.
best_k = ks[0]
best_score = 0

# Take the fold count from the splitter itself rather than repeating the
# literal 5, so the average stays correct if `kf` is reconfigured.
n_folds = kf.get_n_splits()

for k in ks:
    cur_score = 0
    for train_index, valid_index in kf.split(X):  # one (train, validation) index pair per fold
        clf = KNN(n_neighbors=k)
        clf.fit(X[train_index], y[train_index])
        cur_score = cur_score + clf.score(X[valid_index], y[valid_index])
    average_score = cur_score / n_folds
    # Strict '>' means that on a tie the k seen earlier in `ks` is kept.
    if average_score > best_score:
        best_k = k
        best_score = average_score
    print("current best score is: %.2f"%best_score,"best k: %d"%best_k)

print("After cross validation, the final best k is: %d"%best_k)

current best score is: 0.96 best k: 1
current best score is: 0.96 best k: 1
current best score is: 0.97 best k: 5
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
After cross validation, the final best k is: 7


### Method 2: Use GridSearchCV from scikit-learn

In [32]:
# The values of k to test are given as a {parameter name: candidate values} dict.
from sklearn.model_selection import GridSearchCV

parameters = {"n_neighbors": list(range(1, 20, 2))}
knn = KNN()

# cv=5 requests a 5-fold split.
# NOTE(review): an integer cv makes GridSearchCV build its own folds instead of
# reusing `kf`, and the grid here runs up to 19 rather than 15, so this is not
# an exact replay of Method 1 -- confirm that is intended.
clf = GridSearchCV(estimator=knn, param_grid=parameters, cv=5)
clf.fit(X, y)

print("best score is: %.2f" % clf.best_score_, " best param: ", clf.best_params_)

best score is: 0.98  best param:  {'n_neighbors': 7}
