In [None]:
#KNN Algorithm
#OBJECTIVE : LETTER RECOGNITION USING K-Nearest Neighbours

In [1]:
#Importing the libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
import time 

In [2]:
# Load the letter-recognition CSV; header=None because the file has no header row.
dataset = pd.read_csv('letter-recognitiondata.csv', header=None)
# Column 0 is used as the target; columns 1-16 form the feature matrix (NumPy array).
Y = dataset.iloc[:, 0]
X = dataset.iloc[:, 1:17].values

In [3]:
#Splitting the Data (note: no feature rescaling is performed in this cell)
# A fixed seed makes both random splits reproducible under "Restart Kernel -> Run All";
# without it every run draws different train/validation/test sets and different scores.
SEED = 42
# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=SEED)
# Carve 20% of the remaining training data off as a validation set for trying values of k.
X_tr, X_cv, y_tr, y_cv = train_test_split(X_train, y_train, test_size=0.20, random_state=SEED)

In [4]:
# Sanity check: show the shape of the inner training features, then of its labels.
for split in (X_tr, y_tr):
    print(split.shape)

(11200, 16)
(11200,)


In [5]:
#Building the model
# Fit a k-NN classifier for each odd k in 1..19 on the inner training split and
# report its accuracy on the validation split (X_cv, y_cv).
for m in range(1, 20, 2):
    # p is the Minkowski power parameter; p=2 is the value that corresponds to
    # metric='euclidean'. The previous p=0 is outside the valid range (p must be > 0)
    # and is rejected by parameter validation in newer scikit-learn releases.
    knn = KNeighborsClassifier(n_neighbors=m, algorithm='kd_tree', metric='euclidean', p=2, weights='uniform', leaf_size=20)
    knn.fit(X_tr, y_tr)
    pred = knn.predict(X_cv)
    # accuracy_score returns a fraction in [0, 1]; convert to a percentage.
    a = accuracy_score(y_cv, pred, normalize=True) * float(100)
    # %.2f rather than %d: %d truncates the float, hiding differences between nearby k.
    print('\nCV accuracy for k = %d is %.2f%%' % (m, a))


CV accuracy for k = 1 is 94%

CV accuracy for k = 3 is 94%

CV accuracy for k = 5 is 93%

CV accuracy for k = 7 is 93%

CV accuracy for k = 9 is 93%

CV accuracy for k = 11 is 93%

CV accuracy for k = 13 is 92%

CV accuracy for k = 15 is 92%

CV accuracy for k = 17 is 91%

CV accuracy for k = 19 is 91%


In [6]:
#Evaluation on Test set
# Fit k-NN with k=3 on the inner training split and score it on the held-out test split.
# p=2 is the Minkowski power that corresponds to metric='euclidean'; the previous p=0 is
# outside the valid range (p must be > 0) and is rejected by newer scikit-learn releases.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree', metric='euclidean', p=2, weights='uniform', leaf_size=20)
knn.fit(X_tr, y_tr)
pred = knn.predict(X_test)
# accuracy_score returns a fraction in [0, 1]; convert to a percentage.
a = accuracy_score(y_test, pred, normalize=True) * float(100)
print('The accuracy of the model is :', a)

The accuracy of the model is : 93.48333333333333


In [7]:
#Cross Validation
# Candidate neighbour counts: the odd integers in [0, 20), i.e. 1, 3, ..., 19.
k_range = list(range(0, 20))
k_values = [candidate for candidate in k_range if candidate % 2 != 0]
# cv_scores[i] is the mean 20-fold accuracy for k_values[i].
cv_scores = []
for k in k_values:
    # p=2 is the Minkowski power that corresponds to metric='euclidean'; the previous p=0
    # is outside the valid range (p must be > 0) and is rejected by newer scikit-learn.
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree', metric='euclidean', p=2, weights='uniform', leaf_size=20)
    # cross_val_score clones and fits the estimator once per fold on (X_train, y_train).
    scores = cross_val_score(knn, X_train, y_train, cv=20, scoring='accuracy')
    cv_scores.append(scores.mean())

In [8]:
# Show the mean cross-validated accuracy collected for each candidate k (same order as k_values).
print(cv_scores)

[0.953546121852661, 0.9462548025233664, 0.9463251082209657, 0.9431951728442518, 0.9394507656089426, 0.9358353615083084, 0.9323312269743272, 0.9296076544422975, 0.9268349550925498, 0.921620755135028]


In [9]:
# Misclassification and Determining best k
# Despite its name, Mean_square_error holds the misclassification rate (1 - mean CV accuracy)
# for each candidate k.
Mean_square_error = [1 - score for score in cv_scores]
# Position of the smallest error; ties resolve to the earliest (smallest) k.
best_position = min(range(len(Mean_square_error)), key=Mean_square_error.__getitem__)
optimal_k = k_values[best_position]
print(optimal_k)

1


In [10]:
#Building model with optimum k
# Refit on the full training set (X_train, y_train) using the k chosen by cross-validation,
# then score on the held-out test split.
# p=2 is the Minkowski power that corresponds to metric='euclidean'; the previous p=0 is
# outside the valid range (p must be > 0) and is rejected by newer scikit-learn releases.
knn_optimal = KNeighborsClassifier(n_neighbors=optimal_k, algorithm='kd_tree', metric='euclidean', p=2, weights='uniform', leaf_size=20)
knn_optimal.fit(X_train, y_train)
pred = knn_optimal.predict(X_test)
# accuracy_score returns a fraction in [0, 1]; convert to a percentage.
a = accuracy_score(y_test, pred, normalize=True) * float(100)
print('The accuracy of the model is :', a)

The accuracy of the model is : 95.11666666666667


In [11]:
# Accuracy, confusion matrix and classification report for the current test-set predictions.
# Only the accuracy is printed here; conf_mat and report are computed and kept in variables.
report = classification_report(y_test, pred)
conf_mat = confusion_matrix(y_test, pred)
# accuracy_score yields a fraction in [0, 1]; scale it to a percentage.
acc = float(100) * accuracy_score(y_test, pred, normalize=True)
print(
    '\nThe accuracy of the knn classifier for k = %d is %f%%'
    % (optimal_k, acc)
)


The accuracy of the knn classifier for k = 1 is 95.116667%
