In [None]:
#Importing necessary libraries
import math
from pprint import pprint
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Iris dataset from the UCI repository. The file is read with
# header=None, so the column names are supplied explicitly.
file_columns = ['sepal_len', 'sepal_width', 'petal_len', 'petal_width', 'class']
iris_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
data = pd.read_csv(iris_url, names=file_columns, header=None)
data.head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Shuffle the rows so the positional dev/test split below does not depend on
# the file's original row order. A fixed seed makes the permutation -- and
# therefore every downstream accuracy figure -- reproducible under
# Restart Kernel -> Run All.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# 'seq' is a stable row id (equal to the fresh 0..n-1 index); the neighbour
# lists built in later cells store these ids.
data['seq'] = data.index
data.head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class,seq
0,7.7,3.8,6.7,2.2,Iris-virginica,0
1,5.5,2.3,4.0,1.3,Iris-versicolor,1
2,5.4,3.0,4.5,1.5,Iris-versicolor,2
3,5.6,2.5,3.9,1.1,Iris-versicolor,3
4,6.5,3.2,5.1,2.0,Iris-virginica,4


In [None]:
# Split the shuffled data into two disjoint sets: dev (first 70%) and test (the rest).
dev_size = int(data.shape[0]*0.70)
# Everything that is not in dev goes to test, so the two sizes always sum to
# the total number of rows.
test_size = data.shape[0] - dev_size

# Take first 70% of the data as dev set.
# .copy() so that columns added to dev later are not written into a slice of `data`.
dev = data.iloc[:dev_size].copy()

# Take the remaining 30% of the data as test set.
# The slice has to start at dev_size: starting at test_size would put rows
# test_size..dev_size-1 into BOTH sets, so test points would find themselves
# among their own dev neighbours and the reported accuracy would be inflated.
test = data.iloc[dev_size:].copy()

In [None]:
# Euclidean distance between two feature vectors
def get_euclidean(row1, row2):
    """Return the Euclidean (L2) distance between ``row1`` and ``row2``.

    Coordinates are paired positionally with ``zip``, so when the inputs
    differ in length the extra trailing values of the longer one are ignored.
    """
    squared_total = 0
    for a, b in zip(row1, row2):
        squared_total += (a - b) ** 2
    return math.sqrt(squared_total)

In [None]:
# For every dev point, rank all *other* dev points by Euclidean distance and
# store their 'seq' ids, nearest first, in dev['euclidean'].
#
# Columns are selected by name so that dev2 always has the layout
# [4 features, class, seq] -- even when this cell is re-run after extra
# columns (such as 'euclidean') have been added to dev. Taking dev.values and
# slicing positionally would then feed the string 'class' column into the
# distance computation and fail.
dev2 = dev[file_columns + ['seq']].values
n_features = len(file_columns) - 1   # every named column except 'class'
seq_col = len(file_columns)          # position of 'seq' inside a dev2 row
eud = []
for i, query in enumerate(dev2):
    ranked = []
    for j, candidate in enumerate(dev2):
        if i == j:
            continue  # a point must not be counted as its own neighbour
        dist = get_euclidean(query[:n_features], candidate[:n_features])
        ranked.append((dist, candidate[seq_col]))
    # Sort on distance only; the sort is stable, so ties keep dev order.
    ranked.sort(key=lambda pair: pair[0])
    eud.append([seq for _, seq in ranked])
dev['euclidean'] = eud

In [None]:
# Pick the k closest neighbours out of a row's pre-ranked neighbour list.
def get_nearest(row, distance_measure,k):
    """Return the first ``k`` entries of ``row[distance_measure]``.

    ``row[distance_measure]`` is sliced with ``[:k]``; the callers in this
    notebook store neighbour ids there sorted nearest-first, so the slice is
    the k nearest neighbours.
    """
    ranked_neighbors = row[distance_measure]
    return ranked_neighbors[:k]

In [None]:
# Majority vote: the most frequent class among the given neighbours wins.
def get_dominant_class(df, neighbors):
    """Return the most common 'class' among rows of ``df`` whose 'seq' is in ``neighbors``.

    The winner is the first entry of ``value_counts()``, i.e. the label with
    the highest count.
    """
    is_neighbor = df['seq'].isin(neighbors)
    votes = df.loc[is_neighbor, 'class'].value_counts()
    return votes.index[0]

In [None]:
# Hyperparameter search: evaluate k = 1, 3, 5, 7 on the dev set.
# For each k this adds two columns to dev ('eud_<k>' with the k nearest
# neighbour ids and 'eud_<k>_class' with the voted label) and records the
# fraction of dev points whose voted label matches the true class.
hyper_params = []
acc = {candidate: {} for candidate in (1, 3, 5, 7)}

for k in range(1, 8, 2):
    neighbors_col = 'eud_{}'.format(k)
    predicted_col = 'eud_{}_class'.format(k)

    dev[neighbors_col] = dev.apply(lambda x: get_nearest(x, 'euclidean', k), axis=1)
    dev[predicted_col] = dev[neighbors_col].apply(lambda row: get_dominant_class(dev, row))
    hyper_params.append(predicted_col)

    n_correct = dev[dev['class'] == dev[predicted_col]].shape[0]
    acc[k]['eud'] = n_correct / dev.shape[0]

In [None]:
# Show the true class next to the class predicted for each candidate k
cols = ['class', *hyper_params]
dev[cols].head()

Unnamed: 0,class,eud_1_class,eud_3_class,eud_5_class,eud_7_class
0,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica
1,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor
2,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor
3,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor,Iris-versicolor
4,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica,Iris-virginica


In [None]:
# Dev-set accuracy per candidate k: the fraction of dev rows whose predicted
# class equals the true class, stored under the 'eud' key.
pprint(acc)

{1: {'eud': 0.9523809523809523},
 3: {'eud': 0.9428571428571428},
 5: {'eud': 0.9428571428571428},
 7: {'eud': 0.9428571428571428}}


In [None]:
# For every test point, rank ALL dev points by Euclidean distance and store
# their 'seq' ids, nearest first, in test['euclidean'].
# (Selecting the k nearest from this ranking happens in a later cell.)
test['seq'] = test.index
# Select columns by name so test2 always has the layout
# [4 features, class, seq], even if this cell is re-run after columns such as
# 'euclidean' have been added to test. Slicing test.values positionally would
# then pull the string 'class' column into the distance computation and fail.
test2 = test[file_columns + ['seq']].values
n_features = len(file_columns) - 1   # every named column except 'class'
test_eud = []
for query in test2:
    ranked = []
    for candidate in dev2:
        # dev2 rows are laid out as [features..., class, seq]:
        # [:-2] is the feature vector and [5] is the row's 'seq' id.
        dist = get_euclidean(query[:n_features], candidate[:-2])
        ranked.append((dist, candidate[5]))

    # Sort on distance only; the sort is stable, so ties keep dev order.
    ranked.sort(key=lambda pair: pair[0])
    test_eud.append([seq for _, seq in ranked])

test['euclidean'] = test_eud

In [None]:
# Keep the 3 nearest dev neighbours (by Euclidean distance) of every test point.
# NOTE(review): k is hard-coded to 3 here rather than derived from `acc`;
# confirm it matches the k selected from the dev-set accuracies.
def _nearest_three(test_row):
    return get_nearest(test_row, 'euclidean', 3)

test['eu'] = test.apply(_nearest_three, axis=1)
test[[*file_columns, 'eu']].head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class,eu
45,5.7,2.9,4.2,1.3,Iris-versicolor,"[45, 48, 6]"
46,6.9,3.1,4.9,1.5,Iris-versicolor,"[46, 32, 86]"
47,4.7,3.2,1.6,0.2,Iris-setosa,"[47, 68, 57]"
48,5.7,2.8,4.1,1.3,Iris-versicolor,"[48, 45, 6]"
49,6.8,3.0,5.5,2.1,Iris-virginica,"[49, 36, 63]"


In [None]:
# Label each test point with the most frequent class among its 3 nearest dev neighbours
def _vote(neighbor_ids):
    return get_dominant_class(dev, neighbor_ids)

test['eu_class'] = test['eu'].apply(_vote)
test[[*file_columns, 'eu_class']].head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class,eu_class
45,5.7,2.9,4.2,1.3,Iris-versicolor,Iris-versicolor
46,6.9,3.1,4.9,1.5,Iris-versicolor,Iris-versicolor
47,4.7,3.2,1.6,0.2,Iris-setosa,Iris-setosa
48,5.7,2.8,4.1,1.3,Iris-versicolor,Iris-versicolor
49,6.8,3.0,5.5,2.1,Iris-virginica,Iris-virginica


In [None]:
# True labels of the test rows, as a plain Python list
y_test = test["class"].to_list()

In [None]:
# Predicted labels of the test rows, in the same row order as y_test
y_pred = test["eu_class"].to_list()

In [None]:
# Evaluate the KNN predictions on the test set:
# confusion matrix, accuracy (as a percentage) and the per-class report.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred)

print(conf_matrix)
print("Accuracy:", accuracy_pct)
print('\n\n')
print(report)

[[40  0  0]
 [ 0 29  0]
 [ 0  3 33]]
Accuracy: 97.14285714285714



                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        40
Iris-versicolor       0.91      1.00      0.95        29
 Iris-virginica       1.00      0.92      0.96        36

       accuracy                           0.97       105
      macro avg       0.97      0.97      0.97       105
   weighted avg       0.97      0.97      0.97       105

