In [92]:
import math  
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from collections import OrderedDict, defaultdict

In [93]:
Colnames = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"] #Column names for the data set
Data = pd.read_csv("iris.csv", header=None, names=Colnames) #read data in dataframe
Data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [94]:
Data = shuffle(Data) #shuffle data as the data set is ordered
Data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
131,7.9,3.8,6.4,2.0,Virginica
82,5.8,2.7,3.9,1.2,Versicolor
113,5.7,2.5,5.0,2.0,Virginica
9,4.9,3.1,1.5,0.1,Setosa
8,4.4,2.9,1.4,0.2,Setosa


In [95]:
train, test = train_test_split(Data, test_size=0.2) #split whole dataframe into random test and train sets

In [96]:
def calculateDist(testFeture,trainRec):
    var=0
    x=testFeture.values #create array of values from series type
    y=trainRec.values
    i = 0
    while i < len(x): 
        q=x[i]
        r=y[i]
        var+=math.pow(q-r,2) #calculate distance between test record and all train records
        i=i+1
    return math.sqrt(var) #return the distance

In [97]:
def UpdateNeighbors(neighbors, item, distance, k):
    temp= item.to_dict()
    if len(neighbors) < k:   
        # If list is not full add the item and sort   
        neighbors.append([distance, temp['class']]) 
        neighbors = sorted(neighbors) 
    else: 
        # if list is full check if new distance value is lesser than the maximun value of in the list
        if neighbors[-1][0] > distance: 
            neighbors[-1] = [distance, temp['class']] 
            neighbors = sorted(neighbors)   
    return neighbors 

In [98]:
def CalculateNeighborsClass(neighbors, k): 
    count = {}   
    for i in range(k): 
        if neighbors[i][1] not in count: #initialize class count to 1 if its not yet in the list 
            count[neighbors[i][1]]=1
        else:  
            count[neighbors[i][1]]+=1 #if the class already exists then increment the count
    return count 

In [100]:
def FindMax(Dict): 
    maximum = -1
    classification = ''   
    for key in Dict.keys(): #return the class having maximum count  
        if Dict[key] > maximum: 
            maximum = Dict[key] 
            classification=key  
    return (classification, maximum) 

In [101]:
def runKNN(rowFeatures,k,train):
    neighbors=[] #list to store neighbors of each cluster
    for index, row in train.iterrows(): #for each test record against train dataset calculate the following
        distance=calculateDist(rowFeatures,row)  #calculate distance of test records against all train records   
        neighbors = UpdateNeighbors(neighbors, row, distance, k) #calculate neighbors of test records against all train records 
    count = CalculateNeighborsClass(neighbors, k) #find count of each class records
    return FindMax(count) #find the class with maximum count

In [102]:
def knn(k,train,test):
    correct=0; #store count of correct predictions
    total=(test.shape[0])
    for index, row in test.iterrows(): #calculate distance and group each test record
        rowClass=row['class'] #store actual classification
        rowFeatures=row.iloc[0:4] # retrive features from the record
        prediction=runKNN(rowFeatures,k,train) #call knn and find the predicted classification
        if prediction[0]==rowClass: #if prediction is equal to actual classification increment the count
             correct+=1
    accuracy=correct/float(total) #Accuracy is count of correct predictions by total predictions
    print("When K is ",k,"prediction accuracy is ",accuracy)
        

In [105]:
i=1
while i < 11: 
    knn(i,train,test) #Call Knn algorithm for K between 3 to 7
    i=i+1

When K is  1 prediction accuracy is  0.9666666666666667
When K is  2 prediction accuracy is  0.9666666666666667
When K is  3 prediction accuracy is  0.9666666666666667
When K is  4 prediction accuracy is  0.9666666666666667
When K is  5 prediction accuracy is  0.9666666666666667
When K is  6 prediction accuracy is  0.9666666666666667
When K is  7 prediction accuracy is  0.9333333333333333
When K is  8 prediction accuracy is  0.9666666666666667
When K is  9 prediction accuracy is  0.9333333333333333
When K is  10 prediction accuracy is  0.9333333333333333
