# Implementation of K-Nearest-Neighbors from Scratch

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn import datasets

# Column names assigned to the CSV: four flower measurements followed by the class label.
data_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'label',
]

# Load the dataset into a DataFrame, labelling the columns with data_names.
data = pd.read_csv('iris.data', names=data_names)

In [2]:
# Feature matrix: one row per sample, one column per measurement.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
all_vals = data[feature_columns].values

# Raw string class labels, one per sample.
all_labels = data['label'].values

# np.unique returns the sorted distinct labels together with, for every sample,
# the index of its label inside that sorted array.
unique_labels, label_codes = np.unique(all_labels, return_inverse=True)
print(unique_labels)

# Replace the string labels by their numeric codes (stored as floats).
new_labels = label_codes.astype(float)
all_labels = new_labels

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [32]:
# Split the dataset with scikit-learn, holding out exactly 25 samples as the test set.
# An integer test_size is an absolute sample count, so the hold-out size no longer
# depends on a hand-tuned fraction being rounded up for this particular dataset size.
# NOTE: the split is random on every run; pass random_state=<int> for a reproducible split.
(training_data, test_data, training_labels, test_labels) = train_test_split(
    all_vals, all_labels, test_size=25
)

In [36]:
def knnclassify(test_data, training_data, training_labels, k=3):
    """Classify each test point by majority vote among its k nearest training points.

    Distances are Euclidean. Neighbours are ranked by (distance, label), so two
    training points at exactly the same distance are ordered by their label.
    When several labels receive the same number of votes, the label of the
    closest neighbour among them wins.

    Args:
        test_data: sequence of points to classify (array-like, one point per row).
        training_data: sequence of reference points with the same number of
            features as the test points.
        training_labels: one label per training point, in the same order
            (e.g. 0, 1, 2 for 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica').
        k: number of neighbours that vote. If k exceeds the number of training
            points, every training point votes.

    Returns:
        A list with one predicted label per test point, taken from
        training_labels. Empty when test_data is empty.

    Raises:
        ValueError: if k is smaller than 1, if training_data is empty while
            there are points to classify, or if training_data and
            training_labels differ in length.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    test_points = np.asarray(test_data, dtype=float)
    if len(test_points) == 0:
        return []

    training_points = np.asarray(training_data, dtype=float)
    n_train = len(training_points)
    if n_train == 0:
        raise ValueError("training_data must contain at least one point")
    if len(training_labels) != n_train:
        raise ValueError(
            "training_data and training_labels must have the same length "
            "({} != {})".format(n_train, len(training_labels))
        )

    # Flatten every point to a 1-D feature vector so that one vectorized norm
    # per test point covers scalar, vector and multi-dimensional samples alike.
    test_points = test_points.reshape(len(test_points), -1)
    training_points = training_points.reshape(n_train, -1)

    # contains the predicted label for each test data point
    pred_labels = []

    for point in test_points:
        # Euclidean distance from this test point to every training point at once
        distances = np.linalg.norm(training_points - point, axis=1)

        # Pair each distance with its training label and keep the k closest pairs
        k_nearests = sorted(zip(distances.tolist(), training_labels))[:k]

        # One vote per neighbour, ordered from nearest to farthest
        votes = [label for _, label in k_nearests]

        # Most frequent label wins; on a tie the first-seen (nearest) label is kept
        pred_labels.append(Counter(votes).most_common(1)[0][0])

    return pred_labels

In [42]:
# Predict a label for every held-out sample using the 3 nearest training points.
pred_labels = knnclassify(
    test_data,
    training_data,
    training_labels,
    k=3,
)

In [38]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted label equals the true label.
accuracy_score(test_labels, pred_labels)

0.96