# [k-nearest neighbors algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm)

In pattern recognition, the k-nearest neighbors algorithm (k-NN) is a non-parametric method used for classification and regression. In both cases, the input consists of the k closest training examples in the feature space. The output depends on whether k-NN is used for classification or regression.

Let's learn how this algorithm works. For more information, please check [this repo](https://github.com/DTU-CS101/ML-TUTORIAL).

In [None]:
import csv
import random
import math
import operator

## sorted, operator.itemgetter

### operator.itemgetter
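Returns a callable that fetches the item at the given index from its argument; when several indices are given, it returns a tuple of the corresponding items.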

In [None]:
# Small sample list that the itemgetter examples operate on.
a = [1,2,3]

In [None]:
# itemgetter(0) builds a callable equivalent to `lambda obj: obj[0]`.
index = operator.itemgetter(0)
# Applied to a = [1, 2, 3] this evaluates to 1.
index(a)

In [None]:
# With several indices the callable returns a tuple, in the order requested.
index = operator.itemgetter(1, 0)
# Applied to a = [1, 2, 3] this evaluates to (2, 1).
index(a)

### `sorted(iterable, *, key=None, reverse=False)`
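Returns a new sorted list built from the items of any iterable. `key` is a one-argument function that extracts the comparison value from each item, and `reverse=True` sorts in descending order.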

In [None]:
# Sample records (3-tuples) used by the sorted() examples.
students = [('john', 'A', 15), ('jane', 'B', 12), ('dave', 'B', 10)]

In [None]:
# Sort ascending by the third field (index 2) using a lambda as the key:
# dave (10), jane (12), john (15).
sorted(students, key=lambda student : student[2])

In [None]:
# Same ordering as the lambda version, with itemgetter(2) supplying the key.
sorted(students, key=operator.itemgetter(2))

In [None]:
# The key is the tuple (field 1, field 2), so rows are ordered by field 1
# first and ties are broken by field 2: john ('A'), dave ('B', 10), jane ('B', 12).
sorted(students, key=operator.itemgetter(1,2))

In [None]:
# reverse=True gives descending order by field 2: john (15), jane (12), dave (10).
sorted(students, key=operator.itemgetter(2), reverse=True)

## Load IRIS Data

In [None]:
def loadDataset(filename, split_ratio):
    """Read a CSV dataset and randomly split its rows into two lists.

    Every column except the last is converted to ``float``; the last column
    is left untouched (it holds the class label in the iris file). Blank
    lines, which ``csv.reader`` yields as empty rows, are skipped wherever
    they occur, so a file with or without a trailing blank line is read
    in full.

    Args:
        filename: Path of the CSV file to read.
        split_ratio: Probability that a given row is placed in the training
            list; rows that are not selected go to the test list.

    Returns:
        A ``(training_data, test_data)`` tuple of lists of rows.

    Raises:
        ValueError: If a feature column cannot be parsed as a float.
    """
    training_data, test_data = [], []
    with open(filename, "r") as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader returns [] for an empty line; there is nothing to
            # convert and no label, so skip it instead of dropping the last
            # row of the file unconditionally.
            if not row:
                continue

            # All columns but the last are numeric features.
            row[:-1] = [float(value) for value in row[:-1]]

            # One random draw per data row decides which list it joins.
            if random.random() < split_ratio:
                training_data.append(row)
            else:
                test_data.append(row)
    return training_data, test_data

In [None]:
# Each row is sent to the training list with probability 0.66 and to the
# test list otherwise, so the exact split changes between runs unless the
# random module is seeded. The path is relative to the working directory.
training_data, test_data = loadDataset('data/iris.data', 0.66)

## Compute Metrics

In [None]:
def euclidean_distance(ins1, ins2, features):
    """Return the Euclidean distance between two instances.

    Only the first ``features`` positions of each instance are compared, so
    trailing elements (such as a class label) are ignored.

    Args:
        ins1: First instance, indexable by position.
        ins2: Second instance, indexable by position.
        features: Number of leading positions to include.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squared_total = 0
    for position in range(features):
        delta = ins1[position] - ins2[position]
        squared_total = squared_total + delta ** 2

    return math.sqrt(squared_total)

In [None]:
# Distance between the first two training rows, using their first 4 values
# (the fifth element of each row is skipped).
euclidean_distance(training_data[0], training_data[1], 4)

## Compute Nearest Neighbours

In [None]:
def getNeighbours(training_data, test_ins, k):
    """Return the ``k`` training rows closest to ``test_ins``.

    Distances are computed with ``euclidean_distance`` over every position
    of ``test_ins`` except the last one.

    Args:
        training_data: Sequence of training rows.
        test_ins: The query row; its last element is not used as a feature.
        k: Number of neighbours to return.

    Returns:
        A list of ``k`` rows from ``training_data``, nearest first. Rows at
        equal distance keep their relative order from ``training_data``.

    Raises:
        IndexError: If ``k`` is larger than ``len(training_data)``.
    """
    # The final element of an instance is excluded from the distance.
    feature_count = len(test_ins) - 1

    scored = [
        (candidate, euclidean_distance(test_ins, candidate, feature_count))
        for candidate in training_data
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])

    # Index one by one (rather than slice) so that requesting more
    # neighbours than exist still raises IndexError.
    return [ranked[position][0] for position in range(k)]

In [None]:
# Find the 5 training rows nearest to the test row at index 15.
neighbours = getNeighbours(training_data, test_data[15], 5)
# Display them, nearest first.
neighbours

In [None]:
# Show the query row itself for comparison with its neighbours.
test_data[15]

## Vote to Predict

In [None]:
def getResponseVotes(neighbours):
    """Return the label that occurs most often among ``neighbours``.

    The label of each neighbour is its last element. When several labels
    share the highest count, the one that was encountered first wins,
    because the sort is stable.

    Args:
        neighbours: Sequence of rows whose last element is a hashable label.

    Returns:
        The label with the highest count.

    Raises:
        IndexError: If ``neighbours`` is empty.
    """
    tally = {}
    for row in neighbours:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    # Highest vote count first.
    ranked = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)

    winner = ranked[0]
    return winner[0]

In [None]:
# Majority label among the neighbours found above.
getResponseVotes(neighbours)

## Compute Accuracy

In [None]:
def getAccuracy(test_data, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    Args:
        test_data: Sequence of rows; the last element of each row is the
            true label.
        predictions: Sequence of predicted labels, where ``predictions[i]``
            is compared with the label of ``test_data[i]``.

    Returns:
        The accuracy as a float between 0.0 and 100.0. An empty
        ``test_data`` yields 0.0.

    Raises:
        IndexError: If ``predictions`` is shorter than ``test_data``.
    """
    total = len(test_data)
    if total == 0:
        # Nothing to score; the division below would raise ZeroDivisionError.
        return 0.0

    correct = 0
    for position, row in enumerate(test_data):
        if row[-1] == predictions[position]:
            correct += 1

    return (correct / float(total)) * 100.0

In [None]:
def generatePredictions(training_data, test_data, k):
    """Predict a label for every row in ``test_data``.

    For each test row, the ``k`` nearest training rows are found with
    ``getNeighbours`` and their majority label is taken with
    ``getResponseVotes``.

    Args:
        training_data: Sequence of training rows.
        test_data: Sequence of rows to classify.
        k: Number of neighbours consulted per test row.

    Returns:
        A list of predicted labels, in the same order as ``test_data``.
    """
    return [
        getResponseVotes(getNeighbours(training_data, query, k))
        for query in test_data
    ]

In [None]:
# `k` (the number of neighbours that vote on each prediction) must be bound
# before the call; 5 matches the value used in the getNeighbours demo cell.
k = 5
predictions = generatePredictions(training_data, test_data, k)

In [None]:
# Percentage of test rows whose predicted label equals the true label.
getAccuracy(test_data, predictions)