# kNN implementation, classification and regression

In [1]:
# Implement k-nearest neighbours (kNN).
# Reuse the program to implement kNN regression (based on a simple mean) and classification (voting)
# https://hlab.stanford.edu/brian/euclidean_distance_in.html
# https://www.researchgate.net/figure/Pseudocode-for-KNN-classification_fig7_260397165
    

import pandas as pd
import numpy as np
import bisect

In [2]:
def load_csv(path):
    """Read the CSV file at ``path`` and return its contents as a pandas DataFrame."""
    return pd.read_csv(path)



In [3]:
# Returns the squared Euclidean distance (the sum of squared coordinate differences,
# without taking the square root) in n-dimensional space.
# p2 is the test point!
def findDistance(p1,p2):
    """Return the squared Euclidean distance between two points.

    No square root is taken, so the result is the sum of the squared
    coordinate differences. Ranking points by this value gives the same
    order as ranking them by the true Euclidean distance.

    Parameters:
        p1: a point, given as a sequence of numbers or a pandas Series.
        p2: the test point, with the same number of coordinates as p1.

    Returns:
        The sum of (p1[i] - p2[i])**2 over all coordinates.

    Raises:
        ValueError: if p1 and p2 do not have the same number of coordinates.
    """
    if len(p1) != len(p2):
        raise ValueError(
            "points must have the same number of coordinates: "
            f"{len(p1)} != {len(p2)}"
        )
    euclid_dist_sum = 0
    # Walk over the coordinate values instead of indexing with p1[i]: on a
    # pandas Series with string labels, integer keys are label look-ups and
    # the positional fallback is deprecated.
    # An explicit running sum keeps the left-to-right accumulation order.
    for coord_1, coord_2 in zip(p1, p2):
        euclid_dist_sum += (coord_1 - coord_2)**2
    return euclid_dist_sum

## General implementation of kNN:

In [4]:
# kNN implementation, 
# Xin is the training data, 
# Yin is the class labels (supervised)
# x is the unknown sample 
def find_K_neighbours(Xin,Yin,x,k=10,skip_closest=True):
    """Return the k training points closest to the sample x.

    Parameters:
        Xin: training data as a pandas DataFrame, one row per training point.
        Yin: sequence of target values / class labels, aligned with the rows
            of Xin by position (Yin[i] belongs to the i-th row of Xin).
        x: the unknown sample, with one coordinate per column of Xin.
        k: number of neighbours to return (default 10, the value the
            original implementation hard-coded).
        skip_closest: when True (default, the original behaviour) the single
            closest training point is discarded before the k neighbours are
            selected, on the assumption that it is the sample's own row in
            the training data. Pass False when x is not part of Xin.

    Returns:
        A list of at most k tuples (distance, target, row_position), sorted
        ascending. distance is the value returned by findDistance (a squared
        Euclidean distance), and row_position is the 0-based position of the
        row in Xin, suitable for Xin.iloc.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Pair every row with its 0-based position rather than its index label:
    # Yin is a plain sequence and callers look rows up with iloc, so positions
    # stay correct even when Xin does not carry a default 0..n-1 index.
    asc_neig = [
        (findDistance(train_point, x), Yin[position], position)
        for position, (_, train_point) in enumerate(Xin.iterrows())
    ]
    # One sort instead of repeated sorted insertion; tuples compare by
    # distance first, so the closest neighbours end up at the front.
    asc_neig.sort()

    # Optionally drop the closest entry (the sample's own row, see above).
    first = 1 if skip_closest else 0
    return asc_neig[first:first + k]

### Classification problem, solved with voting: 

In [7]:
# Load the classification data set: column y holds the class labels,
# columns x1..x4 hold the features.
df_class = load_csv("./dataset(3)/knn_classification.csv")
Y_class = list(df_class["y"])
X_class = df_class.loc[:, 'x1':'x4']

# Nearest neighbours of the sample we want to classify:
knn_classification_problem = find_K_neighbours(
    X_class, Y_class, [6.3, 2.7, 4.91, 1.8]
)

# Voting:
# Distinct target values seen in the training data, in order of first appearance:
distinct_vals_Y = list(dict.fromkeys(Y_class))
        
# Count the values

def count_votes(neighbours=None, labels=None):
    """Return the class label that receives the most votes among the neighbours.

    Each neighbour casts one vote for its own label (element 1 of its tuple).
    Votes are tallied per label value, so labels do not have to be the
    integers 0..n-1.

    Parameters:
        neighbours: iterable of (distance, label, row) tuples. Defaults to
            the notebook-level knn_classification_problem.
        labels: candidate labels in priority order. Defaults to the
            notebook-level distinct_vals_Y.

    Returns:
        The label with the highest vote count. On a tie, the tied label that
        comes first in ``labels`` wins.

    Raises:
        ValueError: if there are neither candidate labels nor neighbours.
    """
    if neighbours is None:
        neighbours = knn_classification_problem
    if labels is None:
        labels = distinct_vals_Y

    # Start every known label at zero so that the dict's insertion order
    # (the order of ``labels``) decides ties deterministically.
    votes = {label: 0 for label in labels}
    for neighbour in neighbours:
        label = neighbour[1]
        votes[label] = votes.get(label, 0) + 1

    if not votes:
        raise ValueError("cannot vote without candidate labels or neighbours")

    most_votes = max(votes.values())
    for label, count in votes.items():
        if count == most_votes:
            return label
# what if we have a tie?
# -> count_votes() returns the first candidate class that holds the maximum vote count.
classifier_result = count_votes()

def printClosestAsc(nearest_neighbours,input_x):
    """Print each neighbour in the order given, numbered from 1.

    Parameters:
        nearest_neighbours: iterable of tuples whose element 0 is the distance
            to the target and whose element 2 is a row position in input_x.
        input_x: pandas DataFrame holding the training points; rows are
            looked up by position with iloc.

    For every neighbour this prints its rank, its row position, the row
    itself and the distance rounded to four decimals, followed by a spacer line.
    """
    for rank, neighbour in enumerate(nearest_neighbours, start=1):
        print("Number ",rank, "is:")
        position = neighbour[2]
        features = input_x.iloc[position,:]
        print(position)
        print(features)
        print("Distance to target:", round(neighbour[0],4))
        print(" ")

In [8]:
# Report the predicted class, then list the selected neighbours in ascending
# order of distance, together with their feature rows.
print("The datapoint is in the class: ", classifier_result)
print(" ")
print("The ten closest neighbours for the classification problem are:")

printClosestAsc(knn_classification_problem,X_class)



The datapoint is in the class:  2
 
The ten closest neighbours for the classification problem are:
Number  1 is:
126
x1    6.2
x2    2.8
x3    4.8
x4    1.8
Name: 126, dtype: float64
Distance to target: 0.0321
 
Number  2 is:
146
x1    6.3
x2    2.5
x3    5.0
x4    1.9
Name: 146, dtype: float64
Distance to target: 0.0581
 
Number  3 is:
127
x1    6.1
x2    3.0
x3    4.9
x4    1.8
Name: 127, dtype: float64
Distance to target: 0.1301
 
Number  4 is:
72
x1    6.3
x2    2.5
x3    4.9
x4    1.5
Name: 72, dtype: float64
Distance to target: 0.1301
 
Number  5 is:
133
x1    6.3
x2    2.8
x3    5.1
x4    1.5
Name: 133, dtype: float64
Distance to target: 0.1361
 
Number  6 is:
83
x1    6.0
x2    2.7
x3    5.1
x4    1.6
Name: 83, dtype: float64
Distance to target: 0.1661
 
Number  7 is:
111
x1    6.4
x2    2.7
x3    5.3
x4    1.9
Name: 111, dtype: float64
Distance to target: 0.1721
 
Number  8 is:
138
x1    6.0
x2    3.0
x3    4.8
x4    1.8
Name: 138, dtype: float64
Distance to target: 0.1921
 
N

### Regression problem, solved with mean: 

In [9]:
# Load the regression data set: column y holds the target values,
# columns x1..x3 hold the features.
df_reg = load_csv("./dataset(3)/knn_regression.csv")
Y_reg = list(df_reg["y"])
X_reg = df_reg.loc[:, 'x1':'x3']

# Nearest neighbours of the sample whose target value we want to estimate:
knn_regression_problem = find_K_neighbours(
    X_reg, Y_reg, [6.3, 2.7, 4.91]
)

def sample_mean_value(ten_closest_neighbours):
    """Return the arithmetic mean of the neighbours' target values.

    Parameters:
        ten_closest_neighbours: sized collection of tuples whose element 1 is
            the target value of a neighbour. Any number of neighbours is
            accepted, not only ten.

    Returns:
        The sum of the target values divided by the number of neighbours.

    Raises:
        ValueError: if the collection is empty, because the mean of zero
            values is undefined.
    """
    if len(ten_closest_neighbours) == 0:
        raise ValueError("cannot compute the mean of an empty neighbour list")
    # Accumulate in a local named ``total`` so the builtin sum() is not shadowed.
    total = 0
    for neighbour in ten_closest_neighbours:
        total += neighbour[1]
    return total/len(ten_closest_neighbours)

# kNN regression estimate: the mean target value of the selected neighbours.
res_reg = sample_mean_value(knn_regression_problem)




In [10]:
# Report the regression estimate, then list the selected neighbours in
# ascending order of distance, together with their feature rows.
print("The mean value for the regression problem is the following:", res_reg)

print("The ten closest neighbours for the regression problem are:")

printClosestAsc(knn_regression_problem,X_reg)

    
    
    

The mean value for the regression problem is the following: 1.6099999999999999
The ten closest neighbours for the regression problem are:
Number  1 is:
126
x1    6.2
x2    2.8
x3    4.8
Name: 126, dtype: float64
Distance to target: 0.0321
 
Number  2 is:
72
x1    6.3
x2    2.5
x3    4.9
Name: 72, dtype: float64
Distance to target: 0.0401
 
Number  3 is:
133
x1    6.3
x2    2.8
x3    5.1
Name: 133, dtype: float64
Distance to target: 0.0461
 
Number  4 is:
146
x1    6.3
x2    2.5
x3    5.0
Name: 146, dtype: float64
Distance to target: 0.0481
 
Number  5 is:
73
x1    6.1
x2    2.8
x3    4.7
Name: 73, dtype: float64
Distance to target: 0.0941
 
Number  6 is:
63
x1    6.1
x2    2.9
x3    4.7
Name: 63, dtype: float64
Distance to target: 0.1241
 
Number  7 is:
83
x1    6.0
x2    2.7
x3    5.1
Name: 83, dtype: float64
Distance to target: 0.1261
 
Number  8 is:
127
x1    6.1
x2    3.0
x3    4.9
Name: 127, dtype: float64
Distance to target: 0.1301
 
Number  9 is:
54
x1    6.5
x2    2.8
x3    4.6