# kNN implementation, classification and regression

In [90]:
# Implement k-nearest neighbours (kNN).
# Reuse the program to implement kNN regression (based on a simple mean) and classification (voting)
# https://hlab.stanford.edu/brian/euclidean_distance_in.html
# https://www.researchgate.net/figure/Pseudocode-for-KNN-classification_fig7_260397165
    

import pandas as pd
import numpy as np
import bisect

In [91]:
def load_csv(path):
    """Read the CSV at ``path`` with ``pd.read_csv`` and return the DataFrame."""
    return pd.read_csv(path)



In [92]:
# Returns the squared Euclidean distance between two points in n-dimensional space:
def findDistance(p1, p2):
    """Return the sum of squared coordinate differences between ``p1`` and ``p2``.

    No square root is taken, so the result is the *squared* Euclidean
    distance. Squaring is monotonic for non-negative values, so ranking
    points by this value gives the same order as ranking by true distance.

    Parameters
    ----------
    p1, p2 : sized iterables of numbers
        Coordinates of the two points (e.g. lists, tuples or pandas Series).
        They are paired by position and must have the same length.

    Returns
    -------
    number
        Sum of ``(a - b) ** 2`` over the paired coordinates; ``0`` when both
        points are empty.

    Raises
    ------
    ValueError
        If the two points do not have the same number of coordinates.
    """
    if len(p1) != len(p2):
        raise ValueError(
            "points must have the same number of dimensions "
            "(got %d and %d)" % (len(p1), len(p2))
        )
    euclid_dist_sum = 0
    # Iterate over the values instead of indexing with p1[i]: integer keys on
    # a label-indexed pandas Series (such as a row from iterrows) are a
    # deprecated form of positional access.
    for a, b in zip(p1, p2):
        euclid_dist_sum += (a - b) ** 2
    return euclid_dist_sum

## General implementation of kNN:

In [93]:
# kNN implementation,
# Xin is the training data,
# Yin is the class labels (supervised)
# x is the unknown sample
def find_K_neighbours(Xin, Yin, x, k=10, exclude_closest=True):
    """Return the ``k`` training points closest to ``x``.

    Parameters
    ----------
    Xin : pandas.DataFrame
        Training features, one row per training point.
    Yin : indexable
        Target values. The target for a row is looked up as ``Yin[rownum]``,
        where ``rownum`` is the row's index label from ``Xin.iterrows()``.
    x : sequence of numbers
        The query point, with one coordinate per feature column.
    k : int, default 10
        Number of neighbours to return.
    exclude_closest : bool, default True
        When true, the single closest training point is discarded before the
        ``k`` neighbours are taken. It is discarded unconditionally, whether
        or not its distance is zero, so pass ``False`` when ``x`` is not
        itself a row of the training data.

    Returns
    -------
    list of tuple
        Up to ``k`` ``(distance, target)`` tuples in ascending order, where
        ``distance`` is the value returned by ``findDistance``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # One sort over all (distance, target) pairs yields the same ascending
    # order as inserting each pair into a sorted list one at a time.
    asc_neig = sorted(
        (findDistance(train_point, x), Yin[rownum])
        for rownum, train_point in Xin.iterrows()
    )

    # Removing the closest one, which is the actual point from the training data:
    if exclude_closest:
        asc_neig = asc_neig[1:]

    return asc_neig[:k]

### Classification problem, solved with voting: 

In [101]:
# Load the classification data and split it into features and targets.
df_class = load_csv("./dataset(3)/knn_classification.csv")
X_class = df_class.loc[:, 'x1':'x4']
Y_class = list(df_class["y"])

# Neighbours of the sample we want to classify.
knn_classification_problem = find_K_neighbours(
    X_class,
    Y_class,
    [6.3, 2.7, 4.91, 1.8],
)

# Voting:
# Collect each target value once, in the order it first appears in the training data:
distinct_vals_Y = []
for y in Y_class:
    if y in distinct_vals_Y:
        continue
    distinct_vals_Y.append(y)

# Count the values

def count_votes(neighbours=None, classes=None):
    """Return the class label that occurs most often among the neighbours.

    Parameters
    ----------
    neighbours : sequence, optional
        Neighbour entries whose element at position 1 is the class label.
        Defaults to the module-level ``knn_classification_problem``.
    classes : sequence, optional
        The candidate class labels. Defaults to the module-level
        ``distinct_vals_Y``. Neighbour labels not present here are ignored.

    Returns
    -------
    The label from ``classes`` with the most votes. On a tie, the label that
    comes first in ``classes`` wins.

    Raises
    ------
    ValueError
        If there are no neighbours to vote, or no candidate classes.
    """
    if neighbours is None:
        neighbours = knn_classification_problem
    if classes is None:
        classes = distinct_vals_Y
    if len(neighbours) == 0:
        raise ValueError("cannot vote without any neighbours")

    votes = [0] * len(classes)
    for neighbour in neighbours:
        # Compare against the class label itself rather than its position in
        # the list, so labels need not be the integers 0..n-1.
        for position, label in enumerate(classes):
            if neighbour[1] == label:
                votes[position] += 1
                break
    # max() raises ValueError when `classes` is empty.
    return classes[votes.index(max(votes))]


# Let the neighbours found above vote on the class.
classifier_result = count_votes()

print("The datapoint is in the class: ", classifier_result)
print("The ten closest neighbours for the classification problem are:")

# The first element of each neighbour tuple is the value from findDistance;
# it is shown rounded to four decimals.
for elem in knn_classification_problem:
    distance = elem[0]
    print(round(distance, 4))


The datapoint is in the class:  2
The ten closest neighbours for the classification problem are:
0.0321
0.0581
0.1301
0.1301
0.1361
0.1661
0.1721
0.1921
0.2361
0.2541


### Regression problem, solved with mean: 

In [103]:
# Load the regression data and split it into features and targets:
df_reg = load_csv("./dataset(3)/knn_regression.csv")
X_reg = df_reg.loc[:, 'x1':'x3']
Y_reg = list(df_reg["y"])

# Neighbours of the sample whose target value we want to estimate.
knn_regression_problem = find_K_neighbours(
    X_reg,
    Y_reg,
    [6.3, 2.7, 4.91],
)

def sample_mean_value(ten_closest_neighbours):
    """Return the arithmetic mean of the neighbours' target values.

    Each neighbour must be indexable, with its target value at position 1;
    position 0 is not read. An empty input raises ``ZeroDivisionError``.
    """
    running_total = 0
    # Explicit left-to-right accumulation, starting from integer zero.
    for neighbour in ten_closest_neighbours:
        running_total = running_total + neighbour[1]
    neighbour_count = len(ten_closest_neighbours)
    return running_total / neighbour_count

# The regression estimate is the mean target value of the neighbours.
res_reg = sample_mean_value(knn_regression_problem)
print("The mean value for the regression problem is the following:", res_reg)

print("The ten closest neighbours for the regression problem are:")
# The first element of each neighbour tuple is the value from findDistance;
# it is shown rounded to four decimals.
for elem in knn_regression_problem:
    distance = elem[0]
    print(round(distance, 4))



The mean value for the regression problem is the following: 1.6099999999999999
The ten closest neighbours for the regression problem are:
0.0321
0.0401
0.0461
0.0481
0.0941
0.1241
0.1261
0.1301
0.1461
0.1621
