In [None]:
"""
@Topic: K-NN Classifier
@Author: Shiwen Xu
@Email: shxu4542@uni.sydney.edu.au
@Date: 08/15/2022
@Credit: COMP5318 Week2 Tutorial
@TODO: 
    1. Build a K-Nearest Neighbor Classifier using Euclidean distance ✅
    2. Add distance-weighted method ✅
"""

In [128]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
import math

In [129]:
# Load the iris data and hold out a test split.
# random_state is fixed so the split is the same on every run.
iris_dataset = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    random_state=3,
)

In [130]:
# proximity measurement
def cal_distance(x1, x2):
    '''
    @description: calculate the Euclidean distance (L2 norm) between two points
    @x1: point 1; array-like of numbers
    @x2: point 2; array-like of numbers with the same shape as x1
    @return: the distance as a Python float
    '''
    # Convert to float64 before subtracting: unsigned / narrow integer arrays
    # wrap around on subtraction (e.g. uint8 0 - 20 == 236), which would
    # silently produce a wrong distance.
    diff = np.subtract(np.asarray(x1, dtype=float), np.asarray(x2, dtype=float))
    # Vectorised sum of squares instead of a Python-level loop over elements.
    return math.sqrt(float(np.sum(diff * diff)))

In [203]:
def get_label(test_entry, X_train, y_train, k, weighted=False):
    '''
    @description: predict the label of one record by voting among its k nearest
                  training records (distances are computed with cal_distance)
    @test_entry: one test record
    @X_train: training set data
    @y_train: training set target; y_train[i] is the label of X_train[i]
    @k: hyperparameter; # of nearest neighbor (must be positive)
    @weighted: if True, every neighbor votes with weight 1 / distance; neighbors
               at distance 0 (exact matches) decide the vote on their own
               instead of triggering a division by zero
    @return: return the predicted label
    @raises ValueError: if k is not positive or the training set is empty
    '''
    if k <= 0:
        raise ValueError("k must be positive, got {}".format(k))
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one record")

    # store the index of the k nearest record -> its distance to test_entry
    k_dict = {}
    for i in range(len(X_train)):
        dist = cal_distance(X_train[i], test_entry)
        if len(k_dict) < k:
            k_dict[i] = dist
        else:
            # Replace the current farthest neighbor only when this record is
            # strictly closer, so earlier records win exact distance ties.
            farthest = max(k_dict, key=k_dict.get)
            if k_dict[farthest] > dist:
                del k_dict[farthest]
                k_dict[i] = dist

    if weighted:
        # An exact match would need an infinite weight (1 / 0). Let exact
        # matches vote with weight 1 and ignore every other neighbor.
        exact_matches = [idx for idx in k_dict if k_dict[idx] == 0]
        if exact_matches:
            votes = [(idx, 1.0) for idx in exact_matches]
        else:
            votes = [(idx, 1 / k_dict[idx]) for idx in k_dict]

        # accumulate the vote weight per label
        w_dist = {}
        for idx, weight in votes:
            label = y_train[idx]
            w_dist[label] = w_dist.get(label, 0.0) + weight
        # on equal totals the label that entered w_dist first wins
        return max(w_dist, key=w_dist.get)
    else:
        # Majority vote; np.bincount needs non-negative integer labels, and
        # argmax resolves a tied vote in favour of the smallest label.
        type_list = [y_train[key] for key in k_dict]
        pred_label = np.bincount(type_list).argmax()
        return pred_label
    

In [204]:
def knn_predict(X_train, y_train, test_data, k, weighted=False):
    '''
    @description: predict a label for every record in test_data
    @X_train: training set data
    @y_train: training set target
    @test_data: records to classify; accessed by position 0 .. len(test_data) - 1
    @k: hyperparameter; # of nearest neighbor
    @weighted: truthy -> get_label is asked for its weighted vote
    @return: numpy array with one predicted label per test record
    '''
    # get_label measures the distance from each test record to every training
    # record; this function only collects its answers.
    return np.array([
        get_label(test_data[idx], X_train, y_train, k, weighted=bool(weighted))
        for idx in range(len(test_data))
    ])

In [205]:
# sklearn output
# Reference run with scikit-learn's classifier: 11 neighbours, default weights.
knn = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
print(knn.predict(X_test))
print(y_test)
# score() predicts on X_test itself and compares the result with y_test
print("Accuracy on test set: {:.2f}".format(knn.score(X_test, y_test)))

[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 2
 0]
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0]
Accuracy on test set: 0.97


In [206]:
# my own output
# Hand-written k-NN with 11 neighbours and an unweighted vote.
y_predict = knn_predict(X_train, y_train, test_data=X_test, k=11)
print(y_predict)
print(y_test)
# accuracy = (# of records where y_predict equals y_test) / (# of test records)
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_predict)))

[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 2
 0]
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0]
Accuracy on test set: 0.97


In [217]:
# weighted-distance knn
# weight points by the inverse of their distance
# Hand-written k-NN with 3 neighbours, each voting with weight 1 / distance.
y_predict = knn_predict(X_train, y_train, test_data=X_test, k=3, weighted=True)
print(y_predict)
print(y_test)
# accuracy = (# of records where y_predict equals y_test) / (# of test records)
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_predict)))

[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 2 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 2
 0]
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0]
Accuracy on test set: 0.95


In [218]:
# sklearn weighted distance knn
# sklearn output
# Reference run with scikit-learn: 3 neighbours, weights='distance'.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(X_train, y_train)
print(knn.predict(X_test))
print(y_test)
# score() predicts on X_test itself and compares the result with y_test
print("Accuracy on test set: {:.2f}".format(knn.score(X_test, y_test)))

[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 2 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 2
 0]
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0]
Accuracy on test set: 0.95
