In [1]:
import numpy as np
import operator
from operator import itemgetter
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from matplotlib import pyplot as plt

In [2]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x1, x2 : array-like
        Feature vectors of two different images (anything ``np.asarray``
        can convert to a numeric array).

    Returns
    -------
    numpy.float64
        Square root of the sum of squared element-wise differences.
    """
    # Cast to float before subtracting: with unsigned integer pixel data
    # (e.g. uint8) ``x1 - x2`` wraps around instead of going negative, and
    # squaring small integer types can overflow, silently corrupting the
    # distance.
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

In [None]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Usage: construct with the number of neighbours ``k``, call ``fit`` with
    the training samples and labels, then ``predict`` on new samples.
    """

    def __init__(self, k=3):
        """Store the number of neighbours consulted for each prediction.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 (no neighbours would be available
            to vote).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k

    def fit(self, x_train, y_train):
        """Memorise the training samples and their labels.

        Both inputs are converted with ``np.asarray`` so that the positional
        indices produced in ``predict`` always address the matching label,
        even if the caller passes an index-labelled container.
        """
        self.x_train = np.asarray(x_train)  # training data, one sample per row
        self.y_train = np.asarray(y_train)  # training labels, aligned with rows

    def predict(self, x_test):
        """Predict a label for every sample in ``x_test``.

        Parameters
        ----------
        x_test : array-like
            Samples to classify, one per row.

        Returns
        -------
        list
            One predicted label per test sample, in input order. Ties in the
            vote go to the label that appears first among the nearest
            neighbours (i.e. the closer neighbour wins).
        """
        predictions = []
        for sample in np.asarray(x_test):
            # distance from this test sample to every training sample
            dist = np.array([euclidean_distance(sample, x) for x in self.x_train])
            # positions of the k smallest distances; a stable sort keeps the
            # order of equal distances deterministic
            nearest = np.argsort(dist, kind="stable")[:self.k]
            # count how often each label occurs among the k neighbours
            # (dict keeps first-seen order, nearest neighbour first)
            neigh_count = {}
            for idx in nearest:
                label = self.y_train[idx]
                neigh_count[label] = neigh_count.get(label, 0) + 1
            # most frequent label; max() returns the first maximal key, so
            # ties resolve to the label seen earliest (closest neighbour)
            predictions.append(max(neigh_count, key=neigh_count.get))
        return predictions

In [None]:
# Load the handwritten-digits dataset via scikit-learn's `load_digits`.
# NOTE: the variable is called `mnist`, but the data comes from
# `sklearn.datasets.load_digits`, not from an MNIST download.
mnist = load_digits()

# feature matrix and target labels
X, y = mnist.data, mnist.target

# show the shape of the feature matrix
print(X.shape)

In [None]:
# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Search for the best k: evaluate every odd k from 3 to 99 on the test split
# and record the accuracy obtained for each one.
kVals = np.arange(3, 100, 2)
accuracies = []
for k in kVals:
    # build and "train" a classifier for this k
    model = KNN(k=k)
    model.fit(X_train, y_train)

    # score it on the held-out samples
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

    print(f"k={k}, accuracy={accuracy:.4f}")
    accuracies.append(accuracy)

In [None]:
# Locate the (first) position of the highest accuracy; the k at the same
# position in kVals is the best-performing k.
best_accuracy = max(accuracies)
max_index = accuracies.index(best_accuracy)
print(f"Best k value is {kVals[max_index]} with accuracy {best_accuracy:.4f}")

In [None]:
# Line plot of test accuracy against the number of neighbours k
plt.plot(
    kVals,
    accuracies,
    marker='o',
)
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.title("k vs. Accuracy")
plt.show()

In [None]:
# Re-run the classifier with the best k (kVals[max_index]) and report
# precision, recall, F1 and the overall test accuracy.
model = KNN(k=kVals[max_index])
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# the fourth returned value is not needed here
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions)
acc = accuracy_score(y_true=y_test, y_pred=predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

print(f"Testing Accuracy: {acc:.4f}")
