In [18]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import csv
import numpy as np 
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from random import seed
from random import randint

In [2]:
# calculate the Euclidean distance between one vector and every row of a matrix
def euclidean_distance(test_row, train):
    """Return the Euclidean (L2) distance from ``test_row`` to each row of ``train``.

    Parameters
    ----------
    test_row : array-like of shape (n_features,)
        The query vector.
    train : array-like of shape (n_samples, n_features)
        One candidate vector per row.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1), dtype float64
        Column vector holding one distance per row of ``train``.
    """
    # Cast to float before subtracting: unsigned-integer inputs (e.g. uint8
    # pixel values) would otherwise wrap around instead of going negative.
    diff = np.asarray(train, dtype=np.float64) - np.asarray(test_row, dtype=np.float64)
    # Take the square root so the result is a true distance rather than a
    # squared one; sqrt is monotonic, so neighbour ranking is unaffected.
    distances = np.sqrt(np.sum(diff ** 2, axis=1))
    return distances.reshape(-1, 1)

# calculate the Manhattan distance between one vector and every row of a matrix
def manhattan_distance(test_row, train):
    """Return the Manhattan (L1) distance from ``test_row`` to each row of ``train``.

    Parameters
    ----------
    test_row : array-like of shape (n_features,)
        The query vector.
    train : array-like of shape (n_samples, n_features)
        One candidate vector per row.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1), dtype float64
        Column vector holding one distance per row of ``train``.
    """
    # Cast to float before subtracting: with unsigned-integer inputs (e.g.
    # uint8 pixel values) ``1 - 2`` would wrap to 255 instead of -1.
    diff = np.asarray(train, dtype=np.float64) - np.asarray(test_row, dtype=np.float64)
    distances = np.sum(np.absolute(diff), axis=1)
    return distances.reshape(-1, 1)

# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors, distance=None):
    """Return the labels of the ``num_neighbors`` training rows nearest to ``test_row``.

    Column 0 of ``train`` (and element 0 of ``test_row``) is treated as the
    class label; all remaining columns are the features the distance is
    computed on.

    Parameters
    ----------
    train : numpy.ndarray of shape (n_samples, 1 + n_features)
        Labelled training rows.
    test_row : numpy.ndarray of shape (1 + n_features,)
        The row to classify; its own label (element 0) is ignored.
    num_neighbors : int
        How many nearest labels to return.
    distance : callable, optional
        ``distance(test_features, train_features)`` returning one distance per
        training row. Defaults to ``manhattan_distance``.

    Returns
    -------
    numpy.ndarray
        Labels ordered from nearest to farthest; equidistant rows keep their
        order of appearance in ``train``.
    """
    if distance is None:
        distance = manhattan_distance
    distances = np.asarray(distance(test_row[1:], train[:, 1:])).ravel()
    # A stable argsort yields the same tie order as Python's stable sorted(),
    # without copying the training matrix or sorting whole rows in Python.
    nearest = np.argsort(distances, kind="stable")[:num_neighbors]
    return train[nearest, 0]

# Make a classification prediction with neighbors
def predict_classification(trainFile, num_neighbors, num_validate=400):
    """Run the hand-written KNN classifier on a hold-out slice of a CSV file.

    The CSV is read with ``csv.QUOTE_NONNUMERIC`` (every unquoted field becomes
    a float). The first ``num_validate`` rows are held out for validation and
    the remaining rows act as the training set. Column 0 is the class label.

    Parameters
    ----------
    trainFile : str
        Path of the CSV file.
    num_neighbors : int
        Number of neighbours that vote on each prediction (must be >= 1).
    num_validate : int, optional
        Number of leading rows used as the validation set (default 400).

    Returns
    -------
    tags : numpy.ndarray
        True labels of the validation rows.
    predict : numpy.ndarray
        Predicted labels, aligned with ``tags``.

    Raises
    ------
    ValueError
        If ``num_neighbors`` is smaller than 1.
    """
    if num_neighbors < 1:
        raise ValueError("num_neighbors must be at least 1")

    # Only the parsing needs the file handle; close it before the slow KNN loop.
    with open(trainFile, 'r', newline='') as f:
        data = np.array(list(csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)))
    validate, train = data[:num_validate, :], data[num_validate:, :]

    tags = validate[:, 0].astype(float)
    predict = np.zeros(len(validate))
    for index, test_row in enumerate(validate):
        votes = Counter(get_neighbors(train, test_row, num_neighbors))
        # On a tied vote the label encountered first (the nearer neighbour) wins.
        predict[index] = votes.most_common(1)[0][0]
    return tags, predict

In [3]:
def evaluate(tags, predictions, num_neighbors):
    """Print a short evaluation report for one classifier run.

    Prints, in order: the value of ``num_neighbors``, the accuracy, the
    micro-averaged F1 score and the confusion matrix of ``predictions``
    against the true labels ``tags``. Nothing is returned.
    """
    print("k = ", num_neighbors)
    accuracy = accuracy_score(tags, predictions)
    print("Accuracy", accuracy)
    micro_f1 = f1_score(tags, predictions, average='micro')
    print("F1 Score", micro_f1)
    print("Confusion matrix")
    matrix = confusion_matrix(tags, predictions)
    print(matrix)


KNN classifier for K = 1 and Manhattan distance

In [4]:
# Hold-out run with k=1; uses whichever get_neighbors is currently defined (the Manhattan one if cells ran in order).
tags, predict = predict_classification("./Datasets/q1/train.csv", 1)

In [5]:
# Print accuracy, micro-averaged F1 and the confusion matrix for the k=1 predictions computed in the previous cell.
evaluate(tags, predict, 1)


k =  1
Accuracy 0.95
F1 Score 0.9500000000000001
Confusion matrix
[[39  0  0  0  0  0  0  0  0  0]
 [ 0 47  0  0  0  0  0  0  0  0]
 [ 0  0 31  1  0  0  0  0  0  0]
 [ 0  1  0 38  0  0  0  0  0  2]
 [ 0  1  0  0 26  0  0  0  0  2]
 [ 0  0  0  2  0 34  0  0  0  0]
 [ 0  0  0  0  0  0 56  0  0  0]
 [ 0  1  0  0  0  0  0 35  0  0]
 [ 0  2  0  0  0  1  0  0 33  2]
 [ 0  0  0  0  0  0  0  5  0 41]]


KNN classifier for K = 2 and Manhattan distance

In [6]:
# Same hold-out run with k=2; still uses the currently defined get_neighbors (Manhattan if cells ran in order).
tags, predict = predict_classification("./Datasets/q1/train.csv", 2)

In [7]:
# Print accuracy, micro-averaged F1 and the confusion matrix for the k=2 predictions computed in the previous cell.
evaluate(tags, predict, 2)

k =  2
Accuracy 0.96
F1 Score 0.96
Confusion matrix
[[39  0  0  0  0  0  0  0  0  0]
 [ 0 47  0  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0  0]
 [ 0  1  0 37  0  0  0  0  0  3]
 [ 0  1  0  0 25  0  0  0  0  3]
 [ 0  0  0  2  0 34  0  0  0  0]
 [ 0  0  0  0  0  0 56  0  0  0]
 [ 0  1  0  0  0  0  0 35  0  0]
 [ 0  1  1  0  0  0  0  0 34  2]
 [ 0  0  0  0  0  0  0  1  0 45]]


In [8]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors, distance=None):
    """Return the labels of the ``num_neighbors`` training rows nearest to ``test_row``.

    This definition replaces the earlier ``get_neighbors`` in the notebook and
    switches the default metric to ``euclidean_distance``. Column 0 of
    ``train`` (and element 0 of ``test_row``) is treated as the class label;
    all remaining columns are the features the distance is computed on.

    Parameters
    ----------
    train : numpy.ndarray of shape (n_samples, 1 + n_features)
        Labelled training rows.
    test_row : numpy.ndarray of shape (1 + n_features,)
        The row to classify; its own label (element 0) is ignored.
    num_neighbors : int
        How many nearest labels to return.
    distance : callable, optional
        ``distance(test_features, train_features)`` returning one distance per
        training row. Defaults to ``euclidean_distance``.

    Returns
    -------
    numpy.ndarray
        Labels ordered from nearest to farthest; equidistant rows keep their
        order of appearance in ``train``.
    """
    if distance is None:
        distance = euclidean_distance
    distances = np.asarray(distance(test_row[1:], train[:, 1:])).ravel()
    # A stable argsort yields the same tie order as Python's stable sorted(),
    # without copying the training matrix or sorting whole rows in Python.
    nearest = np.argsort(distances, kind="stable")[:num_neighbors]
    return train[nearest, 0]

KNN classifier for K = 1 and Euclidean distance

In [9]:
# Hold-out run with k=1 after get_neighbors was redefined above to use euclidean_distance.
tags, predict = predict_classification("./Datasets/q1/train.csv", 1)

In [10]:
# Print accuracy, micro-averaged F1 and the confusion matrix for the k=1 predictions computed in the previous cell.
evaluate(tags, predict, 1)

k =  1
Accuracy 0.97
F1 Score 0.97
Confusion matrix
[[39  0  0  0  0  0  0  0  0  0]
 [ 0 47  0  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0  0]
 [ 0  1  0 38  0  0  0  0  0  2]
 [ 0  1  0  0 27  0  0  0  0  1]
 [ 0  0  0  1  0 35  0  0  0  0]
 [ 0  0  0  0  0  0 56  0  0  0]
 [ 0  0  0  0  0  0  0 36  0  0]
 [ 0  1  0  0  0  1  0  0 34  2]
 [ 0  0  0  0  0  0  0  2  0 44]]


KNN classifier for K = 2 and Euclidean distance

In [11]:
# Same hold-out run with k=2, still using the redefined (Euclidean) get_neighbors.
tags, predict = predict_classification("./Datasets/q1/train.csv", 2)

In [12]:
# Print accuracy, micro-averaged F1 and the confusion matrix for the k=2 predictions computed in the previous cell.
evaluate(tags, predict, 2)

k =  2
Accuracy 0.9625
F1 Score 0.9625000000000001
Confusion matrix
[[39  0  0  0  0  0  0  0  0  0]
 [ 0 46  0  0  0  0  0  0  1  0]
 [ 0  0 32  0  0  0  0  0  0  0]
 [ 0  1  0 37  0  0  0  0  1  2]
 [ 0  1  0  0 26  0  0  0  0  2]
 [ 0  0  0  2  0 34  0  0  0  0]
 [ 1  0  0  0  0  0 55  0  0  0]
 [ 0  0  0  0  0  0  0 36  0  0]
 [ 0  1  1  0  0  0  0  0 35  1]
 [ 0  0  0  0  0  0  0  1  0 45]]


Accuracy and the other metrics are best for Euclidean distance with k = 1. For this setting we therefore compare the results with the inbuilt KNN classifier from sklearn, and with two baselines: random guessing and always choosing the majority class.

In [21]:
def inbuiltKNN(trainFile, num_neighbor, num_validate=400):
    """Classify the hold-out rows of a CSV file with scikit-learn's KNN.

    The CSV is read with ``csv.QUOTE_NONNUMERIC`` (every unquoted field becomes
    a float). The first ``num_validate`` rows are held out for validation and
    the remaining rows are used to fit the model. Column 0 is the class label.
    ``KNeighborsClassifier`` is created with library defaults apart from
    ``n_neighbors``.

    Parameters
    ----------
    trainFile : str
        Path of the CSV file.
    num_neighbor : int
        Value passed as ``n_neighbors`` to ``KNeighborsClassifier``.
    num_validate : int, optional
        Number of leading rows used as the validation set (default 400).

    Returns
    -------
    tags : numpy.ndarray
        True labels of the validation rows.
    predict : numpy.ndarray
        Labels predicted by the fitted classifier, aligned with ``tags``.
    """
    # Only the parsing needs the file handle; close it before fitting.
    with open(trainFile, 'r', newline='') as f:
        data = np.array(list(csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)))
    validate, train = data[:num_validate, :], data[num_validate:, :]

    knn = KNeighborsClassifier(n_neighbors=num_neighbor)
    knn.fit(train[:, 1:], train[:, 0])
    predict = knn.predict(validate[:, 1:])
    tags = validate[:, 0]
    return tags, predict

def random_classifier(trainFile, num_neighbor, num_validate=400):
    """Baseline: predict each hold-out row with the label of a random training row.

    The CSV is read with ``csv.QUOTE_NONNUMERIC`` (every unquoted field becomes
    a float). The first ``num_validate`` rows are held out for validation and
    the remaining rows form the pool labels are sampled from. Column 0 is the
    class label.

    Parameters
    ----------
    trainFile : str
        Path of the CSV file.
    num_neighbor : int
        Unused; kept so the signature matches the other classifiers.
    num_validate : int, optional
        Number of leading rows used as the validation set (default 400).

    Returns
    -------
    tags : numpy.ndarray
        True labels of the validation rows.
    predict : numpy.ndarray
        Randomly drawn labels, aligned with ``tags``.

    Raises
    ------
    ValueError
        If no rows remain after the validation split.

    Notes
    -----
    Calls ``seed(1)``, i.e. it resets the global ``random`` generator so that
    repeated runs produce the same guesses.
    """
    with open(trainFile, 'r', newline='') as f:
        data = np.array(list(csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)))
    validate, train = data[:num_validate, :], data[num_validate:, :]
    if len(train) == 0:
        raise ValueError("no training rows left to sample labels from")

    seed(1)
    tags = validate[:, 0].astype(float)
    predict = np.zeros(len(validate))
    last_index = len(train) - 1
    for index in range(len(validate)):
        # randint includes BOTH endpoints, so the upper bound must be the last
        # valid row index; randint(0, len(train)) could index one past the end.
        predict[index] = train[randint(0, last_index), 0]
    return tags, predict

def majority_classifier(trainFile, num_neighbors, num_validate=400):
    """Baseline: predict the most frequent training label for every hold-out row.

    The CSV is read with ``csv.QUOTE_NONNUMERIC`` (every unquoted field becomes
    a float). The first ``num_validate`` rows are held out for validation and
    the remaining rows supply the label counts. Column 0 is the class label.

    Parameters
    ----------
    trainFile : str
        Path of the CSV file.
    num_neighbors : int
        Unused; kept so the signature matches the other classifiers.
    num_validate : int, optional
        Number of leading rows used as the validation set (default 400).

    Returns
    -------
    tags : numpy.ndarray
        True labels of the validation rows.
    predict : numpy.ndarray
        The single majority label repeated once per validation row.

    Raises
    ------
    ValueError
        If no rows remain after the validation split.
    """
    with open(trainFile, 'r', newline='') as f:
        data = np.array(list(csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)))
    validate, train = data[:num_validate, :], data[num_validate:, :]
    if len(train) == 0:
        raise ValueError("no training rows left to take the majority label from")

    tags = validate[:, 0]
    # With tied counts the label that appears first in the training rows wins.
    prediction = Counter(train[:, 0]).most_common(1)[0][0]
    predict = np.full(len(validate), prediction)
    return tags, predict

Inbuilt KNN implementation using sklearn

In [22]:
# Reference run: scikit-learn's KNeighborsClassifier with k=1 on the same validation/training split.
tags, predict = inbuiltKNN("./Datasets/q1/train.csv", 1)

In [23]:
# Print accuracy, micro-averaged F1 and the confusion matrix for the scikit-learn predictions from the previous cell.
evaluate(tags, predict, 1)

k =  1
Accuracy 0.97
F1 Score 0.97
Confusion matrix
[[39  0  0  0  0  0  0  0  0  0]
 [ 0 47  0  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0  0]
 [ 0  1  0 38  0  0  0  0  0  2]
 [ 0  1  0  0 27  0  0  0  0  1]
 [ 0  0  0  1  0 35  0  0  0  0]
 [ 0  0  0  0  0  0 56  0  0  0]
 [ 0  0  0  0  0  0  0 36  0  0]
 [ 0  1  0  0  0  1  0  0 34  2]
 [ 0  0  0  0  0  0  0  2  0 44]]


Random guessing from the training data

In [24]:
# Baseline: labels drawn at random from the training rows (the second argument is not used by random_classifier).
tags, predict = random_classifier("./Datasets/q1/train.csv", 1)

In [25]:
# Print the metrics for the random baseline; the "k" value passed here is only echoed in the report.
evaluate(tags, predict, 1)

k =  1
Accuracy 0.105
F1 Score 0.10499999999999998
Confusion matrix
[[ 4  4  3  2  5  4  4  1  5  7]
 [ 4  4  4  8  4  5  2  8  4  4]
 [ 7  1  4  3  1  4  1  3  3  5]
 [ 4  6  6  3  1  2  7  2  4  6]
 [ 4  3  3  1  2  1  9  5  0  1]
 [ 6  3  2  2  2  4  4  4  5  4]
 [ 8  4  6  9  6  5 10  4  3  1]
 [ 2  5  3  4  7  4  2  4  2  3]
 [ 3  4  4  1  5  3  8  6  2  2]
 [ 2  5  6  5  5  4  3  6  5  5]]


Majority (mode) from the training data

In [26]:
# Baseline: always predict the most frequent training label (the second argument is not used by majority_classifier).
tags, predict = majority_classifier("./Datasets/q1/train.csv", 1)

In [27]:
# Print the metrics for the majority baseline; the "k" value passed here is only echoed in the report.
evaluate(tags, predict, 1)

k =  1
Accuracy 0.1175
F1 Score 0.1175
Confusion matrix
[[ 0 39  0  0  0  0  0  0  0  0]
 [ 0 47  0  0  0  0  0  0  0  0]
 [ 0 32  0  0  0  0  0  0  0  0]
 [ 0 41  0  0  0  0  0  0  0  0]
 [ 0 29  0  0  0  0  0  0  0  0]
 [ 0 36  0  0  0  0  0  0  0  0]
 [ 0 56  0  0  0  0  0  0  0  0]
 [ 0 36  0  0  0  0  0  0  0  0]
 [ 0 38  0  0  0  0  0  0  0  0]
 [ 0 46  0  0  0  0  0  0  0  0]]


The KNN classifier is implemented using numpy, collections and other such libraries. Two different distances are compared here: Manhattan distance and Euclidean distance. The best accuracy (0.97) is achieved with Euclidean distance and k = 1; with Manhattan distance, k = 2 (0.96) is slightly better than k = 1 (0.95). Our implementation is comparable with the inbuilt KNN from the sklearn library, which produces the same confusion matrix for k = 1. It also works far better than the baseline cases, random guessing and majority choosing.