In [1]:
from math import sqrt
import csv
import collections
import numpy as np 
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from random import seed
from random import randint

In [2]:
def mode(num_list):
    # print("len of num_list", len(num_list))
    # print("num_list1", num_list)
    num_list = np.array(list(filter(lambda a: a != '?', num_list)))
    # print("num_list2", num_list)
    modev = collections.Counter(num_list).most_common(1)
    mode_val = modev[0][0]
    return mode_val

# calculate the distance between 2 vectors 
def distanceChar(test_row, trainData):
    # print("train data", len(trainData), len(trainData[0]))
    # print("test_row", len(test_row))
    # exit(0)
    # a = trainData - test_row
    b = trainData != test_row
    c = b.astype(np.int)
    distances = np.sum(c, axis = 1)
    return distances.reshape(-1,1)

# Locate the most similar neighbors
def get_neighbors(trainData, test_row, num_neighbors):
    distances = distanceChar(test_row[1:22], trainData[:,1:22])
    # print(distances[0])
    dist = np.append(trainData, distances, axis=1)
    dist = np.array(sorted(dist, key=lambda a_entry: a_entry[-1]))
    # print(distances[0,:])
    neighbors = dist[0:num_neighbors,0]
    # print("NRIGH", neigh`bors[0])
    return neighbors

def prepData(trainData):
    for i in range(len(trainData[0])):
        mode_val = mode(trainData[:,i])
        for j in range(len(trainData[:,i])):
            if trainData[j][i] == '?':
                trainData[j][i] = mode_val
    return trainData

# Make a classification prediction with neighbors
def predict_classification(trainDataFile, num_neighbors):
    with open(trainDataFile,'r') as f:
        reader  = csv.reader(f)
        trainData = np.array(list(reader))
        np.random.shuffle(trainData)
        trainData = prepData(trainData)
        validate = trainData[0:400,:]
        trainData = trainData[400:,:]
        tags = np.empty(len(validate), dtype = "<U10") 
        predict = np.empty(len(validate), dtype = "<U10")
        for index in range(len(validate)):
            # print(index)
            test_row = validate[index,:]
            output_values = get_neighbors(trainData, test_row, num_neighbors)
            b = collections.Counter(output_values)
            prediction = b.most_common()[0][0]
            tags[index] = test_row[0]
            predict[index] = prediction
    return list(tags), list(predict)

In [31]:
def evaluate(tags, predictions, num_neighbors):
#     print("tags", tags)
#     print("predictions", predict)
    print("k = ", num_neighbors)
    print("Accuracy",accuracy_score(tags, predictions))
    print("F1 Score",f1_score(tags, predictions, average='micro'))
    print("Confusion matrix")
    print(confusion_matrix(tags, predictions))

In [48]:
#Inbuilt KNN implementation with missing data replaced by mode
def inbuiltKNN(trainFile, num_neighbor):
    with open(trainFile,'r') as f:
        reader  = csv.reader(f)
        trainData = np.array(list(reader))
        train = np.empty([len(trainData), len(trainData[0])], dtype=int)
        for i in range(len(trainData[0])):
            mode_val = mode(trainData[:,i])
            for j in range(len(trainData[:,i])):
                if trainData[j][i] == '?':
                    trainData[j][i] = mode_val
                train[j][i] = ord(trainData[j][i])
        np.random.shuffle(trainData)
        validate = train[0:400,:]
        train = train[400:,:]
        knn = KNeighborsClassifier(n_neighbors=num_neighbor)
        knn.fit(train[:,1:], train[:,0])
        predict = knn.predict(validate[:,1:])
        tags = validate[:,0]
        return tags, predict

#Random choosing from the training data, with the missing data replaced by mode
def random_classifier(trainFile, num_neighbor):
    with open(trainFile,'r') as f:
        reader  = csv.reader(f)
        train = np.array(list(reader))
        train = prepData(train)
        validate = train[0:400,:]
        train = train[400:,:]
        seed(1)
        tags = np.empty(len(validate), dtype = str)
        predict = np.empty(len(validate), dtype = str)
        for index in range(len(validate)):
            i = randint(0, len(train))
            tags[index] = validate[index,0]
            predict[index] = train[i,0]
    return tags, predict

#Majority choosing(mode), with the missing data replaced by mode
def majority_classifier(trainFile, num_neighbors):
    with open(trainFile,'r') as f:
        reader  = csv.reader(f)
        train = np.array(list(reader))
        np.random.shuffle(train)
        train = prepData(train)
        validate = train[0:400,:]
        train = train[400:,:]
        tags = validate[:,0]
        b = collections.Counter(train[:,0])
        prediction = b.most_common()[0][0]
        predict = np.full(len(validate), prediction)
#         print("predict", predict)
#         print("tag", tags)
        return tags, predict

In [8]:
tags, predict = predict_classification("./Datasets/q2/train.csv", 1)
evaluate(tags, predict, 1)

k =  1
Accuracy 1.0
F1 Score 1.0
Confusion matrix
[[147   0   0]
 [  0   0   0]
 [  0   0 253]]


In [10]:
tags, predict = predict_classification("./Datasets/q2/train.csv", 2)
evaluate(tags, predict, 2)

k =  2
Accuracy 1.0
F1 Score 1.0
Confusion matrix
[[140   0   0]
 [  0   0   0]
 [  0   0 260]]


In [11]:
tags, predict = predict_classification("./Datasets/q2/train.csv", 3)
evaluate(tags, predict, 3)

k =  3
Accuracy 0.99
F1 Score 0.99
Confusion matrix
[[164   0   4]
 [  0   0   0]
 [  0   0 232]]


KNN classifier with k=1 is the best we can do with our implementation. Now we will compare with inbuilt, random choosing and majority choosing applications of KNN

In [34]:
tags, predict = inbuiltKNN("./Datasets/q2/train.csv", 1)
evaluate(tags, predict, 1)

k =  1
Accuracy 1.0
F1 Score 1.0
Confusion matrix
[[400]]


In [39]:
tags, predict = random_classifier("./Datasets/q2/train.csv", 1)
evaluate(tags, predict, 1)

k =  1
Accuracy 0.2975
F1 Score 0.2975
Confusion matrix
[[119 281]
 [  0   0]]


In [50]:
tags, predict = majority_classifier("./Datasets/q2/train.csv", 1)
evaluate(tags, predict, 1)

k =  1
Accuracy 0.6675
F1 Score 0.6675
Confusion matrix
[[  0 133]
 [  0 267]]


KNN implementation is done using numpy, collections and other such libraries. The best accuracy is acheived for k=1. Our implentation is comaprable with the Inbuilt KNN from sklearn library. It also works way better than the baseline cases such as random guessing, majority choosing.