In [1]:
import csv
import random
import math
import operator
import pandas as pd
import numpy as np


# function to read dataset from file
def readDataset(filename):
    dataset = pd.read_csv(filename, sep=",").to_numpy()
    dataset = np.where(dataset == 'I', 0, dataset)
    dataset = np.where(dataset == 'F', 1, dataset)
    dataset = np.where(dataset == 'M', 2, dataset)
    
    # splitting dataset into training set and test set
    testData = []
    trainingData = []
    split_prob = 0.80
    for i in range(dataset.shape[0]):
        if random.random() < split_prob:
            trainingData.append(dataset[i])
        else:
            testData.append(dataset[i])
    return trainingData, testData


# function to get euclidean distance
def getEuclideanDistance(trainingValue, testValue):
    distance = 0
    for i in range(len(testValue) - 1):
        distance += pow((trainingValue[i] - testValue[i]), 2)
    return math.sqrt(distance)


# function to get k closest neighbours of the test value
def getClosestNeighbors(trainingData, testValue, k):
    distances = []
    
    # calculating all distances between training data and test value
    for i in range(len(trainingData)):
        distance = getEuclideanDistance(trainingData[i], testValue)
        distances.append((trainingData[i], distance))
        
    # sorting on the basis of distance
    distances.sort(key=operator.itemgetter(1))
    
    # finding min k closest neighbours
    closestNeighbors = []
    for i in range(k):
        closestNeighbors.append(distances[i][0])
    return closestNeighbors


# function to get predicted value i.e. the max class label value in all k closest neighbours
def getPrediction(closestNeighbors):
    totalClassVotes = {}
    for i in range(len(closestNeighbors)):
        prediction = closestNeighbors[i][-1]
        if prediction not in totalClassVotes:
            totalClassVotes[prediction] = 1
        else:
             totalClassVotes[prediction] += 1
    totalSortedVotes = sorted(totalClassVotes.items(), key=lambda kv:(kv[1], kv[0]), reverse=True)
    return totalSortedVotes[0][0]


# function to get accuracy
def getAccuracy(testData, predictedValues):
    correct = 0
    for i in range(len(testData)):
        if testData[i][-1] == predictedValues[i]:
            correct += 1
    accuracy = (correct/float(len(testData))) * 100.0
    return accuracy
    
    
# function to get precision
def getPrecision(testData, predictedValues, value):
    correct = 0
    wrong = 0
    for i in range(len(testData)):
        if testData[i][-1] == predictedValues[i] and testData[i][-1] == value:
            correct += 1
        elif testData[i][-1] != predictedValues[i] and predictedValues[i] == value:
            wrong += 1
    if correct == 0 and wrong == 0:
        return 0
    return (correct/float(correct + wrong)) * 100.0
    
    
# function to get recall
def getRecall(testData, predictedValues, value):
    correct = 0
    wrong = 0
    for i in range(len(testData)):
        if testData[i][-1] == predictedValues[i] and testData[i][-1] == value:
            correct += 1
        elif testData[i][-1] != predictedValues[i] and testData[i][-1] == value:
            wrong += 1
    if correct == 0 and wrong == 0:
        return 0
    return (correct/float(correct + wrong)) * 100.0
    
    
# function to get f1-score
def getF1Score(testData, predictedValues, value):
    precision = getPrecision(testData, predictedValues, value)
    recall = getRecall(testData, predictedValues, value)
    if precision == 0 and recall == 0:
        return 0
    f1_score = (2*precision*recall)/(precision + recall)
    return f1_score
    
    
# function print accuracy, precision, recall and f1-score
def printEvaluations(testData, predictedValues):
    # printing accuracy
    accuracy = getAccuracy(testData, predictedValues)
    print("\nFinal Accuracy: {0}%\n".format(accuracy))
    
    # finding unique class labels in test data
    classLabels = []
    for i in testData:
        classLabels.append(i[len(testData[0]) - 1])
    classLabels = np.unique(classLabels)
    
    # printing precision, recall, f1-score of each class label in test data
    for classLabel in classLabels:
        # printing class label
        print("Class Label: " + repr(classLabel))
        
        # printing precision
        precision = getPrecision(testData, predictedValues, classLabel)
        print("\tPrecision: {0}%".format(precision))
        
        # printing recall
        recall = getRecall(testData, predictedValues, classLabel)
        print("\tRecall: {0}%".format(recall))
        
        # printing f1-score
        f1_score = getF1Score(testData, predictedValues, classLabel)
        print("\tF1-Score: {0}%\n".format(f1_score))
    
    
# main function
if __name__ == "__main__":
    # reading from file and printing
    trainingData, testData = readDataset('abalone.data')
    print("Training Data: " + repr(len(trainingData)))
    print("Test Data: " + repr(len(testData)))
    
    k = 5   # total number of closest neighbours
    print("Value of k: " + repr(k))
    
    # finding predictions of the test data
    predictedValues=[]
    for i in range(len(testData)):
        closestNeighbors = getClosestNeighbors(trainingData, testData[i], k)
        prediction = getPrediction(closestNeighbors)
        predictedValues.append(prediction)
    
    # printing evaluations
    printEvaluations(testData, predictedValues)
    

Training Data: 3374
Test Data: 802
Value of k: 5

Final Accuracy: 19.45137157107232%

Class Label: 3
	Precision: 50.0%
	Recall: 20.0%
	F1-Score: 28.571428571428573%

Class Label: 4
	Precision: 44.44444444444444%
	Recall: 33.33333333333333%
	F1-Score: 38.095238095238095%

Class Label: 5
	Precision: 26.31578947368421%
	Recall: 19.230769230769234%
	F1-Score: 22.22222222222222%

Class Label: 6
	Precision: 39.39393939393939%
	Recall: 20.0%
	F1-Score: 26.530612244897956%

Class Label: 7
	Precision: 23.25581395348837%
	Recall: 29.411764705882355%
	F1-Score: 25.974025974025974%

Class Label: 8
	Precision: 20.87912087912088%
	Recall: 18.446601941747574%
	F1-Score: 19.587628865979383%

Class Label: 9
	Precision: 21.73913043478261%
	Recall: 24.59016393442623%
	F1-Score: 23.076923076923077%

Class Label: 10
	Precision: 19.736842105263158%
	Recall: 26.31578947368421%
	F1-Score: 22.556390977443606%

Class Label: 11
	Precision: 16.3265306122449%
	Recall: 17.97752808988764%
	F1-Score: 17.1122994652406