In [2]:
import numpy as np
from scipy.spatial import distance
import pandas as pd
from scipy import stats
import operator
from sklearn.model_selection import KFold
from statistics import mean

np.set_printoptions(suppress=True)  # print floats in fixed-point notation instead of scientific (exponential) notation

def read_data(file):
    """Read a tab-separated text file into a 2-D float array.

    Every field that parses as a float is kept as that number. Any other
    field is treated as a nominal (categorical) value and replaced by the
    index of that value in the order it was first seen within its column,
    stored as a float.

    Parameters
    ----------
    file : str or path-like
        Path of the tab-separated file to read.

    Returns
    -------
    numpy.ndarray
        Float array with one row per non-blank line of the file.
    """
    # Maps column index -> list of category strings in first-seen order.
    nominal = {}
    all_genes_list = []
    # The context manager guarantees the file is closed, even on error.
    with open(file) as gene_data:
        for line in gene_data:
            gene = line.strip().split("\t")
            if gene == [""]:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            for col, raw in enumerate(gene):
                try:
                    gene[col] = float(raw)
                except ValueError:
                    # Register nominal columns lazily so that a text value
                    # appearing after numeric values in the same column
                    # does not fail with a KeyError.
                    categories = nominal.setdefault(col, [])
                    if raw not in categories:
                        categories.append(raw)
                    gene[col] = float(categories.index(raw))
            all_genes_list.append(gene)
    return np.asarray(all_genes_list, dtype=float)

def normalise(data):
    """Return the z-scores of ``data`` computed along axis 1.

    With a 2-D input this standardises each row against that row's own
    mean and standard deviation.

    NOTE(review): if rows are samples and columns are features, this is a
    per-sample rather than a per-feature standardisation -- confirm that
    this is intended.
    """
    row_wise_scores = stats.zscore(data, axis=1)
    return row_wise_scores

def computeKNN(train_data, test_data, k, train_classes):
    """Classify each test sample by majority vote of its k nearest neighbours.

    Distances are Euclidean, measured between a test sample and every
    training sample.

    Parameters
    ----------
    train_data : 2-D array-like
        Training samples, one per row.
    test_data : iterable of 1-D array-like
        Samples to classify; each must have as many values as a training row.
    k : int
        Number of neighbours that vote. Values larger than the number of
        training samples are clamped to that number.
    train_classes : sequence
        Class label of each training sample, indexed like ``train_data``.

    Returns
    -------
    list
        Predicted class for each test sample, in input order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``train_data`` is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    train = np.asarray(train_data, dtype=float)
    if len(train) == 0:
        raise ValueError("train_data must contain at least one sample")
    # argpartition needs kth < n; partitioning on k - 1 still places the k
    # smallest distances first and stays valid when k equals the train size.
    k = min(k, len(train))

    classes = []
    for sample in test_data:
        # One vectorised norm per test sample instead of a Python-level
        # loop over every training row.
        dists = np.linalg.norm(train - np.asarray(sample, dtype=float), axis=1)
        nearest = np.argpartition(dists, k - 1)[:k]
        # argpartition leaves the first k entries in arbitrary order; sort
        # them nearest-first so that vote ties are resolved deterministically.
        nearest = nearest[np.argsort(dists[nearest], kind="stable")]
        classes.append(get_class_from_neighbors(nearest, train_classes))
    return classes

def get_class_from_neighbors(neighbors, train_classes):
    """Return the class label that wins the vote among ``neighbors``.

    ``neighbors`` holds indices into ``train_classes``. Votes are counted in
    iteration order, and the winner is the first label whose running count
    strictly exceeds every count seen so far. Returns -1 when ``neighbors``
    is empty.
    """
    tally = {}
    best_label, best_count = -1, 0
    for idx in neighbors:
        label = train_classes[idx]
        count = tally.get(label, 0) + 1
        tally[label] = count
        # Strict comparison: a later label that merely ties does not
        # displace the current leader.
        if count > best_count:
            best_label, best_count = label, count
    return best_label
    
def calculate_metrics(predicted_classes, ground_truth):
    """Compute accuracy, precision, recall and F1 for binary labels 0/1.

    Class 1 is the positive class. Any (predicted, actual) pair that is not
    a true positive, false positive or false negative is counted as a true
    negative.

    Parameters
    ----------
    predicted_classes : sequence
        Predicted labels.
    ground_truth : sequence
        True labels, indexed like ``predicted_classes``.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f_1_measure)``. A metric whose
        denominator is zero is reported as 0.0.
    """
    tp = fp = tn = fn = 0
    for i, predicted in enumerate(predicted_classes):
        actual = ground_truth[i]
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1
        else:
            tn += 1

    total = tp + tn + fp + fn
    # Default every metric to 0.0 so that a fold with no positive
    # predictions or no positive samples does not leave a name unbound
    # (previously an UnboundLocalError at the return statement).
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1_denominator = (2 * tp) + fp + fn
    f_1_measure = (2 * tp) / f1_denominator if f1_denominator else 0.0
    return accuracy, precision, recall, f_1_measure

# Load the dataset: the last column is the class label, the rest are features.
data = read_data("project3_dataset1.txt")
classes = data[:, -1]
normalised_data = normalise(data[:, :-1])

# shuffle and random_state must be passed by keyword: current scikit-learn
# releases reject them as positional arguments.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
k = int(input("Please enter the k value: "))

# Per-fold metrics, averaged after all folds have been evaluated.
accuracy_list = []
precision_list = []
recall_list = []
f_1_measure_list = []
for train, test in kfold.split(normalised_data):
    # train / test are index arrays selecting the rows of this fold.
    train_data = normalised_data[train]
    test_data = normalised_data[test]
    train_classes = classes[train]
    test_classes = classes[test]
    predicted_classes = computeKNN(train_data, test_data, k, train_classes)
    accuracy, precision, recall, f_1_measure = calculate_metrics(predicted_classes, test_classes)
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f_1_measure_list.append(f_1_measure)
print("Accuracy: ", mean(accuracy_list))
print("Precision: ", mean(precision_list))
print("Recall: ", mean(recall_list))
print("F_1_measure: ", mean(f_1_measure_list))

# # Earlier single 90/10 hold-out split (first 90% of rows train, last 10% test), superseded by the 10-fold cross-validation above; kept for reference.
# rows = normalised_data.shape[0]
# print("full data ", normalised_data.shape)
# train_rows = int(0.9*rows)
# test_rows = rows - train_rows
# train_data = normalised_data[0:train_rows,:]
# test_data = normalised_data[train_rows:,:]
# train_classes = classes[:train_rows]
# test_classes = classes[train_rows:]
# print("train ",train_data.shape)
# print("test ",test_data.shape)
# # print(train_data)
# k = int(input("Please enter the k value: "))
# predicted_classes = computeKNN(train_data, test_data, k, train_classes)
# print(len(predicted_classes))
# # print(predicted_classes)
# # print(test_classes)
# calculate_metrics(predicted_classes, test_classes)


Please enter the k value: 5
Accuracy:  0.9332393483709273
Precision:  0.9373129794182425
Recall:  0.8855788648737832
F_1_measure:  0.9085466954696556
