In [11]:
import numpy as np

def categorical_label(data_entry,data):
    """Replace alphabetic columns of ``data`` with numeric category codes.

    A column is treated as categorical when the matching element of
    ``data_entry`` consists only of letters (``str.isalpha``).  Every such
    column is overwritten in place with, for each value, its position in the
    sorted distinct values of that column (the inverse index returned by
    ``np.unique``), converted to float.

    Args:
        data_entry: One sample row; its elements decide which columns are
            categorical.
        data: 2-D array of raw values.  It is modified in place.

    Returns:
        A tuple ``(data, category_data)``: the same array object that was
        passed in and the list of column indices that were encoded.
    """
    category_data = []

    for i in range(len(data_entry)):
        if data_entry[i].isalpha():
            _, codes = np.unique(data[:, i], return_inverse=True)
            # Use the builtin ``float``: the ``np.float`` alias no longer
            # exists in current NumPy releases and raises AttributeError.
            data[:, i] = codes.astype(float)
            category_data.append(i)

    return data, category_data


def normalization(data_points,category_data):
    """Min-max scale every non-categorical column of ``data_points``.

    Each column whose index is not listed in ``category_data`` is rewritten
    in place as ``(x - min) / (max - min)``.  A column whose values are all
    equal has a zero range; it is set to 0.0 instead of being divided by
    zero, which would fill it with NaN.

    Args:
        data_points: 2-D numeric array (or matrix).  Modified in place.
        category_data: Collection of column indices to leave untouched.

    Returns:
        The same ``data_points`` object, after scaling.
    """
    # Nothing to scale when there are no rows (min/max are undefined).
    if data_points.shape[0] == 0:
        return data_points

    for i in range(data_points.shape[1]):
        if i in category_data:
            continue

        col_data = data_points[:, i]
        col_min = col_data.min()
        col_range = col_data.max() - col_min

        if col_range == 0:
            # Constant column: avoid 0/0 and map every value to 0.0.
            data_points[:, i] = 0.0
        else:
            data_points[:, i] = (col_data - col_min) / col_range

    return data_points
    

    


def performance_metric(test_actual_version, test_predicted_version):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Pairs are counted position by position, for as many positions as there
    are predictions.  Only pairs whose actual and predicted values are both
    0 or 1 are counted; label 1 is the positive class.

    Any metric whose denominator is zero is reported as 0.0 rather than
    raising ``ZeroDivisionError`` (for example F1 when there are no actual
    or predicted positives, or accuracy when nothing was counted).

    Args:
        test_actual_version: Indexable sequence of true labels.
        test_predicted_version: Sequence of predicted labels.

    Returns:
        A tuple ``(acc, prec, recall, f1_measure)`` of floats.
    """
    TP = 0
    FN = 0
    FP = 0
    TN = 0

    for i, predicted in enumerate(test_predicted_version):
        actual = test_actual_version[i]
        if actual == 1 and predicted == 1:
            TP += 1
        elif actual == 1 and predicted == 0:
            FN += 1
        elif actual == 0 and predicted == 1:
            FP += 1
        elif actual == 0 and predicted == 0:
            TN += 1

    counted = TP + FN + FP + TN
    predicted_positive = TP + FP
    actual_positive = TP + FN
    f1_denominator = (2 * TP) + FN + FP

    acc = (TP + TN) / counted if counted else 0.0
    prec = TP / predicted_positive if predicted_positive else 0.0
    recall = TP / actual_positive if actual_positive else 0.0
    f1_measure = (2 * TP) / f1_denominator if f1_denominator else 0.0

    return acc, prec, recall, f1_measure



    


In [62]:
def knn_implementation(split_data,split_labels,k):
    """Cross-validate a k-nearest-neighbour classifier and print the scores.

    Each non-empty fold in ``split_data`` is used once as the test set while
    all remaining folds form the training set.  A test point is labelled
    with the most frequent label among its ``k`` closest training points
    (Euclidean distance; equal distances keep training-set order).  The
    per-fold metrics come from ``performance_metric`` and are printed as
    returned; the final four "Average" lines are the per-fold means
    multiplied by 100, so all four share the same percentage scale.

    Args:
        split_data: Sequence of 2-D feature arrays, one per fold.
        split_labels: Sequence of 1-D label arrays aligned with
            ``split_data``.
        k: Number of neighbours to vote.  Values larger than the training
            set are reduced to the training-set size.

    Raises:
        ValueError: If ``k`` is smaller than 1 or fewer than two folds
            contain samples.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Folds without samples cannot be scored, so they are left out of the
    # evaluation (they contribute nothing to the training sets either).
    fold_ids = [f for f in range(len(split_data)) if len(split_data[f]) > 0]
    if len(fold_ids) < 2:
        raise ValueError("cross-validation needs at least two non-empty folds")

    # Weight that turns the sum of per-fold fractions into a mean percentage.
    percent_weight = 100 / len(fold_ids)

    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    final_f1_measure = 0

    for fold in fold_ids:
        train_data = np.asarray(np.vstack([x for i, x in enumerate(split_data) if i != fold]))
        train_label = np.asarray(np.concatenate([x for i, x in enumerate(split_labels) if i != fold]))

        test_data = np.asarray(split_data[fold])
        test_label = np.asarray(split_labels[fold])

        neighbour_count = min(k, len(train_data))

        test_pred_label = []
        for test_point in test_data:
            # Distance from this test point to every training row at once.
            distances = np.linalg.norm(train_data - test_point, axis=1)
            # Stable sort so that ties are resolved by training-set order.
            nearest = np.argsort(distances, kind="stable")[:neighbour_count]

            label_list = [train_label[j] for j in nearest]
            test_pred_label.append(max(set(label_list), key=label_list.count))

        acc, prec, recall, f1_measure = performance_metric(test_label, test_pred_label)
        print("\n")
        print("acc : " + str(acc) )
        print("prec : " + str(prec) )
        print("recall : " + str(recall) )
        print("f1 : " + str(f1_measure) )

        total_accuracy += acc * percent_weight
        total_precision += prec * percent_weight
        total_recall += recall * percent_weight
        final_f1_measure += f1_measure * percent_weight

    print("\n" + "Average Accuracy Obtained:",total_accuracy)
    print("Average Precision Obtained:",total_precision)
    print("Average Recall Obtained:",total_recall)
    print("Average F_measure Obtained:",final_f1_measure)


    
    

In [63]:
def knn_model(train_file,k):
    """Load a tab-separated data set and cross-validate k-NN on it.

    Every non-blank line of ``train_file`` is split on tab characters.  The
    last field of each row is read as the integer class label and the
    remaining fields as features.  Alphabetic columns are encoded with
    ``categorical_label``, the other feature columns are scaled with
    ``normalization``, the rows are cut into 10 consecutive folds with
    ``np.array_split`` and the folds are handed to ``knn_implementation``,
    which prints the results.

    Args:
        train_file: Path of the tab-separated input file.
        k: Number of nearest neighbours; anything accepted by ``int()``.

    Raises:
        ValueError: If the file contains no data rows.
    """
    k = int(k)

    # The context manager closes the file even if reading fails.
    with open(train_file) as train:
        # Drop the line terminator and skip blank lines so that every row
        # has the same number of fields.
        rows = [line.rstrip("\r\n").split("\t") for line in train if line.strip()]

    if not rows:
        raise ValueError("no data rows found in " + str(train_file))

    data_matrix = np.asarray(rows)

    data_matrix, category_data = categorical_label(data_matrix[0], data_matrix)

    # A plain float ndarray is used for the features.
    train_data_points = normalization(np.asarray(data_matrix[:, :-1], dtype=float), category_data)

    train_ground_truths = np.asarray(data_matrix[:, -1], dtype=int)

    split_data = np.array_split(train_data_points, 10)
    split_labels = np.array_split(train_ground_truths, 10)

    knn_implementation(split_data, split_labels, k)
        
        
        

In [66]:
# Ask for the data file and the neighbour count, then run the evaluation.
# Only a training file is requested; no separate test file is read.
train_file = input("Enter the train file name:")
k = input("Enter the number of nearest neighbors :")

knn_model(train_file, k)



Enter the train file name:project3_dataset1.txt
Enter the number of nearest neighbors :5


acc : 0.9649122807017544
prec : 0.9565217391304348
recall : 0.9565217391304348
f1 : 0.9565217391304348


acc : 0.9649122807017544
prec : 0.9411764705882353
recall : 0.9411764705882353
f1 : 0.9411764705882353


acc : 0.9824561403508771
prec : 1.0
recall : 0.9285714285714286
f1 : 0.9629629629629629


acc : 0.9824561403508771
prec : 1.0
recall : 0.9523809523809523
f1 : 0.975609756097561


acc : 0.9298245614035088
prec : 1.0
recall : 0.8
f1 : 0.8888888888888888


acc : 0.9649122807017544
prec : 1.0
recall : 0.9259259259259259
f1 : 0.9615384615384616


acc : 1.0
prec : 1.0
recall : 1.0
f1 : 1.0


acc : 0.9824561403508771
prec : 1.0
recall : 0.9333333333333333
f1 : 0.9655172413793104


acc : 0.9473684210526315
prec : 1.0
recall : 0.9032258064516129
f1 : 0.9491525423728814


acc : 0.9642857142857143
prec : 0.9166666666666666
recall : 1.0
f1 : 0.9565217391304348

Average Accuracy Obtained: 96.83583959899