# All the required libraries

In [1]:
# All the required Libraries are here
import pandas as pd
import random as r
from random import seed
from math import sqrt

# Function to read values from .csv file

In [2]:
def read_csv(file_name):
    df = pd.read_csv(file_name,delimiter=',',header = None)
#     print(df)
    dataset = [list(row) for row in df.values]
    dataset.insert(0, df.columns.to_list())
    return dataset

# Function to convert column values into float

In [3]:
def str_column_into_float(dataset,column):
    for row in dataset:
        row[column] = float(row[column])
#     print('\n\n')
#     print(type(row[column]))

# Function to convert column values into int

In [4]:
def str_column_into_int(dataset,column):
    for row in dataset:
        row[column] = int(row[column])

# Functions to convert class values into numeric integer

In [5]:
def stringColumn_to_integer(dataset,column):
    # column = 4 ie class
    class_values = [row[column] for row in dataset]
#     print("the class_values is: ",class_values)
#     print(type(class_values))
#     print('\n\n')
    unique_class_values = set(class_values)
#     print(unique_class_values)
    look_up_for = dict()
    for i,values in enumerate(unique_class_values):
#         print('\n\n')
#         print(i,values)
        look_up_for[values] = i
        print('[%s] => %d'%(values,i))
#         print(look_up_for)
    for row in dataset:
        row[column] = look_up_for[row[column]]
    return look_up_for

# K fold cross validation

In [6]:
def k_fold_cross_validation(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
#     print(len(dataset)) == 150
    fold_size = int(len(dataset) / n_folds)
#     print(fold_size) == 30
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = r.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
#             https://www.geeksforgeeks.org/python-list-pop
#             print("fold value is: ",fold)
        dataset_split.append(fold)
    return dataset_split

https://machinelearningmastery.com/distance-measures-for-machine-learning/#:~:text=of%20Distance%20Measures-,Distance%20measures%20play%20an%20important%20role%20in%20machine%20learning.,objects%20in%20a%20problem%20domain.&text=Another%20unsupervised%20learning%20algorithm%20that,the%20K%2Dmeans%20clustering%20algorithm.

# Euclidean_distances

In [7]:
def euclidean_distance(a, b):
    a = a[:-1]
    b = b[:-1]
    return sqrt(sum((e1-e2)**2 for e1, e2 in zip(a,b)))

# hamming_distance

In [8]:
def hamming_distance(a, b):
    a = a[:-1]
    b = b[:-1]
    return sum(abs(e1 - e2) for e1, e2 in zip(a, b)) / len(a)

# calculate manhattan distance

In [9]:
def manhattan_distance(a, b):
    a = a[:-1]
    b = b[:-1]
    return sum(abs(e1-e2) for e1, e2 in zip(a,b))

# calculating minkowski distance between vectors

In [10]:
def minkowski_distance(a, b, p):
    a = a[:-1]
    b = b[:-1]
    return sum(abs(e1-e2)**p for e1, e2 in zip(a,b))**(1/p)

# Function to get neighbors

In [11]:
def get_neighbors(train_set,test_row,number_of_neighbours):
    euclidian_distances = list()
    hamming_distances = list()
    manhattan_distances = list()
    minkowski_distances = list()
    p = 2
    
    for train_row in train_set:
        euclidean_dist = euclidean_distance(test_row,train_row)
        hamming_dist = hamming_distance(test_row,train_row)
        manhattan_dist = manhattan_distance(test_row,train_row)
        minkowski_dist = minkowski_distance(test_row,train_row,p)
        
        euclidian_distances.append((train_row, euclidean_dist))
        hamming_distances.append((train_row, hamming_dist))
        manhattan_distances.append((train_row, manhattan_dist))
        minkowski_distances.append((train_row, minkowski_dist))
        
    euclidian_distances.sort(key = lambda tup: tup[1])
    hamming_distances.sort(key = lambda tup: tup[1])
    manhattan_distances.sort(key = lambda tup: tup[1])
    minkowski_distances.sort(key = lambda tup: tup[1])
    
    
    euclidian_neighbors = list()
    hamming_neighbors = list()
    manhattan_neighbors = list()
    minkowski_neighbors = list()
    for i in range(number_of_neighbours):
        euclidian_neighbors.append(euclidian_distances[i][0])
        hamming_neighbors.append(hamming_distances[i][0])
        manhattan_neighbors.append(manhattan_distances[i][0])
        minkowski_neighbors.append(minkowski_distances[i][0])
        
    return euclidian_neighbors,hamming_neighbors,manhattan_neighbors,minkowski_neighbors
 

# Function to Predict my classifiers

In [12]:
def predict_classification(train_set,row,number_of_neighbours):
    euclid_neighbors,hamming_neighbors,manhattan_neighbors,minkowski_neighbors= get_neighbors(train_set,row,number_of_neighbours)
    
    euclid_output = [row[-1] for row in euclid_neighbors]
    euclid_prediction = max(set(euclid_output),key = euclid_output.count)
    
    hamming_output = [row[-1] for row in hamming_neighbors]
    hamming_prediction = max(set(hamming_output),key = hamming_output.count)
    
    manhattan_output = [row[-1] for row in manhattan_neighbors]
    manhattan_prediction = max(set(manhattan_output),key = manhattan_output.count)
    
    minkowski_output = [row[-1] for row in minkowski_neighbors]
    minkowski_prediction = max(set(minkowski_output),key = minkowski_output.count)
    
    return euclid_prediction,hamming_prediction,manhattan_prediction,minkowski_prediction

# KNN

In [13]:
def k_nearest_neighbors(train_set,test_set, number_of_neighbours):
    euclid_predictions = list()
    hamming_predictions = list()
    manhattan_predictions = list()
    minkowski_predictions = list()
    
    for row in test_set:
        euclid_predict_output,hamming_predict_output,manhattan_predict_output,minkowski_predict_output = predict_classification(train_set,row,number_of_neighbours)
        
        euclid_predictions.append(euclid_predict_output)
        hamming_predictions.append(hamming_predict_output)
        manhattan_predictions.append(manhattan_predict_output)
        minkowski_predictions.append(minkowski_predict_output)
        
    return euclid_predictions,hamming_predictions,manhattan_predictions,minkowski_predictions

# Accuray metric

In [14]:
def accuracy_metrix(actual,prediction):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == prediction[i]:
            correct += 1
    return correct / float(len(actual))*100.0

# Evaluation function

In [15]:
def eval_algorithm(dataset,k_nearest_neighbors, n_folds, *args):
    k_fold_data_set = k_fold_cross_validation(dataset, n_folds)
    
    euclid_prediction_score = list()
    hamming_prediction_score = list()
    manhattan_prediction_score = list()
    minkowski_prediction_score = list()
    
    for fold in k_fold_data_set:
        train_set = list(k_fold_data_set)
        train_set.remove(fold)            #https://www.programiz.com/python-programming/methods/list/remove
        train_set = sum(train_set,[])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
            
        euclid_prediction,hamming_prediction,manhattan_prediction,minkowski_prediction = k_nearest_neighbors(train_set,test_set, *args)
        
        actual = [row[-1] for row in fold]
        
        euclid_accuracy = accuracy_metrix(actual,euclid_prediction)
        euclid_prediction_score.append(euclid_accuracy)
        
        hamming_accuracy = accuracy_metrix(actual,hamming_prediction)
        hamming_prediction_score.append(hamming_accuracy)
        
        manhattan_accuracy = accuracy_metrix(actual,manhattan_prediction)
        manhattan_prediction_score.append(manhattan_accuracy)
        
        minkowski_accuracy = accuracy_metrix(actual,minkowski_prediction)
        minkowski_prediction_score.append(minkowski_accuracy)
        
    return euclid_prediction_score,hamming_prediction_score,manhattan_prediction_score,minkowski_prediction_score

# Main function

In [16]:
def main():
    seed(1)    # generate the same random value each time i run my code rather then a new number
    print("Enter 1 ---> car dataset, 2 -----> hayes-roth dataset, 3 -----> breast-cancer dataset, 4 ---> irish dataset")
    print("\n")
    input_file = input("enter a number for dataset: ")
    
    if(input_file == '1'):
        print("The Dataset is car")
        file_to_be_read = 'car.csv'
        dataset = read_csv(file_to_be_read)
        dataset = dataset[1:]
        print("\n\n")
#         print(dataset)
        print(" Vectorized form ")
        print(" ")
        for column in range(len(dataset[0])-1):
#             print("the column is : ",column)
            stringColumn_to_integer(dataset,column) 
        print("\n\n")
        print("class values")
        print(" ")
        stringColumn_to_integer(dataset,len(dataset[0])-1)
    
    
    elif(input_file == '2'):
        print("The Dataset is hayes-roth")
        file_to_be_read = 'hayes-roth.csv'
        dataset = read_csv(file_to_be_read)
        dataset = dataset[1:]
        print("\n\n")
#         print(dataset)
        print(" Vectorized form ")
        print(" ")
        for column in range(len(dataset[0])-1):
            str_column_into_int(dataset,column)
        print("\n\n")
        print("class values")
        print(" ")
        stringColumn_to_integer(dataset,len(dataset[0])-1)
        
    elif(input_file == '3'):
        print("The Dataset is breast-cancer")
        file_to_be_read = 'breast-cancer.csv'
        dataset = read_csv(file_to_be_read)
        dataset = dataset[1:]
        print("\n\n")
#         print(dataset)
        print(" Vectorized form ")
        print(" ")
        for column in range(len(dataset[0])-1):
            stringColumn_to_integer(dataset,column)
        print("\n\n")
        print("class values")
        print(" ")
        stringColumn_to_integer(dataset,len(dataset[0])-1)
        
    elif(input_file == '4'):
        print("The Dataset is irish")
        file_to_be_read = 'irish.csv'
        dataset = read_csv(file_to_be_read)
        dataset = dataset[1:]
        print("\n\n")
#         print(dataset)
        print(" Vectorized form ")
        print(" ")
        for column in range(len(dataset[0])-1):
            #convert the column with string values into float [col:0,1,2,3]
            str_column_into_float(dataset,column)
        print("class values")
        print(" ")
        stringColumn_to_integer(dataset,len(dataset[0])-1)
    
#     print(dataset)
    # algorithm
    n_folds = 10
    number_of_neighbours = 5
    
    euclid_prediction_score,hamming_prediction_score,manhattan_prediction_score,minkowski_prediction_score = eval_algorithm(dataset,k_nearest_neighbors, n_folds, number_of_neighbours)
    print("\n\n")
    print("Euclid Predicted Score is: %s" %euclid_prediction_score)
    print("")
    print("Euclid Mean of accuracy: %.3f%%" %(sum(euclid_prediction_score)/float(len(euclid_prediction_score))))
    print("\n\n")
    
    print("\n\n")
    print("Hamming Predicted Score is: %s" %hamming_prediction_score)
    print("")
    print("Hamming Mean of accuracy: %.3f%%" %(sum(hamming_prediction_score)/float(len(hamming_prediction_score))))
    print("\n\n")
    
    print("\n\n")
    print("Manhattan Predicted Score is: %s" %manhattan_prediction_score)
    print("")
    print("Manhattan Mean of accuracy: %.3f%%" %(sum(manhattan_prediction_score)/float(len(manhattan_prediction_score))))
    print("\n\n")
    
    print("\n\n")
    print("Minkowski Predicted Score is: %s" %minkowski_prediction_score)
    print("")
    print("Minkowski Mean of accuracy: %.3f%%" %(sum(minkowski_prediction_score)/float(len(minkowski_prediction_score))))
    print("\n\n")
    


In [17]:
if __name__ == '__main__':
    main()

Enter 1 ---> car dataset, 2 -----> hayes-roth dataset, 3 -----> breast-cancer dataset, 4 ---> irish dataset


enter a number for dataset: 1
The Dataset is car



 Vectorized form 
 
[high] => 0
[low] => 1
[med] => 2
[vhigh] => 3
[high] => 0
[low] => 1
[med] => 2
[vhigh] => 3
[2] => 0
[3] => 1
[5more] => 2
[4] => 3
[2] => 0
[more] => 1
[4] => 2
[small] => 0
[med] => 1
[big] => 2
[high] => 0
[low] => 1
[med] => 2



class values
 
[good] => 0
[unacc] => 1
[acc] => 2
[vgood] => 3



Euclid Predicted Score is: [90.11627906976744, 89.53488372093024, 91.86046511627907, 88.95348837209302, 91.86046511627907, 91.27906976744185, 87.79069767441861, 90.69767441860465, 87.20930232558139, 91.86046511627907]

Euclid Mean of accuracy: 90.116%






Hamming Predicted Score is: [90.11627906976744, 89.53488372093024, 91.86046511627907, 88.95348837209302, 91.86046511627907, 91.27906976744185, 87.79069767441861, 90.69767441860465, 87.20930232558139, 91.86046511627907]

Hamming Mean of accuracy: 90.116%



