In [132]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

#Array processing
import numpy as np

#Data analysis, wrangling and common exploratory operations
import pandas as pd
from random import randrange
from math import sqrt


In [133]:
#read the csv file and writing headers
df_data = pd.read_csv('hayes-roth.csv',header=None)
df_data.to_csv('hayes-roth-new.csv',header=["name","hobby","age","educational level","marital status","class"],index=False)



In [134]:
new_data = pd.read_csv('hayes-roth-new.csv')
#print the dataset         
new_data

Unnamed: 0,name,hobby,age,educational level,marital status,class
0,92,2,1,1,2,1
1,10,2,1,3,2,2
2,83,3,1,4,1,3
3,61,2,4,2,2,3
4,107,1,1,3,4,3
...,...,...,...,...,...,...
127,44,1,1,4,3,3
128,40,2,1,2,1,1
129,90,1,2,1,2,2
130,21,1,2,2,1,2


In [135]:
#Checking the datatypes of attributes
new_data.dtypes

name                 int64
hobby                int64
age                  int64
educational level    int64
marital status       int64
class                int64
dtype: object

In [136]:
#Printing the numeric data
modified_data=new_data.iloc[:,:].values
modified_data

array([[ 92,   2,   1,   1,   2,   1],
       [ 10,   2,   1,   3,   2,   2],
       [ 83,   3,   1,   4,   1,   3],
       [ 61,   2,   4,   2,   2,   3],
       [107,   1,   1,   3,   4,   3],
       [113,   1,   1,   3,   2,   2],
       [ 80,   3,   1,   3,   2,   2],
       [125,   3,   4,   2,   4,   3],
       [ 36,   2,   2,   1,   1,   1],
       [105,   3,   2,   1,   1,   1],
       [ 81,   1,   2,   1,   1,   1],
       [122,   2,   2,   3,   4,   3],
       [ 94,   1,   1,   2,   1,   1],
       [ 60,   2,   1,   2,   2,   2],
       [  8,   2,   4,   1,   4,   3],
       [ 20,   1,   1,   3,   3,   1],
       [ 85,   3,   2,   1,   2,   2],
       [ 50,   1,   2,   1,   1,   1],
       [ 68,   3,   3,   2,   1,   1],
       [ 89,   3,   1,   3,   2,   1],
       [ 52,   1,   2,   2,   1,   2],
       [ 19,   3,   2,   1,   3,   1],
       [118,   2,   1,   2,   1,   1],
       [ 16,   3,   2,   1,   3,   1],
       [ 91,   2,   3,   2,   1,   1],
       [ 79,   3,   2,   

In [137]:
split_data=[]
for i in range(10):
    split=list()
    while len(split)< int(len(modified_data)/10):
        loc=randrange(len(list(modified_data)))
        split.append((list(modified_data)).pop(loc))
    split_data.append(split)

def classifier(algo,args,split_data,r):
    acc_list=list()
    for x in split_data:
        train=list(split_data)
        #train.remove(x)
        train=sum(train,[])
        test=list()
        for y in x:
            z=list(y)
            test.append(z)
            z[-1]=None
        predicted=algo(args,train,test,r)
        actual=[y[-1] for y in x]
        acc=0
        for p in range(len(actual)):
            if actual[p]==predicted[p]:
                acc+=1
        final=acc/float(len(actual))*100.0
        acc_list.append(final)
    return acc_list
                
        

In [138]:
def euclidean_distance(row1, row2,r):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**r
	return (distance)**1/r

In [139]:
def knn(neighbors,train,test,r):
    predictions=list()
    for b in test:
        distances=list()
        for c in train:
            dist=euclidean_distance(b,c,r)
            distances.append((c,dist))
        distances.sort(key=lambda tup: tup[1])
        nlist = list()
        for i in range(neighbors):
            nlist.append(distances[i][0])
            output_values = [row[-1] for row in nlist]
            prediction = max(set(output_values), key=output_values.count)
        predictions.append(prediction)
    return predictions
        

In [140]:
neighbors=73
neighbors1=47
neighbors2=53
neighbors3=65
manhattan=1
euclidean=2
minkowski=4

scores = classifier(knn,neighbors,split_data,manhattan)
print('---Considering 73 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

scores = classifier(knn,neighbors1,split_data,manhattan)
print('\n')
print('---Considering 47 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors1,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors1,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

scores = classifier(knn,neighbors2,split_data,manhattan)
print('\n')
print('---Considering 53 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors2,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors2,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

scores = classifier(knn,neighbors3,split_data,manhattan)
print('\n')
print('---Considering 65 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors3,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors3,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

---Considering 73 neighbors---


Accuracies for Manhattan distance: [30.76923076923077, 61.53846153846154, 53.84615384615385, 53.84615384615385, 46.15384615384615, 38.46153846153847, 46.15384615384615, 30.76923076923077, 46.15384615384615, 23.076923076923077]
Mean Accuracy for Manhattan distance: 43.077%


Accuracies for Euclidean distance: [30.76923076923077, 61.53846153846154, 53.84615384615385, 53.84615384615385, 46.15384615384615, 38.46153846153847, 46.15384615384615, 30.76923076923077, 46.15384615384615, 23.076923076923077]
Mean Accuracy for Euclidean distance: 43.077%


Accuracies for Minkowski distance: [30.76923076923077, 61.53846153846154, 53.84615384615385, 53.84615384615385, 46.15384615384615, 38.46153846153847, 46.15384615384615, 30.76923076923077, 46.15384615384615, 23.076923076923077]
Mean Accuracy for Minkowski distance: 43.077%


---Considering 47 neighbors---


Accuracies for Manhattan distance: [38.46153846153847, 15.384615384615385, 30.76923076923077, 23.076923076923