In [349]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

#Array processing
import numpy as np

#Data analysis, wrangling and common exploratory operations
import pandas as pd
from random import randrange
from math import sqrt


In [350]:
#read the csv file and writing headers
df_data = pd.read_csv('breast-cancer.data.csv',header=None)
df_data.to_csv('breast-cancer-new.csv',header=["Class","age","menopause","tumor-size","inv-nodes","node-caps","deg-malig","breast","breast-quad","irradiat"],index=False)



In [351]:
new_data = pd.read_csv('breast-cancer-new.csv')
#print the dataset         
new_data

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,03-May,no,3,left,left_low,no


In [352]:
#Checking the datatypes of attributes
new_data.dtypes

Class          object
age            object
menopause      object
tumor-size     object
inv-nodes      object
node-caps      object
deg-malig       int64
breast         object
breast-quad    object
irradiat       object
dtype: object

In [353]:
#Printing the numeric data
mid_data=new_data.iloc[:,:].values
modified_data=np.column_stack([np.unique(mid_data[:, i], return_inverse=True)[1] for i in range(mid_data.shape[1])])

In [354]:
split_data=[]
for i in range(10):
    split=list()
    while len(split)< int(len(modified_data)/10):
        loc=randrange(len(list(modified_data)))
        split.append((list(modified_data)).pop(loc))
    split_data.append(split)

def classifier(algo,args,split_data,r):
    acc_list=list()
    for x in split_data:
        train=list(split_data)
        #train.remove(x)
        train=sum(train,[])
        test=list()
        for y in x:
            z=list(y)
            test.append(z)
            z[-1]=None
        predicted=algo(args,train,test,r)
        actual=[y[-1] for y in x]
        acc=0
        for p in range(len(actual)):
            if actual[p]==predicted[p]:
                acc+=1
        final=acc/float(len(actual))*100.0
        acc_list.append(final)
    return acc_list
                
        

In [355]:
def calc_distance(row1,row2,r):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**r
	return (distance)**1/r

In [356]:
def knn(neighbors,train,test,r):
    predictions=list()
    for b in test:
        distances=list()
        for c in train:
            dist=calc_distance(b,c,r)
            distances.append((c,dist))
        distances.sort(key=lambda tup: tup[1])
        nlist = list()
        for i in range(neighbors):
            nlist.append(distances[i][0])
            output_values = [row[-1] for row in nlist]
            prediction = max(set(output_values), key=output_values.count)
        predictions.append(prediction)
    return predictions
        

In [359]:
neighbors=23
neighbors1=37
neighbors2=9
neighbors3=65
manhattan=1
euclidean=2
minkowski=4

scores = classifier(knn,neighbors,split_data,manhattan)
print('---Considering 23 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

scores = classifier(knn,neighbors1,split_data,manhattan)
print('\n')
print('---Considering 37 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors1,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors1,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

scores = classifier(knn,neighbors2,split_data,manhattan)
print('\n')
print('---Considering 9 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors2,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors2,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

scores = classifier(knn,neighbors3,split_data,manhattan)
print('\n')
print('---Considering 65 neighbors---')
print('\n')
print('Accuracies for Manhattan distance: %s' % scores)
print('Mean Accuracy for Manhattan distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors3,split_data,euclidean)
print('Accuracies for Euclidean distance: %s' % scores)
print('Mean Accuracy for Euclidean distance: %.3f%%' % (sum(scores)/float(len(scores))))
print('\n')
scores = classifier(knn,neighbors3,split_data,minkowski)
print('Accuracies for Minkowski distance: %s' % scores)
print('Mean Accuracy for Minkowski distance: %.3f%%' % (sum(scores)/float(len(scores))))

---Considering 23 neighbors---


Accuracies for Manhattan distance: [64.28571428571429, 82.14285714285714, 71.42857142857143, 89.28571428571429, 71.42857142857143, 64.28571428571429, 75.0, 78.57142857142857, 82.14285714285714, 78.57142857142857]
Mean Accuracy for Manhattan distance: 75.714%


Accuracies for Euclidean distance: [60.71428571428571, 82.14285714285714, 71.42857142857143, 85.71428571428571, 71.42857142857143, 64.28571428571429, 75.0, 82.14285714285714, 85.71428571428571, 82.14285714285714]
Mean Accuracy for Euclidean distance: 76.071%


Accuracies for Minkowski distance: [67.85714285714286, 82.14285714285714, 71.42857142857143, 85.71428571428571, 71.42857142857143, 64.28571428571429, 75.0, 82.14285714285714, 85.71428571428571, 82.14285714285714]
Mean Accuracy for Minkowski distance: 76.786%


---Considering 37 neighbors---


Accuracies for Manhattan distance: [64.28571428571429, 82.14285714285714, 71.42857142857143, 89.28571428571429, 71.42857142857143, 64.28571428571429, 7