In [1]:
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

In [2]:
def k_nearest_neighbors(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups!')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
            
    votes = [i[1] for i in sorted(distances) [:k]]
    
    #print(Counter(votes).most_common(1))           # Counter is a container that keeps track of how many times the value is added
   
    
    vote_result = Counter(votes).most_common(1)[0][0]
    
    confidence = Counter(votes).most_common(1)[0][0] / k
    
    #print(vote_result, confidence)
    
    return vote_result, confidence



In [3]:
df = pd.read_csv("breast-cancer-wisconsin.data")
df.replace('?', -99999, inplace = True)
df.drop(['id'], 1, inplace= True)
full_data = df.astype(float).values.tolist()     # Everyting is converted into float

random.shuffle(full_data)

In [4]:
test_size = 0.4
train_set = {2:[], 4:[]}
test_set = {2:[], 4:[]}

train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[:-int(test_size*len(full_data)):]


In [5]:
for i in train_data:
    train_set[i[-1]].append(i[:-1])
    
for i in test_data:
    test_set[i[-1]].append(i[:-1])

In [6]:
correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote, confidence = k_nearest_neighbors(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1
        
print('Accuracy :', correct/total)

Accuracy : 0.9761904761904762


### Looping over entire code to get differnt accuracies

In [7]:
accuracies = []

for i in range(20):
    df = pd.read_csv("breast-cancer-wisconsin.data")
    df.replace('?', -99999, inplace = True)
    df.drop(['id'], 1, inplace= True)
    full_data = df.astype(float).values.tolist()     # Everyting is converted into float

    random.shuffle(full_data)
    
    test_size = 0.4
    train_set = {2:[], 4:[]}
    test_set = {2:[], 4:[]}

    train_data = full_data[:-int(test_size*len(full_data))]
    test_data = full_data[:-int(test_size*len(full_data)):]
    
    for i in train_data:
        train_set[i[-1]].append(i[:-1])
    
    for i in test_data:
        test_set[i[-1]].append(i[:-1])

    correct = 0
    total = 0

    for group in test_set:
        for data in test_set[group]:
            vote, confidence = k_nearest_neighbors(train_set, data, k=5)
            if group == vote:
                correct += 1
            total += 1
        
    print('Accuracy :', correct/total)
    accuracies.append(correct/total)
    
print(sum(accuracies)/len(accuracies))

Accuracy : 0.9785714285714285
Accuracy : 0.9642857142857143
Accuracy : 0.9761904761904762
Accuracy : 0.969047619047619
Accuracy : 0.9714285714285714
Accuracy : 0.9785714285714285
Accuracy : 0.9666666666666667
Accuracy : 0.9714285714285714
Accuracy : 0.9642857142857143
Accuracy : 0.9738095238095238
Accuracy : 0.9809523809523809
Accuracy : 0.9714285714285714
Accuracy : 0.9833333333333333
Accuracy : 0.9714285714285714
Accuracy : 0.9785714285714285
Accuracy : 0.9785714285714285
Accuracy : 0.9738095238095238
Accuracy : 0.9761904761904762
Accuracy : 0.9761904761904762
Accuracy : 0.9738095238095238
0.9739285714285713
