# Comparison of our K-nearest Neighbors Algorithm vs. the one from Sci-kit learn
1. Import data set containing the label(s) and features as a dataframe
1. Format the dataframe to remove undesired strings and useless columns 
1. Shuffle the dataframe to prevent order bias
1. Prepare the data sets for training and testing
1. Label the features for supervised training
1. Train the algorithm on the training data
1. Run the algorithm on the test data and calculate the accuracy of the classifier

In [40]:
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import random

def k_nearest_neighbors(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups')
    
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm( np.array(features) - np.array(predict) )
            distances.append([euclidean_distance, group])
    
    votes = [ i[1] for i in sorted(distances)[:k] ]
    confidence = Counter(votes).most_common(1)[0][0] / k
    most_common_group, num_votes = Counter(votes).most_common(1)[0]
    print(most_common_group, confidence)
    
    return most_common_group, confidence

df = pd.read_csv("8. breast-cancer-wisconsin.data", na_values='?', header=0)
#df.replace('?',-99999, inplace=True)
df.drop(['ID'], axis=1, inplace=True)
df = df.loc[:, "Diagnosis":"frac_dimension"]
df['Diagnosis'] = df['Diagnosis'].map({'M': 0, 'B': 1})
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)

test_size = 0.4
train_set = {0:[], 1:[]}
test_set = {0:[], 1:[]}

train_data = full_data[:-int(test_size * len(full_data) )]
test_data = full_data[-int(test_size * len(full_data) ):]

for i in train_data:
    train_set[i[0]].append(i[1:])

for i in test_data:
    test_set[i[0]].append(i[1:])
    
correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote, confidence = k_nearest_neighbors(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1

print('Accuracy:', correct/total)

1 0.2
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
1 0.2
1 0.2
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
1 0.2
1 0.2
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
1 0.2
0 0.0
1 0.2
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
1 0.2
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
0 0.0
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
0 0.0
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
0 0.0
1 0.2
1 0.2
1 0.2
0 0.0
0 0.0
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
0 0.0
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.2
1 0.