# Comparison of our K-nearest Neighbors Algorithm vs. the one from scikit-learn
1. Import data set containing the label(s) and features as a dataframe
1. Format the dataframe to remove undesired strings and useless columns 
1. Shuffle the dataframe to prevent order bias
1. Prepare the data sets for training and testing
1. Label the features for supervised training
1. Train the algorithm on the training data
1. Run the algorithm on the test data and calculate the accuracy of the classifier

In [28]:
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import random

def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest training points.

    Parameters
    ----------
    data : dict
        Maps each group label to an iterable of feature vectors for that group.
    predict : sequence of numbers
        Feature vector to classify; it is subtracted element-wise from every
        training vector, so the lengths must match.
    k : int, default 3
        Number of nearest training points that get a vote.

    Returns
    -------
    The group label that collected the most votes among the ``k`` nearest points.

    Warns
    -----
    UserWarning
        When ``k`` is not larger than the number of groups in ``data``.

    Raises
    ------
    IndexError
        When ``data`` holds no training points at all (there is nothing to vote).
    """
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append((euclidean_distance, group))

    # Rank on the distance only. Sorting the raw pairs would fall back to
    # comparing group labels whenever two distances tie, which raises TypeError
    # for labels that cannot be ordered against each other (e.g. str vs int).
    nearest = sorted(distances, key=lambda pair: pair[0])[:k]
    votes = [group for _, group in nearest]

    return Counter(votes).most_common(1)[0][0]

## New

In [34]:
# Fixed seed so the shuffle (and therefore the train/test split and the
# reported accuracy) is the same on every Restart & Run All.
SHUFFLE_SEED = 42

# '?' entries in the file are read as NaN.
df = pd.read_csv("8. breast-cancer-wisconsin.data", na_values='?', header=0)

# Drop the 'ID' column from the DataFrame as it's not needed for classification.
df = df.drop(columns=['ID'])

# Keep only the label and feature columns. The .copy() makes the column
# assignment below write to an independent frame instead of a slice.
df = df.loc[:, "Diagnosis":"frac_dimension"].copy()

# Convert M (malignant) and B (benign) to numbers.
df['Diagnosis'] = df['Diagnosis'].map({'M': 0, 'B': 1})

# Discard incomplete rows: a NaN feature would turn every distance to that
# point into NaN, and a NaN label (unmapped diagnosis) belongs to neither class.
df = df.dropna()

# Convert everything to floats for a consistent format; each row becomes
# [label, feature_1, ..., feature_n].
full_data = df.astype(float).values.tolist()

# Shuffle the entire dataset to avoid any inherent order bias. A dedicated
# Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(full_data)

# TODO: what is the proportion of malignant and benign?
df.head()

Unnamed: 0,Diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,frac_dimension
0,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [37]:
# Define the proportion of data to be used for testing.
test_size = 0.2

# Initialize dictionaries to separate feature data into classes (0 and 1),
# labeling the features for supervised training.
train_set = {0:[], 1:[]}
test_set = {0:[], 1:[]}

# Split the data into training and testing sets based on the defined proportion.
# An explicit split index is used because negative slicing breaks when the
# test count rounds down to 0: full_data[:-0] is empty and full_data[-0:] is
# the whole list, which would swap the roles of the two sets.
n_test = int(test_size * len(full_data))
split_at = len(full_data) - n_test
train_data = full_data[:split_at]
test_data = full_data[split_at:]

# Populate the training and testing dictionaries with data points (feature data).
# Each row is [label, features...]; the float labels 0.0 / 1.0 hash equal to
# the integer keys 0 / 1, so they land in the matching class list.
for row in train_data:
    train_set[row[0]].append(row[1:])

for row in test_data:
    test_set[row[0]].append(row[1:])

# Initialize variables to track the accuracy of the classification.
correct = 0
total = 0

# Perform k-nearest neighbors classification on the test set.
# Compare the predicted class to the actual class and count the matches.
for group in test_set:
    for features in test_set[group]:
        vote = k_nearest_neighbors(train_set, features, k=5)
        if group == vote:
            correct += 1
        total += 1

# Calculate and print the accuracy of the k-nearest neighbors classification.
# Guard the division so an empty test set reports clearly instead of raising
# ZeroDivisionError.
if total:
    print('Accuracy:', correct/total)
else:
    print('Accuracy: undefined (test set is empty)')

Accuracy: 0.911504424778761
