# Import needed libraries, apply custom functions of euclidean distance and k-nearest neighbours.

In [88]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from random import shuffle
from collections import Counter

In [89]:
def euclidean_distance(plots):
    result = 0
    for i in range(0, 1):
        for y in range(0,len(plots[0])):
            result += (plots[i][y] - plots[i+1][y])**2
    return result**0.5

In [90]:
def k_nearest_neighbours(data, predict, k):
    counter = []
    min_values = []
    if k < len(data):
        print("K should be bigger than number of classes.")
    for cl in data:
        for feat in data[cl]:
            counter.append([cl, euclidean_distance([feat, predict])])

    
    counter.sort(key=lambda x: x[1])
    min_values = counter[:k]
    check = []
    for label in min_values:
        check.append(label[0])

    return Counter(check).most_common(1)[0][0]

# Prepare dataset for using KNN algorithm.

In [91]:
df = pd.read_csv('datasets/breast-cancer-wisconsin.data')

In [92]:
df.replace('?', np.NaN, inplace=True)
df.dropna(inplace=True)
df.drop(['id'], axis=1, inplace=True)

# KNN using custom function, measure accuracy of it.

In [93]:
all_data = df.astype(int).values.tolist()
shuffle(all_data)

In [94]:
X_train = all_data[:-int(0.25*len(all_data))]
X_test = all_data[-int(0.25*len(all_data)):]
train_dict = {2: [], 4: []}
test_dict = {2: [], 4: []}
for sample in X_train:
    train_dict[sample[-1]].append(sample[:-1])
for sample in X_test:
    test_dict[sample[-1]].append(sample[:-1])

In [95]:
def measure_accuracy(train_set, test_set):
    predicted = 0
    count = 0 
    for category in test_set:
        for predict in test_set[category]:
            result = k_nearest_neighbours(train_set, predict, 5)
            if category == result:
                predicted +=1
            count+=1
    return predicted/count

In [96]:
measure_accuracy(train_dict, test_dict)
print(accuracy)

0.9707602339181286


# Measure accuracy using sklearn's KNN classifier.

In [97]:
X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [99]:
clf = KNeighborsClassifier()

In [100]:
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [101]:
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.9824561403508771
