# HW-3 KNN
Wyatt Blair

SID: 10420296

2/26/24


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools

## Apply same pre-processing as in HW-2-EDA

In [2]:
data = pd.read_csv('../data/breast-cancer-wisconsin.csv')

data.replace("?", np.NaN, inplace=True)
data['F6'] = data['F6'].astype(float)

for column in data:

    series = data[column]
    mean, median =  series.mean(), series.median()
    data[column].fillna(mean, inplace=True)

## Split dataset into train (70%) and test (30%)

In [3]:
train = data.sample(frac=0.7)
test = data.drop(index=train.index)

## Define some distance metrics

In [4]:
def minkowski_distance(x, y, p):

    diff = x - y
    power = diff ** p
    total = power.sum(axis=1)
    distance = total ** (1/p)

    return distance

def euclidean_distance(x, y):
    return minkowski_distance(x, y, p=2)

def manhattan_distance(x, y):

    diff = x - y
    absolute = abs(diff)
    distance = absolute.sum(axis=1)

    return distance

## Define nearest neighbors functions

In [5]:
def find_k_nearest_neighbors(point, k, distance_metric=euclidean_distance):

    neighbors = train.copy()
    feat_columns = [f"F{i}" for i in range(1, 10)]

    y = np.broadcast_to(point[feat_columns].values, (len(train.index), 9))

    x_data = neighbors[feat_columns].copy()
    x = x_data.values

    neighbors['distance'] = distance_metric(x, y)
    neighbors.sort_values(by='distance', ascending=True, inplace=True)
    nearest_neighbors = neighbors.head(k)

    return nearest_neighbors

def classify(neighbors):

    neighbor_classes = neighbors['Class']
    counts = neighbor_classes.value_counts()
    closest_class = counts.sort_values().index[0]

    return closest_class

def predict(point, k, distance_metric=euclidean_distance):

    nearest_neighbors = find_k_nearest_neighbors(point, k, distance_metric=distance_metric)
    classification = classify(nearest_neighbors)

    return classification


## Make predictions on test set

In [6]:
def test_knn(distance_metric, k_vals=[3, 5, 10]):
    
    accuracy_dict = {}
    df = test.copy()

    for k in k_vals:

        predict_func = functools.partial(
            predict,
            k=k,
            distance_metric=distance_metric,
        )

        df[f"prediction_k={k}"] = df.apply(predict_func, axis=1)
        df[f"correct_k={k}"] = df['Class'] == df[f'prediction_k={k}']

        percent_correct = df[f"correct_k={k}"].value_counts()[True] / len(df.index)
        accuracy_dict.update({f'accuracy_k={k}' : percent_correct})
    
    return accuracy_dict


In [7]:
test_knn(euclidean_distance)

{'accuracy_k=3': 0.9333333333333333,
 'accuracy_k=5': 0.9,
 'accuracy_k=10': 0.8238095238095238}

In [8]:
test_knn(manhattan_distance)

{'accuracy_k=3': 0.9428571428571428,
 'accuracy_k=5': 0.9238095238095239,
 'accuracy_k=10': 0.8666666666666667}

In [9]:
test_knn(functools.partial(minkowski_distance, p=6))

{'accuracy_k=3': 0.9380952380952381,
 'accuracy_k=5': 0.9047619047619048,
 'accuracy_k=10': 0.8095238095238095}

## According to these results, Manhattan Distance combined with k=3 are the optimal parameters for KNN on this dataset, achieving an accuracy of ~94.3% on the test set