# Knn Algorithm Implementation Exploration


In [2]:
import numpy as np
import pandas as pd

train_inputs = pd.read_csv("../datasets/train_inputs.csv", header=None)
train_labels = pd.read_csv("../datasets/train_labels.csv", header=None)
test_inputs = pd.read_csv("../datasets/test_inputs.csv", header=None)
test_labels = pd.read_csv("../datasets/test_labels.csv", header=None)

train_inputs = train_inputs.rename(columns=lambda x: f"e{x+1}")
train_labels = train_labels.rename(columns={0: "category"})
test_inputs = test_inputs.rename(columns=lambda x: f"e{x+1}")
test_labels = test_labels.rename(columns={0: "category"})

In [3]:
train_input_matrix = train_inputs.to_numpy()
train_input_matrix

array([[ 0.,  0., 10., ...,  7.,  2.,  2.],
       [ 0.,  0.,  9., ...,  4.,  0., 14.],
       [10., 15., 16., ..., 14., 16.,  1.],
       ...,
       [ 5.,  2.,  0., ...,  1.,  4.,  4.],
       [ 0.,  0.,  5., ...,  0.,  4.,  2.],
       [ 0.,  0.,  2., ...,  0.,  2.,  0.]], shape=(1000, 64))

In [4]:
train_labels_matrix = train_labels.to_numpy()
train_labels_matrix

array([[6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [6.],
       [6.],
       [6.],
       [5.],
       [6.],
       [6.],
       [5.],
       [5.],
       [6.],
       [6.],
       [6.],
       [6.],
       [5.],
       [5.],
       [5.],
       [6.],
       [6.],
       [6.],
       [5.],
       [6.],
       [6.],
       [6.],
       [5.],
       [5.],
       [5.],
       [5.],
       [6.],
       [5.],
       [5.],
       [5.],
       [5.],
       [5.],
       [5.],
       [5.],
       [6.],
       [6.],
       [5.],
       [6.],
       [5.],
       [5.],
       [6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [6.],
       [5.],
       [5.],
       [6.],
       [5.],
       [5.],
       [6.],
       [6.],
       [5.],
       [5.],
       [5.],
       [5.],
       [6.],
       [5.],
       [6.],
       [6.],
       [5.],
       [6.],
       [5.],
       [5.],
       [5.],
       [6.],
       [6.],

In [29]:
origin = test_inputs.to_numpy()[12]
distances = np.zeros((len(train_inputs), 2), dtype="float")

distances

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]], shape=(1000, 2))

In [30]:
for i, (v, l) in enumerate(zip(train_input_matrix, train_labels_matrix.flat)):
    dist = np.linalg.norm(v - origin)
    distances[i, 0] = dist
    distances[i, 1] = l

distances

array([[ 98.74715186,   6.        ],
       [ 82.48030068,   5.        ],
       [ 56.63920903,   6.        ],
       ...,
       [ 88.43641784,   6.        ],
       [101.28672174,   6.        ],
       [110.25425162,   5.        ]], shape=(1000, 2))

In [31]:
n = 3
distances.sort(kind="heapsort", axis=0)
k_nearest_neighbors = distances[:n]
k_nearest_neighbors

array([[21.07130751,  5.        ],
       [21.07130751,  5.        ],
       [21.33072901,  5.        ]])

In [32]:
df = pd.DataFrame(data=k_nearest_neighbors, columns=["distance", "label"])
df

Unnamed: 0,distance,label
0,21.071308,5.0
1,21.071308,5.0
2,21.330729,5.0


In [22]:
print(df["label"].mode().iloc[0])

5.0


In [52]:
def k_nearest(target: np.ndarray, k: int, input_table: pd.DataFrame, labels: pd.DataFrame):

    t_input_matrix = input_table.to_numpy()
    t_labels_matrix = labels.to_numpy()
    distance_label = np.zeros((len(input_table), 2), dtype="float")

    for row_index, (row, label) in enumerate(zip(t_input_matrix, t_labels_matrix.flat)):
        normalized_row = row / np.linalg.norm(row)
        normalized_target = target / np.linalg.norm(target)
        distance = np.abs(np.sum(normalized_row - normalized_target))
        distance_label[row_index, 0] = distance
        distance_label[row_index, 1] = label

    distance_label.sort(kind="heapsort", axis=0)
    neighbours = distance_label[:k]
    labeled_distances = pd.DataFrame(data=neighbours, columns=["distance", "label"])

    return labeled_distances["label"].mode().iloc[0]


In [53]:
k_nearest(origin, 3, train_inputs, train_labels)


np.float64(5.0)

In [56]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
result = next(kf.split(train_inputs), None)

split_training_inputs = train_inputs.iloc[result[0]]
split_training_labels = train_labels.iloc[result[0]]
split_test_inputs = train_inputs.iloc[result[1]]
split_test_labels = train_labels.iloc[result[1]]

results = []
accuracies = []

split_test_inputs_matrix = split_test_inputs.to_numpy()
split_test_labels_matrix = split_test_labels.to_numpy()

for k in range(1, 31, 1):
    for i, (vector, vector_label) in enumerate(zip(split_test_inputs_matrix, split_test_labels_matrix.flat)):
        r = k_nearest(vector, 1, split_training_inputs, split_training_labels)
        print(f"Returned Label: {r}", f" Expected Label: {vector_label}")
        results.append(float(r) == float(vector_label))

    accuracies.append((k, results.count(True) / len(results)))
    results.clear()

pd.DataFrame(data=accuracies, columns=["k", "accuracy"])

Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 5.0  Expected Label: 6.0
Returned Label: 5.0  Expected Label: 5.0
Returned Label: 

Unnamed: 0,k,accuracy
0,1,0.43
1,2,0.43
2,3,0.43
3,4,0.43
4,5,0.43
5,6,0.43
6,7,0.43
7,8,0.43
8,9,0.43
9,10,0.43
