# Knn Algorithm Implementation Exploration


In [36]:
import numpy as np
import pandas as pd

train_inputs = pd.read_csv("../datasets/train_inputs.csv", header=None)
train_labels = pd.read_csv("../datasets/train_labels.csv", header=None)
test_inputs = pd.read_csv("../datasets/test_inputs.csv", header=None)
test_labels = pd.read_csv("../datasets/test_labels.csv", header=None)

train_inputs = train_inputs.rename(columns=lambda x: f"e{x+1}")
train_labels = train_labels.rename(columns={0: "category"})
test_inputs = test_inputs.rename(columns=lambda x: f"e{x+1}")
test_labels = test_labels.rename(columns={0: "category"})

In [6]:
train_input_matrix = train_inputs.to_numpy()
train_input_matrix

array([[ 0.,  0., 10., ...,  7.,  2.,  2.],
       [ 0.,  0.,  9., ...,  4.,  0., 14.],
       [10., 15., 16., ..., 14., 16.,  1.],
       ...,
       [ 5.,  2.,  0., ...,  1.,  4.,  4.],
       [ 0.,  0.,  5., ...,  0.,  4.,  2.],
       [ 0.,  0.,  2., ...,  0.,  2.,  0.]], shape=(1000, 64))

In [7]:
train_labels_matrix = train_labels.to_numpy()
train_labels_matrix

array([[6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [6.],
       [6.],
       [6.],
       [5.],
       [6.],
       [6.],
       [5.],
       [5.],
       [6.],
       [6.],
       [6.],
       [6.],
       [5.],
       [5.],
       [5.],
       [6.],
       [6.],
       [6.],
       [5.],
       [6.],
       [6.],
       [6.],
       [5.],
       [5.],
       [5.],
       [5.],
       [6.],
       [5.],
       [5.],
       [5.],
       [5.],
       [5.],
       [5.],
       [5.],
       [6.],
       [6.],
       [5.],
       [6.],
       [5.],
       [5.],
       [6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [5.],
       [6.],
       [6.],
       [5.],
       [5.],
       [6.],
       [5.],
       [5.],
       [6.],
       [6.],
       [5.],
       [5.],
       [5.],
       [5.],
       [6.],
       [5.],
       [6.],
       [6.],
       [5.],
       [6.],
       [5.],
       [5.],
       [5.],
       [6.],
       [6.],

In [19]:
origin = np.zeros(64, dtype="float")
distances = np.zeros((len(train_inputs), 2), dtype="float")

distances

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]], shape=(1000, 2))

In [20]:
for i, (v, l) in enumerate(zip(train_input_matrix, train_labels_matrix.flat)):
    dist = np.linalg.norm(v - origin)
    distances[i, 0] = dist
    distances[i, 1] = l

distances

array([[40.37325848,  6.        ],
       [60.29925373,  5.        ],
       [95.23129738,  6.        ],
       ...,
       [51.84592559,  6.        ],
       [38.        ,  6.        ],
       [10.34408043,  5.        ]], shape=(1000, 2))

In [21]:
n = 3
distances.sort(kind="heapsort", axis=0)
k_nearest_neighbors = distances[:n]
k_nearest_neighbors

array([[6.08276253, 5.        ],
       [6.92820323, 5.        ],
       [7.        , 5.        ]])

In [22]:
df = pd.DataFrame(data=k_nearest_neighbors, columns=["distance", "label"])
df

Unnamed: 0,distance,label
0,6.082763,5.0
1,6.928203,5.0
2,7.0,5.0


In [33]:
print(df["label"].mode().iloc[0])

5.0


In [82]:
def k_nearest(target: np.ndarray, k: int, input_table: pd.DataFrame, labels: pd.DataFrame):

    t_input_matrix = input_table.to_numpy()
    t_labels_matrix = labels.to_numpy()
    distance_label = np.zeros((len(input_table), 2), dtype="float")

    for row_index, (row, label) in enumerate(zip(t_input_matrix, t_labels_matrix.flat)):
        distance = np.linalg.norm(row - target)
        distance_label[row_index, 0] = distance
        distance_label[row_index, 1] = label

    distance_label.sort(kind="heapsort", axis=0)
    neighbours = distance_label[:k]
    labeled_distances = pd.DataFrame(data=neighbours, columns=["distance", "label"])

    return labeled_distances["label"].mode().iloc[0]


In [83]:
k_nearest(origin, 3, train_inputs, train_labels)


np.float64(5.0)

In [45]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
result = next(kf.split(train_inputs), None)

split_training_inputs = train_inputs.iloc[result[0]]
split_training_labels = train_labels.iloc[result[0]]
split_test_inputs = train_inputs.iloc[result[1]]
split_test_labels = train_labels.iloc[result[1]]


In [63]:
v = split_test_inputs.to_numpy()[0]
v

array([16., 14., 14., 16., 14., 13., 11.,  6.,  4., 16., 16., 16., 14.,
        3.,  3.,  6.,  1., 10., 15.,  9., 16., 16., 16., 16., 12., 10.,
       16., 14., 16., 16., 16., 16., 12., 16., 16., 15., 15., 14., 16.,
        8.,  6., 12., 16.,  7.,  3., 16., 15.,  7.,  0., 14.,  7., 13.,
       14., 16., 16., 13., 11., 16., 10., 14., 15., 15., 10., 16.])

In [87]:
results = []

for row in split_test_inputs.to_numpy():
    r = k_nearest(row, 3, split_training_inputs, split_training_labels)
    results.append(float(r))

print(results)

[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]
