In [22]:
import numpy as np
from sklearn.datasets import load_iris #testing purposes
from sklearn.neighbors import KNeighborsClassifier

In [23]:
# Load the iris dataset once and unpack features and labels together
# (the previous version called load_iris() twice to fetch the same data).
X, Y = load_iris(return_X_y=True)
print(X.shape)
print(Y.shape)

(150, 4)
(150,)


In [24]:
# Train on the first 125 rows and hold out the final 25 rows for testing.
# NOTE: rows are taken in stored order (no shuffling). The iris labels are
# grouped by class, so every held-out row belongs to class 2.
split_at = 125
train_X, test_X = X[:split_at], X[split_at:]
train_y, test_y = Y[:split_at], Y[split_at:]

In [25]:
test_X.shape  # (m, p): held-out rows x features

(25, 4)

In [26]:
train_X.shape  # n = 125 training rows (the test set has m = 25 rows)

(125, 4)

In [27]:
def predict(train_X,train_y,test_X,k,distance_type):
    '''
    Classify every row of test_X by majority vote among its k nearest
    training rows (k-nearest-neighbours).

    train_X (nxp): design matrix of all training observations
    train_y (n,): class label of each training observation
    test_X (mxp): design matrix of all testing observations
    k: number of neighbors to consider, with 1 <= k <= n
    distance_type: Minkowski distance to use; one of 'manhattan' (order 1),
                   'euclidean' (order 2) or 'cubic' (order 3)

    Returns a list with one predicted label per row of test_X, in row order.

    Ties are resolved deterministically: equidistant training rows are ranked
    by their index in train_X, and when several labels share the highest vote
    count the smallest label wins.

    Raises ValueError if distance_type is not one of the three supported
    names, if k is outside [1, n], or if the input shapes are inconsistent.
    '''
    minkowski_order = {'manhattan': 1, 'euclidean': 2, 'cubic': 3}
    if distance_type not in minkowski_order:
        # Previously an unknown name left every distance at 0 and silently
        # produced arbitrary predictions.
        raise ValueError(
            "distance_type must be one of %s, got %r"
            % (sorted(minkowski_order), distance_type)
        )
    order = minkowski_order[distance_type]

    train_X = np.asarray(train_X, dtype=float)
    test_X = np.asarray(test_X, dtype=float)
    if train_X.ndim != 2 or test_X.ndim != 2:
        raise ValueError("train_X and test_X must both be 2-D (rows x features)")

    n, p = train_X.shape
    if test_X.shape[1] != p:
        raise ValueError(
            "train_X has %d features but test_X has %d" % (p, test_X.shape[1])
        )
    if len(train_y) != n:
        raise ValueError(
            "train_y has %d labels but train_X has %d rows" % (len(train_y), n)
        )
    if not 1 <= k <= n:
        raise ValueError("k must satisfy 1 <= k <= %d, got %r" % (n, k))

    predicted_labels = []
    for test_row in test_X:
        # Distances from this test row to all n training rows in one call.
        distance = np.linalg.norm(train_X - test_row, ord=order, axis=1)
        # A stable sort keeps equidistant training rows in index order.
        nearest = np.argsort(distance, kind='stable')[:k]
        neighbor_labels = [train_y[j] for j in nearest]
        # max() keeps the first maximum it sees, so iterating the candidate
        # labels in sorted order makes vote ties resolve to the smallest label.
        pred_label = max(sorted(set(neighbor_labels)), key=neighbor_labels.count)
        predicted_labels.append(pred_label)

    return predicted_labels

In [28]:
# 5-nearest-neighbour vote using Euclidean distance
pred_labels = predict(train_X, train_y, test_X, k=5, distance_type='euclidean')

In [30]:
# pred_labels holds the predicted labels of the last 25 observations

n_correct = sum(test_y == pred_labels)      # number of matching predictions
accuracy_scratch = n_correct / len(test_y)
accuracy_scratch  # 84% accuracy

0.84

In [31]:
Y # actual labels for all 150 rows (first 125 are used for training, last 25 for testing)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [32]:
# COMPARE WITH SKLEARN KNEIGHBORS

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(train_X, train_y)
# Predict all test rows in one call. The earlier row-by-row loop stored a
# length-1 array into a scalar slot of a preallocated vector, which recent
# NumPy releases deprecate.
pred_vec = neigh.predict(test_X)

accuracy_sk = sum(test_y == pred_vec) / len(test_y)  # 84% accuracy

In [34]:
accuracy_sk  # accuracy of the scikit-learn classifier on the 25 held-out rows

0.84