WORKING MECHANISM OF K NEAREST NEIGHBORS

  1. After loading the data, initialize K to the chosen number of neighbors.
  2. For each sample in the data, calculate the distance between the query example and the current example. Then add the distance and the index of the example to an ordered collection.

  3. Sort the collection of distances and indices by distance, from smallest to largest (ascending order).

  4. Pick the first K entries from the sorted collection.

  5. Get the labels of the selected K entries.

  6. Finally, if the problem is classification, return the mode of the K labels; otherwise (regression), return their mean.

In [8]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

from sklearn.model_selection import train_test_split    # TODO: create a custom k-fold cross validation instead of a single train/test split

# Load the raw iris measurements and their class labels.
iris = load_iris()
features = iris.data
target = iris.target

X = pd.DataFrame(features)
y = pd.Series(target)

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage). The split depends only on the number of rows and the seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits. Row labels and columns are carried over
# so the frames stay aligned with y_train / y_test.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)

In [9]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index,
# so that in every split a row's label coincides with its position.
# drop = True discards the old labels instead of keeping them as a column.
X_train, X_test, y_train, y_test = (
    split_part.reset_index(drop = True)
    for split_part in (X_train, X_test, y_train, y_test)
)

# Custom KNN

In [13]:
import statistics
import collections
import numpy as np
from tqdm import tqdm

class KNN:
  """Brute-force k-nearest-neighbours estimator for classification or regression.

  Every prediction compares the query row against all stored training rows,
  keeps the `n_neighbors` closest ones and aggregates their labels:
  the most frequent label in 'Classification' mode, the arithmetic mean in
  'Regression' mode.

  Parameters
  ----------
  mode : str
      Either 'Classification' or 'Regression' (case-sensitive).
  n_neighbors : int
      Number of nearest training rows consulted per prediction.
  p : int
      Exponent of the Minkowski distance: 1 = Manhattan, 2 = Euclidean.
  """

  def __init__(self, mode = 'Classification', n_neighbors = 5, p = 1):
    """Store the hyper-parameters; they are validated when `fit` is called."""
    self.mode = mode
    self.n_neighbors = n_neighbors
    self.p = p
    # Training data is stored as passed to `fit` and converted to arrays
    # only at prediction time.
    self.X_train = None
    self.y_train = None

  def _validate_params(self):
    """Raise ValueError if mode, p or n_neighbors hold unsupported values."""
    if self.mode not in ('Classification', 'Regression'):
      raise ValueError('Please select a correct mode: Classification or regression')
    if self.p not in (1, 2):
      raise ValueError('Only Manhattan(p=1) or Euclidean(p=2) distances available')
    k = self.n_neighbors
    # bool is a subclass of int, so it is rejected explicitly.
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
      raise ValueError('n_neighbors must be a positive integer')

  def _accuracy(self, target, predictions):
    """Return the fraction of `predictions` that equal `target` element-wise.

    Raises
    ------
    ValueError
        If `target` is empty or the two sequences differ in length.
    """
    total = len(target)
    if total == 0:
      raise ValueError('Cannot compute accuracy on an empty target')
    if len(predictions) != total:
      raise ValueError('target and predictions must have the same length')
    hits = sum(1 for expected, predicted in zip(target, predictions) if predicted == expected)
    return hits / total

  def calculate_distance(self, vectorA, vectorB):
    """Return the Minkowski distance of order `self.p` between two vectors.

    Raises
    ------
    ValueError
        If the two vectors differ in length.
    """
    if len(vectorA) != len(vectorB):
      raise ValueError('vectorA and vectorB must have the same length')
    powered_sum = sum(abs(a - b) ** self.p for a, b in zip(vectorA, vectorB))
    return powered_sum ** (1 / self.p)

  def get_nearest_neighbors(self, test_row, X):
    """Find the `n_neighbors` rows of `X` closest to `test_row`.

    Parameters
    ----------
    test_row : sequence of numbers
        The query point.
    X : 2-D array-like
        Candidate rows, indexable by position (`X[i]` is the i-th row).

    Returns
    -------
    numpy.ndarray of dtype object, shape (k, 3)
        One row per neighbour, nearest first:
        `[list(features), distance, position_in_X]`.
        k is `n_neighbors`, or `len(X)` when fewer rows are available.
    """
    distances = [self.calculate_distance(test_row, X[i]) for i in range(len(X))]
    # sorted() is stable, so equally distant rows keep their original order.
    order = sorted(range(len(X)), key = distances.__getitem__)[:self.n_neighbors]

    # Fill a pre-shaped object array so the result is always (k, 3),
    # including the k == 0 case.
    neighbours = np.empty((len(order), 3), dtype = object)
    for slot, position in enumerate(order):
      neighbours[slot, 0] = list(X[position])
      neighbours[slot, 1] = distances[position]
      neighbours[slot, 2] = position
    return neighbours

  def fit(self, X, y):
    """Validate the hyper-parameters and memorise the training data.

    Parameters
    ----------
    X : 2-D array-like (e.g. DataFrame or ndarray)
        Training features, one row per sample.
    y : 1-D array-like (e.g. Series or ndarray)
        Training labels / targets, aligned with the rows of `X` by position.

    Raises
    ------
    ValueError
        If mode, p or n_neighbors are unsupported, if `X` is empty, or if
        `X` and `y` differ in length.
    """
    # Raise instead of print + quit(): quit() would stop the whole
    # interpreter / kernel rather than report the problem to the caller.
    self._validate_params()
    if len(X) == 0:
      raise ValueError('Cannot fit on an empty training set')
    if len(X) != len(y):
      raise ValueError('X and y must contain the same number of samples')

    self.X_train = X
    self.y_train = y

  def predict(self, X):
    """Predict a label (or regression value) for every row of `X`.

    Parameters
    ----------
    X : 2-D array-like (e.g. DataFrame or ndarray)
        Query rows with the same number of features as the training data.

    Returns
    -------
    list
        One prediction per query row: the most common neighbour label in
        'Classification' mode (ties go to the label seen first among the
        nearest neighbours), or the mean of the neighbour targets in
        'Regression' mode.

    Raises
    ------
    RuntimeError
        If called before `fit`.
    """
    if self.X_train is None or self.y_train is None:
      raise RuntimeError('fit must be called before predict')

    # Use the data stored on this instance, never notebook-level globals,
    # and look labels up by position so the result does not depend on the
    # pandas index of the training data.
    train_matrix = np.asarray(self.X_train)
    train_labels = np.asarray(self.y_train)
    queries = np.asarray(X)

    predictions = []
    for i in tqdm(range(len(queries))):
      neighbours = self.get_nearest_neighbors(queries[i], train_matrix)
      positions = [int(position) for position in neighbours[:, 2]]
      neighbour_labels = list(train_labels[positions])
      if self.mode == 'Classification':
        # Mode of the neighbour labels.
        predictions.append(collections.Counter(neighbour_labels).most_common(1)[0][0])
      else:
        # Regression: arithmetic mean of the neighbour targets.
        predictions.append(sum(neighbour_labels) / len(neighbour_labels))

    return predictions

# Train the hand-written classifier and predict the held-out split.
obj = KNN(mode = 'Classification')
obj.fit(X_train, y_train)
preds = obj.predict(X_test)

# Score both splits with the class's own accuracy helper; the training split
# is predicted here, after the test split, before anything is printed.
train_accuracy = obj._accuracy(y_train.values, obj.predict(X_train))
test_accuracy = obj._accuracy(y_test.values, preds)

print("\nFinal Training accuracy: ", train_accuracy)
print("Final Testing accuracy:, ", test_accuracy)

# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_test, preds))

100%|██████████| 38/38 [00:00<00:00, 424.64it/s]
100%|██████████| 112/112 [00:00<00:00, 417.51it/s]


Final Training accuracy:  0.9553571428571429
Final Testing accuracy:,  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        12

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38






In [14]:
from sklearn.metrics import accuracy_score
# Cross-check the custom accuracy figure with scikit-learn's implementation.
accuracy_score(y_true = y_test, y_pred = preds)

1.0

# Sklearn KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline for a like-for-like comparison: the custom KNN above is built with
# its default p = 1 (Manhattan), whereas KNeighborsClassifier defaults to
# p = 2 (Euclidean), so the metric is set explicitly here.
knn = KNeighborsClassifier(n_neighbors = 5, p = 1)
knn.fit(X_train, y_train)
accuracy_score(y_test, knn.predict(X_test))

1.0