In [11]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

class KNN:
  def __init__(self,k=3,distanceMetric='euclidean'):
    self.k=3
    self.distanceMetric=distanceMetric

  # x1,x2=vector in feature space
  def _euclideanDist(self,x1,x2):
    return np.sqrt(np.sum(x1-x2)**2)

  def _manHattanDist(self,x1,x2):
    return np.sum(np.abs(x1-x2))

  def _minkowskiDist(self,x1,x2):
    return np.sum(np.abs(x1-x2)**self.k)**1/self.k

  def fit(self,X,y):
    # X : array-like Training data
    # y : array-like Target values
    self.x_train=X
    self.y_train=y

  def predict(self,X):
    predicted=[self._predict(x) for x in X]
    return np.array(predicted)

  def _predict(self,x):
    if self.distanceMetric=='euclidean':
      distance_vector = [self._euclideanDist(x, x_train) for x_train in self.x_train]
    elif self.distanceMetric=='manhattan':
      distance_vector = [self._manHattanDist(x, x_train) for x_train in self.x_train]
    elif self.distanceMetric=='minkowski':
      distance_vector = [self._minkowskiDist(x, x_train) for x_train in self.x_train]
    else:
      raise ValueError(f"No metric named {self.distanceMetric}\n Choose from:euclidean manhattan or minkowski")
   # Sort by distance and return indices of the first k neighbors
    k_indices = np.argsort(distance_vector)[:self.k]
    # Extract the labels of the k nearest neighbor training samples
    k_nearest_labels = [self.y_train[i] for i in k_indices]
    # return the most common class label
    most_common = Counter(k_nearest_labels).most_common(1)
    return most_common[0][0]


iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Create KNN classifier
knn = KNN()

# Fit the classifier to the data
knn.fit(X_train_std, y_train)

# Predict the labels of the test set using Euclidean distance
y_pred_euclidean = knn.predict(X_test_std)
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
print(f'Accuracy with Euclidean distance: {accuracy_euclidean:.2%}')

# Now let's test with Manhattan distance
knn_manhattan = KNN(distanceMetric='manhattan')
knn_manhattan.fit(X_train_std, y_train)
y_pred_manhattan = knn_manhattan.predict(X_test_std)
accuracy_manhattan = accuracy_score(y_test, y_pred_manhattan)
print(f'Accuracy with Manhattan distance: {accuracy_manhattan:.2%}')

# Finally, test with Minkowski distance (k=3)
knn_minkowski = KNN(distanceMetric='minkowski')
knn_minkowski.fit(X_train_std, y_train)
y_pred_minkowski = knn_minkowski.predict(X_test_std)
accuracy_minkowski = accuracy_score(y_test, y_pred_minkowski)
print(f'Accuracy with Minkowski distance: {accuracy_minkowski:.2%}')



Accuracy with Euclidean distance: 73.33%
Accuracy with Manhattan distance: 100.00%
Accuracy with Minkowski distance: 100.00%


#How to choose the right Distance Metric?
1. What’s our data like?
  Continuous vs. Categorical: If our data is all about numbers and measurements (continuous data), Euclidean distance is our go-to, because it measures straight lines between points. For data that’s more about categories (like types of fruit, where “apple” and “orange” aren’t on a scale), Hamming distance, which checks if features match, makes more sense.


2. How big is our data?

  When our dataset is really wide (lots of features), traditional ideas of closeness get wonky, and everything starts to seem far apart. Here, reducing dimensions or picking metrics suited for the big stage, like cosine similarity for text, can keep things in perspective.


3. How is our data spread out?

  The way our data is distributed matters. If outliers are a big deal in our dataset, Manhattan distance might be your ally since it doesn’t get as shaken up by extreme values compared to Euclidean distance.


4. Need for speed?
  Some distance metrics are computationally more intensive than others. Metrics like Manhattan distance can be computationally more efficient than Euclidean distance in certain implementations since it lacks the square root operation.

In [10]:
#Using ScikitLearn's Inbuilt library
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the data
knn.fit(X_train_std, y_train)

# Predict the labels of the test set
y_pred = knn.predict(X_test_std)

# Print the accuracy of the classifier
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2%}')

Accuracy: 100.00%
