## LS_K_Nearest_Neighbors Class

In [1]:
import numpy as np
import scipy as sp

In [2]:
class LS_K_Nearest_Neighbors():
    """K-nearest-neighbors classifier implemented with NumPy.

    The model stores the training data and classifies a query point by
    looking at the ``n_neighbors`` training points closest to it
    (Euclidean distance) and letting them vote on the class label.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training feature rows.
    y_train : array-like, shape (n_train,)
        Class labels of the training rows. Labels are expected to be
        non-negative integers (they are used as ``np.bincount`` input and
        as column indices of the class-probability matrix).
    n_neighbors : int, default 5
        How many nearest training points take part in each vote.
    weights : {'uniform', 'distance'}, default 'uniform'
        'uniform' gives every neighbor one vote; 'distance' weights each
        neighbor by the inverse of its distance to the query point.
    """

    def __init__(self, X_train, y_train, n_neighbors=5, weights='uniform'):
        # np.asarray leaves ndarrays untouched and lets callers pass lists.
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

        self.n_neighbors = n_neighbors
        self.weights = weights

        # Number of class columns used for distance-weighted voting.
        # Derived from the labels (largest label + 1) instead of being
        # fixed, so data sets with more than three classes are handled.
        if self.y_train.size:
            self.n_classes = int(self.y_train.max()) + 1
        else:
            self.n_classes = 0

    def euclidian_distance(self, a, b):
        """Return the Euclidean distance between ``a`` and every row of ``b``.

        ``a`` and ``b`` are broadcast against each other and the squared
        differences are summed over axis 1, so the broadcast result must be
        two-dimensional (for example one point of shape ``(n_features,)``
        against a matrix of shape ``(n_points, n_features)``).

        Other metrics (e.g. Manhattan distance) could be used instead; only
        the ordering of the distances matters for the neighbor search.
        """
        diff = np.asarray(a) - np.asarray(b)
        return np.sqrt(np.sum(diff ** 2, axis=1))

    def fit(self, X, y):
        """Append each label in ``y`` to the matching row of ``X``.

        X is a list of lists containing data values
        y is a list of classifications

        The rows of ``X`` are modified in place and the same ``X`` object is
        returned. This helper does not touch the training data stored by
        ``__init__``; ``predict`` and ``score`` do not depend on it.
        """
        for i, row in enumerate(X):
            # Add the classification to the list with data values
            row.append(y[i])
        return X

    def kneighbors(self, X_test, return_distance=False):
        """Find the ``n_neighbors`` closest training points of each query.

        For every row of ``X_test`` the distance to all training points is
        measured and the training indices are ordered by that distance.
        Only the ordering matters for picking neighbors, but the indices
        must be kept so the neighbors' labels can be looked up later.

        Parameters
        ----------
        X_test : array-like, shape (n_test, n_features)
            Points whose neighbors are wanted.
        return_distance : bool, default False
            When true, the neighbor distances are returned as well.

        Returns
        -------
        neigh_ind : ndarray, shape (n_test, n_neighbors)
            Indices into the training data, nearest first.
        (dist, neigh_ind) : tuple of ndarrays
            Returned instead when ``return_distance`` is true; ``dist``
            holds the distances matching ``neigh_ind``.
        """
        dist = []
        neigh_ind = []

        for x_test in X_test:
            row = self.euclidian_distance(x_test, self.X_train)
            # A stable sort keeps equally distant points in training order,
            # matching the tie-breaking of Python's built-in sorted().
            nearest = np.argsort(row, kind='stable')[:self.n_neighbors]

            neigh_ind.append(nearest)
            dist.append(row[nearest])

        if return_distance:
            return np.array(dist), np.array(neigh_ind)

        return np.array(neigh_ind)

    def _inverse_distance_weights(self, dist):
        """Turn neighbor distances into per-row weights that sum to one.

        Weights are proportional to ``1 / distance``. If a query coincides
        with one or more training points (distance 0) the inverse is
        infinite, so for such rows the coincident neighbors share the whole
        weight equally and every other neighbor gets weight 0.
        """
        coincident = dist == 0
        rows_with_coincident = coincident.any(axis=1)

        # The infinities produced here are overwritten just below.
        with np.errstate(divide='ignore'):
            inv_dist = 1.0 / dist
        inv_dist[rows_with_coincident] = coincident[rows_with_coincident]

        return inv_dist / np.sum(inv_dist, axis=1)[:, np.newaxis]

    def predict(self, X_test):
        """Predict a class label for every row of ``X_test``.

        With ``weights='uniform'`` the most frequent label among the
        neighbors wins (``np.bincount`` + ``np.argmax``; ties go to the
        smallest label). With ``weights='distance'`` each neighbor votes
        with its normalized inverse distance, the votes are summed per
        class, and the class with the largest total wins.

        Returns
        -------
        ndarray, shape (n_test,)
            Predicted labels.

        Raises
        ------
        ValueError
            If ``self.weights`` is neither 'uniform' nor 'distance'.
        """
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(
                "weights must be 'uniform' or 'distance', got %r"
                % (self.weights,)
            )

        if self.weights == 'uniform':
            neighbors = self.kneighbors(X_test)
            return np.array([
                np.argmax(np.bincount(self.y_train[neighbor]))
                for neighbor in neighbors
            ])

        dist, neigh_ind = self.kneighbors(X_test, return_distance=True)
        if len(dist) == 0:
            # No query points: nothing to predict.
            return np.array([])

        vote_weights = self._inverse_distance_weights(dist)
        neighbor_labels = self.y_train[neigh_ind]

        # predict_proba[i, k] is the total weight of class k among the
        # neighbors of query i.
        predict_proba = np.zeros((len(dist), self.n_classes))
        for k in range(self.n_classes):
            predict_proba[:, k] = np.sum(
                np.where(neighbor_labels == k, vote_weights, 0.0), axis=1
            )

        return np.argmax(predict_proba, axis=1)

    def score(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against ``y_test``.

        The result is the fraction of predictions equal to the true label.
        An empty ``y_test`` raises ``ZeroDivisionError``.
        """
        y_pred = self.predict(X_test)
        n_correct = np.count_nonzero(y_pred == np.asarray(y_test))

        return float(n_correct) / float(len(y_test))

## Testing the LS_K_Nearest_Neighbors Class

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

### Breast Cancer Dataset

In [4]:
from sklearn.datasets import load_breast_cancer
# Compare the custom classifier with scikit-learn's on the breast cancer data.
bc_dataset = load_breast_cancer()

X = bc_dataset.data
y = bc_dataset.target

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed on the full data set before the
# train/test split, so the test rows influence the scaling.
mu = X.mean(axis=0)
sigma = X.std(axis=0)
X = (X - mu) / sigma

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=45
)

# Both models use the same three-neighbor setting on the same training data.
LSKNN_classifier = LS_K_Nearest_Neighbors(X_train, y_train, n_neighbors=3)
sklearn_classifier = KNeighborsClassifier(n_neighbors=3)
sklearn_classifier.fit(X_train, y_train)

LSKNN_accuracy = LSKNN_classifier.score(X_test, y_test)
sklearn_accuracy = sklearn_classifier.score(X_test, y_test)

# One-row table with the two accuracies side by side (shown as cell output).
pd.DataFrame(
    data=[[LSKNN_accuracy, sklearn_accuracy]],
    index=['Breast Cancer Accuracy'],
    columns=['LSKNN Implementation', "Sklearn's Implementation"],
)

Unnamed: 0,LSKNN Implementation,Sklearn's Implementation
Breast Cancer Accuracy,0.964912,0.964912


### Iris Dataset

In [5]:
from sklearn.datasets import load_iris
# Compare the custom classifier with scikit-learn's on the iris data.
iris_dataset = load_iris()

X = iris_dataset.data
y = iris_dataset.target

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed on the full data set before the
# train/test split, so the test rows influence the scaling.
mu = X.mean(axis=0)
sigma = X.std(axis=0)
X = (X - mu) / sigma

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=45
)

# Both models use the same three-neighbor setting on the same training data.
LSKNN_classifier = LS_K_Nearest_Neighbors(X_train, y_train, n_neighbors=3)
sklearn_classifier = KNeighborsClassifier(n_neighbors=3)
sklearn_classifier.fit(X_train, y_train)

LSKNN_accuracy = LSKNN_classifier.score(X_test, y_test)
sklearn_accuracy = sklearn_classifier.score(X_test, y_test)

# One-row table with the two accuracies side by side (shown as cell output).
pd.DataFrame(
    data=[[LSKNN_accuracy, sklearn_accuracy]],
    index=['Iris Accuracy'],
    columns=['LSKNN Implementation', "Sklearn's Implementation"],
)

Unnamed: 0,LSKNN Implementation,Sklearn's Implementation
Iris Accuracy,0.955556,0.955556


### Wine Dataset

In [6]:
from sklearn.datasets import load_wine
# Compare the custom classifier with scikit-learn's on the wine data.
wine_dataset = load_wine()

X = wine_dataset.data
y = wine_dataset.target

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed on the full data set before the
# train/test split, so the test rows influence the scaling.
mu = X.mean(axis=0)
sigma = X.std(axis=0)
X = (X - mu) / sigma

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=45
)

# Both models use the same three-neighbor setting on the same training data.
LSKNN_classifier = LS_K_Nearest_Neighbors(X_train, y_train, n_neighbors=3)
sklearn_classifier = KNeighborsClassifier(n_neighbors=3)
sklearn_classifier.fit(X_train, y_train)

LSKNN_accuracy = LSKNN_classifier.score(X_test, y_test)
sklearn_accuracy = sklearn_classifier.score(X_test, y_test)

# One-row table with the two accuracies side by side (shown as cell output).
pd.DataFrame(
    data=[[LSKNN_accuracy, sklearn_accuracy]],
    index=['Wine Accuracy'],
    columns=['LSKNN Implementation', "Sklearn's Implementation"],
)

Unnamed: 0,LSKNN Implementation,Sklearn's Implementation
Wine Accuracy,0.981481,0.981481


Blog Site: https://rctom168.github.io/2015-02-26-flake-it-till-you-make-it/

* When I make a new blog post it does not appear for some reason.
* When I rename this blog post, it disappears. 
* So unfortunately, the best that I can do is update this blog post to reflect this project, but the name can't be changed.
* 2020-06-26 LS_K_Nearest_Neighbors Algorithm.md