# K-Nearest Neighbors

The k-nearest neighbors (k-NN) algorithm is a generalization of the nearest
neighbor algorithm in which the class label for an input is decided by a vote
among the k training examples closest to it. That is, the predicted label is
the label that receives the majority of the votes from those k neighbors.

In [None]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the diabetes dataset from 'diabetes.csv' (path is relative to the
# notebook's working directory) into a pandas DataFrame.
dataset = pd.read_csv('diabetes.csv')

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
dataset.head(5)

In [None]:
# Shape of the DataFrame as (number of observations, number of columns).
# The column count includes the 'Outcome' label column, not only the features.
dataset.shape

In [None]:
# Summary statistics per numeric column (count, mean, std, min, quartiles, max)
# to get a feel for the scale and spread of each column.
dataset.describe()

In [None]:
# Inspect the pairwise correlations between columns so that redundant
# (strongly correlated) features can be spotted.
corr = dataset.corr()  # pairwise correlation matrix of the columns
column_names = corr.columns
tick_positions = range(len(column_names))
fig, ax = plt.subplots(figsize=(13, 13))
ax.matshow(corr)  # draw the matrix as a grid colored by correlation value
plt.xticks(tick_positions, column_names)  # label the columns along the x axis
plt.yticks(tick_positions, column_names)  # label the rows along the y axis

In [None]:
# Split the columns into the prediction target and the model inputs:
# 'Outcome' is the label, every remaining column is a feature.
labels = dataset['Outcome']
features = dataset.drop(columns=['Outcome'])


In [None]:
# Hold out 25% of the observations as a test set; the rest is used for training.
# NOTE: no random_state is passed, so the split differs from run to run.
from sklearn.model_selection import train_test_split
(features_train, features_test,
 labels_train, labels_test) = train_test_split(features, labels, test_size=0.25)

In [None]:
# Create scikit-learn's k-nearest neighbors classifier.
# n_neighbors=5 is the library default, spelled out here to make k explicit.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the classifier using the features and labels from the training set.
# A scikit-learn estimator must be fitted before predict() can be called;
# without this step predict() fails on the unfitted classifier.
classifier.fit(features_train, labels_train)

# Get the predicted class labels for the held-out test features.
pred = classifier.predict(features_test)


In [None]:
# Evaluate the model: fraction of test labels that were predicted correctly.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels_test, pred)
print(f'Accuracy: {accuracy}')


### Another Application


In [None]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp

In [None]:
# Load the Iris dataset from 'Iris.csv' (path is relative to the notebook's
# working directory) and preview its first five rows.
dataset = pd.read_csv('Iris.csv')
dataset.head(5)

In [None]:
# Positional column selection: column 0 (the non-informative Id) is skipped,
# columns 1-4 are the features and column 5 (Species) is the target.
X = dataset.iloc[:, 1:5].values # feature matrix, one row per observation
y = dataset.iloc[:, 5].values # Species contains targets for our model

In [None]:
# Encode the species names as integer class codes (0 .. n_classes - 1).
# The fitted encoder is kept in `le` so codes can be mapped back to names.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y) # replace each species name by its integer code


In [None]:
# Hold out 30% of the observations as a test set; the rest is used for training.
# NOTE: no random_state is passed, so the split differs from run to run.
from sklearn.model_selection import train_test_split
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3)


In [None]:
def euclidean_distance(training_set, test_instance):
    """Return the Euclidean distance from ``test_instance`` to every training sample.

    Parameters
    ----------
    training_set : numpy.ndarray
        Training samples, one sample per entry along the first axis
        (typically shape ``(n_samples, n_features)``).
    test_instance : numpy.ndarray
        A single sample with the same shape as one training sample.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples,)`` and dtype ``float64`` whose i-th entry
        is the distance between ``test_instance`` and ``training_set[i]``.
    """
    # number of samples inside training set
    n_samples = training_set.shape[0]
    if n_samples == 0:
        # nothing to compare against; reshape(0, -1) below would be ambiguous
        return np.empty(0, dtype=np.float64)
    # Broadcasting subtracts the test instance from every sample at once;
    # flattening each sample lets one row-wise sum replace the per-sample loop.
    squared_diff = np.square(training_set - test_instance).reshape(n_samples, -1)
    distances = np.sqrt(np.sum(squared_diff, axis=1))
    return distances.astype(np.float64, copy=False)

### Locating Neighbors


In [None]:
class MyKNeighborsClassifier():
    """Vanilla implementation of the k-nearest neighbors (KNN) classifier.

    The model memorizes the training data in ``fit``; ``predict`` labels each
    test sample with the most common label among its ``n_neighbors`` closest
    training samples (Euclidean distance).

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of nearest training samples that vote on each prediction.

    Attributes
    ----------
    classes_ : numpy.ndarray
        Sorted unique class labels seen in ``fit``.
    X, y : array-like
        Training features and labels stored by ``fit``.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Fit the model using X as array of features and y as array of labels.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
        y : numpy.ndarray of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``n_neighbors`` exceeds the number of training samples, or if
            ``X`` and ``y`` contain a different number of samples.
        """
        n_samples = X.shape[0]

        # number of neighbors can't be larger than number of samples
        if self.n_neighbors > n_samples:
            raise ValueError(
                "Number of neighbors can't be larger then number of "
                "samples in training set."
            )

        # X and y need to have the same number of samples
        if X.shape[0] != y.shape[0]:
            raise ValueError("Number of samples in X and y need to be equal.")

        # finding and saving all possible class labels
        self.classes_ = np.unique(y)
        self.X = X
        self.y = y

    def pred_from_neighbors(self, training_set, labels, test_instance, k):
        """Return the majority label among the k training samples nearest to test_instance.

        Ties between labels are resolved in favor of the smallest label,
        because ``np.unique`` returns labels sorted and ``np.argmax`` picks
        the first maximum.
        """
        distances = euclidean_distance(training_set, test_instance)

        # Positions of the k smallest distances. Indexing the labels with them
        # replaces the former `sp.c_[distances, labels]` stacking: `sp.c_` was
        # only a deprecated SciPy alias of NumPy's `c_` (dropped from newer
        # SciPy releases), and stacking forced labels and distances into one
        # common dtype.
        nearest = distances.argsort()[:k]

        # selecting labels associated with the k smallest distances;
        # asarray keeps the lookup positional even for pandas Series labels
        targets = np.asarray(labels)[nearest]
        unique, counts = np.unique(targets, return_counts=True)
        return unique[np.argmax(counts)]

    def predict(self, X_test):
        """Predict a class label for every row of X_test.

        Parameters
        ----------
        X_test : numpy.ndarray of shape (n_predictions, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_predictions,), dtype int
            Predicted labels; the integer dtype means labels are expected to
            be integer-coded (e.g. the output of a label encoder).
        """
        # number of predictions to make; unpacking also enforces a 2-D input
        n_predictions, _ = X_test.shape

        # allocating space for array of predictions
        predictions = np.empty(n_predictions, dtype=int)

        # loop over all observations
        for i in range(n_predictions):
            # calculation of single prediction
            predictions[i] = self.pred_from_neighbors(
                self.X, self.y, X_test[i, :], self.n_neighbors
            )
        return predictions

### Generating Response


In [None]:
# Instantiate the hand-written KNN classifier; each prediction is decided by
# the 3 nearest training samples (k = 3).
my_classifier = MyKNeighborsClassifier(n_neighbors=3)


In [None]:
# Fit the model: the classifier stores the training features and labels
# (raising ValueError if k exceeds the number of training samples).
my_classifier.fit(X_train, y_train)


### Evaluating Accuracy


In [None]:
# Predict a class label for every sample in the held-out test set.
my_y_pred = my_classifier.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Share of correctly predicted test labels, expressed as a percentage
# and reported with two decimal places.
accuracy = accuracy_score(y_test, my_y_pred)*100
print(f'Accuracy: {round(accuracy, 2)} %.')