## Demo K Nearest Neighbor with Iris flower dataset

### 1. Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets

### 2. Load data and show samples

In [2]:
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
print ("Number of classes: %d" %len(np.unique(iris_y)))
print ("Number of data points: %d" %len(iris_y))

X0 = iris_X[iris_y == 0,:]
print ("\nSamples from class 0:\n", X0[:5,:])

X1 = iris_X[iris_y == 1,:]
print ("\nSamples from class 1:\n", X1[:5,:])

X2 = iris_X[iris_y == 2,:]
print ("\nSamples from class 2:\n", X2[:5,:])

Number of classes: 3
Number of data points: 150

Samples from class 0:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

Samples from class 1:
 [[7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]]

Samples from class 2:
 [[6.3 3.3 6.  2.5]
 [5.8 2.7 5.1 1.9]
 [7.1 3.  5.9 2.1]
 [6.3 2.9 5.6 1.8]
 [6.5 3.  5.8 2.2]]


### 3. Split the dataset into training and testing sets

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size = 30)

print ("Training size: %d" %len(y_train))
print ("Test size: %d" %len(y_test))

Training size: 120
Test size: 30


### 4. Train the model with K = 1
Note: p parameter is to determine the method used for calculating the distance. When p = 1, this is equivalent to using manhattan_distance, and euclidean_distance for p = 2.

In [4]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 1, p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print ("Print results for test data points:")
print ("Predicted labels:   ", y_pred[0:30])
print ("Actual test labels: ", y_test[0:30])

Print results for test data points:
Predicted labels:    [1 2 2 2 2 0 0 2 2 1 1 0 2 0 2 2 0 2 0 1 0 0 2 0 0 0 1 0 2 0]
Actual test labels:  [1 1 2 2 2 0 0 2 2 1 1 0 2 0 2 1 0 1 0 1 0 0 2 0 0 0 1 0 2 0]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


#### Calculate accuracy of 1NN model

In [5]:
from sklearn.metrics import accuracy_score
print ("Accuracy of 1NN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of 1NN: 90.00 %


### 5. Train the model with K = 10

In [6]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 10, p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Calculate accuracy
print ("Accuracy of 10NN: %.2f %%" %(100*accuracy_score(y_test, y_pred)))

Accuracy of 10NN: 96.67 %


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### 6. Improve the model with distance weights
weights parameter is set to 'uniform' as default. However, to improve the model, instead of treating every points equally, we weights the points based on the distance

In [11]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 3, p = 2, weights = 'distance')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

print ("Accuracy of 3NN (1/distance weights) on test data: %.3f %%" %(100*accuracy_score(y_test, y_pred)))
print ("Accuracy of 3NN (1/distance weights) on train data: %.3f %%" %(100*accuracy_score(y_train, y_pred_train)))

Accuracy of 3NN (1/distance weights): 90.000 %
Accuracy of 3NN (1/distance weights): 100.000 %


In [17]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 5, p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

print ("Accuracy of 5NN on test data: %.3f %%" %(100*accuracy_score(y_test, y_pred)))
print ("Accuracy of 5NN on train data: %.3f %%" %(100*accuracy_score(y_train, y_pred_train)))

Accuracy of 5NN (1/distance weights): 90.000 %
Accuracy of 5NN (1/distance weights): 99.167 %


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [16]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 10, p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

print ("Accuracy of 10NN on test data: %.3f %%" %(100*accuracy_score(y_test, y_pred)))
print ("Accuracy of 10NN on train data: %.3f %%" %(100*accuracy_score(y_train, y_pred_train)))

Accuracy of 10NN (1/distance weights): 96.667 %
Accuracy of 10NN (1/distance weights): 99.167 %


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [15]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 1, p = 2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

print ("Accuracy of 1NN on test data: %.3f %%" %(100*accuracy_score(y_test, y_pred)))
print ("Accuracy of 1NN on train data: %.3f %%" %(100*accuracy_score(y_train, y_pred_train)))

Accuracy of 1NN (1/distance weights): 90.000 %
Accuracy of 1NN (1/distance weights): 100.000 %


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
