In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_iris

# Load the Iris data set and show the shape of the feature matrix followed by
# the shape of the label vector, one per line.
iris = load_iris()
print(iris.data.shape, iris.target.shape, sep='\n')

(150, 4)
(150,)


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 4-nearest-neighbours classifier and fit it on the training split.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)

# Predicted labels for the held-out samples.
pred = knn.predict(X_test)

In [5]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare the held-out labels with the classifier's predictions.
cn = confusion_matrix(y_true=y_test, y_pred=pred)
print(cn)
print(classification_report(y_true=y_test, y_pred=pred))

[[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



### KNN from scratch

In [9]:
import math

def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring the last column.

    Only the first ``len(row1) - 1`` positions of each row take part in the
    computation; the final element of ``row1`` is skipped (the rows used in
    this notebook carry the class label there).

    Args:
        row1: Indexable sequence of numbers; its length fixes how many
            positions are compared.
        row2: Indexable sequence of numbers with at least ``len(row1) - 1``
            elements.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    n_compared = len(row1) - 1
    squared_total = 0.0
    # Plain running total (not sum()) so the floating-point accumulation
    # happens strictly left to right.
    for idx in range(n_compared):
        delta = row1[idx] - row2[idx]
        squared_total += delta ** 2
    return math.sqrt(squared_total)

In [36]:
# Feature columns plus the class label appended as the last column.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Plain 2-D array view of the frame: one row per sample, label in the last slot.
dataset = df.to_numpy()
print(dataset[:5])

# Distance from the first sample to each of the first five samples
# (the first value is its distance to itself).
row0 = dataset[0]
for row in dataset[:5]:
    distance = euclidean_distance(row0, row)
    print(distance)

[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]]
0.0
0.5385164807134502
0.509901951359278
0.648074069840786
0.1414213562373093


In [13]:
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    Distances are computed with ``euclidean_distance`` and the rows are
    ranked in ascending order of distance; rows at equal distance keep their
    original order in ``train``.

    Args:
        train: Iterable of rows to search.
        test_row: The query row.
        num_neighbors: How many of the closest rows to return.

    Returns:
        A list of rows from ``train``, closest first.

    Raises:
        IndexError: if ``num_neighbors`` exceeds the number of rows in ``train``.
    """
    scored = [
        (candidate, euclidean_distance(test_row, candidate))
        for candidate in train
    ]
    # sorted() is stable, so ties keep the order in which rows appear in train.
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(num_neighbors)]

# Show the three rows of the data set that lie closest to its first row.
# NOTE(review): the saved output for this cell lists class-2 rows, which
# cannot be the nearest neighbours of dataset[0] (a class-0 row that is at
# distance 0 from itself) -- the output looks stale; re-run after a kernel
# restart to confirm.
neighbors = get_neighbors(train=dataset, test_row=dataset[0], num_neighbors=3)
for neighbor in neighbors:
    print(neighbor)

[6.3 3.3 6.  2.5 2. ]
[6.5 3.  5.8 2.2 2. ]
[6.3 2.9 5.6 1.8 2. ]


In [17]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the label of ``test_row`` by majority vote of its nearest neighbours.

    The label is read from the last element of each neighbouring row. When
    several labels receive the same number of votes, the label of the closest
    neighbour among the tied ones wins, so the result does not depend on set
    iteration order.

    Args:
        train: Iterable of rows whose last element is the class label.
        test_row: The row to classify.
        num_neighbors: Number of nearest rows that take part in the vote.

    Returns:
        The winning label, exactly as stored in the neighbouring row.

    Raises:
        ValueError: if the vote is empty (``num_neighbors`` is 0).
    """
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    # get_neighbors returns rows closest-first, and max() returns the first
    # maximal item it meets; scanning the list itself (rather than a set of it)
    # therefore breaks ties in favour of the nearest neighbour instead of by
    # hash order.
    prediction = max(output_values, key=output_values.count)
    return prediction

In [30]:
# Classify every row with its 3 nearest neighbours and print the stored label
# next to the predicted one.
for row in dataset:
    prediction = predict_classification(train=dataset, test_row=row, num_neighbors=3)
    print('Expected {}, Got {}'.format(row[-1], prediction))

Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0.0, Got 0.0
Expected 0

In [39]:
# 1 where the 3-NN prediction matches the stored label, 0 otherwise.
# NOTE(review): every row is classified against the full data set, which
# contains the row itself, so this is accuracy on the training data rather
# than a held-out estimate.
pred = [
    1 if row[-1] == predict_classification(dataset, row, 3) else 0
    for row in dataset
]
print('Score: {}'.format(pred.count(1) / len(pred)))

Score: 0.96
