# KNN classification

In [2]:
# import only to read Iris dataset
from sklearn import datasets
import numpy as np

In [3]:
# Load the Iris dataset and split it into features (X) and targets (y)
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [4]:
# (optional) Preview the first five feature rows and their targets
X[:5], y[:5]

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]),
 array([0, 0, 0, 0, 0]))

## Algorithm

In [5]:
# Calculate the Euclidean distance between two data points
def calculate_euclidean_distance(data_point_1, data_point_2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    data_point_1, data_point_2 : sequence of numbers
        Coordinates of the two points; both must have the same length.

    Returns
    -------
    numpy float
        Square root of the summed squared coordinate differences
        (``0.0`` when both points are empty).

    Raises
    ------
    ValueError
        If the two points do not have the same number of coordinates.
    """
    # Previously a longer second point was silently truncated and a shorter
    # one failed with an unrelated IndexError; fail loudly and early instead.
    if len(data_point_1) != len(data_point_2):
        raise ValueError(
            "data points must have the same number of coordinates: "
            f"{len(data_point_1)} != {len(data_point_2)}"
        )

    # Accumulate sequentially so results match the original implementation
    # bit-for-bit on equal-length inputs.
    distance = 0.0
    for coord_1, coord_2 in zip(data_point_1, data_point_2):
        distance += (coord_1 - coord_2) ** 2
    return np.sqrt(distance)

In [6]:
# (optional) Sanity check: distance from the first sample to each of the next five
for Xi in X[1:6]:
    print(calculate_euclidean_distance(X[0], Xi))

0.5385164807134502
0.509901951359278
0.648074069840786
0.1414213562373093
0.6164414002968979


In [7]:
# Get k nearest points of a single given point.
def get_k_nearest_points_index(user_point, neighbor_points, k):
    """Find the k points in ``neighbor_points`` closest to ``user_point``.

    Returns a list of ``(index, distance)`` tuples ordered by increasing
    distance. ``index`` is the position inside ``neighbor_points`` (so it is
    relative to whatever slice the caller passed in). Points at equal
    distance keep their original relative order because the sort is stable.
    """
    indexed_distances = [
        (position, calculate_euclidean_distance(user_point, candidate))
        for position, candidate in enumerate(neighbor_points)
    ]
    ranked = sorted(indexed_distances, key=lambda pair: pair[1])
    return ranked[:k]

In [8]:
# (optional) Sanity check: three nearest neighbours of the first sample among the rest
get_k_nearest_points_index(X[0], X[1:], 3)

[(16, 0.09999999999999998), (3, 0.1414213562373093), (38, 0.14142135623730964)]

In [9]:
def predict_for_classification(test_point, train_points, train_labels, k):
    """Predict the class of ``test_point`` by majority vote of its k nearest neighbours.

    Parameters
    ----------
    test_point : sequence of numbers
        The point to classify.
    train_points : sequence of points
        Training points searched for neighbours.
    train_labels : sequence
        Label of each training point, aligned index-for-index with
        ``train_points``.
    k : int
        Number of neighbours that take part in the vote.

    Returns
    -------
    The most frequent label among the k nearest neighbours. When several
    labels are tied, the one belonging to the closest neighbour wins.

    Raises
    ------
    ValueError
        If no neighbours are available (e.g. ``k == 0`` or no training points).
    """
    k_nearest_point_indices = get_k_nearest_points_index(test_point, train_points, k)

    # Vote on the neighbours' *labels*. The previous code voted on the
    # neighbour indices, which are all unique, so it always returned the label
    # of the single nearest neighbour regardless of k.
    k_labels = [train_labels[idx] for idx, _distance in k_nearest_point_indices]
    if not k_labels:
        raise ValueError("cannot classify: no neighbours available (check k and train_points)")

    # max() returns the first maximal element; k_labels is ordered by
    # increasing distance, so ties resolve in favour of the nearest neighbour.
    return max(k_labels, key=k_labels.count)

In [10]:
# (optional) Sanity check: classify the first sample using all other samples as training data
predict_for_classification(X[0], X[1:], y[1:], 3)

0

In [11]:
# True label of the first sample, for comparison with the prediction for X[0]
y[0]

0

# KNN regression

In [13]:
# Load the diabetes dataset; note this rebinds X and y, replacing the Iris arrays
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

In [14]:
# (optional) Preview the first five diabetes feature rows and their targets
X[:5], y[:5]

(array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
         -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
         -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
         -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
        [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
          0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
        [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
          0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]]),
 array([151.,  75., 141., 206., 135.]))

In [15]:
# (optional) Sanity check: distance from the first diabetes sample to each of the next five
for Xi in X[1:6]:
    print(calculate_euclidean_distance(X[0], Xi))

0.2364845537916607
0.06139576832475997
0.20566958779442196
0.17557613073850112
0.252015428632744


In [16]:
# (optional) Sanity check: three nearest neighbours of the first diabetes sample among the rest
get_k_nearest_points_index(X[0], X[1:], 3)

[(50, 0.057814687136138965),
 (1, 0.06139576832475997),
 (340, 0.07172412054855398)]

In [21]:
def predict_for_regression(test_point, train_points, train_labels, k):
    """Predict a target for ``test_point`` as the mean target of its k nearest neighbours.

    ``train_labels`` must be aligned index-for-index with ``train_points``.
    The result is the arithmetic mean of the targets of the k closest
    training points. Raises ``ZeroDivisionError`` when no neighbours are
    found (for example ``k == 0``).
    """
    nearest = get_k_nearest_points_index(test_point, train_points, k)

    # Walk the (index, distance) pairs once, summing the matching targets
    # and counting how many neighbours contributed.
    total = 0.0
    neighbour_count = 0
    for neighbour_idx, _distance in nearest:
        total += train_labels[neighbour_idx]
        neighbour_count += 1
    return total / neighbour_count

In [22]:
# (optional) Sanity check: predict the first sample's target using all other samples as training data
predict_for_regression(X[0], X[1:], y[1:], 3)

209.66666666666666