# In [16]:
import numpy as np
import math
import sys

def read_data():
    """Load the comma-separated records from ``iris.data``.

    Each non-blank line must hold five comma-separated fields: four
    numeric measurements followed by the class name.

    Returns:
        A ``(x, y)`` tuple of float64 arrays. ``x`` holds the four
        measurements of every record; ``y`` holds the class encoded as
        1 for ``Iris-setosa``, 2 for ``Iris-versicolor`` and 3 for
        anything else.

    Raises:
        IOError/OSError: if ``iris.data`` cannot be opened.
        ValueError: if a non-blank line does not have exactly five fields
            or a measurement is not numeric.
    """
    class_ids = {'Iris-setosa': 1, 'Iris-versicolor': 2}
    x = []
    y = []
    with open('iris.data', 'r') as f:
        for line in f:
            # Strip the line ending up front so the class name matches even
            # when the last record has no trailing newline (or the file uses
            # CRLF endings), and so whitespace-only lines are skipped.
            line = line.strip()
            if not line:
                continue
            sepal_l, sepal_w, petal_l, petal_w, iris_class = line.split(',')
            x.append([sepal_l, sepal_w, petal_l, petal_w])
            # Any class other than the two listed above is encoded as 3.
            y.append(class_ids.get(iris_class.strip(), 3))
    return np.array(x).astype('float64'), np.array(y).astype('float64')

def get_distance(x_train, x):
    """Return the Euclidean distance between two 1-D feature vectors.

    Computed as the square root of the dot product of the difference
    vector with itself.
    """
    delta = x_train - x
    squared_length = np.dot(delta, delta)
    return np.sqrt(squared_length)

def get_error(y_train, y):
    """Return the percentage of positions where ``y`` differs from ``y_train``.

    Args:
        y_train: array of reference labels; its first dimension sets how
            many positions are compared. Rows may be scalars or
            single-element arrays.
        y: indexable sequence of predicted labels with at least
            ``y_train.shape[0]`` entries.

    Returns:
        The share of mismatching positions as a float in [0.0, 100.0].

    Raises:
        ZeroDivisionError: if ``y_train`` has no rows.
    """
    total = y_train.shape[0]
    # ``range`` (not ``xrange``) so the function runs on Python 2 and 3 alike.
    incorrect_count = sum(1 for i in range(total) if y_train[i] != y[i])
    return float(incorrect_count) / total * 100.0

def knn(k, x_train, y_train, x):
    """Classify every row of ``x`` by majority vote of its k nearest neighbours.

    Distances are Euclidean. Neighbours at equal distance are ordered by
    label, and a tied vote is resolved in favour of the smallest label.

    Args:
        k: number of neighbours to consult; must be at least 1. Values
            larger than the training set use every training sample.
        x_train: training features, one sample per row.
        y_train: training labels shaped ``(n,)`` or ``(n, 1)``; for wider
            arrays only the first column is used.
        x: samples to classify, one per row.

    Returns:
        A list with one predicted label per row of ``x``.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``x_train`` is empty.
    """
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))
    x_train = np.asarray(x_train)
    n_train = x_train.shape[0]
    if n_train == 0:
        raise ValueError('x_train must contain at least one sample')
    # Flatten the labels to one value per training sample.
    labels = np.asarray(y_train).reshape(n_train, -1)[:, 0]

    y = []
    for query in np.asarray(x):
        # Distances from this query to every training sample in one shot.
        diff = (x_train - query).reshape(n_train, -1)
        dists = np.sqrt((diff ** 2).sum(axis=1))
        # lexsort treats the LAST key as primary: sort by distance, then label.
        order = np.lexsort((labels, dists))
        nearest = labels[order[:k]]
        items, counts = np.unique(nearest, return_counts=True)
        y.append(items[np.argmax(counts)])
    return y
   
def cross_validation(k_fold, k_nn_max, x, y):
    """Pick the neighbour count with the lowest cross-validated error.

    Every ``k`` from 1 to ``k_nn_max`` (inclusive) is scored with
    ``cross_validation_iteration``. When several values reach the same
    error, the largest of them is returned.

    Args:
        k_fold: number of folds handed to ``cross_validation_iteration``.
        k_nn_max: largest neighbour count to try.
        x: training features, one sample per row.
        y: training labels aligned with ``x``.

    Returns:
        The selected neighbour count, or 0 if ``k_nn_max`` is below 1.
    """
    # float('inf') replaces sys.maxint, which does not exist on Python 3.
    best_err = float('inf')
    best_knn = 0
    for k in range(1, k_nn_max + 1):
        error = cross_validation_iteration(k_fold, k, x, y)
        # "<=" deliberately lets a later (larger) k win a tie.
        if error <= best_err:
            best_err = error
            best_knn = k
    return best_knn

def cross_validation_iteration(k_fold, k_nn, x, y):
    """Return the mean k-fold validation error of k-NN for one ``k_nn``.

    The data is cut into ``k_fold`` consecutive folds of
    ``len(x) // k_fold`` samples. Each fold is used once as the
    validation set while the remaining samples form the training set.

    NOTE: when ``len(x)`` is not a multiple of ``k_fold``, the trailing
    remainder samples are always part of the training set and are never
    validated.

    Args:
        k_fold: number of folds.
        k_nn: neighbour count passed to ``knn``.
        x: features, one sample per row.
        y: labels aligned with ``x``.

    Returns:
        The average of the per-fold ``get_error`` values.
    """
    # Floor division keeps the fold size an int on Python 3 too, where "/"
    # would yield a float that cannot be used as a slice bound.
    k_fold_size = len(x) // k_fold
    error = 0.0
    for k_idx in range(k_fold):
        start = k_idx * k_fold_size
        stop = start + k_fold_size

        x_train = np.concatenate((x[:start], x[stop:]))
        y_train = np.concatenate((y[:start], y[stop:]))

        x_validation = x[start:stop]
        y_validation = y[start:stop]

        y_knn = knn(k_nn, x_train, y_train, x_validation)
        error += get_error(y_validation, y_knn)
    return error / k_fold
    
# Load the data set, hold out 10% for testing, choose k by 5-fold
# cross-validation on the remaining 90%, then report the test error.
x, y = read_data()

# Glue the labels onto the features so a single shuffle permutes both together.
c = np.c_[x.reshape(len(x), -1), y.reshape(len(y), -1)]
# NOTE: no random seed is set, so the split and the printed results vary per run.
np.random.shuffle(c)

n_features = x.size // len(x)
n_train = int(0.9 * x.shape[0])

x_train = c[:n_train, :n_features]
# The label slices keep a trailing axis of length 1, i.e. shape (n, 1).
y_train = c[:n_train, n_features:]

x_test = c[n_train:, :n_features]
y_test = c[n_train:, n_features:]

best_k = cross_validation(5, 40, x_train.copy(), y_train.copy())

model = knn(best_k, x_train.copy(), y_train.copy(), x_test.copy())
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(best_k)
print(get_error(y_test, model))

# Sample output from one run (best k, then test error in percent):
# 9
# 0.0
