In [1]:
import numpy as np
import math
import random

In [2]:
def means(features):
    """Return the arithmetic mean of every column of a row-major table.

    Parameters
    ----------
    features : iterable of iterables of numbers
        One inner iterable per row. Columns are formed with ``zip``, so if
        rows have different lengths only the leading columns shared by all
        rows are averaged.

    Returns
    -------
    list
        One mean per column, in column order. An empty table yields ``[]``.
    """
    # zip(*features) transposes rows into columns; with no rows it yields
    # nothing, so the empty case needs no special handling.
    return [sum(column) / len(column) for column in zip(*features)]

In [3]:
def stds(features, means_):
    """Compute the sample standard deviation of each column.

    Parameters
    ----------
    features : sequence of sequences of numbers
        Row-major table; the column count is taken from the first row.
    means_ : sequence of numbers
        Mean of each column, indexed by column position.

    Returns
    -------
    list of float
        One standard deviation per column, using the ``n - 1`` denominator.

    Raises
    ------
    IndexError
        If ``features`` is empty.
    ZeroDivisionError
        If ``features`` has exactly one row (the ``n - 1`` denominator is 0).
    """
    width = len(features[0])
    dof = len(features) - 1  # sample (n - 1) denominator

    def column_std(col):
        # Accumulate squared deviations from the column mean, row by row.
        center = means_[col]
        total = 0
        for row in features:
            total += (row[col] - center) ** 2
        return math.sqrt(total / dof)

    return [column_std(col) for col in range(width)]

In [4]:
def normalize(features):
    """Standardise every column of ``features`` in place (z-score) and return it.

    Each value becomes ``(value - column_mean) / column_std``, where the
    statistics come from ``means`` and ``stds``.

    Parameters
    ----------
    features : list of lists of numbers
        Row-major table. The rows are MUTATED: every cell is overwritten with
        its standardised value.

    Returns
    -------
    list of lists
        The same ``features`` object that was passed in.

    Notes
    -----
    * A column whose standard deviation is 0 (all values identical) is
      divided by 1 instead, so its cells become ``0.0`` rather than raising
      ``ZeroDivisionError``.
    * An empty table is returned unchanged.
    * ``stds`` divides by ``len(features) - 1``, so a single-row table still
      raises ``ZeroDivisionError`` there.
    """
    if len(features) == 0:
        return features

    means_ = means(features)
    stds_ = stds(features, means_)
    # Zero spread means there is nothing to rescale; a divisor of 1 keeps the
    # centred value (0.0) instead of dividing by zero.
    scales = [std if std != 0 else 1.0 for std in stds_]

    for row in features:
        for j, (mean, scale) in enumerate(zip(means_, scales)):
            row[j] = (row[j] - mean) / scale
    return features

In [5]:
def euclidean_distance(sample1, sample2, feats):
    """Euclidean distance between two samples over selected positions.

    Parameters
    ----------
    sample1, sample2 : indexable
        The two samples; only the positions listed in ``feats`` are read.
    feats : iterable
        Indices (or keys) of the components that take part in the distance.

    Returns
    -------
    float
        Square root of the summed squared differences; ``0.0`` when ``feats``
        is empty.
    """
    squared_total = 0
    for idx in feats:
        gap = sample1[idx] - sample2[idx]
        squared_total += math.pow(gap, 2)
    return math.sqrt(squared_total)

In [6]:
def nearest_N(train, test, feats):
    """Return the label of the training sample closest to ``test``.

    Distance is ``euclidean_distance`` restricted to the positions in
    ``feats``; the label is read from index 0 of the winning sample.

    Parameters
    ----------
    train : iterable of indexable samples
        Candidate neighbours; element 0 of each sample is its label.
    test : indexable
        The sample to classify.
    feats : iterable
        Positions compared by ``euclidean_distance``. It is iterated once per
        training sample, so it must be re-iterable (e.g. a list).

    Returns
    -------
    object
        ``sample[0]`` of the nearest training sample. On ties the sample that
        appears first in ``train`` wins.

    Raises
    ------
    IndexError
        If ``train`` is empty.
    """
    # A single min() pass finds the nearest sample without building and
    # sorting a full distance list; min() keeps the first of equal keys.
    nearest = min(
        train,
        key=lambda sample: euclidean_distance(sample, test, feats),
        default=None,
    )
    if nearest is None:
        raise IndexError("nearest_N() requires a non-empty training set")
    return nearest[0]

In [7]:
def accuracy(dataset, features, k):
    """Estimate nearest-neighbour accuracy with k-fold cross-validation.

    The dataset is cut into ``k`` contiguous folds in its given order (no
    shuffling happens here). Each fold is classified with ``nearest_N``
    against the remaining rows, and a prediction counts as correct when it
    equals element 0 of the tested row.

    Parameters
    ----------
    dataset : list of indexable samples
        Must support slicing and ``+`` concatenation; element 0 of each
        sample is its label.
    features : iterable
        Positions within a sample that are used for the distance.
    k : int
        Number of folds, with ``2 <= k <= len(dataset)``.

    Returns
    -------
    float
        Fraction of samples whose predicted label matches their own.

    Raises
    ------
    ValueError
        If ``k`` is outside ``2 .. len(dataset)`` (which also covers an empty
        dataset); such values would leave a fold without test or training
        rows.
    """
    total = len(dataset)
    if k < 2 or k > total:
        raise ValueError(
            "k must be between 2 and len(dataset) (%d), got %r" % (total, k)
        )

    curr_feats = list(features)
    # The first `extra` folds take one additional row so that every sample is
    # tested exactly once even when len(dataset) is not a multiple of k.
    base, extra = divmod(total, k)
    count = 0
    start = 0
    for i in range(k):
        stop = start + base + (1 if i < extra else 0)
        test = dataset[start:stop]
        train = dataset[:start] + dataset[stop:]
        for t in test:
            if nearest_N(train, t, curr_feats) == t[0]:
                count += 1
        start = stop
    return float(count) / total