In [2]:
import numpy as np
import math
import random

In [3]:
def means(features):
    """Return the arithmetic mean of every column of ``features``.

    Parameters
    ----------
    features : sequence of rows, each row a sequence of numbers.

    Returns
    -------
    list
        One mean per column, in column order.  An empty ``features``
        yields an empty list.
    """
    # zip(*features) transposes rows into columns; it stops at the shortest
    # row, so ragged input is silently truncated to the common width.
    return [sum(column) / len(column) for column in zip(*features)]

In [4]:
def stds(features, means_):
    """Return the sample standard deviation of every column of ``features``.

    Parameters
    ----------
    features : sequence of rows; the width is taken from the first row.
    means_ : per-column means, indexed by column.

    Returns
    -------
    list
        One standard deviation per column, using the ``n - 1`` denominator.

    Raises
    ------
    ZeroDivisionError
        If ``features`` holds exactly one row (``n - 1`` is zero).
    IndexError
        If ``features`` is empty.
    """
    column_count = len(features[0])
    row_count = len(features)
    deviations = []
    for col in range(column_count):
        center = means_[col]
        # Accumulate squared deviations row by row, top to bottom.
        squared_total = 0
        for row in features:
            squared_total = squared_total + (row[col] - center) ** 2
        deviations.append(math.sqrt(squared_total / (row_count - 1)))
    return deviations

In [5]:
def normalize(features):
    """Standardize every column of ``features`` in place (z-score).

    Each value becomes ``(value - column_mean) / column_std``, where the
    mean and standard deviation come from ``means`` and ``stds``.

    Parameters
    ----------
    features : mutable sequence of mutable rows of numbers.  The rows are
        overwritten; no copy is made.

    Returns
    -------
    The same ``features`` object that was passed in, after modification.

    Notes
    -----
    * A column whose standard deviation is exactly zero (every value equal
      to the mean) is set to ``0.0`` instead of dividing by zero.
    * An empty ``features`` is returned unchanged.
    * NOTE(review): every column is rescaled, including column 0, which
      other functions in this notebook read as the class label -- confirm
      what callers pass in.
    """
    # len() rather than truthiness so array-like inputs are accepted too.
    if len(features) == 0:
        return features

    means_ = means(features)
    stds_ = stds(features, means_)
    for row in features:
        for j in range(len(stds_)):
            spread = stds_[j]
            if spread == 0:
                # Constant column: every deviation is zero, so 0/0 -> 0.0.
                row[j] = 0.0
            else:
                row[j] = (row[j] - means_[j]) / spread
    return features

In [6]:
def euclidean_distance(sample1, sample2, feats):
    """Return the Euclidean distance between two samples.

    Only the positions listed in ``feats`` contribute to the distance;
    an empty ``feats`` gives ``0.0``.

    Parameters
    ----------
    sample1, sample2 : indexable sequences of numbers.
    feats : iterable of indices to compare.
    """
    squared_total = 0
    for idx in feats:
        gap = sample1[idx] - sample2[idx]
        squared_total = squared_total + math.pow(gap, 2)
    return math.sqrt(squared_total)

In [7]:
def nearest_N(train, test, feats):
    """Return the label of the training sample closest to ``test``.

    Distance is ``euclidean_distance`` restricted to the indices in
    ``feats``; the label is element 0 of the winning training sample.
    When several samples are equally close, the one that appears first
    in ``train`` wins (the sort is stable).

    Raises
    ------
    IndexError
        If ``train`` is empty.
    """
    ranked = [
        [euclidean_distance(candidate, test, feats), candidate[0]]
        for candidate in train
    ]
    ranked.sort(key=lambda pair: pair[0])
    closest = ranked[0]
    return closest[1]

In [8]:
def accuracy(dataset, features, k):
    """Estimate nearest-neighbour accuracy with ``k`` contiguous folds.

    The dataset is cut into ``k`` consecutive folds of
    ``len(dataset) // k`` rows.  Each fold in turn is held out and every
    held-out row is classified with ``nearest_N`` against the remaining
    rows, using only the indices in ``features``.  A prediction counts as
    correct when it equals element 0 of the held-out row.

    Rows beyond ``k * (len(dataset) // k)`` are never held out; they only
    ever appear on the training side.

    Returns
    -------
    float
        Fraction of held-out rows classified correctly.

    Raises
    ------
    ZeroDivisionError
        If ``k`` is 0, or if no row is ever held out
        (``len(dataset) < k``).
    """
    feats = features[:]
    fold_size = len(dataset) // k
    hits = 0
    tested = 0
    for fold in range(k):
        start = fold * fold_size
        stop = (fold + 1) * fold_size
        held_out = dataset[start:stop]
        remainder = dataset[:start] + dataset[stop:]
        for sample in held_out:
            predicted = nearest_N(remainder, sample, feats)
            tested += 1
            if predicted == sample[0]:
                hits += 1
    return float(hits) / tested

In [9]:
def forward_selection(dataset, selected_features, final_features, final_acc, k):
    """Greedy forward feature selection scored by ``accuracy``.

    Starting from ``selected_features``, each round tries adding every
    column index in ``1 .. len(dataset[0]) - 1`` that is not yet selected,
    keeps the one with the highest accuracy (lowest index on ties), and
    repeats until the number of selected features equals
    ``len(dataset[0]) - 1``.  The search keeps going even when a round
    does not improve on the best accuracy so far.

    Parameters
    ----------
    dataset : sequence of rows, passed through to ``accuracy``.
    selected_features : list of already selected column indices; it is
        copied, so the caller's list is not modified.
    final_features : list holding the best subset found so far.  It is
        updated IN PLACE and is also the object returned.
    final_acc : accuracy associated with ``final_features`` on entry.
    k : number of folds, passed through to ``accuracy``.

    Returns
    -------
    (final_features, final_acc)
        The best-scoring subset seen during the search and its accuracy.
        Whenever a round beats ``final_acc``, ``final_features`` becomes
        the complete subset that was evaluated, so the two returned values
        always describe the same feature set.
    """
    num_features = len(dataset[0])
    selected_features = selected_features[:]

    # Iterative form of the search: one loop pass per added feature.
    while len(selected_features) != num_features - 1:
        print('selected features:', selected_features)
        print('finalized features:', final_features)
        print('final accuracy:', round(final_acc*100, 1),'%')

        candidates = []
        for n in range(1, num_features):
            if n in selected_features:
                continue
            acc = accuracy(dataset, selected_features + [n], k)
            candidates.append((acc, n))
            print('for feature', n,'=> the accuracy is:', round(acc*100, 1),'%')

        if not candidates:
            # Nothing left to add (e.g. duplicated or out-of-range entries
            # in selected_features); stop instead of failing.
            break

        # max() returns the first maximal entry, i.e. the lowest feature
        # index among ties -- same choice as a stable descending sort.
        best_acc, best_feature = max(candidates, key=lambda item: item[0])
        selected_features.append(best_feature)

        if best_acc > final_acc:
            # best_acc was measured on the whole selected subset, so record
            # that whole subset rather than only the newest feature.
            final_features[:] = selected_features
            final_acc = best_acc

    return final_features, final_acc

In [None]:
def backward_elimination_(dataset, selected_features, final_features, final_acc, k):
    """Greedy backward feature elimination scored by ``accuracy_``.

    Starting from ``selected_features``, each round tries removing every
    selected column index in ``1 .. len(dataset[0]) - 1``, drops the one
    whose removal gives the highest accuracy (lowest index on ties), and
    repeats until no selected features remain.  The search keeps going
    even when a round does not improve on the best accuracy so far.

    Parameters
    ----------
    dataset : sequence of rows, passed through to ``accuracy_``.
    selected_features : list of currently selected column indices; it is
        copied, so the caller's list is not modified.
    final_features : list holding the best subset found so far.  It is
        updated IN PLACE and is also the object returned.
    final_acc : accuracy associated with ``final_features`` on entry.
    k : number of folds, passed through to ``accuracy_``.

    Returns
    -------
    (final_features, final_acc)
        The best-scoring subset seen during the search and its accuracy.
        Whenever a round beats ``final_acc``, ``final_features`` becomes
        the complete reduced subset that was evaluated, so the two
        returned values always describe the same feature set.
    """
    if len(selected_features) == 0:
        return final_features, final_acc

    num_features = len(dataset[0])
    selected_features = selected_features[:]

    # Iterative form of the search: one loop pass per removed feature.
    while len(selected_features) != 0:
        print('selected features:', selected_features)
        print('finalized features:', final_features)
        print('final accuracy:', round(final_acc*100, 1),'%')

        candidates = []
        for n in range(1, num_features):
            if n not in selected_features:
                continue
            reduced = selected_features[:]
            reduced.remove(n)
            # NOTE(review): accuracy_ is not defined in the visible part of
            # this notebook -- presumably a variant of accuracy(); confirm
            # it exists before running this cell.
            acc = accuracy_(dataset, reduced, k)
            candidates.append((acc, n))
            print('for feature', n,'=> the accuracy is:', round(acc*100, 1),'%')

        if not candidates:
            # Remaining entries are outside 1..num_features-1; nothing can
            # be evaluated, so stop instead of failing.
            break

        # max() returns the first maximal entry, i.e. the lowest feature
        # index among ties -- same choice as a stable descending sort.
        best_acc, best_feature = max(candidates, key=lambda item: item[0])
        selected_features.remove(best_feature)

        if best_acc > final_acc:
            # best_acc was measured on the whole reduced subset, so record
            # that whole subset rather than removing a single feature.
            final_features[:] = selected_features
            final_acc = best_acc

    return final_features, final_acc