In [1]:
from copy import deepcopy
import math
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def euclidian_distance(v1, v2, columns):
    """Return the Euclidean distance between ``v1`` and ``v2`` over ``columns``.

    Args:
        v1: Indexable container of numbers (indexed by the entries of ``columns``).
        v2: Indexable container of numbers, indexed the same way as ``v1``.
        columns: Iterable of indices/keys selecting which components to compare.

    Returns:
        The square root of the summed squared differences as a float;
        ``0.0`` when ``columns`` is empty.
    """
    squared_total = 0
    for key in columns:
        delta = v1[key] - v2[key]
        squared_total = squared_total + math.pow(delta, 2)
    return math.sqrt(squared_total)

def manhattan_distance(v1, v2, columns):
    """Return the Manhattan (L1) distance between ``v1`` and ``v2`` over ``columns``.

    Args:
        v1: Indexable container of numbers (indexed by the entries of ``columns``).
        v2: Indexable container of numbers, indexed the same way as ``v1``.
        columns: Iterable of indices/keys selecting which components to compare.

    Returns:
        The sum of absolute component differences; ``0`` when ``columns`` is empty.
    """
    absolute_total = 0
    for key in columns:
        gap = v1[key] - v2[key]
        absolute_total = absolute_total + abs(gap)
    return absolute_total

In [3]:
def min_max_normalize(column):
    """Rescale ``column`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: Object exposing ``min()``, ``max()``, element-wise arithmetic and
            ``astype`` (``run`` passes a pandas Series, one DataFrame column).

    Returns:
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero; instead of dividing 0 by 0 (which yields NaN and would
        poison every distance computed from this column) the result is all
        zeros as floats.
    """
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        # Constant column: it carries no information, so map it to 0.0 rather than NaN.
        return (column - lowest).astype(float)
    return (column - lowest) / span

In [4]:
def calc_accuracy(Y_pred):
    """Return the fraction of entries of the global ``Y_test`` matched by ``Y_pred``.

    Labels are compared position by position over the length of ``Y_test``.

    Args:
        Y_pred: Indexable sequence of predicted labels, at least as long as
            ``Y_test``.

    Returns:
        Number of matching positions divided by ``len(Y_test)``, as a float.
    """
    global Y_test
    total = len(Y_test)
    hits = sum(1 for position in range(total) if Y_test[position] == Y_pred[position])
    return float(hits) / total

def predict(current_features):
    """Label every row of the global ``X_test`` with its nearest training neighbour.

    For each test row, the row of the global ``X_train`` with the smallest
    ``euclidian_distance`` (restricted to ``current_features``) is found and
    the entry of the global ``Y_train`` at the same position is used as the
    prediction. On equal distances the earliest training row wins. If no
    training row has a distance below infinity the prediction is ``0``.

    Args:
        current_features: Column indices passed to ``euclidian_distance``.

    Returns:
        List of predicted labels, one per row of ``X_test``, in order.
    """
    predictions = []
    for query in X_test:
        nearest_label = 0
        nearest_distance = float("inf")
        for position in range(len(X_train)):
            candidate = euclidian_distance(X_train[position], query, current_features)
            # Only a strictly smaller distance replaces the current best.
            if not candidate < nearest_distance:
                continue
            nearest_distance = candidate
            nearest_label = Y_train[position]
        predictions.append(nearest_label)
    return predictions

In [5]:
def get_next_best(best_features):
    """Find the single feature whose addition to ``best_features`` scores highest.

    Every index in ``range(num_of_features)`` that is not already in
    ``best_features`` is appended in turn; the extended subset is scored with
    ``calc_accuracy(predict(subset))``. On equal scores the lowest feature
    index wins.

    Args:
        best_features: Sequence of feature indices selected so far (not mutated).

    Returns:
        ``(features, accuracy)``: the extended feature list and its accuracy.
        When there is no feature left to add, ``([], 0.0)`` is returned.
    """
    global num_of_features
    chosen_subset = None
    chosen_accuracy = 0.0
    for feature in range(num_of_features):
        if feature in best_features:
            continue
        candidate_subset = list(best_features) + [feature]
        candidate_accuracy = calc_accuracy(predict(candidate_subset))
        # Always accept the first candidate: comparing against a 0.0 baseline
        # would discard the whole selection whenever every candidate scores 0.0.
        if chosen_subset is None or candidate_accuracy > chosen_accuracy:
            chosen_subset = candidate_subset
            chosen_accuracy = candidate_accuracy
    if chosen_subset is None:
        return [], 0.0
    return chosen_subset, chosen_accuracy

def forward_selection():
    """Greedy forward search over feature subsets, printing one line per step.

    Starts from an empty feature list and calls ``get_next_best`` once per
    feature (``num_of_features`` times), feeding each result into the next
    call. Each step's index, accuracy and feature list are printed; nothing
    is returned.
    """
    features = []
    for step in range(num_of_features):
        outcome = get_next_best(features)
        features, accuracy = outcome
        report = f"step: {step}, score: {accuracy:.3f}, features: {features}"
        print(report)

In [6]:
def eliminate_worst(last_features):
    """Find the single feature whose removal from ``last_features`` scores highest.

    Every index in ``range(num_of_features)`` that is present in
    ``last_features`` is removed in turn; the reduced subset is scored with
    ``calc_accuracy(predict(subset))``. On equal scores the lowest removed
    feature index wins.

    Args:
        last_features: Sequence of feature indices currently in use (not mutated).

    Returns:
        ``(features, accuracy)``: the reduced feature list and its accuracy.
        When there is nothing to remove, ``([], 0.0)`` is returned.
    """
    global num_of_features
    chosen_subset = None
    chosen_accuracy = 0.0
    for feature in range(num_of_features):
        if feature not in last_features:
            continue
        candidate_subset = list(last_features)
        candidate_subset.remove(feature)
        candidate_accuracy = calc_accuracy(predict(candidate_subset))
        # Always accept the first candidate: comparing against a 0.0 baseline
        # would drop every remaining feature whenever all candidates score 0.0.
        if chosen_subset is None or candidate_accuracy > chosen_accuracy:
            chosen_subset = candidate_subset
            chosen_accuracy = candidate_accuracy
    if chosen_subset is None:
        return [], 0.0
    return chosen_subset, chosen_accuracy

def backward_elimination():
    """Greedy backward search over feature subsets, printing one line per step.

    Step 0 scores the full set ``0 .. num_of_features - 1``. Each of the
    following ``num_of_features - 1`` steps calls ``eliminate_worst`` on the
    previous result. Each step's index, accuracy and feature list are
    printed; nothing is returned.
    """
    features = list(range(num_of_features))
    accuracy = calc_accuracy(predict(features))
    print(f"step: 0, score: {accuracy:.3f}, features: {features}")
    for step in range(num_of_features - 1):
        outcome = eliminate_worst(features)
        features, accuracy = outcome
        report = f"step: {step + 1}, score: {accuracy:.3f}, features: {features}"
        print(report)

In [7]:
def run(input_file, algo, normalization):
    """Load a dataset, split it, and run one feature-search strategy on it.

    The file is read as whitespace-separated values without a header row.
    Column 0 is used as the label (cast to int) and all remaining columns as
    features (cast to float). The data is split 80/20 with a fixed
    ``random_state`` and the results are stored in the module-level globals
    ``X_train``, ``X_test``, ``Y_train``, ``Y_test`` and ``num_of_features``,
    which the search functions read.

    Args:
        input_file: Path of the dataset file.
        algo: ``1`` runs forward selection; any other value runs backward
            elimination.
        normalization: When truthy, every feature column is min-max
            normalized before the split.
    """
    global num_of_features
    global X_train, X_test, Y_train, Y_test
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True (deprecated in pandas 2.2).
    df = pd.read_csv(input_file, sep=r"\s+", header=None)

    if normalization:
        # NOTE(review): scaling uses the whole file, i.e. it happens before the
        # train/test split, so test rows influence the min/max.
        for col in df.columns:
            if col != 0:
                df[col] = min_max_normalize(df[col])

    X = df.iloc[:, 1:].values.astype(float)
    Y = df.iloc[:, 0].values.astype(int)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=2, test_size=0.20)
    # X is a 2-D array, so the column count is available even without indexing a row.
    num_of_features = X.shape[1]

    if algo == 1:
        print("Running forward_selection")
        forward_selection()
    else:
        print("Running backward_elimination")
        backward_elimination()

In [13]:
# Both search strategies (1 = forward, 2 = backward), first with and then
# without normalization; a separator line is printed between consecutive runs.
configurations = [(1, True), (2, True), (1, False), (2, False)]
for position, (algo, normalization) in enumerate(configurations):
    if position > 0:
        print("\n==========================================================================\n")
    run(input_file='dataset/CS205_CalibrationData__1.txt', algo=algo, normalization=normalization)

Running forward_selection
step: 0, score: 0.830, features: [3]
step: 1, score: 0.970, features: [3, 1]
step: 2, score: 0.960, features: [3, 1, 5]
step: 3, score: 0.910, features: [3, 1, 5, 2]
step: 4, score: 0.870, features: [3, 1, 5, 2, 0]
step: 5, score: 0.870, features: [3, 1, 5, 2, 0, 4]


Running backward_elimination
step: 0, score: 0.870, features: [0, 1, 2, 3, 4, 5]
step: 1, score: 0.890, features: [0, 1, 3, 4, 5]
step: 2, score: 0.910, features: [0, 1, 3, 4]
step: 3, score: 0.940, features: [1, 3, 4]
step: 4, score: 0.970, features: [1, 3]
step: 5, score: 0.830, features: [3]


Running forward_selection
step: 0, score: 0.830, features: [3]
step: 1, score: 0.970, features: [3, 1]
step: 2, score: 0.960, features: [3, 1, 4]
step: 3, score: 0.940, features: [3, 1, 4, 2]
step: 4, score: 0.910, features: [3, 1, 4, 2, 0]
step: 5, score: 0.870, features: [3, 1, 4, 2, 0, 5]


Running backward_elimination
step: 0, score: 0.870, features: [0, 1, 2, 3, 4, 5]
step: 1, score: 0.910, features

In [9]:
# Both search strategies (1 = forward, 2 = backward), first with and then
# without normalization; a separator line is printed between consecutive runs.
configurations = [(1, True), (2, True), (1, False), (2, False)]
for position, (algo, normalization) in enumerate(configurations):
    if position > 0:
        print("\n==========================================================================\n")
    run(input_file='dataset/CS205_CalibrationData__2.txt', algo=algo, normalization=normalization)

Running forward_selection
step: 0, score: 0.830, features: [5]
step: 1, score: 0.970, features: [5, 4]
step: 2, score: 0.940, features: [5, 4, 0]
step: 3, score: 0.930, features: [5, 4, 0, 1]
step: 4, score: 0.840, features: [5, 4, 0, 1, 3]
step: 5, score: 0.780, features: [5, 4, 0, 1, 3, 2]


Running backward_elimination
step: 0, score: 0.780, features: [0, 1, 2, 3, 4, 5]
step: 1, score: 0.850, features: [0, 2, 3, 4, 5]
step: 2, score: 0.860, features: [0, 2, 4, 5]
step: 3, score: 0.940, features: [0, 4, 5]
step: 4, score: 0.970, features: [4, 5]
step: 5, score: 0.830, features: [5]


Running forward_selection
step: 0, score: 0.830, features: [5]
step: 1, score: 0.970, features: [5, 4]
step: 2, score: 0.930, features: [5, 4, 1]
step: 3, score: 0.940, features: [5, 4, 1, 0]
step: 4, score: 0.860, features: [5, 4, 1, 0, 3]
step: 5, score: 0.800, features: [5, 4, 1, 0, 3, 2]


Running backward_elimination
step: 0, score: 0.800, features: [0, 1, 2, 3, 4, 5]
step: 1, score: 0.860, features

In [10]:
# Both search strategies (1 = forward, 2 = backward), first with and then
# without normalization; a separator line is printed between consecutive runs.
configurations = [(1, True), (2, True), (1, False), (2, False)]
for position, (algo, normalization) in enumerate(configurations):
    if position > 0:
        print("\n==========================================================================\n")
    run(input_file='dataset/CS205_CalibrationData__3.txt', algo=algo, normalization=normalization)

Running forward_selection
step: 0, score: 0.820, features: [2]
step: 1, score: 0.970, features: [2, 1]
step: 2, score: 0.920, features: [2, 1, 5]
step: 3, score: 0.890, features: [2, 1, 5, 4]
step: 4, score: 0.850, features: [2, 1, 5, 4, 0]
step: 5, score: 0.790, features: [2, 1, 5, 4, 0, 3]


Running backward_elimination
step: 0, score: 0.790, features: [0, 1, 2, 3, 4, 5]
step: 1, score: 0.850, features: [1, 2, 3, 4, 5]
step: 2, score: 0.890, features: [1, 2, 4, 5]
step: 3, score: 0.920, features: [1, 2, 5]
step: 4, score: 0.970, features: [1, 2]
step: 5, score: 0.820, features: [2]


Running forward_selection
step: 0, score: 0.820, features: [2]
step: 1, score: 0.970, features: [2, 1]
step: 2, score: 0.920, features: [2, 1, 5]
step: 3, score: 0.890, features: [2, 1, 5, 0]
step: 4, score: 0.860, features: [2, 1, 5, 0, 4]
step: 5, score: 0.770, features: [2, 1, 5, 0, 4, 3]


Running backward_elimination
step: 0, score: 0.770, features: [0, 1, 2, 3, 4, 5]
step: 1, score: 0.860, features

In [11]:
# Both search strategies (1 = forward, 2 = backward), first with and then
# without normalization; a separator line is printed between consecutive runs.
configurations = [(1, True), (2, True), (1, False), (2, False)]
for position, (algo, normalization) in enumerate(configurations):
    if position > 0:
        print("\n==========================================================================\n")
    run(input_file='dataset/CS205_SP_2022_SMALLtestdata__24.txt', algo=algo, normalization=normalization)

Running forward_selection
step: 0, score: 0.800, features: [1]
step: 1, score: 0.933, features: [1, 4]
step: 2, score: 0.950, features: [1, 4, 0]
step: 3, score: 0.933, features: [1, 4, 0, 2]
step: 4, score: 0.917, features: [1, 4, 0, 2, 8]
step: 5, score: 0.867, features: [1, 4, 0, 2, 8, 5]
step: 6, score: 0.883, features: [1, 4, 0, 2, 8, 5, 3]
step: 7, score: 0.800, features: [1, 4, 0, 2, 8, 5, 3, 6]
step: 8, score: 0.717, features: [1, 4, 0, 2, 8, 5, 3, 6, 7]
step: 9, score: 0.667, features: [1, 4, 0, 2, 8, 5, 3, 6, 7, 9]


Running backward_elimination
step: 0, score: 0.667, features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
step: 1, score: 0.733, features: [0, 1, 2, 3, 4, 5, 7, 8, 9]
step: 2, score: 0.817, features: [0, 1, 2, 3, 4, 7, 8, 9]
step: 3, score: 0.833, features: [1, 2, 3, 4, 7, 8, 9]
step: 4, score: 0.883, features: [1, 2, 3, 4, 7, 8]
step: 5, score: 0.883, features: [1, 2, 3, 4, 7]
step: 6, score: 0.917, features: [1, 2, 4, 7]
step: 7, score: 0.933, features: [1, 2, 4]
step: 8, s

In [12]:
# Both search strategies (1 = forward, 2 = backward), first with and then
# without normalization; a separator line is printed between consecutive runs.
configurations = [(1, True), (2, True), (1, False), (2, False)]
for position, (algo, normalization) in enumerate(configurations):
    if position > 0:
        print("\n==========================================================================\n")
    run(input_file='dataset/CS205_SP_2022_Largetestdata__5.txt', algo=algo, normalization=normalization)

Running forward_selection
step: 0, score: 0.885, features: [14]
step: 1, score: 0.975, features: [14, 36]
step: 2, score: 0.970, features: [14, 36, 33]
step: 3, score: 0.940, features: [14, 36, 33, 26]
step: 4, score: 0.930, features: [14, 36, 33, 26, 23]
step: 5, score: 0.915, features: [14, 36, 33, 26, 23, 12]
step: 6, score: 0.880, features: [14, 36, 33, 26, 23, 12, 7]
step: 7, score: 0.880, features: [14, 36, 33, 26, 23, 12, 7, 5]
step: 8, score: 0.860, features: [14, 36, 33, 26, 23, 12, 7, 5, 18]
step: 9, score: 0.875, features: [14, 36, 33, 26, 23, 12, 7, 5, 18, 9]
step: 10, score: 0.860, features: [14, 36, 33, 26, 23, 12, 7, 5, 18, 9, 32]
step: 11, score: 0.855, features: [14, 36, 33, 26, 23, 12, 7, 5, 18, 9, 32, 35]
step: 12, score: 0.850, features: [14, 36, 33, 26, 23, 12, 7, 5, 18, 9, 32, 35, 19]
step: 13, score: 0.870, features: [14, 36, 33, 26, 23, 12, 7, 5, 18, 9, 32, 35, 19, 38]
step: 14, score: 0.855, features: [14, 36, 33, 26, 23, 12, 7, 5, 18, 9, 32, 35, 19, 38, 22]
st