In [188]:
#Import data
# Data file loaded by the import_data(input_file) call at the end of this cell.
input_file = "CS170_Small_Data__96.txt"

# Rows loaded by import_data(); each row is a (feature values, class label) pair.
# This is module-level state: import_data() empties and refills it in place.
input_data: list[tuple[list[float], int]] = []
# Positions of the feature list and of the class label inside each row tuple.
FEATURES = 0
CLASS = 1

def import_data(file: str):
    """Load a data file into the module-level ``input_data`` list.

    Every non-blank line is parsed as whitespace-separated numbers: the first
    number is the class label and the remaining numbers are the feature
    values. Each row is stored as a ``(features, label)`` tuple, matching the
    ``FEATURES`` / ``CLASS`` index constants.

    ``input_data`` is emptied first, so calling this again replaces the
    previous contents instead of appending to them.

    Args:
        file: Path of the text file to read.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token on a line cannot be parsed as a number.
    """
    input_data.clear()
    with open(file, 'r') as f:
        for line in f:
            # split() without an argument splits on any run of whitespace
            # (spaces or tabs) and never yields empty tokens.
            values = [float(token) for token in line.split()]
            if not values:
                # Blank line (for example a trailing newline at the end of
                # the file): there is no label to read, so skip it.
                continue
            # The label is written in float notation in the file; store it as
            # an int so the rows agree with the declared type of input_data.
            # int() truncates, which is exact for whole-number labels.
            input_data.append((values[1:], int(values[0])))

import_data(input_file)
print(input_data[0])

([-0.084975892, 1.4136573, 0.95165009, -0.6215477, -0.19397639, 0.19818832], 2.0)


In [189]:
import math

#Euclidean distance
#p1 and p2 must have the same dimensionality
def dist(p1: list[float], p2: list[float]) -> float:
    """Return the Euclidean distance between two points.

    Coordinates of ``p1`` are paired by index with those of ``p2``, so ``p2``
    must have at least as many entries as ``p1``.
    """
    squared_total = 0
    for axis, coordinate in enumerate(p1):
        # Accumulate the squared difference along each axis.
        squared_total += math.pow(coordinate - p2[axis], 2)
    return math.sqrt(squared_total)

print("example euclidean distance: ", dist([1.0, 2.0], [2.0, 1.0]))

example euclidean distance:  1.4142135623730951


In [190]:
#Number of nearest neighbours (K) passed to KNN by the code below; hardcoded to 7
K = 7

#K nearest neighbors
# returns the Classification (1 or 2)
def KNN(p: list[float], data: list[(list[float], int)], k: int) -> int:
    """Classify ``p`` by majority vote among its ``k`` nearest rows of ``data``.

    Args:
        p: Feature values of the point to classify.
        data: Labelled rows, each a ``(features, label)`` tuple.
        k: Number of nearest rows that take part in the vote.

    Returns:
        1 when strictly more than half of the voters are class 1, otherwise 2.
    """
    # Pair every row's distance to p with that row's class label.
    neighbours = [(dist(p, row[FEATURES]), row[CLASS]) for row in data]
    # Tuples sort by distance first; equal distances fall back to the label.
    nearest = sorted(neighbours)[:k]

    # The vote is hard coded to this data: classes are 1 and 2, and any
    # neighbour that is not class 1 is counted as class 2.
    votes_for_1 = 0
    for neighbour in nearest:
        if neighbour[CLASS] == 1:
            votes_for_1 += 1
    votes_for_2 = len(nearest) - votes_for_1

    # A tie goes to class 2.
    return 1 if votes_for_1 > votes_for_2 else 2

print("example classification:")
KNN(input_data[0][FEATURES], input_data[1:], K)


example classification:


1

In [191]:
import copy

#Accuracy estimates for the KNN classifier: an 80/20 hold-out split and leave-one-out
def classify8020(data: list[(list[float], int)]):
    """Estimate KNN accuracy with a fixed 20% / 80% hold-out split.

    The first 20% of ``data`` (rounded down) is the test set and the rest is
    the training set. Each test row is classified with ``KNN`` using the
    module-level ``K``.

    Args:
        data: Labelled rows, each a ``(features, label)`` tuple.

    Returns:
        The fraction of test rows whose predicted class equals their stored
        class. Returns 0.0 when the test set is empty, which happens for
        fewer than five rows, instead of dividing by zero.
    """
    split = int(len(data) * 0.2)
    test = data[:split]
    train = data[split:]

    if not test:
        # Nothing to evaluate: report 0.0 rather than raise ZeroDivisionError.
        return 0.0

    num_correct = 0
    for row in test:
        if KNN(row[FEATURES], train, K) == row[CLASS]:
            num_correct += 1

    return num_correct / len(test)

def classify(data: list[(list[float], int)]):
    """Estimate KNN accuracy with leave-one-out validation.

    Each row in turn is classified by ``KNN`` (with the module-level ``K``)
    against every other row of ``data``.

    Args:
        data: Labelled rows, each a ``(features, label)`` tuple.

    Returns:
        The fraction of rows whose predicted class equals their stored class.
        Returns 0.0 for empty ``data`` instead of dividing by zero.
    """
    if not data:
        # Nothing to evaluate: report 0.0 rather than raise ZeroDivisionError.
        return 0.0

    num_correct = 0
    for i, row in enumerate(data):
        # Training set is every row except the one being classified.
        train = data[:i] + data[i + 1:]
        if KNN(row[FEATURES], train, K) == row[CLASS]:
            num_correct += 1

    return num_correct / len(data)

#construct data given the feature numbers (1-based positions in each row's feature list)
def construct_data(features: list[int], data: list[tuple[list[float], int]]) -> list[tuple[list[float], int]]:
    """Project every row of ``data`` onto the selected features.

    Args:
        features: 1-based feature numbers, in the order the values should
            appear in the output rows.
        data: Labelled rows, each a ``(features, label)`` tuple.

    Returns:
        A new list with one ``(selected values, label)`` tuple per input row.
        The input rows are not modified.

    Raises:
        IndexError: If a feature number is smaller than 1, or larger than the
            number of features in a row.
    """
    for feature in features:
        if feature < 1:
            # Without this check, feature 0 would become index -1 and silently
            # pick the last column instead of failing.
            raise IndexError(f"feature numbers are 1-based; got {feature}")

    return [
        ([row[FEATURES][feature - 1] for feature in features], row[CLASS])
        for row in data
    ]

print("Test classification using all points")
print(classify(input_data))

print("Test classification using features 1, 3, and 6")
print(classify(construct_data([1,3,6], input_data)))


Test classification using all points
0.86
Test classification using features 1, 3, and 6
0.94


In [192]:
import copy

def forward_selection(DEBUG: bool = False):
    """Greedy forward feature selection over the module-level ``input_data``.

    Starting from the empty set, each level tries adding every feature that
    is not yet selected, scores the resulting set with ``classify`` on the
    rows built by ``construct_data``, and keeps the best-scoring addition as
    the base for the next level. There is one level per feature.

    Args:
        DEBUG: When true, print every tested set, the winner of each level,
            and the final result.

    Returns:
        A ``(feature_numbers, accuracy)`` tuple for the best set seen at any
        level. Feature numbers are 1-based. If no tested set scores above 0,
        the result is ``([], 0)``.
    """
    num_features = len(input_data[0][FEATURES])

    best_ever = 0
    best_set_ever = []
    best_so_far = []
    for _ in range(num_features):
        # Snapshot the set chosen by the previous level. Candidates are always
        # built from this snapshot, never from a provisional winner of the
        # level that is still being evaluated.
        base = best_so_far
        #keeps track of best accuracy for this group
        best = 0.0
        level_winner = None
        for feature in range(1, num_features + 1):
            #check to see if this feature exists in this feature set yet
            if feature in base:
                continue

            #copy over and add a new feature
            feature_set = base + [feature]

            data = construct_data(feature_set, input_data)
            accuracy = classify(data)

            if DEBUG:
                print("testing feature(s): ", feature_set, " accuracy: ", accuracy)

            # check if this set was better than any others in its group;
            # the first candidate always becomes the provisional winner so
            # that every level adds exactly one feature
            if level_winner is None or accuracy > best:
                best = accuracy
                level_winner = feature_set
            # check if this set was better than any other set ever
            if accuracy > best_ever:
                best_ever = accuracy
                best_set_ever = feature_set

        if level_winner is not None:
            best_so_far = level_winner

        if DEBUG:
            print("\n Best feature was ", best_so_far, " accuracy: ", best, "\n")

    if DEBUG:
        print("Finished... The best feature set was", best_set_ever, "with an accuracy of", best_ever)

    return (best_set_ever, best_ever)

forward_selection(True)

testing feature(s):  [1]  accuracy:  0.868
testing feature(s):  [2]  accuracy:  0.804
testing feature(s):  [3]  accuracy:  0.81
testing feature(s):  [4]  accuracy:  0.798
testing feature(s):  [5]  accuracy:  0.794
testing feature(s):  [6]  accuracy:  0.768

 Best feature was  [1]  accuracy:  0.868 

testing feature(s):  [1, 2]  accuracy:  0.81
testing feature(s):  [1, 3]  accuracy:  0.894
testing feature(s):  [1, 4]  accuracy:  0.85
testing feature(s):  [1, 5]  accuracy:  0.84
testing feature(s):  [1, 6]  accuracy:  0.964

 Best feature was  [1, 6]  accuracy:  0.964 

testing feature(s):  [1, 6, 2]  accuracy:  0.926
testing feature(s):  [1, 6, 3]  accuracy:  0.94
testing feature(s):  [1, 6, 4]  accuracy:  0.936
testing feature(s):  [1, 6, 5]  accuracy:  0.922

 Best feature was  [1, 6, 3]  accuracy:  0.94 

testing feature(s):  [1, 6, 3, 2]  accuracy:  0.9
testing feature(s):  [1, 6, 3, 4]  accuracy:  0.902
testing feature(s):  [1, 6, 3, 5]  accuracy:  0.898

 Best feature was  [1, 6, 

([1, 6], 0.964)

In [193]:
def backward_selection(DEBUG:bool = False):
    """Greedy backward feature elimination over the module-level ``input_data``.

    Starts from the full feature set (scored with ``classify`` on the raw
    rows). Each level tries dropping every feature of the current set in
    turn, scores the reduced set, and keeps the best-scoring reduction as the
    starting point of the next level.

    Args:
        DEBUG: When true, print every tested set, the winner of each level,
            and the final result.

    Returns:
        A ``(feature_numbers, accuracy)`` tuple for the best set seen at any
        level, including the initial full set. Feature numbers are 1-based.
    """
    # Baseline: all features kept, scored on the unmodified rows.
    best_ever = classify(input_data)
    feature_count = len(input_data[0][FEATURES])
    best_set_ever = [number for number in range(1, feature_count + 1)]
    best_so_far = best_set_ever

    if DEBUG:
        print("testing feature(s): ", best_so_far, " accuracy: ", best_ever)
        print("\n Best feature was ", best_so_far, " accuracy: ", best_ever, "\n")

    # One level per feature that can be removed while leaving at least one.
    for _ in range(feature_count - 1):
        best = 0.0
        # The set this level reduces; best_so_far is rebound inside the loop,
        # so hold on to the starting set separately.
        parent = best_so_far
        for dropped in parent:
            # Candidate set: the parent without the feature being dropped.
            feature_set = [number for number in parent if number != dropped]
            accuracy = classify(construct_data(feature_set, input_data))

            if DEBUG:
                print("testing feature(s): ", feature_set, " accuracy: ", accuracy)

            # Best within this level.
            if accuracy > best:
                best = accuracy
                best_so_far = feature_set
            # Best across all levels.
            if accuracy > best_ever:
                best_ever = accuracy
                best_set_ever = feature_set

        if DEBUG:
            print("\n Best feature was ", best_so_far, " accuracy: ", best, "\n")

    if DEBUG:
        print("Finished... The best feature set was", best_set_ever, "with an accuracy of", best_ever)

    return (best_set_ever, best_ever)

backward_selection(True)

testing feature(s):  [1, 2, 3, 4, 5, 6]  accuracy:  0.86

 Best feature was  [1, 2, 3, 4, 5, 6]  accuracy:  0.86 

testing feature(s):  [2, 3, 4, 5, 6]  accuracy:  0.79
testing feature(s):  [1, 3, 4, 5, 6]  accuracy:  0.876
testing feature(s):  [1, 2, 4, 5, 6]  accuracy:  0.854
testing feature(s):  [1, 2, 3, 5, 6]  accuracy:  0.86
testing feature(s):  [1, 2, 3, 4, 6]  accuracy:  0.88
testing feature(s):  [1, 2, 3, 4, 5]  accuracy:  0.832

 Best feature was  [1, 2, 3, 4, 6]  accuracy:  0.88 

testing feature(s):  [2, 3, 4, 6]  accuracy:  0.782
testing feature(s):  [1, 3, 4, 6]  accuracy:  0.902
testing feature(s):  [1, 2, 4, 6]  accuracy:  0.876
testing feature(s):  [1, 2, 3, 6]  accuracy:  0.9
testing feature(s):  [1, 2, 3, 4]  accuracy:  0.842

 Best feature was  [1, 3, 4, 6]  accuracy:  0.902 

testing feature(s):  [3, 4, 6]  accuracy:  0.796
testing feature(s):  [1, 4, 6]  accuracy:  0.936
testing feature(s):  [1, 3, 6]  accuracy:  0.94
testing feature(s):  [1, 3, 4]  accuracy:  0.8

([1, 6], 0.964)

In [201]:
# Spot checks on other data files. Each entry is:
#   (data file, message printed before the score, 1-based feature numbers,
#    accuracy quoted in the original "#expect" note for that run)
# NOTE: import_data() refills the global input_data, so after this cell it
# holds the rows of the last file listed here.
spot_checks = [
    ("CS170_Large_Data__21.txt", "Test classification using features 37, 36, and 40", [37, 36, 40], 0.947),
    ("CS170_Small_Data__6.txt", "Test classification using features 2, 5, 3", [2, 5, 3], 0.916),
    ("CS170_Large_Data__96.txt", "Test classification using features 21,8,10", [21, 8, 10], 0.947),
    ("CS170_Small_Data__88.txt", "Test classification using features 5,2,1", [5, 2, 1], 0.936),
    ("CS170_Large_Data__6.txt", "Test classification using features 22,1,6", [22, 1, 6], 0.954),
]

for data_file, message, feature_numbers, _expected in spot_checks:
    import_data(data_file)
    print(message)
    print(classify(construct_data(feature_numbers, input_data)))

Test classification using features 37, 36, and 40
0.959
Test classification using features 2, 5, 3
0.918
Test classification using features 21,8,10
0.953
Test classification using features 5,2,1
0.94
Test classification using features 22,1,6
0.956


Test classification using features 2, 5, 3
0.918
