In [95]:
import numpy as np
# Load both datasets and min-max normalize their feature columns.
# The paths are relative, so the files must sit in the current working directory.
# NOTE(review): `normalize` is defined in a later cell of this notebook, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
small_file_path = 'CS205_small_Data__3.txt'
large_file_path = 'CS205_large_Data__32.txt'
small = normalize(np.loadtxt(small_file_path))
large = normalize(np.loadtxt(large_file_path))
# Sanity check: (rows, columns) of each array; column 0 is the class label.
print(small.shape)
print(large.shape)

(500, 13)
(5000, 51)


In [96]:
def normalize(data):
    """Min-max scale every feature column of ``data`` into the range [0, 1].

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose first column is the class label and whose remaining
        columns are features.

    Returns
    -------
    numpy.ndarray
        Array of the same shape: the label column is copied unchanged and each
        feature column is rescaled as ``(x - min) / (max - min)``. A column
        whose values are all identical is mapped to 0.0.
    """
    label = data[:, 0]
    features = data[:, 1:]
    features_min = features.min(axis=0)
    features_range = features.max(axis=0) - features_min
    # A constant column has zero range; dividing by it would yield NaN (0/0)
    # and poison every distance computed from that feature. Dividing by 1
    # instead leaves the numerator (all zeros) as the normalized value.
    safe_range = np.where(features_range == 0, 1, features_range)
    normalized_features = (features - features_min) / safe_range
    return np.column_stack((label, normalized_features))


In [102]:


def leave_one_out_cross_validation(data, feature_set):
    """Estimate 1-nearest-neighbour accuracy by leave-one-out cross-validation.

    Each sample is classified with the label of its nearest other sample
    (Euclidean distance over the selected feature columns); the accuracy is
    the fraction of samples whose predicted label equals their own label.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; column 0 holds the class label, the other columns hold
        the features.
    feature_set : list of int
        Column indices of ``data`` to use when measuring distance.

    Returns
    -------
    float
        Fraction of correctly classified samples, between 0.0 and 1.0.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two rows, because a sample then has no
        other sample to be compared with.
    """
    num_samples = data.shape[0]
    if num_samples < 2:
        raise ValueError(
            "leave-one-out cross-validation needs at least 2 samples, "
            f"got {num_samples}"
        )

    labels = data[:, 0]
    # Slice the selected columns once instead of on every distance computation.
    features = data[:, feature_set]
    if features.ndim == 1:
        # A bare integer index yields a 1-D column; keep it two-dimensional.
        features = features.reshape(-1, 1)

    number_correctly_classified = 0
    for i in range(num_samples):
        # Euclidean distance from sample i to every sample in one array operation.
        distances = np.sqrt(np.sum((features - features[i]) ** 2, axis=1))
        # Leave sample i out: it must never be chosen as its own neighbour.
        distances[i] = np.inf
        # argmin returns the first minimum, so ties go to the lowest row index.
        nearest_neighbor_location = np.argmin(distances)
        if labels[nearest_neighbor_location] == labels[i]:
            number_correctly_classified += 1

    return number_correctly_classified / num_samples

def forward_selection(data):
    """Greedy forward feature selection scored by leave-one-out accuracy.

    Starting from an empty feature set, each level tries adding every unused
    feature column, keeps the one that gives the highest accuracy from
    ``leave_one_out_cross_validation``, and repeats until every feature has
    been added. Progress is printed for every candidate.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; column 0 is the class label, columns 1.. are features.

    Returns
    -------
    tuple
        ``(best_feature_set, best_overall_accuracy)``: the feature subset with
        the highest accuracy seen at any level, and that accuracy.
    """
    current_set = []
    best_overall_accuracy = 0
    best_feature_set = []

    for _ in range(data.shape[1] - 1):  # first column is the label, so minus 1
        print(f"Current Feature: {current_set}")
        # Start below any reachable accuracy so that a candidate is always
        # chosen, even when every candidate scores 0.0. Starting at 0 left
        # `feature_to_add` unbound on the first level, or stale from the
        # previous level, which appended the same feature twice.
        best_accuracy = -1.0
        feature_to_add = None

        for k in range(1, data.shape[1]):
            if k in current_set:
                continue
            print(f"--add the feature {k}")
            feature_set = current_set + [k]
            accuracy = leave_one_out_cross_validation(data, feature_set)
            print(f"----Accuracy: {accuracy}")

            # Strict '>' keeps the lowest-numbered feature on ties.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                feature_to_add = k

        current_set.append(feature_to_add)  # commit the best feature of this level
        print(f"Added feature {feature_to_add}")

        if best_accuracy > best_overall_accuracy:
            best_overall_accuracy = best_accuracy
            best_feature_set = current_set.copy()

    return best_feature_set, best_overall_accuracy




In [104]:
def backward_elimination(data):
    """Greedy backward feature elimination scored by leave-one-out accuracy.

    Starting from all feature columns, each level tries removing every
    remaining feature, drops the one whose removal gives the highest accuracy
    from ``leave_one_out_cross_validation``, and repeats until a single
    feature is left. Progress is printed for every candidate.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; column 0 is the class label, columns 1.. are features.

    Returns
    -------
    tuple
        ``(best_feature_set, best_overall_accuracy)``: the feature subset with
        the highest accuracy seen (including the full set), and that accuracy.
    """
    current_set = list(range(1, data.shape[1]))  # first column is the label
    # The full feature set is the baseline every smaller subset has to beat.
    best_overall_accuracy = leave_one_out_cross_validation(data, current_set)
    best_feature_set = current_set.copy()

    # Stop at one remaining feature. Removing the last one would score an
    # empty feature set, where all distances are 0 and the "nearest" neighbour
    # is arbitrary, and that meaningless score could be returned as the best.
    while len(current_set) > 1:
        print(f"Current feature: {current_set}")
        # Start below any reachable accuracy so a feature is always chosen,
        # even if every candidate scores 0.0 (otherwise `feature_to_remove`
        # would be unbound or stale from the previous level).
        best_accuracy = -1.0
        feature_to_remove = None

        for k in current_set:
            features = [f for f in current_set if f != k]  # every feature except k
            print(f"--remove the feature {k} ")
            accuracy = leave_one_out_cross_validation(data, features)
            print(f"----accuracy: {accuracy}")

            # Strict '>' keeps the lowest-numbered feature on ties.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                feature_to_remove = k

        # Drop the feature whose removal left the most accurate subset.
        current_set.remove(feature_to_remove)
        print(f"remove feature {feature_to_remove}")

        if best_accuracy > best_overall_accuracy:
            best_overall_accuracy = best_accuracy
            best_feature_set = current_set.copy()

    return best_feature_set, best_overall_accuracy


In [103]:
# Small dataset, forward selection.
# Load the text file, normalize the feature columns, then run the greedy
# forward search; the search prints its progress for every candidate feature.
file_path = 'CS205_small_Data__3.txt'
data = normalize(np.loadtxt(file_path))
best_feature_set, best_accuracy = forward_selection(data)
# Report the best feature subset found and its leave-one-out accuracy.
print(f'Best feature set: {best_feature_set}')
print(f'Best accuracy: {best_accuracy}')

Current Feature: []
--add the feature 1
----Accuracy: 0.676
--add the feature 2
----Accuracy: 0.668
--add the feature 3
----Accuracy: 0.702
--add the feature 4
----Accuracy: 0.63
--add the feature 5
----Accuracy: 0.668
--add the feature 6
----Accuracy: 0.742
--add the feature 7
----Accuracy: 0.644
--add the feature 8
----Accuracy: 0.626
--add the feature 9
----Accuracy: 0.704
--add the feature 10
----Accuracy: 0.664
--add the feature 11
----Accuracy: 0.652
--add the feature 12
----Accuracy: 0.808
Added feature 12
Current Feature: [12]
--add the feature 1
----Accuracy: 0.81
--add the feature 2
----Accuracy: 0.792
--add the feature 3
----Accuracy: 0.832
--add the feature 4
----Accuracy: 0.83
--add the feature 5
----Accuracy: 0.836
--add the feature 6
----Accuracy: 0.962
--add the feature 7
----Accuracy: 0.806
--add the feature 8
----Accuracy: 0.792
--add the feature 9
----Accuracy: 0.796
--add the feature 10
----Accuracy: 0.842
--add the feature 11
----Accuracy: 0.814
Added feature 6
Cur

In [105]:
# Small dataset, backward elimination.
# Load the text file, normalize the feature columns, then run the greedy
# backward search; the search prints its progress for every candidate feature.
# This cell rebinds `file_path`, `data`, `best_feature_set` and `best_accuracy`.
file_path = 'CS205_small_Data__3.txt'
data = normalize(np.loadtxt(file_path))
best_feature_set, best_accuracy = backward_elimination(data)
# Report the best feature subset found and its leave-one-out accuracy.
print(f'Best feature set: {best_feature_set}')
print(f'Best accuracy: {best_accuracy}')

Current feature: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
--remove the feature 1 
----accuracy: 0.746
--remove the feature 2 
----accuracy: 0.746
--remove the feature 3 
----accuracy: 0.742
--remove the feature 4 
----accuracy: 0.782
--remove the feature 5 
----accuracy: 0.736
--remove the feature 6 
----accuracy: 0.708
--remove the feature 7 
----accuracy: 0.78
--remove the feature 8 
----accuracy: 0.73
--remove the feature 9 
----accuracy: 0.748
--remove the feature 10 
----accuracy: 0.752
--remove the feature 11 
----accuracy: 0.758
--remove the feature 12 
----accuracy: 0.696
remove feature 4
Current feature: [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12]
--remove the feature 1 
----accuracy: 0.742
--remove the feature 2 
----accuracy: 0.77
--remove the feature 3 
----accuracy: 0.768
--remove the feature 5 
----accuracy: 0.752
--remove the feature 6 
----accuracy: 0.74
--remove the feature 7 
----accuracy: 0.776
--remove the feature 8 
----accuracy: 0.776
--remove the feature 9 
----accuracy:

In [85]:
# Large dataset, forward selection.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt,
# i.e. the saved run was stopped before the search finished.
file_path = 'CS205_large_Data__32.txt'
data = normalize(np.loadtxt(file_path))
best_feature_set, best_accuracy = forward_selection(data)
# Report the best feature subset found and its leave-one-out accuracy.
print(f'Best feature set: {best_feature_set}')
print(f'Best accuracy: {best_accuracy}')

Current Feature: []
--add the 1 feature


KeyboardInterrupt: 

In [None]:
# Large dataset, backward elimination.
# NOTE(review): this cell has no execution count (`In [None]`), so it appears
# not to have been run in the saved notebook and has no recorded output.
file_path = 'CS205_large_Data__32.txt'
data = normalize(np.loadtxt(file_path))
best_feature_set, best_accuracy = backward_elimination(data)
# Report the best feature subset found and its leave-one-out accuracy.
print(f'Best feature set: {best_feature_set}')
print(f'Best accuracy: {best_accuracy}')