In [1]:
# import libraries
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline            

# 2 a)

### Function to create the feature matrix and label vector
The function `get_vectors` takes a filename as input and returns a matrix `X` containing the feature vectors and a column vector `y` containing the target labels.

In [53]:
def get_vectors(filename):
    """Read a training file and return its feature matrix and label column.

    Every non-blank line is expected to look like
    ``(( f1, f2, ..., fn ), label )``.  Blank / whitespace-only lines
    (for example a trailing empty line at the end of the file) are skipped.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    X : numpy.ndarray of float, 2-D
        One row per datapoint, one column per feature.
    y : numpy.ndarray of str, 2-D
        Column vector holding the label of each datapoint.

    Raises
    ------
    ValueError
        Raised by the float conversion if a feature token is not a number
        or if the lines do not all have the same number of features.

    Notes
    -----
    If the file cannot be opened, a message is printed and ``sys.exit()``
    is called.
    """
    try:
        f = open(filename, 'r')
    except OSError:
        print(f'{filename} could not be opened.\n')
        sys.exit()

    # one entry per datapoint: list of feature tokens / label string
    features = []
    labels = []

    with f:
        for raw_line in f:
            # drop surrounding whitespace, then the outer parentheses
            record = raw_line.strip().strip('( )')

            # a blank line carries no datapoint; converting it would fail
            if not record:
                continue

            # '), ' separates the feature tuple from the label
            parts = record.split('), ')
            labels.append(parts[-1])
            features.append(parts[0].split(', '))

    # feature tokens are converted to floats here
    X = np.array(features, dtype = float, ndmin = 2)

    # labels become a (1, n) row that is transposed into a column
    y = np.array(labels, dtype = str, ndmin = 2)

    return X, y.transpose()

### K Nearest Neighbors

In [79]:
# calculates euclidean distance between training datapoints and test data point
def get_euclidean_distance(X_train, p):
    """Return the Euclidean distance from each row of ``X_train`` to ``p``.

    Parameters
    ----------
    X_train : array-like, shape (n, f_n)
        One datapoint per row, one feature per column.
    p : array-like with f_n elements
        The query point.

    Returns
    -------
    numpy.ndarray of float, shape (n, 1)
        Distance of every training datapoint to ``p``, as a column so it
        can be concatenated with a column of labels.

    Raises
    ------
    ValueError
        If ``X_train`` is not 2-D or ``p`` does not have exactly one value
        per feature column.
    """
    X_train = np.asarray(X_train, dtype = float)
    if X_train.ndim != 2:
        raise ValueError(
            f'X_train must be 2-D (datapoints x features), got {X_train.ndim}-D')

    # flatten so that both (f_n,) and (1, f_n) query points are accepted
    p = np.asarray(p, dtype = float).ravel()
    f_n = X_train.shape[1]
    if p.size != f_n:
        raise ValueError(
            f'p has {p.size} features but X_train has {f_n}')

    # broadcasting subtracts p from every row at once
    sum_of_squared_diff = ((X_train - p) ** 2).sum(axis = 1, keepdims = True)

    return np.sqrt(sum_of_squared_diff)

# returns distance and labels of 'k' nearest neighbors of 'p' wrt to training data
def get_k_nearest_neighbors(X_train, y_train, p, k = 5):
    """Return distance and label of the ``k`` training points closest to ``p``.

    Parameters
    ----------
    X_train : array-like, shape (n, f_n)
        Training feature matrix.
    y_train : array-like with n elements
        Training labels; an (n, 1) column or a flat sequence.
    p : array-like with f_n elements
        The query point.
    k : int, optional
        Number of neighbours to return (default 5).  Fewer rows are
        returned when the training set has fewer than ``k`` datapoints.

    Returns
    -------
    numpy.ndarray, shape (min(k, n), 2)
        Column 0 holds the distance, column 1 the label, ordered by
        increasing distance.  Because the two columns are concatenated the
        array has a string dtype when the labels are strings.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of datapoints.
    """
    # numeric (n, 1) column of distances to p
    distances = np.asarray(get_euclidean_distance(X_train, p), dtype = float).reshape(-1, 1)
    labels = np.asarray(y_train).reshape(-1, 1)

    if labels.shape[0] != distances.shape[0]:
        raise ValueError(
            f'got {labels.shape[0]} labels for {distances.shape[0]} datapoints')

    # Sort while the distances are still floats.  Concatenating with string
    # labels first turns them into text, and text sorts lexicographically
    # ('10.0' < '9.0'), which would pick the wrong neighbours.
    nearest = np.argsort(distances[:, 0], kind = 'stable')[0:k]

    # pair each of the k smallest distances with its label
    return np.concatenate((distances[nearest], labels[nearest]), axis = 1)

### Provide training filename & prediction data point
The file must contain one datapoint per line in the format `(( height, diameter, weight, hue ), label )`, which matches the format provided for the assignment.

In [80]:
# --- Interactive alternative (disabled) ---------------------------------
# Uncomment the lines below to read the training file name and the query
# point from the user instead of using the hard-coded values further down.
#
# filename = str(input('enter file containing training data: '))
#
# h_p = float(input('Enter height: '))
# d_p = float(input('Enter diameter: '))
# w_p = float(input('Enter weight: '))
# c_p = float(input('Enter hue/color: '))
#
# p = np.array([h_p, d_p, w_p, c_p])
# -------------------------------------------------------------------------

# training file and query point (height, diameter, weight, hue)
filename = '2_a_train.txt'
p = np.array(
    [0.1267104769925, 0.068040454192177, 0.20859882666808, 3.9587910256346],
    dtype = float,
)

# build the feature matrix and the label column from the training file
X_train, y_train = get_vectors(filename)

# last expression of the cell: the 5 nearest neighbours of p
get_k_nearest_neighbors(X_train, y_train, p, k = 5)

array([['0.22991008654808265', 'Ceramic'],
       ['0.40132508774163955', 'Metal'],
       ['0.46414161830789835', 'Plastic'],
       ['0.49305207385401356', 'Plastic'],
       ['0.5236055005559364', 'Plastic']], dtype='<U32')

### Provide Single Datapoint for Prediction

In [16]:
# Sanity check: shape of the distance array for the query point p.
# NOTE(review): get_euclidean_distance as defined in this notebook returns an
# (n, 1) column, so the recorded output below looks like it comes from an
# earlier version of the function -- re-run to confirm.
get_euclidean_distance(X_train, p).shape

(12,)

In [20]:
# Sanity check: shape of the label array produced by get_vectors.
# NOTE(review): get_vectors returns the labels as an (n, 1) column, so the
# recorded output below appears stale -- re-run to confirm.
y_train.shape

(12,)

In [47]:
# Sanity check: slicing one feature column with [:, 0] gives a 1-D array.
X_train[:,0].shape

(12,)