In [1]:
import numpy as np
import time

In [2]:
# Load the training and test sets from CSV into 2D float arrays.
# Later cells treat the last column as the target and the rest as features.
train_data, test_data = (
    np.genfromtxt(csv_path, delimiter=",")
    for csv_path in ('Regression/trainingData.csv', 'Regression/testData.csv')
)

In [3]:
def calculate_distances(training_features, query_features):
    '''Return the euclidean distance from one query instance to every training instance.

    training_features is indexed as (instance, feature): the squared differences
    are summed along axis 1, giving one distance per training row.
    query_features is broadcast against every row of training_features.
    '''
    # Per-feature difference between the query and each training row
    deltas = query_features - training_features
    squared_norms = np.sum(np.square(deltas), axis=1)
    return np.sqrt(squared_norms)

In [10]:
def predict(training_features, query_features, training_targets=None, k=3):
    '''Predict the target of one query instance with inverse-distance-weighted KNN.

    Steps:
        1) Call calculate_distances to get the euclidean distance between the
           query instance and every training instance.
        2) Select the k nearest training instances.
        3) Return the average of their target values, each weighted by
           (1/distance).

    Parameters:
        training_features: 2D numpy array, one row per training instance.
        query_features: 1D numpy array, the features of a single query row.
        training_targets: optional 1D array-like holding the target value of
            each row of training_features. When omitted, the last column of
            the notebook-level train_data array is used, which is what this
            function always did before the parameter existed.
        k: number of nearest neighbours to average over (default 3). If k
            exceeds the number of training instances, all of them are used.

    Returns:
        The predicted target value for the query instance.

    Raises:
        ValueError: if k is smaller than 1, or if an explicitly supplied
            training_targets does not have one entry per training instance.
    '''
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    distances = calculate_distances(training_features, query_features)

    if training_targets is None:
        # Backward-compatible fallback to the global loaded earlier in the notebook
        training_targets = train_data[:, -1]
    else:
        training_targets = np.asarray(training_targets)
        if len(training_targets) != len(distances):
            raise ValueError(
                "training_targets has {} entries but there are {} training instances".format(
                    len(training_targets), len(distances)))

    # np.argsort gives the indices that would sort the distances ascending,
    # so the first k entries are the k nearest neighbours
    nearest = np.argsort(distances)[:k]
    nearest_distances = distances[nearest]
    nearest_targets = training_targets[nearest]

    # A neighbour at distance 0 would get an infinite weight (inf/inf -> NaN).
    # Such a neighbour matches the query exactly, so use the exact matches alone.
    exact_match = nearest_distances == 0
    if np.any(exact_match):
        return np.mean(nearest_targets[exact_match])

    # Inverse distance weighted average of the k nearest neighbours.
    # (Alternatives tried in this notebook: a plain unweighted mean, and
    # weights of 1/distance**2.)
    weights = 1 / nearest_distances
    predicted_value = np.sum(nearest_targets * weights) / np.sum(weights)

    return predicted_value

In [11]:
def calculate_r2(actual_target_values, predicted_target_values):
    '''Return the R squared score of the predictions.

    R squared is 1 - (sum of squared residuals / total sum of squares), where
    the total sum of squares is taken around the mean of the actual values.

    Parameters:
        actual_target_values: array-like of true target values.
        predicted_target_values: array-like of predicted target values.

    Returns:
        The R squared score. Special cases:
          - empty input: NaN, since the score is undefined;
          - constant actual values (total sum of squares is 0): 1.0 if the
            predictions are perfect, otherwise 0.0, rather than NaN or -inf.
    '''
    # Accept plain lists as well as numpy arrays; floats avoid integer overflow
    actual = np.asarray(actual_target_values, dtype=float)
    predicted = np.asarray(predicted_target_values, dtype=float)

    if actual.size == 0:
        return float('nan')

    sum_squared_residuals = np.sum(np.square(predicted - actual))
    sum_squares = np.sum(np.square(np.mean(actual) - actual))

    if sum_squares == 0:
        # Dividing by zero would give NaN (0/0) or -inf; return a finite score instead
        return 1.0 if sum_squared_residuals == 0 else 0.0

    r2 = 1 - (sum_squared_residuals / sum_squares)
    return r2

In [12]:
# Time the whole evaluation (prediction + scoring) using the process CPU clock
start_time = time.process_time()

# Collect one prediction per test row; the last column of each row is the
# target, so it is excluded from the features passed to predict
predictions = []
for i, query_row in enumerate(test_data):
    predicted_value = predict(train_data[:,0:-1], query_row[0:-1])
    predictions.append(predicted_value)

# Score the predictions against the true targets in the last test column
r2_score = calculate_r2(test_data[:,-1], np.array(predictions))

end_time = time.process_time()

print("R2 Score is: ", r2_score)
print("KNN Regression completed in {} seconds".format(end_time - start_time))

R2 Score is:  0.8185732982178427
KNN Regression completed in 2.953125 seconds
