In [1]:
import numpy as np
import random as random
import math

In [2]:
path = 'ParisHousing.csv'
data = np.genfromtxt(path, delimiter=',')
print(type(data))
print(data.shape)

# convert the numpy array to a list
data = data.tolist()
print(type(data))
print(len(data))

# Take a small portion to work with
portion_size = int(0.1 * len(data))
selected_portion = random.sample(data, portion_size)
print(type(selected_portion))
print(len(selected_portion))

<class 'numpy.ndarray'>
(10000, 17)
<class 'list'>
10000
<class 'list'>
1000


In [3]:
# Data Preparation
train_set = []
val_set = []
test_set = []

def data_preparation(train_size, val_size, test_size):
    # Below method shuffles the whole dataset to remove bias
    random.shuffle(selected_portion)
    
    for s in selected_portion:
        # Num is used to take value between 0 and 1, '.random' method generates that
        # In this way, with every iteration of data, the data will be randomly set
        # upon the three lists
        num = random.random()
        if 0 <= num <= train_size:
            train_set.append(s)
        elif train_size < num <= (train_size + val_size):
            val_set.append(s)
        else:
            test_set.append(s)

In [4]:
# Prepare the data according to the question,
# Train = 65%, Validation = 10%, Test = 25%
data_preparation(0.65, 0.1, 0.25)

# Know the dataset sizes
print(len(train_set))
print(len(val_set))
print(len(test_set))

648
96
256


In [5]:
def euclidean_distance(v, t):
    distance = 0
    for x, y in zip(v, t):
        distance += (x - y) ** 2
    return math.sqrt(distance)

In [6]:
def sort_func(distance):
    return distance[1]

In [7]:
def knn_regression(k, train_data, test_data):
    error = 0
    distance = []
    
    for test in test_data:
        for train in train_data:
            # The Euclidean distance between two datapoints 'test' and 'train'
            # '[:-1]' excludes the last element from each dataset 
            # because it is the target column
            dist = euclidean_distance(test[:-1], train[:-1])
            distance.append([train, dist])
        # Sort the distance list
        distance.sort(key=sort_func)
        # Selects the first k nearest neighbors from the sorted distance list
        select = distance[:k]
        
        # Sum is the total neighbor value
        sum = 0
        for s in select:
            sum = sum + s[0][-1]
        # Below we calculate the predicted value by 
        # dividing the sum(total neighbor value) by k
        sum = sum / k
        # Error contains the sum of absolute differences
        error = error + abs(test[-1] - sum)
    # Finally Mean Absolute Error is calculated by 
    # dividing all the error value by test_data length
    return error / len(test_data)

In [8]:
print(knn_regression(1, train_set, test_set))
print(knn_regression(3, train_set, test_set))
print(knn_regression(5, train_set, test_set))
print(knn_regression(10, train_set, test_set))
print(knn_regression(15, train_set, test_set))

3663313.377343752
2726188.1170572895
2627227.3092968743
2633384.6526953126
2664343.1966666644
