In [None]:
import csv
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from kNNModel import kNNModel

# Read every row of the polynomial200 dataset (one x, y pair per row) as strings.
with open("./resources/datasets/polynomial200.csv", "r") as f:
    data = [row for row in csv.reader(f)]

# Split by position: the first 100 rows are the training split, the rest the
# test split. Each array keeps both columns (column 0 = x, column 1 = y).
X_train, X_test = (
    np.array(rows, dtype=float) for rows in (data[:100], data[100:])
)

# Build the k-NN model: features are passed as 2-D single-column slices,
# targets as 1-D vectors, for the training and the test split respectively.
kNN = kNNModel()
kNN.fit(
    X_train[:, [0]],
    X_train[:, 1],
    X_test[:, [0]],
    X_test[:, 1],
)
# "sd" selects standard-deviation based normalization in kNNModel.
kNN.normalize(type="sd")

## Exercise 2: k-NN Regression

The dataset `polynomial200.csv` contains 200 (x, y) samples generated from the function  
$y = f(x) = 5 + 12x - x^2 + 0.025x^3 + \text{normrnd}(0, 5)$

Create a program Polynomial that:  

1. Divides the dataset into a training set of size 100 and a test set of size 100.
2. Plots the training and test sets side by side in a 1 × 2 pattern.

In [None]:
# One row, two panels: training split on the left, test split on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
axs = axs.ravel()

# Each entry describes one panel: (title, x values, y values) as held by the model.
panels = (
    ("Training data", kNN.X_train, kNN.y_train),
    ("Testing data", kNN.X_test, kNN.y_test),
)
for ax, (title, xs, ys) in zip(axs, panels):
    ax.scatter(xs, ys, c="b", s=4)
    ax.set_title(title)

plt.show()

**3.** Display a 2 × 3 plot showing the k-NN regression result and the MSE training error for **k = 1, 3, 5, 7, 9, 11**. For example, the plot for **k = 5** might look something like this:
<center><img src="./resources/images/plot.jpg" width="600"/></center>

In [None]:
# Dense evaluation grid spanning the training x-range; the extra `step` on the
# upper bound is there so the maximum is not dropped by the half-open arange.
step = 0.00001
x_lo = kNN.X_train.min()
x_hi = kNN.X_train.max()
x_points = np.arange(x_lo, x_hi + step, step).reshape(-1, 1)

# Values of k to compare, plus one row of grid predictions per k
# (kept so that later cells can reuse them).
k_values = [1, 3, 5, 7, 9, 11]
k_labels = np.empty((len(k_values), len(x_points)), float)

# 2 x 3 grid of panels, flattened so panel i corresponds to k_values[i].
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.ravel()

for i, (k, ax) in enumerate(zip(k_values, axs)):
    # Regression curve for this k, evaluated on the dense grid.
    labels = kNN.regress(x_points, k)
    k_labels[i] = labels
    # test=False requests the training-side error from the model.
    mse = kNN.MSE(x_points, labels, test=False)

    # Training samples in blue, fitted curve in red.
    ax.scatter(kNN.X_train, kNN.y_train, c="b", s=4)
    ax.plot(x_points, labels, c="r")
    title = f"k = {k}, MSE = {round(mse, 2)}"
    ax.set_title(title)

plt.show()

**4.** Compute and present the MSE test error for **k = 1, 3, 5, 7, 9, 11**.

In [None]:
# Report the model's MSE (default mode of kNN.MSE) for every k evaluated above.
# NOTE(review): the predictions passed in were computed on the training-range
# grid `x_points`; confirm that kNNModel.MSE scores them against the test split.
print("MSE for testing data")
for k, grid_labels in zip(k_values, k_labels):
    mse = kNN.MSE(x_points, grid_labels)
    print(f"k = {k}, MSE = {round(mse, 2)}")

**5.** Which k gives the best regression? Motivate your answer!

In my opinion, k = 5 provides the best regression for the following reasons:

1) **Higher k values underfit**: The dataset contains numerous outliers that distort the MSE calculations for larger k values, resulting in an underfitted curve. This underfitting is particularly noticeable in the plots for k = 7, 9, and 11 on the training dataset, especially for the points in the top right corner.

2) **Smaller k values overfit**: The results for k = 1 and 3 are overfitted, which leads to higher MSE values for the testing dataset.

3) **Middle Ground**: k = 5 offers a balanced approach. It effectively adapts to the outliers in the dataset, providing the smallest MSE for the normalized version and the second smallest for the non-normalized version.

It's important to note that the optimal value for k may vary depending on the dataset and the specific context of the problem. In this particular case, k = 5 is the best choice, but this may not hold true for other datasets or problem contexts.