In [None]:
import csv
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from methods import normalize_features, kNN_regression, calc_MSE

# Load the polynomial dataset; each non-empty CSV row is one (x, y) sample.
# newline="" is how the csv module expects files handed to csv.reader to be
# opened, and blank rows (e.g. a trailing empty line) are skipped so the
# float conversion below never sees a ragged list.
with open("./resources/datasets/polynomial200.csv", "r", newline="") as f:
    reader = csv.reader(f)
    data = [row for row in reader if row]

# define global variables:
# the first 100 rows are the training set, the remaining rows the test set
X_train = np.array(data[:100], dtype=float)
X_test = np.array(data[100:], dtype=float)

# Uncomment to normalize the features
# X_train = normalize_features(X_train)
# X_test = normalize_features(X_test)

# Column 0 is drawn on the horizontal axis and column 1 on the vertical axis
# in the plots that follow.
x1_train = X_train[:, 0]
x2_train = X_train[:, 1]

x1_test = X_test[:, 0]
x2_test = X_test[:, 1]

## Exercise 2: k-NN Regression

The dataset polynomial200.csv contains 200 (x, y) samples generated from the function  
$y = f(x) = 5 + 12x - x^2 + 0.025x^3 + \text{normrnd}(0, 5)$

Create a program Polynomial that:  

1. Divides the dataset into a training set of size 100 and a test set of size 100
2. Plots the training and test sets side-by-side in a 1 × 2 pattern

In [None]:
# Draw the training and test samples next to each other in a 1 x 2 figure.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
axs = axs.ravel()

# (panel title, x-values, y-values) for each subplot, left to right
panels = [
    ("Training data", x1_train, x2_train),
    ("Testing data", x1_test, x2_test),
]
for ax, (title, xs, ys) in zip(axs, panels):
    ax.scatter(xs, ys, c="b", s=4)
    ax.set_title(title)

plt.show()

**3.** Display a 2 × 3 plot showing the k-NN regression result and the MSE training error for **k = 1, 3, 5, 7, 9, 11**. For example, the plot for **k = 5** might look something like this.
<center><image src="./resources/images/plot.jpg" width="600"/></center>

In [None]:
# Evaluation grid: a dense sweep over the x-range of the training data on
# which every regression curve is computed and drawn.
step = 0.001
x_min, x_max = x1_train.min(), x1_train.max()
x_line = np.arange(x_min, x_max + step, step)

# One row of predicted y-values per k; kept so the test-error cell can reuse them.
k_values = [1, 3, 5, 7, 9, 11]
Y = np.empty((len(k_values), len(x_line)), dtype=float)

fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.ravel()

for i, k in enumerate(k_values):
    ax = axs[i]

    # regression curve for this k, evaluated on the grid
    y_line = kNN_regression(X_train, x_line, k)
    Y[i] = y_line

    # NOTE(review): calc_MSE is defined in methods.py and receives the training
    # samples plus the (x, y) points of the curve; how it pairs them is not
    # visible here -- confirm it matches each sample to the right grid point.
    X_hat = np.column_stack((x_line, y_line))
    mse = calc_MSE(X_train, X_hat)

    ax.scatter(x1_train, x2_train, c="b", s=4)
    ax.plot(x_line, y_line, c="r")
    ax.set_title(f"k = {k}, MSE = {round(mse, 2)}")
plt.show()

**4.** Compute and present the MSE test error for **k = 1, 3, 5, 7, 9, 11**.

In [None]:
# Report the test error of each curve computed in the previous cell.
# NOTE(review): the curves live on x_line, which spans the training x-range;
# confirm calc_MSE copes with test samples outside that range.
print("MSE for testing data")
for k, y_line in zip(k_values, Y):
    X_hat = np.column_stack((x_line, y_line))
    mse = calc_MSE(X_test, X_hat)
    print(f"k = {k}, MSE = {round(mse, 2)}")

**5.** Which k gives the best regression? Motivate your answer!

In my opinion, k = 5 provides the best regression for the following reasons:

1) **Larger k values underfit**: The dataset contains numerous outliers that distort the MSE calculations for larger k values, resulting in an underfitted curve. This underfitting is particularly noticeable in the training-set plots for k = 7, 9, and 11, especially for the points in the top right corner.

2) **Smaller k values overfit**: The curves for k = 1 and k = 3 are overfitted to the training data, which leads to higher MSE values on the testing dataset.

3) **Middle Ground**: k = 5 offers a balanced approach by providing the second smallest MSE, while also effectively adapting to the outliers in the dataset.

It's important to note that the optimal value of k may vary depending on the dataset and the specific context of the problem. In this particular case, k = 5 is the best choice, but this may not hold true for other datasets or problem contexts.
