<a href="https://colab.research.google.com/github/Pauwels-Xander/DeepLearning/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Exercise 1 link: https://pdf.ac/14Itnp

def generate_sin(x, epsilon):
    """Return noisy samples of sin(x).

    Gaussian noise with mean 0 and standard deviation ``epsilon`` is drawn
    independently for every element of ``x`` and added to ``np.sin(x)``.

    Args:
        x: Array-like of sample locations; any shape is accepted.
        epsilon: Standard deviation of the additive noise (0 gives exact sin(x)).

    Returns:
        ``np.sin(x)`` plus noise, with the same shape as ``x``.
    """
    # np.shape (rather than len) also supports multi-dimensional inputs; for
    # 1-D input it requests exactly the same number of draws as before.
    noise = np.random.normal(loc=0.0, scale=epsilon, size=np.shape(x))
    return np.sin(x) + noise

# Experiment settings: number of sample points and noise standard deviation.
n_samples, epsilon = 20, 0.1

# Noisy observations of sin(x) on an evenly spaced grid over [0, pi/4].
x = np.linspace(0, np.pi/4, n_samples)
y = generate_sin(x, epsilon)

# Noise-free reference curve, evaluated on the same grid as the samples.
x_pure = x
y_pure = np.sin(x_pure)

# Show the noisy samples against the true function.
plt.scatter(x, y, label="Sampled data")
plt.plot(x_pure, y_pure, label="true")
plt.grid()
plt.legend()
plt.show()

# -------------- Q2.2 -------------- #
def compute_sos(y_true, y_pred):
    """Return the sum of squared differences between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    return np.sum(residuals ** 2)

def lin_reg(xs, ys):
    """Fit the line ``y = a + b*x`` to the points with the closed-form least-squares formula.

    Args:
        xs: Sample locations.
        ys: Observed values at ``xs``.

    Returns:
        Tuple ``(a, b, res)``: intercept, slope, and the sum of squared
        residuals of the fitted line on the given points.
    """
    count = len(xs)

    mean_x = 1/count * np.sum(xs)
    mean_y = 1/count * np.sum(ys)

    # Slope = covariance-like sum over variance-like sum of the centred xs.
    centred_x = xs - mean_x
    slope = np.sum(centred_x * (ys - mean_y)) / np.sum(centred_x ** 2)
    intercept = mean_y - slope*mean_x

    fitted = intercept + slope*xs
    return intercept, slope, compute_sos(ys, fitted)

# Fit the closed-form regression (a = intercept, b = slope) and evaluate
# the fitted line on the sample grid.
a, b, res = lin_reg(x, y)
y_best = a + b*x


# Compare the noisy samples, the true function and the fitted line.
plt.scatter(x, y, label="Noisy Data", color="red")
plt.plot(x_pure, y_pure, label="True Function (sin x)", color="blue", linestyle="dashed")
plt.plot(x, y_best, label="Line of Best Fit", color="green")

plt.grid()
plt.legend()
plt.show()

# Report the fitted parameters and the sum of squared residuals.
print("Coefficients:", (a, b))
print("Residuals:   ", res)


# -------------- Q2.3 -------------- #
# np.polyfit returns coefficients from the highest power down, so a
# degree-1 fit unpacks as (slope, intercept).
b_np, a_np = np.polyfit(x, y, deg=1)
y_best_np = b_np * x + a_np
res_np = compute_sos(y, y_best_np)

# Overlay the manual fit and the polyfit result on the data and true function.
plt.scatter(x, y, label="Noisy Data", color="red")
plt.plot(x_pure, y_pure, label="True Function (sin x)", color="blue", linestyle="dashed")
plt.plot(x, y_best, label="Manual Linear Regression", color="green")
plt.plot(x, y_best_np, label="NumPy Polyfit Regression", color="purple", linestyle="dotted")
plt.grid()
plt.legend()
plt.show()

# Side-by-side summary of both fits.
print("Coefficients:     ", (a, b))
print("Residuals:        ", res)
print("Coefficients (np):", (a_np, b_np))
print("Residuals (np):   ", res_np)
# np.polyfit provides a fast and convenient way to compute a linear regression
# without implementing the formula manually; both approaches give the same output.

# -------------- Q3 -------------- #
def compute_mse(y_true, y_pred, a=None, b=None):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Model predictions, one per entry of ``y_true`` (array-like).
        a: Unused. Kept, now optional, so existing
            ``compute_mse(y_true, y_pred, a, b)`` calls keep working.
        b: Unused. See ``a``.

    Returns:
        The sum of squared differences divided by ``len(y_true)``.
    """
    # np.asarray lets plain lists be subtracted element-wise as well.
    diff = np.asarray(y_true) - np.asarray(y_pred)
    return 1/len(y_true) * np.sum(diff**2)

# Test set: same x grid as the training data, with a fresh noise draw.
x_test = np.linspace(0, np.pi/4, n_samples)
# Noise is sized from x_test itself (previously len(x), which only matched
# because both grids happen to use n_samples points).
y_test = generate_sin(x_test, epsilon)

# Predictions of the fitted line. lin_reg returns a = intercept, b = slope,
# so the model is y = a + b*x (the earlier plot used a*x + b by mistake).
y_pred_train = a + b * x
y_pred_test = a + b * x_test

# Plot both data sets and the fitted line on one explicitly sized figure.
# (Scatter calls issued before plt.figure() would land on a separate,
# legend-less figure, so everything is drawn after the figure is created.)
plt.figure(figsize=(8, 6))
plt.scatter(x, y, label="Train set", color='blue', alpha=0.6)
plt.scatter(x_test, y_test, label="Test set", color='red', alpha=0.6)
plt.plot(x, y_pred_train, label=f'Best Fit Line: y = {b:.2f}x + {a:.2f}', color='black')
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Linear Regression Fit with Training and Test Sets")
plt.legend()
plt.grid()
plt.show()


# MSE must compare observed y values with the model's predictions
# (previously x was passed as y_true and y as y_pred).
test_loss = compute_mse(y_test, y_pred_test, a, b)
train_loss = compute_mse(y, y_pred_train, a, b)

print("Coefficients: ", (a, b))
print("Train loss:   ", train_loss)
print("Test loss:    ", test_loss)

# The training loss is typically lower than the test loss because the model is
# optimized to fit the training data. With this few noisy points per set the
# test loss can still come out lower by chance, e.g. when the test set is too
# small or not representative.
# NOTE(review): the values recorded here earlier (0.008 train vs 0.001 test, i.e.
# test loss lower than train loss) were produced by a call that compared x with y
# rather than y with the fitted line; re-run this cell and update the numbers
# and the conclusion.

# -------------- Q4 -------------- #
# Observation 1: Increase in noise increases train loss significantly
# Observation 2: Increase in sample size decreases test loss

# -------------- Q5.1 -------------- #
# New experiment: noisier data, with train and test covering different x ranges.
n_samples = 20
epsilon = 0.2
# NOTE(review): x is reassigned here but not read again in the visible code.
x = np.linspace(0, 2* np.pi, n_samples)

# Training inputs span [0, pi/4]; test inputs span [pi/4, pi/2].
x_train = np.linspace(0, np.pi / 4, n_samples)
x_test = np.linspace(np.pi / 4, np.pi / 2, n_samples)

# Training noise is drawn before test noise.
y_train = generate_sin(x_train, epsilon)
y_test = generate_sin(x_test, epsilon)

# Dense noise-free curve over the union of both ranges, for visualization.
x_pure = np.linspace(0, np.pi / 2, 100)
y_pure = np.sin(x_pure)

# Scatter both sets and overlay the true function.
plt.scatter(x_train, y_train, label="Train Set", color="blue", alpha=0.6)
plt.scatter(x_test, y_test, label="Test Set", color="red", alpha=0.6)
plt.plot(x_pure, y_pure, label="True f(x): sin(x)", color="black", linestyle="dashed")

plt.title("Training and Test Sets with True Function")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.grid()
plt.show()

# -------------- Q5.2 -------------- #
# Polynomial degrees to try, and per-degree results collected in parallel lists.
degrees = range(1, 20)
train_losses, test_losses, coefficients = [], [], []

for d in degrees:
    # Least-squares polynomial of degree d, fitted on the training set only.
    poly_coeffs = np.polyfit(x_train, y_train, d)

    # Evaluate the fitted polynomial on both sets.
    y_pred_train = np.polyval(poly_coeffs, x_train)
    y_pred_test = np.polyval(poly_coeffs, x_test)

    # MSE on each set; the trailing 0, 0 only fill compute_mse's a and b parameters.
    train_loss = compute_mse(y_train, y_pred_train, 0, 0)
    test_loss = compute_mse(y_test, y_pred_test, 0, 0)

    coefficients.append(poly_coeffs)
    train_losses.append(train_loss)
    test_losses.append(test_loss)

# One row per degree: losses and the fitted coefficient vector.
poly_columns = {
    "Degree": degrees,
    "Train Loss": train_losses,
    "Test Loss": test_losses,
    "Coefficients": coefficients,
}
df_poly_results = pd.DataFrame(poly_columns)
print(df_poly_results)

# For low-degree polynomials, the test loss is moderate, and train loss is relatively low.
# For higher-degree polynomials the test loss starts increasing significantly,
# which suggests overfitting—the model fits the training data too well but generalizes poorly.

# -------------- Q5.3 -------------- #
# Aliases for the per-degree losses computed in the polynomial sweep.
train_losses_actual = train_losses
test_losses_actual = test_losses


plt.figure(figsize=(10, 5))

# Two side-by-side panels, one titled for overfitting and one for underfitting.
# NOTE(review): both panels draw the same two curves; only the titles differ.
panel_titles = (
    "Overfitting: Large gap between training and validation loss",
    "Underfitting: High loss in both training and validation",
)
for panel_index, panel_title in enumerate(panel_titles, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(degrees, train_losses_actual, label='Training Loss', color="blue", marker="o")
    plt.plot(degrees, test_losses_actual, label='Validation Loss', color="red", marker="s")
    plt.title(panel_title)
    plt.xlabel("Polynomial Degree")
    plt.ylabel("Loss (MSE)")
    plt.legend()
    plt.grid()

plt.show()

# Single figure: train and test MSE as a function of polynomial degree.
plt.figure(figsize=(8, 6))

plt.plot(degrees, train_losses_actual, label="Training Loss", marker='o', linestyle='-', color='blue')
plt.plot(degrees, test_losses_actual, label="Test Loss", marker='s', linestyle='-', color='red')

plt.title("MSE vs. Model Flexibility")
plt.xlabel("Polynomial Degree (Model Flexibility)")
plt.ylabel("Mean Squared Error (MSE)")
plt.legend()
plt.grid()
plt.show()


# -------------- Q6 -------------- #
# a) Where do the polynomial models start to overfit? How can you tell?
# It is hard to see on the plots because the error grows exponentially, so the onset of overfitting cannot be read off the plots directly.
# Looking at the values manually, overfitting probably starts around d=5, where the MSE becomes very large.

# b) What model fits the best to the training data? What model fits the best to the underlying function (visually, according to your intuition about the plots)? Are they the same?
# The high-degree model fits the training data best. But a moderate-degree model captures the underlying function better because it doesn't overextend itself
# to account for noise, so the two are not the same.

# c) Try increasing the number of training points to 200. At what point does overfitting start now?
# around d=10

# d) In general, does increasing the number of training points always allow for more flexible models of the real world? Why/why not?
# Increasing the number of training points can help models generalize better and delay overfitting, but it doesn’t always allow for
# more flexibility. If the true function is simple or the data is noisy, adding more samples won’t improve model performance and
# may lead to diminishing returns.
