# Code for figures in problem a)

In this notebook we experiment with minimising the OLS and Ridge cost functions for the Franke function, using different descent methods and hyperparameters.

In [None]:
# IMPORT
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))
from Code.descent_methods import *

# Import the gradient methods
from Code.utilities import train_test_split, MSELoss_method

# Import the gradient methods
from Code.utilities import OLS_train_analgrad, OLS_train_autograd, ridge_train_analgrad, ridge_train_autograd

# Registry of the descent methods compared in this notebook: every entry pairs
# a display name (used for legends and printouts) with the optimiser callable.
methods_dict_list = [
    {"name": label, "method": optimiser}
    for label, optimiser in (
        ("GD", GD),
        ("SGD", SGD),
        ("adagrad", SGD_adagrad),
        ("RMS prop", SGD_RMS_prop),
        ("adam", SGD_adam),
    )
]

# Import tools from the first project 
from Code.project1_tools import feature_matrix_2d, r2_sampling, scale_feature_matrix

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

import time as time

In [None]:
# Seed the global NumPy RNG before anything random happens, so that a fresh
# "Restart & Run All" reproduces the same initial beta.
# NOTE(review): r2_sampling and train_test_split presumably also draw from the
# global NumPy RNG and are then reproducible too — confirm in Code/.
np.random.seed(0)

# Initialise beta
num_features = 10

# Start with small values (uniform in [0, 0.1))
beta0 = {"b": np.random.random(num_features) * 0.1}

# Sample points and split in train and test
num_points = 100
split = 0.2
data = r2_sampling(num_points)
x, y, z = data["x"], data["y"], data["z"]

# Build the feature matrix and scale it; the returned means/variances are kept
# so the scaling can be inspected or reused later
X = feature_matrix_2d(x, y, num_features)
X, means, var = scale_feature_matrix(X)
X_train, y_train, X_test, y_test, test_index = train_test_split(X, z, split)

# Squeeze to avoid errors due to shape [100, 1]
y_train, y_test = np.squeeze(y_train), np.squeeze(y_test)

# Linear-in-parameters model used for the polynomial fit
def model(beta, X):
    """
    Predict targets as the dot product of the feature matrix and the coefficients.

    beta : dict holding the coefficient vector under the key "b"
    X    : feature matrix (presumably one row per sample)
    """
    coefficients = beta["b"]
    return jnp.dot(X, coefficients)

# Loss callable built from the model by MSELoss_method; it is passed as
# test_loss_func to the descent methods in the experiments below
loss_func = MSELoss_method(model)

### Comparing the 4 gradients

In [None]:
# Number of GD epochs. Use an even number: only the second half of training
# (from epoch n_epochs // 2 onwards) is plotted below.
n_epochs = 1000

# Initialise test parameters
lam = 0.1
lr = 0.05

# One line style / label per gradient implementation, in the order they are run
linestyles = ["r", "b--", "black", "g--"]
labels = ["OLS anal", "OLS auto", "Ridge anal", "Ridge auto"]
grad_methods = [OLS_train_analgrad(model), OLS_train_autograd(model),
                ridge_train_analgrad(model, lam), ridge_train_autograd(model, lam)]

fig, ax = plt.subplots()

# Run plain gradient descent once per gradient implementation and time it
for i, loss_grad in enumerate(grad_methods):

    # perf_counter is a monotonic high-resolution clock, better suited to
    # measuring elapsed time than the wall clock time.time()
    t0 = time.perf_counter()

    result_GD = GD(X_train, y_train, X_test, y_test,
                grad_method=loss_grad,
                n_epochs=n_epochs,
                lr=lr,
                beta0=beta0,
                test_loss_func=(MSELoss_method(model)))

    elapsed = time.perf_counter() - t0
    print(f"{labels[i]}: {elapsed:.3f} s")

    # Plot the second half of training. The x-axis is derived from the number
    # of recorded losses so it always matches the y-data in length.
    second_half_losses = result_GD["train_loss_list"][n_epochs // 2:]
    epoch_axis = np.arange(n_epochs // 2, n_epochs // 2 + len(second_half_losses))
    ax.plot(epoch_axis, second_half_losses, linestyles[i], label=labels[i])

ax.set_xlabel("epoch")
ax.set_ylabel("train loss")
ax.set_title("GD with analytic vs. automatic gradients")
ax.legend()
plt.show()

We see that the analytic and automatic gradients give the same results, but automatic differentiation takes about 10 times as long, so we use the analytic gradients in the following.

Note also that Ridge takes about twice as long as OLS when using automatic differentiation. This might be because the Ridge cost contains two terms of the form `jnp.sum(jnp.power(...))`.

### Testing momentum 

In [None]:
# ensure the same random numbers appear every time
np.random.seed(0)

# Fixed parameters
epochs = 100

# Values for experiment: rows are learning rates, columns are momentum values
lr_vals = np.logspace(-2, 1, 6)
gamma_vals = np.array([0.01, 0.05, 0.2, 0.5, 1])
results = np.zeros((len(lr_vals), len(gamma_vals)))

# We use the analytic OLS gradient
grad_method = OLS_train_analgrad(model)

# Perform algorithm for each (learning rate, momentum) pair
for i, lr in enumerate(lr_vals):
    for j, gamma in enumerate(gamma_vals):

        # Perform experiment
        result_GD = GD(X_train, y_train, X_test, y_test,
                    grad_method=grad_method,
                    n_epochs=epochs,
                    lr=lr,
                    beta0=beta0,
                    gamma=gamma,
                    test_loss_func=loss_func)

        # Store the final test loss of this run
        results[i, j] = result_GD["test_loss_list"][-1]


# Make plot
sns.set()
fig, ax = plt.subplots(figsize = (10, 10))
df = pd.DataFrame(results, index=lr_vals, columns=gamma_vals)
sns.heatmap(df, annot=True, ax=ax, cmap="viridis",
            xticklabels=df.columns.values.round(4),
            yticklabels=df.index.values.round(4))
ax.set_title("Final test MSE loss")
# Raw strings: "\e" and "\g" are not valid escape sequences in normal strings
ax.set_ylabel(r"learning rate $\eta$")
ax.set_xlabel(r"momentum parameter $\gamma$")

plt.show()

### Comparing the methods for different number of epochs

In [None]:
## We compare final train error to see "how far convergence has come"

# Use OLS analytic
loss_grad = OLS_train_analgrad(model)
lr = 0.01

# Max epoch number
n_epochs = 1000

def epoch_experiment(lr, n_epochs=n_epochs, loss_grad = loss_grad, methods_dict_list=methods_dict_list):
    """
    Train with every descent method and plot the training loss curves together.

    Uses the notebook-level X_train, y_train, X_test, y_test and beta0.

    lr                : learning rate passed to every method
    n_epochs          : number of epochs passed to every method
    loss_grad         : gradient function passed as grad_method
    methods_dict_list : list of {"name": str, "method": callable} entries

    Prints the final training loss of each method and shows one figure.
    """

    # For methods with n_batches, the default 5 is used!
    # NOTE(review): the stochastic methods presumably record one loss per
    # mini-batch, so every 5th entry is plotted to get one point per epoch —
    # confirm against Code.descent_methods.
    n_batches = 5

    fig, ax = plt.subplots()

    for method_dict in methods_dict_list:

        method_func = method_dict["method"]
        method_name = method_dict["name"]

        result = method_func(X_train, y_train, X_test, y_test,
                grad_method=loss_grad,
                n_epochs=n_epochs,
                lr=lr,
                beta0=beta0,
                test_loss_func=(MSELoss_method(model)))

        train_losses = result["train_loss_list"]

        # GD is full batch, so every entry is plotted; the others are subsampled
        stride = 1 if method_name == "GD" else n_batches
        ax.plot(train_losses[::stride], label=method_name)
        print(f"Final error for {method_name} = {train_losses[-1]}")

    ax.set_xlabel("epoch")
    ax.set_ylabel("train loss")
    ax.set_title(f"Training loss per method (lr = {lr})")
    ax.legend()
    plt.show()

# epoch_experiment(0.001)
epoch_experiment(0.01)
# epoch_experiment(0.1)

### Learning rate experiment

In [None]:
## We compare final train error to see "how far convergence has come"

# Use OLS analytic
loss_grad = OLS_train_analgrad(model)

# Notebook-level default learning rate (lr_experiment itself sweeps lr_values)
lr = 0.01

# Learning rates to sweep
lr_values = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]

def lr_experiment(lr_values, n_epochs=200, methods_dict_list=methods_dict_list, loss_grad=loss_grad):
    """
    Plot the final training loss of every descent method against the learning rate.

    Uses the notebook-level X_train, y_train, X_test, y_test and beta0.

    lr_values         : iterable of learning rates to try
    n_epochs          : number of epochs used for every single run
    methods_dict_list : list of {"name": str, "method": callable} entries
    loss_grad         : gradient function passed as grad_method

    Shows one figure with one curve per method.
    """

    fig, ax = plt.subplots()

    for method_dict in methods_dict_list:

        method_func = method_dict["method"]
        method_name = method_dict["name"]

        final_errors = []

        # One full training run per learning rate
        for lr_value in lr_values:

            result = method_func(X_train, y_train, X_test, y_test,
                    grad_method=loss_grad,
                    n_epochs=n_epochs,
                    lr=lr_value,
                    beta0=beta0,
                    test_loss_func=(MSELoss_method(model)))

            final_errors.append(result["train_loss_list"][-1])

        ax.plot(lr_values, final_errors, marker="o", label=method_name)

    # Log axes: the learning rates span orders of magnitude, and a diverging
    # run would otherwise flatten all other curves
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("learning rate")
    ax.set_ylabel("final train loss")
    ax.set_title(f"Final training loss after {n_epochs} epochs")
    ax.legend()
    plt.show()

lr_experiment(lr_values)

In [None]:
"""
### analysis of results for OLS and RIDGE as function of
- lr
- #mini batches
- #epochs
- algorithm

- lambda for ridge!
    - use seaborn ... to show results as function of lr and lam !
"""


In [None]:
def beta_init(layer_list):
    """
    Create randomly initialised weights and biases for a layered network.

    layer_list holds the layer widths, e.g. [2, 10, 1] for 2 inputs,
    10 hidden neurons and 1 output.

    Returns a dict with, for each layer i = 1, 2, ..., a weight matrix "W{i}"
    of shape (layer_list[i - 1], layer_list[i]) and a bias vector "b{i}" of
    length layer_list[i], all drawn with np.random.random.
    """
    params = {}

    # Walk over consecutive (input width, output width) pairs, numbered from 1
    width_pairs = zip(layer_list[:-1], layer_list[1:])
    for layer, (n_in, n_out) in enumerate(width_pairs, start=1):
        params[f"W{layer}"] = np.random.random((n_in, n_out))
        params[f"b{layer}"] = np.random.random(n_out)

    return params

def make_neural_network_string(beta):
    """
    Return Python source code (as a string) that defines neural_network_model.

    beta is currently not inspected: the generated function reads the number of
    layers from the beta it is called with (half the number of its keys).

    The returned source is dedented so that it can be passed to exec() or
    compile(). Executing it needs sigmoid and jnp in scope, because the
    generated function uses them.
    """
    # Local import: textwrap is not among the notebook's top-level imports
    import textwrap

    # Begin definition: signature and forward propagation through hidden layers
    function_defining_string = """
    def neural_network_model(beta, X, activation=sigmoid, output_activation = (lambda x: x)):

        out = X.copy()

        # For each remaining layer we propagate forward
        for i in range(1, len(beta.keys()) // 2):  # for each layer

            # Dot with weights, add biases, apply activation function
            out = activation(jnp.add(jnp.dot(out, beta[f"W{i}"]), beta[f"b{i}"]))
    """

    # Add output layer computation
    function_defining_string += """
        out_final = output_activation(jnp.add(
            jnp.dot(out, beta[f"W{len(beta.keys())//2}"]), beta[f"b{len(beta.keys())//2}"]
        ))

        return out_final
    """

    # Strip the common indentation so the "def" starts at column 0
    return textwrap.dedent(function_defining_string)

def neural_network_model(beta, X, activation=sigmoid, output_activation = (lambda x: x)):
    """
    Evaluate the feed-forward network described by beta on feature matrix X.

    beta              : dict with weights "W1", "W2", ... and biases "b1", "b2", ...
    X                 : input feature matrix
    activation        : function applied after every layer except the last
    output_activation : function applied after the last layer (identity by default)
    """
    # Each layer contributes one weight and one bias entry
    n_layers = len(beta.keys()) // 2

    def affine(values, layer):
        # Dot with the layer's weights, then add its biases
        return jnp.add(jnp.dot(values, beta[f"W{layer}"]), beta[f"b{layer}"])

    # The input acts as the activation of "layer 0"
    hidden = X.copy()

    # Propagate through every layer except the last one
    for layer in range(1, n_layers):
        hidden = activation(affine(hidden, layer))

    # The last layer uses the output activation instead
    return output_activation(affine(hidden, n_layers))

# Example: initialise the parameters for layer widths [2, 3, 3, 1] and inspect them
beta = beta_init([2, 3, 3, 1])
print(beta)

In [None]:
# NOTE(review): scratch check that exec() runs source code given as a string;
# looks like leftover experimentation for make_neural_network_string — confirm
# whether this cell is still needed
exec("print('hello')")

### Seaborn Ridge
Ridge as function of learning rate and lambda

Use seaborn to show the results as a function of lr and lam!


In [None]:
# ensure the same random numbers appear every time
np.random.seed(0)

# Fixed parameters
epochs = 10

# Values for experiment: rows are learning rates, columns are ridge penalties
lr_vals = np.logspace(-3, 1, 6)
lam_vals = np.logspace(-10, 10, 6)
results = np.zeros((len(lr_vals), len(lam_vals)))

# We use the analytic ridge gradient
train_grad = ridge_train_analgrad

# Perform algorithm for each (learning rate, lambda) pair
for i, lr in enumerate(lr_vals):
    for j, lam in enumerate(lam_vals):

        # Get gradient function for given lambda
        grad_method = train_grad(model, lam)

        # Perform experiment
        result_GD = SGD_adam(X_train, y_train, X_test, y_test,
                    grad_method=grad_method,
                    n_epochs=epochs,
                    n_batches=5,
                    lr=lr,
                    beta0=beta0,
                    test_loss_func=loss_func)

        # Store the final test loss of this run
        results[i, j] = result_GD["test_loss_list"][-1]


# Make plot
sns.set()
fig, ax = plt.subplots(figsize = (10, 10))
df = pd.DataFrame(results, index=lr_vals, columns=lam_vals)
# lambda spans 1e-10 ... 1e10, so rounding to 4 decimals would label the
# smallest values as 0.0; use scientific notation for those ticks instead
sns.heatmap(df, annot=True, ax=ax, cmap="viridis",
            xticklabels=[f"{value:.0e}" for value in df.columns.values],
            yticklabels=df.index.values.round(4))
ax.set_title("Final test MSE loss")
# Raw strings: "\e" and "\l" are not valid escape sequences in normal strings
ax.set_ylabel(r"learning rate $\eta$")
ax.set_xlabel(r"$\lambda$")

plt.show()