# Regularization of ANN weights


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import time
import jax.numpy as jnp
import jax

Load the [Auto MPG dataset](https://archive.ics.uci.edu/ml/datasets/auto+mpg).

In [None]:
# Source of the raw Auto MPG data file (UCI Machine Learning Repository).
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Labels assigned to the parsed columns, in file order.
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

data = pd.read_csv(
    url,
    names=column_names,
    na_values='?',          # '?' entries are parsed as missing values (NaN)
    comment='\t',           # ignore everything from the first tab to the end of the line
                            # (presumably a trailing text field - confirm against the raw file)
    sep=' ',                # fields are separated by spaces...
    skipinitialspace=True,  # ...and runs of spaces after a delimiter are skipped
)

# Last expression of the cell: the notebook renders the DataFrame.
data

Check if there are missing entries in the dataset.

In [None]:
print(data.isna().sum())

Remove records with missing entries.

In [None]:
data = data.dropna()
print(data.isna().sum())

## Data inspection

Display some basic information.

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

We are interested in predicting the field `MPG`, which measures [fuel efficiency](https://en.wikipedia.org/wiki/Fuel_efficiency) expressed in miles per gallon (MPG). For reference, 1 MPG ≈ 0.425 km/L when using US gallons (0.354006 km/L when using imperial gallons). Plot its distribution.

In [None]:
sns.displot(data['MPG'], kde = True)

Look for linear correlations among data.

In [None]:
data.corr()

In [None]:
sns.heatmap(data.corr(), annot = True, cmap = 'vlag_r', vmin = -1, vmax = 1)

In [None]:
sns.pairplot(data, diag_kind='kde')

## Data normalization

Apply an affine transformation to the data, so that each feature has zero mean and unit standard deviation.

In [None]:
# Per-column statistics of the cleaned DataFrame (pandas `std` uses the
# sample standard deviation, ddof=1, by default).
data_mean = data.mean()
data_std = data.std()
# Exercise placeholder: replace `...` with the affine transformation of `data`
# that gives every column zero mean and unit standard deviation. Later cells
# call DataFrame methods on `data_normalized`, so it must be a DataFrame.
data_normalized = ...

In [None]:
_, ax = plt.subplots(figsize=(16,6))
sns.violinplot(data = data_normalized, ax = ax)

## Train-validation split

Shuffle the data using the [np.random.shuffle](https://numpy.org/doc/stable/reference/random/generated/numpy.random.shuffle.html) function and split the data as follows:
- put 80% in the train dataset
- put 20% in the validation dataset

In [None]:
# Work on an explicit copy: `to_numpy()` may return a view of the DataFrame's
# data (and a read-only one when pandas copy-on-write is active), so shuffling
# it in place could silently scramble `data_normalized` or fail outright.
data_normalized_np = data_normalized.to_numpy(copy=True)

# Fixed seed so that the row permutation (and hence the split) is reproducible.
np.random.seed(0)
np.random.shuffle(data_normalized_np)  # shuffles rows in place

# The first (1 - fraction_validation) of the shuffled rows form the train set,
# the remaining rows form the validation set.
fraction_validation = 0.2
num_train = int(data_normalized_np.shape[0] * (1 - fraction_validation))

# Column 0 is the target (MPG, given the column order used when loading the
# data); the `:1` slice keeps it 2-D. The remaining columns are the inputs.
x_train = data_normalized_np[:num_train,1:]
y_train = data_normalized_np[:num_train,:1]
x_valid = data_normalized_np[num_train:,1:]
y_valid = data_normalized_np[num_train:,:1]

print('train set size     : %d' % x_train.shape[0])
print('validation set size: %d' % x_valid.shape[0])

## ANN setup

Write a function `params = initialize_params(layers_size)` that initializes the parameters, given the ANN architecture.
Initialize biases with zero values, and weights with a [Glorot Normal](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) initialization, i.e. sampling from a Gaussian distribution with zero mean and with standard deviation 
$$
\sqrt{\frac{2}{n + m}},
$$
where $n$ and $m$ are the number of input and output neurons of the corresponding weights matrix.


In [None]:
def initialize_params(layers_size):
  """Build the initial ANN parameters for the architecture `layers_size`.

  Exercise stub: the `...` below is a placeholder, so as written the function
  seeds NumPy's global random generator and returns an empty list.

  According to the accompanying notebook text, the completed version should
  fill `params` with zero biases and Glorot-normal weights (zero-mean Gaussian
  with standard deviation sqrt(2 / (n + m)), n and m being the number of input
  and output neurons of each weights matrix).

  layers_size: sequence with the number of neurons of each layer, input and
    output layers included (e.g. [7, 10, 1]).
  Returns: the list `params`.
  """
  np.random.seed(0) # for reproducibility
  params = list()
  # TODO(exercise): append the weights and biases of every layer to `params`.
  ...
  return params

Implement a generic feedforward ANN with a function `y = ANN(x, params)`, using $ReLU$ as activation function.

In [None]:
# ReLU activation: elementwise max(0, x).
activation = lambda x: jnp.maximum(0.0, x)

def ANN(x, params):
  """Evaluate the feedforward network on the input `x`.

  Exercise stub: the body is a placeholder, so the function currently returns
  None. According to the accompanying notebook text, the completed version
  should implement a generic feedforward ANN using `activation` (ReLU).

  x: network input; the smoke test below passes a slice of `x_train`
    (one sample per row).
  params: network parameters as produced by `initialize_params`.
  """
  ...


# Smoke test: a 7-10-1 network evaluated on the first 10 training samples.
params = initialize_params([7, 10, 1])
ANN(x_train[:10,:], params)

Implement the quadratic loss (MSE) function `L = MSE(x, y, params)`.

In [None]:
def MSE(x, y, params):
  """Quadratic loss (mean squared error) of the network on the data `(x, y)`.

  Exercise stub: the body is a placeholder, so the function currently returns
  None.

  x: network inputs.
  y: corresponding target values.
  params: network parameters as produced by `initialize_params`.
  """
  ...

# Smoke test: MSE of a freshly initialized 7-10-1 network on the train set.
params = initialize_params([7, 10, 1])
print(MSE(x_train, y_train, params))

Implement an $l^2$ regularization term for the ANN weights:
$$
\mathrm{MSW} = \frac{1}{n_{weights}} \sum_{i=1}^{n_{weights}} w_i^2
$$
and define the loss function as
$$
\mathcal{L} = \mathrm{MSE} + \beta \, \mathrm{MSW}
$$
where $\beta$ is a suitable penalization parameter.

In [None]:
def MSW(params):
  """Mean of the squared ANN weights, used as l2 regularization term.

  Exercise stub: the body is a placeholder, so the function currently returns
  None. According to the accompanying notebook text, the completed version
  should return (1 / n_weights) * sum_i w_i^2.

  params: network parameters as produced by `initialize_params`.
  """
  ...

def loss(x, y, params, penalization):
  """Regularized loss: MSE plus `penalization` times MSW.

  Exercise stub: the body is a placeholder, so the function currently returns
  None.

  x, y: inputs and targets on which the MSE is evaluated.
  params: network parameters as produced by `initialize_params`.
  penalization: the penalization parameter (beta in the notebook text).
  """
  ...

# Smoke test on the parameters created in a previous cell, with beta = 1.
print(MSW(params))
print(loss(x_train, y_train, params, 1.0))

Run this cell: we will use this callback to monitor training.

In [None]:
from IPython import display

class Callback:
  """Live training monitor that redraws the loss/MSE histories in the notebook.

  The plotted data are read from the module-level lists `history_loss_train`,
  `history_loss_valid`, `history_MSE_train` and `history_MSE_valid`, which
  must exist by the time a redraw with a positive epoch happens.
  """

  def __init__(self, refresh_rate = 250):
    """Create the figure and trigger a first display.

    refresh_rate: redraw whenever `(epoch + 1)` is a multiple of this value.
    """
    self.refresh_rate = refresh_rate
    self.fig, self.axs = plt.subplots(1, figsize=(16,8))
    self.epoch = 0
    # epoch = -1 always satisfies the refresh condition, so the (still empty)
    # figure is shown immediately.
    self.__call__(-1)

  def __call__(self, epoch):
    """Record the current epoch and refresh the output when it is due.

    epoch: zero-based index of the epoch that has just been completed.
    """
    self.epoch = epoch
    if (epoch + 1) % self.refresh_rate == 0:
      self.draw()
      display.clear_output(wait=True)
      # Show this callback's own figure rather than whatever figure happens
      # to be current, so other plots cannot hijack the monitor.
      display.display(self.fig)
      time.sleep(1e-16)

  def draw(self):
    """Redraw the four history curves on log-log axes (no-op unless epoch > 0)."""
    if self.epoch > 0:
      self.axs.clear()
      self.axs.loglog(history_loss_train, 'b-', label = 'loss train')
      self.axs.loglog(history_loss_valid, 'r-', label = 'loss validation')
      # These series are the MSE histories, so label them as MSE (not RMSE).
      self.axs.loglog(history_MSE_train, 'b--', label = 'MSE train')
      self.axs.loglog(history_MSE_valid, 'r--', label = 'MSE validation')
      self.axs.legend()
      self.axs.set_title('epoch %d' % (self.epoch + 1))

## Training

Train an ANN with two hidden layers with 20 neurons each, using 5000 epochs of the SGD method (with minibatch size 100) with momentum ($\alpha = 0.9$).
Employ a linear decay of the learning rate:
$$
\lambda_k = \max\left(\lambda_\min, \lambda_\max \left(1 - \frac{k}{K}\right)\right)
$$
with $\lambda_\min = 5e-3$, $\lambda_\max = 1e-1$ and decay length $K= 1000$.

During training, store both the MSE error and the loss function obtained on the train and validation sets in 4 lists, respectively called:
- `history_loss_train`
- `history_loss_valid`
- `history_MSE_train`
- `history_MSE_valid`

Test different choices of the penalization parameter $\beta$.



In [None]:
# Hyperparameters
layers_size = [7, 20, 20, 1]  # input layer, two hidden layers of 20 neurons, output layer
penalization = 0.0  # beta: weight of the MSW term in the loss
# Training options
num_epochs = 5000
learning_rate_max = 1e-1  # lambda_max of the linear learning rate decay
learning_rate_min = 5e-3  # lambda_min of the linear learning rate decay
learning_rate_decay = 1000  # decay length K
batch_size = 100  # minibatch size
alpha = 0.9  # momentum coefficient
########################################

# TODO(exercise): set up whatever the training loop needs (initial parameters,
# optimizer state, ...).
...

# Per-epoch histories; the `Callback` class reads these module-level lists
# when it redraws the plot.
history_loss_train = list()
history_loss_valid = list()
history_MSE_train = list()
history_MSE_valid = list()

# TODO(exercise): training loop. It must append to the four history lists:
# while they are empty, the prints below raise IndexError.
...

# Final values of the monitored quantities.
print('loss (train     ): %1.3e' % history_loss_train[-1])
print('loss (validation): %1.3e' % history_loss_valid[-1])
print('MSE  (train     ): %1.3e' % history_MSE_train[-1])
print('MSE  (validation): %1.3e' % history_MSE_valid[-1])

We now want to investigate in more depth the effect of the penalization parameter $\beta$.
Write a function that, given the penalization parameter, trains the ANN (with the same setting used above) and returns a dictionary containing the final values of:
- train loss
- validation loss
- train MSE
- validation MSE
- MSW

In [None]:
# Hyperparameters
layers_size = [7, 20, 20, 1]
# Training options
num_epochs = 5000
learning_rate_max = 1e-1
learning_rate_min = 5e-3
learning_rate_decay = 1000
batch_size = 100
alpha = 0.9

def train(penalization):
  """Train the ANN for a given penalization parameter and report final metrics.

  Exercise template: the training code and the returned values are `...`
  placeholders to be filled in. The training is meant to use the module-level
  settings defined above (architecture, epochs, learning rate schedule,
  minibatch size, momentum).

  penalization: the penalization parameter (beta) multiplying the MSW term.
  Returns: a dictionary with the final values of the five quantities requested
    by the notebook text: train loss, validation loss, train MSE, validation
    MSE and MSW.
  """

  # TODO(exercise): train the network here.
  ...

  return {
      'loss_train' : ... ,
      'loss_valid' : ... ,
      'MSE_train'  : ... ,
      'MSE_valid'  : ... ,
      'MSW'        : ... ,
      }

Using the above defined function, store the obtained results for $\beta = 0, 0.25, 0.5, 0.75, \dots, 2$.

Plot the trend of the five quantities as functions of $\beta$.

Plot the *Tikhonov L-curve*, which is - in this context - the curve "train MSE" versus "MSW".