# Regularization

- Weight decay
- Dropout
- Early stopping

In [1]:
import sys, os
sys.path.append(os.path.abspath('../data'))
sys.path.append(os.path.abspath('../nnets'))
from dense import DenseUpdate as Dense, forward, backward
from csv_data import HousePricesDatasetWrapper
import numpy as np

# Build the dataset wrapper and fetch the train/validation/test splits.
wrapper = HousePricesDatasetWrapper()
# NOTE(review): train_data and valid_data are later unpacked as (x, y) pairs
# inside training_run — confirm that get_flat_datasets() returns that layout.
train_data, valid_data, test_data = wrapper.get_flat_datasets()

In [8]:
%env WANDB_SILENT=True

import wandb
# Log in to Weights & Biases so the training runs below can be recorded.
wandb.login()

def training_run(epochs, regularization, layers, optimizer, train_data, valid_data, name=None):
    """Train `layers` one example at a time and log per-epoch losses to W&B.

    Args:
        epochs: Number of full passes over the training data.
        regularization: Label stored in the W&B run config under
            "regularization".
        layers: Layers handed to `forward`/`backward` and updated by the
            optimizer.
        optimizer: Callable invoked as `optimizer(layer_grads, layers, 1)`
            after every training example.
        train_data: `(x, y)` pair used for training.
        valid_data: `(x, y)` pair used to compute the validation loss.
        name: Optional W&B run name.

    The W&B run is always closed: normally on success, and with a non-zero
    exit code if training raises or is interrupted (the exception is
    re-raised).
    """
    # Initialize a new W&B run, with the right parameters
    wandb.init(project="regularization",
               name=name,
               config={"regularization": regularization})

    try:
        # Split the training and valid data into x and y
        train_x, train_y = train_data
        valid_x, valid_y = valid_data

        for epoch in range(epochs):
            running_loss = 0
            for i in range(len(train_x)):
                # Slice (rather than index) so each batch keeps its leading
                # batch dimension of size 1
                x_batch = train_x[i:(i+1)]
                y_batch = train_y[i:(i+1)]
                # Make a prediction
                pred = forward(x_batch, layers)

                # The prediction error is what gets fed to the backward pass
                # as the gradient at the output
                error = pred - y_batch
                layer_grads = backward(error, layers)
                # Accumulate the squared error for the epoch's training loss
                running_loss += np.mean(error ** 2)

                # Run the optimizer (batch size is 1, matching the slices above)
                optimizer(layer_grads, layers, 1)

            # Calculate and log validation loss
            valid_preds = forward(valid_x, layers)
            valid_loss = np.mean((valid_preds - valid_y) ** 2)
            train_loss = running_loss / len(train_x)
            wandb.log({
                "valid_loss": valid_loss,
                "epoch": epoch,
                "train_loss": train_loss,
            })
    except BaseException:
        # Close the run even on errors or a notebook interrupt so that it is
        # not left open, and mark it as failed.  The exception still propagates.
        wandb.finish(exit_code=1)
        raise

    # Mark the run as complete
    wandb.finish()

env: WANDB_SILENT=True


# Weight decay

Weight decay is L2 regularization.  The goal is to shrink the weights towards 0 (i.e. to lower their L2 norm).

L1 regularization lowers the L1 norm (the sum of the absolute values of the weights).

In [6]:
class SGDW():
    """Stochastic gradient descent with weight decay.

    Each weight update is `-lr * (w_grad / batch_size + decay * weights)`.
    Biases get a plain `-lr * b_grad` update with no decay.  With `decay=0`
    this is ordinary SGD.
    """

    def __init__(self, lr, decay):
        """Store the learning rate `lr` and the weight decay coefficient `decay`."""
        self.lr = lr
        self.decay = decay

    def __call__(self, layer_grads, layers, batch_size):
        """Apply one update step to every layer.

        Args:
            layer_grads: Sequence of `(w_grad, b_grad)` pairs, ordered from the
                output layer backward to the input layer.
            layers: Layers in forward order; each must expose `weights` and
                `update(w_update, b_update)`.
            batch_size: Number of examples the weight gradient is divided by.

        The gradient arrays passed in are left unmodified.
        """
        # Reverse the layers to match the grads (from output backward to input).
        for (w_grad, b_grad), layer in zip(layer_grads, reversed(layers)):
            # Normalize the weight gradient by batch size.  Done out of place
            # so the caller's array is not modified (and so integer-typed
            # gradients do not make the in-place true division fail).
            w_grad = w_grad / batch_size

            # Gradient step plus the decay term that pulls the weights toward 0
            w_update = -self.lr * (w_grad + self.decay * layer.weights)
            # We don't usually decay the bias
            b_update = -self.lr * b_grad

            # Actually do the update
            layer.update(w_update, b_update)

In [None]:
# Three stacked Dense layers; the last one is created with activation=False.
# NOTE(review): presumably Dense(n_in, n_out), i.e. 7 inputs -> 25 -> 10 -> 1
# output — confirm against dense.py.
layers = [
    Dense(7, 25),
    Dense(25, 10),
    Dense(10, 1, activation=False)
]
# No decay is equal to SGD
sgd = SGDW(1e-4, 0)
# Train for 10 epochs; the run is logged to W&B under the name "sgd".
training_run(10, "None", layers, sgd, train_data, valid_data, name="sgd")

In [None]:
class Dropout():
    """Randomly zero out elements of the input with probability `drop_p`.

    A fresh 0/1 mask is sampled on every forward call and reused by the
    backward pass, so gradients only flow through the elements that were kept.
    Kept values are passed through unchanged (they are not rescaled by
    1 / (1 - drop_p)).
    """

    def __init__(self, drop_p):
        """Store `drop_p`, the probability that an element is zeroed."""
        self.drop_p = drop_p

    def _masked(self, values):
        """Return `values` with every position zeroed where the mask is 0."""
        return np.where(self.mask, values, 0)

    def forward(self, input):
        """Sample a new mask shaped like `input` and apply it."""
        # Each element is kept (mask == 1) with probability 1 - drop_p
        keep_p = 1 - self.drop_p
        self.mask = np.random.binomial(1, keep_p, input.shape)
        return self._masked(input)

    def backward(self, grad):
        """Apply the mask from the latest forward call to `grad`."""
        return self._masked(grad)

In [None]:
# 50% of inputs will be set to 0
dropout = Dropout(0.5)
# NOTE(review): `input_embed` and `data` are not defined in the visible part of
# this file — confirm they exist before running this cell.
dropout.forward(input_embed.forward(data[0]["en"]))

In [None]:
- Weight decay
- Early stopping

In [None]:
# Improve generalization and convergence
# Not strictly regularization

- Layernorm
- Residual connections


In [None]:
class LayerNorm():
    """Normalize each row of a 2-D input to zero mean and unit standard deviation.

    Statistics are computed along axis 1.  There is no learnable scale/shift
    and no epsilon term, so a row whose standard deviation is 0 results in a
    division by zero.
    """

    def __init__(self, embed_dim):
        """Store `embed_dim`; the normalization itself reads sizes from the input."""
        self.embed_dim = embed_dim

    def forward(self, input):
        """Return `(input - mean) / std`, with mean and std taken along axis 1."""
        # Cache for backward pass
        self.input = input
        # Calculate the mean and standard deviation
        self.mean = np.mean(input, axis=1, keepdims=True)
        self.std = np.std(input, axis=1, keepdims=True)
        # Normalize the input
        return (input - self.mean) / self.std

    def backward(self, grad):
        """Return the gradient with respect to the input of the last forward call.

        Args:
            grad: Gradient with respect to the forward output; same shape as
                the cached input.

        Returns:
            Array with the same shape as the cached input.
        """
        centered = self.input - self.mean
        n = self.input.shape[1]

        # Find the derivative of denominator
        # Derivative of multiplication of (input - mean) and 1 / std wrt denominator
        grad_denom = grad * centered
        # std is a single number per row, so sum across the row
        grad_std = np.sum(grad_denom, axis=1, keepdims=True)
        # Derivative of 1 / std
        grad_std = grad_std * -1 / (self.std**2)

        # Derivative of numerator
        grad_num = grad * 1 / self.std
        # Derivative of (input - mean) wrt mean
        # Mean is a single number per row, so sum across the row
        grad_mean = np.sum(grad_num * -1, axis=1, keepdims=True)
        # Derivative of numerator wrt input
        grad_input = grad_num

        # Derivative of std wrt input and mean
        # std formula is sqrt(sum((input - mean)**2) / n)
        # Undo sqrt
        grad_sq = grad_std * .5 * 1 / self.std
        # Undo division by n (multiplication by 1/n)
        grad_sq = grad_sq * 1 / n
        # Undo square and sum; this is now one value per input element
        grad_sq = grad_sq * 2 * centered
        # Derivative wrt input
        grad_input = grad_input + grad_sq
        # Derivative wrt mean.  The mean is a single number per row, so the
        # per-element contributions must be summed across the row before they
        # are added (adding the full-width array in place into the
        # single-column grad_mean cannot be broadcast and raises an error).
        grad_mean = grad_mean + np.sum(grad_sq * -1, axis=1, keepdims=True)

        # Derivative of mean wrt input, mean formula is sum(input) / n
        # Undo division by n (multiplication by 1/n)
        grad_mean = grad_mean * 1 / n
        # The derivative of sum is to distribute the gradient across the input
        # Any change to any single input will proportionally change the sum of the input
        grad_input = grad_input + grad_mean

        return grad_input

In [None]:
# NOTE(review): `input_embed` and `data` are not defined in the visible part of
# this file — confirm they exist before running this cell.
layer_norm = LayerNorm(512)
layer_norm.forward(input_embed.forward(data[0]["en"]))

In [None]:
## Residual connections