# Chapter XIV: Regularizations

In [1]:
import matplotlib.pyplot as plt
import nnfs
from nnfs.datasets import spiral_data
import numpy as np
nnfs.init()

In [2]:
import sys, os

# Make the sibling '../modules' directory importable (it holds the local layers, activation, loss and optimizer modules)
sys.path.append(os.path.abspath('../modules'))

from layers import Layer_Dense, Layer_Dropout
from activation_functions import Activation_ReLU
from losses import Activation_Softmax_Loss_CategoricalCrossentropy
from optimizers import Optimizer_Adam

In [3]:
# Experiment: 2 -> 64 -> 3 network on spiral data generated with samples=100.
# Only the first dense layer is given L2 regularizer arguments.
EPOCHS = 10001    # range(EPOCHS) covers epochs 0..10000 inclusive
LOG_EVERY = 100   # a progress line is printed every LOG_EVERY epochs

# Spiral dataset: 2 input features per point, 3 classes
X, y = spiral_data(samples=100, classes=3)

# Hidden layer (2 inputs -> 64 outputs), constructed with
# weight_regularizer_l2 and bias_regularizer_l2 both set to 5e-4
dense1 = Layer_Dense(
    2, 64,
    weight_regularizer_l2=5e-4,
    bias_regularizer_l2=5e-4,
)
# ReLU activation applied to the hidden layer's output
activation1 = Activation_ReLU()
# Output layer (64 inputs -> 3 outputs); no regularizer arguments are passed
dense2 = Layer_Dense(64, 3)
# Combined softmax activation + categorical cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Adam optimizer with learning-rate decay
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# Shorthand for the per-layer regularization penalty exposed by the loss object
penalty_of = loss_activation.loss.regularization_loss

for epoch in range(EPOCHS):
    # ---- Forward pass: dense1 -> ReLU -> dense2 -> softmax/loss ----
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    # forward() of the combined object returns the data loss
    data_loss = loss_activation.forward(dense2.output, y)

    # ---- Total loss = data loss + penalties of both dense layers ----
    regularization_loss = penalty_of(dense1) + penalty_of(dense2)
    loss = data_loss + regularization_loss

    # ---- Accuracy: predicted class = argmax over the softmax output ----
    predictions = np.argmax(loss_activation.output, axis=1)
    if y.ndim == 2:
        # 2-D targets are reduced to class indices before comparing
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)

    if epoch % LOG_EVERY == 0:
        print(
            f'epoch: {epoch}, '
            f'acc: {accuracy:.3f}, '
            f'loss: {loss:.3f} ('
            f'data_loss: {data_loss:.3f}, '
            f'reg_loss: {regularization_loss:.3f}), '
            f'lr: {optimizer.current_learning_rate}'
        )

    # ---- Backward pass: same chain as the forward pass, in reverse ----
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # ---- Parameter update for both dense layers ----
    optimizer.pre_update_params()
    for layer in (dense1, dense2):
        optimizer.update_params(layer)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099 (data_loss: 1.099, reg_loss: 0.000), lr: 0.02
epoch: 100, acc: 0.690, loss: 0.829 (data_loss: 0.787, reg_loss: 0.042), lr: 0.019999010049002574
epoch: 200, acc: 0.783, loss: 0.681 (data_loss: 0.602, reg_loss: 0.079), lr: 0.019998010197985302
epoch: 300, acc: 0.817, loss: 0.601 (data_loss: 0.507, reg_loss: 0.094), lr: 0.019997010446938183
epoch: 400, acc: 0.830, loss: 0.552 (data_loss: 0.449, reg_loss: 0.102), lr: 0.01999601079584623
epoch: 500, acc: 0.863, loss: 0.517 (data_loss: 0.411, reg_loss: 0.106), lr: 0.01999501124469445
epoch: 600, acc: 0.857, loss: 0.488 (data_loss: 0.381, reg_loss: 0.108), lr: 0.01999401179346786
epoch: 700, acc: 0.873, loss: 0.469 (data_loss: 0.361, reg_loss: 0.108), lr: 0.01999301244215147
epoch: 800, acc: 0.877, loss: 0.451 (data_loss: 0.344, reg_loss: 0.108), lr: 0.0199920131907303
epoch: 900, acc: 0.877, loss: 0.440 (data_loss: 0.333, reg_loss: 0.107), lr: 0.019991014039189386
epoch: 1000, acc: 0.883, loss: 0.425 (data_l

In [4]:
# Validation: score the model objects currently held in dense1 / activation1 /
# dense2 / loss_activation on a newly generated spiral dataset.
# NOTE: this reads whatever those names refer to in the kernel, so run it
# directly after the training cell it is meant to evaluate.

# Fresh data for evaluation, generated with samples=100 and classes=3
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only (no backward pass, no optimizer step):
# dense1 -> ReLU -> dense2 -> softmax/loss
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
# The combined object's forward() returns the loss on the test data
loss = loss_activation.forward(dense2.output, y_test)

# Predicted class = index of the largest softmax output in each row
predictions = np.argmax(loss_activation.output, axis=1)
if y_test.ndim == 2:
    # 2-D targets are reduced to class indices before comparing
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.840, loss: 0.462


In [5]:
# Experiment: same 2 -> 64 -> 3 network, but the spiral data is generated
# with samples=1000 instead of 100.
EPOCHS = 10001    # range(EPOCHS) covers epochs 0..10000 inclusive
LOG_EVERY = 100   # a progress line is printed every LOG_EVERY epochs

# Larger spiral dataset: 2 input features per point, 3 classes
X, y = spiral_data(samples=1000, classes=3)

# Hidden layer (2 inputs -> 64 outputs), constructed with
# weight_regularizer_l2 and bias_regularizer_l2 both set to 5e-4
dense1 = Layer_Dense(
    2, 64,
    weight_regularizer_l2=5e-4,
    bias_regularizer_l2=5e-4,
)
# ReLU activation applied to the hidden layer's output
activation1 = Activation_ReLU()
# Output layer (64 inputs -> 3 outputs); no regularizer arguments are passed
dense2 = Layer_Dense(64, 3)
# Combined softmax activation + categorical cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Adam optimizer with learning-rate decay
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# Shorthand for the per-layer regularization penalty exposed by the loss object
penalty_of = loss_activation.loss.regularization_loss

for epoch in range(EPOCHS):
    # ---- Forward pass: dense1 -> ReLU -> dense2 -> softmax/loss ----
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    # forward() of the combined object returns the data loss
    data_loss = loss_activation.forward(dense2.output, y)

    # ---- Total loss = data loss + penalties of both dense layers ----
    regularization_loss = penalty_of(dense1) + penalty_of(dense2)
    loss = data_loss + regularization_loss

    # ---- Accuracy: predicted class = argmax over the softmax output ----
    predictions = np.argmax(loss_activation.output, axis=1)
    if y.ndim == 2:
        # 2-D targets are reduced to class indices before comparing
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)

    if epoch % LOG_EVERY == 0:
        print(
            f'epoch: {epoch}, '
            f'acc: {accuracy:.3f}, '
            f'loss: {loss:.3f} ('
            f'data_loss: {data_loss:.3f}, '
            f'reg_loss: {regularization_loss:.3f}), '
            f'lr: {optimizer.current_learning_rate}'
        )

    # ---- Backward pass: same chain as the forward pass, in reverse ----
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # ---- Parameter update for both dense layers ----
    optimizer.pre_update_params()
    for layer in (dense1, dense2):
        optimizer.update_params(layer)
    optimizer.post_update_params()

epoch: 0, acc: 0.391, loss: 1.099 (data_loss: 1.099, reg_loss: 0.000), lr: 0.02
epoch: 100, acc: 0.642, loss: 0.922 (data_loss: 0.894, reg_loss: 0.029), lr: 0.019999010049002574
epoch: 200, acc: 0.744, loss: 0.749 (data_loss: 0.680, reg_loss: 0.069), lr: 0.019998010197985302
epoch: 300, acc: 0.794, loss: 0.653 (data_loss: 0.561, reg_loss: 0.092), lr: 0.019997010446938183
epoch: 400, acc: 0.812, loss: 0.602 (data_loss: 0.501, reg_loss: 0.100), lr: 0.01999601079584623
epoch: 500, acc: 0.826, loss: 0.569 (data_loss: 0.466, reg_loss: 0.102), lr: 0.01999501124469445
epoch: 600, acc: 0.840, loss: 0.541 (data_loss: 0.438, reg_loss: 0.103), lr: 0.01999401179346786
epoch: 700, acc: 0.843, loss: 0.519 (data_loss: 0.417, reg_loss: 0.102), lr: 0.01999301244215147
epoch: 800, acc: 0.853, loss: 0.501 (data_loss: 0.401, reg_loss: 0.100), lr: 0.0199920131907303
epoch: 900, acc: 0.856, loss: 0.487 (data_loss: 0.389, reg_loss: 0.098), lr: 0.019991014039189386
epoch: 1000, acc: 0.855, loss: 0.476 (data_l

In [7]:
# Validation: score the model objects currently held in dense1 / activation1 /
# dense2 / loss_activation on a newly generated spiral dataset.
# NOTE: this reads whatever those names refer to in the kernel, so run it
# directly after the training cell it is meant to evaluate.

# Fresh data for evaluation, generated with samples=100 and classes=3
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only (no backward pass, no optimizer step):
# dense1 -> ReLU -> dense2 -> softmax/loss
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
# The combined object's forward() returns the loss on the test data
loss = loss_activation.forward(dense2.output, y_test)

# Predicted class = index of the largest softmax output in each row
predictions = np.argmax(loss_activation.output, axis=1)
if y_test.ndim == 2:
    # 2-D targets are reduced to class indices before comparing
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.907, loss: 0.275


In [6]:
# Experiment: wider hidden layer. The network is 2 -> 256 -> 3 and the spiral
# data is generated with samples=1000.
EPOCHS = 10001    # range(EPOCHS) covers epochs 0..10000 inclusive
LOG_EVERY = 100   # a progress line is printed every LOG_EVERY epochs

# Spiral dataset: 2 input features per point, 3 classes
X, y = spiral_data(samples=1000, classes=3)

# Hidden layer (2 inputs -> 256 outputs), constructed with
# weight_regularizer_l2 and bias_regularizer_l2 both set to 5e-4
dense1 = Layer_Dense(
    2, 256,
    weight_regularizer_l2=5e-4,
    bias_regularizer_l2=5e-4,
)
# ReLU activation applied to the hidden layer's output
activation1 = Activation_ReLU()
# Output layer (256 inputs, matching dense1's output width -> 3 outputs);
# no regularizer arguments are passed
dense2 = Layer_Dense(256, 3)
# Combined softmax activation + categorical cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Adam optimizer with learning-rate decay
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# Shorthand for the per-layer regularization penalty exposed by the loss object
penalty_of = loss_activation.loss.regularization_loss

for epoch in range(EPOCHS):
    # ---- Forward pass: dense1 -> ReLU -> dense2 -> softmax/loss ----
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    # forward() of the combined object returns the data loss
    data_loss = loss_activation.forward(dense2.output, y)

    # ---- Total loss = data loss + penalties of both dense layers ----
    regularization_loss = penalty_of(dense1) + penalty_of(dense2)
    loss = data_loss + regularization_loss

    # ---- Accuracy: predicted class = argmax over the softmax output ----
    predictions = np.argmax(loss_activation.output, axis=1)
    if y.ndim == 2:
        # 2-D targets are reduced to class indices before comparing
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)

    if epoch % LOG_EVERY == 0:
        print(
            f'epoch: {epoch}, '
            f'acc: {accuracy:.3f}, '
            f'loss: {loss:.3f} ('
            f'data_loss: {data_loss:.3f}, '
            f'reg_loss: {regularization_loss:.3f}), '
            f'lr: {optimizer.current_learning_rate}'
        )

    # ---- Backward pass: same chain as the forward pass, in reverse ----
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # ---- Parameter update for both dense layers ----
    optimizer.pre_update_params()
    for layer in (dense1, dense2):
        optimizer.update_params(layer)
    optimizer.post_update_params()

epoch: 0, acc: 0.302, loss: 1.099 (data_loss: 1.099, reg_loss: 0.000), lr: 0.02
epoch: 100, acc: 0.699, loss: 0.793 (data_loss: 0.742, reg_loss: 0.051), lr: 0.019999010049002574
epoch: 200, acc: 0.809, loss: 0.624 (data_loss: 0.537, reg_loss: 0.087), lr: 0.019998010197985302
epoch: 300, acc: 0.841, loss: 0.548 (data_loss: 0.449, reg_loss: 0.098), lr: 0.019997010446938183
epoch: 400, acc: 0.867, loss: 0.501 (data_loss: 0.400, reg_loss: 0.101), lr: 0.01999601079584623
epoch: 500, acc: 0.872, loss: 0.469 (data_loss: 0.369, reg_loss: 0.099), lr: 0.01999501124469445
epoch: 600, acc: 0.881, loss: 0.446 (data_loss: 0.350, reg_loss: 0.097), lr: 0.01999401179346786
epoch: 700, acc: 0.885, loss: 0.428 (data_loss: 0.334, reg_loss: 0.094), lr: 0.01999301244215147
epoch: 800, acc: 0.887, loss: 0.416 (data_loss: 0.325, reg_loss: 0.091), lr: 0.0199920131907303
epoch: 900, acc: 0.889, loss: 0.406 (data_loss: 0.317, reg_loss: 0.088), lr: 0.019991014039189386
epoch: 1000, acc: 0.894, loss: 0.396 (data_l

In [8]:
# Validation: score the model objects currently held in dense1 / activation1 /
# dense2 / loss_activation on a newly generated spiral dataset.
# NOTE: this reads whatever those names refer to in the kernel, so run it
# directly after the training cell it is meant to evaluate.

# Fresh data for evaluation, generated with samples=100 and classes=3
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only (no backward pass, no optimizer step):
# dense1 -> ReLU -> dense2 -> softmax/loss
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
# The combined object's forward() returns the loss on the test data
loss = loss_activation.forward(dense2.output, y_test)

# Predicted class = index of the largest softmax output in each row
predictions = np.argmax(loss_activation.output, axis=1)
if y_test.ndim == 2:
    # 2-D targets are reduced to class indices before comparing
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.880, loss: 0.303


In [11]:
# Experiment: 2 -> 64 -> 3 network with a dropout layer placed between the
# ReLU activation and the output layer. This run also uses a higher learning
# rate (0.05) and a larger decay (5e-5) than a 0.02 / 5e-7 configuration.
EPOCHS = 10001    # range(EPOCHS) covers epochs 0..10000 inclusive
LOG_EVERY = 100   # a progress line is printed every LOG_EVERY epochs

# Spiral dataset: 2 input features per point, 3 classes
X, y = spiral_data(samples=1000, classes=3)

# Hidden layer (2 inputs -> 64 outputs), constructed with
# weight_regularizer_l2 and bias_regularizer_l2 both set to 5e-4
dense1 = Layer_Dense(
    2, 64,
    weight_regularizer_l2=5e-4,
    bias_regularizer_l2=5e-4,
)
# ReLU activation applied to the hidden layer's output
activation1 = Activation_ReLU()
# Dropout layer constructed with 0.1
# (presumably the drop rate -- TODO confirm against the layers module)
dropout1 = Layer_Dropout(0.1)
# Output layer (64 inputs -> 3 outputs); no regularizer arguments are passed
dense2 = Layer_Dense(64, 3)
# Combined softmax activation + categorical cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Adam optimizer with learning-rate decay
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-5)

# Shorthand for the per-layer regularization penalty exposed by the loss object
penalty_of = loss_activation.loss.regularization_loss

for epoch in range(EPOCHS):
    # ---- Forward pass: dense1 -> ReLU -> dropout -> dense2 -> softmax/loss ----
    dense1.forward(X)
    activation1.forward(dense1.output)
    dropout1.forward(activation1.output)
    # dense2 consumes the dropout layer's output, not the raw activation
    dense2.forward(dropout1.output)
    # forward() of the combined object returns the data loss
    data_loss = loss_activation.forward(dense2.output, y)

    # ---- Total loss = data loss + penalties of both dense layers ----
    regularization_loss = penalty_of(dense1) + penalty_of(dense2)
    loss = data_loss + regularization_loss

    # ---- Accuracy (computed with dropout1 in the forward path) ----
    predictions = np.argmax(loss_activation.output, axis=1)
    if y.ndim == 2:
        # 2-D targets are reduced to class indices before comparing
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)

    if epoch % LOG_EVERY == 0:
        print(
            f'epoch: {epoch}, '
            f'acc: {accuracy:.3f}, '
            f'loss: {loss:.3f} ('
            f'data_loss: {data_loss:.3f}, '
            f'reg_loss: {regularization_loss:.3f}), '
            f'lr: {optimizer.current_learning_rate}'
        )

    # ---- Backward pass: same chain as the forward pass, in reverse ----
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    dropout1.backward(dense2.dinputs)
    activation1.backward(dropout1.dinputs)
    dense1.backward(activation1.dinputs)

    # ---- Parameter update for both dense layers ----
    optimizer.pre_update_params()
    for layer in (dense1, dense2):
        optimizer.update_params(layer)
    optimizer.post_update_params()

epoch: 0, acc: 0.290, loss: 1.099 (data_loss: 1.099, reg_loss: 0.000), lr: 0.05
epoch: 100, acc: 0.556, loss: 0.913 (data_loss: 0.887, reg_loss: 0.026), lr: 0.04975371909050202
epoch: 200, acc: 0.611, loss: 0.849 (data_loss: 0.816, reg_loss: 0.033), lr: 0.049507401356502806
epoch: 300, acc: 0.631, loss: 0.828 (data_loss: 0.795, reg_loss: 0.033), lr: 0.0492635105177595
epoch: 400, acc: 0.617, loss: 0.831 (data_loss: 0.800, reg_loss: 0.031), lr: 0.04902201088288642
epoch: 500, acc: 0.647, loss: 0.827 (data_loss: 0.795, reg_loss: 0.031), lr: 0.048782867456949125
epoch: 600, acc: 0.638, loss: 0.805 (data_loss: 0.774, reg_loss: 0.030), lr: 0.04854604592455945
epoch: 700, acc: 0.651, loss: 0.792 (data_loss: 0.763, reg_loss: 0.029), lr: 0.048311512633460556
epoch: 800, acc: 0.622, loss: 0.801 (data_loss: 0.773, reg_loss: 0.028), lr: 0.04807923457858551
epoch: 900, acc: 0.662, loss: 0.795 (data_loss: 0.767, reg_loss: 0.028), lr: 0.04784917938657352
epoch: 1000, acc: 0.654, loss: 0.786 (data_lo

In [12]:
# Validation: score the model objects currently held in dense1 / activation1 /
# dense2 / loss_activation on a newly generated spiral dataset.
# NOTE: this reads whatever those names refer to in the kernel, so run it
# directly after the training cell it is meant to evaluate.

# Fresh data for evaluation, generated with samples=100 and classes=3
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only (no backward pass, no optimizer step):
# dense1 -> ReLU -> dense2 -> softmax/loss
# No dropout layer is applied in this cell: activation1's output feeds dense2
# directly (presumably because dropout is meant for training only -- confirm
# against the Layer_Dropout implementation).
dense1.forward(X_test)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
# The combined object's forward() returns the loss on the test data
loss = loss_activation.forward(dense2.output, y_test)

# Predicted class = index of the largest softmax output in each row
predictions = np.argmax(loss_activation.output, axis=1)
if y_test.ndim == 2:
    # 2-D targets are reduced to class indices before comparing
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.710, loss: 0.632
