# Chapter 10: Optimizers

### Full Code

In [95]:
import numpy as np
import random

random.seed(0)
np.random.seed(0)


# Our sample dataset
def create_data(n, k):
    X = np.zeros((n*k, 2))  # data matrix (each row = single example)
    y = np.zeros(n*k, dtype='uint8')  # class labels
    for j in range(k):
        ix = range(n*j, n*(j+1))
        r = np.linspace(0.0, 1, n)  # radius
        t = np.linspace(j*4, (j+1)*4, n) + np.random.randn(n)*0.2  # theta
        X[ix] = np.c_[r*np.sin(t*2.5), r*np.cos(t*2.5)]
        y[ix] = j
    return X, y


# Dense layer
class Layer_Dense:

    # Layer initialization
    def __init__(self, inputs, neurons):
        # Initialize weights and biases
        self.weights = 0.01 * np.random.randn(inputs, neurons)
        self.biases = np.zeros((1, neurons))

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from input ones, weights and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on values
        self.dvalues = np.dot(dvalues, self.weights.T)


# ReLU activation
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from input ones
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Since we need to modify original variable, 
        # let's make a copy of values first
        self.dvalues = dvalues.copy()

        # Zero gradient where input values were negative 
        self.dvalues[self.inputs <= 0] = 0 


# Softmax activation
class Activation_Softmax:

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs

        # get unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):
        self.dvalues = dvalues.copy()


# Cross-entropy loss
class Loss_CategoricalCrossentropy:

    # Forward pass
    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = y_pred.shape[0]

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            y_pred = y_pred[range(samples), y_true]

        # Losses
        negative_log_likelihoods = -np.log(y_pred)

        # Mask values - only for one-hot encoded labels
        if len(y_true.shape) == 2:
            negative_log_likelihoods *= y_true

        # Overall loss
        data_loss = np.sum(negative_log_likelihoods) / samples
        return data_loss

    # Backward pass
    def backward(self, dvalues, y_true):

        samples = dvalues.shape[0]

        self.dvalues = dvalues.copy()  # Copy so we can safely modify
        self.dvalues[range(samples), y_true] -= 1
        self.dvalues = self.dvalues / samples


# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(3, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Make a forward pass of our training data thru this layer
dense1.forward(X)

# Make a forward pass thru activation function - we take output of previous layer here
activation1.forward(dense1.output)

# Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# Make a forward pass thru activation function - we take output of previous layer here
activation2.forward(dense2.output)

# Let's see output of the first few samples:
print(activation2.output[:5])

# Calculate loss from output of activation2 (softmax activation)
loss = loss_function.forward(activation2.output, y)

# Print loss value
print('loss:', loss)

# Calculate accuracy from output of activation2 and targets
predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
accuracy = np.mean(predictions==y)

print('acc:', accuracy)

# Backward pass
loss_function.backward(activation2.output, y)
activation2.backward(loss_function.dvalues)
dense2.backward(activation2.dvalues)
activation1.backward(dense2.dvalues)
dense1.backward(activation1.dvalues)

# Print gradients
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)

[[0.33333333 0.33333333 0.33333333]
 [0.33333317 0.33333318 0.33333364]
 [0.33333289 0.33333292 0.3333342 ]
 [0.33333259 0.33333264 0.33333477]
 [0.33333233 0.33333239 0.33333528]]
loss: 1.0986104615465142
acc: 0.34
[[ 1.57663575e-04  7.83685868e-05  4.73243939e-05]
 [ 1.81610390e-04  1.10455707e-05 -3.30962973e-05]]
[[-3.60553684e-04  9.66122221e-05 -1.03671511e-04]]
[[ 5.44109554e-05  1.07411413e-04 -1.61822369e-04]
 [-4.07913528e-05 -7.16780945e-05  1.12469447e-04]
 [-5.30112970e-05  8.58172904e-05 -3.28059934e-05]]
[[-1.06521079e-05 -9.44490453e-06  2.00970125e-05]]


- once we have calculated the gradient, we can use this to adjust weights and biases to decrease the loss
- in a previous example, we showed how we could successfully decrease a neuron’s activation function’s (ReLU) output in this manner
- recall that we subtracted a fraction of the gradient for each weight and bias parameter
- while very rudimentary, this is still a common optimizer, which is called **Stochastic Gradient Descent** (SGD)
- as you will soon discover, most optimizers are just variants of SGD

### Stochastic Gradient Descent (SGD)
- we choose a learning rate of 1.0
- we then subtract `learning_rate * parameter_gradients` from the actual parameter values
- because our learning rate is 1, we're subtracting the full gradient value from our parameters

In [96]:
class Optimizer_SGD:

    # Initialize optimizer - set settings, 
    # learning rate of 1.0 is default for this optimizer
    def __init__(self, learning_rate=1.0):
        self.learning_rate = learning_rate

    # Update parameters
    def update_params(self, layer):
        layer.weights -= self.learning_rate * layer.dweights
        layer.biases -= self.learning_rate * layer.dbiases

In [97]:
optimizer = Optimizer_SGD()

- after calculating the gradient, use this to update our layer's parameters:

In [98]:
optimizer.update_params(dense1)
optimizer.update_params(dense2)

- in our main neural network code, we employ this update after backpropagation
---
- step 1): make a 2x64 (2 layers, 64 neurons each) densely-connected neural network

In [99]:
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 64 output values (64 neurons)
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3) 

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create Optimizer
optimizer = Optimizer_SGD()

- step 2): perform a forward pass of our sample data

In [100]:
# Make a forward pass of our training data thru this layer
dense1.forward(X)

# Make a forward pass thru activation function
# it takes the output of first dense layer here
activation1.forward(dense1.output)

# Make a forward pass thru second Dense layer 
# it takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)

# Make a forward pass thru activation function
# it takes the output of second dense layer here
activation2.forward(dense2.output)

- step 3): calculate the loss and accuracy

In [101]:
# Calculate loss from output of activation2 (softmax activation)
loss = loss_function.forward(activation2.output, y)

# Let's print loss value
print('loss:', loss)

# Calculate accuracy from output of activation2 and targets
predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
accuracy = np.mean(predictions==y)

print('acc:', accuracy)

loss: 1.0986057583966125
acc: 0.3466666666666667


- step 4): perform a backwards pass (i.e., backpropagation)

In [102]:
# Backward pass
loss_function.backward(activation2.output, y)
activation2.backward(loss_function.dvalues)
dense2.backward(activation2.dvalues)
activation1.backward(dense2.dvalues)
dense1.backward(activation1.dvalues)

step 5): use optimizer to update weights using the calculated gradient

In [103]:
# Update weights and biases (which are in the layers)
optimizer.update_params(dense1)
optimizer.update_params(dense2)

- this is everything we need to train our model, but let's perform optimization multiple times using Python's looping capabilities

### Training for Multiple Epochs
- we will repeatedly perform a forward pass, backward pass, and optimize our weights and biases until we reach a stopping point
- each full pass through all of the training data is called an **epoch**
- in most deep learning tasks, a neural network will be trained for multiple epochs, though the ideal scenario would be to have a perfect model with ideal weights and biases after only one epoch
- to add multiple epochs, we will initialize our model and run a loop around all the code performing the forward pass, backward pass, and optimization calculations:

In [104]:
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 64 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD()

# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function
    # we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer 
    # outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function
    # we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    # calculate values along first axis
    predictions = np.argmax(activation2.output, axis=1) 
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print('epoch:', epoch, 'acc:', accuracy, 'loss:', loss)

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)

epoch: 0 acc: 0.36666666666666664 loss: 1.0985882449466124
epoch: 100 acc: 0.42 loss: 1.0720765463829551
epoch: 200 acc: 0.41 loss: 1.0624655989595677
epoch: 300 acc: 0.41 loss: 1.0616153723467232
epoch: 400 acc: 0.4166666666666667 loss: 1.0606483462374505
epoch: 500 acc: 0.42 loss: 1.0595095194010553
epoch: 600 acc: 0.42 loss: 1.05807753621963
epoch: 700 acc: 0.41333333333333333 loss: 1.0555091262523664
epoch: 800 acc: 0.43 loss: 1.050181932524032
epoch: 900 acc: 0.4533333333333333 loss: 1.0397715979508741
epoch: 1000 acc: 0.47333333333333333 loss: 1.0230670004571563
epoch: 1100 acc: 0.42 loss: 1.0214345839016705
epoch: 1200 acc: 0.39666666666666667 loss: 1.014260785726662
epoch: 1300 acc: 0.39 loss: 1.0067441796699017
epoch: 1400 acc: 0.4 loss: 1.0034592795149848
epoch: 1500 acc: 0.49333333333333335 loss: 0.9926242584545484
epoch: 1600 acc: 0.4066666666666667 loss: 1.0551332809720675
epoch: 1700 acc: 0.4066666666666667 loss: 1.0360131977649636
epoch: 1800 acc: 0.45 loss: 1.0126064197

- our neural network stays stuck at around a loss of 1 and an accuracy around 0.33
- iterating over more epochs doesn’t seem to be helpful at this point, which tells us that we’re likely stuck with our optimization
- recall that we’re adjusting our weights and biases by applying some fraction, in this case 1.0, to the gradient and subtracting this from the weights and biases
- this fraction is called the learning rate (LR) and is the primary adjustable parameter for the optimizer as it decreases loss

### Learning Rate 
- knowing exactly what the learning rate should be to get the most out of your training process isn’t possible, but a good rule is that your initial training will benefit from a larger learning rate to take those initial steps faster
- if you start with steps that are too small, you might get stuck in a local minimum, which you won't be able to escape due to not making large enough updates to the parameters
- for example, let's change SGD optimizer to have a learning rate of 0.5 instead of 1.0:

In [105]:
# Create optimizer
optimizer = Optimizer_SGD(learning_rate=.5) # adjusted learning rate

# Train in loop
for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)

    dense2.forward(activation1.output)
    activation2.forward(dense2.output)

    loss = loss_function.forward(activation2.output, y)

    predictions = np.argmax(activation2.output, axis=1)

    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print('epoch:', epoch, 'acc:', accuracy, 'loss:', loss)


    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

epoch: 0 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 100 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 200 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 300 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 400 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 500 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 600 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 700 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 800 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 900 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1000 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1100 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1200 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1300 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1400 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1500 acc: 0.8366666666666667 loss: 0.4451942603476483
epoch: 1600 acc: 0.8366666666666667 

- lower loss is not always associated with higher accuracy
- remember, even if we desire the best accuracy out of our model, the optimizer’s task is to decrease loss
- a common solution to keep initial updates large, but also explore a variety of learning rates during training is to implement a **learning rate decay**

### Learning Rate Decay
- the idea behind learning rate decay is to start with a large learning rate, say 1.0, and then decrease it throughout training
- one option is to decrease the learning rate in response to the behavior of the loss across epochs
- another option, which we are going to implement, is to program a **Decay Rate**, which decays the learning rate constantly per batch or epoch
- let’s plan to decay per step, which can also be referred to as **1/t decaying** 

In [106]:
starting_learning_rate = 1.
learning_rate_decay = 0.1
step = 1

learning_rate = starting_learning_rate * (1. / (1 + learning_rate_decay * step))
print(learning_rate)

0.9090909090909091


- in practice, 0.1 is an aggressive decay rate, but this should give you a sense of the concept
- this is what step 20 looks like:

In [107]:
starting_learning_rate = 1.
learning_rate_decay = 0.1
step = 20

learning_rate = starting_learning_rate * (1. / (1 + learning_rate_decay * step))
print(learning_rate)

0.3333333333333333


- we can also apply a loop, which is more comparable to our future application

In [108]:
starting_learning_rate = 1.
learning_rate_decay = 0.1

for step in range (20):
    learning_rate = starting_learning_rate * (1. / (1 + learning_rate_decay * step))
    print(learning_rate)

1.0
0.9090909090909091
0.8333333333333334
0.7692307692307692
0.7142857142857143
0.6666666666666666
0.625
0.588235294117647
0.5555555555555556
0.5263157894736842
0.5
0.47619047619047616
0.45454545454545453
0.4347826086956522
0.41666666666666663
0.4
0.3846153846153846
0.37037037037037035
0.35714285714285715
0.3448275862068965


- initially, the learning rate drops fast, but the change in the learning rate lowers at each step
- this lets the model get as close at possible to the minimum
- we can now update our `Optimizer_SGD()` class by implementing a learning rate decay: 

In [109]:
class Optimizer_SGD:

    def __init__(self, learning_rate=1., decay=0.1):
        self.learning_rate = learning_rate # initial learning rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0 # keeps track of the number of iterations the optimizer has completed (works with post_update_params)

    # Call once before any parameter updates
    def pre_update_params(self): # updates learning rate using the formula we covered earlier
        if self.decay:
            self.current_learning_rate = self.current_learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer): # this method remains unchanged
        layer.weights -= self.current_learning_rate * layer.dweights
        layer.biases -= self.current_learning_rate * layer.dbiases

    # Call once after any parameter updates
    def post_update_params(self): # adds to our self.iterations tracking
        self.iterations += 1

- in the `__init__()` method, we added handling for the current learning rate, and `self.learning_rate` is the initial learning rate
- we also added attributes to track the decay rate as well as the number of iterations that the optimizer has completed
- next, we added a new method called `pre_update_params()`, which will update our `self.current_learning_rate` using the formula we covered earlier
- the `update_params()` method remains unchanged, but we do have a new `post_update_params()` method which adds to our `self.iterations` tracking
- with our updated SGD optimizer class, let’s use a decay rate of 1e-6 (0.000001) and train our model again:

In [110]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=1e-6)

# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.397, loss: 1.087, lr: 0.995062394416656
epoch: 200, acc: 0.417, loss: 1.077, lr: 0.9802979952223477
epoch: 300, acc: 0.417, loss: 1.076, lr: 0.9561451727242277
epoch: 400, acc: 0.403, loss: 1.074, lr: 0.923310797854788
epoch: 500, acc: 0.397, loss: 1.072, lr: 0.8827358848191256
epoch: 600, acc: 0.417, loss: 1.069, lr: 0.8355508252084564
epoch: 700, acc: 0.427, loss: 1.065, lr: 0.7830231828383335
epoch: 800, acc: 0.440, loss: 1.060, lr: 0.7265014060259195
epoch: 900, acc: 0.437, loss: 1.055, lr: 0.6673579255709609
epoch: 1000, acc: 0.423, loss: 1.049, lr: 0.6069349461705745
epoch: 1100, acc: 0.403, loss: 1.038, lr: 0.5464958360051433
epoch: 1200, acc: 0.400, loss: 1.028, lr: 0.4871844246103457
epoch: 1300, acc: 0.420, loss: 1.019, lr: 0.4299937982030348
epoch: 1400, acc: 0.453, loss: 1.010, lr: 0.3757454062555259
epoch: 1500, acc: 0.483, loss: 1.000, lr: 0.3250785338236289
epoch: 1600, acc: 0.503, loss: 0.991, lr: 0.278449512

- as you can clearly see, this model got stuck somewhere in the first few thousand epochs
- the model got stuck because the learning rate decayed far too quickly, which made it too small to escape local minima
- let's trying making our decay rate smaller (now 5e-8):

In [111]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=5e-8) # adjusted decay rate <----------

# Train in loop
for epoch in range(10001):

    dense1.forward(X)
    activation1.forward(dense1.output)

    dense2.forward(activation1.output)
    activation2.forward(dense2.output)

    loss = loss_function.forward(activation2.output, y)

    predictions = np.argmax(activation2.output, axis=1)
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.400, loss: 1.087, lr: 0.9997525310359338
epoch: 200, acc: 0.417, loss: 1.077, lr: 0.9990054981534312
epoch: 300, acc: 0.420, loss: 1.076, lr: 0.997760023693318
epoch: 400, acc: 0.403, loss: 1.074, lr: 0.9960179759343823
epoch: 500, acc: 0.403, loss: 1.071, lr: 0.9937819644233231
epoch: 600, acc: 0.417, loss: 1.067, lr: 0.9910553334608369
epoch: 700, acc: 0.437, loss: 1.063, lr: 0.9878421537665991
epoch: 800, acc: 0.453, loss: 1.055, lr: 0.9841472123522141
epoch: 900, acc: 0.407, loss: 1.062, lr: 0.9799760006374122
epoch: 1000, acc: 0.410, loss: 1.058, lr: 0.9753347008507738
epoch: 1100, acc: 0.397, loss: 1.056, lr: 0.9702301707621199
epoch: 1200, acc: 0.427, loss: 1.068, lr: 0.9646699267993539
epoch: 1300, acc: 0.413, loss: 1.055, lr: 0.9586621256079148
epoch: 1400, acc: 0.390, loss: 1.078, lr: 0.9522155441161774
epoch: 1500, acc: 0.390, loss: 1.047, lr: 0.9453395581749566
epoch: 1600, acc: 0.417, loss: 1.043, lr: 0.93804411

- decreasing the decay rate has yieled our highest accuracy and lowest loss thus far
---
- Stochastic Gradient Descent with learning rate decay can do fairly well, but remains **a basic method of optimization that only follows a gradient without any additional logic that could potentially help the model find the global minimum to the loss function**
- one option for improving the SGD optimizer is to introduce **momentum** 

### Stochastic Gradient Descent with Momentum
- the idea behind momentum is to make a rolling average of gradients over some number of updates and use this average rather than the unique gradient at each step
- imagine a ball rolling down a hill: even if it runs into a small hole or incline, momentum will allow it to **escape** such obstacles and keep rolling towards a lower minimum
---
- remember that the gradient points towards the current steepest descent for that step, which may not necessarily follow descent towards the global minimum, but point towards a local minimum instead
- therefore, this step may decrease loss for that update, but may not get us out of the local minimum
- we could end up with a gradient that points in one direction, and then the opposite direction in the next update
- the gradient could continue to bounce back and forth around a local minimum, keeping the optimization of the loss stuck
- SGD with momentum, however, **uses the previous update’s direction to influence the next update’s direction**, minimizing the chances of getting stuck around a local minimum
---
- we effectively using a portion of the gradient from preceding steps as our momentum (direction of previous changes) and a portion of the current gradient
- together, these portions form the change to our parameters

In [137]:
class Optimizer_SGD:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=1., decay=0., momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum


    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.current_learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain momentum arrays, create ones 
        #filled with zeros
        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            # If there is no momentum array for weights
            # The array doesn’t exist for biases yet either.
            layer.bias_momentums = np.zeros_like(layer.biases)

        # If we use momentum
        if self.momentum:

            # Build weight updates with momentum - take previous 
            # updates multiplied by retain factor and update with 
            # current gradients
            weight_updates = (
                (self.momentum * layer.weight_momentums) - 
                (self.current_learning_rate * layer.dweights)
            )
            layer.weight_momentums = weight_updates

            # Build bias updates
            bias_updates = (
                (self.momentum * layer.bias_momentums) - 
                (self.current_learning_rate * layer.dbiases)
            )
            layer.bias_momentums = bias_updates

        # Vanilla SGD updates (as before momentum update)
        else:
            weight_updates = (-self.current_learning_rate *                      
                              layer.dweights)
            bias_updates = (-self.current_learning_rate * 
                            layer.dbiases)

        layer.weights += weight_updates
        layer.biases += bias_updates

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

- let's demonstrate how adding momentum changes the learning process
- we will keep the same initial **learning rate** (1) and **decay** (5e-8) from the previous training attempt:

In [121]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=5e-8, momentum=0.9)

# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.477, loss: 1.044, lr: 0.9997525310359338
epoch: 200, acc: 0.503, loss: 0.970, lr: 0.9990054981534312
epoch: 300, acc: 0.570, loss: 0.855, lr: 0.997760023693318
epoch: 400, acc: 0.663, loss: 0.717, lr: 0.9960179759343823
epoch: 500, acc: 0.677, loss: 0.616, lr: 0.9937819644233231
epoch: 600, acc: 0.727, loss: 0.542, lr: 0.9910553334608369
epoch: 700, acc: 0.747, loss: 0.501, lr: 0.9878421537665991
epoch: 800, acc: 0.743, loss: 0.484, lr: 0.9841472123522141
epoch: 900, acc: 0.773, loss: 0.468, lr: 0.9799760006374122
epoch: 1000, acc: 0.803, loss: 0.455, lr: 0.9753347008507738
epoch: 1100, acc: 0.777, loss: 0.502, lr: 0.9702301707621199
epoch: 1200, acc: 0.793, loss: 0.458, lr: 0.9646699267993539
epoch: 1300, acc: 0.730, loss: 0.496, lr: 0.9586621256079148
epoch: 1400, acc: 0.743, loss: 0.465, lr: 0.9522155441161774
epoch: 1500, acc: 0.813, loss: 0.432, lr: 0.9453395581749566
epoch: 1600, acc: 0.767, loss: 0.537, lr: 0.93804411

- by epoch 2000, the neural network reached ~83% accuracy, at which we should try a slower decay as change was stagnant from that point forward
- without momentum, a decay of 1e-8 performed worse than 5e-8, but let's try 1e-8 with momentum

In [122]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=1e-8, momentum=0.9)

# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.453, loss: 1.044, lr: 0.9999505012415218
epoch: 200, acc: 0.533, loss: 0.948, lr: 0.9998010199314954
epoch: 300, acc: 0.627, loss: 0.786, lr: 0.9995516010086423
epoch: 400, acc: 0.550, loss: 0.901, lr: 0.9992023193791428
epoch: 500, acc: 0.670, loss: 0.711, lr: 0.9987532798791473
epoch: 600, acc: 0.850, loss: 0.431, lr: 0.998204617222333
epoch: 700, acc: 0.877, loss: 0.344, lr: 0.99755649593255
epoch: 800, acc: 0.767, loss: 0.504, lr: 0.9968091102615951
epoch: 900, acc: 0.853, loss: 0.397, lr: 0.9959626840921862
epoch: 1000, acc: 0.850, loss: 0.374, lr: 0.9950174708261916
epoch: 1100, acc: 0.847, loss: 0.364, lr: 0.993973753258195
epoch: 1200, acc: 0.857, loss: 0.358, lr: 0.9928318434344862
epoch: 1300, acc: 0.847, loss: 0.353, lr: 0.9915920824975807
epoch: 1400, acc: 0.847, loss: 0.347, lr: 0.9902548405163678
epoch: 1500, acc: 0.850, loss: 0.344, lr: 0.9888205163020066
epoch: 1600, acc: 0.847, loss: 0.342, lr: 0.98728953720

- we can derive better results with more tweaking, but this is a decent example of how momentum can be useful by allowing us to use a slower rate, but still decreasing loss over local minimas
- the next modification to Stochasic Gradient Descent is **AdaGrad**

### AdaGrad (adaptive learning rate)
- short for adaptive gradient
- AdaGrad **introduces a per-parameter learning rate** instead of a globally-shared learning rate
- with AdaGrad, we allow the model to value certain features higher than others
- with our generated dataset, all features hold roughly the same value, but AdaGrad can provide significant improvements for feature sets where certain features have varying degrees of rarity or are more informative than others
- the concept of AdaGrad can be depicted in short, simple code:
---
`cache += parm_gradient ** 2`

`parm_updates = learning_rate * parm_gradient / (sqrt(cache) + eps)`

---
- `cache` holds a history of squared gradinets and `parm_updates` is a function of the learning rate multiplied by the `parm_gradient` (basic SGD at the moment), then we divide this by the square root of the `cache` plus some epsilon value
- epsilon is a hyperparameter (pre-training control knob setting), which we'll default to 1e-7 (usually a small number)
---
- **parameters** in a neural network are weights and biases that are being "learned" by the network during training
- **hyperparameters** are all the settings, like the number of neurons per layer, the learning rate, the decaying factor, the activation function(s), $l1/l2$ normalization amounts, and more
---
- overall, the impact is the learning rates for more **sparse** parameters are increased, while the less sparse parameters are decreased
- to implement AdaGrad, we'll start by copying and pasting our `Optimizer_SGD()` class
- then we'll change the name
- then we'll add `self.epsilon` to the `__init__()` method
- then we'll remove the momentum code from the `update_params()` method and replace it with AdaGrad code

In [162]:
class Optimizer_Adagrad:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon


    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.current_learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create ones filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

- now let's see the AdaGrad optimizer in action

In [168]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
#optimizer = Optimizer_SGD(decay=1e-8, momentum=0.9)
optimizer = Optimizer_Adagrad(decay=1e-8)

# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.493, loss: 1.012, lr: 0.9999505012415218
epoch: 200, acc: 0.510, loss: 0.939, lr: 0.9998010199314954
epoch: 300, acc: 0.573, loss: 0.892, lr: 0.9995516010086423
epoch: 400, acc: 0.600, loss: 0.855, lr: 0.9992023193791428
epoch: 500, acc: 0.617, loss: 0.802, lr: 0.9987532798791473
epoch: 600, acc: 0.650, loss: 0.778, lr: 0.998204617222333
epoch: 700, acc: 0.653, loss: 0.745, lr: 0.99755649593255
epoch: 800, acc: 0.660, loss: 0.729, lr: 0.9968091102615951
epoch: 900, acc: 0.670, loss: 0.703, lr: 0.9959626840921862
epoch: 1000, acc: 0.690, loss: 0.665, lr: 0.9950174708261916
epoch: 1100, acc: 0.707, loss: 0.663, lr: 0.993973753258195
epoch: 1200, acc: 0.693, loss: 0.642, lr: 0.9928318434344862
epoch: 1300, acc: 0.750, loss: 0.605, lr: 0.9915920824975807
epoch: 1400, acc: 0.750, loss: 0.606, lr: 0.9902548405163678
epoch: 1500, acc: 0.717, loss: 0.598, lr: 0.9888205163020066
epoch: 1600, acc: 0.780, loss: 0.567, lr: 0.98728953720

- AdaGrad worked well, and we can see that loss consistently fell throughout the entire training process
- interestingly, SGD with momentum got near its final results in 1700 epochs, whereas AdaGrad took 3000 epochs

### RMSProp (adaptive learning rate)
- continuing with modifcations of Stochastic Gradient Descent, let's meet RMSProp; short for Root Mean Squared Propagation
- similar to AdaGrad, RMSProp calculates an adaptive learning rate per parameter, but it's calculated in a different way
- AdaGrad calculates the cache (which holds a history of gradients) as: `cache += gradient ** 2`
- RMSProp calculates the cache as: `cache += rho * cache + (1- rho) * gradient ** 2`
- RMSProp is **similar to SGD with momentum**: RMSProp adds a mechanism similar to momentum, but also adds a per-parameter adaptive learning rate so the learning rate changes are smoother
- instead of constantly adding to a cache (like in AdaGrad), RMSProp uses a moving average of the cache
---
- `rho` is the cache memory decay rate
- a low learning rate of 0.001 (common default) is highly recommended for RMSProp

In [172]:
class Optimizer_RMSprop:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.current_learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create ones filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        # Vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

- let's change the optimizer used in our main neural network code and begin training:

In [173]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
#optimizer = Optimizer_SGD(decay=1e-8, momentum=0.9)
optimizer = Optimizer_RMSprop(decay=1e-8) # decay is 0 by default, unlike the other hyperparameters, so we needed to set it here
# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 0.001
epoch: 100, acc: 0.417, loss: 1.077, lr: 0.0009999505012415217
epoch: 200, acc: 0.457, loss: 1.072, lr: 0.0009998010199314946
epoch: 300, acc: 0.473, loss: 1.062, lr: 0.0009995516010086409
epoch: 400, acc: 0.483, loss: 1.050, lr: 0.0009992023193791406
epoch: 500, acc: 0.483, loss: 1.035, lr: 0.0009987532798791455
epoch: 600, acc: 0.497, loss: 1.019, lr: 0.0009982046172223313
epoch: 700, acc: 0.540, loss: 1.003, lr: 0.0009975564959325478
epoch: 800, acc: 0.580, loss: 0.987, lr: 0.0009968091102615926
epoch: 900, acc: 0.597, loss: 0.972, lr: 0.0009959626840921837
epoch: 1000, acc: 0.597, loss: 0.958, lr: 0.0009950174708261896
epoch: 1100, acc: 0.597, loss: 0.945, lr: 0.000993973753258193
epoch: 1200, acc: 0.613, loss: 0.933, lr: 0.0009928318434344848
epoch: 1300, acc: 0.633, loss: 0.922, lr: 0.0009915920824975796
epoch: 1400, acc: 0.633, loss: 0.912, lr: 0.0009902548405163651
epoch: 1500, acc: 0.643, loss: 0.903, lr: 0.0009888205163020038
epoch

- these results are not great, but we can tweak the hyperparameters:
- we'll increase `learning_rate` to 0.05, decrease `decay` to 4e-8, and decrease `rho` to 0.999

In [174]:
np.random.seed(0)
# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
#optimizer = Optimizer_SGD(decay=1e-8, momentum=0.9)
optimizer = Optimizer_RMSprop(learning_rate=0.05, decay=4e-8, rho=0.999) # adjusted hyperparameters
# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 0.05
epoch: 100, acc: 0.410, loss: 1.024, lr: 0.04999010099316666
epoch: 200, acc: 0.427, loss: 0.982, lr: 0.04996021594198117
epoch: 300, acc: 0.477, loss: 0.952, lr: 0.049910380770363756
epoch: 400, acc: 0.537, loss: 0.899, lr: 0.04984065529821222
epoch: 500, acc: 0.517, loss: 0.900, lr: 0.04975112312174363
epoch: 600, acc: 0.563, loss: 0.849, lr: 0.04964189144647378
epoch: 700, acc: 0.537, loss: 0.814, lr: 0.04951309087330241
epoch: 800, acc: 0.557, loss: 0.798, lr: 0.04936487513830117
epoch: 900, acc: 0.577, loss: 0.778, lr: 0.049197420806929945
epoch: 1000, acc: 0.607, loss: 0.771, lr: 0.049010926923533135
epoch: 1100, acc: 0.583, loss: 0.776, lr: 0.04880561461708983
epoch: 1200, acc: 0.570, loss: 0.755, lr: 0.04858172666431087
epoch: 1300, acc: 0.580, loss: 0.757, lr: 0.04833952701129078
epoch: 1400, acc: 0.660, loss: 0.752, lr: 0.048079300255034645
epoch: 1500, acc: 0.600, loss: 0.731, lr: 0.047801351086283965
epoch: 1600, acc: 0.660, loss:

- by adjusting our hyperparameter values for the RMSProp optimizer, our neural network just yielded its best results
- we still have one more SGD modification to cover

### Adam (adaptive learning rate)
- short for Adaptive Moment
- Adam is currently the most popular optimizer, and is built atop RMSProp with momentum added back in
- with Adam, the `rho` hyperparameter becomes `beta_2` and the `momentum` hyperparameter becomes `beta_1`
- to get parameter updates, we divide scaled momentum by scaled cache
- this has the effect of significantly boosting weight updates during the first training steps, speeding up the whole process, then quickly returning its weight updates back to their more-typical values for the later training steps

In [181]:
class Optimizer_Adam:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.current_learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update momentum  with current gradients
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
        # Get corrected momentum
        # self.iteration is 0 at first pass
        # and we need to start with 1 here
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums /  (1 - self.beta_1 ** (self.iterations + 1))
        # Update cache with squared current gradients --> this is where the RMSProp flavor to Adam comes into play
        layer.weight_cache = self.beta_2 * layer.weight_cache +  (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2
        # Get corrected cachebias
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))
 
        # Vanilla SGD parameter update + normalization with square rooted cache
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

- let's see how the Adam optimizer performs

In [182]:
np.random.seed(0)

# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
#optimizer = Optimizer_SGD(decay=1e-8, momentum=0.9)
optimizer = Optimizer_Adam(decay=4e-8)
# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 0.001
epoch: 100, acc: 0.427, loss: 1.077, lr: 0.0009998020198633338
epoch: 200, acc: 0.430, loss: 1.071, lr: 0.0009992043188396232
epoch: 300, acc: 0.463, loss: 1.060, lr: 0.0009982076154072756
epoch: 400, acc: 0.480, loss: 1.043, lr: 0.0009968131059642454
epoch: 500, acc: 0.513, loss: 1.018, lr: 0.0009950224624348744
epoch: 600, acc: 0.583, loss: 0.988, lr: 0.0009928378289294775
epoch: 700, acc: 0.617, loss: 0.957, lr: 0.0009902618174660494
epoch: 800, acc: 0.643, loss: 0.926, lr: 0.0009872975027660248
epoch: 900, acc: 0.650, loss: 0.896, lr: 0.0009839484161386
epoch: 1000, acc: 0.670, loss: 0.866, lr: 0.000980218538470663
epoch: 1100, acc: 0.677, loss: 0.837, lr: 0.0009761122923417968
epoch: 1200, acc: 0.690, loss: 0.808, lr: 0.0009716345332862163
epoch: 1300, acc: 0.683, loss: 0.781, lr: 0.0009667905402258149
epoch: 1400, acc: 0.693, loss: 0.756, lr: 0.0009615860051006909
epoch: 1500, acc: 0.697, loss: 0.734, lr: 0.0009560270217256772
epoch: 1

- not too bad, but let’s increase the `learning_rate`  to 0.05 and change `decay` to 1e-8:

In [183]:
np.random.seed(0)

# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
#optimizer = Optimizer_SGD(decay=1e-8, momentum=0.9)
optimizer = Optimizer_Adam(learning_rate=0.05, decay=1e-8)
# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2) 
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 0.05
epoch: 100, acc: 0.723, loss: 0.682, lr: 0.04999752506207612
epoch: 200, acc: 0.800, loss: 0.512, lr: 0.049990050996574775
epoch: 300, acc: 0.827, loss: 0.419, lr: 0.04997758005043209
epoch: 400, acc: 0.863, loss: 0.356, lr: 0.04996011596895705
epoch: 500, acc: 0.887, loss: 0.316, lr: 0.04993766399395728
epoch: 600, acc: 0.903, loss: 0.284, lr: 0.04991023086111661
epoch: 700, acc: 0.900, loss: 0.264, lr: 0.049877824796627425
epoch: 800, acc: 0.920, loss: 0.242, lr: 0.0498404555130797
epoch: 900, acc: 0.913, loss: 0.229, lr: 0.04979813420460921
epoch: 1000, acc: 0.920, loss: 0.214, lr: 0.04975087354130951
epoch: 1100, acc: 0.923, loss: 0.204, lr: 0.04969868766290968
epoch: 1200, acc: 0.933, loss: 0.195, lr: 0.04964159217172425
epoch: 1300, acc: 0.920, loss: 0.191, lr: 0.04957960412487896
epoch: 1400, acc: 0.933, loss: 0.181, lr: 0.04951274202581829
epoch: 1500, acc: 0.883, loss: 0.257, lr: 0.049441025815100244
epoch: 1600, acc: 0.917, loss: 0.

- it doesn't get much better than that, both for accuracy and loss
- while Adam significantly outperformed the other optimizers, and is usually the best optimizer, that’s not always the case
- it’s a good idea to try the Adam optimizer first, but it's also important to try others
- sometimes simple SGD or SGD + momentum performs better than Adam

### Good Initial Hyperparameters
- for SGD, a good initial learning rate is 1.0 with a decay of 0.1 
- for Adam, a good initial learning rate is 0.001 (1e-3) with a decay of 0.0001 (1e-4)

### Concluding Remarks
- we achieved 98% accuracy and an almost perfect loss of 0 on the generated training section
- this is certainly exciting, but you will soon learn to fear results this good, or at least to approach them with caution
- there are cases in which you can achieve valid results of such a high degree, but, in this case, we’ve been ignoring a major concept in machine learning: out-of-sample testing data, which is the subject of the next section