### Full Code Up Until This Point

In [2]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

class Layer_Dense:
    
    # layer initialization
    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        
    # forward pass
    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases
        self.inputs = inputs
        
    # backward pass
    def backward(self, dvalues):        
        # gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        
        # gradient on values to pass further back
        self.dinputs = np.dot(dvalues, self.weights.T)

# Rectified Linear Unit Activation Function
class Activation_ReLU:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)
    
    # backward pass
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        
        # zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis = 1, keepdims = True))
        probabilities = exp_values / np.sum(exp_values, axis = 1, keepdims = True)
        self.output = probabilities
    
    # backward pass
    def backward(self, dvalues):
        
        #create uninitialized array
        self.dinputs = np.empty_like(dvalues)
        
        # enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # flatten output array
            single_output = single_output.reshape(-1, 1)
            # calculate jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            
            # calculate sample wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)
            

class Loss:
    def calculate(self, outputs, y):
        # y is the intended target values
        sample_losses = self.forward(outputs, y)
        data_loss = np.mean(sample_losses)
        return data_loss
    
class Loss_CategoricalCrossentropy(Loss):
    #inheriting from the base Loss class
    def forward(self, y_pred, y_true):
        # y_pred will come from the neural network
        # y_true will come from the training set
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
        
        if len(y_true.shape) == 1:
            # means scalar class values have been passed
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # one hot encoded values have been passed
            correct_confidences = np.sum(y_pred_clipped * y_true, axis = 1)
            
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods
    
    # Backwards pass
    def backward(self, dvalues, y_true):
        # number of samples
        samples = len(dvalues)
        # number of labels in every sample
        # we'll use the first sample to count them
        labels = len(dvalues[0])

        # if labels are sparse, turn them into one-hot vector
        # in case the shape is as [3, 0, 2] etc etc per sample
        # we turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # calculate gradient
        self.dinputs = -y_true / dvalues
        # normalize gradient
        self.dinputs = self.dinputs / samples
        # with a larger number of batches, it's all summed together
        # in the dot product, and some will be given more importance
        # than others when we don't normaize

class Activation_Softmax_Loss_CategoricalCrossentropy():
    
    # creates activation and loss function objects inside
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
        
    # forward pass
    def forward(self, inputs, y_true):
        # output layer's activation function
        self.activation.forward(inputs)
        # set the output
        self.output = self.activation.output
        # calculate and return the loss value
        return self.loss.calculate(self.output, y_true)
    
    # backward pass
    def backward(self, dvalues, y_true):
        
        # number of samples
        samples = len(dvalues)
        
        #if labels are one-hot encoded,
        # turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis = 1)
            
        # copy so we can safely modify
        self.dinputs = dvalues.copy()
        # calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # normalize gradient
        self.dinputs = self.dinputs / samples

# create dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 3 inputs and 3 output values
dense2 = Layer_Dense(3, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# forward pass of our training data through this layer
dense1.forward(X)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y)

# peeking the output of the first few samples
print(loss_activation.output[:5])

# loss value
print("| || || |_:", loss)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y.shape) == 2:
    y = np.argmax(y, axis = 1)
    
accuracy = np.mean(predictions == y)
print("acc:", accuracy)

# backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

#gradients
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)

[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
| || || |_: 1.0986104
acc: 0.34
[[ 1.5766357e-04  7.8368583e-05  4.7324400e-05]
 [ 1.8161038e-04  1.1045573e-05 -3.3096312e-05]]
[[-3.60553473e-04  9.66117223e-05 -1.03671395e-04]]
[[ 5.44109462e-05  1.07411419e-04 -1.61822361e-04]
 [-4.07913431e-05 -7.16780924e-05  1.12469446e-04]
 [-5.30112993e-05  8.58172934e-05 -3.28059905e-05]]
[[-1.0729185e-05 -9.4610732e-06  2.0027859e-05]]


Simplest and the most rudimentary optimization method is to subtract a fraction of the gradient from each weight and bias. This is the **Stochastic Gradient Descent (SGD)**. Most optimizers are actually just a variant of SGD.

### Terminology
**Stochastic Gradient Descent**: Optimizers that fits a single sample at a time

**Batch Gradient Descent**: Optimizers that fit a whole dataset at once

**Mini-batch Gradient Descnet**: Optimizers that fit slices of a dataset, ie batches, at once



In [5]:
class Optimizer_SGD:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    def __init__(self, learning_rate = 1.0):
        self.learning_rate = learning_rate
        
    # update parameters
    def update_params(self, layer):
        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

And for the optimizer object

In [6]:
optimizer = Optimizer_SGD()

optimizer.update_params(dense1)
optimizer.update_params(dense1)

This is everything needed to train the model

Now, what remains is to repeatedly loop back and forth until reaching some stopping point

Each full forward, and backward pass thrugh all of the training data is the **epoch**. Most models train on the dataset for multiple epochs

In [15]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_SGD(0.85)

# training in loop
for epoch in range(20001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    optimizer.update_params(dense1)
    optimizer.update_params(dense2)

epoch: 0, acc: 0.333, loss: 1.099
epoch: 100, acc: 0.373, loss: 1.093
epoch: 200, acc: 0.383, loss: 1.078
epoch: 300, acc: 0.403, loss: 1.076
epoch: 400, acc: 0.400, loss: 1.075
epoch: 500, acc: 0.397, loss: 1.074
epoch: 600, acc: 0.393, loss: 1.072
epoch: 700, acc: 0.380, loss: 1.070
epoch: 800, acc: 0.403, loss: 1.067
epoch: 900, acc: 0.407, loss: 1.064
epoch: 1000, acc: 0.417, loss: 1.060
epoch: 1100, acc: 0.443, loss: 1.055
epoch: 1200, acc: 0.400, loss: 1.063
epoch: 1300, acc: 0.383, loss: 1.063
epoch: 1400, acc: 0.383, loss: 1.062
epoch: 1500, acc: 0.400, loss: 1.061
epoch: 1600, acc: 0.407, loss: 1.061
epoch: 1700, acc: 0.400, loss: 1.060
epoch: 1800, acc: 0.420, loss: 1.059
epoch: 1900, acc: 0.407, loss: 1.052
epoch: 2000, acc: 0.403, loss: 1.060
epoch: 2100, acc: 0.413, loss: 1.048
epoch: 2200, acc: 0.443, loss: 1.052
epoch: 2300, acc: 0.400, loss: 1.068
epoch: 2400, acc: 0.443, loss: 1.052
epoch: 2500, acc: 0.400, loss: 1.090
epoch: 2600, acc: 0.447, loss: 1.044
epoch: 2700, 

### Learning Rate
The only hyper parameter for our model aside from the architecture, the learning rate is controlld to avoid a local minimum

#### Momentum
Basically inertia for our optimizer, high inertia, and low learning rate models can generally slowly creep towards a global minimum

#### Gradient Explosion
A learning rate that's too high can result in the adjustments being in the right direction but overshooting by a large amount, resulting in an unstable model, with loss jumping around. Gradient explosion is where the parameter update causes the output to be increasing, until the float can't store this variable, causing an overflow

Because larger models can take weeks or more to train, it is very important to watch out for these sorts of things. Generally you want to be able to change the learning rate during the training.

In most cases, start with larger learning rate and decrease over time; **learning rate decay**

### Learning Rate Decay
The crux here is to figure out the **Decay Rate**. Possible ways include:
* Decrease learning rate in response to the loss across epochs, eg the loss has been plateauing or jumping over large deltas  
* Simly track your loss over time and manually decrease the learning rate when deemed appropriate
* Decay per step, ie **1/t decaying** or **exponential decaying**

In [34]:
class Optimizer_SGD:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1.0, decay = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [38]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_SGD(decay = 1e-3)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.310, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.400, loss: 1.079, lr: 0.9099181073703367
epoch: 200, acc: 0.383, loss: 1.070, lr: 0.8340283569641367
epoch: 300, acc: 0.393, loss: 1.068, lr: 0.7698229407236336
epoch: 400, acc: 0.380, loss: 1.067, lr: 0.7147962830593281
epoch: 500, acc: 0.380, loss: 1.065, lr: 0.66711140760507
epoch: 600, acc: 0.380, loss: 1.063, lr: 0.6253908692933083
epoch: 700, acc: 0.387, loss: 1.060, lr: 0.5885815185403178
epoch: 800, acc: 0.417, loss: 1.051, lr: 0.5558643690939411
epoch: 900, acc: 0.463, loss: 1.033, lr: 0.526592943654555
epoch: 1000, acc: 0.433, loss: 1.028, lr: 0.5002501250625312
epoch: 1100, acc: 0.447, loss: 1.015, lr: 0.4764173415912339
epoch: 1200, acc: 0.450, loss: 1.007, lr: 0.45475216007276037
epoch: 1300, acc: 0.473, loss: 0.999, lr: 0.43497172683775553
epoch: 1400, acc: 0.457, loss: 0.999, lr: 0.4168403501458941
epoch: 1500, acc: 0.470, loss: 0.984, lr: 0.4001600640256102
epoch: 1600, acc: 0.533, loss: 0.985, lr: 0.38476337

In [37]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, and 64 output values
dense2 = Layer_Dense(64, 64)

# ReLU activation for dense2
activation2 = Activation_ReLU()

# third dense layer with 64 input features, and 3 output values
dense3 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_SGD(decay = 1e-4)

# training in loop
for epoch in range(15001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)
    
    # forward pass through activation
    activation2.forward(dense2.output)
    
    # forward pass through third dense layer
    dense3.forward(activation2.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense3.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense3.backward(loss_activation.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.post_update_params()

epoch: 0, acc: 0.333, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.423, loss: 1.099, lr: 0.9901970492127933
epoch: 200, acc: 0.410, loss: 1.099, lr: 0.9804882831650161
epoch: 300, acc: 0.420, loss: 1.098, lr: 0.9709680551509855
epoch: 400, acc: 0.397, loss: 1.072, lr: 0.9616309260505818
epoch: 500, acc: 0.457, loss: 1.067, lr: 0.9524716639679969
epoch: 600, acc: 0.417, loss: 1.065, lr: 0.9434852344560807
epoch: 700, acc: 0.407, loss: 1.059, lr: 0.9346667912889054
epoch: 800, acc: 0.427, loss: 1.059, lr: 0.9260116677470135
epoch: 900, acc: 0.420, loss: 1.059, lr: 0.9175153683824203
epoch: 1000, acc: 0.410, loss: 1.058, lr: 0.9091735612328392
epoch: 1100, acc: 0.400, loss: 1.056, lr: 0.9009820704567978
epoch: 1200, acc: 0.403, loss: 1.047, lr: 0.892936869363336
epoch: 1300, acc: 0.447, loss: 1.029, lr: 0.8850340738118416
epoch: 1400, acc: 0.413, loss: 1.014, lr: 0.8772699359592947
epoch: 1500, acc: 0.427, loss: 0.988, lr: 0.8696408383337683
epoch: 1600, acc: 0.517, loss: 0.910, lr: 0.86214328

epoch: 13600, acc: 0.913, loss: 0.151, lr: 0.42374676893088686
epoch: 13700, acc: 0.897, loss: 0.193, lr: 0.4219587324359677
epoch: 13800, acc: 0.917, loss: 0.160, lr: 0.4201857220891634
epoch: 13900, acc: 0.920, loss: 0.142, lr: 0.41842754926984393
epoch: 14000, acc: 0.923, loss: 0.155, lr: 0.4166840285011875
epoch: 14100, acc: 0.897, loss: 0.203, lr: 0.4149549773849537
epoch: 14200, acc: 0.903, loss: 0.235, lr: 0.41324021653787346
epoch: 14300, acc: 0.897, loss: 0.200, lr: 0.4115395695296103
epoch: 14400, acc: 0.933, loss: 0.124, lr: 0.4098528628222468
epoch: 14500, acc: 0.920, loss: 0.130, lr: 0.4081799257112535
epoch: 14600, acc: 0.923, loss: 0.131, lr: 0.40652059026789705
epoch: 14700, acc: 0.923, loss: 0.128, lr: 0.40487469128304787
epoch: 14800, acc: 0.927, loss: 0.125, lr: 0.4032420662123473
epoch: 14900, acc: 0.927, loss: 0.125, lr: 0.4016225551226957
epoch: 15000, acc: 0.923, loss: 0.128, lr: 0.40001600064002557


### Stochastic Gradient Descent With Momentum
Our model can get stuck in a local minimum, bouncing back and forth, so the momentum addscauses the previous gradient update to influence the next.

We set a parameter between 0 and 1, representing the fraction of the previous paremeter update to retain. A portion of the previous update, combined with the portion of the current gradient, together will be sed to update the parameters.

The momentum fraction setting too high can cause the model to stop learning, as the direction of the updates won't be able to follow the global gradient descent

In [57]:
class Optimizer_SGD:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1.0, decay = 0., momentum = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        #if momentum is used
        if self.momentum:
            
            #if layer does not contain momentum arrays
            # create them and fill with zeroes
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                
                # if there is no momentum array for weights
                # the array doesn't exist for the biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)
                
            # build weight updates with momentum - take previous
            # updates multiplied by retain factor and update with
            # current gradients
            weight_updates = \
                self.momentum * layer.weight_momentums - \
                self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates
            
            # build bias updates
            bias_updates = \
                self.momentum * layer.bias_momentums - \
                self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        # otherwise vanillae SGD
        else:
            weight_updates = -self.learning_rate * layer.dweights
            bias_updates = -self.learning_rate * layer.dbiases
        
        # update weights and biases using either
        # vanilla or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [51]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_SGD(decay = 1e-3, momentum = 0.9)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.293, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.453, loss: 1.037, lr: 0.9099181073703367
epoch: 200, acc: 0.473, loss: 0.932, lr: 0.8340283569641367
epoch: 300, acc: 0.583, loss: 0.776, lr: 0.7698229407236336
epoch: 400, acc: 0.763, loss: 0.572, lr: 0.7147962830593281
epoch: 500, acc: 0.790, loss: 0.504, lr: 0.66711140760507
epoch: 600, acc: 0.810, loss: 0.470, lr: 0.6253908692933083
epoch: 700, acc: 0.823, loss: 0.443, lr: 0.5885815185403178
epoch: 800, acc: 0.820, loss: 0.423, lr: 0.5558643690939411
epoch: 900, acc: 0.843, loss: 0.396, lr: 0.526592943654555
epoch: 1000, acc: 0.857, loss: 0.373, lr: 0.5002501250625312
epoch: 1100, acc: 0.867, loss: 0.357, lr: 0.4764173415912339
epoch: 1200, acc: 0.870, loss: 0.346, lr: 0.45475216007276037
epoch: 1300, acc: 0.870, loss: 0.339, lr: 0.43497172683775553
epoch: 1400, acc: 0.867, loss: 0.331, lr: 0.4168403501458941
epoch: 1500, acc: 0.877, loss: 0.317, lr: 0.4001600640256102
epoch: 1600, acc: 0.870, loss: 0.309, lr: 0.38476337

In [54]:
class Optimizer_SGD:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1.0, decay = 0., momentum = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        #if momentum is used
        if self.momentum:
            
            #if layer does not contain momentum arrays
            # create them and fill with zeroes
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                
                # if there is no momentum array for weights
                # the array doesn't exist for the biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)
                
            # build weight updates with momentum - take previous
            # updates multiplied by retain factor and update with
            # current gradients
            weight_updates = \
                self.momentum * layer.weight_momentums - \
                self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates
            
            # build bias updates
            bias_updates = \
                self.momentum * layer.bias_momentums - \
                self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        # otherwise vanillae SGD
        else:
            weight_updates = -self.learning_rate * layer.dweights
            bias_updates = -self.learning_rate * layer.dbiases
        
        # update weights and biases using either
        # vanilla or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [58]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, and 64 output values
dense2 = Layer_Dense(64, 64)

# ReLU activation for dense2
activation2 = Activation_ReLU()

# third dense layer with 64 input features, and 3 output values
dense3 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_SGD(decay = 1e-4, momentum = 0.5)

# training in loop
for epoch in range(15001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)
    
    # forward pass through activation
    activation2.forward(dense2.output)
    
    # forward pass through third dense layer
    dense3.forward(activation2.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense3.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense3.backward(loss_activation.dinputs)
    activation2.backward(dense3.dinputs)
    dense2.backward(activation2.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.update_params(dense3)
    optimizer.post_update_params()

epoch: 0, acc: 0.323, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.373, loss: 1.099, lr: 0.9901970492127933
epoch: 200, acc: 0.377, loss: 1.098, lr: 0.9804882831650161
epoch: 300, acc: 0.377, loss: 1.083, lr: 0.9709680551509855
epoch: 400, acc: 0.363, loss: 1.081, lr: 0.9616309260505818
epoch: 500, acc: 0.380, loss: 1.080, lr: 0.9524716639679969
epoch: 600, acc: 0.363, loss: 1.077, lr: 0.9434852344560807
epoch: 700, acc: 0.377, loss: 1.068, lr: 0.9346667912889054
epoch: 800, acc: 0.417, loss: 1.051, lr: 0.9260116677470135
epoch: 900, acc: 0.557, loss: 0.964, lr: 0.9175153683824203
epoch: 1000, acc: 0.447, loss: 1.063, lr: 0.9091735612328392
epoch: 1100, acc: 0.700, loss: 0.729, lr: 0.9009820704567978
epoch: 1200, acc: 0.720, loss: 0.677, lr: 0.892936869363336
epoch: 1300, acc: 0.657, loss: 0.754, lr: 0.8850340738118416
epoch: 1400, acc: 0.783, loss: 0.544, lr: 0.8772699359592947
epoch: 1500, acc: 0.707, loss: 0.698, lr: 0.8696408383337683
epoch: 1600, acc: 0.790, loss: 0.504, lr: 0.86214328

epoch: 13500, acc: 0.993, loss: 0.016, lr: 0.42555002340525133
epoch: 13600, acc: 0.993, loss: 0.015, lr: 0.42374676893088686
epoch: 13700, acc: 0.993, loss: 0.015, lr: 0.4219587324359677
epoch: 13800, acc: 0.993, loss: 0.016, lr: 0.4201857220891634
epoch: 13900, acc: 0.993, loss: 0.014, lr: 0.41842754926984393
epoch: 14000, acc: 0.993, loss: 0.015, lr: 0.4166840285011875
epoch: 14100, acc: 0.993, loss: 0.014, lr: 0.4149549773849537
epoch: 14200, acc: 0.993, loss: 0.014, lr: 0.41324021653787346
epoch: 14300, acc: 0.993, loss: 0.015, lr: 0.4115395695296103
epoch: 14400, acc: 0.993, loss: 0.015, lr: 0.4098528628222468
epoch: 14500, acc: 0.993, loss: 0.014, lr: 0.4081799257112535
epoch: 14600, acc: 0.993, loss: 0.014, lr: 0.40652059026789705
epoch: 14700, acc: 0.993, loss: 0.014, lr: 0.40487469128304787
epoch: 14800, acc: 0.993, loss: 0.014, lr: 0.4032420662123473
epoch: 14900, acc: 0.993, loss: 0.014, lr: 0.4016225551226957
epoch: 15000, acc: 0.993, loss: 0.014, lr: 0.40001600064002557


This is clearly the best so far. The SGD optimizer with momentum is usually one of the 2 main choices for an optimizer in practice next to the Adam optimizer.

### AdaGrad (Adaptive Gradient)
Idea is to normalize the updates being made to the features. During training, some weights can change dramatically, while others don't change much, and generally it is better for weights not to change too much, compared to others.

We keep track of the sum of the squares of the updates, and keep the learning rate for that parameter inversely proportional to it, as shown here: 

In [None]:
cache += parm_gradient ** 2
parm_updates = learning_rate * parm_gradient / (sqrt(cache) + eps)

The denominator ever growing larger can cause the learning rate to come to a virtual halt, making this optimizer used in niche applications.

The eps or **epsilon** here is a pre-training hyperparamter to prevent division by 0; usually a super small value

In [66]:
class Optimizer_Adagrad:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1., decay = 0., epsilon = 1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            layer.dweights / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            layer.dbiases / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [68]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adagrad(decay = 1e-4)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.313, loss: 1.099, lr: 1.0
epoch: 100, acc: 0.453, loss: 1.257, lr: 0.9901970492127933
epoch: 200, acc: 0.597, loss: 0.988, lr: 0.9804882831650161
epoch: 300, acc: 0.657, loss: 0.901, lr: 0.9709680551509855
epoch: 400, acc: 0.657, loss: 0.851, lr: 0.9616309260505818
epoch: 500, acc: 0.713, loss: 0.790, lr: 0.9524716639679969
epoch: 600, acc: 0.737, loss: 0.753, lr: 0.9434852344560807
epoch: 700, acc: 0.743, loss: 0.724, lr: 0.9346667912889054
epoch: 800, acc: 0.753, loss: 0.700, lr: 0.9260116677470135
epoch: 900, acc: 0.763, loss: 0.680, lr: 0.9175153683824203
epoch: 1000, acc: 0.773, loss: 0.663, lr: 0.9091735612328392
epoch: 1100, acc: 0.783, loss: 0.647, lr: 0.9009820704567978
epoch: 1200, acc: 0.793, loss: 0.633, lr: 0.892936869363336
epoch: 1300, acc: 0.797, loss: 0.620, lr: 0.8850340738118416
epoch: 1400, acc: 0.797, loss: 0.609, lr: 0.8772699359592947
epoch: 1500, acc: 0.797, loss: 0.599, lr: 0.8696408383337683
epoch: 1600, acc: 0.800, loss: 0.590, lr: 0.86214328

### RMSProp (Root Mean Square Propagation)
Calclates an adaptive learning rate per parameter, like AdaGrad. The cache here is calculated as:

    cache += rho * cache + (1 - rho) * gradient ** 2

Basically adds a momentum like director for the updates, but also adds per-parameter adaptive learning rate, so the learning rate changes are smoother. Uses a moving average to keep track.

The new parameter, rho is the cache memory decay rate. Because so much of the previous update is carried over, a learning rate of 1 is far too large and causes instability.

In [71]:
class Optimizer_RMSprop:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 0.001, decay = 0., epsilon = 1e-7, rho = 0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + \
                    (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache = self.rho * layer.bias_cache + \
                    (1 - self.rho) * layer.dbiases ** 2
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            layer.dweights / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            layer.dbiases / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [74]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_RMSprop(learning_rate=0.02, decay=1e-4, rho=0.999)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.350, loss: 1.099, lr: 0.02
epoch: 100, acc: 0.407, loss: 1.445, lr: 0.019803940984255867
epoch: 200, acc: 0.423, loss: 1.154, lr: 0.019609765663300322
epoch: 300, acc: 0.450, loss: 1.047, lr: 0.01941936110301971
epoch: 400, acc: 0.487, loss: 0.996, lr: 0.019232618521011637
epoch: 500, acc: 0.500, loss: 0.963, lr: 0.01904943327935994
epoch: 600, acc: 0.510, loss: 0.941, lr: 0.018869704689121615
epoch: 700, acc: 0.527, loss: 0.923, lr: 0.018693335825778108
epoch: 800, acc: 0.537, loss: 0.910, lr: 0.01852023335494027
epoch: 900, acc: 0.530, loss: 0.894, lr: 0.018350307367648408
epoch: 1000, acc: 0.533, loss: 0.885, lr: 0.018183471224656786
epoch: 1100, acc: 0.543, loss: 0.873, lr: 0.018019641409135957
epoch: 1200, acc: 0.547, loss: 0.862, lr: 0.01785873738726672
epoch: 1300, acc: 0.550, loss: 0.851, lr: 0.017700681476236834
epoch: 1400, acc: 0.553, loss: 0.843, lr: 0.017545398719185895
epoch: 1500, acc: 0.557, loss: 0.835, lr: 0.017392816766675364
epoch: 1600, acc: 0.560,

### Adam (Adative Momentum)
Currently most widely used optimizer. Built atop RMSProp, with momentum from SGD. Insteadof just applying the current gradients, we apply momentums like SGD optimizers, then apply per-weight adaptive learning rate with cache as does RMSProp.

Also adds a bias correction mechanism to the cache and momentum, compensating for the initial zeroed values before they warm up with the initial steps. This is done by dividing the momentum and the cache by (1-beta^step)

    1 - 0.9 ** 1 = 1 - 0.9 = 0.1
    
As training goes on, step count rises

    1 - 0.9 ** inf = 1 - 0 = 1
    
The *beta 1* for momentum starts at 0.9, and *beta 2* for cache starts at 0.999

In [84]:
class Optimizer_Adam:
    
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 0.001, decay = 0., epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update momentum with current gradients
        layer.weight_momentums = self.beta_1 * \
                                layer.weight_momentums + \
                                (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * \
                                layer.bias_momentums + \
                                (1 - self.beta_1) * layer.dbiases
        
        # get corrected momentum
        # self.iteration is 0 at first pass
        # and we need to start wit h1 here
        weight_momentums_corrected = layer.weight_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        
        # update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases ** 2
        
        # get corrected cache
        weight_cache_corrected = layer.weight_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            weight_momentums_corrected / \
                            (np.sqrt(weight_cache_corrected) + \
                                self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            bias_momentums_corrected / \
                            (np.sqrt(bias_cache_corrected) + \
                                self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

In [85]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-5)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.313, loss: 1.099, lr: 0.02
epoch: 100, acc: 0.623, loss: 0.893, lr: 0.01998021958261321
epoch: 200, acc: 0.740, loss: 0.646, lr: 0.019960279044701046
epoch: 300, acc: 0.763, loss: 0.551, lr: 0.019940378268975763
epoch: 400, acc: 0.767, loss: 0.499, lr: 0.01992051713662487
epoch: 500, acc: 0.790, loss: 0.455, lr: 0.01990069552930875
epoch: 600, acc: 0.817, loss: 0.398, lr: 0.019880913329158343
epoch: 700, acc: 0.833, loss: 0.369, lr: 0.019861170418772778
epoch: 800, acc: 0.847, loss: 0.340, lr: 0.019841466681217078
epoch: 900, acc: 0.857, loss: 0.324, lr: 0.01982180200001982
epoch: 1000, acc: 0.873, loss: 0.306, lr: 0.019802176259170884
epoch: 1100, acc: 0.880, loss: 0.293, lr: 0.01978258934311912
epoch: 1200, acc: 0.883, loss: 0.283, lr: 0.01976304113677013
epoch: 1300, acc: 0.887, loss: 0.273, lr: 0.019743531525483964
epoch: 1400, acc: 0.897, loss: 0.264, lr: 0.01972406039507293
epoch: 1500, acc: 0.900, loss: 0.250, lr: 0.019704627631799327
epoch: 1600, acc: 0.920, lo

Slightly Different hyper-parameters

In [88]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.333, loss: 1.099, lr: 0.05
epoch: 100, acc: 0.657, loss: 0.706, lr: 0.04999752512250644
epoch: 200, acc: 0.770, loss: 0.529, lr: 0.04999502549496326
epoch: 300, acc: 0.800, loss: 0.463, lr: 0.049992526117345455
epoch: 400, acc: 0.800, loss: 0.436, lr: 0.04999002698961558
epoch: 500, acc: 0.813, loss: 0.380, lr: 0.049987528111736124
epoch: 600, acc: 0.817, loss: 0.359, lr: 0.049985029483669646
epoch: 700, acc: 0.840, loss: 0.345, lr: 0.049982531105378675
epoch: 800, acc: 0.833, loss: 0.334, lr: 0.04998003297682575
epoch: 900, acc: 0.823, loss: 0.322, lr: 0.049977535097973466
epoch: 1000, acc: 0.840, loss: 0.318, lr: 0.049975037468784345
epoch: 1100, acc: 0.850, loss: 0.310, lr: 0.049972540089220974
epoch: 1200, acc: 0.853, loss: 0.303, lr: 0.04997004295924593
epoch: 1300, acc: 0.860, loss: 0.303, lr: 0.04996754607882181
epoch: 1400, acc: 0.850, loss: 0.293, lr: 0.049965049447911185
epoch: 1500, acc: 0.873, loss: 0.289, lr: 0.04996255306647668
epoch: 1600, acc: 0.870, lo