## Full Code Up Until This Point

In [14]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

class Layer_Dense:
    
    # layer initialization
    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        
    # forward pass
    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases
        self.inputs = inputs
        
    # backward pass
    def backward(self, dvalues):        
        # gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        
        # gradient on values to pass further back
        self.dinputs = np.dot(dvalues, self.weights.T)

# Rectified Linear Unit Activation Function
class Activation_ReLU:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)
    
    # backward pass
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        
        # zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis = 1, keepdims = True))
        probabilities = exp_values / np.sum(exp_values, axis = 1, keepdims = True)
        self.output = probabilities
    
    # backward pass
    def backward(self, dvalues):
        
        #create uninitialized array
        self.dinputs = np.empty_like(dvalues)
        
        # enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # flatten output array
            single_output = single_output.reshape(-1, 1)
            # calculate jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            
            # calculate sample wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)
            

class Loss:
    def calculate(self, outputs, y):
        # y is the intended target values
        sample_losses = self.forward(outputs, y)
        data_loss = np.mean(sample_losses)
        return data_loss
    
class Loss_CategoricalCrossentropy(Loss):
    #inheriting from the base Loss class
    def forward(self, y_pred, y_true):
        # y_pred will come from the neural network
        # y_true will come from the training set
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
        
        if len(y_true.shape) == 1:
            # means scalar class values have been passed
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # one hot encoded values have been passed
            correct_confidences = np.sum(y_pred_clipped * y_true, axis = 1)
            
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods
    
    # Backwards pass
    def backward(self, dvalues, y_true):
        # number of samples
        samples = len(dvalues)
        # number of labels in every sample
        # we'll use the first sample to count them
        labels = len(dvalues[0])

        # if labels are sparse, turn them into one-hot vector
        # in case the shape is as [3, 0, 2] etc etc per sample
        # we turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # calculate gradient
        self.dinputs = -y_true / dvalues
        # normalize gradient
        self.dinputs = self.dinputs / samples
        # with a larger number of batches, it's all summed together
        # in the dot product, and some will be given more importance
        # than others when we don't normaize

class Activation_Softmax_Loss_CategoricalCrossentropy():
    
    # creates activation and loss function objects inside
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
        
    # forward pass
    def forward(self, inputs, y_true):
        # output layer's activation function
        self.activation.forward(inputs)
        # set the output
        self.output = self.activation.output
        # calculate and return the loss value
        return self.loss.calculate(self.output, y_true)
    
    # backward pass
    def backward(self, dvalues, y_true):
        
        # number of samples
        samples = len(dvalues)
        
        #if labels are one-hot encoded,
        # turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis = 1)
            
        # copy so we can safely modify
        self.dinputs = dvalues.copy()
        # calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # normalize gradient
        self.dinputs = self.dinputs / samples

class Optimizer_SGD:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1.0, decay = 0., momentum = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        #if momentum is used
        if self.momentum:
            
            #if layer does not contain momentum arrays
            # create them and fill with zeroes
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                
                # if there is no momentum array for weights
                # the array doesn't exist for the biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)
                
            # build weight updates with momentum - take previous
            # updates multiplied by retain factor and update with
            # current gradients
            weight_updates = \
                self.momentum * layer.weight_momentums - \
                self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates
            
            # build bias updates
            bias_updates = \
                self.momentum * layer.bias_momentums - \
                self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        # otherwise vanillae SGD
        else:
            weight_updates = -self.learning_rate * layer.dweights
            bias_updates = -self.learning_rate * layer.dbiases
        
        # update weights and biases using either
        # vanilla or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adagrad:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1., decay = 0., epsilon = 1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            layer.dweights / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            layer.dbiases / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

class Optimizer_RMSprop:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 0.001, decay = 0., epsilon = 1e-7, rho = 0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + \
                    (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache = self.rho * layer.bias_cache + \
                    (1 - self.rho) * layer.dbiases ** 2
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            layer.dweights / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            layer.dbiases / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adam:
    
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 0.001, decay = 0., epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update momentum with current gradients
        layer.weight_momentums = self.beta_1 * \
                                layer.weight_momentums + \
                                (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * \
                                layer.bias_momentums + \
                                (1 - self.beta_1) * layer.dbiases
        
        # get corrected momentum
        # self.iteration is 0 at first pass
        # and we need to start wit h1 here
        weight_momentums_corrected = layer.weight_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        
        # update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases ** 2
        
        # get corrected cache
        weight_cache_corrected = layer.weight_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            weight_momentums_corrected / \
                            (np.sqrt(weight_cache_corrected) + \
                                self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            bias_momentums_corrected / \
                            (np.sqrt(bias_cache_corrected) + \
                                self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


In [15]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-7)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    loss = loss_activation.forward(dense2.output, y)

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, lr: 0.05
epoch: 100, acc: 0.670, loss: 0.705, lr: 0.04999752512250644
epoch: 200, acc: 0.797, loss: 0.522, lr: 0.04999502549496326
epoch: 300, acc: 0.847, loss: 0.430, lr: 0.049992526117345455
epoch: 400, acc: 0.887, loss: 0.344, lr: 0.04999002698961558
epoch: 500, acc: 0.910, loss: 0.303, lr: 0.049987528111736124
epoch: 600, acc: 0.907, loss: 0.276, lr: 0.049985029483669646
epoch: 700, acc: 0.917, loss: 0.252, lr: 0.049982531105378675
epoch: 800, acc: 0.920, loss: 0.245, lr: 0.04998003297682575
epoch: 900, acc: 0.930, loss: 0.228, lr: 0.049977535097973466
epoch: 1000, acc: 0.940, loss: 0.217, lr: 0.049975037468784345
epoch: 1100, acc: 0.937, loss: 0.205, lr: 0.049972540089220974
epoch: 1200, acc: 0.947, loss: 0.192, lr: 0.04997004295924593
epoch: 1300, acc: 0.947, loss: 0.184, lr: 0.04996754607882181
epoch: 1400, acc: 0.943, loss: 0.183, lr: 0.049965049447911185
epoch: 1500, acc: 0.943, loss: 0.189, lr: 0.04996255306647668
epoch: 1600, acc: 0.943, lo

With such a large scale, and enough neurons, the model can easily memorize any data; hence why we do not just throw bigger models at any given problem. Since a model can **overfit** and give us these very high accuracy percents, we can't trust them right away and must do further testing with unseen data, ie **testing**, ie **out-of-sample data**.

The training and the testing data must differ enough, and mind biases that can leak through such as sensor data taken every second, so certain individual data points being just a second apart and being very similar. The model giving very different accuracies for training and testing data is a good indicator of overfitting.

In [17]:
# creating dataset
X_test, y_test = spiral_data(samples = 100, classes = 3)

# forward pass of our training data through this layer
dense1.forward(X_test)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y_test)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y, axis = 1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.840, loss: 0.809


Here the model gives an accuracy of 84%, and anything more than a 10% difference is very bad. Try to minimize this divergence.

Overfitting can be caused by multiple factors such as:
* Learning rate is too high
* Too many training epochs
* The model is too large

A model that's not learning is too small. A model that's learning but overfitting is too large.
**The fewer neurons you have, the smaller the chance the model can memorize the data**

One general rule to follow when selecting initial model hyperparameters is to find the smallest model possible that still learns. Trying different model settings is called **hyperparameter searching**.

You can let the model run for a short duration (perhaps a few minutes) to see if it's learning and if it is, let it run for longer, and compare it to a list of other hypermeters. 


### Hyperparameter Tuning
If there is too little data to create a separate validation set you can either:
* Split the training data into training and validation sets, and pick the hyperparameters based on how it performs on the smaller validation set, and use the best hyperparameters to train the entire set
* **Cross-Validation**: Given a dataset too small for a validation set, divide the dataset into 5 or so pieces and train the data on everything but one, and validate using the last one, then go through all of them like this in a prcoess called **k-fold cross validation**

### Training Data
Neural networks perform best when the training data is normalized between 0 and 1 or -1 and 1. The popular activation functions output values within this range:
* Softmax: 0 and 1
* Sigmoid: 0 and 1
* tanh   :-1 and 1

Also using numbers bigger than 1 can result in the weights being multiplied by them to caused overflows along the way. **It's easier to control the training process with smaller numbers**

The data is made to fit this through **preprocessing**. All the datasets, the training, the validation, and the testing need to go through the same preprocessing steps.

In the case where there is limited training data, **data augmentation** can be used to alter the existing data in a manner you can expect to see irl, such as rotating an images in a model to differentiate dogs and cats, but be careful in situations where rotating the image can change its meaning. 

### Regularization
Regularization methods decrease generalization errors. One such method is **L1 and L2 Regularization**, which calculates a **penalty** to be the added to the loss for having weights or biases too big, which is an indicator of the model memorizing the data, and it's generally better when there are lots of neurons affecting the output, rather than a select few with big weights. 

* L1 penalty is the sum of all absolute values of the weights and biases: punishes smaller weights and biases more and causes the model to only variant to really big inputs
* L2 penalty is the sum of all the squared weights and biases: punishes larger weights and biases a lot more

L1 is rarely used alone, if at all. Moreover a **lambda** value is multiplied to the penalty to dictate how much the penalty will affect the model. Overall loss is now:

    Loss = DataLoss + L_1w + L_1b + L_2w + L_2b


#### Forward Pass
    


In [1]:
class Layer_Dense:
    
    # layer initialization
    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1 = 0, weight_regularizer_l2 = 0, 
                 bias_regularizer_l1 = 0, bias_regularizer_l2 = 0):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        
        #the lambda values of the regularization methods
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        
        
    # forward pass
    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases
        self.inputs = inputs
        
    # backward pass
    def backward(self, dvalues):        
        # gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        
        # gradient on values to pass further back
        self.dinputs = np.dot(dvalues, self.weights.T)

The lambda is now added as a hyperparameter. Next, the loss function needs to be updated

In [2]:
class Loss:
    def calculate(self, outputs, y):
        # y is the intended target values
        sample_losses = self.forward(outputs, y)
        data_loss = np.mean(sample_losses)
        return data_loss
    
    def regularization_loss(self, layer):
        
        # 0 by default
        regularization_loss = 0
        
        # calculate only when the factor is greater than 0
        
        # L1 regularization - weights
        if layer.weights_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * \
                                    np.sum(np.abs(layer.weights))
            
        # L2 regularization - weights
        if layer.weights_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizer_l2 * \
                                    np.sum(layer.weights * \
                                           layer.weights)
        
        # L1 regularization - bias
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * \
                                    np.sum(np.abs(layer.biases))
        
        # L1 regularization - bias
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * \
                                    np.sum(layer.weights * \
                                           layer.weights)
        
        return regularization_loss

#### Backward Pass
Since L2 means summing up all the squares then multiplying them by the lambda, in code, it means we multiply all by 2\*lambda for the derivative

Since L1 means summing up all the absolutes then multiplying them by the lambda, in code, it means we multiply the lambda by 1 if the weight is positive, and -1 if the weight is negative for every weight.

In [5]:
class Layer_Dense:
    
    # layer initialization
    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1 = 0, weight_regularizer_l2 = 0, 
                 bias_regularizer_l1 = 0, bias_regularizer_l2 = 0):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        
        #the lambda values of the regularization methods
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        
        
    # forward pass
    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases
        self.inputs = inputs
        
    # backward pass
    def backward(self, dvalues):        
        # gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        
        # gradients on regularization
        # L1 on weights
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1
        # L2 on the weights
        if self.weight_regularizer_l2 > 0:
            self.dweights += self.weights * 2 \
                            * self.weight_regularizer_l2
        # L1 on biases
        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1
        # L2 on the weights
        if self.bias_regularizer_l2 > 0:
            self.dbiases += self.biases * 2 \
                            * self.bias_regularizer_l2
        
        # gradient on values to pass further back
        self.dinputs = np.dot(dvalues, self.weights.T)

### Full Code Up Until This Point

In [3]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

class Layer_Dense:
    
    # layer initialization
    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1 = 0, weight_regularizer_l2 = 0, 
                 bias_regularizer_l1 = 0, bias_regularizer_l2 = 0):
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))
        
        #the lambda values of the regularization methods
        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2
        
        
    # forward pass
    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases
        self.inputs = inputs
        
    # backward pass
    def backward(self, dvalues):        
        # gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis = 0, keepdims = True)
        
        # gradients on regularization
        # L1 on weights
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1
        # L2 on the weights
        if self.weight_regularizer_l2 > 0:
            self.dweights += self.weights * 2 \
                            * self.weight_regularizer_l2
        # L1 on biases
        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1
        # L2 on the weights
        if self.bias_regularizer_l2 > 0:
            self.dbiases += self.biases * 2 \
                            * self.bias_regularizer_l2
        
        # gradient on values to pass further back
        self.dinputs = np.dot(dvalues, self.weights.T)

# Rectified Linear Unit Activation Function
class Activation_ReLU:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)
    
    # backward pass
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        
        # zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0

class Activation_Softmax:
    def forward(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis = 1, keepdims = True))
        probabilities = exp_values / np.sum(exp_values, axis = 1, keepdims = True)
        self.output = probabilities
    
    # backward pass
    def backward(self, dvalues):
        
        #create uninitialized array
        self.dinputs = np.empty_like(dvalues)
        
        # enumerate outputs and gradients
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # flatten output array
            single_output = single_output.reshape(-1, 1)
            # calculate jacobian matrix of the output
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            
            # calculate sample wise gradient and add it to the array of sample gradients
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)
            
class Loss:
    def calculate(self, outputs, y):
        # y is the intended target values
        sample_losses = self.forward(outputs, y)
        data_loss = np.mean(sample_losses)
        return data_loss
    
    def regularization_loss(self, layer):
        
        # 0 by default
        regularization_loss = 0
        
        # calculate only when the factor is greater than 0
        
        # L1 regularization - weights
        if layer.weight_regularizer_l1 > 0:
            regularization_loss += layer.weight_regularizer_l1 * \
                                    np.sum(np.abs(layer.weights))
            
        # L2 regularization - weights
        if layer.weight_regularizer_l2 > 0:
            regularization_loss += layer.weight_regularizer_l2 * \
                                    np.sum(layer.weights * \
                                           layer.weights)
        
        # L1 regularization - bias
        if layer.bias_regularizer_l1 > 0:
            regularization_loss += layer.bias_regularizer_l1 * \
                                    np.sum(np.abs(layer.biases))
        
        # L1 regularization - bias
        if layer.bias_regularizer_l2 > 0:
            regularization_loss += layer.bias_regularizer_l2 * \
                                    np.sum(layer.weights * \
                                           layer.weights)
        
        return regularization_loss
    
class Loss_CategoricalCrossentropy(Loss):
    #inheriting from the base Loss class
    def forward(self, y_pred, y_true):
        # y_pred will come from the neural network
        # y_true will come from the training set
        samples = len(y_pred)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
        
        if len(y_true.shape) == 1:
            # means scalar class values have been passed
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # one hot encoded values have been passed
            correct_confidences = np.sum(y_pred_clipped * y_true, axis = 1)
            
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods
    
    # Backwards pass
    def backward(self, dvalues, y_true):
        # number of samples
        samples = len(dvalues)
        # number of labels in every sample
        # we'll use the first sample to count them
        labels = len(dvalues[0])

        # if labels are sparse, turn them into one-hot vector
        # in case the shape is as [3, 0, 2] etc etc per sample
        # we turn them into one-hot vectors
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # calculate gradient
        self.dinputs = -y_true / dvalues
        # normalize gradient
        self.dinputs = self.dinputs / samples
        # with a larger number of batches, it's all summed together
        # in the dot product, and some will be given more importance
        # than others when we don't normaize

class Activation_Softmax_Loss_CategoricalCrossentropy():
    
    # creates activation and loss function objects inside
    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()
        
    # forward pass
    def forward(self, inputs, y_true):
        # output layer's activation function
        self.activation.forward(inputs)
        # set the output
        self.output = self.activation.output
        # calculate and return the loss value
        return self.loss.calculate(self.output, y_true)
    
    # backward pass
    def backward(self, dvalues, y_true):
        
        # number of samples
        samples = len(dvalues)
        
        #if labels are one-hot encoded,
        # turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis = 1)
            
        # copy so we can safely modify
        self.dinputs = dvalues.copy()
        # calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        # normalize gradient
        self.dinputs = self.dinputs / samples

class Optimizer_SGD:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1.0, decay = 0., momentum = 0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        #if momentum is used
        if self.momentum:
            
            #if layer does not contain momentum arrays
            # create them and fill with zeroes
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                
                # if there is no momentum array for weights
                # the array doesn't exist for the biases yet either
                layer.bias_momentums = np.zeros_like(layer.biases)
                
            # build weight updates with momentum - take previous
            # updates multiplied by retain factor and update with
            # current gradients
            weight_updates = \
                self.momentum * layer.weight_momentums - \
                self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates
            
            # build bias updates
            bias_updates = \
                self.momentum * layer.bias_momentums - \
                self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        # otherwise vanillae SGD
        else:
            weight_updates = -self.learning_rate * layer.dweights
            bias_updates = -self.learning_rate * layer.dbiases
        
        # update weights and biases using either
        # vanilla or momentum updates
        layer.weights += weight_updates
        layer.biases += bias_updates
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adagrad:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 1., decay = 0., epsilon = 1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update cache with squared current gradients
        layer.weight_cache += layer.dweights ** 2
        layer.bias_cache += layer.dbiases ** 2
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            layer.dweights / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            layer.dbiases / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

class Optimizer_RMSprop:
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 0.001, decay = 0., epsilon = 1e-7, rho = 0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update cache with squared current gradients
        layer.weight_cache = self.rho * layer.weight_cache + \
                    (1 - self.rho) * layer.dweights ** 2
        layer.bias_cache = self.rho * layer.bias_cache + \
                    (1 - self.rho) * layer.dbiases ** 2
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            layer.dweights / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            layer.dbiases / \
                            (np.sqrt(layer.bias_cache) + self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1

class Optimizer_Adam:
    
    # initialize optimizer - set settings,
    # learning rate of 1. is the default for this optimizer
    # default decay rate of 0 (no decay)
    def __init__(self, learning_rate = 0.001, decay = 0., epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        
    # call once before any parameter update
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / ( 1. + self.decay * self.iterations ))
        
    # update parameters
    def update_params(self, layer):
        
        # if layer does not contain cache array,
        # create them and fill with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        # update momentum with current gradients
        layer.weight_momentums = self.beta_1 * \
                                layer.weight_momentums + \
                                (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * \
                                layer.bias_momentums + \
                                (1 - self.beta_1) * layer.dbiases
        
        # get corrected momentum
        # self.iteration is 0 at first pass
        # and we need to start wit h1 here
        weight_momentums_corrected = layer.weight_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / \
            (1 - self.beta_1 ** (self.iterations + 1))
        
        # update cache with squared current gradients
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights ** 2
        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases ** 2
        
        # get corrected cache
        weight_cache_corrected = layer.weight_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / \
            (1 - self.beta_2 ** (self.iterations + 1))
        
        # vanilla SGD parameter update + normalization
        # with square rooted cache
        layer.weights += -self.current_learning_rate * \
                            weight_momentums_corrected / \
                            (np.sqrt(weight_cache_corrected) + \
                                self.epsilon)
        layer.biases += -self.current_learning_rate * \
                            bias_momentums_corrected / \
                            (np.sqrt(bias_cache_corrected) + \
                                self.epsilon)
    
    # call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


In [23]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4,
                            bias_regularizer_l2=5e-4)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    data_loss = loss_activation.forward(dense2.output, y)
    
    # regularizatin penalty
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)
    
    # overall loss
    loss = data_loss + regularization_loss

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'data_loss: {data_loss:.3f}, ' + 
              f'reg_loss: {regularization_loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

epoch: 0, acc: 0.360, loss: 1.099, data_loss: 1.099, reg_loss: 0.000, lr: 0.02
epoch: 100, acc: 0.690, loss: 0.872, data_loss: 0.807, reg_loss: 0.065, lr: 0.019999010049002574
epoch: 200, acc: 0.773, loss: 0.743, data_loss: 0.614, reg_loss: 0.128, lr: 0.019998010197985302
epoch: 300, acc: 0.817, loss: 0.681, data_loss: 0.526, reg_loss: 0.155, lr: 0.019997010446938183
epoch: 400, acc: 0.823, loss: 0.636, data_loss: 0.469, reg_loss: 0.167, lr: 0.01999601079584623
epoch: 500, acc: 0.830, loss: 0.606, data_loss: 0.432, reg_loss: 0.174, lr: 0.01999501124469445
epoch: 600, acc: 0.843, loss: 0.576, data_loss: 0.399, reg_loss: 0.177, lr: 0.01999401179346786
epoch: 700, acc: 0.857, loss: 0.554, data_loss: 0.376, reg_loss: 0.178, lr: 0.01999301244215147
epoch: 800, acc: 0.857, loss: 0.535, data_loss: 0.357, reg_loss: 0.178, lr: 0.0199920131907303
epoch: 900, acc: 0.873, loss: 0.521, data_loss: 0.344, reg_loss: 0.176, lr: 0.019991014039189386
epoch: 1000, acc: 0.870, loss: 0.506, data_loss: 0.331

epoch: 8600, acc: 0.947, loss: 0.278, data_loss: 0.170, reg_loss: 0.108, lr: 0.019914378131224802
epoch: 8700, acc: 0.950, loss: 0.279, data_loss: 0.171, reg_loss: 0.108, lr: 0.01991338672444204
epoch: 8800, acc: 0.943, loss: 0.279, data_loss: 0.171, reg_loss: 0.107, lr: 0.0199123954163657
epoch: 8900, acc: 0.953, loss: 0.291, data_loss: 0.176, reg_loss: 0.115, lr: 0.019911404206981037
epoch: 9000, acc: 0.950, loss: 0.280, data_loss: 0.166, reg_loss: 0.114, lr: 0.019910413096273318
epoch: 9100, acc: 0.950, loss: 0.279, data_loss: 0.166, reg_loss: 0.113, lr: 0.019909422084227805
epoch: 9200, acc: 0.950, loss: 0.278, data_loss: 0.166, reg_loss: 0.112, lr: 0.019908431170829768
epoch: 9300, acc: 0.950, loss: 0.277, data_loss: 0.166, reg_loss: 0.111, lr: 0.01990744035606448
epoch: 9400, acc: 0.950, loss: 0.276, data_loss: 0.166, reg_loss: 0.110, lr: 0.01990644963991721
epoch: 9500, acc: 0.950, loss: 0.275, data_loss: 0.166, reg_loss: 0.109, lr: 0.01990545902237324
epoch: 9600, acc: 0.950, l

In [25]:
# validate the model

# creating dataset
X_test, y_test = spiral_data(samples = 100, classes = 3)

# forward pass of our training data through this layer
dense1.forward(X_test)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y_test)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y, axis = 1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.863, loss: 0.388


With the regularization added, the model is fiting to the validation set lot better both in terms of accuracy and loss.

The following is an example of how larger training data can help:

In [28]:
# creating dataset
X, y = spiral_data(samples = 1000, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4,
                            bias_regularizer_l2=5e-4)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    data_loss = loss_activation.forward(dense2.output, y)
    
    # regularizatin penalty
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)
    
    # overall loss
    loss = data_loss + regularization_loss

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'data_loss: {data_loss:.3f}, ' + 
              f'reg_loss: {regularization_loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()
    
# validate the model

# creating dataset
X_test, y_test = spiral_data(samples = 100, classes = 3)

# forward pass of our training data through this layer
dense1.forward(X_test)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y_test)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y, axis = 1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

epoch: 0, acc: 0.348, loss: 1.099, data_loss: 1.099, reg_loss: 0.000, lr: 0.02
epoch: 100, acc: 0.584, loss: 0.923, data_loss: 0.872, reg_loss: 0.052, lr: 0.019999010049002574
epoch: 200, acc: 0.750, loss: 0.790, data_loss: 0.671, reg_loss: 0.119, lr: 0.019998010197985302
epoch: 300, acc: 0.789, loss: 0.728, data_loss: 0.581, reg_loss: 0.147, lr: 0.019997010446938183
epoch: 400, acc: 0.810, loss: 0.689, data_loss: 0.530, reg_loss: 0.160, lr: 0.01999601079584623
epoch: 500, acc: 0.823, loss: 0.658, data_loss: 0.491, reg_loss: 0.167, lr: 0.01999501124469445
epoch: 600, acc: 0.833, loss: 0.636, data_loss: 0.468, reg_loss: 0.167, lr: 0.01999401179346786
epoch: 700, acc: 0.840, loss: 0.616, data_loss: 0.451, reg_loss: 0.165, lr: 0.01999301244215147
epoch: 800, acc: 0.845, loss: 0.600, data_loss: 0.438, reg_loss: 0.162, lr: 0.0199920131907303
epoch: 900, acc: 0.847, loss: 0.586, data_loss: 0.427, reg_loss: 0.159, lr: 0.019991014039189386
epoch: 1000, acc: 0.855, loss: 0.575, data_loss: 0.419

epoch: 8500, acc: 0.885, loss: 0.383, data_loss: 0.292, reg_loss: 0.091, lr: 0.01991536963672872
epoch: 8600, acc: 0.884, loss: 0.384, data_loss: 0.293, reg_loss: 0.091, lr: 0.019914378131224802
epoch: 8700, acc: 0.883, loss: 0.384, data_loss: 0.293, reg_loss: 0.091, lr: 0.01991338672444204
epoch: 8800, acc: 0.883, loss: 0.383, data_loss: 0.293, reg_loss: 0.090, lr: 0.0199123954163657
epoch: 8900, acc: 0.885, loss: 0.382, data_loss: 0.292, reg_loss: 0.090, lr: 0.019911404206981037
epoch: 9000, acc: 0.889, loss: 0.384, data_loss: 0.294, reg_loss: 0.090, lr: 0.019910413096273318
epoch: 9100, acc: 0.881, loss: 0.382, data_loss: 0.292, reg_loss: 0.089, lr: 0.019909422084227805
epoch: 9200, acc: 0.889, loss: 0.381, data_loss: 0.292, reg_loss: 0.089, lr: 0.019908431170829768
epoch: 9300, acc: 0.881, loss: 0.382, data_loss: 0.293, reg_loss: 0.089, lr: 0.01990744035606448
epoch: 9400, acc: 0.887, loss: 0.378, data_loss: 0.289, reg_loss: 0.088, lr: 0.01990644963991721
epoch: 9500, acc: 0.887, l

The delta between the training and validation is tiny compared to before the regulization. The regulization methods allow us to throw larger models into the problem without the fear of overfitting. Let's try throwing a larger model at the problem.

In [29]:
# creating dataset
X, y = spiral_data(samples = 1000, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 512, weight_regularizer_l2=5e-4,
                            bias_regularizer_l2=5e-4)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(512, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=5e-7)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)

    # forward pass through seecond dense layer
    dense2.forward(activation1.output)

    # forward pass through activation/loss function
    data_loss = loss_activation.forward(dense2.output, y)
    
    # regularizatin penalty
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)
    
    # overall loss
    loss = data_loss + regularization_loss

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'data_loss: {data_loss:.3f}, ' + 
              f'reg_loss: {regularization_loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()
    
# validate the model

# creating dataset
X_test, y_test = spiral_data(samples = 100, classes = 3)

# forward pass of our training data through this layer
dense1.forward(X_test)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y_test)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y, axis = 1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

epoch: 0, acc: 0.399, loss: 1.099, data_loss: 1.099, reg_loss: 0.000, lr: 0.02
epoch: 100, acc: 0.761, loss: 0.767, data_loss: 0.665, reg_loss: 0.102, lr: 0.019999010049002574
epoch: 200, acc: 0.844, loss: 0.622, data_loss: 0.467, reg_loss: 0.155, lr: 0.019998010197985302
epoch: 300, acc: 0.868, loss: 0.554, data_loss: 0.381, reg_loss: 0.173, lr: 0.019997010446938183
epoch: 400, acc: 0.881, loss: 0.514, data_loss: 0.341, reg_loss: 0.174, lr: 0.01999601079584623
epoch: 500, acc: 0.889, loss: 0.486, data_loss: 0.318, reg_loss: 0.169, lr: 0.01999501124469445
epoch: 600, acc: 0.891, loss: 0.464, data_loss: 0.301, reg_loss: 0.163, lr: 0.01999401179346786
epoch: 700, acc: 0.897, loss: 0.447, data_loss: 0.291, reg_loss: 0.157, lr: 0.01999301244215147
epoch: 800, acc: 0.740, loss: 0.793, data_loss: 0.642, reg_loss: 0.151, lr: 0.0199920131907303
epoch: 900, acc: 0.901, loss: 0.436, data_loss: 0.275, reg_loss: 0.161, lr: 0.019991014039189386
epoch: 1000, acc: 0.901, loss: 0.426, data_loss: 0.270

epoch: 8500, acc: 0.920, loss: 0.291, data_loss: 0.207, reg_loss: 0.084, lr: 0.01991536963672872
epoch: 8600, acc: 0.917, loss: 0.290, data_loss: 0.207, reg_loss: 0.083, lr: 0.019914378131224802
epoch: 8700, acc: 0.918, loss: 0.290, data_loss: 0.208, reg_loss: 0.083, lr: 0.01991338672444204
epoch: 8800, acc: 0.920, loss: 0.286, data_loss: 0.204, reg_loss: 0.082, lr: 0.0199123954163657
epoch: 8900, acc: 0.920, loss: 0.284, data_loss: 0.203, reg_loss: 0.081, lr: 0.019911404206981037
epoch: 9000, acc: 0.920, loss: 0.283, data_loss: 0.203, reg_loss: 0.081, lr: 0.019910413096273318
epoch: 9100, acc: 0.917, loss: 0.285, data_loss: 0.205, reg_loss: 0.080, lr: 0.019909422084227805
epoch: 9200, acc: 0.915, loss: 0.288, data_loss: 0.209, reg_loss: 0.079, lr: 0.019908431170829768
epoch: 9300, acc: 0.910, loss: 0.340, data_loss: 0.247, reg_loss: 0.093, lr: 0.01990744035606448
epoch: 9400, acc: 0.916, loss: 0.314, data_loss: 0.225, reg_loss: 0.089, lr: 0.01990644963991721
epoch: 9500, acc: 0.916, l

### Dropout
Another regulization method, that randomly disables a specified portion of the neurons of a drouput layer per forward pass, forcing the model to learn from the data using multiple neurons. Also helps with **co-adoption** where a neuron doesn't learn on its own, but just passes along its input(s). Also helps with noise.

In code, a **Bernoulli Distribution**, where the only two outputs are a discrete 1 or a 0, and the chance of it being a 1, is *p*, and chance of it being a 0 is *q* or *1 - p*. The Bernoilli Distribution is a special case of Binomal Distribution, and the binomal method from numpy will be used to create the array of 1s and 0s to multiply the outputs of this layer.

One problem with zeroing out a specific portion of the outputs is that the total sum of the ouputs will be lessened by that amount, so the neurons in the next layer will learn incorrectly and will cause problem when predicting. Hence, the non-zeroed outputs are scaled up that much to make up for it, by being multipled by *1 - droput rate*

As for the partial derivative, if the value isn't zeroed, then the derivative will be the scaling up factor, or 1/(1-dropout), and if the value is zeroed, the derivative will be 0. Both cases can be encapsulated by the single equation, r/(1-dropout), where r is the 1 or 0 associated with that output. 

In [2]:
class Layer_Dropout:
    
    # init
    def __init__(self, rate):
        # store keep rate, we invert it as the dropout rate instead
        self.rate = 1 - rate
    
    # forward pass
    def forward(self, inputs):
        # save input values
        self.inputs = inputs
        # generate and save scald mask for calculating p.d. later
        self.binary_mask = np.random.binomial(1, self.rate,
                                             size = inputs.shape) / \
                                             self.rate
        # apply mask to output values
        self.output = inputs * self.binary_mask
    
    def backward(self, dvalues):
        # gradient on values
        self.dinputs = dvalues * self.binary_mask

A higher learning and decay rates work better with a dropout layer

In [4]:
# creating dataset
X, y = spiral_data(samples = 500, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4,
                            bias_regularizer_l2=5e-4)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# dropout layer
dropout1 = Layer_Dropout(rate = 0.1) # ie. dropout rate of 90%

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(64, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-5)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)
    
    # forward pass through dropout
    dropout1.forward(activation1.output)

    # forward pass through seecond dense layer
    dense2.forward(dropout1.output)

    # forward pass through activation/loss function
    data_loss = loss_activation.forward(dense2.output, y)
    
    # regularizatin penalty
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)
    
    # overall loss
    loss = data_loss + regularization_loss

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'data_loss: {data_loss:.3f}, ' + 
              f'reg_loss: {regularization_loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    dropout1.backward(dense2.dinputs)
    activation1.backward(dropout1.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()
    
# validate the model

# creating dataset
X_test, y_test = spiral_data(samples = 100, classes = 3)

# forward pass of our training data through this layer
dense1.forward(X_test)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y_test)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y, axis = 1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

epoch: 0, acc: 0.351, loss: 1.099, data_loss: 1.099, reg_loss: 0.000, lr: 0.05
epoch: 100, acc: 0.571, loss: 0.949, data_loss: 0.907, reg_loss: 0.042, lr: 0.04975371909050202
epoch: 200, acc: 0.649, loss: 0.858, data_loss: 0.801, reg_loss: 0.057, lr: 0.049507401356502806
epoch: 300, acc: 0.667, loss: 0.826, data_loss: 0.767, reg_loss: 0.059, lr: 0.0492635105177595
epoch: 400, acc: 0.665, loss: 0.834, data_loss: 0.776, reg_loss: 0.058, lr: 0.04902201088288642
epoch: 500, acc: 0.689, loss: 0.807, data_loss: 0.748, reg_loss: 0.058, lr: 0.048782867456949125
epoch: 600, acc: 0.689, loss: 0.785, data_loss: 0.729, reg_loss: 0.056, lr: 0.04854604592455945
epoch: 700, acc: 0.681, loss: 0.787, data_loss: 0.732, reg_loss: 0.055, lr: 0.048311512633460556
epoch: 800, acc: 0.680, loss: 0.769, data_loss: 0.715, reg_loss: 0.054, lr: 0.04807923457858551
epoch: 900, acc: 0.686, loss: 0.795, data_loss: 0.742, reg_loss: 0.053, lr: 0.04784917938657352
epoch: 1000, acc: 0.685, loss: 0.797, data_loss: 0.745,

epoch: 8500, acc: 0.709, loss: 0.696, data_loss: 0.652, reg_loss: 0.044, lr: 0.035088950489490865
epoch: 8600, acc: 0.732, loss: 0.656, data_loss: 0.612, reg_loss: 0.044, lr: 0.0349662575614532
epoch: 8700, acc: 0.735, loss: 0.669, data_loss: 0.625, reg_loss: 0.044, lr: 0.034844419666190465
epoch: 8800, acc: 0.721, loss: 0.699, data_loss: 0.654, reg_loss: 0.045, lr: 0.034723427896801974
epoch: 8900, acc: 0.705, loss: 0.671, data_loss: 0.626, reg_loss: 0.045, lr: 0.03460327346967023
epoch: 9000, acc: 0.723, loss: 0.666, data_loss: 0.622, reg_loss: 0.044, lr: 0.034483947722335255
epoch: 9100, acc: 0.717, loss: 0.695, data_loss: 0.651, reg_loss: 0.044, lr: 0.034365442111412764
epoch: 9200, acc: 0.713, loss: 0.673, data_loss: 0.629, reg_loss: 0.044, lr: 0.03424774821055516
epoch: 9300, acc: 0.718, loss: 0.713, data_loss: 0.670, reg_loss: 0.044, lr: 0.03413085770845422
epoch: 9400, acc: 0.707, loss: 0.670, data_loss: 0.627, reg_loss: 0.043, lr: 0.034014762406884586
epoch: 9500, acc: 0.712, 

The validation set performs better than the training here because the dropout layer isn't used, allowing the model to use all the neurons. Because of the regularization, a larger model can now be helpful in accuracy.

In [6]:
# creating dataset
X, y = spiral_data(samples = 100, classes = 3)

# dense layer wtih 2 input features and 64 ouput values
dense1 = Layer_Dense(2, 512, weight_regularizer_l2=5e-4,
                            bias_regularizer_l2=5e-4)

# ReLU activation for dense1
activation1 = Activation_ReLU()

# dropout layer
dropout1 = Layer_Dropout(0.1) # ie. dropout rate of 90%

# second dense layer with 64 inputs features, an 3 output values
dense2 = Layer_Dense(512, 3)

# create softmax classifier's combined loss and acivation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# optimizer
optimizer = Optimizer_Adam(learning_rate=0.05, decay=5e-5)

# training in loop
for epoch in range(10001):
    # forward pass of our training data through this layer
    dense1.forward(X)

    # forward pass through activation
    activation1.forward(dense1.output)
    
    # forward pass through dropout
    dropout1.forward(activation1.output)

    # forward pass through seecond dense layer
    dense2.forward(dropout1.output)

    # forward pass through activation/loss function
    data_loss = loss_activation.forward(dense2.output, y)
    
    # regularizatin penalty
    regularization_loss = \
        loss_activation.loss.regularization_loss(dense1) + \
        loss_activation.loss.regularization_loss(dense2)
    
    # overall loss
    loss = data_loss + regularization_loss

    # calculating accuracy of output of activation 2 and targets
    # calculate values along first axis because of batches
    predictions = np.argmax(loss_activation.output, axis = 1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis = 1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + 
              f'acc: {accuracy:.3f}, ' + 
              f'loss: {loss:.3f}, ' + 
              f'data_loss: {data_loss:.3f}, ' + 
              f'reg_loss: {regularization_loss:.3f}, ' + 
              f'lr: {optimizer.current_learning_rate}')

    # backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    dropout1.backward(dense2.dinputs)
    activation1.backward(dropout1.dinputs)
    dense1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()
    
# validate the model

# creating dataset
X_test, y_test = spiral_data(samples = 100, classes = 3)

# forward pass of our training data through this layer
dense1.forward(X_test)

# forward pass through activation
activation1.forward(dense1.output)

# forward pass through seecond dense layer
dense2.forward(activation1.output)

# forward pass through activation/loss function
loss = loss_activation.forward(dense2.output, y_test)

# calculating accuracy of output of activation 2 and targets
# calculate values along first axis because of batches
predictions = np.argmax(loss_activation.output, axis = 1)
if len(y_test.shape) == 2:
    y_test = np.argmax(y, axis = 1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

epoch: 0, acc: 0.327, loss: 1.099, data_loss: 1.099, reg_loss: 0.000, lr: 0.05
epoch: 100, acc: 0.743, loss: 0.693, data_loss: 0.566, reg_loss: 0.126, lr: 0.04975371909050202
epoch: 200, acc: 0.797, loss: 0.668, data_loss: 0.518, reg_loss: 0.151, lr: 0.049507401356502806
epoch: 300, acc: 0.803, loss: 0.654, data_loss: 0.492, reg_loss: 0.162, lr: 0.0492635105177595
epoch: 400, acc: 0.823, loss: 0.588, data_loss: 0.421, reg_loss: 0.167, lr: 0.04902201088288642
epoch: 500, acc: 0.810, loss: 0.590, data_loss: 0.415, reg_loss: 0.175, lr: 0.048782867456949125
epoch: 600, acc: 0.837, loss: 0.556, data_loss: 0.385, reg_loss: 0.171, lr: 0.04854604592455945
epoch: 700, acc: 0.830, loss: 0.607, data_loss: 0.440, reg_loss: 0.168, lr: 0.048311512633460556
epoch: 800, acc: 0.820, loss: 0.600, data_loss: 0.433, reg_loss: 0.167, lr: 0.04807923457858551
epoch: 900, acc: 0.867, loss: 0.542, data_loss: 0.373, reg_loss: 0.170, lr: 0.04784917938657352
epoch: 1000, acc: 0.850, loss: 0.580, data_loss: 0.410,

epoch: 8500, acc: 0.867, loss: 0.500, data_loss: 0.351, reg_loss: 0.149, lr: 0.035088950489490865
epoch: 8600, acc: 0.870, loss: 0.487, data_loss: 0.340, reg_loss: 0.148, lr: 0.0349662575614532
epoch: 8700, acc: 0.877, loss: 0.547, data_loss: 0.397, reg_loss: 0.149, lr: 0.034844419666190465
epoch: 8800, acc: 0.840, loss: 0.599, data_loss: 0.451, reg_loss: 0.148, lr: 0.034723427896801974
epoch: 8900, acc: 0.863, loss: 0.508, data_loss: 0.359, reg_loss: 0.148, lr: 0.03460327346967023
epoch: 9000, acc: 0.867, loss: 0.517, data_loss: 0.364, reg_loss: 0.153, lr: 0.034483947722335255
epoch: 9100, acc: 0.860, loss: 0.517, data_loss: 0.365, reg_loss: 0.152, lr: 0.034365442111412764
epoch: 9200, acc: 0.870, loss: 0.505, data_loss: 0.355, reg_loss: 0.149, lr: 0.03424774821055516
epoch: 9300, acc: 0.860, loss: 0.521, data_loss: 0.364, reg_loss: 0.156, lr: 0.03413085770845422
epoch: 9400, acc: 0.877, loss: 0.484, data_loss: 0.333, reg_loss: 0.152, lr: 0.034014762406884586
epoch: 9500, acc: 0.873, 