In [2]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()

In [3]:
class Dense_Layer:
    """Fully connected layer with optional L1/L2 penalties on weights and biases.

    Attributes set by ``__init__``:
        weights: array of shape (n_input, n_neurons), small Gaussian values.
        biases: array of shape (1, n_neurons), zeros.
        weight_momentums / bias_momentums: zero arrays shaped like the
            parameters; they are independent buffers, never views or aliases
            of the parameters themselves.
        *_regularizer_l1 / *_regularizer_l2: penalty strengths; a penalty is
            only applied when its strength is > 0.
    """

    def __init__(self, n_input, n_neurons,
                 weight_regularizer_l1=0, weight_regularizer_l2=0,
                 bias_regularizer_l1=0, bias_regularizer_l2=0):
        """Create the parameters and store the regularization strengths.

        Args:
            n_input: number of input features per sample.
            n_neurons: number of neurons (output features) in the layer.
            weight_regularizer_l1: L1 penalty strength for the weights.
            weight_regularizer_l2: L2 penalty strength for the weights.
            bias_regularizer_l1: L1 penalty strength for the biases.
            bias_regularizer_l2: L2 penalty strength for the biases.
        """
        # Initialize weights from a Gaussian distribution, scaled down
        self.weights = 0.01 * np.random.randn(n_input, n_neurons)
        # Initialize biases as zero
        self.biases = np.zeros((1, n_neurons))

        # Momentum buffers start at zero and must be separate arrays.
        # Binding the buffer to `self.weights` itself would seed momentum
        # with the weight values, and creating only the weight buffer would
        # leave momentum optimizers without a bias buffer.
        self.weight_momentums = np.zeros_like(self.weights)
        self.bias_momentums = np.zeros_like(self.biases)

        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

    def forward(self, inputs):
        """Compute ``inputs @ weights + biases`` and store it in ``self.output``.

        The inputs are kept in ``self.inputs`` because ``backward`` needs them.
        """
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Compute gradients given ``dvalues``, the gradient w.r.t. ``self.output``.

        Sets ``self.dweights``, ``self.dbiases`` (both including any enabled
        regularization terms) and ``self.dinputs``.
        """
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # L1 on weights: derivative of |w| is +1, or -1 where w < 0
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1

        # L2 on weights: derivative of w^2 is 2w
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        # L1 on biases
        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1

        # L2 on biases
        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        # Gradient on the layer's inputs
        self.dinputs = np.dot(dvalues, self.weights.T)

        
    

In [4]:
class Layer_Dropout:
    """Dropout layer: randomly zeroes activations and rescales the survivors."""

    def __init__(self, rate):
        """Store the keep probability.

        Args:
            rate: fraction of units to drop. Note that ``self.rate`` holds
                ``1 - rate``, i.e. the probability of keeping a unit.
        """
        self.rate = 1 - rate

    def forward(self, inputs):
        """Sample a fresh mask and apply it to ``inputs``.

        Each mask entry is 0 (dropped) or ``1 / self.rate`` (kept), so kept
        values are scaled up by the inverse of the keep probability.
        """
        self.inputs = inputs

        keep_probability = self.rate
        kept = np.random.binomial(1, keep_probability, size=inputs.shape)
        self.binary_mask = kept / keep_probability

        self.output = inputs * self.binary_mask

    def backward(self, dvalues):
        """Pass gradients through the mask sampled by the last ``forward`` call."""
        self.dinputs = dvalues * self.binary_mask
        

In [5]:
class Activation_Relu:
    """Rectified linear unit: element-wise ``max(0, x)``."""

    def forward(self, inputs):
        """Store the rectified values in ``self.output`` and keep ``inputs``."""
        self.output = np.maximum(0, inputs)
        # Kept so that backward knows where the unit was inactive
        self.inputs = inputs

    def backward(self, dvalues):
        """Set ``self.dinputs`` to ``dvalues`` with inactive positions zeroed.

        A position is inactive when the forward input there was <= 0.
        ``dvalues`` itself is left untouched.
        """
        grad = dvalues.copy()
        inactive = self.inputs <= 0
        grad[inactive] = 0
        self.dinputs = grad

In [6]:
class Activation_Softmax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store the softmax of each row of ``inputs`` in ``self.output``."""
        # Subtract each row's maximum before exponentiating so that the
        # largest exponent is exp(0) = 1
        row_peaks = np.max(inputs, axis=1, keepdims=True)
        exponentials = np.exp(inputs - row_peaks)

        # Normalize every row so it sums to 1
        row_totals = np.sum(exponentials, axis=1, keepdims=True)
        self.output = exponentials / row_totals

In [7]:
class Loss:
    """Base class for losses; subclasses supply ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses given by ``self.forward``."""
        return np.mean(self.forward(output, y))

    def regularization_loss(self, layer):
        """Return the sum of the L1/L2 penalties enabled on ``layer``.

        A penalty contributes only when its strength on the layer is > 0.
        L1 uses the sum of absolute values, L2 the sum of squares. Returns 0
        when no penalty is enabled.
        """
        # (strength, deferred norm) pairs, in the order they are accumulated
        penalties = (
            (layer.weight_regularizer_l1, lambda: np.sum(np.abs(layer.weights))),
            (layer.weight_regularizer_l2, lambda: np.sum(layer.weights**2)),
            (layer.bias_regularizer_l1, lambda: np.sum(np.abs(layer.biases))),
            (layer.bias_regularizer_l2, lambda: np.sum(layer.biases**2)),
        )

        total = 0
        for strength, norm in penalties:
            if strength > 0:
                total += strength * norm()
        return total

In [8]:
class Categorical_Cross_Entropy_loss(Loss):
    """Categorical cross-entropy for integer or one-hot encoded targets."""

    def forward(self, y_pred, y_true):
        """Return the negative log of the confidence given to the true class.

        Args:
            y_pred: predicted values, one row per sample.
            y_true: 1-D array of class indices, or 2-D one-hot array.
        """
        n_samples = len(y_pred)

        # Keep predictions strictly inside (0, 1) so the log stays finite
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        target_rank = len(y_true.shape)
        if target_rank == 1:
            # Class indices: pick one column per row
            true_class_confidence = clipped[range(n_samples), y_true]
        elif target_rank == 2:
            # One-hot rows: the product zeroes every column but the true
            # class, so the row sum is the confidence for that class
            true_class_confidence = np.sum(clipped * y_true, axis=1)

        return -np.log(true_class_confidence)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to ``-y_true / dvalues`` divided by the sample count.

        Integer targets are expanded to one-hot rows first.
        """
        n_samples = len(dvalues)
        # Number of labels, taken from the width of the first row
        n_labels = len(dvalues[0])

        if len(y_true.shape) == 1:
            y_true = np.eye(n_labels)[y_true]

        # Gradient of the loss, then normalized by the number of samples
        self.dinputs = (-y_true / dvalues) / n_samples

In [9]:
class Activation_Softmax_Loss_CategoricalCrossEntropy:
    """Softmax activation and categorical cross-entropy loss as one unit."""

    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Categorical_Cross_Entropy_loss()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep it in ``self.output``, return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output

        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` from ``dvalues`` and the targets.

        Subtracts 1 at each sample's true-class position of a copy of
        ``dvalues`` and divides by the number of samples. One-hot targets are
        reduced to class indices first.
        """
        n_samples = len(dvalues)

        is_one_hot = len(y_true.shape) == 2
        class_ids = np.argmax(y_true, axis=1) if is_one_hot else y_true

        # Work on a copy so the caller's array is not modified
        grad = dvalues.copy()
        grad[range(n_samples), class_ids] -= 1
        self.dinputs = grad / n_samples


In [10]:
# Single forward and backward pass through a 2-layer network, with no
# optimizer step: nothing updates the parameters in this cell.

# Spiral dataset from nnfs
# (assumes X holds 2 features per point, since dense1 takes 2 inputs — TODO confirm)
X,y = spiral_data(samples =100 , classes=3)

# First dense layer: 2 inputs -> 3 neurons
dense1 = Dense_Layer(2,3)

# ReLU activation applied to the first layer's output
activation1= Activation_Relu()

# Second dense layer: 3 inputs -> 3 outputs (3 classes requested above)
dense2 = Dense_Layer(3,3)

# Combined softmax activation + categorical cross-entropy loss
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Forward pass: data through the first layer
dense1.forward(X)

# First layer's output through the activation function
activation1.forward(dense1.output)

# Activation output through the second layer
dense2.forward(activation1.output)

# Second layer's output through softmax; returns the mean loss
loss = loss_activation.forward(dense2.output, y)

# Show the first five rows of softmax output and the loss
print(loss_activation.output[:5])
print('Loss: ' , loss)

# Accuracy: the predicted class is the column with the largest softmax value
predictions = np.argmax(loss_activation.output , axis=1)
# If the targets are one-hot (2-D), reduce them to class indices
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)
accuracy = np.mean(predictions==y)
print('acc: ', accuracy)

# Backward pass, in the reverse order of the forward pass
loss_activation.backward(loss_activation.output , y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)

# Parameter gradients of both layers
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)

[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
Loss:  1.0986104
acc:  0.34
[[ 1.5766357e-04  7.8368583e-05  4.7324400e-05]
 [ 1.8161038e-04  1.1045573e-05 -3.3096312e-05]]
[[-3.60553473e-04  9.66117223e-05 -1.03671395e-04]]
[[ 5.44109462e-05  1.07411419e-04 -1.61822361e-04]
 [-4.07913431e-05 -7.16780924e-05  1.12469446e-04]
 [-5.30112993e-05  8.58172934e-05 -3.28059905e-05]]
[[-1.0729185e-05 -9.4610732e-06  2.0027859e-05]]


In [11]:
class Optimizer_GD:
    """Plain gradient descent with a fixed learning rate.

    Exposes ``current_learning_rate``, ``pre_update_params`` and
    ``post_update_params`` so it can be driven by the same
    pre/update/post sequence as the other optimizers in this notebook.
    """

    def __init__(self, learning_rate=1):
        """Store the step size.

        Args:
            learning_rate: multiplier applied to every gradient.
        """
        self.learning_rate = learning_rate
        # There is no decay here, so this always equals learning_rate
        self.current_learning_rate = learning_rate
        self.iterations = 0

    def pre_update_params(self):
        """No-op: the learning rate is constant."""

    def update_params(self, layer):
        """Move ``layer.weights`` / ``layer.biases`` in place against their gradients."""
        layer.weights += -self.learning_rate * layer.dweights
        layer.biases += -self.learning_rate * layer.dbiases

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

In [12]:
class Optimizer_GD_Decay:
    """Gradient descent whose learning rate shrinks as iterations accumulate.

    With a non-zero ``decay`` the effective rate is
    ``learning_rate * 1 / (1 + decay * iterations)``.
    """

    def __init__(self, learning_rate=1, decay=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; does nothing when decay is falsy."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Step the layer's weights and biases in place using the current rate."""
        rate = self.current_learning_rate
        layer.weights += -rate * layer.dweights
        layer.biases += -rate * layer.dbiases

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1
        

In [13]:
class Optimizer_SGD:
    """Gradient descent with optional learning-rate decay and momentum.

    With momentum enabled, each update is
    ``momentum * previous_update - current_learning_rate * gradient``,
    and that update is remembered on the layer for the next step.
    """

    def __init__(self, learning_rate=1., decay=0., momentum=0.):
        """
        Args:
            learning_rate: initial step size.
            decay: learning-rate decay factor; 0 disables decay.
            momentum: fraction of the previous update carried over;
                0 disables momentum.
        """
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; does nothing when decay is falsy."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * \
                (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases``. With momentum enabled it
        also creates (when needed) and updates ``layer.weight_momentums`` and
        ``layer.bias_momentums``.
        """
        if self.momentum:
            # Each buffer is checked on its own, so a layer that carries only
            # one of them still gets the other. A buffer that is the very
            # same object as the parameter array holds parameter values, not
            # an update history, so it is replaced by zeros as well.
            weight_buffer = getattr(layer, 'weight_momentums', None)
            if weight_buffer is None or weight_buffer is layer.weights:
                layer.weight_momentums = np.zeros_like(layer.weights)

            bias_buffer = getattr(layer, 'bias_momentums', None)
            if bias_buffer is None or bias_buffer is layer.biases:
                layer.bias_momentums = np.zeros_like(layer.biases)

            # Blend the previous update with the current gradient step
            weight_updates = self.momentum * layer.weight_momentums - \
                self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            bias_updates = self.momentum * layer.bias_momentums - \
                self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        else:
            # Vanilla gradient descent step
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        # Apply the updates to the parameters
        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

            

In [14]:
class Optimizer_RMSPROP:
    """RMSProp optimizer.

    Keeps, per parameter, a running average ("cache") of squared gradients:
    ``cache = rho * cache + (1 - rho) * gradient**2``. Each step divides the
    gradient by ``sqrt(cache) + epsilon``, so parameters whose recent
    gradients were large take proportionally smaller steps.
    """

    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; does nothing when decay is falsy."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Update the caches on ``layer`` and step its parameters in place."""
        # First use on this layer: start both caches at zero
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        keep = self.rho
        fresh = 1 - self.rho
        rate = self.current_learning_rate

        # Running averages of the squared gradients
        layer.weight_cache = keep * layer.weight_cache + fresh * layer.dweights**2
        layer.bias_cache = keep * layer.bias_cache + fresh * layer.dbiases**2

        # epsilon is added to each denominator
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon

        layer.weights += -rate * layer.dweights / weight_scale
        layer.biases += -rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

            

In [15]:
class Optimizer_Adam:
    """Adam optimizer: a momentum average plus a squared-gradient cache.

    Both running averages start at zero, so each is divided by
    ``1 - factor ** (iterations + 1)`` before use; ``factor`` is ``momentum``
    for the momentum average and ``rho`` for the cache.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.999, momentum=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho
        self.momentum = momentum

    def pre_update_params(self):
        """Refresh ``current_learning_rate``; does nothing when decay is falsy."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Update the running averages on ``layer`` and step its parameters in place."""
        # First use on this layer (no cache yet): zero all four buffers
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        beta_m = self.momentum
        beta_c = self.rho
        step_number = self.iterations + 1
        rate = self.current_learning_rate

        # Running averages of the gradients (momentum)
        layer.weight_momentums = beta_m * layer.weight_momentums + (1 - beta_m) * layer.dweights
        layer.bias_momentums = beta_m * layer.bias_momentums + (1 - beta_m) * layer.dbiases

        # Running averages of the squared gradients (cache)
        layer.weight_cache = beta_c * layer.weight_cache + (1 - beta_c) * layer.dweights**2
        layer.bias_cache = beta_c * layer.bias_cache + (1 - beta_c) * layer.dbiases**2

        # Correction divisors; step_number starts at 1 on the first update
        momentum_correction = 1 - beta_m ** step_number
        cache_correction = 1 - beta_c ** step_number

        weight_momentums_hat = layer.weight_momentums / momentum_correction
        bias_momentums_hat = layer.bias_momentums / momentum_correction
        weight_cache_hat = layer.weight_cache / cache_correction
        bias_cache_hat = layer.bias_cache / cache_correction

        layer.weights += -rate * weight_momentums_hat / (np.sqrt(weight_cache_hat) + self.epsilon)
        layer.biases += -rate * bias_momentums_hat / (np.sqrt(bias_cache_hat) + self.epsilon)

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

            

In [None]:
# Train a 2-64-3 network on spiral data with the Adam optimizer and L2
# regularization on the first layer, then score it on a second spiral_data draw.

# Training data (presumably 500 points per class — verify against nnfs.spiral_data)
X, y = spiral_data(samples=500, classes=3)

# Hidden layer: 2 inputs -> 64 neurons, L2 penalty on weights and biases
dense1 = Dense_Layer(2,64, weight_regularizer_l2=5e-4 , bias_regularizer_l2=5e-4)

activation1 = Activation_Relu()

# Output layer: 64 inputs -> 3 classes, no regularization
dense2 = Dense_Layer(64,3)
# NOTE(review): activation_2 is not referenced again in this cell; softmax is
# applied inside loss_activation.
activation_2 = Activation_Softmax()
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Alternative optimizers kept for experimentation:
#optimizer = Optimizer_GD()
#optimizer = Optimizer_GD_Decay(decay=1e-3)
#optimizer = Optimizer_SGD(decay=1e-3 , momentum=0.9)
#optimizer = Optimizer_RMSPROP(learning_rate=0.02 , decay=1e-5 , rho=0.999)

optimizer = Optimizer_Adam(learning_rate=0.02 , decay=5e-7)

# 5001 iterations (epochs 0..5000), each using the full training set
for epoch in range(5001):

    # Forward pass
    dense1.forward(X)
    activation1.forward(dense1.output)

    dense2.forward(activation1.output)

    # Mean cross-entropy over the samples
    data_loss = loss_activation.forward(dense2.output, y)

    # Penalty terms from both layers (dense2 has all strengths at 0)
    regularization_loss = (
        loss_activation.loss.regularization_loss(dense1) +
        loss_activation.loss.regularization_loss(dense2)
    )
    loss = data_loss + regularization_loss
    # Predicted class = column with the largest softmax value
    predictions = np.argmax(loss_activation.output , axis=1)

    # If the targets are one-hot (2-D), reduce them to class indices
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 100th epoch, including epoch 0
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + f'acc: {accuracy:.3f}, ' + f'loss:{loss:.3f} (data_loss:{data_loss:.3f} , regularization_loss:{regularization_loss:.3f}), '  + f'lr: {optimizer.current_learning_rate}')

    # Backward pass, in the reverse order of the forward pass
    loss_activation.backward(loss_activation.output , y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # Optimizer step: refresh the learning rate, update both layers,
    # then advance the iteration counter
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()


# Evaluation on a second, smaller draw from spiral_data
X_test, y_test = spiral_data(samples=100, classes=3)

# Fresh loss object; the trained layers are reused as they are
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()
dense1.forward(X_test)

activation1.forward(dense1.output)

dense2.forward(activation1.output)

# Data loss only: no regularization term is added here
loss = loss_activation.forward(dense2.output, y_test)

predictions = np.argmax(loss_activation.output , axis=1)

if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)
print(f'Test Acc: {accuracy:.3f}, ' + f'loss:{loss:.3f}')



epoch: 0, acc: 0.343, loss:1.099 (data_loss:1.099 , regularization_loss:0.000), lr: 0.02
epoch: 100, acc: 0.669, loss:0.884 (data_loss:0.852 , regularization_loss:0.032), lr: 0.019999010049002574
epoch: 200, acc: 0.771, loss:0.695 (data_loss:0.616 , regularization_loss:0.079), lr: 0.019998010197985302
epoch: 300, acc: 0.809, loss:0.616 (data_loss:0.519 , regularization_loss:0.097), lr: 0.019997010446938183
epoch: 400, acc: 0.829, loss:0.570 (data_loss:0.467 , regularization_loss:0.103), lr: 0.01999601079584623
epoch: 500, acc: 0.842, loss:0.538 (data_loss:0.433 , regularization_loss:0.104), lr: 0.01999501124469445
epoch: 600, acc: 0.855, loss:0.514 (data_loss:0.411 , regularization_loss:0.104), lr: 0.01999401179346786
epoch: 700, acc: 0.859, loss:0.496 (data_loss:0.394 , regularization_loss:0.102), lr: 0.01999301244215147
epoch: 800, acc: 0.853, loss:0.484 (data_loss:0.384 , regularization_loss:0.100), lr: 0.0199920131907303
epoch: 900, acc: 0.871, loss:0.467 (data_loss:0.368 , regular

In [18]:
# Train a 2-64-3 network on spiral data with Adam, L2 regularization on the
# first layer and dropout after the first activation, then score it on a
# second spiral_data draw.

# Training data (presumably 500 points per class — verify against nnfs.spiral_data)
X, y = spiral_data(samples=500, classes=3)

# Hidden layer: 2 inputs -> 64 neurons, L2 penalty on weights and biases
dense1 = Dense_Layer(2,64, weight_regularizer_l2=5e-4 , bias_regularizer_l2=5e-4)
# Dropout with rate 0.1, i.e. a keep probability of 0.9
dropout1= Layer_Dropout(0.1)
activation1 = Activation_Relu()

# Output layer: 64 inputs -> 3 classes, no regularization
dense2 = Dense_Layer(64,3)
#dropout2 = Layer_Dropout(0.1)
# NOTE(review): activation_2 is not referenced again in this cell; softmax is
# applied inside loss_activation.
activation_2 = Activation_Softmax()
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Alternative optimizers kept for experimentation:
#optimizer = Optimizer_GD()
#optimizer = Optimizer_GD_Decay(decay=1e-3)
#optimizer = Optimizer_SGD(decay=1e-3 , momentum=0.9)
#optimizer = Optimizer_RMSPROP(learning_rate=0.02 , decay=1e-5 , rho=0.999)

optimizer = Optimizer_Adam(learning_rate=0.05 , decay=5e-7)

# 5001 iterations (epochs 0..5000), each using the full training set
for epoch in range(5001):

    # Forward pass; dropout sits between the activation and the output layer
    dense1.forward(X)
    activation1.forward(dense1.output)
    dropout1.forward(activation1.output)

    dense2.forward(dropout1.output)

    # Mean cross-entropy over the samples, computed with dropout active
    data_loss = loss_activation.forward(dense2.output, y)

    # Penalty terms from both layers (dense2 has all strengths at 0)
    regularization_loss = (
        loss_activation.loss.regularization_loss(dense1) +
        loss_activation.loss.regularization_loss(dense2)
    )
    loss = data_loss + regularization_loss
    # Predicted class = column with the largest softmax value
    predictions = np.argmax(loss_activation.output , axis=1)

    # If the targets are one-hot (2-D), reduce them to class indices
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions==y)

    # Report every 100th epoch, including epoch 0
    if not epoch % 100:
        print(f'epoch: {epoch}, ' + f'acc: {accuracy:.3f}, ' + f'loss:{loss:.3f} (data_loss:{data_loss:.3f} , regularization_loss:{regularization_loss:.3f}), '  + f'lr: {optimizer.current_learning_rate}')

    # Backward pass, in the reverse order of the forward pass
    loss_activation.backward(loss_activation.output , y)
    dense2.backward(loss_activation.dinputs)
    dropout1.backward(dense2.dinputs)
    activation1.backward(dropout1.dinputs)
    dense1.backward(activation1.dinputs)

    # Optimizer step: refresh the learning rate, update both layers,
    # then advance the iteration counter
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()


# Evaluation on a second, smaller draw from spiral_data
X_test, y_test = spiral_data(samples=100, classes=3)

# Fresh loss object; the trained layers are reused as they are
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()
dense1.forward(X_test)

activation1.forward(dense1.output)

# Dropout is not applied here: the activation feeds dense2 directly
dense2.forward(activation1.output)

# Data loss only: no regularization term is added here
loss = loss_activation.forward(dense2.output, y_test)

predictions = np.argmax(loss_activation.output , axis=1)

if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions==y_test)
print(f'Test Acc: {accuracy:.3f}, ' + f'loss:{loss:.3f}')



epoch: 0, acc: 0.366, loss:1.099 (data_loss:1.099 , regularization_loss:0.000), lr: 0.05
epoch: 100, acc: 0.620, loss:0.881 (data_loss:0.849 , regularization_loss:0.032), lr: 0.04999752512250644
epoch: 200, acc: 0.637, loss:0.808 (data_loss:0.765 , regularization_loss:0.043), lr: 0.04999502549496326
epoch: 300, acc: 0.673, loss:0.783 (data_loss:0.737 , regularization_loss:0.045), lr: 0.049992526117345455
epoch: 400, acc: 0.665, loss:0.774 (data_loss:0.728 , regularization_loss:0.046), lr: 0.04999002698961558
epoch: 500, acc: 0.673, loss:0.760 (data_loss:0.716 , regularization_loss:0.044), lr: 0.049987528111736124
epoch: 600, acc: 0.694, loss:0.759 (data_loss:0.715 , regularization_loss:0.044), lr: 0.049985029483669646
epoch: 700, acc: 0.698, loss:0.709 (data_loss:0.665 , regularization_loss:0.043), lr: 0.049982531105378675
epoch: 800, acc: 0.689, loss:0.737 (data_loss:0.694 , regularization_loss:0.042), lr: 0.04998003297682575
epoch: 900, acc: 0.714, loss:0.721 (data_loss:0.679 , regul