In [1]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

# NOTE(review): presumably sets up NumPy defaults (e.g. RNG seed / dtype) so
# runs are repeatable -- confirm against the nnfs package documentation.
nnfs.init()

In [2]:

class DenseLayer:
    """Fully connected layer: ``output = np.dot(inputs, weights) + biases``.

    After construction the layer holds ``weights`` of shape
    ``(n_inputs, n_neurons)`` and ``biases`` of shape ``(1, n_neurons)``.
    """

    def __init__(self, n_inputs, n_neurons, weight_initializer='random'):
        """Sample the initial weights and create zero biases.

        Args:
            n_inputs: Number of features feeding into the layer.
            n_neurons: Number of neurons (output features) in the layer.
            weight_initializer: ``'random'`` scales standard-normal draws by
                0.01; ``'xavier'`` scales them by
                ``sqrt(2 / (n_inputs + n_neurons))``.

        Raises:
            ValueError: If ``weight_initializer`` is neither of the above.
        """
        # Work out the standard deviation first, then draw once
        if weight_initializer == 'xavier':
            spread = np.sqrt(2 / (n_inputs + n_neurons))
        elif weight_initializer == 'random':
            spread = 0.01
        else:
            raise ValueError("Invalid weight initializer")

        self.weights = spread * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store ``inputs`` and compute the layer output into ``self.output``."""
        # The inputs are needed again when computing the weight gradient
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Compute ``dweights``, ``dbiases`` and ``dinputs`` from ``dvalues``.

        Args:
            dvalues: Gradient of the loss with respect to this layer's output.
        """
        cached_inputs = self.inputs
        # Parameter gradients
        self.dweights = np.dot(cached_inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient handed back to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)

In [3]:
class ReLUActivation:
    """Rectified linear unit: passes positive values, outputs 0 otherwise."""

    def forward(self, inputs):
        """Store ``inputs`` and write ``max(0, inputs)`` to ``self.output``."""
        # Kept so backward() knows which units were inactive
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Write the gradient with respect to the inputs to ``self.dinputs``.

        Args:
            dvalues: Gradient of the loss with respect to this layer's output.
        """
        # Units whose input was zero or below pass no gradient
        inactive = self.inputs <= 0
        # Work on a copy so the caller's array is left untouched
        gradient = dvalues.copy()
        gradient[inactive] = 0
        self.dinputs = gradient

In [4]:
class SoftmaxActivation:
    """Row-wise softmax: turns each row of scores into a probability vector."""

    def forward(self, inputs):
        """Store ``inputs`` and write the per-row softmax to ``self.output``.

        Args:
            inputs: 2-D array of scores, one row per sample.
        """
        # Save input values for backward pass
        self.inputs = inputs

        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large scores
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)

        # Normalize probabilities for each sample
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Write the gradient with respect to the inputs to ``self.dinputs``.

        For one sample with softmax output ``s`` and upstream gradient ``d``
        the Jacobian is ``diag(s) - s s^T``, so the product with ``d`` reduces
        to ``s * (d - sum(s * d))``. Using that closed form avoids building a
        Jacobian per sample in a Python loop, and the result is no longer
        written into an array of ``dvalues``' dtype (which truncated the
        gradient to zeros when ``dvalues`` held integers).

        Args:
            dvalues: Gradient of the loss with respect to ``self.output``;
                same shape as ``self.output``.
        """
        dvalues = np.asarray(dvalues)

        # Per-sample dot product s . d, kept as a column for broadcasting
        weighted_total = np.sum(self.output * dvalues, axis=1, keepdims=True)

        self.dinputs = self.output * (dvalues - weighted_total)

In [5]:
class Loss:
    """Base class for losses.

    Subclasses provide ``forward(output, y)``, which returns one loss value
    per sample; ``calculate`` averages those values.
    """

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        Args:
            output: Model predictions, forwarded unchanged to ``forward``.
            y: Ground-truth targets, forwarded unchanged to ``forward``.

        Returns:
            The mean over all values returned by ``self.forward(output, y)``.
        """
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

In [6]:

class CategoricalCrossentropyLoss(Loss):
    """Categorical cross-entropy for sparse or one-hot encoded labels."""

    # Predictions are clipped to [_EPSILON, 1 - _EPSILON] before taking a
    # logarithm or dividing, so a probability of exactly 0 cannot produce
    # inf/nan.
    _EPSILON = 1e-7

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Args:
            y_pred: 2-D array of predicted probabilities, one row per sample.
            y_true: Either a 1-D sequence of class indices or a 2-D one-hot
                (or otherwise weighted) label matrix.

        Returns:
            1-D array with one loss value per sample.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        # Accept plain lists as well as arrays
        y_true = np.asarray(y_true)

        # Number of samples in a batch
        samples = len(y_pred)

        # Clip data to prevent log(0)
        y_pred_clipped = np.clip(y_pred, self._EPSILON, 1 - self._EPSILON)

        if y_true.ndim == 1:
            # Sparse labels: pick the predicted probability of the true class
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: mask out everything but the true class
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {y_true.ndim} dimension(s)"
            )

        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Write the gradient with respect to the predictions to ``self.dinputs``.

        Args:
            dvalues: 2-D array of predicted probabilities, one row per sample.
            y_true: Either a 1-D sequence of class indices or a 2-D one-hot
                label matrix.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)

        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample, counted on the first sample
        labels = len(dvalues[0])

        if y_true.ndim == 1:
            # Sparse labels: expand into one-hot rows
            y_true = np.eye(labels)[y_true]
        elif y_true.ndim != 2:
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {y_true.ndim} dimension(s)"
            )

        # Same clipping as forward(): without it a predicted probability of
        # exactly 0 gives -0/0 = nan for the other classes and -1/0 = -inf
        # for the true class
        safe_dvalues = np.clip(dvalues, self._EPSILON, 1 - self._EPSILON)

        # Gradient, averaged over the batch
        self.dinputs = -y_true / safe_dvalues / samples

In [7]:
class SoftmaxCrossentropyActivation:
    """Softmax activation fused with categorical cross-entropy loss.

    Fusing the two lets ``backward`` use the simplified gradient
    ``(predictions - one_hot_targets) / batch_size``.
    """

    def __init__(self):
        """Create the wrapped softmax activation and cross-entropy loss."""
        self.activation = SoftmaxActivation()
        self.loss = CategoricalCrossentropyLoss()

    def forward(self, inputs, y_true):
        """Apply softmax to ``inputs`` and return the mean loss.

        Args:
            inputs: Scores coming from the last dense layer.
            y_true: Ground-truth labels passed on to the loss.

        Returns:
            The value of ``self.loss.calculate`` for the softmax output.
        """
        softmax = self.activation
        softmax.forward(inputs)
        # Expose the probabilities so callers can reuse them in backward()
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Write the gradient with respect to the scores to ``self.dinputs``.

        Args:
            dvalues: Softmax probabilities, one row per sample.
            y_true: Class indices (1-D) or one-hot rows (2-D).
        """
        batch_size = len(dvalues)

        # One-hot rows are reduced to class indices
        targets = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        # Subtract 1 at the true class on a copy, leaving dvalues intact
        gradient = dvalues.copy()
        gradient[range(batch_size), targets] -= 1

        # Average over the batch
        self.dinputs = gradient / batch_size

In [8]:
class NeuralNetwork:
    """Two-layer classifier: dense -> ReLU -> dense -> softmax/cross-entropy."""

    def __init__(self, n_inputs=2, n_hidden=3, n_classes=3):
        """Build the layers.

        Args:
            n_inputs: Number of input features per sample.
            n_hidden: Number of neurons in the hidden dense layer.
            n_classes: Number of output classes.
        """
        self.dense1 = DenseLayer(n_inputs, n_hidden)
        self.activation1 = ReLUActivation()
        self.dense2 = DenseLayer(n_hidden, n_classes)
        self.loss_activation = SoftmaxCrossentropyActivation()

    def forward(self, X, y=None):
        """Run a forward pass.

        The labels are taken from the ``y`` argument rather than from a
        module-level variable, so the result never depends on which ``y``
        happens to exist in the notebook namespace.

        Args:
            X: Input samples, one row per sample.
            y: Ground-truth labels. When omitted, no loss is computed.

        Returns:
            The mean loss when ``y`` is given; otherwise the softmax
            probabilities for ``X``.
        """
        # Forward pass through the layers
        self.dense1.forward(X)
        self.activation1.forward(self.dense1.output)
        self.dense2.forward(self.activation1.output)
        scores = self.dense2.output

        if y is None:
            # Prediction only: run just the softmax half of the fused layer
            # and keep its output where backward() expects to find it
            softmax = self.loss_activation.activation
            softmax.forward(scores)
            self.loss_activation.output = softmax.output
            return softmax.output

        return self.loss_activation.forward(scores, y)

    def backward(self, y):
        """Back-propagate from the most recent forward pass.

        Args:
            y: Ground-truth labels for the samples used in that forward pass.
        """
        # Backward pass through the layers, last to first
        self.loss_activation.backward(self.loss_activation.output, y)
        self.dense2.backward(self.loss_activation.dinputs)
        self.activation1.backward(self.dense2.dinputs)
        self.dense1.backward(self.activation1.dinputs)

    def train(self, X, y, learning_rate=0.01, epochs=1000):
        """Fit the network with full-batch gradient descent.

        Args:
            X: Training samples, one row per sample.
            y: Ground-truth labels for ``X``.
            learning_rate: Step size applied to every gradient.
            epochs: Number of passes over ``X``.
        """
        for epoch in range(epochs):
            # Forward and backward pass on the labels given to train()
            loss = self.forward(X, y)
            self.backward(y)

            # Update weights and biases using gradient descent
            for layer in (self.dense1, self.dense2):
                layer.weights -= learning_rate * layer.dweights
                layer.biases -= learning_rate * layer.dbiases

            # Print loss for every 100 epochs
            if epoch % 100 == 0:
                print(f'Epoch {epoch}, Loss: {loss}')

# Generate the dataset: spiral_data with samples=100 and classes=3
X, y = spiral_data(samples=100, classes=3)

# Build the network, then fit it with the default learning rate and epochs
model = NeuralNetwork()
model.train(X, y)

print()

# Show the gradients left over from the final training step
print("Gradients after training:")
for heading, gradient in (
    ("Dense1 Weights:\n", model.dense1.dweights),
    ("Dense1 Biases:\n", model.dense1.dbiases),
    ("Dense2 Weights:\n", model.dense2.dweights),
    ("Dense2 Biases:\n", model.dense2.dbiases),
):
    print(heading, gradient)

Epoch 0, Loss: 1.0986104011535645
Epoch 100, Loss: 1.098610281944275
Epoch 200, Loss: 1.0986099243164062
Epoch 300, Loss: 1.0986095666885376
Epoch 400, Loss: 1.098609209060669
Epoch 500, Loss: 1.0986088514328003
Epoch 600, Loss: 1.0986086130142212
Epoch 700, Loss: 1.098608136177063
Epoch 800, Loss: 1.0986080169677734
Epoch 900, Loss: 1.0986075401306152

Gradients after training:
Dense1 Weights:
 [[ 1.9096999e-04  7.8464247e-05  8.8730332e-05]
 [ 2.6465370e-04 -1.0290413e-05 -1.4555988e-05]]
Dense1 Biases:
 [[-1.7262298e-04  1.7588717e-04 -4.4025845e-05]]
Dense2 Weights:
 [[ 4.9024700e-05  1.9212597e-04 -2.4115064e-04]
 [-2.3269857e-05 -6.0852217e-05  8.4122090e-05]
 [-4.6445206e-05  9.7077755e-05 -5.0632541e-05]]
Dense2 Biases:
 [[-1.7051352e-06 -2.2788299e-06  4.0004961e-06]]
