In [2]:
import numpy as np

In [3]:
class Layer_Dense:
    """Fully connected layer: output = inputs . weights + biases."""

    def __init__(self, n_inputs, n_neurons):
        """Create the trainable parameters.

        n_inputs:  number of features per sample fed into the layer.
        n_neurons: number of neurons, i.e. output features per sample.
        """
        # Weights are stored as (n_inputs, n_neurons) so that a batch shaped
        # (samples, n_inputs) can be multiplied without a transpose. Small
        # random values keep the neurons from starting out identical.
        self.weights = np.random.randn(n_inputs, n_neurons) * 0.01

        # One zero-initialised bias per neuron, kept as a row vector so it
        # broadcasts over the sample axis.
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute the layer output for `inputs` and store it in `self.output`."""
        # The inputs are needed again in backward() for the weight gradient.
        self.inputs = inputs

        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Back-propagate through the layer.

        dvalues: gradient of the loss with respect to this layer's output
                 (the error signal arriving from the layer ahead).

        Stores `dweights`, `dbiases` and `dinputs` on the instance.
        """
        upstream = dvalues

        # Error signal handed to the PREVIOUS layer.
        self.dinputs = np.dot(upstream, self.weights.T)

        # Each bias feeds its neuron directly, so its gradient is the
        # upstream gradient summed over the samples.
        self.dbiases = np.sum(upstream, axis=0, keepdims=True)

        # Each weight's gradient combines the input it scaled with the
        # upstream gradient of the neuron it feeds.
        self.dweights = np.dot(self.inputs.T, upstream)

In [4]:
class Activation_ReLU:
    """Rectified linear unit: keeps positive values, outputs 0 otherwise."""

    def forward(self, inputs):
        """Apply ReLU element-wise and store the result in `self.output`."""
        # backward() needs to know which inputs were non-positive.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Propagate the upstream gradient `dvalues` through ReLU.

        Stores the result in `self.dinputs`; `dvalues` itself is not modified.
        """
        # ReLU has zero slope wherever its input was <= 0.
        blocked = self.inputs <= 0

        # Work on a copy so the caller's gradient array stays intact.
        grad = dvalues.copy()
        grad[blocked] = 0
        self.dinputs = grad

In [5]:
class Activation_Softmax:
    """Softmax activation: turns each row of scores into probabilities."""

    def forward(self, inputs):
        """Compute row-wise softmax of `inputs` and store it in `self.output`."""
        # Shifting every row by its own maximum keeps the exponentials from
        # overflowing; the shift cancels out in the normalisation below.
        row_max = np.max(inputs, axis=1, keepdims=True)
        shifted = inputs - row_max

        # Unnormalised probabilities.
        exp_values = np.exp(shifted)

        # Divide by the per-sample total so every row sums to 1.
        row_totals = np.sum(exp_values, axis=1, keepdims=True)
        self.output = exp_values / row_totals

In [6]:
class Loss_CategoricalCrossentropy:
    """Categorical cross-entropy loss over predicted class probabilities."""

    def forward(self, y_pred, y_true):
        """Return the per-sample negative log-likelihoods.

        y_pred: predicted probabilities, one row per sample and one column
                per class (e.g. the Softmax output).
        y_true: target labels, either 1-D class indices (e.g. [0, 1, 1]) or
                2-D one-hot rows (e.g. [[1, 0], [0, 1], [0, 1]]). Plain
                Python lists are accepted as well as arrays.

        Raises ValueError if `y_true` is neither 1-D nor 2-D.
        """
        samples = len(y_pred)

        # Accept lists as well as arrays, and reject unsupported label shapes
        # up front; otherwise neither branch below would run and the method
        # would fail later with an UnboundLocalError.
        y_true = np.asarray(y_true)
        if y_true.ndim not in (1, 2):
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got an array with {y_true.ndim} dimension(s)"
            )

        # Clip to keep log() finite: log(0) is -inf. The upper bound is
        # clipped by the same amount as the lower bound.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if y_true.ndim == 1:
            # Class indices: pick the predicted probability of the true class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        else:
            # One-hot rows: the mask keeps only the true-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def calculate(self, output, y):
        """Return the mean loss over all samples in the batch."""
        sample_losses = self.forward(output, y)
        data_loss = np.mean(sample_losses)
        return data_loss

In [8]:
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """
    Softmax activation fused with categorical cross-entropy loss.

    Treating the two as one step gives a faster backward pass.
    """

    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run softmax on `inputs`, keep it in `self.output`, return the mean loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Store the gradient of the combined softmax + loss in `self.dinputs`.

        dvalues: the predicted probabilities (the training loop passes
                 `self.output` here).
        y_true:  class indices (1-D) or one-hot rows (2-D).
        """
        samples = len(dvalues)

        # One-hot rows are reduced to the index of their largest entry.
        labels = y_true
        if len(labels.shape) == 2:
            labels = np.argmax(labels, axis=1)

        # Work on a copy so the caller's array is left untouched.
        grad = dvalues.copy()

        # predicted - true: subtract 1 at each sample's true class.
        grad[range(samples), labels] -= 1

        # Averaging over the batch keeps the step size independent of how
        # many samples were used.
        self.dinputs = grad / samples

In [10]:
class Optimizer_SGD:
    """Plain stochastic gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate=1.0):
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Move `layer.weights` and `layer.biases` one step against their gradients.

        Reads `layer.dweights` / `layer.dbiases`, so the layer's backward()
        must have run first. The parameter arrays are updated in place.
        """
        # Negated learning rate: each parameter moves by step_scale * gradient.
        step_scale = -self.learning_rate
        layer.weights += step_scale * layer.dweights
        layer.biases += step_scale * layer.dbiases

In [11]:
# Creates a non-linear (spiral) dataset: `classes` classes with `samples` 2-D points each
def create_data(samples, classes):
    """Generate a 2-D spiral dataset.

    samples: number of points per class.
    classes: number of classes (one spiral arm each).

    Returns (X, y): X has shape (samples*classes, 2) and holds the point
    coordinates; y has shape (samples*classes,), dtype uint8, and holds the
    class index of each point. Points are grouped by class.
    """
    total = samples * classes
    X = np.zeros((total, 2))
    y = np.zeros(total, dtype='uint8')

    # Distance from the origin grows from 0 to 1 along every arm.
    radius = np.linspace(0.0, 1, samples)

    for label in range(classes):
        first = samples * label
        rows = slice(first, first + samples)

        # Each arm sweeps its own 4-radian range of angles, with Gaussian jitter.
        jitter = np.random.randn(samples) * 0.2
        angle = np.linspace(label * 4, (label + 1) * 4, samples) + jitter

        X[rows] = np.c_[radius * np.sin(angle), radius * np.cos(angle)]
        y[rows] = label
    return X, y

# Dataset: 100 points for each of 3 classes, 2 features (x, y coordinates) per point.
X, y = create_data(100, 3)

# Network: 2 input features -> 64 hidden neurons (ReLU) -> 3 class scores.
# The layers are built in this order because each one draws random weights.
dense1 = Layer_Dense(2, 64)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(64, 3)

# Softmax and cross-entropy are applied together on the output layer.
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

optimizer = Optimizer_SGD(learning_rate=1.0)

# Full-batch training: every epoch uses the whole dataset once.
for epoch in range(10001):
    # --- forward ---
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss = loss_activation.forward(dense2.output, y)

    # Accuracy: share of samples whose most probable class matches the label.
    predictions = np.argmax(loss_activation.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Report progress every 1000 epochs (including epoch 0).
    if epoch % 1000 == 0:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}')

    # --- backward: walk the network in reverse, handing each gradient down ---
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # --- parameter update ---
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)

epoch: 0, acc: 0.177, loss: 1.099
epoch: 1000, acc: 0.940, loss: 0.154
epoch: 2000, acc: 0.987, loss: 0.051
epoch: 3000, acc: 0.993, loss: 0.036
epoch: 4000, acc: 0.993, loss: 0.029
epoch: 5000, acc: 0.993, loss: 0.025
epoch: 6000, acc: 0.993, loss: 0.023
epoch: 7000, acc: 0.993, loss: 0.022
epoch: 8000, acc: 0.993, loss: 0.020
epoch: 9000, acc: 0.993, loss: 0.020
epoch: 10000, acc: 0.993, loss: 0.019
