In [None]:
import numpy as np
import matplotlib.pyplot as plt
import nnfs.datasets

# Seed NumPy's global RNG so that everything drawn from np.random below (the
# weight initialisation in Dense, and the dataset if nnfs samples from the
# same global generator) is identical on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

# X holds the 2-D points that are plotted below, y the per-point class labels
# used for colouring.
# NOTE(review): arguments are presumably (samples per class, classes) — confirm against nnfs.
X, y = nnfs.datasets.spiral_data(100, 3)

# Explicit Axes interface so the figure carries its own title and labels.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap="brg")
ax.set(title="Spiral dataset", xlabel="X[:, 0]", ylabel="X[:, 1]")
plt.show()

In [None]:
class Dense:
    """Fully connected layer computing ``inputs @ weigths + biases``.

    The attribute spellings ``weigths`` / ``dweigths`` and the parameter name
    ``n_nueorns`` are kept as they are because other cells in this notebook
    (the optimizers) access them by name.
    """

    def __init__(self, n_inputs, n_nueorns):
        """Create the layer parameters.

        Args:
            n_inputs: Number of input features per sample.
            n_nueorns: Number of neurons (output features) of the layer.
        """
        # Small random weights of shape (n_inputs, n_nueorns); biases start at zero.
        self.weigths = 0.01 * np.random.randn(n_inputs, n_nueorns)
        self.biases = np.zeros((1, n_nueorns))

    def forward(self, inputs) -> np.ndarray:
        """Run the forward pass.

        Stores ``inputs`` (needed by ``backward``) and the result in
        ``self.outputs``.

        Args:
            inputs: Array of shape (samples, n_inputs).

        Returns:
            The layer output, the same object as ``self.outputs``.
        """
        self.inputs = inputs
        self.outputs = np.dot(inputs, self.weigths) + self.biases
        # Returning the result makes the declared return type truthful;
        # callers that only read ``self.outputs`` are unaffected.
        return self.outputs

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` through the layer.

        Sets ``self.dweigths``, ``self.dbiases`` (parameter gradients) and
        ``self.dinputs`` (gradient with respect to the layer inputs).

        Args:
            dvalues: Gradient of the loss with respect to this layer's
                outputs, shape (samples, n_nueorns).
        """
        self.dweigths = np.dot(self.inputs.T, dvalues)
        # Biases are broadcast over samples, so their gradient sums over axis 0.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dinputs = np.dot(dvalues, self.weigths.T)

In [None]:
class ReLU:
    """Rectified linear activation: element-wise ``max(0, x)``."""

    def forward(self, x):
        """Compute the activation and remember the inputs for ``backward``.

        Args:
            x: Array-like of pre-activation values.
        """
        # Coerce to an ndarray so the comparison in ``backward`` yields a
        # boolean mask even when a plain (nested) list is passed in.
        self.inputs = np.asarray(x)
        self.outputs = np.maximum(0, self.inputs)

    def backward(self, dvalues: np.ndarray):
        """Back-propagate ``dvalues`` through the activation.

        The gradient is passed through where the stored input was positive
        and zeroed where it was ``<= 0``. The result is stored in
        ``self.dinputs``; ``dvalues`` itself is not modified.

        Args:
            dvalues: Gradient with respect to this activation's outputs;
                must have the same shape as the inputs given to ``forward``.
        """
        # np.array always copies, so the caller's gradient buffer stays intact
        # (and array-likes such as lists are accepted as well).
        self.dinputs = np.array(dvalues)
        self.dinputs[self.inputs <= 0] = 0

In [None]:
class Loss:
    """Base class for losses; subclasses are expected to provide ``forward``."""

    def calculate(self, y_pred, y_true):
        """Return the mean of the values produced by ``self.forward``.

        Args:
            y_pred: Predictions, forwarded unchanged to ``self.forward``.
            y_true: Targets, forwarded unchanged to ``self.forward``.
        """
        return np.mean(self.forward(y_pred, y_true))


class SoftMax:
    """Row-wise softmax activation."""

    def forward(self, inputs):
        """Store the softmax of every row of ``inputs`` in ``self.outputs``.

        Args:
            inputs: 2-D array; normalisation runs along axis 1.
        """
        # Shifting each row by its maximum keeps every exponent argument <= 0,
        # so np.exp cannot overflow; the shift cancels out in the ratio below.
        row_max = np.max(inputs, axis=1, keepdims=True)
        shifted_exp = np.exp(inputs - row_max)
        row_total = np.sum(shifted_exp, axis=1, keepdims=True)
        self.outputs = shifted_exp / row_total


class CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot targets."""

    def forward(self, y_pred, y_true):
        """Return the per-sample negative log-likelihood.

        Args:
            y_pred: Predicted class probabilities, shape (samples, classes).
            y_true: Either class indices of shape (samples,) or one-hot
                rows of shape (samples, classes).

        Returns:
            1-D array with one loss value per sample.
        """
        y_true = np.asarray(y_true)
        # Clip away exact 0 and 1 so the logarithm below stays finite.
        y_clip = np.clip(y_pred, 1e-9, 1 - 1e-9)
        if len(y_true.shape) == 1:
            # Sparse labels: pick the probability of the true class per row.
            confidence_val = y_clip[range(len(y_pred)), y_true]
        else:
            # One-hot labels: mask with the targets and reduce per sample
            # (axis=1). The clipped probabilities are used here as well so
            # that a predicted 0 cannot produce log(0).
            confidence_val = np.sum(y_clip * y_true, axis=1)
        return -np.log(confidence_val)

    def backward(self, dvalues, y_true):
        """Compute the gradient of the loss with respect to the predictions.

        Stores ``-y_true / dvalues`` divided by the number of samples in
        ``self.dinputs``.

        Args:
            dvalues: Predicted probabilities the loss was evaluated on,
                shape (samples, classes).
            y_true: Class indices of shape (samples,) or one-hot rows.
        """
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        # Number of classes is the length of one prediction row.
        labels = len(dvalues[0])
        if len(y_true.shape) == 1:
            # Expand class indices to one-hot rows.
            y_true = np.eye(labels)[y_true]
        self.dinputs = -y_true / dvalues
        # Normalise by the batch size.
        self.dinputs = self.dinputs / samples


class SotMax_Loss_CategoricalCrossEntropy:
    """Softmax activation combined with categorical cross-entropy loss."""

    def __init__(self):
        """Create the wrapped loss and activation objects."""
        self.loss = CategoricalCrossEntropy()
        self.activation = SoftMax()

    def forward(self, inputs, y_true):
        """Apply softmax to ``inputs`` and return the loss against ``y_true``.

        The softmax result is kept in ``self.outputs``.
        """
        softmax = self.activation
        softmax.forward(inputs)
        self.outputs = softmax.outputs
        return self.loss.calculate(self.outputs, y_true)

    def backward(self, dvalues, y_true):
        """Store ``(dvalues - one_hot(y_true)) / samples`` in ``self.dinputs``.

        Args:
            dvalues: 2-D array with one row per sample; it is copied, not
                modified.
            y_true: Class indices (1-D) or one-hot rows (2-D).
        """
        n_samples = len(dvalues)
        # One-hot targets are reduced to class indices first.
        if len(y_true.shape) == 2:
            class_idx = np.argmax(y_true, axis=1)
        else:
            class_idx = y_true
        grad = dvalues.copy()
        # Subtracting 1 at the true-class column equals subtracting the one-hot row.
        grad[range(n_samples), class_idx] -= 1
        self.dinputs = grad / n_samples

In [None]:
class GD:
    """Plain gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate=0.5):
        """Remember the step size used for every update."""
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Move the layer's parameters against their gradients, in place.

        Args:
            layer: Object exposing ``weigths``, ``biases``, ``dweigths`` and
                ``dbiases`` arrays.
        """
        step = self.learning_rate
        layer.weigths -= step * layer.dweigths
        layer.biases -= step * layer.dbiases

## Simple Gradient Descent Algorithm


In [None]:
# Two-layer network: X.shape[1] inputs -> 50 ReLU units -> 3 softmax outputs.
dense_1 = Dense(X.shape[1], 50)
activation_1 = ReLU()

dense_2 = Dense(50, 3)
activation_2 = SotMax_Loss_CategoricalCrossEntropy()

optimizer = GD()

# Reduce one-hot targets to class indices once, up front, instead of
# overwriting the notebook-level ``y`` from inside the loop.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(10001):
    # Forward pass.
    dense_1.forward(X)
    activation_1.forward(dense_1.outputs)

    dense_2.forward(activation_1.outputs)
    loss = activation_2.forward(dense_2.outputs, y_labels)

    predictions = np.argmax(activation_2.outputs, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % 100:
        print(f"epoch: {epoch}, " + f"acc: {accuracy:.3f}, " + f"loss: {loss:.3f}")

    # Backward pass. The combined softmax/cross-entropy gradient is
    # (softmax output - one-hot target) / samples, so backward must receive
    # the softmax probabilities, not the raw logits in dense_2.outputs.
    activation_2.backward(activation_2.outputs, y_labels)
    dense_2.backward(activation_2.dinputs)
    activation_1.backward(dense_2.dinputs)
    dense_1.backward(activation_1.dinputs)

    optimizer.update_params(dense_1)
    optimizer.update_params(dense_2)

In [None]:
class GD_decay:
    """Gradient descent whose step size shrinks as ``lr / (1 + decay * iterations)``."""

    def __init__(self, learning_rate=1.0, decay=0.0):
        """Store the base rate and decay; the iteration counter starts at 0."""
        self.iterations = 0
        self.decay = decay
        self.learning_rate = learning_rate
        # Until the first pre_update_params call the effective rate is the base rate.
        self.current_lr = learning_rate

    def pre_update_params(self):
        """Refresh ``current_lr``; a falsy ``decay`` leaves it untouched."""
        if not self.decay:
            return
        self.current_lr = self.learning_rate / (1 + self.decay * self.iterations)

    def update_params(self, layer):
        """Apply one in-place descent step to ``layer`` using ``current_lr``."""
        rate = self.current_lr
        layer.weigths -= rate * layer.dweigths
        layer.biases -= rate * layer.dbiases

    def post_update_params(self):
        """Advance the iteration counter by one."""
        self.iterations += 1

## GD With Decay


In [None]:
# Fresh two-layer network: X.shape[1] inputs -> 50 ReLU units -> 3 softmax outputs.
dense_1 = Dense(X.shape[1], 50)
activation_1 = ReLU()

dense_2 = Dense(50, 3)
activation_2 = SotMax_Loss_CategoricalCrossEntropy()

optimizer = GD_decay(decay=1e-3)

# Reduce one-hot targets to class indices once, up front, instead of
# overwriting the notebook-level ``y`` from inside the loop.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(10001):
    # Forward pass.
    dense_1.forward(X)
    activation_1.forward(dense_1.outputs)

    dense_2.forward(activation_1.outputs)
    loss = activation_2.forward(dense_2.outputs, y_labels)

    predictions = np.argmax(activation_2.outputs, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % 100:
        print(
            f"epoch: {epoch}, "
            + f"acc: {accuracy:.3f}, "
            + f"loss: {loss:.3f}, "
            + f"lr: {optimizer.current_lr}"
        )

    # Backward pass. The combined softmax/cross-entropy gradient is
    # (softmax output - one-hot target) / samples, so backward must receive
    # the softmax probabilities, not the raw logits in dense_2.outputs.
    activation_2.backward(activation_2.outputs, y_labels)
    dense_2.backward(activation_2.dinputs)
    activation_1.backward(dense_2.dinputs)
    dense_1.backward(activation_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optimizer.pre_update_params()
    optimizer.update_params(dense_1)
    optimizer.update_params(dense_2)
    optimizer.post_update_params()

## Gradient Descent With Decay and Momentum


In [None]:
class GD_decay_momentum:
    """Gradient descent with learning-rate decay and optional momentum."""

    def __init__(self, learning_rate=1.0, decay=0.0, momentum=0.0):
        """Configure the optimizer.

        Args:
            learning_rate: Base step size.
            decay: Decay factor used as ``lr / (1 + decay * iterations)``.
            momentum: Fraction of the previous update carried into the next
                one; ``0`` gives plain gradient descent.
        """
        self.learning_rate = learning_rate
        self.current_lr = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.iterations = 0

    def pre_update_params(self):
        """Recompute ``current_lr`` for the current iteration."""
        # With decay == 0 the denominator is 1.0, so the rate is unchanged.
        self.current_lr = self.learning_rate / (1.0 + self.decay * self.iterations)

    # The annotation is a string so the class can be defined without ``Dense``
    # already existing in the namespace.
    def update_params(self, layer: "Dense"):
        """Update ``layer.weigths`` and ``layer.biases`` in place.

        With a non-zero momentum the previous update is stored on the layer
        as ``weight_momentums`` / ``bias_momentums`` and blended into the new
        one. With ``momentum == 0`` a plain gradient step is applied; without
        that fallback the parameters would never change.
        """
        if self.momentum:
            # Lazily create the momentum buffers on first use.
            if not hasattr(layer, "weight_momentums"):
                layer.weight_momentums = np.zeros_like(layer.weigths)
                layer.bias_momentums = np.zeros_like(layer.biases)

            weight_updates = (
                self.momentum * layer.weight_momentums
                - self.current_lr * layer.dweigths
            )
            bias_updates = (
                self.momentum * layer.bias_momentums - self.current_lr * layer.dbiases
            )

            # Remember this step for the next call.
            layer.weight_momentums = weight_updates
            layer.bias_momentums = bias_updates
        else:
            # No momentum: vanilla gradient step.
            weight_updates = -self.current_lr * layer.dweigths
            bias_updates = -self.current_lr * layer.dbiases

        layer.weigths += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Advance the iteration counter by one."""
        self.iterations += 1

In [None]:
# Fresh two-layer network: X.shape[1] inputs -> 50 ReLU units -> 3 softmax outputs.
dense_1 = Dense(X.shape[1], 50)
activation_1 = ReLU()

dense_2 = Dense(50, 3)
activation_2 = SotMax_Loss_CategoricalCrossEntropy()

optimizer = GD_decay_momentum(decay=1e-3, momentum=0.6)

# Reduce one-hot targets to class indices once, up front, instead of
# overwriting the notebook-level ``y`` from inside the loop.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(10001):
    # Forward pass.
    dense_1.forward(X)
    activation_1.forward(dense_1.outputs)

    dense_2.forward(activation_1.outputs)
    loss = activation_2.forward(dense_2.outputs, y_labels)

    predictions = np.argmax(activation_2.outputs, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % 100:
        print(
            f"epoch: {epoch}, "
            + f"acc: {accuracy:.3f}, "
            + f"loss: {loss:.3f}, "
            + f"lr: {optimizer.current_lr}"
        )

    # Backward pass. The combined softmax/cross-entropy gradient is
    # (softmax output - one-hot target) / samples, so backward must receive
    # the softmax probabilities, not the raw logits in dense_2.outputs.
    activation_2.backward(activation_2.outputs, y_labels)
    dense_2.backward(activation_2.dinputs)
    activation_1.backward(dense_2.dinputs)
    dense_1.backward(activation_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optimizer.pre_update_params()
    optimizer.update_params(dense_1)
    optimizer.update_params(dense_2)
    optimizer.post_update_params()

## ADAGRAD Optimizer


In [None]:
import numpy as np

class Optimizer_Adagrad:
    """Adagrad: each parameter's step is divided by the root of its summed squared gradients."""

    def __init__(self, learning_rate=1.0, decay=0.0, epsilon=1e-7):
        """Store the hyper-parameters; the iteration counter starts at 0."""
        self.iterations = 0
        self.epsilon = epsilon
        self.decay = decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Refresh the effective learning rate; call once before updating layers."""
        if not self.decay:
            return
        decay_factor = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Accumulate squared gradients on ``layer`` and apply the scaled step in place."""
        # First visit to this layer: start both caches at zero.
        if not hasattr(layer, "weight_cache"):
            layer.weight_cache = np.zeros_like(layer.weigths)
            layer.bias_cache = np.zeros_like(layer.biases)

        # The caches only ever grow: running sum of squared gradients.
        layer.weight_cache += np.square(layer.dweigths)
        layer.bias_cache += np.square(layer.dbiases)

        rate = self.current_learning_rate
        eps = self.epsilon
        # eps keeps the denominator away from zero.
        weight_denominator = np.sqrt(layer.weight_cache) + eps
        bias_denominator = np.sqrt(layer.bias_cache) + eps
        layer.weigths -= rate * layer.dweigths / weight_denominator
        layer.biases -= rate * layer.dbiases / bias_denominator

    def post_update_params(self):
        """Advance the iteration counter; call once after updating layers."""
        self.iterations += 1

In [None]:
# Fresh two-layer network: X.shape[1] inputs -> 50 ReLU units -> 3 softmax outputs.
dense_1 = Dense(X.shape[1], 50)
activation_1 = ReLU()

dense_2 = Dense(50, 3)
activation_2 = SotMax_Loss_CategoricalCrossEntropy()

optimizer = Optimizer_Adagrad(decay=1e-4)

# Reduce one-hot targets to class indices once, up front, instead of
# overwriting the notebook-level ``y`` from inside the loop.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(10001):
    # Forward pass.
    dense_1.forward(X)
    activation_1.forward(dense_1.outputs)

    dense_2.forward(activation_1.outputs)
    loss = activation_2.forward(dense_2.outputs, y_labels)

    predictions = np.argmax(activation_2.outputs, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % 100:
        print(
            f"epoch: {epoch}, "
            + f"acc: {accuracy:.3f}, "
            + f"loss: {loss:.3f}, "
            + f"lr: {optimizer.current_learning_rate}"
        )

    # Backward pass. The combined softmax/cross-entropy gradient is
    # (softmax output - one-hot target) / samples, so backward must receive
    # the softmax probabilities, not the raw logits in dense_2.outputs.
    activation_2.backward(activation_2.outputs, y_labels)
    dense_2.backward(activation_2.dinputs)
    activation_1.backward(dense_2.dinputs)
    dense_1.backward(activation_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optimizer.pre_update_params()
    optimizer.update_params(dense_1)
    optimizer.update_params(dense_2)
    optimizer.post_update_params()

## RMSProp Optimizer

In [None]:
import numpy as np


class RMSProp:
    """RMSProp: steps are divided by the root of a moving average of squared gradients."""

    def __init__(self, learning_rate=0.001, decay=0.0, epsilon=1e-7, rho=0.9):
        """Store the hyper-parameters; the iteration counter starts at 0."""
        self.iterations = 0
        self.rho = rho
        self.epsilon = epsilon
        self.decay = decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Refresh the effective learning rate; call once before updating layers."""
        if not self.decay:
            return
        decay_factor = 1.0 / (1.0 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update the layer's gradient caches and apply the scaled step in place."""
        # First visit to this layer: start both caches at zero.
        if not hasattr(layer, "weight_cache"):
            layer.weight_cache = np.zeros_like(layer.weigths)
            layer.bias_cache = np.zeros_like(layer.biases)

        keep = self.rho
        # Moving average: ``rho`` of the old cache plus ``1 - rho`` of the new
        # squared gradient.
        layer.weight_cache = keep * layer.weight_cache + (1 - keep) * layer.dweigths**2
        layer.bias_cache = keep * layer.bias_cache + (1 - keep) * layer.dbiases**2

        rate = self.current_learning_rate
        eps = self.epsilon
        # eps keeps the denominator away from zero.
        weight_denominator = np.sqrt(layer.weight_cache) + eps
        bias_denominator = np.sqrt(layer.bias_cache) + eps
        layer.weigths -= rate * layer.dweigths / weight_denominator
        layer.biases -= rate * layer.dbiases / bias_denominator

    def post_update_params(self):
        """Advance the iteration counter; call once after updating layers."""
        self.iterations += 1

In [None]:
# Fresh two-layer network: X.shape[1] inputs -> 50 ReLU units -> 3 softmax outputs.
dense_1 = Dense(X.shape[1], 50)
activation_1 = ReLU()

dense_2 = Dense(50, 3)
activation_2 = SotMax_Loss_CategoricalCrossEntropy()

optimizer = RMSProp(decay=1e-4, rho=0.9)

# Reduce one-hot targets to class indices once, up front, instead of
# overwriting the notebook-level ``y`` from inside the loop.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(10001):
    # Forward pass.
    dense_1.forward(X)
    activation_1.forward(dense_1.outputs)

    dense_2.forward(activation_1.outputs)
    loss = activation_2.forward(dense_2.outputs, y_labels)

    predictions = np.argmax(activation_2.outputs, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % 100:
        print(
            f"epoch: {epoch}, "
            + f"acc: {accuracy:.3f}, "
            + f"loss: {loss:.3f}, "
            + f"lr: {optimizer.current_learning_rate}"
        )

    # Backward pass. The combined softmax/cross-entropy gradient is
    # (softmax output - one-hot target) / samples, so backward must receive
    # the softmax probabilities, not the raw logits in dense_2.outputs.
    activation_2.backward(activation_2.outputs, y_labels)
    dense_2.backward(activation_2.dinputs)
    activation_1.backward(dense_2.dinputs)
    dense_1.backward(activation_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optimizer.pre_update_params()
    optimizer.update_params(dense_1)
    optimizer.update_params(dense_2)
    optimizer.post_update_params()

In [None]:
import numpy as np

# Adam optimizer
class Adam:
    """Adam: bias-corrected moving averages of the gradient and of its square."""

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        """Store the hyper-parameters; the iteration counter starts at 0."""
        self.iterations = 0
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.decay = decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def pre_update_params(self):
        """Refresh the effective learning rate; call once before updating layers."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Update the layer's momentum and cache buffers, then step its parameters in place."""
        # First visit to this layer: create all four buffers, zero-filled.
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weigths)
            layer.weight_cache = np.zeros_like(layer.weigths)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        b1 = self.beta_1
        b2 = self.beta_2
        # iterations is 0 on the first pass, but the correction exponents start at 1.
        step = self.iterations + 1

        # Moving average of the gradients.
        layer.weight_momentums = b1 * layer.weight_momentums + (1 - b1) * layer.dweigths
        layer.bias_momentums = b1 * layer.bias_momentums + (1 - b1) * layer.dbiases

        # Moving average of the squared gradients.
        layer.weight_cache = b2 * layer.weight_cache + (1 - b2) * layer.dweigths**2
        layer.bias_cache = b2 * layer.bias_cache + (1 - b2) * layer.dbiases**2

        # Bias correction compensates for the zero-initialised buffers.
        momentum_scale = 1 - b1 ** step
        cache_scale = 1 - b2 ** step
        weight_m_hat = layer.weight_momentums / momentum_scale
        bias_m_hat = layer.bias_momentums / momentum_scale
        weight_v_hat = layer.weight_cache / cache_scale
        bias_v_hat = layer.bias_cache / cache_scale

        rate = self.current_learning_rate
        eps = self.epsilon
        layer.weigths += -rate * weight_m_hat / (np.sqrt(weight_v_hat) + eps)
        layer.biases += -rate * bias_m_hat / (np.sqrt(bias_v_hat) + eps)

    def post_update_params(self):
        """Advance the iteration counter; call once after updating layers."""
        self.iterations += 1


In [None]:
# Fresh two-layer network: X.shape[1] inputs -> 50 ReLU units -> 3 softmax outputs.
dense_1 = Dense(X.shape[1], 50)
activation_1 = ReLU()

dense_2 = Dense(50, 3)
activation_2 = SotMax_Loss_CategoricalCrossEntropy()

optimizer = Adam(learning_rate=0.02, decay=1e-5)

# Reduce one-hot targets to class indices once, up front, instead of
# overwriting the notebook-level ``y`` from inside the loop.
y_labels = np.argmax(y, axis=1) if len(y.shape) == 2 else y

for epoch in range(10001):
    # Forward pass.
    dense_1.forward(X)
    activation_1.forward(dense_1.outputs)

    dense_2.forward(activation_1.outputs)
    loss = activation_2.forward(dense_2.outputs, y_labels)

    predictions = np.argmax(activation_2.outputs, axis=1)
    accuracy = np.mean(predictions == y_labels)

    if not epoch % 100:
        print(
            f"epoch: {epoch}, "
            + f"acc: {accuracy:.3f}, "
            + f"loss: {loss:.3f}, "
            + f"lr: {optimizer.current_learning_rate}"
        )

    # Backward pass. The combined softmax/cross-entropy gradient is
    # (softmax output - one-hot target) / samples, so backward must receive
    # the softmax probabilities, not the raw logits in dense_2.outputs.
    activation_2.backward(activation_2.outputs, y_labels)
    dense_2.backward(activation_2.dinputs)
    activation_1.backward(dense_2.dinputs)
    dense_1.backward(activation_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optimizer.pre_update_params()
    optimizer.update_params(dense_1)
    optimizer.update_params(dense_2)
    optimizer.post_update_params()