# Artificial Neural Network ANN
Objective: MNIST handwritten digits classifications using a Artificial Neural Network (ANN) without the usage of any libraries.

## Data loading

In [1]:
import random
from typing import Callable
from abc import ABC
from abc import abstractmethod
from datetime import time
from typing import Counter

from sklearn.metrics import confusion_matrix, classification_report

from oli.ml.Activation_functions import relu
from oli.math.math_utility import pretty_print_matrix
from oli.ml.Activation_functions import relu_derivative
from oli.ml.Activation_functions import softmax
from oli.ml.Loss import mean_squared_error_loss_categorical
from oli.ml.Loss import categorical_cross_entropy_loss
from oli.ml.Loss import derivative_categorical_cross_entropy_loss
from oli.ml.utility_functions import argmax

In [2]:
class Image:
    pixels: list[list[int]]
    height: int
    width: int

    def __init__(self, pixels: list[list[int]]):
        self.pixels = pixels
        self.height = len(pixels)
        self.width = len(pixels[0])

    def print(self):
        pretty_print_matrix(self.pixels, label=f"Image with dimensions width = {self.width} x height = {self.height}",
                            max_length=3)

    def get_linearized(self) -> list[int]:
        res: list[int] = []
        for row in self.pixels:
            for item in row:
                res.append(item)
        return res

In [3]:
class MNISTDataset:
    images: list[Image]
    labels: list[int]

    def __init__(self, images: list[Image], labels: list[int]):
        self.images = images
        self.labels = labels

        if len(images) != len(labels):
            raise Exception("Amount of images doesnt match amount of labels.")

    def __iter__(self):
        return ((self.images[i], self.labels[i]) for i in range(len(self.images)))

    def get_linearized_images(self) -> list[list[int]]:
        return [img.get_linearized() for img in self.images]

In [4]:
def read_image_file(path: str) -> list[Image]:
    file_stream = open(path, "rb")
    # Offset 0 - 4 --> 4 bytes
    magic_number: bytes = file_stream.read(4)
    magic_number: int = int.from_bytes(magic_number, byteorder="big", signed=False)

    # Offset 4 - 8 --> 4 bytes
    number_of_images: bytes = file_stream.read(4)
    number_of_images: int = int.from_bytes(number_of_images, byteorder="big", signed=False)

    # Offset 8 - 12 --> 4 bytes
    number_of_rows: bytes = file_stream.read(4)
    number_of_rows: int = int.from_bytes(number_of_rows, byteorder="big", signed=False)

    # Offset 8 - 12 --> 4 bytes
    number_of_columns: bytes = file_stream.read(4)
    number_of_columns: int = int.from_bytes(number_of_columns, byteorder="big", signed=False)

    print(
        f"Loading images:\tMagic number: {magic_number}, Number of images: {number_of_images}, Number of rows: {number_of_rows}, Number of columns: {number_of_columns}")

    images: list[Image] = []
    count = 0
    for image_number in range(number_of_images):
        pixels: list[list[int]] = [[0 for n in range(number_of_columns)] for i in range(number_of_rows)]
        for row_number in range(number_of_rows):
            for column_number in range(number_of_columns):
                pixel: bytes = file_stream.read(1)
                pixel: int = int.from_bytes(pixel, byteorder="big", signed=False)
                pixels[row_number][column_number] = pixel
        images.append(Image(pixels))
        if image_number % 10000 == 0:
            print("Loaded image number", image_number)
    return images

In [5]:
def read_label_file(path: str):
    file_stream = open(path, "rb")
    # Offset 0 - 4 --> 4 bytes
    magic_number: bytes = file_stream.read(4)
    magic_number: int = int.from_bytes(magic_number, byteorder="big", signed=False)

    # Offset 4 - 8 --> 4 bytes
    number_of_items: bytes = file_stream.read(4)
    number_of_items: int = int.from_bytes(number_of_items, byteorder="big", signed=False)

    print(f"Loading labels:\tMagic number: {magic_number}, Number of items: {number_of_items}")

    items: list[int] = []
    for item_index in range(number_of_items):
        item: bytes = file_stream.read(1)
        item: int = int.from_bytes(item, byteorder="big", signed=False)
        items.append(item)

    return items

In [6]:
train_images = read_image_file("../../../data/mnist/train-images.idx3-ubyte")
train_labels = read_label_file("../../../data/mnist/train-labels.idx1-ubyte")
train_set: MNISTDataset = MNISTDataset(train_images, train_labels)

Loading images:	Magic number: 2051, Number of images: 60000, Number of rows: 28, Number of columns: 28
Loaded image number 0
Loaded image number 10000
Loaded image number 20000
Loaded image number 30000
Loaded image number 40000
Loaded image number 50000
Loading labels:	Magic number: 2049, Number of items: 60000


In [7]:
test_images = read_image_file("../../../data/mnist/t10k-images.idx3-ubyte")
test_labels = read_label_file("../../../data/mnist/t10k-labels.idx1-ubyte")
test_set: MNISTDataset = MNISTDataset(test_images, test_labels)

Loading images:	Magic number: 2051, Number of images: 10000, Number of rows: 28, Number of columns: 28
Loaded image number 0
Loading labels:	Magic number: 2049, Number of items: 10000


## Utility: Matrix multiplication

In [8]:
def multiplication(A: list[list[float]], B: list[list[float]]) -> list[list[float]]:
    """
    Function to multiply two 2d matrices.
    :param A: First matrix.
    :param B: Second matrix.
    :return: Matrix product.
    """
    if len(A[0]) != len(B):
        raise Exception(
            f"Multiplication is only possible if the number of columns of A corresponds to the number of rows in B. Columns of A: {len(A[0])} Rows of B: {len(B)}")
    m = len(A)  # Rows of A
    n = len(A[0])  # Columns of A
    n = len(B)  # Rows of B
    p = len(B[0])  # Columns of B
    C = [[0 for _ in range(p)] for _ in range(m)]

    for y in range(0, m):
        for x in range(0, p):
            C[y][x] = 0
            for u in range(0, n):
                a_yu = A[y][u]
                b_ux = B[u][x]
                # print(f"i: {i}, j: {j}, u: {u}, a_iu: {a_iu}, b_uj: {b_uj}")
                C[y][x] += a_yu * b_ux

    return C

In [9]:
A = [
    [1, 2, 3],
    [4, 5, 6]
]
B = [
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
]
pretty_print_matrix(multiplication(A, B))

[
  3.8000000000000003 4.4 5.0 5.6 
  8.3 9.8 11.3 12.799999999999999 
]


# Linear layer
- Activation function $g(x)$
- Amount of inputs ($x$): $n$ (including bias)
- Amount of neurons ($l$): $m$
- For each node: Weights for each incoming edge $w_{x_{0..n}}$ --> For all nodes combined $W$ must have dimensionality $n \times m$

## Feed forward
Prediction node $l_0$: $x_0 \cdot w_{l_0}_{x_0} + x_1 \cdot w_{l_0}_{x_1} + ... + x_n \cdot w_{l_0}_{x_n}$
Prediction node $l_1$: $x_0 \cdot w_{l_1}_{x_0} + x_1 \cdot w_{l_1}_{x_1} + ... + x_n \cdot w_{l_1}_{x_n}$
Prediction node $l_2$: $x_0 \cdot w_{l_1}_{x_0} + x_1 \cdot w_{l_2}_{x_1} + ... + x_n \cdot w_{l_2}_{x_n}$
--> Feed forward as a matrix multiplication:
$$
\hat y = g(
\begin{pmatrix}
    x_0\\
    x_1\\
    ...\\
    x_n
\end{pmatrix}
\begin{bmatrix}
	w_{l_0, x_0} & w_{l_1, x_0} & ... & w_{l_m, x_0} \\
	w_{l_0, x_1} & w_{l_1, x_1} & ... & w_{l_m, x_1} \\
	\vdots & \vdots & \ddots & \vdots \\
	w_{l_0, x_n} & w_{l_1, x_n} & ...& w_{l_m, x_n}
\end{bmatrix})
$$

## Backpropagation
[3b1b explanation](https://www.3blue1brown.com/lessons/backpropagation-calculus)

### Symbols & coding
| Symbol                | Meaning                                                            |
|-----------------------|--------------------------------------------------------------------|
| $w$                   | Weight                                                             |
| $b$                   | Bias                                                               |
| $z$                   | Matrix multiplication product                                      |
| $a$                   | Activation                                                         |
| $C$                   | Total cost of network (Average of costs for each training example) |
| $C_0$ | Cost of sample                                                     |

|Indices| Meaning                     |
|-------|-----------------------------|
| $j$| Current neuron of layer L   |
| $k$| Current neuron of layer L-1 |

### Objective
Goal: How sensitive is cost $C_0$ to changes in $w^{(L)}$: $\frac{\partial C_0}{\partial w^{(L)}}$
Sensitivity given by the chain rule

### Chain rule
$$\frac{\partial C_0}{\partial w^{(L)}} = \frac{\partial z^{(L)}}{\partial w^{(L)}} \frac{\partial a^{(L)}}{\partial z^{(L)}} \frac{\partial C_0}{\partial a^{(L)}} = a^{(L-1)}g'(z^{(L)})2(a^{(L)}-y)$$

##### Chain rule components
| Description                                                  | Formula                                                   | Per neuron                                                            |
|--------------------------------------------------------------|-----------------------------------------------------------|-----------------------------------------------------------------------|
| How much do changes in $w^{(L)}$ affect changes in $z^{(L)}$ | $\frac{\partial z^{(L)}}{\partial w^{(L)}} = a^{(L-1)}$   | $\frac{\partial z_j^{(L)}}{\partial w_{ji}^{(L)}} = a_i^{(L-1)}$      |
| How much do changes in $z^{(L)}$ affect changes in $a^{(L)}$ | $\frac{\partial a^{(L)}}{\partial z^{(L)}} = g'(z^{(L)})$ | $\frac{\partial a_{j}^{(L)}}{\partial z_{j}^{(L)}} = g'(z_{j}^{(L)})$ |
| How much do changes in $a^{(L)}$ affect changes in $C_0$     | $\frac{\partial C_0}{\partial a^{(L)}} = 2(a^{(L)} - y)$  | $\frac{\partial C_0}{\partial a_j^{(L)}} = 2(a_j^{(L)} - y)$          |
| Activation                                                   | $a^{(L)} = g(z^{(L)}) = g(w^{(L)} \cdot a^{(L-1)})$       |                                                                       |

### Upstream error calculation:

| Description                                                                                                                                                  | Formula                                                                                                                                                                                                   |
|--------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Previous neuron influences multiple neurons in the following layer --> Sum the error up by summing up the chain rule expressions (one per path of influence) | $\frac{\partial C_0}{\partial a_k^{(L-1)}} = \sum_{j=0}^{n_{L}} \frac{\partial z_j^{(L)}}{\partial a_k^{(L-1)}} \frac{\partial a_j^{(L)}}{\partial z_j^{(L)}} \frac{\partial C_0}{\partial a_j^{(L)}}$  |


In [10]:
class Layer(ABC):

    @abstractmethod
    def forward(self, x: list[list[float]]) -> list[list[float]]:
        pass

    # @abstractmethod
    # def backprop(self, previous_activation: list[float], label: float, learning_rate: float):
    #     pass

In [11]:
class LinearLayer(Layer):
    activation_function: Callable[[float], float]
    derivative_activation_function: Callable[[float], float]
    derivative_cost_function: Callable[[float, float], float]
    bias: float = 1
    W = list[list[float]]
    neurons: int
    inputs: int
    test_mode: bool

    # For backpropagation
    all_activations_a: list[list[float]]
    all_multiplication_results_z: list[list[float]]

    def __init__(
            self,
            neurons: int,
            inputs: int,
            activation_function: Callable[[float], float],
            derivative_activation_function: Callable[[float], float],
            derivative_cost_function: Callable[[float, float], float],
            test_mode: bool = False
    ):
        """
        Create a linear layer with a fixed number of neurons for a fixed number of inputs.
        :param neurons: The number of neurons the linear layer shall contain.
        :param inputs: The number of inputs.
        :param activation_function: The activation function of the linear layer.
        """
        self.neurons = neurons
        self.inputs = inputs
        self.activation_function = activation_function
        self.derivative_activation_function = derivative_activation_function
        self.derivative_cost_function = derivative_cost_function
        self.test_mode = test_mode
        self.all_activations_a = []
        self.all_multiplication_results_z = []

        if test_mode:
            self.W = [[0.5 for m in range(neurons)] for n in range((inputs + 1))]
        else:
            self.W = [[random.random() * 2 - 1 for m in range(neurons)] for n in range((inputs + 1))]

        assert len(self.W) == inputs + 1
        assert len(self.W[0]) == neurons

    def forward(self, x: list[float]) -> list[list[float]]:
        """
        Forward pass through the linear layer by multiplying the weights (including bias) with the data (padded by a additional unit for the bias).
        :param x: Data used to make a prediction.
        :return: Prediction of the linear layer.
        """
        # Add bias to input to allow the forward pass to be treated as a matrix multiplication.
        assert len(x) == self.inputs
        x.insert(0, self.bias)

        if self.test_mode: print(
            f"Input count: {len(x):^6}\tWeight dimensions count: {len(self.W):^6} x {len(self.W[0]):^6} = {len(self.W) * len(self.W[0])}")

        x: list[list[float]] = [x]
        # Multiply weights with input (bias inserted into inputs)
        multi: list[float] = multiplication(x, self.W)[0]
        assert len(multi) == self.neurons
        # Save result of the multiplication (z) for backpropagation
        self.all_multiplication_results_z.append(multi)

        # Apply activation function
        activation = [self.activation_function(curr) for curr in multi]
        # Save activation (a) for backpropagation
        self.all_activations_a.append(activation)

        return activation

    def backprop(
            self,
            previous_activation: list[list[float]],
            learning_rate: float,
            labels: list[float] | None,
            predictions: list[list[float]] |None,
            layer_index: int,
            activation_cost_effect: list[list[float]] | None = None,
            print_info: bool = True,
    ) -> list[list[float]]:
        """
        Backpropagate the error through the neural network. Adjust the weights based on the learning rate and the error received at the corresponding linear layer.
        :param layer_index:
        :param labels:
        :param previous_activation: Activation received from the previous layer / input: a^{(L-1)}
        :param learning_rate: Describes how large the gradient steps are.
        :param predictions:
        :param activation_cost_effect: Costs induced by the activation of this layer. If this layer is the last layer set to None in order to calculate the effect based on the loss.
        :return: Activation cost effect of the upstream layer.
        """
        activation_cost_effect_for_upstream_layer_for_all_examples = []
        average_w_on_Ck_effect_for_all_examples = []
        for neuron_index_j in range(self.neurons):
            for previous_activation_index_k in range(len(previous_activation[0])):
                w_on_Ck_effect_sum = 0
                batch_size = len(previous_activation)
                for example_index in range(batch_size):
                    # w_on_Ck_effect: Effect of weight on cost for a single example.
                    # activation_cost_effect_for_upstream_layer: Effect of activation on cost for a single example.
                    (w_on_Ck_effect, activation_cost_effect_for_upstream_layer) =  self.apply_chain_rule(
                        previous_activation=previous_activation[example_index],
                        multiplication_result_z=self.all_multiplication_results_z[example_index],
                        label=labels[example_index] if labels is not None else None,
                        prediction=predictions[example_index] if predictions is not None else None,
                        activation_cost_effect=activation_cost_effect[example_index] if activation_cost_effect is not None else None,
                        neuron_index_j=neuron_index_j,
                        previous_activation_index_k=previous_activation_index_k,
                        layer_index=layer_index
                    )
                    w_on_Ck_effect_sum += w_on_Ck_effect
                    activation_cost_effect_for_upstream_layer_for_all_examples.append(activation_cost_effect_for_upstream_layer)

                # Adjust weights by the average cost sensitivity of the weight on the cost.
                self.W[previous_activation_index_k][neuron_index_j] = self.W[previous_activation_index_k][neuron_index_j] - learning_rate * w_on_Ck_effect_sum
                average_w_on_Ck_effect_for_all_examples.append(w_on_Ck_effect_sum)
                # Progress indicator
                if print_info and neuron_index_j == 0 and previous_activation_index_k == 0:
                    print(f"Weight change: {learning_rate * (w_on_Ck_effect_sum)} (lr={learning_rate}, w_onCk_effect_sum={w_on_Ck_effect_sum}, batch_size={batch_size}) for weight W[{previous_activation_index_k}][{neuron_index_j}]")
                    print(f"Sum of effect of weight change on costs: {w_on_Ck_effect_sum} (for Neuron index j={neuron_index_j} Previous activation index k={previous_activation_index_k} Example index={example_index})")

            # if print_info and neuron_index_j % (self.neurons / 4) == 0 or neuron_index_j == self.neurons - 1:
                # print(f"Done with {neuron_index_j / self.neurons * 100}% of the neurons in this layer.")
        if print_info:
            print(f"Average (of layer) of the sum of effect of weight change cost: {sum(average_w_on_Ck_effect_for_all_examples)/len(average_w_on_Ck_effect_for_all_examples)}\n")
        self.clear_cached_results()
        return activation_cost_effect_for_upstream_layer_for_all_examples

    def apply_chain_rule(
            self,
            previous_activation: list[float],
            multiplication_result_z: list[float],
            label: float,
            prediction: list[float],
            activation_cost_effect: list[float],
            neuron_index_j: int,
            previous_activation_index_k: int,
            layer_index: int
    ):
        activation_cost_effect_for_upstream_layer = [0 for index in range(len(previous_activation))]
        # Calculate chain rule components
        # Effect of a weight change on the matrix multiplication product
        w_on_z_effect: float = self.effect_of_weights_on_matrix_multiplication_product(previous_activation[neuron_index_j])
        # Effect of matrix multiplication product change on activation
        z_on_a_effect: float = self.effect_of_matrix_multiplication_product_on_activation(multiplication_result_z[neuron_index_j])
        # Effect of activation change on costs
        a_on_c0_effect = self.effect_of_activation_on_cost(label, prediction, activation_cost_effect, neuron_index_j)

        # Chain rule: Effect of changes of the weights on the costs
        cost_sensitivity_with_respect_to_weight_changes: float = w_on_z_effect * z_on_a_effect * a_on_c0_effect

        if layer_index > 0:
            # Effect of previous activation on matrix product
            prev_a_on_z_effect: float = self.effect_of_previous_activation_on_matrix_product(neuron_index_j, previous_activation_index_k)
            activation_cost_effect_for_upstream_layer[previous_activation_index_k] += prev_a_on_z_effect * z_on_a_effect * a_on_c0_effect
            return cost_sensitivity_with_respect_to_weight_changes, activation_cost_effect_for_upstream_layer
        else:
            return cost_sensitivity_with_respect_to_weight_changes, None

    def effect_of_weights_on_matrix_multiplication_product(self, previous_activation: float) -> float:
        """
        The effect of the weights is given by the previous activation a^{(L-1)}.
        """
        return previous_activation

    def effect_of_matrix_multiplication_product_on_activation(self, matrix_multiplication_result_z_j: float) -> float:
        """
        The effect of the matrix multiplication product on the activation is the derivative of the activation function applied to the matrix multiplication product: \sigma ' (z^{(L)}).
        """
        return self.derivative_activation_function(matrix_multiplication_result_z_j)

    def effect_of_activation_on_cost(self, label: float, predictions: list[float], activation_cost_effect: list[float], neuron_index_j: int):
        """
        The effect of the activation on the cost is the derivative of the loss function. e.g. 2(a^{(L)} -y) for squared error loss
        """
        if activation_cost_effect is None and label is not None and predictions is not None:
            return self.derivative_cost_function(predictions[neuron_index_j], label)
        elif activation_cost_effect is not None:
            return activation_cost_effect[neuron_index_j]
        else:
            raise Exception(
                f"Illegal state:\nactivation_cost_effect: {activation_cost_effect}\nlabel: {label}\npredictions:{predictions}")

    def effect_of_previous_activation_on_matrix_product(self, neuron_index_j: int, previous_activation_index_k: int) -> float:
        """
        Calculate the effect of the previous activation on the matrix multiplication product.
        It is the activation at the index of the relevant weight: Row index in the weight matrix denotes the feature index, column index denotes the neurons of the layer.
        """
        return self.W[previous_activation_index_k][neuron_index_j]

    def clear_cached_results(self):
        self.all_activations_a = []
        self.all_multiplication_results_z = []

In [12]:


class NeuralNetwork:
    layers: list[LinearLayer]

    def __init__(self, *args):
        self.layers = args

    def train(self, X: list[list[float]], y: list[int], epochs: int, batch_size: int, learning_rate: float):
        """
        :param X: Data to train on.
        :param y: Labels corresponding to the data.
        :param epochs: The number of epochs to train the NN for.
        :param batch_size: Size of a training batch.
        :param learning_rate: The learning rate (amount by which the weights are adjusted).
        """
        # Prepare for confusion matrix

        loss_history = []
        for epoch in range(epochs):
            for index in range(len(X)):
                for batch_index in range(int(len(X) / batch_size)):
                    batch_start = batch_index * batch_size
                    batch_end = batch_start + batch_size
                    x_batch: list[list[float]] = X[batch_start:batch_end]
                    y_batch: list[int] = y[batch_start:batch_end]
                    batch_predictions: list[int] = []
                    batch_probabilities: list[list[float]] = []
                    batch_loss = []
                    for index in range(len(x_batch)):
                        activation: list[float] = self.predict(x_batch[index])
                        probability: list[float] = softmax(activation)
                        batch_probabilities.append(probability)
                        curr_loss = categorical_cross_entropy_loss(probability, y_batch[index])
                        batch_loss.append(curr_loss)
                        # print(f"Epoch: {epoch} Batch: {batch_index} Index: {index} Loss: {curr_loss} Label: {y_batch[index]}\nActivation: {activation}\nProbability: {probability}\n")
                        batch_predictions.append(argmax(probability))
                    self.backprop(learning_rate, x_batch, y_batch, batch_probabilities)
                    batch_loss = sum(batch_loss)/len(batch_loss)
                    loss_history.append(batch_loss)
                    print(f"Batch {batch_index} / {int(len(X) / batch_size)} completed of epoch {epoch} / {epochs}. Loss: {batch_loss}, Loss history: {loss_history}")
                    print(f"Batch label balance: {Counter[int](y_batch)}")
                    if batch_loss < 0.3:
                        print("Loss is below 0.3, stopping training.")
                        return
        print(loss_history)

    def backprop(self, learning_rate: float, x: list[list[float]], y: list[int], predictions: list[list[float]]):
        activation_cost_effect = None
        for index in reversed(range(len(self.layers))):
            isLastLayer = index == len(self.layers) - 1
            isFirstLayer = index == 0
            print(f"Backpropagating layer {index}")
            activation_cost_effect = self.layers[index].backprop(
                previous_activation=x if isFirstLayer else self.layers[index - 1].all_activations_a,
                predictions=predictions if isLastLayer else None,
                labels=y if isLastLayer else None,
                learning_rate=learning_rate,
                activation_cost_effect=activation_cost_effect,
                layer_index=index
            )

    def predict(self, x: list[float]) -> list[float]:
        curr_x: list[float] = x
        for (index, layer) in enumerate(self.layers):
            curr_x = layer.forward(curr_x)
        return curr_x

    def predict_multiple(self, X: list[list[float]]) -> list[float]:
        predictions: list[float] = []
        for x in X:
            pred = self.predict(x)
            probs = softmax(pred)
            predictions.append(probs)
        return predictions

In [None]:
nn = NeuralNetwork(
    LinearLayer(
        neurons=128,
        inputs=784,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss
    ),
    LinearLayer(
        neurons=24,
        inputs=128,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss
    ),
    LinearLayer(
        neurons=24,
        inputs=24,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss
    ),
    LinearLayer(
        neurons=10,
        inputs=24,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss
    )
)

nn.train(
    X=train_set.get_linearized_images(),
    y=train_set.labels,
    epochs=1,
    batch_size=256,
    learning_rate=0.001
)

Backpropagating layer 3
Weight change: -0.1541437094767194 (lr=0.001, w_onCk_effect_sum=-154.1437094767194, batch_size=256) for weight W[0][0]
Sum of effect of weight change on costs: -154.1437094767194 (for Neuron index j=0 Previous activation index k=0 Example index=255)
Average (of layer) of the sum of effect of weight change cost: -1367569.815510483

Backpropagating layer 2
Weight change: 0.01932378528340894 (lr=0.001, w_onCk_effect_sum=19.32378528340894, batch_size=256) for weight W[0][0]
Sum of effect of weight change on costs: 19.32378528340894 (for Neuron index j=0 Previous activation index k=0 Example index=255)
Average (of layer) of the sum of effect of weight change cost: 0.8051577201420392

Backpropagating layer 1
Weight change: -8.843044488715365e-06 (lr=0.001, w_onCk_effect_sum=-0.008843044488715365, batch_size=256) for weight W[0][0]
Sum of effect of weight change on costs: -0.008843044488715365 (for Neuron index j=0 Previous activation index k=0 Example index=255)
Avera

# Test

In [None]:
predictions = nn.predict_multiple(test_set.get_linearized_images())
print("Classification report:\n", classification_report(test_set.labels, predictions))
print("Confusion matrix:\n", confusion_matrix(test_set.labels, predictions))

In [None]:
layer = LinearLayer(neurons=4, inputs=3, activation_function=relu, derivative_activation_function=relu_derivative,
                    derivative_cost_function=derivative_categorical_cross_entropy_loss, test_mode=True)
result = layer.forward([1, 2, 3])
assert result == [3.5, 3.5, 3.5, 3.5]
pretty_print_matrix(result)

loss = mean_squared_error_loss_categorical(result, 2)
assert loss == 10.7101
print(loss)

In [None]:
nn = NeuralNetwork(
    LinearLayer(
        neurons=256,
        inputs=784,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss,
        test_mode=True
    ),
    LinearLayer(
        neurons=128,
        inputs=256,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss,
        test_mode=True
    ),
    LinearLayer(
        neurons=64,
        inputs=128,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss,
        test_mode=True
    ),
    LinearLayer(
        neurons=10,
        inputs=64,
        activation_function=relu,
        derivative_activation_function=relu_derivative,
        derivative_cost_function=derivative_categorical_cross_entropy_loss,
        test_mode=True
    )
)

prediction = softmax(
    nn.predict(test_set.images[0].get_linearized())
)
test_set.images[0].print()
pretty_print_matrix(softmax(prediction))
