In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib widget

In [3]:
class LayerInitializationStrategy:
    """Parent class for initialization strategies of weights and biases in Layer."""
    def run(self, size_of_previous_layer: int, size_of_current_layer: int) -> tuple[np.ndarray, np.ndarray]:
        raise NotImplementedError("The 'run' method is implemented only in child classes")

class RandomUniform(LayerInitializationStrategy):
    """
    Initialization strategy sampling weights and biases uniformly in a given interval.

    Attributes
    ----------
    scale: float
        The half-lenght of the interval [-scale, scale] from which values are sampled.

    Methods
    -------
    run(self, size_of_previous_layer: int, size_of_current_layer: int) -> tuple[np.ndarray, np.ndarray]
        Returns the tuple (random_weights, random_biases), where random_weights and random_biases are np.ndarrays of the appropriate shape.
    """
    def __init__(self, scale_of_interval: float, center_of_interval: float = 0):
        self.scale: float = scale_of_interval
        self.center: float = center_of_interval
    
    def run(self, size_of_previous_layer: int, size_of_current_layer: int) -> tuple[np.ndarray, np.ndarray]:
        random_weights: np.ndarray = np.random.uniform(
            -self.scale + self.center, self.scale + self.center,
            (size_of_previous_layer, size_of_current_layer)
            )
        random_biases: np.ndarray = np.random.uniform(
            -self.scale + self.center, self.scale + self.center,
            size_of_current_layer
            )
        return random_weights, random_biases

In [4]:
class ActivationFunction:
    """Parent class for activation functions of neural nodes."""
    def __call__(self, x: np.ndarray) -> np.ndarray:
        raise NotImplementedError("The '__call__' method must be implemented in child classes")

    def derivative(self, x: np.ndarray) -> np.ndarray:
        raise NotImplementedError("The 'derivative' method must be implemented in child classes")

class Sigmoid(ActivationFunction):
    """Sigmoid activation function. f(x) = 1 / (1 + np.exp(-x))"""
    def __call__(self, x: np.ndarray) -> np.ndarray:
        return 1 / (1 + np.exp(-x))

    def derivative(self, x: np.ndarray) -> np.ndarray:
        sigmoid = self(x)  # Reuse the __call__ method to compute sigmoid
        return sigmoid * (1 - sigmoid)

In [5]:
class Layer:
    """
    A Layer component of a NeuralNetwork.

    Attributes
    ----------
    unit_number: int
        The number of nodes/units in the Layer.
    init_strat: LayerInitializationStrategy
        The initialization strategy for the weights and biases of the Layer
    activation_function: ActivationFunction
        The function.
    
    I should continue writing docstrings later
    """

    def __init__(self, number_of_units: int,
                 initialization_strategy: LayerInitializationStrategy,
                 activation_function: ActivationFunction):
        self.unit_number: int = number_of_units  # The number of nodes/units in the Layer.
        self.init_strat: LayerInitializationStrategy = initialization_strategy
        self.activation_function: ActivationFunction = activation_function
        self.linear_output: np.ndarray = None
        self.output: np.ndarray = None  # The values computed by the units based on the outputs of the previous layer. Stored for later backprop.
        self.previous_layer: Layer = None  # The layer preceding the current one in the Neural Network. The NN should connect layers during initialization.
        self.next_layer: Layer = None
        self.weights: np.ndarray = None; self.biases: np.ndarray = None  # Weights and biases connecting the layer with the previous layer of the neural network.
        self.delta: np.ndarray = None
        

    def initialize_weights(self):
        self.weights, self.biases = self.init_strat.run(self.previous_layer.unit_number, self.unit_number)

    def compute_output(self):
        self.linear_output = np.dot(self.previous_layer.output, self.weights) + self.biases
        self.output: np.ndarray = self.activation_function(self.linear_output)
        return self.output
    
    # def compute_gradient(self)  # Should this function be here or in a later TrainingAlgorithm class?

class InputLayer(Layer):
    def __init__(self, number_of_units: int):
        super().__init__(number_of_units, None, None)
        # An input layer has no previous layer to connect to, so attributes referring to a previous layer are deleted.
        del self.previous_layer, self.weights, self.biases, self.init_strat, self.activation_function
    
    def feed_input(self, value: np.ndarray) -> None:
        self.output: np.ndarray = value
    
    def initialize_weights(self):
        raise NotImplementedError("InputLayer does not require weight initialization.")

    def compute_output(self):
        return self.output

class HiddenLayer(Layer):
    pass

class OutputLayer(Layer):
    def __init__(self, number_of_units: int,
                 initialization_strategy: LayerInitializationStrategy,
                 activation_function: ActivationFunction):
        super().__init__(number_of_units, initialization_strategy, activation_function)
        del self.next_layer

In [6]:
class NeuralNetwork:
    def __init__(self, layers: list[Layer]):
        self.layers: list[Layer] = layers
        # Maybe here I should ensure that layers are correctly typed (layers[0] should be an InputLayer, layers[-1] an OutputLayer, all other layers should be HiddenLayer).
        self.input_layer: InputLayer = layers[0]; self.hidden_layers: list[HiddenLayer] = layers[1: -1]; self.output_layer: OutputLayer = layers[-1]
        self.layers_with_weights: list[Layer] = self.layers[1: ]
        self.connect_layers()
        self.initialize_weights()
    
    def connect_layers(self) -> None:
        for (i, layer) in enumerate(self.layers):
            if not isinstance(layer, InputLayer): layer.previous_layer = self.layers[i - 1]
            if not isinstance(layer, OutputLayer): layer.next_layer = self.layers[i + 1]

    def initialize_weights(self) -> None:
        
        for layer in self.layers_with_weights: layer.initialize_weights()
    
    def feed_input(self, value: np.ndarray) -> None:
        self.input_layer.feed_input(value)

    def activate_network(self) -> np.ndarray:
        for i in range(len(self.layers)): self.layers[i].compute_output()
        return self.output_layer.output
    
    def compute_output(self, value: np.ndarray) -> np.ndarray:
        self.feed_input(value)
        return self.activate_network()
    
    def compute_multiple_outputs(self, x_data: pd.DataFrame | np.ndarray) -> np.ndarray[np.ndarray]:
        if isinstance(x_data, pd.DataFrame): x_data = x_data.to_numpy()
        outputs = np.array(
            [
                self.compute_output(x_data[i]) for i in range(len(x_data))
            ]
        )
        return outputs

In [7]:
layers = [InputLayer(2), HiddenLayer(3, RandomUniform(0.0), Sigmoid()), OutputLayer(2, RandomUniform(0.0), Sigmoid())]
nn = NeuralNetwork(layers)

# Initialize layers
nn.hidden_layers[0].weights = np.array(
    [[1, 0.5, 2],
    [-1, 0, -1.5]], dtype = float
)
nn.hidden_layers[0].biases = np.array([1, 2, 3], dtype=float)

nn.output_layer.weights = np.array(
    [[1, 3],
     [2, 3],
     [-1, 0]], dtype = float
)
nn.output_layer.biases = np.array([0,2], dtype=float)


# Some fake training data
x = np.array(
    [[1, 0]], dtype = float
)
y = np.array(
    [[1,0]], dtype = float
)

In [8]:
# Test forward pass
nn.compute_output(x[0])

print(nn.input_layer.output)
print(nn.hidden_layers[0].linear_output)
print(nn.hidden_layers[0].output)
print(nn.output_layer.linear_output)
print(nn.output_layer.output)

[1. 0.]
[2.  2.5 5. ]
[0.88079708 0.92414182 0.99330715]
[1.73577357 7.41481669]
[0.85014943 0.9993981 ]


In [9]:
class LossFunction:
    def __call__(self, y_data: np.ndarray, y_predicted: np.ndarray) -> float:
        pass
    
    def simple_gradient(self, y_data: np.ndarray, y_predicted: np.ndarray) -> np.ndarray:
        pass

class MSE(LossFunction):
    def __call__(self, y_data: np.ndarray, y_predicted: np.ndarray) -> float:
        return 0.5 * np.sum((y_data - y_predicted)**2) / len(y_data)
    
    def simple_gradient(self, y_data: np.ndarray, y_predicted: np.ndarray) -> np.ndarray:
        return (y_data - y_predicted)

In [10]:
class ListOfArrays:
    """
    An utility class for defining operations on lists containing hetero-shaped np.ndarrays.
    Useful for storing network weights and biases in MLP-architecture NeuralNetworks.
    """
    def __init__(self, arrays: list[np.array]):
        self.arrays: list[np.array] = arrays
    
    def __repr__(self):
        return f"ListOfArrays{(self.arrays)}"

    def __getitem__(self, index):
        return self.arrays[index]

    def __setitem__(self, index, value):
        self.arrays[index] = value
    
    def __add__(self, other):
        if not isinstance(other, ListOfArrays): raise TypeError("Operand is not a ListOfArrays")
        return ListOfArrays([x + y for x, y in zip(self.arrays, other.arrays)])
    
    def __sub__(self, other):
        if not isinstance(other, ListOfArrays): raise TypeError("Operand is not a ListOfArrays")
        return ListOfArrays([x - y for x, y in zip(self.arrays, other.arrays)])
    
    def __mul__(self, scalar):
        return ListOfArrays([x * scalar for x in self.arrays])
    
    def __truediv__(self, scalar):
        return ListOfArrays([x / scalar for x in self.arrays])
    
    def __pow__(self, power):
        return ListOfArrays([x**power for x in self.arrays])
    
    def sum(self) -> float:
        return np.sum([np.sum(array) for array in self.arrays])

In [11]:
class RegularizationTerm:
    def set_network(self, network: NeuralNetwork) -> None:
        self.network = network

    def __call__(self) -> float:
        pass

    def gradient(self) -> tuple[ListOfArrays, ListOfArrays]:
        pass

class NoRegularization(RegularizationTerm):
    def __init__(self):
        pass

    def __call__(self) -> float:
        return 0
    
    def gradient(self) -> tuple[ListOfArrays, ListOfArrays]:
        layers = self.network.layers_with_weights
        return ListOfArrays([np.zeros_like(l.weights) for l in layers]), ListOfArrays([np.zeros_like(l.biases) for l in layers])

class Tikhonov(RegularizationTerm):
    def __init__(self, penalty: float):
        self.penalty: float = penalty
        self.network: NeuralNetwork = None

    def __call__(self) -> float:
        layers: list[Layer] = self.network.layers_with_weights
        weights_term = np.sum([np.sum(layer.weigths**2) for layer in layers])  # The sum of squares of all the weights in the NN.
        biases_term = np.sum([np.sum(layer.biases**2) for layer in layers])
        return self.penalty * (weights_term + biases_term) / 2

    def gradient(self) -> tuple[ListOfArrays, ListOfArrays]:
        layers: list[Layer] = self.network.layers_with_weights
        gradient_on_weights: ListOfArrays = ListOfArrays([-self.penalty * l.weights for l in layers])
        gradient_on_biases: ListOfArrays = ListOfArrays([-self.penalty * l.biases for l in layers])
        return gradient_on_weights, gradient_on_biases

In [12]:
class StoppingCondition:
    def is_satisfied(self, loss: float, gradients: np.ndarray = None) -> bool:
        raise NotImplementedError("This method is only implemented in child classes")

class ThresholdOnLoss(StoppingCondition):
    def __init__(self, threshold: float, patience: int):
        self.threshold = threshold; self.patience = patience
        self.consecutive_epochs = 0
    
    def is_satisfied(self, loss: float, gradients: np.ndarray = None) -> bool:
        if loss < self.threshold:
            self.consecutive_epochs += 1
            return self.consecutive_epochs > self.patience
        else:
            self.consecutive_epochs = 0
            return False

In [13]:
class ErrorFunction:
    def __call__(self, y_data: np.ndarray, y_predicted: np.ndarray) -> float:
        pass
    
    def simple_gradient(self, y_data: np.ndarray, y_predicted: np.ndarray) -> np.ndarray:
        pass

class MSE(ErrorFunction):
    def __call__(self, y_data: np.ndarray, y_predicted: np.ndarray) -> float:
        """
        Returns the average over the dataset of the square euclidean distance between the training outputs and the predictions.
        """
        num_patterns = 1 if y_data.ndim == 1 else len(y_data)
        return 0.5 * np.sum((y_data - y_predicted)**2) / num_patterns
    
    def simple_gradient(self, y_data: np.ndarray, y_predicted: np.ndarray) -> np.ndarray:
        """
        Returns y_data - y_predicted. It is meant to be used on a single pattern at a time, during backpropagation.
        """
        return (y_data - y_predicted)

In [14]:
class TrainingAlgorithm:
    def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, network: NeuralNetwork):
        self.x: np.ndarray = x_data.to_numpy() if isinstance(x_data, pd.DataFrame) else x_data
        self.y: np.ndarray = y_data.to_numpy() if isinstance(y_data, pd.DataFrame) else y_data
        self.network: NeuralNetwork = network

        self.history: dict[list] = {'training error': []}
        self.cache: dict = {}

class Backpropagation(TrainingAlgorithm):
    def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, network: NeuralNetwork,
                 learning_rate: float,
                 error_function: ErrorFunction, stopping_condition: StoppingCondition,
                 regularization_term: RegularizationTerm = None, batch_size: int = None
                 ):
        super().__init__(x_data, y_data, network)
        self.learning_rate: float = learning_rate
        self.err_fun: ErrorFunction = error_function; self.stop_cond: StoppingCondition = stopping_condition


        # The gradients of the loss function (= error function + regularization term) with respect to the network weights and biases.
        # Used for updating the network parameters after each training epoch.
        self.weights_gradient: ListOfArrays = None; self.biases_gradient: ListOfArrays = None
        

        
        self.regularization_term: RegularizationTerm = regularization_term or NoRegularization()
        self.regularization_term.set_network(self.network)

        self.batch_size: int = batch_size or len(self.x)


    def compute_loss(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Computes the loss of the current network over the given data.
        Such data may be the whole training data (self.x, self.y) or may be a mini-batch.

        
        Parameters
        ----------
        x: np.ndarray
            The input training (mini)batch data.
        y: np.ndarray
            The output training (mini)batch data.

        Returns
        -------
        loss: float
            Training error + Regularization penalty term. 
        """
        return self.compute_training_error(x, y) + self.regularization_term()
    
    def compute_training_error(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Computes the error of the current network over the given data.
        Such data may be the whole training data (self.x, self.y) or may be a mini-batch.
        Such error includes no regularization penalty term.


        Parameters
        ----------
        x: np.ndarray
            The input training (mini)batch data.
        y: np.ndarray
            The output training (mini)batch data.        

        Returns
        -------
        error: float
            Average of single-pattern errors over the dataset. 
        """
        network_outputs: np.ndarray = self.network.compute_multiple_outputs(x)
        return self.err_fun(y, network_outputs)
    
    def update_network_parameters(self, current_minibatch_size: int) -> None:
        """
        NOTE TO SELF: add the momentum term's contribution here later.
        Updates the weights and biases of the NeuralNetwork based on the computed gradients and the learning rate.
        """
        factor: float = self.learning_rate * current_minibatch_size / len(self.x)
        # Note that weights and biases are stored in each individual layer, so an iteration over
        # layers is necessary. NeuralNetwork has no weights: ListOfArrays attribute, so we can't just
        # write self.network.weights += ... * self.weights_gradient
        for i, l in enumerate(self.network.layers_with_weights):
            l.weights += factor * self.weights_gradient[i]   
            l.biases += factor * self.biases_gradient[i]
    
    def run(self, max_epochs: int) -> None:
        """
        Train the NeuralNetwork according to this strategy.
        """
        epoch: int = 0; tr_err: float = float('inf')
        indices = np.arange(len(self.x))
        while not self.stop_cond.is_satisfied(tr_err, None) and epoch < max_epochs:
            epoch += 1; tr_err = 0

            # Shuffle the data
            np.random.shuffle(indices)
            shuffled_x = self.x[indices]; shuffled_y = self.y[indices]

            for start in range(0, len(self.x), self.batch_size):  # For loop over the minibatches.
                end = start + self.batch_size
                minibatch_x = shuffled_x[start:end]; minibatch_y = shuffled_y[start:end]
                current_minibatch_size = len(minibatch_x)

                self.update_gradients(minibatch_x, minibatch_y, cache_error = True)  # NOTE define this function.
                tr_err += self.cache['minibatch_training_error'] * current_minibatch_size / len(self.x)  # Add the error of the minibatch to the whole-batch error.
                self.update_network_parameters(current_minibatch_size)


            self.history['training error'] += [tr_err]
    
    def update_gradients(self, x, y, cache_error: bool) -> None:
        # Set gradients to 0.
        self.reset_gradients()
        if cache_error: self.cache['minibatch_training_error'] = 0
        # Set gradients to the mean of single-pattern gradients of the error (without regularization) function.
        for pattern_x, pattern_y in zip(x,y):
            self.add_single_pattern_gradients_contribution(pattern_x, pattern_y, cache_error)
        current_minibatch_size: int = len(x)
        self.weights_gradient /= current_minibatch_size; self.biases_gradient /= current_minibatch_size
        if cache_error: self.cache['minibatch_training_error'] /= current_minibatch_size
        # Add the regularization term contribution.
        w_reg_term, b_reg_term = self.regularization_term.gradient()
        self.weights_gradient += w_reg_term; self.biases_gradient += b_reg_term
    
    def reset_gradients(self) -> None:
        """
        Sets self.weights_gradient and self.biases_gradient to a list of appropriately-shaped arrays with all zero entries.
        """
        lwws: list[Layer] = self.network.layers_with_weights
        self.weights_gradient = ListOfArrays([np.zeros_like(l.weights) for l in lwws])
        self.biases_gradient = ListOfArrays([np.zeros_like(l.biases) for l in lwws])

    def add_single_pattern_gradients_contribution(self, pattern_x: np.ndarray, pattern_y: np.ndarray, cache_error: bool) -> None:
        predicted_y: np.ndarray = self.network.compute_output(pattern_x)
        if cache_error: self.cache['minibatch_training_error'] += self.err_fun(pattern_y, predicted_y)
        self.backpropagate(pattern_y)

        w_term = ListOfArrays([np.outer(l.previous_layer.output, l.delta) for l in self.network.layers_with_weights])
        b_term = ListOfArrays([l.delta for l in self.network.layers_with_weights])

        self.weights_gradient += w_term; self.biases_gradient += b_term
    
    def backpropagate(self, pattern_y: np.ndarray) -> None:
        out_l: OutputLayer = self.network.output_layer; hid_ls: list[HiddenLayer] = self.network.hidden_layers  # Give short, convenient names to the OutputLayer and the HiddenLayers.

        out_l.delta = out_l.activation_function.derivative(out_l.linear_output) * self.err_fun.simple_gradient(pattern_y, out_l.output)  # Compute delta for the OutputLayer.
        for l in reversed(hid_ls):  # For each HiddenLayer, starting from the one closest to the OutputLayer and proceeding backwards:
            l.delta = l.activation_function.derivative(l.linear_output) * np.dot(l.next_layer.weights, l.next_layer.delta)  # Compute delta for the HiddenLayer.
        



In [15]:
training_alg = Backpropagation(x, y, nn, 200.0, MSE(), ThresholdOnLoss(0.00001, 10), regularization_term = Tikhonov(0.000), batch_size = None)

In [16]:
# Test backward pass
training_alg.backpropagate(y[0])
print(nn.output_layer.delta)
print(nn.hidden_layers[0].delta)

[ 0.01909027 -0.00060118]
[ 0.001815    0.00255016 -0.00012691]


In [45]:
old_weights = ListOfArrays([np.copy(nn.hidden_layers[0].weights), np.copy(nn.output_layer.weights)])
old_biases = ListOfArrays([np.copy(nn.hidden_layers[0].biases), np.copy(nn.output_layer.biases)])

In [46]:
# Test one epoch update
training_alg.run(1)
new_weights = ListOfArrays([nn.hidden_layers[0].weights, nn.output_layer.weights])
new_biases = ListOfArrays([nn.hidden_layers[0].biases, nn.output_layer.biases])

In [47]:
print(new_weights - old_weights)

ListOfArrays[array([[ 0.36299938,  0.51003272, -0.02538264],
       [ 0.        ,  0.        ,  0.        ]]), array([[ 3.3629306 , -0.1059028 ],
       [ 3.52842316, -0.11111436],
       [ 3.79250011, -0.11943047]])]


In [48]:
print(
    training_alg.learning_rate * np.outer(nn.input_layer.output, nn.hidden_layers[0].delta)
)
print(
    training_alg.learning_rate * np.outer(nn.hidden_layers[0].output, nn.output_layer.delta)
)

[[ 0.36299938  0.51003272 -0.02538264]
 [ 0.          0.         -0.        ]]
[[ 3.3629306  -0.1059028 ]
 [ 3.52842316 -0.11111436]
 [ 3.79250011 -0.11943047]]


In [49]:
tensor_prod_for_1010 = ListOfArrays(
    [np.copy(np.outer(nn.input_layer.output, nn.hidden_layers[0].delta)),
    np.copy(np.outer(nn.hidden_layers[0].output, nn.output_layer.delta))]
)
deltas_for_1010 = ListOfArrays(
    [np.copy(nn.hidden_layers[0].delta),
    np.copy(nn.output_layer.delta)]
)

In [50]:
layers = [InputLayer(2), HiddenLayer(3, RandomUniform(0.0), Sigmoid()), OutputLayer(2, RandomUniform(0.0), Sigmoid())]
nn = NeuralNetwork(layers)

# Initialize layers
nn.hidden_layers[0].weights = np.array(
    [[1, 0.5, 2],
    [-1, 0, -1.5]], dtype = float
)
nn.hidden_layers[0].biases = np.array([1, 2, 3], dtype=float)

nn.output_layer.weights = np.array(
    [[1, 3],
     [2, 3],
     [-1, 0]], dtype = float
)
nn.output_layer.biases = np.array([0,2], dtype=float)


# Some fake training data, now with more data
x = np.array(
    [[1, 0],
     [1, 0],
     [-1, 3]], dtype = float
)
y = np.array(
    [[1, 0],
     [1, 0],
     [2, 3]], dtype = float
)

training_alg = Backpropagation(x, y, nn, 200.0, MSE(), ThresholdOnLoss(0.00001, 10), regularization_term = Tikhonov(0.000), batch_size = None)

In [51]:
y[2]

array([2., 3.])

In [52]:
nn.compute_output(x[2])
training_alg.backpropagate(y[2])
print(nn.hidden_layers[0].delta)
tensor_prod_for_m1323 = ListOfArrays(
    [np.copy(np.outer(nn.input_layer.output, nn.hidden_layers[0].delta)),
    np.copy(np.outer(nn.hidden_layers[0].output, nn.output_layer.delta))]
)
deltas_for_m1323 = ListOfArrays(
    [np.copy(nn.hidden_layers[0].delta),
    np.copy(nn.output_layer.delta)]
)

[ 0.00976832  0.05559372 -0.00445349]


In [53]:
training_alg = Backpropagation(x, y, nn, 200.0, MSE(), ThresholdOnLoss(0.00001, 10), regularization_term = Tikhonov(0.000), batch_size = None)
old_weights = ListOfArrays([np.copy(nn.hidden_layers[0].weights), np.copy(nn.output_layer.weights)])
old_biases = ListOfArrays([np.copy(nn.hidden_layers[0].biases), np.copy(nn.output_layer.biases)])

# Test one epoch update
training_alg.run(1)
new_weights = ListOfArrays([nn.hidden_layers[0].weights, nn.output_layer.weights])
new_biases = ListOfArrays([nn.hidden_layers[0].biases, nn.output_layer.biases])

print(new_weights - old_weights)
print(
     ( tensor_prod_for_1010 * 2/3 + tensor_prod_for_m1323 * 1/3)*training_alg.learning_rate 
)
(new_weights - old_weights) - ( tensor_prod_for_1010 * 2/3 + tensor_prod_for_m1323 * 1/3)*training_alg.learning_rate 

ListOfArrays[array([[-0.40922172, -3.36622628,  0.27997783],
       [ 1.95366392, 11.11874428, -0.89069877]]), array([[ 2.73682987e+00, -7.67934138e-03],
       [ 1.08834501e+01,  1.01064497e+00],
       [ 2.83419858e+00, -4.07301575e-02]])]
ListOfArrays[array([[-0.40922172, -3.36622628,  0.27997783],
       [ 1.95366392, 11.11874428, -0.89069877]]), array([[ 2.73682987e+00, -7.67934138e-03],
       [ 1.08834501e+01,  1.01064497e+00],
       [ 2.83419858e+00, -4.07301575e-02]])]


ListOfArrays[array([[ 0.00000000e+00,  0.00000000e+00,  1.66533454e-16],
       [ 0.00000000e+00,  0.00000000e+00, -1.11022302e-16]]), array([[-4.44089210e-16, -1.56125113e-17],
       [ 0.00000000e+00, -2.22044605e-16],
       [ 0.00000000e+00,  0.00000000e+00]])]

In [54]:
print((new_biases - old_biases) - ( deltas_for_1010 * 2/3 + deltas_for_m1323 * 1/3)*training_alg.learning_rate  )

ListOfArrays[array([0.00000000e+00, 0.00000000e+00, 1.11022302e-16]), array([0., 0.])]


In [55]:
class TrainingAlgorithm:
    def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, network: NeuralNetwork):
        self.x: np.ndarray = x_data.to_numpy() if isinstance(x_data, pd.DataFrame) else x_data
        self.y: np.ndarray = y_data.to_numpy() if isinstance(y_data, pd.DataFrame) else y_data
        self.network: NeuralNetwork = network

        self.history: dict[list] = {'training error': []}
        self.cache: dict = {}

class Backpropagation(TrainingAlgorithm):
    def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, network: NeuralNetwork,
                 learning_rate: float,
                 error_function: ErrorFunction, stopping_condition: StoppingCondition,
                 regularization_term: RegularizationTerm = None, batch_size: int = None
                 ):
        super().__init__(x_data, y_data, network)
        self.learning_rate: float = learning_rate
        self.err_fun: ErrorFunction = error_function; self.stop_cond: StoppingCondition = stopping_condition


        # The gradients of the loss function (= error function + regularization term) with respect to the network weights and biases.
        # Used for updating the network parameters after each training epoch.
        self.weights_gradient: ListOfArrays = None; self.biases_gradient: ListOfArrays = None
        

        
        self.regularization_term: RegularizationTerm = regularization_term or NoRegularization()
        self.regularization_term.set_network(self.network)

        self.batch_size: int = batch_size or len(self.x)









        self.history['weights'] = []
        self.history['biases'] = []


    def compute_loss(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Computes the loss of the current network over the given data.
        Such data may be the whole training data (self.x, self.y) or may be a mini-batch.

        
        Parameters
        ----------
        x: np.ndarray
            The input training (mini)batch data.
        y: np.ndarray
            The output training (mini)batch data.

        Returns
        -------
        loss: float
            Training error + Regularization penalty term. 
        """
        return self.compute_training_error(x, y) + self.regularization_term()
    
    def compute_training_error(self, x: np.ndarray, y: np.ndarray) -> float:
        """
        Computes the error of the current network over the given data.
        Such data may be the whole training data (self.x, self.y) or may be a mini-batch.
        Such error includes no regularization penalty term.


        Parameters
        ----------
        x: np.ndarray
            The input training (mini)batch data.
        y: np.ndarray
            The output training (mini)batch data.        

        Returns
        -------
        error: float
            Average of single-pattern errors over the dataset. 
        """
        network_outputs: np.ndarray = self.network.compute_multiple_outputs(x)
        return self.err_fun(y, network_outputs)
    
    def update_network_parameters(self, current_minibatch_size: int) -> None:
        """
        NOTE TO SELF: add the momentum term's contribution here later.
        Updates the weights and biases of the NeuralNetwork based on the computed gradients and the learning rate.
        """
        factor: float = self.learning_rate * current_minibatch_size / len(self.x)
        # Note that weights and biases are stored in each individual layer, so an iteration over
        # layers is necessary. NeuralNetwork has no weights: ListOfArrays attribute, so we can't just
        # write self.network.weights += ... * self.weights_gradient
        for i, l in enumerate(self.network.layers_with_weights):
            l.weights += factor * self.weights_gradient[i]   
            l.biases += factor * self.biases_gradient[i]









        self.history['weights'] += [ListOfArrays([
            l.weights for l in self.network.layers_with_weights
        ])]
        self.history['biases'] += [ListOfArrays([
            l.biases for l in self.network.layers_with_weights
        ])]
    
    def run(self, max_epochs: int) -> None:
        """
        Train the NeuralNetwork according to this strategy.
        """
        epoch: int = 0; tr_err: float = float('inf')
        indices = np.arange(len(self.x))
        while not self.stop_cond.is_satisfied(tr_err, None) and epoch < max_epochs:
            epoch += 1; tr_err = 0

            # Shuffle the data
            np.random.shuffle(indices)
            shuffled_x = self.x[indices]; shuffled_y = self.y[indices]

            for start in range(0, len(self.x), self.batch_size):  # For loop over the minibatches.
                end = start + self.batch_size
                minibatch_x = shuffled_x[start:end]; minibatch_y = shuffled_y[start:end]
                current_minibatch_size = len(minibatch_x)

                self.update_gradients(minibatch_x, minibatch_y, cache_error = True)  # NOTE define this function.
                tr_err += self.cache['minibatch_training_error'] * current_minibatch_size / len(self.x)  # Add the error of the minibatch to the whole-batch error.
                self.update_network_parameters(current_minibatch_size)


            self.history['training error'] += [tr_err]
    
    def update_gradients(self, x, y, cache_error: bool) -> None:
        # Set gradients to 0.
        self.reset_gradients()
        if cache_error: self.cache['minibatch_training_error'] = 0
        # Set gradients to the mean of single-pattern gradients of the error (without regularization) function.
        for pattern_x, pattern_y in zip(x,y):
            self.add_single_pattern_gradients_contribution(pattern_x, pattern_y, cache_error)
        current_minibatch_size: int = len(x)
        self.weights_gradient /= current_minibatch_size; self.biases_gradient /= current_minibatch_size
        if cache_error: self.cache['minibatch_training_error'] /= current_minibatch_size
        # Add the regularization term contribution.
        w_reg_term, b_reg_term = self.regularization_term.gradient()
        self.weights_gradient += w_reg_term; self.biases_gradient += b_reg_term
    
    def reset_gradients(self) -> None:
        """
        Sets self.weights_gradient and self.biases_gradient to a list of appropriately-shaped arrays with all zero entries.
        """
        lwws: list[Layer] = self.network.layers_with_weights
        self.weights_gradient = ListOfArrays([np.zeros_like(l.weights) for l in lwws])
        self.biases_gradient = ListOfArrays([np.zeros_like(l.biases) for l in lwws])

    def add_single_pattern_gradients_contribution(self, pattern_x: np.ndarray, pattern_y: np.ndarray, cache_error: bool) -> None:
        predicted_y: np.ndarray = self.network.compute_output(pattern_x)
        if cache_error: self.cache['minibatch_training_error'] += self.err_fun(pattern_y, predicted_y)
        self.backpropagate(pattern_y)

        w_term = ListOfArrays([np.outer(l.previous_layer.output, l.delta) for l in self.network.layers_with_weights])
        b_term = ListOfArrays([l.delta for l in self.network.layers_with_weights])

        self.weights_gradient += w_term; self.biases_gradient += b_term
    
    def backpropagate(self, pattern_y: np.ndarray) -> None:
        out_l: OutputLayer = self.network.output_layer; hid_ls: list[HiddenLayer] = self.network.hidden_layers  # Give short, convenient names to the OutputLayer and the HiddenLayers.

        out_l.delta = out_l.activation_function.derivative(out_l.linear_output) * self.err_fun.simple_gradient(pattern_y, out_l.output)  # Compute delta for the OutputLayer.
        for l in reversed(hid_ls):  # For each HiddenLayer, starting from the one closest to the OutputLayer and proceeding backwards:
            l.delta = l.activation_function.derivative(l.linear_output) * np.dot(l.next_layer.weights, l.next_layer.delta)  # Compute delta for the HiddenLayer.
        


In [56]:
layers = [InputLayer(2), HiddenLayer(3, RandomUniform(0.0), Sigmoid()), OutputLayer(2, RandomUniform(0.0), Sigmoid())]
nn = NeuralNetwork(layers)

# Initialize layers
nn.hidden_layers[0].weights = np.array(
    [[1, 0.5, 2],
    [-1, 0, -1.5]], dtype = float
)
nn.hidden_layers[0].biases = np.array([1, 2, 3], dtype=float)

nn.output_layer.weights = np.array(
    [[1, 3],
     [2, 3],
     [-1, 0]], dtype = float
)
nn.output_layer.biases = np.array([0,2], dtype=float)


# Some fake training data, now with more data
x = np.array(
    [[1, 0],
     [1, 0],
     [-1, 3]], dtype = float
)
y = np.array(
    [[1, 0],
     [1, 0],
     [2, 3]], dtype = float
)

training_alg = Backpropagation(x, y, nn, 200.0, MSE(), ThresholdOnLoss(0.00001, 10), regularization_term = Tikhonov(0.000), batch_size = 2)

old_weights = ListOfArrays([np.copy(nn.hidden_layers[0].weights), np.copy(nn.output_layer.weights)])
old_biases = ListOfArrays([np.copy(nn.hidden_layers[0].biases), np.copy(nn.output_layer.biases)])

np.random.seed(1)
training_alg.run(1)

In [60]:
intermediate_weights = training_alg.history['weights'][0]
new_weights = training_alg.history['weights'][1]

intermediate_biases = training_alg.history['biases'][0]
new_biases = training_alg.history['biases'][1]

In [61]:
print((intermediate_weights - old_weights)
      -
      ( tensor_prod_for_1010 * 1/3 + tensor_prod_for_m1323 * 1/3)*training_alg.learning_rate )

ListOfArrays[array([[-9.27071293e-04, -7.44532939e-04,  2.19349024e-08],
       [ 0.00000000e+00,  1.77635684e-15, -1.11022302e-16]]), array([[ 0.00000000e+00, -3.18824173e-03],
       [ 1.77635684e-15, -3.33229203e-03],
       [ 0.00000000e+00, -3.50297351e-03]])]


In [64]:
intermediate_nn = NeuralNetwork([
    InputLayer(2),
    HiddenLayer(3, RandomUniform(0), activation_function=Sigmoid()),
    OutputLayer(2, RandomUniform(0), activation_function=Sigmoid())
])
for i, l in enumerate(intermediate_nn.layers_with_weights):
    l.weights = intermediate_weights[i]
    l.biases = intermediate_biases[i]


training_alg2 = Backpropagation(x, y, intermediate_nn, 200.0, MSE(), ThresholdOnLoss(0.00001, 10), regularization_term = Tikhonov(0.000), batch_size = 2)

In [65]:
intermediate_nn.compute_output(x[2])

training_alg2.backpropagate(y[2])

intermediate_tensors = ListOfArrays([
    np.outer(
        l.previous_layer.output, l.delta
    ) for l in intermediate_nn.layers_with_weights
])

In [66]:
new_weights - intermediate_weights - (intermediate_tensors / 3)*training_alg.learning_rate

ListOfArrays[array([[ 2.03390184e-04,  0.00000000e+00, -2.26760412e-08],
       [-6.10170553e-04,  0.00000000e+00,  6.80281237e-08]]), array([[-3.38358582e-10, -4.39081919e-03],
       [-3.43621428e-10, -4.45911420e-03],
       [-3.95538681e-13, -5.13283518e-06]])]