# In[1]:
import numpy as np
import import_ipynb
from ANN import Layer_Dense, Activation_Softmax, Loss_CategoricalCrossentropy, Activation_Softmax_Loss_CategoricalCrossentropy

# In[2]:
class RNN:
    """Minimal single-layer recurrent network trained with backprop through time.

    Per time step the network computes::

        h_t = tanh(Wxh @ x_t + Whh @ h_{t-1} + bh)
        y_t = Why @ h_t + by

    All vectors are handled as column vectors: the biases are created with
    shape ``(size, 1)``, so each input ``x_t`` is expected to have shape
    ``(input_size, 1)`` for the sums above to line up.

    Attributes:
        hidden_size: Number of hidden units.
        Wxh: Input -> hidden weights, shape ``(hidden_size, input_size)``.
        Whh: Hidden -> hidden (recurrent) weights,
            shape ``(hidden_size, hidden_size)``.
        Why: Hidden -> output weights, shape ``(output_size, hidden_size)``.
        bh: Hidden bias, shape ``(hidden_size, 1)``.
        by: Output bias, shape ``(output_size, 1)``.
        last_inputs: The ``inputs`` object given to the most recent
            ``forward`` call (set by ``forward``, read by ``backward``).
        last_hs: Hidden states produced by the most recent ``forward`` call.
        z_h: Hidden pre-activation of the last processed time step (only set
            once ``forward`` has processed at least one step).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the parameters: small random weights and zero biases.

        Args:
            input_size: Dimensionality of each input vector.
            hidden_size: Number of hidden units.
            output_size: Dimensionality of each output vector.
        """
        # Hyperparameters
        self.hidden_size = hidden_size

        # Weights are drawn from a standard normal and scaled by 0.01 so the
        # initial pre-activations stay in tanh's near-linear region.
        self.Wxh = np.random.randn(hidden_size, input_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(output_size, hidden_size) * 0.01

        # Biases
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))

    def forward(self, inputs):
        """Run the network over a whole sequence.

        Args:
            inputs: Indexable sequence (e.g. a list) with one input vector per
                time step; ``inputs[t]`` is expected to be a column vector of
                shape ``(input_size, 1)``.

        Returns:
            A tuple ``(y, h)`` of dicts keyed by time step:

            * ``y[t]`` is the raw (un-normalised) output ``Why @ h_t + by``;
              no softmax is applied here.
            * ``h[t]`` is the hidden state after step ``t``. ``h[-1]`` is the
              all-zero initial state, so ``h`` has one more entry than ``y``.

            For an empty sequence this is ``({}, {-1: zeros})``.
        """
        # Key -1 holds the initial state so that h[t - 1] is valid at t == 0.
        h = {-1: np.zeros((self.hidden_size, 1))}
        y = {}

        # Index-based access is kept on purpose: it works for any container
        # that supports len() and integer indexing, not only lists.
        for t in range(len(inputs)):
            # Combine the current input with the previous hidden state.
            # z_h is kept as an attribute for backward compatibility; after
            # the loop it holds the last step's pre-activation only.
            self.z_h = (
                np.dot(self.Wxh, inputs[t])
                + np.dot(self.Whh, h[t - 1])
                + self.bh
            )
            h[t] = np.tanh(self.z_h)

            # Prediction (logits) at this step.
            y[t] = np.dot(self.Why, h[t]) + self.by

        # Cached for backward(), which walks the same sequence in reverse.
        self.last_inputs = inputs
        self.last_hs = h
        return y, h

    def backward(self, targets, outputs, hs, learning_rate, clip_value=5):
        """Backpropagate through time and apply one gradient-descent update.

        The sequence length and the inputs are taken from the most recent
        ``forward`` call (``self.last_inputs``), so ``forward`` must have been
        called first.

        Args:
            targets: Per-step targets; ``targets[t]`` is used to index the row
                of ``outputs[t]`` that belongs to the true class
                (presumably an integer class index).
            outputs: Per-step prediction column vectors. The error term is
                computed as ``outputs[t] - one_hot(targets[t])``, which is the
                softmax + cross-entropy gradient with respect to the logits.
                NOTE(review): that formula assumes ``outputs[t]`` already holds
                softmax probabilities; ``forward`` returns raw logits, so the
                caller is expected to normalise them first - confirm at the
                call site.
            hs: Hidden states as returned by ``forward`` (a mapping that
                includes key ``-1`` for the initial state).
            learning_rate: Step size of the parameter update.
            clip_value: Gradients are clipped element-wise to
                ``[-clip_value, clip_value]`` before the update to limit
                exploding gradients. ``None`` disables clipping.

        Returns:
            None. The parameters are updated in place.
        """
        # Gradient accumulators, one per parameter.
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)

        # Gradient flowing in from the "future" step. Shaped after bh rather
        # than hs[0] so that an empty sequence (where hs only has key -1)
        # does not raise KeyError.
        dh_next = np.zeros_like(self.bh)

        # Loop backwards through time.
        for t in reversed(range(len(self.last_inputs))):
            # 1. Error on the output at time t: prediction - one-hot truth.
            #    Copy first so the caller's outputs are not modified.
            dy = np.copy(outputs[t])
            dy[targets[t]] -= 1

            # 2. Gradients for the output layer.
            dWhy += np.dot(dy, hs[t].T)
            dby += dy

            # 3. Error on the hidden state: contribution from this step's
            #    output plus the contribution from the next hidden state.
            dh = np.dot(self.Why.T, dy) + dh_next

            # 4. Backprop through tanh: d/dz tanh(z) = 1 - tanh(z)^2 = 1 - h^2.
            dh_raw = (1 - hs[t] * hs[t]) * dh

            # 5. Gradients for the hidden layer. At t == 0, hs[t - 1] is the
            #    initial state stored under key -1.
            dbh += dh_raw
            dWxh += np.dot(dh_raw, self.last_inputs[t].T)
            dWhh += np.dot(dh_raw, hs[t - 1].T)

            # 6. Pass the gradient on to the previous time step.
            dh_next = np.dot(self.Whh.T, dh_raw)

        # Clip in place to prevent exploding gradients.
        if clip_value is not None:
            for dparam in (dWxh, dWhh, dWhy, dbh, dby):
                np.clip(dparam, -clip_value, clip_value, out=dparam)

        # Update weights using plain gradient descent.
        self.Wxh -= learning_rate * dWxh
        self.Whh -= learning_rate * dWhh
        self.Why -= learning_rate * dWhy
        self.bh -= learning_rate * dbh
        self.by -= learning_rate * dby