In [5]:
import numpy as np
import import_ipynb
from ANN import Layer_Dense, Activation_Softmax, Loss_CategoricalCrossentropy, Activation_Softmax_Loss_CategoricalCrossentropy

In [6]:
class RNN:
    """Minimal vanilla recurrent network trained with backpropagation through time.

    All vectors are handled as columns: the biases are created with shapes
    ``(hidden_size, 1)`` and ``(output_size, 1)``, so each per-step input is
    expected to be an ``(input_size, 1)`` array.

    Attributes:
        hidden_size: Number of hidden units.
        Wxh: Input -> hidden weights, shape ``(hidden_size, input_size)``.
        Whh: Hidden -> hidden (recurrent) weights, shape ``(hidden_size, hidden_size)``.
        Why: Hidden -> output weights, shape ``(output_size, hidden_size)``.
        bh: Hidden bias, shape ``(hidden_size, 1)``.
        by: Output bias, shape ``(output_size, 1)``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the parameters: small random weights (randn * 0.01), zero biases."""
        # Hyperparameters
        self.hidden_size = hidden_size

        # Weights - initialized with small random numbers
        self.Wxh = np.random.randn(hidden_size, input_size) * 0.01   # input  -> hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
        self.Why = np.random.randn(output_size, hidden_size) * 0.01  # hidden -> output

        # Biases
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))

    def forward(self, inputs):
        """Run the network over a whole sequence.

        Args:
            inputs: Sequence of per-time-step input vectors (one-hot vectors or
                features), indexed ``inputs[0] .. inputs[len(inputs) - 1]``.

        Returns:
            Tuple ``(y, h)`` of dicts keyed by time step. ``y[t]`` is the raw
            affine output ``Why @ h[t] + by`` (no softmax is applied here).
            ``h[t]`` is the hidden state; ``h[-1]`` is the all-zero initial state.

        Side effects:
            Stores ``inputs`` as ``self.last_inputs`` and the hidden states as
            ``self.last_hs`` for use by ``backward``. ``self.z_h`` holds the
            pre-activation of the most recent step only.
        """
        h = {}  # hidden states for each time step
        y = {}  # outputs for each time step

        # The very first hidden state is all zeros
        h[-1] = np.zeros((self.hidden_size, 1))

        for t in range(len(inputs)):
            # h_t = tanh(Wxh x_t + Whh h_{t-1} + bh): current input mixed with previous memory
            self.z_h = np.dot(self.Wxh, inputs[t]) + np.dot(self.Whh, h[t - 1]) + self.bh
            h[t] = np.tanh(self.z_h)

            # y_t = Why h_t + by: the (unnormalised) prediction at this step
            y[t] = np.dot(self.Why, h[t]) + self.by

        self.last_inputs = inputs
        self.last_hs = h
        return y, h

    def backward(self, targets, outputs, hs, learning_rate, clip_value=5):
        """Backpropagate through time and apply one gradient-descent update in place.

        Must be called after ``forward``: the sequence length and the inputs are
        read from ``self.last_inputs``.

        Args:
            targets: ``targets[t]`` is the index of the correct class at step ``t``.
            outputs: ``outputs[t]`` is the prediction at step ``t``. The gradient
                used is ``outputs[t] - one_hot(targets[t])``, which is the
                softmax + cross-entropy gradient only when ``outputs[t]`` are
                softmax probabilities.
                NOTE(review): ``forward`` returns raw scores, so the caller
                presumably has to apply softmax before passing them here - confirm.
            hs: Hidden states as returned by ``forward`` (including key ``-1``).
            learning_rate: Step size for the parameter update.
            clip_value: Each accumulated gradient is clipped element-wise to
                ``[-clip_value, clip_value]`` to limit exploding gradients.
                Pass ``None`` to disable clipping.

        Returns:
            None. ``Wxh``, ``Whh``, ``Why``, ``bh`` and ``by`` are updated in place.
        """
        # Gradient accumulators, one per parameter
        dWxh, dWhh, dWhy = np.zeros_like(self.Wxh), np.zeros_like(self.Whh), np.zeros_like(self.Why)
        dbh, dby = np.zeros_like(self.bh), np.zeros_like(self.by)

        # Gradient flowing in from the "future" step. Sized from the hidden bias
        # rather than hs[0] so that an empty sequence is a no-op, not a KeyError.
        dh_next = np.zeros_like(self.bh)

        # Loop backwards through time
        for t in reversed(range(len(self.last_inputs))):
            # 1. Error on the output at time t: prediction - one-hot truth
            dy = np.copy(outputs[t])
            dy[targets[t]] -= 1

            # 2. Gradients for the output layer
            dWhy += np.dot(dy, hs[t].T)
            dby += dy

            # 3. Error on the hidden state: from this step's output plus from the future
            dh = np.dot(self.Why.T, dy) + dh_next

            # 4. Back through tanh, whose derivative is (1 - h^2)
            dh_raw = (1 - hs[t] * hs[t]) * dh

            # 5. Gradients for the hidden layer (hs[t - 1] is hs[-1] at t == 0)
            dbh += dh_raw
            dWxh += np.dot(dh_raw, self.last_inputs[t].T)
            dWhh += np.dot(dh_raw, hs[t - 1].T)

            # 6. Pass the gradient to the previous time step
            dh_next = np.dot(self.Whh.T, dh_raw)

        # Clip in place to prevent exploding gradients
        if clip_value is not None:
            for dparam in (dWxh, dWhh, dWhy, dbh, dby):
                np.clip(dparam, -clip_value, clip_value, out=dparam)

        # Plain gradient-descent update
        self.Wxh -= learning_rate * dWxh
        self.Whh -= learning_rate * dWhh
        self.Why -= learning_rate * dWhy
        self.bh -= learning_rate * dbh
        self.by -= learning_rate * dby

In [7]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, computed without overflow.

    The naive form evaluates ``exp(-x)``, which overflows (RuntimeWarning, or
    FloatingPointError under ``np.errstate(over="raise")``) for large negative
    ``x``. Here ``exp`` is only ever applied to a non-positive value:

        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))

    Args:
        x: NumPy array or scalar.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    z = np.exp(-np.abs(x))  # always in (0, 1], so it cannot overflow
    return np.where(x >= 0, 1.0, z) / (1.0 + z)

class LSTM_Cell:
    """One LSTM time step with all four gate transforms packed into a single matrix.

    ``W`` has ``4 * hidden_size`` rows, laid out in blocks of ``hidden_size`` as
    forget, input, candidate, output. Its ``input_size + hidden_size`` columns
    multiply the vertical stack ``[h_prev; x]`` (previous hidden state on top).
    """

    def __init__(self, input_size, hidden_size):
        """Store the sizes and create ``W`` (randn * 0.01) and a zero bias ``b``."""
        self.input_size = input_size
        self.hidden_size = hidden_size

        gate_rows = 4 * hidden_size
        stacked_cols = input_size + hidden_size
        self.W = np.random.randn(gate_rows, stacked_cols) * 0.01
        self.b = np.zeros((gate_rows, 1))

    def forward(self, x, h_prev, c_prev):
        """Advance the cell by one step.

        Args:
            x: Current input. Assumed to be a column of shape (input_size, 1),
                which is what the shapes of ``W`` and ``b`` require.
            h_prev: Previous hidden state, assumed (hidden_size, 1).
            c_prev: Previous cell state, assumed (hidden_size, 1).

        Returns:
            ``(h_next, c_next, cache)`` where ``cache`` is the tuple
            ``(f, i, c_tilde, o, combined, c_next)`` of gate activations, the
            stacked input and the new cell state.
        """
        n = self.hidden_size

        # Previous hidden state goes on top, current input underneath
        stacked = np.vstack((h_prev, x))

        # A single affine map produces the pre-activations of all four gates
        pre = np.dot(self.W, stacked) + self.b
        pre_f, pre_i, pre_g, pre_o = (pre[k * n:(k + 1) * n] for k in range(4))

        forget_gate = sigmoid(pre_f)
        input_gate = sigmoid(pre_i)
        candidate = np.tanh(pre_g)
        output_gate = sigmoid(pre_o)

        # Cell state: keep part of the old memory, write part of the candidate
        cell_state = forget_gate * c_prev + input_gate * candidate
        # Hidden state: gated, squashed view of the cell state
        hidden_state = output_gate * np.tanh(cell_state)

        cache = (forget_gate, input_gate, candidate, output_gate, stacked, cell_state)
        return hidden_state, cell_state, cache

In [8]:
# NOTE(review): a `Layer_Dense` is also imported from `ANN` at the top of this
# file; this definition shadows that import from here on.
class Layer_Dense:
    """Fully connected layer computing ``inputs @ weights + biases``.

    ``weights`` has shape (n_inputs, n_neurons) so that a (samples, n_inputs)
    batch can be multiplied from the left; ``biases`` has shape (1, n_neurons).
    """

    def __init__(self, n_inputs, n_neurons):
        """Draw small random weights (to break symmetry) and zero the biases."""
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute and return the layer output, remembering ``inputs`` for ``backward``."""
        self.inputs = inputs
        weighted = np.dot(inputs, self.weights)
        self.output = weighted + self.biases
        return self.output

    def backward(self, dvalues):
        """Compute gradients from the upstream error signal.

        Args:
            dvalues: Gradient of the loss with respect to this layer's output.

        Sets ``dweights`` (X^T . dL/dZ), ``dbiases`` (dL/dZ summed over axis 0)
        and ``dinputs`` (dL/dZ . W^T, the signal for the previous layer).
        Returns nothing.
        """
        cached_inputs = self.inputs

        # How much each weight contributed to the error
        self.dweights = np.dot(cached_inputs.T, dvalues)

        # Bias gradient: column sums, kept 2-D to match the biases' shape
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Error signal handed back to the previous layer
        self.dinputs = np.dot(dvalues, self.weights.T)

In [9]:
# --- Model setup ---
hidden_size = 16
vocab_size = 4  # h, e, l, o
lstm = LSTM_Cell(vocab_size, hidden_size)
output_layer = Layer_Dense(hidden_size, vocab_size)  # dense read-out on top of the hidden state
loss_fn = Activation_Softmax_Loss_CategoricalCrossentropy()  # created here but not used in this cell

# --- Data: one-hot column vectors for the input characters h, e, l, l ---
chars = "helo"  # the unique characters in our vocab; index == class id
inputs = [
    np.array([[int(row == char_id)] for row in range(vocab_size)])
    for char_id in (0, 1, 2, 2)
]
targets = [1, 2, 2, 3]  # e, l, l, o

# --- Forward pass only ---
# No loss is computed and no weights are updated in this loop, so the printed
# predictions come from the randomly initialised weights.
h = np.zeros((hidden_size, 1))
c = np.zeros((hidden_size, 1))

print("Predicting the word 'hello'...")
for t, x_t in enumerate(inputs):
    # 1. LSTM step: carry hidden and cell state forward
    h, c, cache = lstm.forward(x_t, h, c)

    # 2. Read-out: the dense layer expects a row vector, hence the transpose
    dense_out = output_layer.forward(h.T)

    # 3. Softmax by hand for the printout (max subtracted for stability)
    exp_values = np.exp(dense_out - dense_out.max())
    probs = exp_values / exp_values.sum()

    idx = probs.argmax()
    print(f"Input: {'hello'[t]} -> Predicted Next: '{chars[idx]}'")


Predicting the word 'hello'...
Input: h -> Predicted Next: 'l'
Input: e -> Predicted Next: 'l'
Input: l -> Predicted Next: 'e'
Input: l -> Predicted Next: 'e'
