In [12]:
import numpy as np

np.random.seed(42)
vocab = ["Reading", "books", "enhances", "knowledge"]
vocab_size = len(vocab)
hidden_size = 3 
output_size = vocab_size 


In [13]:
word_to_one_hot = {word: np.eye(vocab_size)[i] for i, word in enumerate(vocab)}
sequence = ["Reading", "books", "enhances", "knowledge"]
input_seq = [word_to_one_hot[word] for word in sequence[:3]]
target = word_to_one_hot["knowledge"] 

W_h = np.random.randn(hidden_size, hidden_size) * 0.01  
W_x = np.random.randn(hidden_size, vocab_size) * 0.01  
W_y = np.random.randn(vocab_size, hidden_size) * 0.01  

In [14]:
# Hyperparameters
learning_rate = 0.01
epochs = 1000

# Activation functions
def tanh(x):
    return np.tanh(x)

def softmax(x):
    exp_x = np.exp(x - np.max(x))  #
    return exp_x / np.sum(exp_x)

In [15]:
def forward_propagation(inputs):
    h_prev = np.zeros((hidden_size, 1))
    h_states = [] 
    a_states = [] 
    outputs = [] 
    
    for x_t in inputs:
        x_t = x_t.reshape(-1, 1)  
        a_t = np.dot(W_h, h_prev) + np.dot(W_x, x_t)
        h_t = tanh(a_t)
        y_t = softmax(np.dot(W_y, h_t))
        
        h_states.append(h_t)
        a_states.append(a_t)
        outputs.append(y_t)
        h_prev = h_t  
    
    return h_states, a_states, outputs, h_prev

In [16]:
# (Mean Squared Error)
def compute_loss(predicted, target):
    return np.mean((predicted - target) ** 2)

In [17]:
def backpropagation(inputs, target, h_states, a_states, outputs):
    global W_h, W_x, W_y
    target = target.reshape(-1, 1) 
    
    dW_h = np.zeros_like(W_h)
    dW_x = np.zeros_like(W_x)
    dW_y = np.zeros_like(W_y)
    dh_next = np.zeros((hidden_size, 1))
    
    for t in range(len(inputs) - 1, -1, -1):
        y_t = outputs[t]
        h_t = h_states[t]
        a_t = a_states[t]
        x_t = inputs[t].reshape(-1, 1)
        dy = 2 * (y_t - target) / len(y_t)  
        dW_y += np.dot(dy, h_t.T)
        dh = np.dot(W_y.T, dy) + dh_next
        da = dh * (1 - h_t ** 2) 
        h_prev = h_states[t-1] if t > 0 else np.zeros((hidden_size, 1))
        dW_h += np.dot(da, h_prev.T)
        dW_x += np.dot(da, x_t.T)
        dh_next = np.dot(W_h.T, da)
        
    for dparam in [dW_h, dW_x, dW_y]:
        np.clip(dparam, -5, 5, out=dparam)
    
    return dW_h, dW_x, dW_y

In [18]:
# Training 
for epoch in range(epochs):
    h_states, a_states, outputs, h_prev = forward_propagation(input_seq)
    loss = compute_loss(outputs[-1], target)
    dW_h, dW_x, dW_y = backpropagation(input_seq, target, h_states, a_states, outputs)
    W_h -= learning_rate * dW_h
    W_x -= learning_rate * dW_x
    W_y -= learning_rate * dW_y
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.6f}")


Epoch 0, Loss: 0.187500
Epoch 100, Loss: 0.187500
Epoch 200, Loss: 0.187500
Epoch 300, Loss: 0.187500
Epoch 400, Loss: 0.187501
Epoch 500, Loss: 0.187511
Epoch 600, Loss: 0.187758
Epoch 700, Loss: 0.195221
Epoch 800, Loss: 0.267010
Epoch 900, Loss: 0.322660


In [19]:
h_states, a_states, outputs, h_prev = forward_propagation(input_seq)
predicted = outputs[-1].flatten()
predicted_word = vocab[np.argmax(predicted)]
print("\nInput sequence:", sequence[:3])
print("Target word:", sequence[3])
print("Predicted word:", predicted_word)
print("Prediction probabilities:", predicted)


 ['Reading', 'books', 'enhances']
Target word: knowledge
Predicted word: knowledge
Prediction probabilities: [0.0220149  0.02201583 0.02189233 0.93407694]
