In [5]:
import numpy as np

# Fix the global NumPy seed so the random weight initialisation is repeatable.
np.random.seed(1)

# Toy vocabulary plus lookup tables in both directions (word <-> integer index).
vocab = ['I', 'love', 'deep', 'learning']
ix_to_word = dict(enumerate(vocab))
word_to_ix = {word: ix for ix, word in ix_to_word.items()}

def one_hot(index, size):
    """Return a float vector of zeros of length ``size`` with a 1 at ``index``."""
    encoded = np.zeros(size)
    encoded[index] = 1
    return encoded

# Model dimensions and optimiser step size.
vocab_size, hidden_size = len(vocab), 3
learning_rate = 0.01

# Small random initial weights. The draw order (Wx, Wh, Wy) is kept so the
# seeded random stream yields the same starting point on every run.
Wx = 0.1 * np.random.randn(hidden_size, vocab_size)   # input  -> hidden
Wh = 0.1 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
Wy = 0.1 * np.random.randn(vocab_size, hidden_size)   # hidden -> output

# Single training example: three context words and the word that follows them.
inputs = [word_to_ix[word] for word in ('I', 'love', 'deep')]
target = word_to_ix['learning']

# Adam optimiser state: first (m*) and second (v*) moment estimates, one
# zero-initialised buffer per weight matrix.
mWx, mWh, mWy = (np.zeros_like(w) for w in (Wx, Wh, Wy))
vWx, vWh, vWy = (np.zeros_like(w) for w in (Wx, Wh, Wy))
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-8
t = 0  # number of Adam steps taken so far

# Train on the single (context -> next word) example: forward pass through the
# recurrent layer, softmax cross-entropy on the final hidden state,
# backpropagation through time, then one Adam update per epoch.
seq_len = len(inputs)  # derive the sequence length instead of hard-coding 3

for epoch in range(1000):
    t += 1  # Adam step counter, used for the bias correction below

    # ---- Forward pass ----
    # hs[k] is the hidden state after consuming input k; hs[-1] is the
    # all-zero initial state.
    hs = {-1: np.zeros((hidden_size,))}
    xs = []
    for step, word_ix in enumerate(inputs):
        x = one_hot(word_ix, vocab_size)
        xs.append(x)
        hs[step] = np.tanh(np.dot(Wx, x) + np.dot(Wh, hs[step - 1]))

    h_last = hs[seq_len - 1]
    y_pred = np.dot(Wy, h_last)

    # Softmax with the maximum logit subtracted first: mathematically the same
    # distribution, but np.exp can no longer overflow for large logits.
    exp_scores = np.exp(y_pred - np.max(y_pred))
    y_probs = exp_scores / np.sum(exp_scores)

    # Cross-entropy loss for the single target word.
    loss = -np.log(y_probs[target])
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

    # ---- Backward pass ----
    # Gradient of the loss w.r.t. the logits is (softmax - one_hot(target)).
    dy = y_probs.copy()
    dy[target] -= 1

    dWy = np.outer(dy, h_last)
    # dh holds the gradient w.r.t. the pre-activation of the current step;
    # (1 - h**2) is the derivative of tanh.
    dh = np.dot(Wy.T, dy) * (1 - h_last**2)

    dWx = np.zeros_like(Wx)
    dWh = np.zeros_like(Wh)
    for step in reversed(range(seq_len)):
        dWx += np.outer(dh, xs[step])
        if step > 0:
            # Step 0 only sees the constant zero initial state, so it adds
            # nothing to dWh and there is no earlier step to propagate to.
            dWh += np.outer(dh, hs[step - 1])
            dh = np.dot(Wh.T, dh) * (1 - hs[step - 1]**2)

    # ---- Adam update ----
    # Exponential moving averages of the gradient and the squared gradient.
    mWx = beta1 * mWx + (1 - beta1) * dWx
    mWh = beta1 * mWh + (1 - beta1) * dWh
    mWy = beta1 * mWy + (1 - beta1) * dWy

    vWx = beta2 * vWx + (1 - beta2) * (dWx**2)
    vWh = beta2 * vWh + (1 - beta2) * (dWh**2)
    vWy = beta2 * vWy + (1 - beta2) * (dWy**2)

    # Bias correction for the zero-initialised moments; the denominators are
    # the same for every parameter, so compute them once per step.
    m_correction = 1 - beta1**t
    v_correction = 1 - beta2**t

    mWx_hat = mWx / m_correction
    mWh_hat = mWh / m_correction
    mWy_hat = mWy / m_correction

    vWx_hat = vWx / v_correction
    vWh_hat = vWh / v_correction
    vWy_hat = vWy / v_correction

    Wx -= learning_rate * mWx_hat / (np.sqrt(vWx_hat) + epsilon)
    Wh -= learning_rate * mWh_hat / (np.sqrt(vWh_hat) + epsilon)
    Wy -= learning_rate * mWy_hat / (np.sqrt(vWy_hat) + epsilon)

# Inference: run the trained network over the context words once more and
# report the most probable next word.
hs = {-1: np.zeros((hidden_size,))}  # hs[-1] is the all-zero initial state
for step, word_ix in enumerate(inputs):
    x = one_hot(word_ix, vocab_size)
    hs[step] = np.tanh(np.dot(Wx, x) + np.dot(Wh, hs[step - 1]))

# Use the last hidden state, whatever the length of `inputs` is.
y_pred = np.dot(Wy, hs[len(inputs) - 1])

# Softmax with the maximum logit subtracted first so np.exp cannot overflow;
# the resulting probabilities (and their argmax) are mathematically unchanged.
exp_scores = np.exp(y_pred - np.max(y_pred))
y_probs = exp_scores / np.sum(exp_scores)

pred_index = np.argmax(y_probs)
print("\nPredicted word:", ix_to_word[pred_index])


Epoch 0, Loss: 1.3981
Epoch 100, Loss: 0.0314
Epoch 200, Loss: 0.0114
Epoch 300, Loss: 0.0063
Epoch 400, Loss: 0.0041
Epoch 500, Loss: 0.0029
Epoch 600, Loss: 0.0022
Epoch 700, Loss: 0.0017
Epoch 800, Loss: 0.0014
Epoch 900, Loss: 0.0012

Predicted word: learning
