In [17]:
import numpy as np

In [18]:
words = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# Deduplicate, then sort. Iterating a bare set of strings can yield a different
# order in every Python process (string hashing is randomized by default), so
# any index derived from that order would change after a kernel restart.
vocab = sorted(set(words))
vocab_size = len(vocab)
# Number of components in each word's embedding vector.
embedding_size = 10

In [19]:
# Forward lookup: each vocabulary word is paired with its position in the
# iteration order of `vocab`.
word_to_index = dict(zip(vocab, range(len(vocab))))
# Reverse lookup, obtained by flipping the (word, position) pairs above.
index_to_word = {position: token for token, position in word_to_index.items()}

In [20]:
# Seed NumPy's global generator so the random initialisation below (and every
# number derived from it) is identical after a kernel restart.
np.random.seed(0)

# Embedding table: one row of `embedding_size` values per vocabulary entry.
embedding = np.random.randn(vocab_size, embedding_size)
# Projection from an embedding vector to one score per vocabulary entry.
linear = np.random.randn(embedding_size, vocab_size)

In [21]:
def one_hot_encod(index, size=None):
    """Return a one-hot float vector with a single 1 at position `index`.

    Parameters
    ----------
    index : int
        Position to set to 1.
    size : int, optional
        Length of the returned vector. When omitted, the module-level
        `vocab_size` is used, which matches the one-argument call.

    Returns
    -------
    numpy.ndarray
        1-D array of zeros of length `size` with a 1 at `index`.

    Raises
    ------
    IndexError
        If `index` is out of bounds for the requested length.
    """
    if size is None:
        # Resolved at call time so the function follows the current vocabulary.
        size = vocab_size
    one_hot = np.zeros(size)
    one_hot[index] = 1
    return one_hot

In [22]:
def softmax(array):
    """Map a vector of scores to non-negative weights that sum to 1.

    The largest score is subtracted before exponentiating. Softmax is
    unchanged by adding a constant to every score, so the result is the same
    mathematically, but the largest exponent becomes exp(0) = 1 and `np.exp`
    can no longer overflow to `inf` (which would turn the output into `nan`).

    Parameters
    ----------
    array : array_like
        Scores; the normalisation runs over all elements.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `array` whose elements sum to 1.
    """
    array = np.asarray(array)
    if array.size == 0:
        # np.max rejects empty input; keep returning an empty array instead.
        return np.exp(array)
    exps = np.exp(array - np.max(array))
    return exps / np.sum(exps)

In [23]:
def forward(index):
    """Return the softmax-normalised scores produced for word `index`.

    Row `index` of `embedding` is multiplied by `linear`, and the resulting
    score vector is passed through `softmax`.
    """
    word_vector = embedding[index, :]
    scores = word_vector @ linear
    return softmax(scores)

In [24]:
def criterion(y, output):
    """Cross-entropy between target `y` and prediction `output`, scaled by 1/vocab_size.

    Computes ``-(1 / vocab_size) * sum_i y[i] * log(output[i])``.

    Terms whose target weight is exactly zero are left out of the sum. They
    contribute nothing mathematically, but if the matching prediction has
    underflowed to 0 the product ``0 * log(0)`` evaluates to ``nan`` and would
    poison the whole loss.

    Parameters
    ----------
    y : array_like
        Target weights (a one-hot vector in this notebook).
    output : array_like
        Predicted probabilities, same shape as `y`.

    Returns
    -------
    float
        The scaled cross-entropy; `inf` if a term with non-zero target weight
        has a predicted probability of 0.
    """
    y = np.asarray(y)
    output = np.asarray(output)
    active = y != 0
    return -1/vocab_size*np.inner(y[active], np.log(output[active]))

In [25]:
# Forward pass and loss for one training pair: the vocabulary entry at
# position 4 is used as both the input and the target.
index = 4
y = one_hot_encod(index)
o = forward(index)
loss = criterion(y, o)
loss

0.3671987876935347

## Backward
This is the forward graph, where `x` denotes the scores fed into the softmax:
```
embedding --(@ linear)--> x --(softmax)--> o --(criterion)--> loss
```

$L = -\frac{1}{V} y^T \log(o)$

$\frac{\partial L}{\partial o} = -\frac{1}{V} \, y / o$ (element-wise division)

In [26]:
# Gradient of the loss w.r.t. the softmax output. The division is
# element-wise, so only the entry where y is 1 is non-zero.
dL_do = - 1/vocab_size*y/o
dL_do

array([-0.        , -0.        , -0.        , -0.        , -2.35878996,
       -0.        , -0.        , -0.        ])

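Computing $\frac{\partial L}{\partial x} = \frac{\partial L}{\partial o}\frac{\partial o}{\partial x}$ with the softmax Jacobian $\frac{\partial o_i}{\partial x_j} = o_i(\delta_{ij} - o_j)$ simplifies considerably, because the $o_i$ cancel and the entries of $y$ sum to 1: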

$\frac{\partial L}{\partial x} = \frac{1}{V} (o-y)$

In [27]:
# Gradient w.r.t. the pre-softmax scores x, using the closed form (o - y) / V.
dL_dx = 1/vocab_size*(o-y)
dL_dx

array([ 0.0075362 ,  0.00585483,  0.00417056,  0.01973817, -0.11837584,
        0.01104708,  0.05633672,  0.01369228])

In [28]:
# Back-propagate through x = e @ linear: the gradient w.r.t. the embedding
# vector e is linear @ dL_dx, one value per embedding dimension.
dL_de = linear @ dL_dx
dL_de

array([-0.06941078, -0.34673677, -0.10697694, -0.23240771,  0.19650469,
        0.09956768, -0.09942405, -0.02107459, -0.03876713,  0.03872456])

Gradient descent step

In [29]:
learning_rate = 1e-3

# In-place update of the single embedding row used in the forward pass.
# Re-running this cell subtracts the same gradient again.
embedding[index,:] -= learning_rate*dL_de

In [30]:
def step(index, learning_rate=1.):
    """Run one gradient-descent update on the embedding row of word `index`.

    The word serves as both input and target. The loss measured before the
    update is printed, then row `index` of `embedding` is modified in place;
    `linear` is not changed. Nothing is returned.
    """
    target = one_hot_encod(index)
    probs = forward(index)

    loss = criterion(target, probs)
    print(f"Loss - {loss:.3f}")

    # Gradient of the loss with respect to the embedding vector.
    embedding_grad = linear @ (probs - target) / vocab_size
    embedding[index, :] -= learning_rate * embedding_grad

In [31]:
# Apply 20 updates to the same word with step()'s default learning rate,
# then show the resulting predicted distribution.
for _ in range(20):
    step(index)
forward(index)

Loss - 0.367
Loss - 0.144
Loss - 0.051
Loss - 0.028
Loss - 0.019
Loss - 0.015
Loss - 0.012
Loss - 0.010
Loss - 0.009
Loss - 0.008
Loss - 0.007
Loss - 0.006
Loss - 0.006
Loss - 0.005
Loss - 0.005
Loss - 0.004
Loss - 0.004
Loss - 0.004
Loss - 0.004
Loss - 0.003


array([5.89988700e-04, 2.54744803e-03, 4.96841740e-03, 3.22029883e-03,
       9.74169910e-01, 7.97206092e-04, 9.90784411e-03, 3.79888652e-03])

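It learns very well: after 20 steps almost all of the predicted probability mass sits on the target index. Keep in mind that this fits a single (input, target) pair and only updates one row of `embedding`; `linear` is never trained.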