In [1]:
import numpy as np

In [32]:
words = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
vocab = set(words)
vocab_size = len(vocab)
embedding_size = 10

In [16]:
word_to_index = { word: i for i, word in enumerate(vocab)}
index_to_word = { i: word for word, i in word_to_index.items()}

In [18]:
embedding = np.random.randn(vocab_size, emb_size)
linear = np.random.randn(emb_size, vocab_size)

In [27]:
def one_hot_encod(index):
    one_hot = np.zeros(vocab_size)
    one_hot[index] = 1
    return one_hot

In [21]:
def softmax(array):
    array = np.exp(array)
    return array / np.sum(array)

In [24]:
def forward(index):
    return softmax(embedding[index,:] @ linear)

In [33]:
def criterion(y, output):
    return -1/vocab_size*np.inner(y, np.log(output))

In [36]:
index = 4
y = one_hot_encod(index)
o = forward(index)
loss = criterion(y, o)
loss

0.5382935797309683

## Backward
This is the forward graph
```
embedding --(@ linear)--> x --(softmax)--> o --(criterion)--> loss
```

$L = -\frac{1}{V}y^Tlog(o)$

$\frac{\partial L}{\partial o} = -\frac{1}{V} y / o$ (term by term division)

In [49]:
dL_do = - 1/vocab_size*y/o
dL_do

array([-0.        , -0.        , -0.        , -0.        , -9.27114692,
       -0.        , -0.        , -0.        ])

In [50]:
dL_dx = dL_do @ np.outer(o, 1-o)
dL_dx

array([-0.1246947 , -0.12482869, -0.03213739, -0.124979  , -0.12331466,
       -0.09907023, -0.12169376, -0.12428156])

In [53]:
dL_de = linear @ dL_dx
dL_de

array([ 0.15909719,  0.49766231,  0.27669867, -0.14649541,  0.05396358,
        0.68517847, -0.21329033, -0.10972845,  0.18132987, -0.02091397])

Gradient descent step

In [54]:
learning_rate = 1e-3

embedding[index,:] -= learning_rate*dL_de