In [12]:
import numpy as np
import random

In [13]:
corpus = "the quick brown fox jumps over the lazy dog".split()

In [14]:
vocab = set(corpus)
vocab_size = len(vocab)
word2idx = {word : i for i, word in enumerate(vocab)}
idx2word = {i : word for word, i in word2idx.items()}

In [15]:
def generate_training_data(corpus, window_size):
    data = []
    for i in range(window_size, len(corpus)-window_size):
        context = []
        for j in range(i-window_size, i+window_size+1):
            if j != i:
                context.append(word2idx[corpus[j]])
        target = word2idx[corpus[i]]
        data.append((context,target))
    return data

window_size = 2
training_data = generate_training_data(corpus, window_size)

In [16]:
def softmax(x):
    e_x = np.exp(x-np.max(x))
    return e_x / np.sum(e_x, axis = 0)

#Initializes weight
embedding_dim = 5
W1 = np.random.randn(vocab_size, embedding_dim)
W2 = np.random.randn(embedding_dim, vocab_size)

#Training rate
learning_rate = 0.01
epochs = 1000

for epoch in range(epochs):
    loss = 0
    for context, target in training_data:
        #average
        x = np.mean(W1[context], axis = 0).reshape(embedding_dim, 1)

        #forward pass
        z = np.dot(W2.T, x)
        y_pred = softmax(z)

        #true label
        y_true = np.zeros((vocab_size, 1))
        y_true[target] = 1

        # error
        e = y_pred - y_true
        loss += -np.sum(y_true * np.log(y_pred + 1e-9))

        #gradient
        dw2 = np.dot(x, e.T)
        dw1_context = np.dot(W2, e)

        #update weight
        for c in context:
            W1 -= learning_rate *  dw1_context.flatten()
            W2 -= learning_rate * dw2

    if epoch % 200 == 0:
            print(f"epoch : {epoch}, Loss:{loss:.4f} ")
        

epoch : 0, Loss:14.6189 
epoch : 200, Loss:4.9356 
epoch : 400, Loss:3.8818 
epoch : 600, Loss:3.2123 
epoch : 800, Loss:2.7591 


In [17]:
def get_vector(word):
    return W1[word2idx[word]]

print("\nWord Embeddings (sample):")
for w in ["quick", "fox", "dog"]:
    print(f"{w}: {get_vector(w)}")


Word Embeddings (sample):
quick: [ 0.53309606 -1.59600914  1.85117536 -0.42553776 -1.87424922]
fox: [-0.34822859 -1.06694149 -1.35956686 -0.8190879  -0.41388069]
dog: [ 0.13667406 -1.7527321  -0.00606207 -1.30964627 -0.72173492]
