In [4]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F

Nano corpus:

In [5]:
# Toy training corpus: seven short sentences, one string per sentence.
corpus = [
  "he is a king",
  "she is a queen",
  "he is a man",
  "she is a woman",
  "Warsaw is Poland capital",
  "Berlin is German capital",
  "Paris is France capital",
]

Tokenize the corpus and build the word2vec vocabulary:

In [6]:
def tokenize_corpus(corpus):
  """Lower-case every text in `corpus` and split it on whitespace.

  Returns a list holding one list of tokens per input text, in input order.
  """
  tokenized = []
  for sentence in corpus:
    lowered = sentence.lower()
    tokenized.append(lowered.split())
  return tokenized

# Tokenize every sentence of the corpus; the bare name on the last line
# makes the notebook display the result.
tokenized_corpus = tokenize_corpus(corpus)
tokenized_corpus

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'german', 'capital'],
 ['paris', 'is', 'france', 'capital']]

In [7]:
# Unique tokens in first-seen order. dict.fromkeys preserves insertion order
# and deduplicates in linear time, instead of scanning the growing list once
# per token.
vocabulary = list(dict.fromkeys(
  token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions: token -> index and index -> token.
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)
vocabulary_size, idx2word

(15,
 {0: 'he',
  1: 'is',
  2: 'a',
  3: 'king',
  4: 'she',
  5: 'queen',
  6: 'man',
  7: 'woman',
  8: 'warsaw',
  9: 'poland',
  10: 'capital',
  11: 'berlin',
  12: 'german',
  13: 'paris',
  14: 'france'})

Generate (`center word`, `context word`) pairs. The context window size is 2.

In [8]:
# Build (center index, context index) training pairs: every token is paired
# with each neighbour at most `window_size` positions away in its sentence.
window_size = 2
pairs = []
for sentence in tokenized_corpus:
  indices = [word2idx[word] for word in sentence]
  for center_word_pos, center_word_idx in enumerate(indices):
    # Clamp the window to the sentence bounds rather than generating
    # out-of-range positions and skipping them.
    first = max(0, center_word_pos - window_size)
    stop = min(len(indices), center_word_pos + window_size + 1)
    for context_word_pos in range(first, stop):
      if context_word_pos != center_word_pos:
        pairs.append((center_word_idx, indices[context_word_pos]))

# One row per pair: column 0 is the center word index, column 1 the context.
idx_pairs = np.array(pairs)
# The array used to be bound to the misspelled name `idx_paris`; keep that
# name as an alias in case other cells still refer to it.
idx_paris = idx_pairs
idx_pairs[:5]

array([[0, 1],
       [0, 2],
       [1, 0],
       [1, 2],
       [1, 3]])

In [9]:
# Input layer: 1-D one-hot vector of shape [vocabulary_size]
def get_input_layer(word_idx, vocab_size=None):
  """Return a float32 one-hot vector with a 1.0 at position `word_idx`.

  Args:
    word_idx: index of the word to encode.
    vocab_size: length of the returned vector. When omitted, the notebook's
      global `vocabulary_size` is used.

  Returns:
    A 1-D torch tensor of length `vocab_size`, all zeros except for
    `x[word_idx] == 1.0`.
  """
  if vocab_size is None:
    vocab_size = vocabulary_size
  x = torch.zeros(vocab_size, dtype=torch.float32)
  x[word_idx] = 1.0
  return x

In [113]:
# Hidden layer weights W1 have shape [embedding_dims, vocabulary_size];
# output layer weights W2 have shape [vocabulary_size, embedding_dims].
# Both are plain tensors created with requires_grad=True (the Variable
# wrapper is deprecated and no longer needed).
embedding_dims = 7
W1 = torch.randn(embedding_dims, vocabulary_size, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, requires_grad=True)
EPOCHS = 100
lr = 10e-3  # note: 10e-3 == 0.01

for epoch in range(EPOCHS):
  losses = 0      # running sum of the per-pair losses in this epoch
  loss_acc = []   # per-pair losses of this epoch (reset every epoch)
  # Shuffle into a new name so the `idx_pairs` built earlier is left untouched.
  shuffled_pairs = np.random.permutation(idx_pairs)
  for data, target in shuffled_pairs:
    X = get_input_layer(data)
    # nll_loss needs an int64 class index; make the dtype explicit instead of
    # inheriting NumPy's platform-dependent default integer type.
    y = torch.tensor([int(target)], dtype=torch.long)

    z1 = torch.matmul(W1, X)    # hidden activation, shape [embedding_dims]
    z2 = torch.matmul(W2, z1)   # one score per vocabulary entry

    log_softmax = F.log_softmax(z2, dim=0)

    loss = F.nll_loss(log_softmax.view(1, -1), y)
    losses += loss.item()
    loss_acc.append(loss.item())
    loss.backward()

    # Plain SGD step. no_grad() keeps the in-place updates out of the autograd
    # graph; gradients are cleared afterwards because backward() accumulates.
    with torch.no_grad():
      W1 -= lr * W1.grad
      W2 -= lr * W2.grad
      W1.grad.zero_()
      W2.grad.zero_()

  if epoch % 10 == 0:
    print(f"Loss at epoch {epoch}: {losses / len(shuffled_pairs)}")

Loss at epoch 0: 5.402669748983213
Loss at epoch 10: 2.6605129037584576
Loss at epoch 20: 2.2096155958516257
Loss at epoch 30: 2.0115505201475963
Loss at epoch 40: 1.9084281478609357
Loss at epoch 50: 1.8458724771227155
Loss at epoch 60: 1.802310329249927
Loss at epoch 70: 1.7685917500938688
Loss at epoch 80: 1.7419487054858889
Loss at epoch 90: 1.7185039928981236


In [114]:
def similarity(v, u):
  """Return the cosine similarity of 1-D tensors `v` and `u` as a Python float.

  Uses F.cosine_similarity, which clamps the norms by a small epsilon, so a
  zero-norm input yields 0.0 instead of the NaN produced by a bare 0/0.
  """
  return F.cosine_similarity(v, u, dim=0).item()

In [115]:
# Cosine similarity of the W2 row for "king" to the rows for "she" and "he".
tuple(similarity(W2[word2idx[w]], W2[word2idx["king"]]) for w in ("she", "he"))

(0.2857823967933655, 0.614136815071106)

In [116]:
# Cosine similarity of the W2 row for "queen" to the rows for "she" and "he".
tuple(similarity(W2[word2idx[w]], W2[word2idx["queen"]]) for w in ("she", "he"))

(0.4709746539592743, 0.620449423789978)