In [1]:
from word2vec import *

In [63]:
words = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
vocab = set(words)
embedding_size = 10

## Backward
This is the forward graph
```
embedding --(@ linear)--> x --(softmax)--> o --(criterion)--> loss
```

Gradient descent step

In [139]:
import numpy as np
from dataset import SentencesDataset

def softmax(array):
    array = np.exp(array)
    return array / np.sum(array, axis=1, keepdims=True)

class Word2Vec():
    def __init__(self, vocab, embedding_size, learning_rate=1.):
        self.vocab = vocab
        self.embedding_size = embedding_size
        self.learning_rate = learning_rate
        
        self.vocab_size = len(self.vocab)
        self.word_to_index = { word: i for i, word in enumerate(self.vocab)}
        self.index_to_word = { i: word for word, i in self.word_to_index.items()}
        
        self.embedding = np.random.randn(self.vocab_size, self.embedding_size)
        self.linear = np.random.randn(self.embedding_size, self.vocab_size)
        
    def one_hot_encod(self, output_indices):
        one_hot = np.zeros((len(output_indices), self.vocab_size))
        one_hot[np.arange(len(output_indices)), output_indices] = 1
        return one_hot
    
    def forward(self, input_indices):
        embeddings = self.embedding[input_indices, :]

        return embeddings, softmax(embeddings @ self.linear)
    
    def criterion(self, y, outputs):
        return -1/self.vocab_size*np.sum(y*np.log(outputs), axis=1)
    
    def step(self, input_indices, output_indices):
        y = self.one_hot_encod(output_indices)
        embeddings, outputs = self.forward(input_indices)

        loss = self.criterion(y, outputs)

        dL_dx = 1/self.vocab_size*(outputs-y)

        grad_linear = embeddings.T @ dL_dx
        self.linear -= self.learning_rate*grad_linear
        
        grad_embedding = dL_dx @ self.linear.T
        self.embedding[input_indices, :] -= self.learning_rate*grad_embedding

        return loss.mean()

In [140]:
input_indices = [1,2,2]
output_indices = [0,2,3]

In [141]:
word2vec = Word2Vec(vocab, embedding_size)
e, o = word2vec.forward(input_indices)

In [142]:
y = word2vec.one_hot_encod(output_indices)

In [143]:
word2vec.step(input_indices, output_indices)

0.4615007761471727

In [144]:
input_indices, output_indices = [0,1], [2,3]

for _ in range(20):
    print(word2vec.step(input_indices, output_indices))
_, outputs = word2vec.forward(input_indices)
outputs

0.6510014487379439
0.37416519566696105
0.20625483228547825
0.14466301387770514
0.10419005293889565
0.07375236755664163
0.051896978184955714
0.03721310284640155
0.02768196606628731
0.02144456736252744
0.017225185752799594
0.014254101085234727
0.012080590563486301
0.010436555241408924
0.009157255692508007
0.008137709624571842
0.007308621652623333
0.006622759563574707
0.006046993205997484
0.005557486688026594


array([[6.89855561e-06, 1.09933664e-04, 9.79568607e-01, 8.87112328e-03,
        1.50245225e-03, 4.95329737e-03, 3.96035458e-03, 1.02733371e-03],
       [8.83148284e-04, 1.84495314e-02, 1.05908505e-02, 9.40311355e-01,
        6.34542699e-03, 2.42105704e-03, 8.00264324e-03, 1.29959878e-02]])

In [187]:
import re

class WordPairsDataset:
    """
    Very simple dataset composed of pairs of words
    """
    def __init__(self, data_filename, window_size=5):
        """
        window_size must be odd
        """
        super(WordPairsDataset, self).__init__()
        self.pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*') # Tokens containing an alpha symbol
        self.padding = window_size // 2
        
        with open(data_filename, "r") as f:
            self.sentences = [l for l in f]

        self.window_size = window_size
        self.preprocess()
        
        self.pairs = []
        for sentence in self.sentences:
            for i, word in enumerate(sentence):
                for delta in range(-self.padding, self.padding+1):
                    if delta != 0 and i + delta >= 0 and i + delta < len(sentence):
                        self.pairs.append((word, sentence[i + delta]))
                    

    def tokenize(self, sentence):
        return self.pattern.findall(sentence.lower())

    def preprocess(self):
        self.sentences = list(map(self.tokenize, self.sentences))
        self.sentences = list(filter(lambda l: len(l) >= self.window_size, self.sentences))

    def __len__(self):
        return 10

    def __getitem__(self, index):
        return self.pairs[index]

In [188]:
from math import ceil

In [200]:
class Loader:
    def __init__(self, dataset, batch_size, word_to_index):
        self.dataset = dataset
        self.batch_size = batch_size
        self.word_to_index = word_to_index
        
        self.n_batches = ceil(len(self.dataset) / batch_size)
        self.current_index = 0
        
    def __iter__(self):
        return self

    def __next__(self):
        input_indices, output_indices = [], []
        
        for input_word, output_word in self.dataset[self.current_index:self.current_index+self.batch_size]:
            input_indices.append(self.word_to_index[input_word])
            output_indices.append(self.word_to_index[output_word])
            
        self.current_index += self.n_batches
        if self.current_index < len(self.dataset):
            return np.array(input_indices), np.array(output_indices)
        else:
            self.current_index = 0
            raise StopIteration

In [201]:
wpd = WordPairsDataset("example_dataset.txt")

In [202]:
from functools import reduce

words = reduce(lambda x,y: x+y, wpd.sentences)
vocab = set(words)
embedding_size = 10

word2vec = Word2Vec(vocab, embedding_size)

In [203]:
loader = Loader(wpd, 3, word2vec.word_to_index)

In [204]:
for x in loader:
    print(x)

(array([24, 24, 50]), array([50, 30, 24]))
(array([50, 30, 30]), array([18, 24, 50]))


In [163]:
wpd.pairs[:10]

[('as', 'word2vec'),
 ('as', 'is'),
 ('word2vec', 'as'),
 ('word2vec', 'is'),
 ('word2vec', 'a'),
 ('is', 'as'),
 ('is', 'word2vec'),
 ('is', 'a'),
 ('is', 'neural'),
 ('a', 'word2vec')]