# Naive word2vec

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec like a two-layer neural network with matrices $W$ and $W'$. One matrix projects words to low-dimensional 'hidden' space and the other - back to high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch (numpy too, if you love to calculate gradients on your own and want some extra points, but don't forget to numerically check your gradients) and code from your previous task. Again: you don't have to implement negative sampling (you may reduce your vocabulary size for faster computation).

**Results of this task**:
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCE, T-SNE), you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)
 * qualitative evaluations of word vectors: nearest neighbors, word analogies

**Extra:**
 * quantitative evaluation:
   * for intrinsic evaluation you can find datasets [here](https://aclweb.org/aclwiki/Analogy_(State_of_the_art))
   * for extrincis evaluation you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation. If you chose to do this, please use the same datasets across tasks 3, 4, 5 and 6.

Again. It is **highly recommended** to read this [paper](https://arxiv.org/pdf/1411.2738.pdf)

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

If you struggle with something, ask your neighbor. If it is not obvious for you, probably someone else is looking for the answer too. And in contrast, if you see that you can help someone - do it! Good luck!

## Word2vec preprocessing:

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
from collections import Counter
import itertools

from sklearn.model_selection import train_test_split

In [3]:
USE_GPU = False
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('using device:', device)

using device: cpu


In [12]:
class CBOWBatcher:
    def __init__(self, words):
        self.words = words
        self.vocab_size = len(words)
        self.remaining_words = self.words
    
        self.index_words = []
        self.word2index = dict()
        self.index2word = dict()
        self.window_size = 0
        
    def delete_rare_words(self, min_frequency = 9):
        words_count = Counter(self.words)
        
        sorted_words_count = sorted(words_count.items(), key=lambda pair: pair[1], reverse=True)

        # choose words with frequency > min_frequency
        mask = list(map(lambda x: x[1] > min_frequency, sorted_words_count))
        sorted_words_count = np.array(sorted_words_count)[mask]

        # add __unk__
        remaining_words = list(map(lambda x: x[0], sorted_words_count))
        remaining_words.append("__unk__")
        self.remaining_words = remaining_words
        self.vocab_size = len(remaining_words)
    
    def create_indexing(self):
        # numericalization
        numbers = np.arange(len(self.remaining_words))

        self.word2index = dict(zip(self.remaining_words, numbers))
        self.index2word = dict(zip(numbers, self.remaining_words))
        # __unk__
        for word in self.words:
            if word in self.word2index:
                self.index_words.append(self.word2index[word])
            else:
                self.index_words.append(self.word2index["__unk__"])
        
    def CBOW(self, window_size = 2):
        self.window_size = window_size
        x_batch = []
        labels_batch = []
        
        for i in np.arange(window_size, len(self.index_words) - window_size):
            labels_batch.append(self.index_words[i])
            temp_list = self.index_words[i - window_size: i] + self.index_words[i+1: i+window_size+1]
            x_batch.append(temp_list)
        return x_batch, labels_batch
    
    def indices_to_words(self, batch):
        words_batch = []
        shape = np.array(batch).shape
        batch_flatten = np.array(batch).flatten()
        
        for i in batch_flatten:
            if i in self.index2word:
                words_batch.append(self.index2word[i])
            else:
                raise Exception("Incorrect numericalization: {} in {}".format(i, batch))
                
        words_batch = np.array(words_batch).reshape(shape)
        return words_batch

In [13]:
text_file = open("text8", "r")
words = text_file.read().split(' ')
test = words[:10]

batcher = CBOWBatcher(test)
batcher.delete_rare_words(0)
batcher.create_indexing()
y = batcher.CBOW()

In [6]:
def oneHot_generate(batches, labels, vocab_size):
    for batch, label in zip(batches, labels):
        batch = np.array(batch)
        oneHot_batch = np.zeros((batch.shape[0], vocab_size))
        oneHot_batch[np.arange(batch.shape[0]), batch] = 1

        oneHot_label = np.zeros((vocab_size))
        oneHot_label[label] = 1
        yield (oneHot_batch, label)

In [16]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.layer1 = nn.Linear(vocab_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, context):
        in_ = context.sum(dim = 0, dtype = torch.float32)
        out = self.layer1(in_)
        out = out/context.shape[0]
        out = self.layer2(out)
        return out


def test_CBOW():
    vocab_size = 5
    embedding_dim = 2
    context_size = 2
    hidden_size = 4
    x = torch.tensor([[0, 0, 0, 0, 1], [0, 0, 0, 1, 0]], dtype = torch.float32)
    x = x.to(device=device, dtype=torch.float32)
    model = CBOW(vocab_size, hidden_size)
    scores = model(x)
    print(scores)
    
test_CBOW()

tensor([-0.5414,  0.0847, -0.0120,  0.5146, -0.4327], grad_fn=<AddBackward0>)


In [15]:
print_every = 10

In [21]:
def train_part34(model, optimizer, contexts, labels, epochs=1):
    
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    
    X_train, X_test, y_train, y_test = train_test_split(contexts, labels)
    
    for e in range(epochs):
        for t, (context, label) in enumerate(oneHot_generate(X_train, y_train)):
            model.train()  # put model to training mode
            
            x = torch.tensor([w for w in context], dtype=torch.float32)        
#             y = torch.tensor([label], dtype=torch.float32)
            
            x = x.to(device=device, dtype=torch.long)  # move to device, e.g. GPU
#             y = y.to(device=device, dtype=torch.long)
            
            scores = model(x)
            loss_func = nn.CrossEntropyLoss()
            #print(scores, y, scores.shape, y.shape)
            loss = loss_func(scores, label)

            optimizer.zero_grad()

            loss.backward()
            
            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                check_accuracy_part34(loader_val, model)
                print()

In [20]:
hidden_size = 4
model = CBOW(batcher.vocab_size, hidden_size)
optimizer = optim.SGD(model.parameters)
train_part34(model, optimizer, y[0], y[1], epochs=1)

TypeError: 'method' object is not iterable

In [11]:
import gc
gc.collect()

446

In [0]:
def check_accuracy_part34(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)        
            y = torch.tensor([word_to_ix[y]], dtype=torch.long)


            x = x.to(device=device, dtype=torch.long)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)
            num_correct += (preds == y)
            num_samples += 1
        print(num_correct, num_samples)
        acc = float(num_correct) / num_samples
        print('Got {} acc'.format(100 * acc))