In [1]:
# Reload imported modules automatically before each cell executes (mode 2 = all modules).
%load_ext autoreload
%autoreload 2
# Format code cells with black via the lab_black extension.
%load_ext lab_black

In [288]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

import torchtext
from collections import Counter, OrderedDict
from torchtext.datasets import WikiText2
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.vocab import build_vocab_from_iterator

import nltk

## Loading dataset

In [161]:
# Load the WikiText2 training split from ../data/ and wrap it as a map-style
# dataset so it supports indexing and can be iterated more than once.
train_data = to_map_style_dataset(WikiText2(root="../data/", split="train"))

## Building Vocabulary

In [None]:
# torchtext's built-in "basic_english" tokenizer for English text.
tokenizer = get_tokenizer("basic_english", language="en")

In [214]:
# Build the vocabulary from the tokenized training lines. Tokens seen fewer
# than 50 times are dropped; "<unk>" is registered as a special token.
vocab = build_vocab_from_iterator(
    map(tokenizer, train_data), min_freq=50, specials=["<unk>"]
)
# Out-of-vocabulary lookups resolve to the "<unk>" index instead of raising.
vocab.set_default_index(vocab["<unk>"])
# The rest of the notebook refers to the vocabulary as `vocab`; keep the
# original `vocabulary` name as an alias so both spellings work.
vocabulary = vocab
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

Vocabulary size: 4099


## Preparing Dataset

In [169]:
# Show the vocabulary size together with the indices of a few common tokens.
# A cell only displays its last expression, so both values are returned as a
# tuple instead of silently discarding the first one.
len(vocab), vocab.lookup_indices([".", ",", "the", "a", "<unk>"])

4099

In [None]:
# Take one training line and slide a 9-token window over it; the cells below
# treat the middle token (index 4) as the target and the other 8 as context.
data = train_data[5]
ngrams = list(nltk.ngrams(tokenizer(data), 9))

In [217]:
# Example lookup: the integer index assigned to the token "sales".
vocab["sales"]

1446

In [280]:
# Target: vocabulary index of the centre token (position 4) of every 9-gram.
output = np.array([vocab[gram[4]] for gram in ngrams])

# Context: vocabulary indices of the 4 tokens before and the 4 tokens after
# the centre token, giving one row of 8 indices per 9-gram.
inputs = np.array(
    [[vocab[word] for word in gram[:4] + gram[5:]] for gram in ngrams]
)

In [270]:
# (number of 9-gram windows, context words per window)
inputs.shape

(95, 8)

In [283]:
# One-hot encode the first example's context: one row per context word.
input_vector = np.zeros((8, vocab_size))
print("Input Vector size:", input_vector.shape)
# Put a single 1 in each row, at the column given by that word's index.
input_vector[np.arange(len(inputs[0])), inputs[0]] = 1


# One-hot encode the target (centre) word of the same example.
output_vector = np.zeros(vocab_size)
output_vector[output[0]] = 1
print("Output Vector size:", output_vector.shape)

Input Vector size: (8, 4099)
Output Vector size: (4099,)


In [286]:
# Sanity check: a one-hot target vector contains exactly one 1.
output_vector.sum()

1.0

In [287]:
# Sanity check: one 1 per context row, so the total equals the 8 context words.
input_vector.sum()

8.0

## Model

In [330]:
class Word2Vec_Model(nn.Module):
    """CBOW-style word2vec network operating on one-hot context vectors.

    Every context word is projected to ``embed_dimension`` features, passed
    through a sigmoid, summed over the context axis and projected back to a
    probability distribution over the vocabulary.

    Args:
        vocab_size: Width of the one-hot inputs and of the output distribution.
        embed_dimension: Width of the hidden (embedding) layer.
    """

    def __init__(self, vocab_size: int, embed_dimension: int):
        super().__init__()

        self.linear1 = nn.Linear(in_features=vocab_size, out_features=embed_dimension)
        self.sigmoid = nn.Sigmoid()
        self.linear2 = nn.Linear(in_features=embed_dimension, out_features=vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inputs_):
        """Predict the centre-word distribution from one-hot context words.

        Args:
            inputs_: Float tensor of shape ``(batch, context, vocab_size)`` or,
                for a single unbatched example, ``(context, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` (or ``(vocab_size,)`` for
            unbatched input) whose last axis sums to 1.
        """
        x = self.linear1(inputs_)
        x = self.sigmoid(x)
        # Collapse the context axis. It is always second-to-last, so index it
        # as -2 rather than 1: identical for batched input, and unbatched
        # (context, vocab_size) input no longer sums over the embedding axis.
        x = x.sum(dim=-2)
        x = self.linear2(x)
        # NOTE: the output is probabilities, not logits. nn.CrossEntropyLoss
        # applies log-softmax itself, so do not feed this output to it as-is.
        x = self.softmax(x)
        return x

In [331]:
# Instantiate the network over the full vocabulary with a 300-wide hidden layer.
model = Word2Vec_Model(vocab_size=vocab_size, embed_dimension=300)

In [332]:
# Smoke test: convert the one-hot context matrix to float32, add a leading
# batch axis of size 1, and run a single forward pass.
batch = torch.tensor(input_vector, dtype=torch.float32)[None]
out = model(batch)
out.shape

torch.Size([1, 4099])

In [334]:
# Inspect the softmax output of the forward pass (one row per batch element).
out

tensor([[1.3731e-06, 1.1676e-04, 6.8774e-07,  ..., 5.4389e-05, 2.6613e-06,
         2.7039e-03]], grad_fn=<SoftmaxBackward>)

In [294]:
# (context words, vocabulary size)
input_vector.shape

(8, 4099)

In [306]:
# Summing over the second-to-last axis collapses the context rows, leaving one
# vocabulary-sized vector per batch element.
batch.sum(axis=-2).shape

torch.Size([1, 4099])

In [315]:
# Inspect the one-hot input batch.
batch

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], dtype=torch.float64)

In [24]:
def data_process(raw_text_iter):
    """Encode an iterable of raw text lines as one flat tensor of token ids.

    Each line is tokenized with the notebook-level ``tokenizer`` and mapped to
    indices with the notebook-level ``vocab``. Lines that produce no tokens
    are skipped, and the remaining id tensors are concatenated in order.
    """
    encoded_lines = (
        torch.tensor(vocab(tokenizer(line)), dtype=torch.long)
        for line in raw_text_iter
    )
    return torch.cat([ids for ids in encoded_lines if ids.numel() > 0])


# Load all three splits as fresh iterators and flatten each one into a single
# tensor of token ids. The same root as the initial load is used so the corpus
# is read from ../data/ rather than fetched into a second location.
# NOTE: this rebinds `train_data` from the map-style dataset of raw lines to a
# flat id tensor, so cells that tokenize `train_data[5]` must run before this.
train_iter, val_iter, test_iter = WikiText2(root="../data/")
train_data = data_process(train_iter)
val_data = data_process(val_iter)
test_data = data_process(test_iter)

In [91]:
# Peek at the first element of the flattened training data.
# next(iter(...)) states the intent directly and fails loudly on empty data,
# instead of silently leaving a stale `batch` from an earlier cell.
# NOTE: this rebinds `batch`, which previously held the one-hot input tensor.
batch = next(iter(train_data))

In [25]:
# Number of token ids in each flattened split (train, validation, test).
train_data.shape, val_data.shape, test_data.shape

(torch.Size([2049990]), torch.Size([214417]), torch.Size([241859]))

In [27]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')