In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from auxiliary_file import *

### Dataset construction

In [98]:
def data_construct(num_of_documents):
    """Build the training data from the first ``num_of_documents`` article files.

    The files ``article1.txt`` .. ``article<num_of_documents>.txt`` are each run
    through ``pre_processing``; the resulting list is handed to
    ``training_data_cnstructing``.

    Args:
        num_of_documents: How many consecutively numbered article files to read.

    Returns:
        The pair produced by ``training_data_cnstructing``, as a
        ``(training_data, index_to_word)`` tuple.
    """
    processed_articles = []
    for i in range(num_of_documents):
        processed_articles.append(pre_processing(f'article{i+1}.txt'))

    pairs, vocabulary = training_data_cnstructing(processed_articles)
    return pairs, vocabulary

# Every tensor and the model are placed on the CPU.
device = torch.device("cpu")

# Training data and vocabulary mapping built from the first three article files.
training_data, index_to_word = data_construct(num_of_documents=3)

In [99]:
class MyDataset(Dataset):
    """Map-style dataset of ``(context, target_word)`` training pairs.

    Items are converted to tensors on access: the context as ``torch.long``
    and the target as ``torch.float``.

    Args:
        data: Sized, indexable collection whose elements unpack into
            ``(context, target_word)``.
        index_to_word: Vocabulary mapping from integer index to word.
        device: Optional device for the returned tensors. When ``None``
            (the default) the module-level ``device`` is used, which matches
            the behaviour before this parameter existed.

    Attributes:
        data: The collection passed in.
        index_to_word: The vocabulary mapping (index -> word).
        word_to_index: Deprecated alias of ``index_to_word``. The name is
            misleading (the mapping goes index -> word) but it is kept so
            existing code reading ``dataset.word_to_index`` keeps working.
    """

    def __init__(self, data, index_to_word, device=None):
        self.data = data
        self.index_to_word = index_to_word
        # Backward-compatible alias under the original (misnamed) attribute.
        self.word_to_index = index_to_word
        self._device = device

    def __len__(self):
        """Return the number of training pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as ``(context, target_word)`` tensors."""
        context, target_word = self.data[idx]
        # Resolve the device lazily so the notebook-level `device` is only
        # looked up when no explicit device was supplied.
        target_device = device if self._device is None else self._device
        # Create the tensors directly on the target device instead of
        # building them first and moving them afterwards.
        context = torch.tensor(context, dtype=torch.long, device=target_device)
        target_word = torch.tensor(target_word, dtype=torch.float, device=target_device)
        return context, target_word

In [100]:
# Wrap the constructed training pairs and the vocabulary mapping in a Dataset.
data_set = MyDataset(data=training_data, index_to_word=index_to_word)

### Dataloader construction

In [101]:
# Number of (context, target) pairs per optimisation step.
batch_size = 2048

# Batches are reshuffled every epoch and loaded in the main process (no workers).
dataloader = torch.utils.data.DataLoader(
    dataset=data_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)


In [102]:
# Sanity check: print the context and target shapes of the first batch only.
for context, target in dataloader:
    print(context.shape, target.shape, sep="\n")
    break

torch.Size([2048, 5])
torch.Size([2048, 2399])


### Architecture

In [None]:
class Word2vec(nn.Module):
    """Predict a word from the average of its context-word embeddings.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores produced per example.
        embed_size: Dimensionality of each embedding vector.
    """

    def __init__(self, vocab_size, embed_size):
        super().__init__()

        # Layers are created in the same order as before (embedding first,
        # projection second) so parameter initialisation is unchanged.
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.linear = nn.Linear(embed_size, vocab_size)

    def forward(self, context_words):
        """Return log-probabilities over the vocabulary for each example.

        Args:
            context_words: Integer index tensor; the embeddings are averaged
                over dimension 1 (the context positions of a batched input).

        Returns:
            Tensor of log-softmax scores, normalised over dimension 1.
        """
        # Look up every context word and collapse the context axis by averaging.
        pooled = self.embed(context_words).mean(dim=1)

        # Project the pooled embedding to one score per vocabulary entry.
        scores = self.linear(pooled)

        return F.log_softmax(scores, dim=1)

### Training model

In [103]:
# --- Hyperparameters -------------------------------------------------------
# The vocabulary size is read off the length of one target vector.
vocab_size = len(data_set[0][1])
embed_size = 100
num_epochs = 1000

# --- Model, optimiser and criterion ----------------------------------------
model = Word2vec(vocab_size, embed_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# NLLLoss pairs with the log-softmax output of the model.
loss_function = nn.NLLLoss()

# --- Training --------------------------------------------------------------
for epoch in range(num_epochs):
    # Sum (not mean) of the per-batch losses for this epoch.
    total_loss = 0
    for context, target_word in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        log_probs = model(context)

        # Targets arrive as vectors; NLLLoss expects class indices, so take
        # the position of the largest entry in each row.
        loss = loss_function(log_probs, target_word.argmax(dim=1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")


Epoch 1, Loss: 31.22797966003418
Epoch 2, Loss: 30.992433547973633
Epoch 3, Loss: 30.781078338623047
Epoch 4, Loss: 30.57188320159912
Epoch 5, Loss: 30.365200996398926
Epoch 6, Loss: 30.159289836883545
Epoch 7, Loss: 29.952723503112793
Epoch 8, Loss: 29.74591302871704
Epoch 9, Loss: 29.539020538330078
Epoch 10, Loss: 29.331144332885742
Epoch 11, Loss: 29.122068405151367
Epoch 12, Loss: 28.91184425354004
Epoch 13, Loss: 28.700344562530518
Epoch 14, Loss: 28.486588954925537
Epoch 15, Loss: 28.271172046661377
Epoch 16, Loss: 28.053550243377686
Epoch 17, Loss: 27.833199501037598
Epoch 18, Loss: 27.610095500946045
Epoch 19, Loss: 27.386303901672363
Epoch 20, Loss: 27.15950632095337
Epoch 21, Loss: 26.92857265472412
Epoch 22, Loss: 26.69578456878662
Epoch 23, Loss: 26.46040630340576
Epoch 24, Loss: 26.2240047454834
Epoch 25, Loss: 25.98377275466919
Epoch 26, Loss: 25.740981578826904
Epoch 27, Loss: 25.494571685791016
Epoch 28, Loss: 25.248740673065186
Epoch 29, Loss: 25.000077724456787
Epoch

### Collection of word embeddings

In [113]:
# Take an independent snapshot of the trained embedding table.
#   * detach() replaces the discouraged `.data` access,
#   * cpu() keeps the `.numpy()` call below valid if `device` is ever changed
#     to a non-CPU device,
#   * clone() gives the snapshot its own storage; without it the NumPy arrays
#     would share memory with the live model weights and silently change if
#     the model is trained further.
embeddings = model.embed.weight.detach().cpu().clone()

# Map every vocabulary word to its embedding row as a NumPy array.
# `word_to_index` holds index -> word pairs despite its name.
word_vectors = {
    word: embeddings[index].numpy()
    for index, word in data_set.word_to_index.items()
}


### Using embeddings

In [123]:
# One vector per article, built from the learned word vectors.
# NOTE(review): range(1, 10) together with `i+1` reads article2.txt through
# article10.txt, so article1.txt is never included — confirm this is intended.
documents = [
    document_vector(word_vectors, pre_processing(f'article{i+1}.txt'))
    for i in range(1, 10)
]

# Persist the document vectors for later use.
np.save('documents.npy', documents)