In [None]:
#there is no activation function applied to the output layer. CBOW is essentially a simple neural network with a linear layer 
#(also known as the projection layer) that maps the input (context words) to the output (target word).

In [4]:
import torch 
import torch.nn as nn
import torch.optim as optim
import spacy

In [11]:
class CBOW(nn.Module):
    def __init__(self, embedding_size = 128, vocab_size = -1):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
    
    def forward(self, inputs):
        embeddings = self.embeddings(inputs).mean(1).squeeze(1) # batch_size x embedding_size
        return self.linear(embeddings)

In [6]:
def data_genarator():
    with open("/kaggle/input/raw-text/raw.txt", "r", encoding = "utf-8") as f:
        raw_text = f.read()
        
    nlp = spacy.load("en_core_web_sm")
    tokenized_text = [token.text for token in nlp(raw_text)]
    vocab = set(tokenized_text)
    
    word_to_idx = { word : i for i, word in enumerate(vocab)}
    idx_to_word = { i : word for i, word in enumerate(vocab)}
    
    data = []
    for i in range(2, len(tokenized_text)-2):
        #take 2 words before and 2 words after
        context = [
            tokenized_text[i-2],
            tokenized_text[i-1],
            tokenized_text[i+1],
            tokenized_text[i+2],
        ]
        #take target word
        target = tokenized_text[i]
        
        context_ids = [word_to_idx[w] for w in context]
        target_id = word_to_idx[target]
        data.append((context_ids, target_id))
    
    return data, word_to_idx, idx_to_word

In [15]:
data, word_to_idx, idx_to_word = data_genarator()
loss_func = nn.CrossEntropyLoss()
model = CBOW(vocab_size = len(word_to_idx))
optimizer = optim.Adam(model.parameters(), lr = 1e-4)

#convert to tensors
contexts = torch.tensor([ex[0] for ex in data])
targets = torch.tensor([ex[1] for ex in data])

#create dataset from tensors and datataloader
dataset = torch.utils.data.TensorDataset(contexts, targets)
dataloader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle = True)

#train
for epoch in range(25):
    for context, target in dataloader:
        output = model(context)
        loss = loss_func(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch : {epoch}, Loss : {loss.item()}")

Epoch : 0, Loss : 6.713914394378662
Epoch : 1, Loss : 6.592629432678223
Epoch : 2, Loss : 6.564905643463135
Epoch : 3, Loss : 6.512890815734863
Epoch : 4, Loss : 6.348079204559326
Epoch : 5, Loss : 6.458359718322754
Epoch : 6, Loss : 6.294544219970703
Epoch : 7, Loss : 6.266358375549316
Epoch : 8, Loss : 6.227489471435547
Epoch : 9, Loss : 5.98853063583374
Epoch : 10, Loss : 6.073175430297852
Epoch : 11, Loss : 6.195258140563965
Epoch : 12, Loss : 6.098715782165527
Epoch : 13, Loss : 6.042919635772705
Epoch : 14, Loss : 5.451259613037109
Epoch : 15, Loss : 5.844895362854004
Epoch : 16, Loss : 5.879215240478516
Epoch : 17, Loss : 5.399789810180664
Epoch : 18, Loss : 5.50230598449707
Epoch : 19, Loss : 5.100342750549316
Epoch : 20, Loss : 5.287597179412842
Epoch : 21, Loss : 5.580905437469482
Epoch : 22, Loss : 5.354588031768799
Epoch : 23, Loss : 5.023834228515625
Epoch : 24, Loss : 5.487366199493408
