Importing essential libraries

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import functools
import torchtext
from torchtext.datasets import IMDB
from torchtext.data import get_tokenizer
from torchtext.legacy import data
from torchtext.legacy import datasets

Defining the CBOW (continuous bag-of-words) network model

In [None]:
import functools

# Seed PyTorch's global random number generator.
torch.manual_seed(42)

# True when a CUDA device is usable; other cells check this flag before
# moving tensors / the model to the GPU.
CUDA = torch.cuda.is_available()
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Averages the embeddings of the context words and maps the average to
    log-probabilities over the whole vocabulary.

    Args:
        vocab_size: number of rows of the embedding table and number of
            output classes.
        embedding_size: dimensionality of each word embedding.
    """

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        # Device placement is left to the caller (e.g. `model.cuda()`):
        # moving only one sub-module to the GPU here would leave the layers
        # on different devices.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.hidden = nn.Linear(embedding_size, vocab_size)
        self.op = nn.LogSoftmax(dim=-1)

    def forward(self, X):
        """Score every vocabulary word given the context indices `X`.

        Args:
            X: tensor of context word indices (expected to be 1-D); it is
                cast to long before the embedding lookup, so a float tensor
                of whole numbers is accepted.

        Returns:
            Tensor with a single row of log-probabilities (shape
            `(1, vocab_size)` for a 1-D `X`).
        """
        embedded = self.embedding(X.long())
        # Average over the context words and restore a leading batch dim of 1.
        averaged = torch.mean(embedded, dim=0).view(1, -1)
        scores = self.hidden(averaged)
        return self.op(scores)

Function for converting raw text into a list of (context, target) pairs based on the window size

In [None]:
def text_to_train(text, context_window):
    """Build (context, target) pairs by sliding a symmetric window over `text`.

    Every position that has `context_window` items on both sides becomes a
    target. Its context is the list of the `context_window` items before it
    followed by the `context_window` items after it.

    Args:
        text: indexable sequence of tokens.
        context_window: number of context items taken on each side.

    Returns:
        List of `(context_list, target)` tuples, in order of target position.
    """
    # Positions of the context items relative to the target (0 is the target).
    offsets = [step for step in range(-context_window, context_window + 1) if step != 0]

    first_center = context_window
    stop_center = len(text) - context_window
    return [
        ([text[center + step] for step in offsets], text[center])
        for center in range(first_center, stop_center)
    ]

Defining iterators over the train and test splits of the IMDB dataset

In [None]:
# One IMDB iterator per split; each one is consumed as (label, text) pairs.
train_iter, test_iter = IMDB(split='train'), IMDB(split='test')

Concatenating all text from the train and test sets into a single file and reading it back as one string

In [None]:

# Dump every review from both splits into data.txt, one review per line.
# Mode "w" keeps the cell idempotent: re-running it does not append a second
# copy of the corpus to an already existing file.
with open("data.txt", "w", encoding="utf-8") as file:
    for split_iter in (train_iter, test_iter):
        for label, line in split_iter:
            # The newline separator stops the last word of one review from
            # fusing with the first word of the next one.
            file.write(line + "\n")

# Read the whole corpus back; read() rather than readline() so that nothing
# after the first line break is dropped.
with open("data.txt", encoding="utf-8") as file:
    raw_text = file.read()

# Lower-case, split on whitespace and keep only the first 5000 tokens.
raw_text = raw_text.lower().split()
raw_text = raw_text[0:5000]

Preprocess the raw data, create the two dictionaries word2index (w2i) and index2word (i2w), and produce training samples from the preprocessed data

In [None]:
# Strip the remnants of HTML line breaks ("<br />" is split into "<br" and
# "/>" pieces by whitespace tokenisation) and drop tokens that become empty
# once those pieces are removed, so "" never ends up as a vocabulary word.
cleaned_tokens = (token.replace('/>', '').replace('<br', '') for token in raw_text)
raw_text = [token for token in cleaned_tokens if token]

vocab = set(raw_text)
# Enumerate the vocabulary in sorted order: the iteration order of a set of
# strings can change between interpreter runs (hash randomisation), which
# would make the word <-> index mapping irreproducible.
ordered_vocab = sorted(vocab)
word2index = {w: i for i, w in enumerate(ordered_vocab)}
index2word = {i: w for i, w in enumerate(ordered_vocab)}

# (context, target) samples with two context words on each side of the target.
data = text_to_train(raw_text, 2)

Define a function that converts a list of words into a tensor of their vocabulary indices

In [None]:
def words_to_tensor(words: list, w2i: dict, dtype=torch.FloatTensor):
    """Convert a list of words into a tensor of their vocabulary indices.

    Args:
        words: words to encode; each must be a key of `w2i` (a missing word
            raises KeyError).
        w2i: mapping from word to integer index.
        dtype: tensor constructor used to build the result, e.g.
            `torch.FloatTensor` (default) or `torch.LongTensor`.

    Returns:
        A 1-D tensor with one index per word, moved to the GPU when the
        module-level `CUDA` flag is true.
    """
    tensor = dtype([w2i[word] for word in words])

    # The tensor is returned as-is: wrapping it in torch.autograd.Variable is
    # deprecated and no longer needed for autograd.
    return tensor.cuda() if CUDA else tensor

Define one function that predicts a word from its context, and one function that checks the accuracy of the model using the index2word (i2w) mapping

In [None]:
def get_prediction(context, model):
    """Return the vocabulary word that `model` scores highest for `context`.

    Args:
        context: list of context words; they are encoded with the
            module-level `word2index` mapping.
        model: callable module whose output has one row of scores per
            vocabulary index.

    Returns:
        The word looked up in `index2word` for the highest-scoring index.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()
    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        prediction = model(words_to_tensor(context, word2index))
    _, index = torch.max(prediction, 1)
    return index2word[index.item()]

def check_accuracy(model):
    """Fraction of samples in the module-level `data` that `model` predicts exactly.

    Each `(context, target)` pair counts as correct when
    `get_prediction(context, model)` equals `target`.

    Returns:
        A float in [0, 1]; 0.0 when `data` is empty (avoids a division by zero).
    """
    if not data:
        return 0.0
    correct = sum(
        1 for context, target in data if get_prediction(context, model) == target
    )
    return correct / len(data)

Training phase (250 epochs on roughly 5000 samples because of computational constraints)

In [None]:
learning_rate = 0.001
epochs = 250

# 100-dimensional embeddings over the whole vocabulary.
model = CBOW(len(vocab), 100)

if CUDA:
    model = model.cuda()

loss_func = torch.nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# The samples never change between epochs, so encode them as tensors once
# instead of repeating the conversion for every sample in every epoch.
train_samples = [
    (
        words_to_tensor(context, word2index),
        words_to_tensor([target], word2index, dtype=torch.LongTensor),
    )
    for context, target in data
]

losses = []  # summed (not averaged) loss of each epoch
for epoch in range(epochs):
    # check_accuracy() -> get_prediction() puts the model in eval mode;
    # switch back to training mode before updating the weights.
    model.train()
    total_loss = 0
    for ids, target in train_samples:
        model.zero_grad()
        output = model(ids)

        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    accuracy = check_accuracy(model)
    print("Accuracy after epoch {} is {}".format(epoch, accuracy))
    losses.append(total_loss)

Accuracy after epoch 0 is 0.005804643714971977
Accuracy after epoch 1 is 0.026621297037630103
Accuracy after epoch 2 is 0.04643714971977582
Accuracy after epoch 3 is 0.05604483586869496
Accuracy after epoch 4 is 0.058246597277822255
Accuracy after epoch 5 is 0.059247397918334666
Accuracy after epoch 6 is 0.059647718174539635
Accuracy after epoch 7 is 0.06104883907125701
Accuracy after epoch 8 is 0.06164931945556445
Accuracy after epoch 9 is 0.06345076060848678
Accuracy after epoch 10 is 0.06505204163330665
Accuracy after epoch 11 is 0.06765412329863892
Accuracy after epoch 12 is 0.06905524419535629
Accuracy after epoch 13 is 0.07045636509207366
Accuracy after epoch 14 is 0.07185748598879103
Accuracy after epoch 15 is 0.07385908726981585
Accuracy after epoch 16 is 0.0754603682946357
Accuracy after epoch 17 is 0.07686148919135308
Accuracy after epoch 18 is 0.07826261008807045
Accuracy after epoch 19 is 0.07966373098478784
Accuracy after epoch 20 is 0.08146517213771017
Accuracy after epoc