In [None]:
import torch
import numpy as np
from datasets import load_dataset
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict
from tqdm import tqdm

In [2]:
# The English Wikipedia dump is too large to train on locally, so the much
# smaller "20220301.frr" configuration is loaded instead.
data = load_dataset(
    "wikipedia",
    "20220301.frr",
    split="train",
    trust_remote_code=True,
)

In [3]:
# Collecting every line of text from every article
def preprocess_data(data):
    """Flatten the articles into a single list of text lines.

    Args:
        data: Iterable of records, each exposing its body under the 'text' key.

    Returns:
        list: Every newline-separated line of every article, in article
        order. Blank lines are kept as empty strings.
    """
    return [line for article in data for line in article['text'].split('\n')]

In [4]:
# Flatten all loaded articles into one list of text lines (split on '\n').
texts = preprocess_data(data)

In [5]:
# Whitespace tokenizer backed by an auto-growing vocabulary: looking up an
# unseen word assigns it the next free integer id (the current dict size).
tokenizer = defaultdict(lambda: len(tokenizer))

# Encode every line as a list of word ids; the bound lookup still triggers the
# default factory for words that have not been seen yet.
tokenized_sentences = [
    list(map(tokenizer.__getitem__, line.split()))
    for line in texts
]

# Number of distinct words encountered while encoding the corpus
vocab_size = len(tokenizer)

# Context window size handed to the CBOW pair builder
window_sz = 2

In [6]:
# Making training and label pairs for continuous bag of words (CBOW)
def cbow_pairs(sequences, window_sz):
    """Build (context, target) training examples for a CBOW model.

    For every position ``i`` that has ``window_sz`` tokens on both sides, the
    context is the ``window_sz`` tokens before ``i`` followed by the
    ``window_sz`` tokens after it, and the label is the token at ``i``.
    Sequences shorter than ``2 * window_sz + 1`` contribute no examples.

    Args:
        sequences: Iterable of token-id sequences.
        window_sz: Number of context tokens taken from each side of the target.

    Returns:
        tuple: ``(pairs, labels)`` as int64 NumPy arrays of shape
        ``(n_examples, 2 * window_sz)`` and ``(n_examples,)``.
    """
    pairs = []
    labels = []

    for sequence in sequences:
        for i in range(window_sz, len(sequence) - window_sz):
            left = sequence[i - window_sz: i]
            # The right-hand slice must end relative to i. An absolute end of
            # window_sz + 1 is always <= i + 1 here, which yields an empty
            # slice and silently drops the whole right-hand context.
            right = sequence[i + 1: i + window_sz + 1]
            pairs.append(list(left) + list(right))
            labels.append(sequence[i])

    if not pairs:
        # Keep a well-defined 2-D shape and integer dtype when nothing qualifies.
        return (
            np.empty((0, 2 * window_sz), dtype=np.int64),
            np.empty((0,), dtype=np.int64),
        )

    # Explicit int64 so the arrays convert to LongTensor on every platform.
    return np.array(pairs, dtype=np.int64), np.array(labels, dtype=np.int64)

In [7]:
# Build the CBOW context arrays and their target labels from the tokenized lines.
pairs, labels = cbow_pairs(tokenized_sentences, window_sz)

In [8]:
# PyTorch dataset wrapper around the CBOW (context, target) examples
class CBOWDataset(Dataset):
    """Map-style dataset pairing each context with its target label.

    Args:
        pairs: Indexable collection of contexts.
        labels: Indexable collection of targets, aligned with ``pairs``.
    """

    def __init__(self, pairs, labels):
        self.pairs, self.labels = pairs, labels

    def __len__(self):
        """Return the number of examples, taken from the length of ``pairs``."""
        n_examples = len(self.pairs)
        return n_examples

    def __getitem__(self, idx):
        """Return the ``(context, target)`` tuple stored at position ``idx``."""
        context = self.pairs[idx]
        target = self.labels[idx]
        return context, target

In [9]:
# Wrap the training examples in a dataset and serve them as shuffled mini-batches
dataset = CBOWDataset(pairs, labels)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)

In [10]:
# CBOW network: averaged context embeddings followed by a linear vocabulary scorer
class CBOWModel(nn.Module):
    """Continuous bag-of-words model used to learn word embeddings.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores per example.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Score every vocabulary entry for each context in the batch.

        Args:
            x: Integer tensor of word ids; dimension 1 is averaged away, so it
                is expected to be laid out as (batch, context_size).

        Returns:
            Tensor of unnormalised scores with ``vocab_size`` entries in the
            last dimension.
        """
        # Look up the embeddings, average over the context dimension, project.
        return self.fc(self.embedding(x).mean(dim=1))

In [11]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [12]:
# Hyperparameters, model, loss function and optimizer
embedding_dim = 100

# Module.to() moves the parameters and returns the module itself, so the call
# can be chained directly onto construction.
cbow_model = CBOWModel(vocab_size, embedding_dim).to(device)

# Cross-entropy over the per-word scores produced by the model
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters, created after the model is on its device
optimizer = optim.Adam(cbow_model.parameters(), lr=1e-3)

In [None]:
# Training loop
num_epochs = 10

# Make sure the model is in training mode even if an earlier cell switched it
# to evaluation mode.
cbow_model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = len(dataloader)

    # Wrapping the dataloader lets tqdm take the total from it and advance the
    # bar once per batch; the with-block closes the bar even if a step raises.
    with tqdm(dataloader, desc=f'Epoch {epoch + 1}', unit='batch') as pbar:
        for context, target in pbar:

            # Move both tensors to the model's device as integer (long) ids:
            # the loss requires long class indices, and the context is cast the
            # same way so both inputs are handled consistently.
            context = context.to(device).long()
            target = target.to(device).long()

            optimizer.zero_grad()
            output = cbow_model(context)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # Read the scalar loss once per step; every .item() call copies the
            # value back to the host.
            batch_loss = loss.item()
            total_loss += batch_loss

            # Show the latest batch loss on the progress bar
            pbar.set_postfix(loss=batch_loss)

    # Average over the batches seen; guard against an empty dataloader
    avg_loss = total_loss / max(num_batches, 1)
    print(f'Epoch {epoch + 1} average loss: {avg_loss:.4f}')