In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict

# Continuous Bag of Words (CBOW) Model

The Continuous Bag of Words (CBOW) model is a neural network model that predicts a word given its context. The context is defined as the surrounding words. The model is trained on a large corpus of text and learns to predict the target word based on the context words. The CBOW model is a type of word embedding model that learns a dense vector representation of words in a continuous vector space.

In [2]:
# Sample corpus
corpus = [
    "we are learning natural language processing",
    "learning embeddings is fun",
    "we are creating word embeddings with pytorch",
    "pytorch makes machine learning easy",
]

In [3]:
# Preprocess the corpus to get context-target pairs
# In other words what are the words that are surrounding a word within a window size
# Similar to the skip-gram model/masked language model in BERT
def build_context_target_pairs(corpus, window_size=2):
    # Tokenize the sentences as words
    tokenized_sentences = [sentence.split() for sentence in corpus]
    # Create a vocabulary of unique words
    vocabulary = set([word for sentence in tokenized_sentences for word in sentence])
    # Create a mapping from words to indices and vice versa
    word_to_idx = {word: i for i, word in enumerate(vocabulary)}
    # Create a mapping from indices to words
    idx_to_word = {i: word for word, i in word_to_idx.items()}

    # Create context-target pairs
    pairs = []
    for sentence in tokenized_sentences:
        # For each word in the sentence
        for i, word in enumerate(sentence):
            context = []
            # Parse the context window around the word
            for j in range(-window_size, window_size + 1):
                if j != 0 and 0 <= i + j < len(sentence):
                    context.append(sentence[i + j])
            if context:
                pairs.append((context, word))

    return pairs, word_to_idx, idx_to_word


pairs, word_to_idx, idx_to_word = build_context_target_pairs(corpus)
print(pairs)

[(['are', 'learning'], 'we'), (['we', 'learning', 'natural'], 'are'), (['we', 'are', 'natural', 'language'], 'learning'), (['are', 'learning', 'language', 'processing'], 'natural'), (['learning', 'natural', 'processing'], 'language'), (['natural', 'language'], 'processing'), (['embeddings', 'is'], 'learning'), (['learning', 'is', 'fun'], 'embeddings'), (['learning', 'embeddings', 'fun'], 'is'), (['embeddings', 'is'], 'fun'), (['are', 'creating'], 'we'), (['we', 'creating', 'word'], 'are'), (['we', 'are', 'word', 'embeddings'], 'creating'), (['are', 'creating', 'embeddings', 'with'], 'word'), (['creating', 'word', 'with', 'pytorch'], 'embeddings'), (['word', 'embeddings', 'pytorch'], 'with'), (['embeddings', 'with'], 'pytorch'), (['makes', 'machine'], 'pytorch'), (['pytorch', 'machine', 'learning'], 'makes'), (['pytorch', 'makes', 'learning', 'easy'], 'machine'), (['makes', 'machine', 'easy'], 'learning'), (['machine', 'learning'], 'easy')]


# Model Architecture

In [4]:
# CBOW Model Definition
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_word_idxs):
        # Look up embeddings for context words
        context_embeddings = self.embeddings(context_word_idxs)
        # Average the embeddings along the context dimension
        context_mean = context_embeddings.mean(dim=1)
        # Predict target word
        out = self.linear(context_mean)
        return out

In [5]:
# Hyperparameters
embedding_dim = 10
vocab_size = len(word_to_idx)
model = CBOW(vocab_size, embedding_dim)

# Loss and Optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [6]:
# Training the CBOW Model
epochs = 100
for epoch in range(epochs):
    total_loss = 0
    for context, target in pairs:
        context_idxs = torch.tensor(
            [word_to_idx[word] for word in context], dtype=torch.long
        ).unsqueeze(0)
        target_idx = torch.tensor([word_to_idx[target]], dtype=torch.long)

        optimizer.zero_grad()
        output = model(context_idxs)
        loss = loss_function(output, target_idx)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 10/100, Loss: 55.8372
Epoch 20/100, Loss: 49.9302
Epoch 30/100, Loss: 44.9566
Epoch 40/100, Loss: 40.7091
Epoch 50/100, Loss: 37.0343
Epoch 60/100, Loss: 33.8157
Epoch 70/100, Loss: 30.9667
Epoch 80/100, Loss: 28.4236
Epoch 90/100, Loss: 26.1390
Epoch 100/100, Loss: 24.0771


# Acessing the Embeddings

In [7]:
# Access embeddings
embeddings = model.embeddings.weight.data
print("\nWord Embeddings:")
for word, idx in word_to_idx.items():
    print(f"{word}: {embeddings[idx].numpy()}")


Word Embeddings:
pytorch: [ 1.4685774  -0.38831982 -1.2889267   0.09289108 -0.14142293  0.22172852
 -0.7539682   1.6191927  -1.5351536   0.67711204]
creating: [-0.8724453   2.1080534  -0.23306686 -0.53616524  0.866615    0.1564897
 -0.31115413 -1.251447    0.3580911   0.01662499]
processing: [ 0.9482841   0.48435536 -0.8159492   1.4652754   0.6811016  -0.87585086
 -0.12284453 -0.5475216   0.5106665   1.5228214 ]
learning: [ 0.6470066   0.811096   -3.0511477  -0.8240166   1.2186054  -0.27947414
  0.6884183  -0.70278436  0.8353686  -1.084025  ]
easy: [ 1.730673    1.3103616   0.02741366 -1.0480531  -0.2312773   1.2492096
 -0.34549558  1.1095289  -0.8407805  -0.84121364]
language: [ 1.1508207  -0.12721634 -0.395085   -1.00345    -1.7495767  -1.5213605
 -0.58156365  0.5056273  -0.03067692 -1.1313243 ]
makes: [ 0.4294922   0.6526363  -0.02071382 -1.4665662  -0.5823222  -0.08748587
  0.12629311  1.453429    2.2575507   0.61133784]
we: [-1.5584903  -0.46279067  1.2836462  -1.2402459  -0.1381

In [12]:
word = "learning"
print(
    f"The token '{word}' is represented by the vector embedding of:\n{embeddings[word_to_idx[word]]}"
)

The token 'learning' is represented by the vector embedding of:
tensor([ 0.6470,  0.8111, -3.0511, -0.8240,  1.2186, -0.2795,  0.6884, -0.7028,
         0.8354, -1.0840])
