## Step 1: Import necessary libraries

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import nltk
# Fetch the Brown corpus package so that nltk.corpus.brown can be read in Step 2.
nltk.download('brown')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

## Step 2: Load a real corpus

In [2]:
# Number of Brown sentences used as the training corpus (tunable in one place).
NUM_SENTENCES = 50

# Load the first NUM_SENTENCES tokenized sentences of the Brown corpus from NLTK
corpus = nltk.corpus.brown.sents()[:NUM_SENTENCES]

In [3]:
# Preview only the first few sentences; echoing the whole corpus floods the notebook output.
corpus[:3]

[['The',
  'Fulton',
  'County',
  'Grand',
  'Jury',
  'said',
  'Friday',
  'an',
  'investigation',
  'of',
  "Atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.'],
 ['The',
  'jury',
  'further',
  'said',
  'in',
  'term-end',
  'presentments',
  'that',
  'the',
  'City',
  'Executive',
  'Committee',
  ',',
  'which',
  'had',
  'over-all',
  'charge',
  'of',
  'the',
  'election',
  ',',
  '``',
  'deserves',
  'the',
  'praise',
  'and',
  'thanks',
  'of',
  'the',
  'City',
  'of',
  'Atlanta',
  "''",
  'for',
  'the',
  'manner',
  'in',
  'which',
  'the',
  'election',
  'was',
  'conducted',
  '.'],
 ['The',
  'September-October',
  'term',
  'jury',
  'had',
  'been',
  'charged',
  'by',
  'Fulton',
  'Superior',
  'Court',
  'Judge',
  'Durwood',
  'Pye',
  'to',
  'investigate',
  'reports',
  'of',
  'possible',
  '``',
  'irregularities',
  "''",
 

## Step 3: Prepare training data

In [4]:
# Placeholder token for words that are not in the vocabulary. random_batch falls back to
# word2index['<UNK>'], so the token must be part of the vocabulary for that lookup to resolve.
UNK_TOKEN = '<UNK>'

# Flatten the corpus and get unique (lower-cased) words, plus the unknown-word token.
# Sorting gives every word the same index on every run; iterating a set of strings
# directly yields an order that can change between kernel restarts.
vocab = sorted({word.lower() for sent in corpus for word in sent} | {UNK_TOKEN})

# Numericalize the words in the corpus: word -> index and the inverse index -> word
word2index = {w: i for i, w in enumerate(vocab)}
index2word = {i: w for i, w in enumerate(vocab)}

# Define a function to generate random training examples
def random_batch(batch_size, context_size):
    """Sample one CBOW training example: a target word and its surrounding context.

    Reads the module-level ``corpus`` (a sequence of tokenized sentences) and
    ``word2index`` (lower-cased word -> integer index).

    Args:
        batch_size: Accepted for interface compatibility but not used; every call
            returns exactly one (target, context) example.
        context_size: Maximum number of words taken on each side of the target.
            Must be at least 1.

    Returns:
        A tuple ``(target, context)`` of NumPy integer arrays. ``target`` has shape
        ``(1,)``; ``context`` holds between 1 and ``2 * context_size`` indices.

    Raises:
        ValueError: If ``context_size < 1``, if ``corpus`` is empty, or if no sentence
            with at least two tokens could be sampled.
        KeyError: If a word is missing from ``word2index`` and there is no
            ``'<UNK>'`` entry to fall back to.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    def to_index(word):
        # Resolve the '<UNK>' fallback lazily. Writing it as
        # ``word2index.get(key, word2index['<UNK>'])`` evaluates the default eagerly and
        # raises KeyError whenever '<UNK>' is absent, even when ``key`` itself is present.
        key = word.lower()
        if key in word2index:
            return word2index[key]
        return word2index['<UNK>']

    # Select a random sentence from the corpus. Sentences with fewer than two tokens
    # cannot provide any context word, so they are re-drawn (bounded number of attempts).
    max_attempts = 100
    for _ in range(max_attempts):
        sentence = corpus[np.random.randint(len(corpus))]
        if len(sentence) >= 2:
            break
    else:
        raise ValueError("could not sample a sentence with at least two tokens")

    # Select a random target word from the sentence
    target_word_idx = np.random.randint(len(sentence))
    target_word = sentence[target_word_idx]

    # Generate context words for the target word: a window clipped to the sentence
    # bounds, excluding the target position itself.
    window_start = max(0, target_word_idx - context_size)
    window_end = min(len(sentence), target_word_idx + context_size + 1)
    context_words = [sentence[i] for i in range(window_start, window_end) if i != target_word_idx]

    # Convert the target word and context words to numerical indices
    target_idx = to_index(target_word)
    context_idxs = [to_index(w) for w in context_words]

    # Return the target and context word indices
    return np.array([target_idx]), np.array(context_idxs)

## Step 4: Build CBOW Model Class

In [5]:
# Define the CBOW model
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Embeds the context words, averages their embeddings into a single context vector,
    and projects that vector to one logit per vocabulary word.

    Args:
        vocab_size: Number of distinct word indices (rows of the embedding table and
            size of the output logits).
        embedding_size: Dimensionality of each word embedding.
    """

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, context_idxs):
        """Compute vocabulary logits for one or several contexts.

        Args:
            context_idxs: Integer tensor of word indices, either ``(num_context,)`` for a
                single example or ``(batch, num_context)`` for a batch.

        Returns:
            Logits of shape ``(vocab_size,)`` for a 1-D input, or ``(batch, vocab_size)``
            for a 2-D input.
        """
        # Get embeddings for the context words: (..., num_context, embedding_size)
        embeddings = self.embedding(context_idxs)
        # Average over the context axis (second to last). For a 1-D input this is the same
        # as dim=0; for a batched input it keeps the examples separate instead of
        # averaging across the batch.
        context_vector = embeddings.mean(dim=-2)
        # Get logits for all words in the vocabulary
        logits = self.linear(context_vector)
        return logits

## Step 5: Train CBOW Model

In [6]:
# Seed NumPy (used by random_batch for sampling) and PyTorch (used to initialize the
# model parameters) so that Restart & Run All reproduces the same training run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Instantiate the model and optimizer
vocab_size = len(vocab)
embedding_size = 100  # dimensionality of each word embedding
model = CBOW(vocab_size, embedding_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define a function to train the model
def train(model, optimizer, num_epochs, batch_size, context_size):
    """Train ``model`` one example at a time on samples drawn from ``random_batch``.

    Args:
        model: Module mapping a 1-D tensor of context word indices to vocabulary logits
            (shape ``(vocab_size,)`` or ``(1, vocab_size)``).
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of epochs to run.
        batch_size: Number of examples drawn per epoch; each one triggers its own
            optimizer step. Also forwarded to ``random_batch``.
        context_size: Context window size forwarded to ``random_batch``.

    Returns:
        A list with the mean per-example loss of every epoch.
    """
    # Build the loss module once instead of on every step.
    criterion = nn.CrossEntropyLoss()
    loss_history = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_steps = 0
        for _ in range(batch_size):
            target_idxs, context_idxs = random_batch(batch_size, context_size)
            # as_tensor with an explicit dtype works for any integer (or empty) array,
            # independent of the platform's default NumPy integer width.
            target_idxs = torch.as_tensor(target_idxs, dtype=torch.long).reshape(-1)
            context_idxs = torch.as_tensor(context_idxs, dtype=torch.long)
            if context_idxs.numel() == 0:
                # Averaging zero embeddings would yield NaN logits and corrupt the weights.
                continue
            logits = model(context_idxs)
            if logits.dim() == 1:
                # CrossEntropyLoss pairs a (1,) target with (1, vocab_size) logits;
                # un-batched (vocab_size,) logits would raise a size-mismatch error.
                logits = logits.unsqueeze(0)
            loss = criterion(logits, target_idxs)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_steps += 1
        # Average over the steps actually taken (equals batch_size unless samples were skipped).
        avg_loss = total_loss / max(num_steps, 1)
        loss_history.append(avg_loss)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}: loss = {avg_loss:.4f}")
    return loss_history