In [None]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim

# Define a simple Continuous Bag of Words (CBOW) style model
class CBOWModel(nn.Module):
    """Continuous Bag of Words (CBOW) style model.

    Context word indices are looked up in an embedding table, averaged into a
    single vector, and projected by a linear layer to one logit per
    vocabulary word.
    """

    def __init__(self, vocab_size: int, embedding_dim: int) -> None:
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                logits produced per example.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        # The embedding layer stores the word vectors we want to learn
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Maps the averaged context embedding back to vocabulary-sized logits
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Return vocabulary logits for the given context word indices.

        Args:
            inputs: Integer tensor of context word indices, shaped
                ``(batch, context)`` or ``(context,)`` for a single
                unbatched example.

        Returns:
            Raw scores of shape ``(batch, vocab_size)``, or ``(vocab_size,)``
            for unbatched input.
        """
        embeds = self.embeddings(inputs)
        # After the lookup the context axis is always second-to-last, so
        # averaging over dim=-2 works for batched and unbatched input alike
        # (a hard-coded dim=1 would average the embedding axis of 1-D input).
        h = embeds.mean(dim=-2)
        # Produce logits (raw scores) for each word in the vocabulary
        return self.linear(h)

# Vocabulary: each word's index is its position in this tuple
word_to_ix = {
    word: index
    for index, word in enumerate(
        ("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog")
    )
}
# Reverse lookup from index back to word
ix_to_word = {index: word for word, index in word_to_ix.items()}

# Hyperparameters
EMBEDDING_DIM = 5
VOCAB_SIZE = len(word_to_ix)
LEARNING_RATE = 0.01

# Model, loss, and optimizer
model = CBOWModel(VOCAB_SIZE, EMBEDDING_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# A single toy example: the four words around "fox" are the context,
# and "fox" itself is the word to predict.
context_idxs = torch.tensor(
    [word_to_ix[word] for word in ("quick", "brown", "jumps", "over")],
    dtype=torch.long,
)
target_idxs = torch.tensor([word_to_ix["fox"]], dtype=torch.long)

# One optimization step
model.zero_grad()
# Add a leading axis so the context is presented as a batch of size 1
logits = model(context_idxs.unsqueeze(0))
loss = loss_function(logits, target_idxs)
loss.backward()
optimizer.step()

# Report the loss and the parameters after the step
print("=== Model Training Snapshot ===")
print(f"Calculated Loss: {loss.item():.6f}")

print(f"\n=== Learned Vector for 'jumps' ===")
word_vec = model.embeddings(torch.tensor([word_to_ix["jumps"]]))
print(word_vec.detach().numpy())

# Both weight matrices are dumped the same way: heading, then values
for heading, weights in (
    ("\n=== Embedding Matrix (Weights) ===", model.embeddings.weight),
    ("\n=== Linear Layer Weights ===", model.linear.weight),
):
    print(heading)
    print(weights.detach().numpy())

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class SkipGramNegativeSampling(nn.Module):
    """Skip-Gram with Negative Sampling (SGNS).

    Instead of a softmax over the entire vocabulary, SGNS trains the model to
    tell a real context word (positive) apart from K noise words (negative).
    """

    def __init__(self, vocab_size: int, embedding_dim: int) -> None:
        """Create and initialize the two embedding tables.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        # Input embeddings: used when the word is the center word
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        # Output embeddings: used when the word is a context or negative sample
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

        # Start from small uniform values so initial dot products stay near 0
        initrange = 0.5 / embedding_dim
        nn.init.uniform_(self.in_embed.weight, -initrange, initrange)
        nn.init.uniform_(self.out_embed.weight, -initrange, initrange)

    def forward(
        self,
        center_words: torch.Tensor,
        target_words: torch.Tensor,
        negative_words: torch.Tensor,
    ) -> torch.Tensor:
        """Compute the mean negative-sampling loss over the batch.

        Args:
            center_words: Index tensor of shape ``(batch_size,)``.
            target_words: Index tensor of shape ``(batch_size,)``.
            negative_words: Index tensor of shape ``(batch_size, K)`` where K
                is the number of negative samples.

        Returns:
            Scalar tensor: the batch mean of
            ``-(log sigmoid(v_c . u_o) + sum_k log sigmoid(-v_c . u_k))``.
        """
        v_c = self.in_embed(center_words)      # (batch_size, embed_dim)
        u_o = self.out_embed(target_words)     # (batch_size, embed_dim)
        u_n = self.out_embed(negative_words)   # (batch_size, K, embed_dim)

        # logsigmoid is evaluated in a numerically stable way. The earlier
        # log(sigmoid(x) + eps) form floors each term at log(eps) and loses
        # its gradient once sigmoid underflows, i.e. exactly when the model
        # is most wrong.
        log_sigmoid = nn.functional.logsigmoid

        # 1. Positive term: log(sigmoid(v_c . u_o)), shape (batch, 1)
        pos_score = (u_o * v_c).sum(dim=1, keepdim=True)
        pos_loss = log_sigmoid(pos_score)

        # 2. Negative term: sum_k log(sigmoid(-v_c . u_k))
        # (batch, K, dim) @ (batch, dim, 1) -> (batch, K, 1) -> (batch, K)
        neg_score = torch.bmm(u_n, v_c.unsqueeze(2)).squeeze(2)
        neg_loss = log_sigmoid(-neg_score).sum(dim=1, keepdim=True)

        # The loss is the negated objective, averaged over the batch
        return -(pos_loss + neg_loss).mean()

# --- Configuration & Mock Data ---
VOCAB_SIZE = 100
EMBED_DIM = 10
# Example vocabulary mapping; only 'fox' is needed for the lookup below
word_to_ix = {'fox': 0}

model = SkipGramNegativeSampling(VOCAB_SIZE, EMBED_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Mock batch of size 1: one center word, one true context word,
# and a row of five negative samples.
center_id = torch.tensor([0], dtype=torch.long)
target_id = torch.tensor([1], dtype=torch.long)
negative_ids = torch.tensor([50, 23, 99, 4, 12], dtype=torch.long).unsqueeze(0)

# Single training step
model.zero_grad()
loss = model(
    center_words=center_id,
    target_words=target_id,
    negative_words=negative_ids,
)
loss.backward()
optimizer.step()

# Report the loss and the input embedding of 'fox' after the update
print(f"Loss after one step: {loss.item():.6f}")
word_vec = model.in_embed(torch.tensor([word_to_ix['fox']]))
print(f"Vector for 'fox':\n{word_vec.detach().numpy()}")
        