In [24]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim

# Define a simple Continuous Bag of Words (CBOW) style model
class CBOWModel(nn.Module):
    """Continuous Bag of Words (CBOW) model.

    Predicts a target word from the mean of the embeddings of its context
    words.

    Args:
        vocab_size: Number of distinct word indices; sets both the number of
            rows in the embedding table and the size of the output logits.
        embedding_dim: Length of each learned word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # The embedding layer stores the word vectors we want to learn.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The linear layer maps the averaged embedding back to the vocabulary
        # size to predict the target word.
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Compute vocabulary logits for one or more context windows.

        Args:
            inputs: Long tensor of context word indices, either
                ``(batch, context_size)`` or a single un-batched window of
                shape ``(context_size,)``.

        Returns:
            Raw (pre-softmax) scores of shape ``(batch, vocab_size)``. A 1-D
            input is treated as a batch of one, so it yields
            ``(1, vocab_size)``.
        """
        # Without this, a 1-D window would be averaged over the embedding axis
        # (dim=1 of a (context, dim) tensor) instead of over the context words,
        # which either crashes in the linear layer or silently produces wrong
        # logits when context_size == embedding_dim.
        if inputs.dim() == 1:
            inputs = inputs.unsqueeze(0)

        embeds = self.embeddings(inputs)
        # Aggregate the context by averaging over the context-word axis.
        h = embeds.mean(dim=1)
        # Produce logits (raw scores) for each word in the vocabulary.
        return self.linear(h)

# Setup vocabulary and mappings
word_to_ix = {"the": 0, "quick": 1, "brown": 2, "fox": 3, "jumps": 4, "over": 5, "lazy": 6, "dog": 7}
ix_to_word = {v: k for k, v in word_to_ix.items()}

# Configuration constants
EMBEDDING_DIM = 5
VOCAB_SIZE = len(word_to_ix)
LEARNING_RATE = 0.01
SEED = 42

# Fix the RNG before any parameters are created so the printed loss and
# weights are identical on every Restart & Run All.
torch.manual_seed(SEED)

# Initialize the model, loss function, and optimizer
model = CBOWModel(VOCAB_SIZE, EMBEDDING_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Prepare dummy training data: context words and the target word.
# Indices are looked up through word_to_ix so they cannot drift out of sync
# with the vocabulary definition above.
context_words = ["quick", "brown", "jumps", "over"]
target_word = "fox"
context_idxs = torch.tensor([word_to_ix[w] for w in context_words], dtype=torch.long)
target_idxs = torch.tensor([word_to_ix[target_word]], dtype=torch.long)

# Perform a single optimization step
optimizer.zero_grad()
# Forward pass: we reshape context to (1, -1) to simulate a batch of size 1
logits = model(context_idxs.view(1, -1))
# Calculate loss against the target word index
loss = loss_function(logits, target_idxs)
# Backpropagate and update weights
loss.backward()
optimizer.step()

# Output results for the blog post.
# Note: the loss was computed before optimizer.step(); the vectors printed
# below are the weights after that single SGD update.
print("=== Model Training Snapshot ===")
print(f"Calculated Loss: {loss.item():.6f}")

print("\n=== Learned Vector for 'jumps' ===")
word_vec = model.embeddings(torch.tensor([word_to_ix['jumps']]))
print(word_vec.detach().numpy())

print("\n=== Embedding Matrix (Weights) ===")
print(model.embeddings.weight.detach().numpy())

print("\n=== Linear Layer Weights ===")
print(model.linear.weight.detach().numpy())

=== Model Training Snapshot ===
Calculated Loss: 1.863045

=== Learned Vector for 'jumps' ===
[[ 0.29590198 -0.48890254 -0.8542548  -0.9656708   0.36455676]]

=== Embedding Matrix (Weights) ===
[[-0.13485748  0.45796484  0.5082475  -0.9655638   0.22472069]
 [-1.0922837  -0.8376392  -1.9895118  -0.6761363   0.52810454]
 [-0.48990375 -1.7160931  -0.10055047 -0.21604995  0.25440407]
 [ 1.1479449   0.37760517 -1.2026685  -0.32337672 -1.864741  ]
 [ 0.29590198 -0.48890254 -0.8542548  -0.9656708   0.36455676]
 [-0.20498867 -0.29153195 -0.4145538   1.1002443  -0.29684016]
 [ 1.2364109  -1.3861771  -1.2367728   0.9709441  -0.3087463 ]
 [ 1.1985914   1.7510175   0.4587708  -1.8716775  -0.15679668]]

=== Linear Layer Weights ===
[[-0.30654982 -0.19728437  0.41693833  0.03748522 -0.38215488]
 [-0.188174   -0.16843402  0.26216137 -0.14779729 -0.08642971]
 [-0.30187497 -0.26113632 -0.19111304 -0.13131228  0.10903408]
 [-0.13401619 -0.09882405 -0.25202116  0.41303945  0.2886999 ]
 [-0.05993969 -0.35

In [None]:
## Skip-gram using Negative Sampling

class SkipGramNegativeSampling(nn.Module):
    """Skip-gram word-embedding model trained with negative sampling.

    Keeps two embedding tables: ``in_embed`` is looked up for center words and
    ``out_embed`` for target (positive) and negative words. ``forward``
    returns the loss directly rather than scores.

    Args:
        vocab_size: Number of distinct word indices in each table.
        embedding_dim: Length of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

        # Small symmetric uniform init, scaled by the embedding size.
        init_range = 0.5 / embedding_dim
        self.in_embed.weight.data.uniform_(-init_range, init_range)
        self.out_embed.weight.data.uniform_(-init_range, init_range)

    def forward(self, center_words, target_words, negative_words):
        """Compute the mean negative-sampling loss over a batch.

        Args:
            center_words: Long tensor of shape ``(batch,)`` with center ids.
            target_words: Long tensor of shape ``(batch,)`` with the true
                context id for each center word.
            negative_words: Long tensor of shape ``(batch, k)`` with ``k``
                sampled negative ids per center word.

        Returns:
            0-dim tensor: the batch mean of
            ``-log(sigmoid(u_pos . v) + 1e-5) - sum_k log(sigmoid(-u_k . v) + 1e-5)``.
        """
        center_vectors = self.in_embed(center_words)        # (B, D)
        target_vectors = self.out_embed(target_words)       # (B, D)
        negative_vectors = self.out_embed(negative_words)   # (B, K, D)

        center_col = center_vectors.unsqueeze(2)            # (B, D, 1)

        # Positive pair score. Both trailing singleton dims are squeezed so the
        # result is (B,); leaving it as (B, 1) would broadcast against the
        # (B,) negative term below into an unintended (B, B) matrix.
        pos_score = torch.bmm(target_vectors.unsqueeze(1), center_col)
        pos_score = pos_score.squeeze(-1).squeeze(-1)       # (B,)
        pos_score = torch.sigmoid(pos_score)

        # Negative pair scores; the sign flip turns "is a true pair" into
        # "is not a true pair".
        neg_score = torch.bmm(negative_vectors, center_col).squeeze(-1)  # (B, K)
        neg_score = torch.sigmoid(-neg_score)

        # The 1e-5 keeps log() finite if a sigmoid saturates to 0.
        loss = -torch.log(pos_score + 1e-5) - torch.sum(torch.log(neg_score + 1e-5), dim=1)  # (B,)

        return torch.mean(loss)

# Configuration for the skip-gram demo.
# NOTE: VOCAB_SIZE, model, optimizer, loss and word_vec rebind names that the
# CBOW cell also assigns; after this cell runs they refer to the skip-gram
# objects.
VOCAB_SIZE = 100
EMBED_DIM = 10
BATCH_SIZE = 1
K_NEGS = 5
SEED = 42

# Fix the RNG before the embeddings are initialised so the printed loss and
# vector are the same on every Restart & Run All.
torch.manual_seed(SEED)

model = SkipGramNegativeSampling(VOCAB_SIZE, EMBED_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01)

# One hand-written training example: center id, its true context id, and
# K_NEGS negative ids.
center_id = torch.tensor([0], dtype=torch.long)
target_id = torch.tensor([1], dtype=torch.long)
negative_ids = torch.tensor([[50, 23, 99, 4, 12]], dtype=torch.long)

# Keep the hand-written tensors consistent with the declared batch layout.
assert center_id.shape == (BATCH_SIZE,)
assert target_id.shape == (BATCH_SIZE,)
assert negative_ids.shape == (BATCH_SIZE, K_NEGS)

# Single optimisation step; the model's forward pass returns the loss itself.
model.zero_grad()
loss = model(center_id, target_id, negative_ids)
loss.backward()
optimizer.step()
print(f"Loss: {loss.item()}")

# word_to_ix comes from the CBOW cell, where 'fox' maps to index 3. Only
# center id 0 is looked up in in_embed during the step above, so row 3 is
# not updated here and this prints its initial value.
word_vec = model.in_embed(torch.tensor([word_to_ix['fox']]))
print(f"Vector for fox is {word_vec.detach().numpy()}")
        

Loss: 4.164300441741943
Vector for is [[ 0.00032597  0.04756524 -0.0195841   0.03475741  0.0301356  -0.03958734
   0.02311508 -0.00974941 -0.00969105  0.03378046]]
