# Word2Vec (skip-gram algorithm with full softmax)

In [1]:
import torch 
import torch.nn as nn 
import torch.optim as optim 
from collections import Counter
import numpy as np 


Preparing the dataset

In [9]:
class Word2VecDataset:
    """Skip-gram training pairs built from a tokenized corpus.

    Args:
        corpus: Iterable of sentences, each sentence an iterable of tokens.
        window_size: Number of positions on each side of a center word
            that count as its context.

    Attributes set by ``__init__``:
        word_to_idx: Mapping token -> integer id.
        idx_to_word: Mapping integer id -> token.
        vocab_size: Number of distinct tokens.
        data: List of ``(center_id, context_id)`` tuples.
    """

    def __init__(self, corpus, window_size=2):
        self.corpus = corpus
        self.window_size = window_size
        self.word_to_idx, self.idx_to_word, self.vocab_size = self.build_vocab()
        # The method is named ``generate_training_pairs``; the previous call
        # to ``generate_to_training_pairs`` raised AttributeError.
        self.data = self.generate_training_pairs()

    def build_vocab(self):
        """Return ``(word_to_idx, idx_to_word, vocab_size)`` for the corpus.

        Ids are assigned in order of first appearance in the corpus.
        """
        words = [word for sentence in self.corpus for word in sentence]
        # Counter keeps insertion order, so its keys are the unique words
        # in first-occurrence order; the counts themselves are not used.
        word_counts = Counter(words)
        vocab = list(word_counts.keys())
        word_to_idx = {word: idx for idx, word in enumerate(vocab)}
        idx_to_word = {idx: word for word, idx in word_to_idx.items()}
        return word_to_idx, idx_to_word, len(vocab)

    def generate_training_pairs(self):
        """Return every ``(center_id, context_id)`` pair inside the window.

        Pairs never cross sentence boundaries and a word is never paired
        with its own position (it may still be paired with another
        occurrence of the same word).
        """
        pairs = []
        for sentence in self.corpus:
            indices = [self.word_to_idx[word] for word in sentence]
            for center_pos, center_idx in enumerate(indices):
                for offset in range(-self.window_size, self.window_size + 1):
                    context_pos = center_pos + offset
                    # Skip the center itself and positions outside the sentence.
                    if offset != 0 and 0 <= context_pos < len(indices):
                        pairs.append((center_idx, indices[context_pos]))
        return pairs

In [3]:
class Word2Vec(nn.Module):
    """Skip-gram model holding separate center and outside embedding tables.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Width of every embedding vector.

    The tables are exposed as ``center_embeddings`` and
    ``outside_embeddings`` (plural), which is the name the softmax loss in
    this file reads via ``model.outside_embeddings``.
    """

    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Use the constructor argument: ``self.vocab_size`` was never set,
        # so reading it raised AttributeError.
        self.outside_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words):
        """Look up the center embeddings for a tensor of word ids."""
        center_embeds = self.center_embeddings(center_words)
        return center_embeds

    def predict(self, center_embeds):
        """Return softmax probabilities over the vocabulary.

        Scores are the dot products of ``center_embeds`` with every outside
        embedding; the softmax is taken along ``dim=1``, so ``center_embeds``
        is expected to be 2-D with the embedding width last.
        """
        # nn.Embedding is a module; the actual matrix lives in ``.weight``.
        logits = torch.matmul(center_embeds, self.outside_embeddings.weight.T)
        probs = torch.softmax(logits, dim=1)
        return probs

In [4]:
class Softmaxloss(nn.Module):
    """Summed negative log-likelihood of the target words under a full softmax."""

    def __init__(self):
        super(Softmaxloss, self).__init__()

    def forward(self, center_embeds, target_indices, model):
        """Compute the loss for a batch of center embeddings.

        Args:
            center_embeds: 2-D tensor, one center embedding per row.
            target_indices: Long tensor with one target word id per row.
            model: Object exposing ``outside_embeddings.weight``.

        Returns:
            Scalar tensor: minus the sum over rows of the log-probability
            assigned to each row's target word.
        """
        logits = torch.matmul(center_embeds, model.outside_embeddings.weight.T)
        log_probs = torch.log_softmax(logits, dim=1)
        # Pick log_probs[i, target_indices[i]] for every row i. The previous
        # code had the closing parenthesis in the wrong place and passed
        # ``target_indices`` to ``range`` instead of using it as the column index.
        rows = torch.arange(log_probs.size(0), device=log_probs.device)
        loss = -torch.sum(log_probs[rows, target_indices])
        return loss

In [None]:
def train_word2vec_softmax(dataset , embedding_dim = 10 , epochs = 10 , learning_rate =  0.01):
    """Train a skip-gram model with the full-softmax loss using plain SGD.

    Args:
        dataset: Object exposing ``vocab_size`` and ``data``, an iterable of
            ``(center_id, outside_id)`` pairs.
        embedding_dim: Width of the embedding vectors.
        epochs: Number of passes over ``dataset.data``.
        learning_rate: SGD step size.

    Returns:
        Tuple ``(model, total_loss)`` where ``total_loss`` is the summed loss
        of the last epoch (``0.0`` when no epoch was run).
    """
    vocab_size = dataset.vocab_size
    model = Word2Vec(vocab_size, embedding_dim)
    loss_fn = Softmaxloss()

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Bound up front so the return below is valid even when epochs == 0.
    total_loss = 0.0

    for epoch in range(epochs):
        total_loss = 0.0
        # One optimizer step per (center, outside) pair, i.e. batch size 1.
        for center_idx, outside_idx in dataset.data:
            # Convert to tensors
            center_tensor = torch.tensor([center_idx], dtype=torch.long)
            outside_tensor = torch.tensor([outside_idx], dtype=torch.long)

            # Forward pass
            center_embeds = model(center_tensor)
            loss = loss_fn(center_embeds, outside_tensor, model)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

    return model, total_loss


# Example Usage for Softmax
# Tiny tokenized corpus: one list of tokens per sentence.
corpus = [
    ["I", "like", "learning", "deep", "learning"],
    ["deep", "learning", "is", "fun"],
    ["word2vec", "uses", "word", "embeddings"]
]

dataset = Word2VecDataset(corpus)
# train_word2vec_softmax in this cell returns (model, total_loss); unpack it
# so that trained_model_softmax is the model itself rather than a tuple.
trained_model_softmax, final_loss_softmax = train_word2vec_softmax(dataset)
    
    
    
    

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter

# Step 1: Dataset Preparation
class Word2VecDataset:
    """Vocabulary and skip-gram (center, context) id pairs for a corpus.

    ``corpus`` is an iterable of token sequences; ``window_size`` is how many
    positions on each side of a center word are treated as context.
    """

    def __init__(self, corpus, window_size=2):
        self.corpus = corpus
        self.window_size = window_size
        vocab_info = self.build_vocab()
        self.word_to_idx, self.idx_to_word, self.vocab_size = vocab_info
        self.data = self.generate_training_pairs()

    def build_vocab(self):
        """Return ``(word_to_idx, idx_to_word, vocab_size)``.

        Ids follow the order in which words first appear in the corpus.
        """
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        unique_words = list(
            dict.fromkeys(token for sentence in self.corpus for token in sentence)
        )
        word_to_idx = {}
        idx_to_word = {}
        for word_id, token in enumerate(unique_words):
            word_to_idx[token] = word_id
            idx_to_word[word_id] = token
        return word_to_idx, idx_to_word, len(unique_words)

    def generate_training_pairs(self):
        """Return the list of ``(center_id, context_id)`` tuples.

        For each word position, every other in-sentence position at most
        ``window_size`` away contributes one pair, in left-to-right order.
        """
        pairs = []
        for sentence in self.corpus:
            ids = [self.word_to_idx[token] for token in sentence]
            for center_pos, center_idx in enumerate(ids):
                pairs.extend(
                    (center_idx, ids[center_pos + delta])
                    for delta in range(-self.window_size, self.window_size + 1)
                    if delta != 0 and 0 <= center_pos + delta < len(ids)
                )
        return pairs


# Step 2: Word2Vec Model
class Word2Vec(nn.Module):
    """Skip-gram model with two embedding tables of identical shape.

    ``center_embeddings`` is looked up for center words; ``outside_embeddings``
    supplies the vectors each center embedding is scored against.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.center_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.outside_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )

    def forward(self, center_words):
        """Return the center embedding for each id in ``center_words``."""
        return self.center_embeddings(center_words)

    def predict(self, center_embeds):
        """Return softmax (over ``dim=1``) of the scores against all outside vectors."""
        outside_matrix = self.outside_embeddings.weight
        scores = center_embeds @ outside_matrix.T
        return scores.softmax(dim=1)


# Step 3: Softmax Loss Function
class SoftmaxLoss(nn.Module):
    """Negative log-likelihood of target words under a full softmax, summed over rows."""

    def __init__(self):
        super().__init__()

    def forward(self, center_embeds, target_indices, model):
        """Return minus the summed log-probability of each row's target.

        ``model`` must expose ``outside_embeddings.weight``; row ``i`` of
        ``center_embeds`` is scored against every outside vector and the
        log-softmax entry at column ``target_indices[i]`` is selected.
        """
        outside_weights = model.outside_embeddings.weight
        scores = center_embeds @ outside_weights.T
        log_probs = scores.log_softmax(dim=1)
        row_ids = range(log_probs.size(0))
        picked = log_probs[row_ids, target_indices]
        return picked.sum().neg()


# Step 4: Training Loop for Softmax
def train_word2vec_softmax(dataset, embedding_dim=10, epochs=1000, learning_rate=0.01):
    """Fit a Word2Vec model on ``dataset`` with SoftmaxLoss and SGD.

    Each ``(center, outside)`` pair in ``dataset.data`` triggers its own
    optimizer step. After every epoch the summed loss of that epoch is
    printed. Returns the trained model.
    """
    model = Word2Vec(dataset.vocab_size, embedding_dim)
    criterion = SoftmaxLoss()
    sgd = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        total_loss = 0
        for center_idx, outside_idx in dataset.data:
            # Wrap the two ids as length-1 long tensors.
            center_ids = torch.tensor([center_idx], dtype=torch.long)
            target_ids = torch.tensor([outside_idx], dtype=torch.long)

            # Clear stale gradients, then run forward and backward.
            sgd.zero_grad()
            pair_loss = criterion(model(center_ids), target_ids, model)
            pair_loss.backward()
            sgd.step()

            total_loss += pair_loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

    return model


# Example Usage for Softmax
# Three short tokenized sentences serve as the toy corpus.
corpus = [
    ["I", "like", "learning", "deep", "learning"],
    ["deep", "learning", "is", "fun"],
    ["word2vec", "uses", "word", "embeddings"],
]

# Build vocabulary and training pairs, then train with the default settings.
dataset = Word2VecDataset(corpus=corpus)
trained_model_softmax = train_word2vec_softmax(dataset=dataset)


Epoch 1, Loss: 143.7927
Epoch 2, Loss: 123.9490
Epoch 3, Loss: 111.7279
Epoch 4, Loss: 103.4972
Epoch 5, Loss: 97.2527
Epoch 6, Loss: 92.2798
Epoch 7, Loss: 88.2154
Epoch 8, Loss: 84.8135
Epoch 9, Loss: 81.9092
Epoch 10, Loss: 79.3927
Epoch 11, Loss: 77.1857
Epoch 12, Loss: 75.2292
Epoch 13, Loss: 73.4775
Epoch 14, Loss: 71.8952
Epoch 15, Loss: 70.4545
Epoch 16, Loss: 69.1335
Epoch 17, Loss: 67.9148
Epoch 18, Loss: 66.7844
Epoch 19, Loss: 65.7307
Epoch 20, Loss: 64.7442
Epoch 21, Loss: 63.8166
Epoch 22, Loss: 62.9412
Epoch 23, Loss: 62.1121
Epoch 24, Loss: 61.3245
Epoch 25, Loss: 60.5741
Epoch 26, Loss: 59.8575
Epoch 27, Loss: 59.1718
Epoch 28, Loss: 58.5145
Epoch 29, Loss: 57.8837
Epoch 30, Loss: 57.2780
Epoch 31, Loss: 56.6960
Epoch 32, Loss: 56.1368
Epoch 33, Loss: 55.5997
Epoch 34, Loss: 55.0840
Epoch 35, Loss: 54.5893
Epoch 36, Loss: 54.1150
Epoch 37, Loss: 53.6607
Epoch 38, Loss: 53.2258
Epoch 39, Loss: 52.8098
Epoch 40, Loss: 52.4118
Epoch 41, Loss: 52.0310
Epoch 42, Loss: 51.66