# Negative sampling (Skip-gram)

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter


In [2]:

class Word2VecDataset:
    """Vocabulary and skip-gram (center, context) index pairs for a tokenized corpus.

    Attributes:
        corpus: the sentences passed in, each an iterable of hashable tokens.
        window_size: how many positions on each side of a center token count
            as context.
        word_to_idx / idx_to_word: token <-> integer id maps; ids follow the
            order in which tokens first appear in the corpus.
        vocab_size: number of distinct tokens.
        data: list of ``(center_id, context_id)`` tuples.
    """

    def __init__(self, corpus, window_size=2):
        self.corpus = corpus
        self.window_size = window_size
        vocab_maps = self.build_vocab()
        self.word_to_idx, self.idx_to_word, self.vocab_size = vocab_maps
        self.data = self.generate_training_pairs()

    def build_vocab(self):
        """Return ``(word_to_idx, idx_to_word, vocab_size)`` for the corpus."""
        # dict.fromkeys keeps one entry per token, in first-seen order.
        unique_tokens = dict.fromkeys(
            token for sentence in self.corpus for token in sentence
        )
        word_to_idx = {}
        idx_to_word = {}
        for position, token in enumerate(unique_tokens):
            word_to_idx[token] = position
            idx_to_word[position] = token
        return word_to_idx, idx_to_word, len(word_to_idx)

    def generate_training_pairs(self):
        """Return every (center, context) id pair that falls inside the window."""
        radius = self.window_size
        pairs = []
        for sentence in self.corpus:
            token_ids = [self.word_to_idx[token] for token in sentence]
            length = len(token_ids)
            for center_pos, center_id in enumerate(token_ids):
                # Clamp the window to the sentence so no bounds test is needed.
                first = max(0, center_pos - radius)
                stop = min(length, center_pos + radius + 1)
                pairs.extend(
                    (center_id, token_ids[context_pos])
                    for context_pos in range(first, stop)
                    if context_pos != center_pos
                )
        return pairs


In [5]:

class NegativeSamplingLoss(nn.Module):
    """Skip-gram negative-sampling loss.

    For every (center, target) pair the loss is::

        -log sigmoid(u_target . v_center) - sum_k log sigmoid(-u_neg_k . v_center)

    where the ``num_negative_samples`` negative ids are drawn uniformly at
    random from ``[0, vocab_size)``. Negatives are not filtered, so a draw
    may occasionally coincide with the true target.
    """

    def __init__(self, vocab_size, num_negative_samples=5):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_negative_samples = num_negative_samples

    def forward(self, center_embeds, target_indices, model):
        """Return the summed negative-sampling loss for a batch.

        Args:
            center_embeds: float tensor of shape ``(batch, dim)`` holding the
                center-word vectors.
            target_indices: integer tensor of shape ``(batch,)`` with the id
                of the true context word for each row.
            model: object exposing ``outside_embedding``, a callable that maps
                an index tensor to embedding vectors of size ``dim``.

        Returns:
            A 0-dim tensor: the loss summed over the batch.
        """
        outside = model.outside_embedding

        # One score per (center, target) pair: a row-wise dot product.
        # A matmul against the transposed targets would instead score every
        # center against every target in the batch.
        target_embeds = outside(target_indices)
        true_logits = (center_embeds * target_embeds).sum(dim=1)
        # logsigmoid avoids log(0) = -inf when the sigmoid underflows.
        positive_loss = -nn.functional.logsigmoid(true_logits)

        # Negative Sample Loss
        neg_samples = torch.randint(
            0,
            self.vocab_size,
            (center_embeds.size(0), self.num_negative_samples),
            device=center_embeds.device,  # keep indices next to the embeddings
        )
        negative_embeds = outside(neg_samples)
        # (batch, k, dim) @ (batch, dim, 1) -> (batch, k)
        negative_logits = torch.bmm(negative_embeds, center_embeds.unsqueeze(2)).squeeze(2)
        negative_loss = -nn.functional.logsigmoid(-negative_logits).sum(dim=1)

        return torch.sum(positive_loss + negative_loss)


In [6]:
class Word2Vec(nn.Module):
    """Skip-gram model holding two embedding tables of the same shape.

    ``center_embedding`` is looked up for center words and
    ``outside_embedding`` for context (outside) words.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: length of every embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Stored on the instance; the constructor previously read
        # self.vocab_size without ever assigning it.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.center_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.outside_embedding = nn.Embedding(vocab_size, embedding_dim)

    @property
    def outside_embeddings(self):
        """Plural alias of ``outside_embedding`` for callers using that spelling."""
        return self.outside_embedding

    def forward(self, center_words):
        """Return the center-word embeddings for the index tensor ``center_words``."""
        return self.center_embedding(center_words)

    def predict(self, center_embeds):
        """Return softmax probabilities over the vocabulary.

        Args:
            center_embeds: tensor whose last dimension is ``embedding_dim``.

        Returns:
            Tensor with the last dimension replaced by ``vocab_size``; values
            along that dimension sum to 1.
        """
        # nn.Embedding has no .T; the lookup table is its ``weight`` parameter.
        logits = torch.matmul(center_embeds, self.outside_embedding.weight.T)
        # dim=-1 equals dim=1 for (batch, dim) input and also accepts a
        # single un-batched vector.
        return torch.softmax(logits, dim=-1)

In [None]:


# Step 3: Training Loop for Negative Sampling
def train_word2vec_negative_sampling(dataset, embedding_dim=10, epochs=10, learning_rate=0.01, num_negative_samples=5):
    """Train a Word2Vec model with negative sampling, one pair per SGD step.

    Args:
        dataset: object providing ``vocab_size`` and ``data``, an iterable of
            ``(center_idx, outside_idx)`` pairs.
        embedding_dim: size of each embedding vector.
        epochs: number of full passes over ``dataset.data``.
        learning_rate: step size handed to ``optim.SGD``.
        num_negative_samples: negatives drawn per pair by the loss.

    Returns:
        The trained ``Word2Vec`` model. The summed loss of each epoch is
        printed as a side effect.
    """
    model = Word2Vec(dataset.vocab_size, embedding_dim)
    loss_fn = NegativeSamplingLoss(dataset.vocab_size, num_negative_samples)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    def as_index_tensor(index):
        # A single id wrapped as a length-1 long tensor (batch of one).
        return torch.tensor([index], dtype=torch.long)

    for epoch in range(epochs):
        total_loss = 0
        for center_idx, outside_idx in dataset.data:
            center_tensor = as_index_tensor(center_idx)
            outside_tensor = as_index_tensor(outside_idx)

            # Clear stale gradients, then forward / backward / update.
            optimizer.zero_grad()
            loss = loss_fn(model(center_tensor), outside_tensor, model)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

    return model


# Toy corpus: three short tokenized sentences, added one at a time.
corpus = []
corpus.append(["I", "like", "learning", "deep", "learning"])
corpus.append(["deep", "learning", "is", "fun"])
corpus.append(["word2vec", "uses", "word", "embeddings"])

# Build the vocabulary and the (center, context) training pairs.
Dataset = Word2VecDataset(corpus=corpus)


# Example Usage for Negative Sampling
trained_model_negative_sampling = train_word2vec_negative_sampling(dataset=Dataset)
