# Word2Vec Skip-Gram from Scratch Using PyTorch
This notebook builds a Word2Vec Skip-Gram model using negative sampling on a custom text dataset.

In [None]:
# Step 1: Imports
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Step 2: Use the provided text
text = """Getting numbers is easy; getting numbers you can trust is hard. This practical guide by experimentation leaders at Google, LinkedIn, and Microsoft will teach you how to accelerate innovation using trustworthy online controlled experiments, or A/B tests. Based on practical experiences at companies that each runs more than 20,000 controlled experiments a year, the authors share examples, pitfalls, and advice for students and industry professionals getting started with experiments, plus deeper dives into advanced topics for experienced practitioners who want to improve the way they and their organizations make data-driven decisions.
Learn how to:
● Use the scientific method to evaluate hypotheses using controlled experiments
● Define key metrics and ideally an Overall Evaluation Criterion
● Test for trustworthiness of the results and alert experimenters to violated assumptions
● Interpret and iterate quickly based on the results
● Implement guardrails to protect key business goals
● Build a scalable platform that lowers the marginal cost of experiments close to zero ● Avoid pitfalls such as carryover effects, Twyman’s law, Simpson’s paradox, and network interactions
● Understand how statistical issues play out in practice, including common violations of assumptions"""

# Preprocessing
# Lower-case the corpus and delete punctuation in a single pass. The colon
# and the list bullet are removed as well, so that "to:" and "●" do not
# become vocabulary entries of their own.
tokens = text.lower().translate(str.maketrans('', '', ';.,:●')).split()

# Sort the unique tokens: iterating a set of strings follows hash order,
# which differs between interpreter runs, so an unsorted vocabulary would
# assign different indices to the same word on every run.
vocab = sorted(set(tokens))
word2idx = {word: idx for idx, word in enumerate(vocab)}  # word -> row index
idx2word = {idx: word for word, idx in word2idx.items()}  # row index -> word
vocab_size = len(vocab)

# Generate skip-gram pairs
def generate_pairs(words, window_size=2):
    """Build skip-gram (center, context) pairs from a token sequence.

    Every position is paired with each other position that lies at most
    ``window_size`` steps to its left or right; positions outside the
    sequence are ignored.

    Args:
        words: Indexable sequence of tokens.
        window_size: Maximum distance between a center and its context.

    Returns:
        A list of ``(center, context)`` tuples, ordered by center position
        and then by context position.
    """
    total = len(words)
    return [
        (words[pos], words[neighbor])
        for pos in range(total)
        # Clamp the window to the sequence bounds instead of testing each offset.
        for neighbor in range(max(0, pos - window_size),
                              min(total, pos + window_size + 1))
        if neighbor != pos
    ]

# Build the (center, context) word pairs, then translate every word into its
# vocabulary index so the pairs can be fed to the embedding layers.
pairs = generate_pairs(tokens)
training_data = [
    (word2idx[center_word], word2idx[context_word])
    for center_word, context_word in pairs
]

In [None]:
# Step 3: Model Definition
class SkipGramModel(nn.Module):
    """Skip-gram scorer with separate center and context embedding tables."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the two embedding tables.

        Args:
            vocab_size: Number of rows (words) in each table.
            embedding_dim: Length of every embedding vector.
        """
        super().__init__()
        # in_embed is looked up for center words, out_embed for context words.
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_word, context_word):
        """Score center/context index pairs with a dot product.

        Args:
            center_word: Index tensor looked up in ``in_embed``.
            context_word: Index tensor looked up in ``out_embed``.

        Returns:
            The element-wise product of the two lookups summed over
            dimension 1 (one raw score per pair when the inputs are 1-D
            index tensors of equal length).
        """
        center_vectors = self.in_embed(center_word)
        context_vectors = self.out_embed(context_word)
        return (center_vectors * context_vectors).sum(dim=1)

In [None]:
# Step 4: Training
# Hyperparameters and training objects.
embedding_dim = 50
model = SkipGramModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
# Plain stochastic gradient descent over both embedding tables.
optimizer = optim.SGD(params=model.parameters(), lr=0.01)
# Binary cross-entropy on raw scores (sigmoid applied inside the loss),
# averaged over the elements of each call.
loss_fn = nn.BCEWithLogitsLoss(reduction="mean")

def get_negative_samples(pos_word_idx, num_neg=5):
    """Draw negative word indices uniformly at random.

    Indices are sampled from ``0 .. vocab_size - 1`` (module-level
    ``vocab_size``) and any draw equal to ``pos_word_idx`` is rejected.
    The same index may appear more than once in the result.

    Args:
        pos_word_idx: Index of the positive word that must not be returned.
        num_neg: Number of negative indices to draw.

    Returns:
        A list of ``num_neg`` indices, none equal to ``pos_word_idx``.

    Raises:
        ValueError: If samples are requested but the only index in the
            vocabulary is ``pos_word_idx`` itself.
    """
    # With a single-word vocabulary every draw would be rejected and the
    # loop below would never terminate, so fail loudly instead.
    if num_neg > 0 and vocab_size == 1 and pos_word_idx == 0:
        raise ValueError(
            "cannot draw negative samples: the vocabulary contains only the positive word"
        )

    neg_samples = []
    while len(neg_samples) < num_neg:
        neg = random.randint(0, vocab_size - 1)
        if neg != pos_word_idx:
            neg_samples.append(neg)
    return neg_samples

# Train the model
# One SGD step per (center, context) pair: the observed pair is pushed towards
# label 1 and freshly sampled negative words towards label 0.
positive_target = torch.ones(1)  # the positive label never changes, so build it once
for epoch in range(10):
    random.shuffle(training_data)  # visit the pairs in a new order every epoch
    total_loss = 0
    for center, context in training_data:
        center_idx = torch.tensor([center], dtype=torch.long)
        context_idx = torch.tensor([context], dtype=torch.long)

        negative_ids = get_negative_samples(context)
        num_negatives = len(negative_ids)
        negative_idx = torch.tensor(negative_ids, dtype=torch.long)

        # Clear gradients left over from the previous step before building
        # the new graph.
        optimizer.zero_grad()

        pos_loss = loss_fn(model(center_idx, context_idx), positive_target)
        # The center index is repeated so it lines up with every negative word.
        neg_loss = loss_fn(
            model(center_idx.repeat(num_negatives), negative_idx),
            torch.zeros(num_negatives),
        )

        loss = pos_loss + neg_loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [None]:
# Step 5: Similarity Function
def most_similar(word, topn=5):
    """Return the vocabulary words closest to ``word`` by cosine similarity.

    Similarity is measured between rows of the center-word embedding table
    (``model.in_embed``).

    Args:
        word: Query word; looked up in ``word2idx``.
        topn: Maximum number of neighbours to return.

    Returns:
        A list of ``(neighbour_word, similarity)`` tuples in descending
        similarity order, or ``[]`` when ``word`` is not in the vocabulary.
        The query word itself is never part of the result.
    """
    if word not in word2idx:
        return []
    query_idx = word2idx[word]
    all_vecs = model.in_embed.weight.detach().numpy()
    sims = cosine_similarity(all_vecs[query_idx].reshape(1, -1), all_vecs)[0]
    # Exclude the query by index rather than assuming it is ranked first:
    # a tie or floating-point rounding can place another word at rank 0,
    # which would return the query itself and drop a real neighbour.
    ranked = [int(i) for i in np.argsort(-sims) if i != query_idx]
    return [(idx2word[i], sims[i]) for i in ranked[:max(topn, 0)]]

# Example
# Print the (word, cosine similarity) pairs that most_similar returns for the
# query word 'experiments'.
print(most_similar('experiments'))