In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from itertools import chain
import random
import numpy as np

# Step 1: Prepare a small corpus for training
corpus = [
    "natural language processing is amazing",
    "deep learning is a key area of artificial intelligence",
    "word embeddings represent words in vector space",
    "embeddings are the foundation of modern nlp",
]

# Tokenize the sentences by splitting on whitespace
tokenized_corpus = [sentence.split() for sentence in corpus]

# Create a vocabulary and assign an index to each word.
# The unique words are sorted so that the word -> index assignment is the same
# on every run: iterating a bare set of strings follows hash order, which
# changes between interpreter sessions, so row i of the embedding matrix
# would otherwise stand for a different word each time the script runs.
vocab = sorted(set(chain.from_iterable(tokenized_corpus)))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
# Inverse lookup (index -> word)
idx_to_word = dict(enumerate(vocab))

# Step 2: Generate training data for the Skip-gram model
def generate_training_data(tokenized_corpus, window_size=2):
    """Build the (target, context) word pairs used to train Skip-gram.

    For every position in every sentence, the word at that position is
    paired with each other word lying at most ``window_size`` positions to
    its left or right within the same sentence.

    Args:
        tokenized_corpus: Iterable of sentences, each a list of tokens.
        window_size: Maximum distance between a target and its context word.

    Returns:
        A list of ``(target_word, context_word)`` tuples, ordered by
        sentence, then target position, then context position.
    """
    pairs = []
    for tokens in tokenized_corpus:
        n_tokens = len(tokens)
        for center, target_word in enumerate(tokens):
            # Clamp the window to the sentence boundaries.
            lo = max(center - window_size, 0)
            hi = min(center + window_size + 1, n_tokens)
            pairs.extend(
                (target_word, tokens[pos])
                for pos in range(lo, hi)
                if pos != center  # a word is never its own context
            )
    return pairs

# Build the (target, context) pairs using the default window_size of 2.
# This list is shuffled in place at the start of every training epoch.
training_data = generate_training_data(tokenized_corpus)

# Step 3: Define the Skip-gram model
class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores produced per input index.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # One trainable vector of size embedding_dim per vocabulary index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Maps an embedding to one unnormalized score per vocabulary index.
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word_idx):
        """Return vocabulary-sized scores for the given target word indices."""
        return self.output_layer(self.embedding(target_word_idx))

# Hyperparameters: embedding width and number of distinct words
embedding_dim = 50
vocab_size = len(vocab)

# Build the Skip-gram network
model = SkipGramModel(vocab_size=vocab_size, embedding_dim=embedding_dim)

# Cross-entropy over the vocabulary scores, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 4: Training the Skip-gram model
def word_to_tensor(word):
    """Return the vocabulary index of ``word`` as a one-element long tensor.

    Raises:
        KeyError: If ``word`` is not a key of ``word_to_idx``.
    """
    index = word_to_idx[word]
    return torch.tensor([index], dtype=torch.long)

n_epochs = 100
for epoch in range(n_epochs):
    # Visit the (target, context) pairs in a fresh random order each epoch.
    random.shuffle(training_data)
    total_loss = 0
    for target_word, context_word in training_data:
        # Encode the pair: input index tensor and the class index to predict.
        target_tensor = word_to_tensor(target_word)
        context_idx = word_to_idx[context_word]

        # Clear gradients left over from the previous update.
        optimizer.zero_grad()

        # Forward pass: scores over the vocabulary for this target word.
        output = model(target_tensor)
        loss = criterion(output, torch.tensor([context_idx]))

        # Backward pass and parameter update (one pair per step).
        loss.backward()
        optimizer.step()

        # Accumulate the per-pair loss; the reported value is a sum, not a mean.
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {total_loss:.4f}")

# Step 5: Visualizing word embeddings
def get_word_embedding(word):
    """Return the model's current embedding of ``word`` as a NumPy array.

    The lookup runs with gradient tracking disabled, and the leading
    length-1 dimension that comes from the one-element index tensor is
    squeezed away.
    """
    word_idx = word_to_tensor(word)
    with torch.no_grad():
        vector = model.embedding(word_idx)
    return vector.numpy().squeeze()

# Print the learned vectors for a handful of sample words
print("\nWord Embeddings (example words):")
example_words = ["nlp", "embeddings", "vector", "deep", "processing"]
for word in (w for w in example_words if w in word_to_idx):
    print(f"{word}: {get_word_embedding(word)}")

# Optional: save the full embedding matrix (row i belongs to vocab index i)
# for later use
embedding_matrix = model.embedding.weight.detach().numpy()
np.save("word_embeddings.npy", embedding_matrix)

Epoch 1/100, Loss: 326.5692
Epoch 2/100, Loss: 240.5756
Epoch 3/100, Loss: 221.9105
Epoch 4/100, Loss: 210.2976
Epoch 5/100, Loss: 208.9994
Epoch 6/100, Loss: 203.8143
Epoch 7/100, Loss: 199.3090
Epoch 8/100, Loss: 193.3830
Epoch 9/100, Loss: 189.8686
Epoch 10/100, Loss: 189.0976
Epoch 11/100, Loss: 187.1076
Epoch 12/100, Loss: 183.1389
Epoch 13/100, Loss: 188.2026
Epoch 14/100, Loss: 182.0420
Epoch 15/100, Loss: 180.2733
Epoch 16/100, Loss: 184.4121
Epoch 17/100, Loss: 177.8322
Epoch 18/100, Loss: 177.3433
Epoch 19/100, Loss: 175.0189
Epoch 20/100, Loss: 172.9039
Epoch 21/100, Loss: 174.6234
Epoch 22/100, Loss: 170.9439
Epoch 23/100, Loss: 173.5810
Epoch 24/100, Loss: 171.8322
Epoch 25/100, Loss: 170.2956
Epoch 26/100, Loss: 169.7782
Epoch 27/100, Loss: 169.3074
Epoch 28/100, Loss: 164.6531
Epoch 29/100, Loss: 164.5360
Epoch 30/100, Loss: 164.9567
Epoch 31/100, Loss: 165.6603
Epoch 32/100, Loss: 168.7977
Epoch 33/100, Loss: 166.8100
Epoch 34/100, Loss: 169.9723
Epoch 35/100, Loss: 167