In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import numpy as np
import random


In [None]:
class SkipGram(nn.Module):
    """Skip-gram style network: an embedding lookup followed by a linear layer.

    The embedding table maps each word index to a vector of size
    ``embedding_dim``; the linear layer projects that vector to one score
    per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Layers are created in the same order (embedding, then linear) so a
        # seeded run draws its initial weights in the same sequence.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target):
        """Return the linear layer's scores for the word indices in ``target``.

        No softmax is applied here; the raw scores are returned.
        """
        return self.linear(self.embedding(target))


In [None]:
def prepare_data(text, window_size=2):
    """Build (target, context) pairs from a token sequence.

    Every token is paired with each other token located at most
    ``window_size`` positions away on either side, clipped to the bounds of
    the sequence. A token is never paired with its own position.

    Args:
        text: Indexable sequence of tokens.
        window_size: Maximum distance between target and context positions.

    Returns:
        List of ``(target, context)`` tuples, ordered by target position and
        then by context position.
    """
    n_tokens = len(text)
    return [
        (text[center], text[neighbor])
        for center in range(n_tokens)
        for neighbor in range(max(0, center - window_size),
                              min(n_tokens, center + window_size + 1))
        if neighbor != center
    ]

In [None]:
def train_model(data, vocab_size, embedding_dim, epochs=10, learning_rate=0.001, vocab=None):
    """Train a SkipGram model on (target, context) word pairs with per-pair SGD.

    Args:
        data: Sequence of ``(target_word, context_word)`` pairs. The caller's
            sequence is left untouched; shuffling happens on a private copy.
        vocab_size: Number of rows in the embedding table / output scores.
        embedding_dim: Width of each embedding vector.
        epochs: Number of passes over ``data``.
        learning_rate: Learning rate handed to ``optim.SGD``.
        vocab: Optional mapping from word to integer index. When omitted, the
            module-level ``word_to_idx`` is used, which keeps existing
            three-argument calls working.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)`` holding the
        embedding weights after training.

    Raises:
        KeyError: If a word in ``data`` is missing from the index mapping.
    """
    # Prefer the explicit mapping; fall back to the notebook-level global.
    index_of = word_to_idx if vocab is None else vocab

    model = SkipGram(vocab_size, embedding_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Shuffle a copy so the caller's list keeps its order. The copy is
    # reshuffled cumulatively each epoch, so a seeded run visits the pairs in
    # the same order as when the input list itself was shuffled.
    pairs = list(data)

    for epoch in range(epochs):
        random.shuffle(pairs)
        total_loss = 0.0
        for target_word, context_word in pairs:
            model.zero_grad()
            target_tensor = torch.LongTensor([index_of[target_word]])
            context_tensor = torch.LongTensor([index_of[context_word]])
            output = model(target_tensor)
            loss = criterion(output, context_tensor)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # With no training pairs there is no mean loss; report NaN rather
        # than dividing by zero.
        mean_loss = total_loss / len(pairs) if pairs else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

    # detach() drops the autograd graph; cpu() is a no-op for CPU tensors and
    # keeps the conversion valid if the weights live on another device.
    return model.embedding.weight.detach().cpu().numpy()


In [None]:
# Example usage:
# Seed both random number generators involved in training (torch for weight
# initialisation, random for shuffling the pairs) so that re-running the cell
# prints the same losses and embeddings.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

text = "natural language processing is fun and interesting".split()
data = prepare_data(text)
# Sort the vocabulary before numbering it: the iteration order of a set of
# strings can change between interpreter runs (string hash randomization),
# which would assign different indices to the same words on each run.
word_to_idx = {word: idx for idx, word in enumerate(sorted(set(text)))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(word_to_idx)
embedding_dim = 50

word_embeddings = train_model(data, vocab_size, embedding_dim)

# Print word embeddings
for i, embedding in enumerate(word_embeddings):
    word = idx_to_word[i]
    print(f"{word}: {embedding}")

Epoch 1, Loss: 2.0539676655422556
Epoch 2, Loss: 2.0235471075231377
Epoch 3, Loss: 1.9947397654706782
Epoch 4, Loss: 1.9674148776314475
Epoch 5, Loss: 1.941401801326058
Epoch 6, Loss: 1.9167742891745134
Epoch 7, Loss: 1.8933285854079507
Epoch 8, Loss: 1.8710150664502925
Epoch 9, Loss: 1.849716305732727
Epoch 10, Loss: 1.8294603662057356
is: [-0.6531595   0.33865714 -0.3624338  -1.018374   -0.95648164 -1.3083072
  0.7871692  -0.7319469  -1.0730002  -1.6556005   0.9644851  -1.4050752
 -0.31364802 -1.1399194  -1.1470544   0.40877202 -0.44794974  0.6461737
 -2.7971988  -0.56297785 -0.4739546   0.784937    0.10262039 -0.9652856
  0.32698455 -0.14238454  0.32448795  2.059471   -0.5894113   0.86387897
  1.7739866  -0.4988416  -1.5006317  -0.33302632 -0.10389426 -1.139832
 -0.9651045  -0.17161614 -1.4075046  -2.345691   -0.6596367  -3.0982912
  1.3073208   0.85510087 -1.553278   -0.8986698  -0.6245564   0.8722838
 -0.12938458  0.7257798 ]
natural: [ 0.5534912   1.4498534  -0.11348208 -0.850385