In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim

# --- Data Setup ---
# Word pools, one per slot of the fixed sentence template:
#   <color> <noun> <verb> <adverb> <adjective> .
color = ['blue','red','green','yellow','white']
noun  = ['cat','dog','car','boat','house']
verb  = ['is','was','seems','looks']
adverb = ['quite','absurdly','extremely']
adjective = ['slow','fast','big','small']

# Seed Python's RNG before sampling so that Restart & Run All regenerates
# exactly the same corpus (and therefore the same training pairs).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate random sentences
num_sentences = 100
all_words = color + noun + verb + adverb + adjective + ['.']
vocab_size = len(all_words)
word_to_idx = {word: idx for idx, word in enumerate(all_words)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# A word repeated across pools would collapse to one index in word_to_idx
# while vocab_size still counted it twice, leaving an unused embedding row.
assert len(word_to_idx) == vocab_size, "duplicate word in vocabulary pools"

# Each sentence is one random pick per pool, space-joined, ending in '.'.
sentences = [
    " ".join([
        random.choice(color), random.choice(noun), random.choice(verb),
        random.choice(adverb), random.choice(adjective), '.'
    ])
    for _ in range(num_sentences)
]

# --- Training Data Preparation ---
# Skip-gram-like (center word, context word) pairs: every token is paired
# with each neighbour at most `window_size` positions away in its sentence.
window_size = 1
training_pairs = []
for sentence in sentences:
    tokens = sentence.split()
    last_pos = len(tokens) - 1
    for pos, center_word in enumerate(tokens):
        # Clamp the window to the sentence boundaries.
        first_neighbor = max(0, pos - window_size)
        final_neighbor = min(last_pos, pos + window_size)
        training_pairs.extend(
            (center_word, tokens[neighbor])
            for neighbor in range(first_neighbor, final_neighbor + 1)
            if neighbor != pos  # a word is never its own context
        )

# Convert to indices
training_data = [
    (word_to_idx[center_word], word_to_idx[context_word])
    for center_word, context_word in training_pairs
]

# --- Model Definition ---
class WordEmbeddingModel(nn.Module):
    """Single lookup table mapping word indices to dense vectors."""

    def __init__(self, vocab_size, embed_dim):
        """Create an embedding table with `vocab_size` rows of width `embed_dim`."""
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, word_idx):
        """Return the embedding rows selected by the index tensor `word_idx`."""
        vectors = self.embeddings(word_idx)
        return vectors

# Hyperparameters
embed_dim = 8  # Size of the embedding vector
TORCH_SEED = 42
# Seed torch before the model is built so the random embedding
# initialisation (and hence the printed losses) is repeatable across runs.
torch.manual_seed(TORCH_SEED)
model = WordEmbeddingModel(vocab_size, embed_dim)
optimizer = optim.Adam(model.parameters(), lr=0.0003)
loss_fn = nn.CrossEntropyLoss()

# --- Training Loop ---
# The (center, target) index tensors never change, so build them once
# instead of re-allocating two tensors per pair in every epoch.
pair_tensors = [
    (
        torch.tensor([center], dtype=torch.long),
        torch.tensor([target], dtype=torch.long),
    )
    for center, target in training_data
]

for epoch in range(100):
    total_loss = 0.0  # sum (not mean) of the per-pair losses for this epoch
    for center_tensor, target_tensor in pair_tensors:
        optimizer.zero_grad()
        center_embed = model(center_tensor)  # shape [1, embed_dim]
        # Score every vocabulary word by its dot product with the center
        # embedding; the same table is used for both sides.
        # Result has shape [1, vocab_size], which is what CrossEntropyLoss
        # expects for a batch of one, so no squeeze/transposes are needed.
        scores = center_embed @ model.embeddings.weight.T
        loss = loss_fn(scores, target_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# --- Cosine Similarity Function ---
def cosine_similarity(word1, word2, embeddings=None, vocab=None):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        word1: First word; must be a key of `vocab`.
        word2: Second word; must be a key of `vocab`.
        embeddings: Optional `nn.Embedding` to look the words up in.
            Defaults to the notebook-level `model.embeddings`.
        vocab: Optional word -> index mapping. Defaults to the
            notebook-level `word_to_idx`.

    Returns:
        The cosine similarity as a Python float.

    Raises:
        KeyError: If either word is not in the vocabulary mapping.
    """
    # Fall back to the notebook globals so existing two-argument calls
    # keep working unchanged.
    if embeddings is None:
        embeddings = model.embeddings
    if vocab is None:
        vocab = word_to_idx

    # Pure inference: no need to record an autograd graph.
    with torch.no_grad():
        indices = torch.tensor([vocab[word1], vocab[word2]], dtype=torch.long)
        embed1, embed2 = embeddings(indices)  # two vectors, one per word
        cos_sim = nn.functional.cosine_similarity(embed1, embed2, dim=0)
    return cos_sim.item()

# --- Testing Cosine Similarity ---
# Word pairs to compare after training.
sample_pairs = [
    ("blue", "red"),
    ("cat", "dog"),
    ("big", "small"),
    ("car", "boat"),
]

print("\nSample Cosine Similarities:")
for left_word, right_word in sample_pairs:
    print(f"{left_word} vs {right_word}:", cosine_similarity(left_word, right_word))



Epoch 0, Loss: 8341.1722
Epoch 10, Loss: 3911.9449
Epoch 20, Loss: 2826.4286
Epoch 30, Loss: 2598.5980
Epoch 40, Loss: 2550.0374
Epoch 50, Loss: 2533.0261
Epoch 60, Loss: 2524.9756
Epoch 70, Loss: 2520.4426
Epoch 80, Loss: 2517.5066
Epoch 90, Loss: 2515.3896

Sample Cosine Similarities:
blue vs red: 0.4979579746723175
cat vs dog: 0.7290394902229309
big vs small: 0.7445892095565796
car vs boat: 0.2670944035053253


In [3]:
# Build the label from the words actually compared so the printed text
# cannot drift from the computation (it previously read "cat vs small"
# while comparing "is" and "white").
word_a, word_b = "is", "white"
print(f"{word_a} vs {word_b}:", cosine_similarity(word_a, word_b))

cat vs small: -0.030236084014177322
