# Step 1: Import Libraries

In [1]:
import numpy as np
import random

In [2]:
# Sample text corpus
corpus = "the quick brown fox jumps over the lazy dog".split()

In [3]:
# Create a vocabulary and word-to-index mapping
vocab = set(corpus)
vocab_size = len(vocab)
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}

# Step 2: Generate Training Data

In [4]:
def generate_training_data(corpus, window_size):
    data = []
    for i in range(window_size, len(corpus) - window_size):
        context = []
        for j in range(i - window_size, i + window_size + 1):
            if j != i:
                context.append(word2idx[corpus[j]])
        target = word2idx[corpus[i]]
        data.append((context, target))
    return data

window_size = 2
training_data = generate_training_data(corpus, window_size)


# Step 3: Train CBOW Model

In [5]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / np.sum(e_x, axis=0)

# Initialize weights
embedding_dim = 5
W1 = np.random.randn(vocab_size, embedding_dim)
W2 = np.random.randn(embedding_dim, vocab_size)

# Training parameters
learning_rate = 0.01
epochs = 1000

for epoch in range(epochs):
    loss = 0
    for context, target in training_data:
        # Average of context word embeddings
        x = np.mean(W1[context], axis=0).reshape(embedding_dim, 1)

        # Forward pass
        z = np.dot(W2.T, x)            # (vocab_size, 1)
        y_pred = softmax(z)

        # True label
        y_true = np.zeros((vocab_size, 1))
        y_true[target] = 1

        # Error
        e = y_pred - y_true
        loss += -np.sum(y_true * np.log(y_pred + 1e-9))

        # Gradients
        dW2 = np.dot(x, e.T)                  # (embedding_dim, vocab_size)
        dW1_context = np.dot(W2, e)           # (embedding_dim, 1)

        # Update weights
        for c in context:
            W1[c] -= learning_rate * dW1_context.flatten()
        W2 -= learning_rate * dW2

    if epoch % 200 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 0, Loss: 9.9991
Epoch 200, Loss: 0.9853
Epoch 400, Loss: 0.2779
Epoch 600, Loss: 0.1474
Epoch 800, Loss: 0.0974


# Step 4: Output

In [6]:
def get_vector(word):
    return W1[word2idx[word]]

print("\nWord Embeddings (sample):")
for w in ["quick", "fox", "dog"]:
    print(f"{w}: {get_vector(w)}")


Word Embeddings (sample):
quick: [ 4.54842939  2.25254961 -0.75956454  1.17414518 -1.06427633]
fox: [-0.0134341  -1.5883493   4.30338727  0.53444457 -0.72027081]
dog: [ 0.09237521 -2.67347207 -2.16003865 -1.11802079 -0.27676876]
