In [2]:
# Toy training corpus: four short English sentences that share a handful of
# words ("quick", "the", "store", ...), so co-occurrence patterns exist to learn.
corpus = [
    "the quick brown fox jumped over the lazy dog",
    "I seek the quickest route to the store",
    "we are about to make a quick trip to the store",
    "the quick brown fox leaped over the lazy dogs",
]

In [3]:
import numpy as np

def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` into whitespace-separated tokens.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A list with one entry per sentence; each entry is the list of tokens
        produced by ``str.split()`` (no lower-casing or punctuation handling).
    """
    return [sentence.split() for sentence in corpus]

def build_vocab(words):
    """Map each distinct word to an integer id, numbered by first appearance.

    Args:
        words: iterable of tokenized sentences (each an iterable of words).

    Returns:
        dict of word -> id, where ids run 0, 1, 2, ... in the order the words
        are first encountered.
    """
    word_to_id = {}
    for tokens in words:
        for token in tokens:
            # len() is evaluated before the insert, so a new word receives the
            # next unused id; an already-known word keeps its existing id.
            word_to_id.setdefault(token, len(word_to_id))
    return word_to_id

def generate_training_data(words, vocab, context_window):
    """Build (centre word, context word) one-hot training pairs.

    For every position in every sentence, each neighbour within
    ``context_window`` positions on either side (clipped to the sentence
    bounds) yields one training pair.

    Args:
        words: list of tokenized sentences (lists of words).
        vocab: dict mapping every word in ``words`` to its integer id.
        context_window: number of neighbours considered on each side.

    Returns:
        Tuple ``(X, Y)`` of NumPy arrays built from the collected one-hot
        vectors: row k of ``X`` encodes the centre word of pair k and row k of
        ``Y`` encodes its context word.

    Raises:
        KeyError: if a word is missing from ``vocab``.
    """
    centres = []
    contexts = []
    for tokens in words:
        n_tokens = len(tokens)
        for pos, centre in enumerate(tokens):
            centre_id = vocab[centre]
            # Clip the window so it never reaches outside the sentence.
            first = max(pos - context_window, 0)
            stop = min(pos + context_window + 1, n_tokens)
            for ctx_pos in range(first, stop):
                if ctx_pos == pos:
                    # A word is not its own context.
                    continue
                context_id = vocab[tokens[ctx_pos]]
                centres.append(to_one_hot(centre_id, len(vocab)))
                contexts.append(to_one_hot(context_id, len(vocab)))
    return np.asarray(centres), np.asarray(contexts)

def to_one_hot(word_index, vocab_size):
    """Return a length-``vocab_size`` float vector that is 1 at ``word_index``.

    Args:
        word_index: position to set to one.
        vocab_size: length of the returned vector.

    Returns:
        1-D NumPy array of zeros with a single 1 at ``word_index``.
    """
    one_hot = np.zeros(vocab_size)
    one_hot[word_index] = 1
    return one_hot

def _softmax_with_log(u):
    """Return ``(softmax(u), log(softmax(u)))`` for a 1-D score vector ``u``.

    The log-probabilities are computed with the log-sum-exp identity instead of
    ``np.log(softmax(u))``, so they stay finite even when a probability
    underflows to exactly 0.
    """
    shifted = u - np.max(u)
    exp_shifted = np.exp(shifted)
    total = exp_shifted.sum()  # >= 1, because the largest entry contributes exp(0)
    return exp_shifted / total, shifted - np.log(total)


def train(X, Y, vocab_size, embed_size, epochs, learning_rate, seed=None):
    """Train a two-layer skip-gram style network with per-sample SGD.

    For each ``(x, y_true)`` pair the model computes ``h = W1.T @ x``,
    ``u = W2.T @ h`` and ``y_pred = softmax(u)``, then takes one gradient step
    on the cross-entropy loss. The summed loss of every epoch is printed.

    Args:
        X: iterable of input vectors of length ``vocab_size``.
        Y: iterable of target vectors of length ``vocab_size``, paired with ``X``.
        vocab_size: number of rows of ``W1`` / columns of ``W2``.
        embed_size: dimensionality of the hidden (embedding) layer.
        epochs: number of full passes over ``(X, Y)``.
        learning_rate: SGD step size.
        seed: optional seed for the weight initialisation. With the default
            ``None`` the weights are drawn from NumPy's global RNG (so
            ``np.random.seed`` still controls them); otherwise a private
            ``np.random.default_rng(seed)`` generator is used.

    Returns:
        Tuple ``(W1, W2)``: ``W1`` has shape ``(vocab_size, embed_size)`` and
        ``W2`` has shape ``(embed_size, vocab_size)``.
    """
    if seed is None:
        draw_uniform = np.random.uniform
    else:
        draw_uniform = np.random.default_rng(seed).uniform
    W1 = draw_uniform(-1, 1, (vocab_size, embed_size))
    W2 = draw_uniform(-1, 1, (embed_size, vocab_size))

    for epoch in range(epochs):
        loss = 0
        for x, y_true in zip(X, Y):
            # Forward pass
            h = np.dot(W1.T, x)
            u = np.dot(W2.T, h)
            y_pred, log_y_pred = _softmax_with_log(u)

            # Error (gradient of the cross-entropy w.r.t. the scores u)
            e = -y_true + y_pred

            # Backpropagation; both gradients use the pre-update W2
            dW2 = np.outer(h, e)
            dW1 = np.outer(x, np.dot(W2, e))

            # Update weights
            W1 -= learning_rate * dW1
            W2 -= learning_rate * dW2

            # Cross-entropy from the log-probabilities directly: taking
            # np.log(y_pred) yields -inf for underflowed entries, and
            # 0 * -inf would turn the whole epoch loss into NaN.
            loss -= np.sum(y_true * log_y_pred)

        print(f'Epoch: {epoch}, Loss: {loss:.4f}')

    return W1, W2

def softmax(x):
    """Exponentiate ``x`` and normalise the result along axis 0.

    The maximum of ``x`` is subtracted before exponentiating so that
    ``np.exp`` cannot overflow; for a 1-D input this does not change the result.

    Args:
        x: NumPy array of scores.

    Returns:
        Array of the same shape as ``x`` whose entries sum to 1 along axis 0.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

# Preparing the data
tokenized_corpus = tokenize_corpus(corpus)
vocab = build_vocab(tokenized_corpus)

# Hyperparameters
context_window = 1  # Window size for context (on one side)
embed_size = 10  # Dimensions of the embedding vector
epochs = 1000
learning_rate = 0.01
random_seed = 42  # Fixes the random weight initialisation

# train() draws its initial weights from NumPy's global RNG; seeding it here
# makes the printed losses and the learned vectors identical on every
# "Restart & Run All".
np.random.seed(random_seed)

# Generate training data
X, Y = generate_training_data(tokenized_corpus, vocab, context_window)

# Training the model (only the input-side weights W1 are kept as embeddings)
W1, _ = train(X, Y, len(vocab), embed_size, epochs, learning_rate)

# Extracting the vector for a word
def get_word_vector(word, vocab, W1):
    """Look up the row of ``W1`` that belongs to ``word``.

    Args:
        word: the word to look up.
        vocab: mapping of word -> row index.
        W1: indexable collection of word vectors, one per vocabulary id.

    Returns:
        ``W1[vocab[word]]``.

    Raises:
        KeyError: if ``word`` is not a key of ``vocab``.
    """
    return W1[vocab[word]]

# Sanity check: fetch and display the learned embedding of one vocabulary word
word_str = "quick"
word_vector = get_word_vector(word_str, vocab=vocab, W1=W1)
print(f'Word vector for {word_str}: {word_vector}')

Epoch: 0, Loss: 253.7409
Epoch: 1, Loss: 243.6357
Epoch: 2, Loss: 234.7621
Epoch: 3, Loss: 226.8667
Epoch: 4, Loss: 219.7563
Epoch: 5, Loss: 213.2815
Epoch: 6, Loss: 207.3249
Epoch: 7, Loss: 201.7936
Epoch: 8, Loss: 196.6141
Epoch: 9, Loss: 191.7286
Epoch: 10, Loss: 187.0917
Epoch: 11, Loss: 182.6684
Epoch: 12, Loss: 178.4328
Epoch: 13, Loss: 174.3663
Epoch: 14, Loss: 170.4566
Epoch: 15, Loss: 166.6970
Epoch: 16, Loss: 163.0850
Epoch: 17, Loss: 159.6211
Epoch: 18, Loss: 156.3079
Epoch: 19, Loss: 153.1488
Epoch: 20, Loss: 150.1464
Epoch: 21, Loss: 147.3016
Epoch: 22, Loss: 144.6129
Epoch: 23, Loss: 142.0763
Epoch: 24, Loss: 139.6850
Epoch: 25, Loss: 137.4302
Epoch: 26, Loss: 135.3020
Epoch: 27, Loss: 133.2896
Epoch: 28, Loss: 131.3825
Epoch: 29, Loss: 129.5705


Epoch: 30, Loss: 127.8445
Epoch: 31, Loss: 126.1963
Epoch: 32, Loss: 124.6189
Epoch: 33, Loss: 123.1062
Epoch: 34, Loss: 121.6529
Epoch: 35, Loss: 120.2547
Epoch: 36, Loss: 118.9078
Epoch: 37, Loss: 117.6090
Epoch: 38, Loss: 116.3555
Epoch: 39, Loss: 115.1447
Epoch: 40, Loss: 113.9744
Epoch: 41, Loss: 112.8428
Epoch: 42, Loss: 111.7478
Epoch: 43, Loss: 110.6880
Epoch: 44, Loss: 109.6617
Epoch: 45, Loss: 108.6675
Epoch: 46, Loss: 107.7040
Epoch: 47, Loss: 106.7702
Epoch: 48, Loss: 105.8646
Epoch: 49, Loss: 104.9864
Epoch: 50, Loss: 104.1345
Epoch: 51, Loss: 103.3078
Epoch: 52, Loss: 102.5056
Epoch: 53, Loss: 101.7270
Epoch: 54, Loss: 100.9711
Epoch: 55, Loss: 100.2373
Epoch: 56, Loss: 99.5247
Epoch: 57, Loss: 98.8328
Epoch: 58, Loss: 98.1609
Epoch: 59, Loss: 97.5083
Epoch: 60, Loss: 96.8746
Epoch: 61, Loss: 96.2591
Epoch: 62, Loss: 95.6613
Epoch: 63, Loss: 95.0807
Epoch: 64, Loss: 94.5168
Epoch: 65, Loss: 93.9691
Epoch: 66, Loss: 93.4372
Epoch: 67, Loss: 92.9206
Epoch: 68, Loss: 92.4189