In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Sample corpus
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
    "the mat is soft and warm"
]
# Preprocess text: Tokenization and Lowercasing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1  # +1 for padding
# Convert text to sequences
sequences = tokenizer.texts_to_sequences(corpus)


In [2]:
# Stage b: Generate Training Data
def generate_training_data(sequences, window_size=2):
    contexts = []
    targets = []
    for sequence in sequences:
        for i in range(window_size, len(sequence) - window_size):
            context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
            target = sequence[i]
            contexts.append(context)
            targets.append(target)
    return np.array(contexts), np.array(targets)
X, y = generate_training_data(sequences)
# Pad sequences for consistent input shape
X = pad_sequences(X, maxlen=4)  # Adjust maxlen based on context size

In [3]:
# Stage c: Train Model
# Define CBOW model architecture
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=10, input_length=4))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))  # Average embeddings
model.add(Dense(total_words, activation='softmax'))
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
model.fit(X, y, epochs=100)


Epoch 1/100





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0000e+00 - loss: 2.8405
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.0000e+00 - loss: 2.8370
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.0000e+00 - loss: 2.8336
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.1250 - loss: 2.8301
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.1250 - loss: 2.8267
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.3750 - loss: 2.8232
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.3750 - loss: 2.8198
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.3750 - loss: 2.8164
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x150488e7620>

In [4]:
# Stage d: Output
# Get word embeddings from the trained model
word_embeddings = model.layers[0].get_weights()[0]

# Create a mapping of words to their embeddings
word_index = tokenizer.word_index
print('Vocabulary Size:', len(word_index))
print('Vocabulary Sample:', list(word_index.items())[:10], "\n\n")

embeddings_dict = {word: word_embeddings[idx] for word, idx in word_index.items()}

# Output the embeddings for each word in a structured format
print("{:<10} | {}".format("Word", "Embedding"))
print("-" * 40)
for word, embedding in embeddings_dict.items():
    print("{:<10} | {}".format(word, np.round(embedding, 3)))


Vocabulary Size: 16
Vocabulary Sample: [('the', 1), ('sat', 2), ('on', 3), ('mat', 4), ('and', 5), ('cat', 6), ('dog', 7), ('log', 8), ('cats', 9), ('dogs', 10)] 


Word       | Embedding
----------------------------------------
the        | [ 0.198 -0.197 -0.048 -0.023 -0.274  0.031 -0.163  0.281 -0.118  0.298]
sat        | [-0.17  -0.17   0.207  0.144 -0.152  0.134  0.142  0.135 -0.145  0.159]
on         | [ 0.208 -0.163  0.143 -0.131 -0.178 -0.135 -0.105  0.16  -0.109  0.2  ]
mat        | [-0.004 -0.185 -0.159 -0.001 -0.04   0.201 -0.025  0.025  0.126  0.134]
and        | [ 0.176  0.021 -0.123 -0.134 -0.006  0.062 -0.079 -0.018  0.099  0.05 ]
cat        | [ 0.055 -0.124  0.192  0.076 -0.129  0.012 -0.056  0.131 -0.132  0.2  ]
dog        | [ 0.006 -0.157  0.187  0.084 -0.114  0.031 -0.066  0.195 -0.16   0.147]
log        | [-0.093 -0.123  0.093  0.063 -0.131  0.081  0.123  0.154 -0.063  0.145]
cats       | [ 0.122  0.1    0.085  0.133 -0.161 -0.093  0.116 -0.143  0.147 -0.054]
dogs  