In [1]:
import numpy as np
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:

# Input corpus for the CBOW demo.
# Swap the middle line for your own textual document (document 2).
text = (
    "\n"
    "machine learning is a subset of artificial intelligence and it enables computers to learn from data"
    "\n"
)
# Your document 2 goes in the string above.


In [3]:
# Build the vocabulary from the document and encode it as integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# word -> id mapping learned by the tokenizer, plus the inverse id -> word lookup.
word_index = tokenizer.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))
# One extra slot: the printed sequence shows word ids starting at 1, so id 0 is unused.
vocab_size = 1 + len(word_index)

# texts_to_sequences returns one id list per input text; exactly one text is passed in.
encoded_texts = tokenizer.texts_to_sequences([text])
sequence = encoded_texts[0]

print("Vocabulary size:", vocab_size)
print("Sequence:", sequence)

Vocabulary size: 17
Sequence: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [4]:
# b. Generate Training Data (CBOW)
#
# For every position that has a full window on both sides, the target is the
# centre word id and the context is the `window_size` ids before it followed
# by the `window_size` ids after it.

window_size = 2  # number of context words taken on EACH side of the target
assert window_size >= 1, "window_size must be at least 1"

X_train = []
y_train = []

for i in range(window_size, len(sequence) - window_size):
    # Slice by window_size (not fixed +/-1, +/-2 offsets) so the context width
    # always follows the setting above.
    context = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
    target = sequence[i]

    X_train.append(context)
    y_train.append(target)

# reshape keeps X_train 2-D (n_samples, 2 * window_size) even when the text is
# too short to yield any sample, instead of collapsing to shape (0,).
X_train = np.array(X_train, dtype=int).reshape(-1, 2 * window_size)
y_train = np.array(y_train, dtype=int)

print("\nContext samples:\n", X_train[:5])
print("\nTarget samples:\n", y_train[:5])


Context samples:
 [[1 2 4 5]
 [2 3 5 6]
 [3 4 6 7]
 [4 5 7 8]
 [5 6 8 9]]

Target samples:
 [3 4 5 6 7]


In [5]:
# c. Train CBOW Model
#
# Architecture: context word ids -> Embedding -> linear Dense applied to each
# context position -> Flatten (concatenates the per-position vectors) ->
# softmax over the vocabulary to predict the centre word.
# Note: the context vectors are concatenated here, not averaged as in the
# classic CBOW formulation, so the model is sensitive to context word order.

SEED = 42
embedding_dim = 8
# Take the input width from the training data instead of hard-coding 4, so the
# model stays in sync with whatever window size produced X_train.
context_size = X_train.shape[1]

# Seeds Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling (and therefore the learned embeddings) are repeatable across runs.
set_random_seed(SEED)

input_layer = Input(shape=(context_size,))
embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)

hidden = Dense(embedding_dim, activation="linear")(embedding_layer)
hidden = Flatten()(hidden)

output_layer = Dense(vocab_size, activation="softmax")(hidden)

cbow_model = Model(inputs=input_layer, outputs=output_layer)
# Sparse loss because y_train holds integer word ids, not one-hot vectors.
cbow_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)

print("\nCBOW Model Training Complete!")


CBOW Model Training Complete!


In [6]:
# d. Output – Word Embeddings

# Locate the Embedding layer by type rather than by position so this keeps
# working if layers are added to or reordered in the model definition.
# (Embedding is already imported in the notebook's import cell.)
embedding_layer_obj = next(
    (layer for layer in cbow_model.layers if isinstance(layer, Embedding)),
    None,
)
if embedding_layer_obj is None:
    raise ValueError("cbow_model has no Embedding layer to read weights from")

# The first weight array of the Embedding layer is the embedding matrix;
# row `idx` is the learned vector for the word whose id is `idx`.
weights = embedding_layer_obj.get_weights()[0]

print("\nWord Embeddings:")
for word, idx in word_index.items():
    print(f"{word} → {weights[idx]}")


Word Embeddings:
machine → [ 0.15624414  0.26478443 -0.17034341  0.23046717 -0.23017046 -0.27309552
 -0.19606149  0.2510501 ]
learning → [ 0.04472045  0.16956355  0.12970538  0.28957585 -0.06256787 -0.19760543
 -0.31120324  0.35366547]
is → [-0.26839644  0.3205866   0.22506742  0.29706118  0.31188262 -0.27621716
 -0.18323728  0.10514062]
a → [-0.29228798  0.31464186  0.41193384 -0.00242852 -0.03171038 -0.12410478
 -0.01203216  0.23001714]
subset → [ 0.29738575  0.03909953 -0.13457917 -0.27092776 -0.44089073  0.01929818
  0.02004438  0.05644285]
of → [ 0.04758733 -0.26365364 -0.39087546  0.28449428  0.35043016 -0.00595682
 -0.23148149  0.16824238]
artificial → [-0.25243768  0.21878526  0.25234157  0.38008127 -0.05254502 -0.22115992
 -0.38507813  0.38680592]
intelligence → [-0.04244814  0.28553507  0.39681202 -0.36581567 -0.31025442 -0.05521359
  0.10411581  0.24170245]
and → [ 0.08839057  0.06284437 -0.19359687  0.1306285   0.35451904 -0.3413426
 -0.3774453  -0.3080955 ]
it → [ 0.37067