Assignment 5: Implementing the Continuous Bag of Words (CBOW) Model 
Implement the Continuous Bag of Words (CBOW) model.

a. Data preparation. 
b. Generate training data. 
c. Train the model. 
d. Output. 

In [1]:
#data preparation
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, Input, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.utils import set_random_seed, to_categorical
import tensorflow.keras.backend as K

In [2]:
# Sample corpus: four short English sentences that serve as the entire
# training text for this CBOW demonstration.
corpus = [
    "I love deep learning",
    "Deep learning loves me",
    "I enjoy machine learning",
    "Machine learning is amazing"
]

In [3]:
# Tokenize words
# Fit a Keras tokenizer on the corpus to obtain the word -> integer-id mapping,
# and build the reverse (id -> word) mapping from it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

word2id = tokenizer.word_index
id2word = dict(zip(word2id.values(), word2id.keys()))

# Keras word indices start at 1 (0 is reserved), so one extra slot is needed
# for every id in word2id to be a valid index into a vocab_size-long vector.
vocab_size = 1 + len(word2id)

In [4]:
# Number of words taken on each side of a target word when building contexts.
window_size = 2

# Convert corpus to sequences of word IDs (one list of ids per sentence).
sequences = tokenizer.texts_to_sequences(corpus)

In [5]:
# b. Generate Training Data
# For every position in every sentence, pair the word at that position (the
# target) with the up-to-`window_size` words on each side of it (the context).
# Each entry of `data` is a (context_ids, target_id) tuple.
data = []
for seq in sequences:
    for i, target in enumerate(seq):
        # Slices clamp at the sentence boundaries, so words near the edges
        # simply get a shorter context.
        left = seq[max(i - window_size, 0):i]
        right = seq[i + 1:i + window_size + 1]
        context = left + right
        # A one-word sentence has no neighbours; an empty context would turn
        # into an all-zero input row that carries no signal, so skip it.
        if context:
            data.append((context, target))

In [6]:
# Prepare input and output
# X: one row per training pair, holding how many times each vocabulary id
#    occurs in that pair's context (a bag-of-words count vector).
# y: the target word id of each pair, one-hot encoded over the vocabulary.
X = np.zeros((len(data), vocab_size))
target_ids = []
for row, (context_ids, target_id) in enumerate(data):
    for word_id in context_ids:
        X[row, word_id] += 1
    target_ids.append(target_id)

y = to_categorical(np.array(target_ids), num_classes=vocab_size)

In [8]:
# c. Train the Model
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling (and hence the learned embeddings) are
# repeatable when the notebook is re-run from a fresh kernel.
SEED = 42
set_random_seed(SEED)

embedding_dim = 10  # length of each learned word vector

# Network: context count vector -> linear projection of size embedding_dim
# -> softmax over the vocabulary predicting the target word.
inp = Input(shape=(vocab_size,))
hidden = Dense(embedding_dim, activation='linear')(inp)
output = Dense(vocab_size, activation='softmax')(hidden)

model = Model(inputs=inp, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print("Training CBOW model...")
# Keep the History object in a variable: it stays available for inspection and
# the cell no longer echoes its uninformative repr as output.
history = model.fit(X, y, epochs=100, verbose=0)

Training CBOW model...


<keras.src.callbacks.history.History at 0x1e055cb5e50>

In [9]:
# d. Output 
weights = model.get_weights()[0]
print("\nWord Embeddings (sample):\n")
for word, idx in word2id.items():
    print(f"{word:10s} → {weights[idx][:5]}")


Word Embeddings (sample):

learning   → [ 0.00464135 -0.05570217 -0.16432285  0.24950802  0.32383275]
i          → [-0.39216858  0.23683631  0.382338    0.04389751 -0.01643245]
deep       → [-0.03806103 -0.14604086  0.25951385  0.07641445  0.23680714]
machine    → [ 0.29325497 -0.04739945 -0.18001223 -0.45772907 -0.2553591 ]
love       → [ 0.2621176   0.02216213 -0.37858602  0.28329006  0.4511873 ]
loves      → [-0.50317806  0.12171028 -0.32249603 -0.11636546  0.06174104]
me         → [ 0.45435497 -0.3104306  -0.14612693 -0.61224675 -0.1365502 ]
enjoy      → [-0.50823635 -0.20408818 -0.22386348  0.07269451 -0.3263174 ]
is         → [-0.62297046 -0.6165733   0.614272    0.30192378 -0.46494496]
amazing    → [ 0.11767042 -0.43316394 -0.30302748  0.36006618  0.12427479]


In [10]:
from numpy.linalg import norm

def similar(word, topn=3):
    """Return the words whose embeddings are most cosine-similar to `word`.

    Embeddings are read from the module-level `weights` matrix, indexed by the
    ids stored in the module-level `word2id` mapping.

    Args:
        word: Query word; it must be a key of `word2id` to produce results.
        topn: Maximum number of neighbours to return (defaults to 3).

    Returns:
        A list of `(word, cosine_similarity)` tuples sorted by descending
        similarity, excluding the query word itself. An empty list is
        returned when `word` is not in `word2id`.
    """
    if word not in word2id:
        return []

    query = weights[word2id[word]]
    query_norm = np.linalg.norm(query)  # loop-invariant, computed once

    sims = {}
    for other, idx in word2id.items():
        if other == word:
            continue
        vec = weights[idx]
        denom = query_norm * np.linalg.norm(vec)
        # A zero-length vector has no direction: score it 0.0 instead of
        # evaluating 0/0, whose NaN result would make the sort order undefined.
        sims[other] = np.dot(query, vec) / denom if denom else 0.0

    ranked = sorted(sims.items(), key=lambda item: item[1], reverse=True)
    return ranked[:max(topn, 0)]

# Look up the nearest neighbours of "learning" in the trained embedding space.
neighbours = similar("learning")
print("\nSimilar words to 'learning':", neighbours)


Similar words to 'learning': [('deep', np.float32(0.52126825)), ('i', np.float32(0.26465905)), ('amazing', np.float32(0.071893506))]
