In [None]:
'''
Implement the Continuous Bag of Words (CBOW) Model for the given (textual 
document 1) using the below steps: 
a. Data preparation 
b. Generate training data 
c. Train model 
d. Output 
'''

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [16]:
# Load the raw corpus from disk.
with open("CBOW.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Fit a Keras Tokenizer on the corpus. Word ids start at 1; id 0 is left free
# and is treated as the PAD slot throughout this notebook.
corpus = [text]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# The corpus is a single document, so keep its one id sequence as a flat list.
sequences = tokenizer.texts_to_sequences(corpus)[0]
word_index = tokenizer.word_index

# Vocabulary size includes the extra PAD=0 slot.
total_vocab = 1 + len(word_index)

print("Total tokens: ", len(sequences))
print("Total vocabulary (incl. PAD=0): ", total_vocab)

Total tokens:  198
Total vocabulary (incl. PAD=0):  103


In [17]:
# Generate CBOW training data
window = 1                 # number of context words taken on EACH side of the center word
context_len = 2 * window   # fixed context length (left + right) -> 2 ids when window = 1

X_list, y_list = [], []

# Only centers with a complete window on both sides are visited, so every
# context slice already holds exactly `context_len` ids; no padding is needed.
for i in range(window, len(sequences) - window):
    center = sequences[i]
    context = sequences[i - window:i] + sequences[i + 1:i + 1 + window]  # context_len ids
    X_list.append(context)
    y_list.append(center)

X = np.array(X_list, dtype="int32")  # shape: (N, context_len)
y = to_categorical(y_list, num_classes=total_vocab)  # one-hot targets, shape: (N, total_vocab)
print("Training shapes: ", X.shape, y.shape)

Training shapes:  (196, 2) (196, 103)


In [18]:
# Build CBOW model
embedding_dim = 64

model = Sequential([
    # Explicit input spec instead of Embedding(input_length=...), which is
    # deprecated in Keras 3. It also builds the model so summary() works.
    tf.keras.Input(shape=(context_len,), dtype="int32"),
    # One trainable vector per vocabulary id (row 0 belongs to the PAD id).
    Embedding(input_dim=total_vocab, output_dim=embedding_dim),
    # Average the context embeddings over the sequence axis. Same result as
    # Lambda(tf.reduce_mean(axis=1)) but serializable, since no Python lambda
    # is stored in the model.
    tf.keras.layers.GlobalAveragePooling1D(),
    # Predict the center word id as a distribution over the vocabulary.
    Dense(total_vocab, activation="softmax"),
])

model.compile(
    loss="categorical_crossentropy",  # targets are one-hot encoded
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

In [19]:
# Fit the CBOW model on the (context -> center word) pairs without per-epoch logs.
fit_options = dict(epochs=300, batch_size=32, verbose=0)
history = model.fit(X, y, **fit_options)

# Report the training accuracy recorded for the last epoch.
final_accuracy = history.history["accuracy"][-1]
print("Final train accuracy: ", float(final_accuracy))

Final train accuracy:  0.9642857313156128


In [20]:
# Export vectors (word2vec text format)
weights = model.get_weights()[0]  # embedding matrix, (total_vocab, embedding_dim); row 0 is PAD

with open("vectors.txt", "w", encoding="utf-8") as f:
    # The header is "<number of vectors> <dimension>". Only real words are
    # written (PAD=0 is skipped), so the count must be len(word_index), not
    # total_vocab; a mismatched count makes word2vec loaders reject the file.
    f.write(f"{len(word_index)} {embedding_dim}\n")
    for w, idx in word_index.items():
        vec_str = " ".join(map(str, weights[idx]))
        f.write(f"{w} {vec_str}\n")

# Manual cosine similarity
def most_similar(word, top_n=6, vocab=None, vectors=None):
    """Return the `top_n` vocabulary words closest to `word` by cosine similarity.

    Parameters
    ----------
    word : str
        Query word. An exact match is tried first; if that fails, the
        lowercased form is tried, because the tokenizer lowercases the corpus.
    top_n : int
        Maximum number of (word, similarity) pairs to return.
    vocab : dict[str, int] | None
        Mapping word -> row index into `vectors`. Defaults to the notebook's
        global `word_index`.
    vectors : sequence of vectors | None
        Embedding rows indexed by the ids in `vocab`. Defaults to the
        notebook's global `weights`.

    Returns
    -------
    list[tuple[str, float]]
        Pairs sorted by decreasing similarity; an empty list when the word is
        not in the vocabulary. The query word itself is excluded.
    """
    if vocab is None:
        vocab = word_index
    if vectors is None:
        vectors = weights

    key = word
    if key not in vocab and isinstance(word, str):
        key = word.lower()
    if key not in vocab:
        return []

    # Query vector and its norm do not change per candidate: compute them once.
    query = np.asarray(vectors[vocab[key]], dtype=np.float64)
    query_norm = np.linalg.norm(query)

    sims = {}
    for other, idx in vocab.items():
        if other == key:
            continue
        candidate = np.asarray(vectors[idx], dtype=np.float64)
        # Small epsilon guards against division by zero for all-zero vectors.
        denom = query_norm * np.linalg.norm(candidate) + 1e-8
        sims[other] = float(np.dot(query, candidate) / denom)
    return sorted(sims.items(), key=lambda kv: kv[1], reverse=True)[:top_n]

# Show the nearest neighbours of a few probe words from the corpus.
for probe in ["learning", "virus", "influenza", "transmission"]:
    # Balanced quotes around the probe word (previously printed ''word').
    print(f"\nMost similar to '{probe}': ", most_similar(probe))


Most similar to ''learning':  [('there', 0.9801399027934891), ('means', 0.6369313552773362), ('very', 0.4358067756118927), ('people', 0.3086252476220105), ('in', 0.2862352331208635), ('we', 0.27965548778781923)]

Most similar to ''virus':  [('period', 0.5114822494138954), ('in', 0.4579066707772553), ('two', 0.41254897584107275), ('before', 0.32313307208814224), ('interval', 0.30888567090308644), ('appearance', 0.2986814000507786)]

Most similar to ''influenza':  [('estimates', 0.5442079654017228), ('both', 0.38889310546566513), ('transmission', 0.3170994295129359), ('2', 0.2720643764754481), ('covid', 0.2622601310106544), ('19', 0.25713373143525736)]

Most similar to ''transmission':  [('appearance', 0.5075881774317195), ('interval', 0.4257374091929768), ('–transmission', 0.419810425889036), ('secondary', 0.3590000168346107), ('illness', 0.34482362424811824), ('influenza', 0.3170994295129359)]
