In [None]:
'''
Implement the Continuous Bag of Words (CBOW) Model for the given (textual 
document 1) using the below steps: 
a. Data preparation 
b. Generate training data 
c. Train model 
d. Output 
'''

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [8]:
# Read and tokenize
with open("CBOW.txt", "r", encoding="utf-8") as f:
    text = f.read()
    
# Keras tokenizer: lowercases, splits, builds word index (index starts at 1; 0 is padding)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0] # single list of word indices
word_index = tokenizer.word_index
total_vocab = len(word_index) + 1 # +1 for PAD=0

print("Total tokens: ", len(sequences))
print("Total vocabulary (incl. PAD=0): ", total_vocab)

Total tokens:  198
Total vocabulary (incl. PAD=0):  103


In [9]:
# Generate CBOW training data
window = 1                 # 2 left + 2 right context
context_len = 2 * window   # fixed context length

X_list, y_list = [], []

for i in range(window, len(sequences) - window):
    center = sequences[i]
    context = sequences[i-window:i] + sequences[i+1:i+1+window] # 4 context ids
    # pad (left) just in case, and ensure fixed length
    context = pad_sequences([context], maxlen=context_len)[0]
    X_list.append(context)
    y_list.append(center)

X = np.array(X_list, dtype="int32") # shape: (N, 4)
y = to_categorical(y_list, num_classes=total_vocab) # one-hot targets (N, V)
print("Training shapes: ", X.shape, y.shape)

Training shapes:  (196, 2) (196, 103)


In [10]:
# Build CBOW model
embedding_dim = 64

model = Sequential([
    Embedding(input_dim=total_vocab, output_dim=embedding_dim, input_length=context_len),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)), # average context embeddings
    Dense(total_vocab, activation="softmax") # predict center word id
])

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

In [11]:
# Train
history = model.fit(
    X, y,
    epochs=300,
    batch_size=32, 
    verbose=0
)

print("Final train accuracy: ", float(history.history["accuracy"][-1]))

Final train accuracy:  0.9642857313156128


In [12]:
# Export vectors (word2vec text format)
weights = model.get_weights()[0] # (V, embedding_dim); row 0 is pad

with open("vectors.txt", "w", encoding="utf-8") as f:
    f.write(f"{total_vocab} {embedding_dim}\n")
    # write only real words (skip PAD=0)
    for w, idx in word_index.items():
        vec_str = " ".join(map(str, weights[idx]))
        f.write(f"{w} {vec_str}\n")

# Manual cosine similarity 
def most_similar(word, top_n=6):
    if word not in word_index:
        return []
    idx = word_index[word]
    vec = weights[idx]
    sims = {}
    for w, i in word_index.items():
        if w == word: continue
        denom = (np.linalg.norm(vec) * np.linalg.norm(weights[i]) + 1e-8)
        sims[w] = float(np.dot(vec, weights[i]) / denom)
    return sorted(sims.items(), key=lambda kv: kv[1], reverse=True)[:top_n]

for probe in ["learning", "virus", "influenza", "transmission"]:
    print(f"\nMost similar to ''{probe}': ", most_similar(probe))


Most similar to ''learning':  [('there', 0.9825371525823378), ('means', 0.6920809397187476), ('very', 0.5183119353320393), ('people', 0.5047735979160014), ('we', 0.38309854851599107), ('does', 0.2771236800716859)]

Most similar to ''virus':  [('in', 0.4600843008076739), ('period', 0.41353458840682117), ('two', 0.3986613986036858), ('before', 0.37503843693646094), ('first', 0.3554624984275816), ('viruses', 0.32450863382232215)]

Most similar to ''influenza':  [('estimates', 0.5185747235023264), ('2', 0.34873342917122147), ('context', 0.3370020688461078), ('transmission', 0.32565163545720005), ('two', 0.3066692960726644), ('both', 0.29584410073562867)]

Most similar to ''transmission':  [('appearance', 0.5309132338943533), ('interval', 0.4245929351111452), ('–transmission', 0.3680599658953909), ('illness', 0.3359019617462476), ('contrast', 0.3275844772415228), ('speed', 0.3266145786755268)]


In [14]:
# --- minimal add-on: index_word + predict_target_word() ---
# create reverse map (id -> word) from your tokenizer's word_index
index_word = {i: w for w, i in word_index.items()}

def predict_target_word(context_words):
    """
    Predict the center word given a list of context words.
    - context_words: list of strings (length <= context_len).
      For WINDOW=1, provide two words (left, right). Order matters.
    - returns: (predicted_word, probability)
    """
    # convert context words to ids (unknown -> 0 = PAD)
    ids = [word_index.get(w, 0) for w in context_words[:context_len]]
    # pad/truncate to fixed context_len
    ids = pad_sequences([ids], maxlen=context_len)[0]
    # predict (model expects integer ids input shaped like (1, context_len))
    probs = model.predict(np.array([ids]), verbose=0)[0]
    pred_idx = int(np.argmax(probs))
    return index_word.get(pred_idx, "<unk>"), float(probs[pred_idx])

# --- Example usage (run after training) ---
example_context = ["median", "period"]   # for WINDOW=1 use two words (left then right)
predicted_word, probability = predict_target_word(example_context)
print("Context:", example_context)
print("Predicted center word:", predicted_word, f"(prob={probability:.4f})")


Context: ['median', 'period']
Predicted center word: incubation (prob=0.9743)
