In [1]:
# ---------------------------------------------------------
# EXPERIMENT 5 - Continuous Bag of Words (CBOW) with Keras
# ---------------------------------------------------------


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import re, os
from gensim.models import KeyedVectors

# Reproducibility: seed NumPy and TensorFlow with one shared, fixed value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# Step 2: Toy corpus (swap in your own sentences to experiment)

# One entry per sentence; joined with single spaces into the training paragraph.
paragraph = " ".join([
    "It is a pleasant day.",
    "I love to walk in the park.",
    "The birds are singing and the breeze is cool.",
    "Children are playing games with joy.",
    "Learning new things every day feels rewarding.",
])

print(paragraph)

It is a pleasant day. I love to walk in the park. The birds are singing and the breeze is cool. Children are playing games with joy. Learning new things every day feels rewarding.


In [3]:
# Step 3: Preprocess and tokenize

def simple_clean(text):
    """Normalize raw text before tokenization.

    Lowercases the input, removes every character that is not a letter,
    digit, whitespace, or period, then collapses each whitespace run to a
    single space and trims both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # Periods are kept (along with letters, digits, and whitespace).
    filtered = re.sub(r"[^a-zA-Z0-9\s\.]", "", lowered)
    return re.sub(r"\s+", " ", filtered).strip()

# Normalize the raw paragraph
cleaned = simple_clean(paragraph)

# Break into sentences on periods, discarding empty fragments
sentences = [piece for piece in map(str.strip, cleaned.split('.')) if piece]
print("Sentences:", sentences)

# Fit a word-level tokenizer on the sentences
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
# ID 0 is not assigned to any word, so one extra slot is reserved for padding
vocab_size = 1 + len(word_index)
print("Vocabulary size:", vocab_size)
print("Word index:", word_index)

# Map every sentence to its list of integer word IDs
seqs = tokenizer.texts_to_sequences(sentences)
print("Sequences:", seqs)

Sentences: ['it is a pleasant day', 'i love to walk in the park', 'the birds are singing and the breeze is cool', 'children are playing games with joy', 'learning new things every day feels rewarding']
Vocabulary size: 31
Word index: {'<OOV>': 1, 'the': 2, 'is': 3, 'day': 4, 'are': 5, 'it': 6, 'a': 7, 'pleasant': 8, 'i': 9, 'love': 10, 'to': 11, 'walk': 12, 'in': 13, 'park': 14, 'birds': 15, 'singing': 16, 'and': 17, 'breeze': 18, 'cool': 19, 'children': 20, 'playing': 21, 'games': 22, 'with': 23, 'joy': 24, 'learning': 25, 'new': 26, 'things': 27, 'every': 28, 'feels': 29, 'rewarding': 30}
Sequences: [[6, 3, 7, 8, 4], [9, 10, 11, 12, 13, 2, 14], [2, 15, 5, 16, 17, 2, 18, 3, 19], [20, 5, 21, 22, 23, 24], [25, 26, 27, 28, 4, 29, 30]]


In [4]:
def cbow_training_pairs(seqs, window_size, vocab_size):
    """Build CBOW (context -> target) training arrays from token-ID sequences.

    For every non-zero token in every sequence, the up-to-``window_size``
    neighbours on each side form the context and the token itself is the
    target. Contexts shorter than ``2 * window_size`` (near sequence edges or
    in short sequences) are filled with the padding ID 0.

    Args:
        seqs: Iterable of sequences of integer token IDs. ID 0 is treated as
            padding and is never used as a target.
        window_size: Number of context tokens taken on each side of the target.
        vocab_size: Number of classes used for the one-hot targets.

    Returns:
        A tuple ``(X, y)``. ``X`` is an integer array of shape
        ``(num_pairs, 2 * window_size)`` holding context IDs (also when there
        are no pairs at all). ``y`` is the result of ``to_categorical`` on the
        target IDs with ``num_classes=vocab_size``.
    """
    total_len = 2 * window_size  # left + right context
    contexts = []
    targets = []

    for text in seqs:
        L = len(text)
        for idx, word in enumerate(text):
            if word == 0:
                continue

            start = max(0, idx - window_size)
            end = min(L, idx + window_size + 1)

            # Collect context words (exclude the target position itself).
            context = [text[i] for i in range(start, end) if i != idx]

            # The slice spans at most window_size tokens per side, so the
            # context can only be too short, never too long: pad, never trim.
            # Padding is split evenly between both ends (extra zero on the
            # right); it is not aligned to the side that lacks neighbours.
            missing = total_len - len(context)
            pad_left = missing // 2
            pad_right = missing - pad_left
            context = [0] * pad_left + context + [0] * pad_right

            contexts.append(context)
            targets.append(word)

    # Explicit dtype and shape keep X a 2-D integer array even when no pairs
    # were produced (a bare np.array([]) would be 1-D float).
    X = np.array(contexts, dtype=int).reshape(len(contexts), total_len)
    y = to_categorical(np.array(targets, dtype=int), num_classes=vocab_size)
    return X, y

In [8]:
# Step 5: Build CBOW model
window_size = 2                    # context words taken on each side of the target
embed_dim = 50                     # size of each learned word vector
context_len = 2 * window_size      # total number of context positions per sample

model = Sequential([
    Input(shape=(context_len,)),
    # One trainable vector per token ID (ID 0, used for padding, included).
    Embedding(input_dim=vocab_size, output_dim=embed_dim, name="embedding"),
    # Average the context embeddings over the context axis (axis 1).
    # A built-in pooling layer is used instead of a Lambda wrapping a Python
    # lambda so the model stays serializable; the computation is the same mean.
    # NOTE: padded positions (ID 0) are embedded and averaged like real tokens.
    tf.keras.layers.GlobalAveragePooling1D(name="context_mean"),
    # Predict the target word as a distribution over the vocabulary.
    Dense(vocab_size, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()




In [9]:
# Generate (context, target) training pairs from the tokenized sentences
window_size = 2
X, y = cbow_training_pairs(seqs, window_size=window_size, vocab_size=vocab_size)
print("X shape:", X.shape, "y shape:", y.shape)

X shape: (34, 4) y shape: (34, 31)


In [10]:
# Train the CBOW model on the (context, target) pairs; per-epoch metrics are kept in `history`
history = model.fit(
    X,
    y,
    batch_size=32,
    epochs=200,
    verbose=1,
)

Epoch 1/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 103ms/step - accuracy: 0.0294 - loss: 3.4357
Epoch 2/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.0882 - loss: 3.4293 
Epoch 3/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.1176 - loss: 3.4243
Epoch 4/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.1176 - loss: 3.4194
Epoch 5/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.1176 - loss: 3.4145
Epoch 6/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.1176 - loss: 3.4097
Epoch 7/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.1471 - loss: 3.4049
Epoch 8/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.1765 - loss: 3.4001
Epoch 9/200
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [11]:
# Step 7: Save word vectors (word2vec text format: "<count> <dim>" header, then one word per line)

weights = model.get_layer("embedding").get_weights()[0]

# Export every tokenizer entry whose ID has a row in the embedding matrix,
# ordered by ID. ID 0 (padding) has no word and is skipped; the tokenizer's
# "<OOV>" placeholder is part of word_index and is exported like any other word.
items = sorted(
    ((w, idx) for w, idx in word_index.items() if idx != 0 and idx < weights.shape[0]),
    key=lambda t: t[1],
)
num_vectors = len(items)

# Read the dimension from the trained matrix rather than the `embed_dim`
# global, so the header always matches the rows actually written even if the
# global was changed after the model was built.
vector_dim = weights.shape[1]

with open("vectors.txt", "w", encoding="utf-8") as f:
    f.write(f"{num_vectors} {vector_dim}\n")
    for word, idx in items:
        vec = " ".join(map(str, weights[idx].tolist()))
        f.write(f"{word} {vec}\n")

print("vectors.txt saved successfully!")

vectors.txt saved successfully!


In [12]:
# Step 8: Load the exported vectors with Gensim and inspect nearest neighbours

cbow_vectors = KeyedVectors.load_word2vec_format("vectors.txt", binary=False)
print("Words in vocab:", cbow_vectors.index_to_key)

probe_words = ["day", "park", "children", "learning"]
for w in probe_words:
    # Skip probes that are not present in the loaded vocabulary.
    if w not in cbow_vectors:
        continue
    print(f"\nMost similar to '{w}':")
    print(cbow_vectors.most_similar(positive=[w], topn=5))

Words in vocab: ['<OOV>', 'the', 'is', 'day', 'are', 'it', 'a', 'pleasant', 'i', 'love', 'to', 'walk', 'in', 'park', 'birds', 'singing', 'and', 'breeze', 'cool', 'children', 'playing', 'games', 'with', 'joy', 'learning', 'new', 'things', 'every', 'feels', 'rewarding']

Most similar to 'day':
[('feels', 0.9427475333213806), ('new', 0.9356144666671753), ('things', 0.8961781859397888), ('love', 0.04066450148820877), ('learning', 0.028660761192440987)]

Most similar to 'park':
[('walk', 0.7422353625297546), ('birds', 0.514449417591095), ('in', 0.42957285046577454), ('and', 0.4218091368675232), ('to', 0.38650864362716675)]

Most similar to 'children':
[('games', 0.7136662006378174), ('birds', 0.6096868515014648), ('and', 0.574961245059967), ('singing', 0.42470690608024597), ('are', 0.3531719744205475)]

Most similar to 'learning':
[('every', 0.714691698551178), ('i', 0.2033466398715973), ('with', 0.15126410126686096), ('to', 0.1130979135632515), ('love', 0.11085328459739685)]
