In [219]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [224]:
with open("CBOW.txt", "r", encoding="utf-8") as f:
    text = f.read()

text = re.sub(r"[^a-zA-Z\s]", "", text.lower()).strip()
    
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
tokens = tokenizer.texts_to_sequences([text])[0]
word_index = tokenizer.word_index
index_word = {i: w for w, i in word_index.items()}
vocab_size = len(word_index) + 1

print("Total tokens: ", len(sequences))
print("Total vocabulary: ", vocab_size)

Total tokens:  177
Total vocabulary:  93


In [225]:
window = 1
context_len = 2 * window

X_list, y_list = [], []

for i in range(window, len(tokens) - window):
    context = tokens[i-window:i] + tokens[i+1:i+1+window]
    context = pad_sequences([context], maxlen=context_len)[0]
    X_list.append(context)
    y_list.append(tokens[i])

X = np.array(X_list, dtype="int32")
y = to_categorical(y_list, num_classes=vocab_size)
print(X.shape, y.shape)

(175, 2) (175, 93)


In [227]:
embed_dim = 50
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=context_len),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(vocab_size, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.fit(X, y, epochs=300, batch_size=32, verbose=2)

Epoch 1/300
6/6 - 1s - 217ms/step - accuracy: 0.0286 - loss: 4.5321
Epoch 2/300
6/6 - 0s - 11ms/step - accuracy: 0.0743 - loss: 4.5179
Epoch 3/300
6/6 - 0s - 16ms/step - accuracy: 0.1257 - loss: 4.5059
Epoch 4/300
6/6 - 0s - 14ms/step - accuracy: 0.2171 - loss: 4.4939
Epoch 5/300
6/6 - 0s - 13ms/step - accuracy: 0.2971 - loss: 4.4816
Epoch 6/300
6/6 - 0s - 13ms/step - accuracy: 0.3314 - loss: 4.4690
Epoch 7/300
6/6 - 0s - 12ms/step - accuracy: 0.3486 - loss: 4.4559
Epoch 8/300
6/6 - 0s - 12ms/step - accuracy: 0.3714 - loss: 4.4422
Epoch 9/300
6/6 - 0s - 15ms/step - accuracy: 0.3886 - loss: 4.4278
Epoch 10/300
6/6 - 0s - 12ms/step - accuracy: 0.4114 - loss: 4.4126
Epoch 11/300
6/6 - 0s - 12ms/step - accuracy: 0.4343 - loss: 4.3964
Epoch 12/300
6/6 - 0s - 13ms/step - accuracy: 0.4514 - loss: 4.3793
Epoch 13/300
6/6 - 0s - 10ms/step - accuracy: 0.4571 - loss: 4.3610
Epoch 14/300
6/6 - 0s - 13ms/step - accuracy: 0.4743 - loss: 4.3417
Epoch 15/300
6/6 - 0s - 13ms/step - accuracy: 0.4686 - l

Epoch 122/300
6/6 - 0s - 11ms/step - accuracy: 0.9143 - loss: 0.6531
Epoch 123/300
6/6 - 0s - 11ms/step - accuracy: 0.9143 - loss: 0.6426
Epoch 124/300
6/6 - 0s - 10ms/step - accuracy: 0.9200 - loss: 0.6325
Epoch 125/300
6/6 - 0s - 9ms/step - accuracy: 0.9200 - loss: 0.6225
Epoch 126/300
6/6 - 0s - 10ms/step - accuracy: 0.9200 - loss: 0.6129
Epoch 127/300
6/6 - 0s - 10ms/step - accuracy: 0.9200 - loss: 0.6035
Epoch 128/300
6/6 - 0s - 10ms/step - accuracy: 0.9143 - loss: 0.5943
Epoch 129/300
6/6 - 0s - 9ms/step - accuracy: 0.9143 - loss: 0.5853
Epoch 130/300
6/6 - 0s - 9ms/step - accuracy: 0.9143 - loss: 0.5766
Epoch 131/300
6/6 - 0s - 10ms/step - accuracy: 0.9143 - loss: 0.5681
Epoch 132/300
6/6 - 0s - 10ms/step - accuracy: 0.9143 - loss: 0.5598
Epoch 133/300
6/6 - 0s - 9ms/step - accuracy: 0.9143 - loss: 0.5517
Epoch 134/300
6/6 - 0s - 8ms/step - accuracy: 0.9143 - loss: 0.5438
Epoch 135/300
6/6 - 0s - 9ms/step - accuracy: 0.9200 - loss: 0.5361
Epoch 136/300
6/6 - 0s - 9ms/step - accu

Epoch 242/300
6/6 - 0s - 11ms/step - accuracy: 0.9371 - loss: 0.2099
Epoch 243/300
6/6 - 0s - 9ms/step - accuracy: 0.9371 - loss: 0.2089
Epoch 244/300
6/6 - 0s - 9ms/step - accuracy: 0.9371 - loss: 0.2078
Epoch 245/300
6/6 - 0s - 8ms/step - accuracy: 0.9371 - loss: 0.2068
Epoch 246/300
6/6 - 0s - 10ms/step - accuracy: 0.9371 - loss: 0.2057
Epoch 247/300
6/6 - 0s - 9ms/step - accuracy: 0.9371 - loss: 0.2047
Epoch 248/300
6/6 - 0s - 8ms/step - accuracy: 0.9371 - loss: 0.2037
Epoch 249/300
6/6 - 0s - 11ms/step - accuracy: 0.9371 - loss: 0.2027
Epoch 250/300
6/6 - 0s - 9ms/step - accuracy: 0.9371 - loss: 0.2017
Epoch 251/300
6/6 - 0s - 11ms/step - accuracy: 0.9371 - loss: 0.2008
Epoch 252/300
6/6 - 0s - 10ms/step - accuracy: 0.9371 - loss: 0.1998
Epoch 253/300
6/6 - 0s - 9ms/step - accuracy: 0.9371 - loss: 0.1989
Epoch 254/300
6/6 - 0s - 10ms/step - accuracy: 0.9371 - loss: 0.1980
Epoch 255/300
6/6 - 0s - 10ms/step - accuracy: 0.9371 - loss: 0.1970
Epoch 256/300
6/6 - 0s - 8ms/step - accur

<keras.src.callbacks.history.History at 0x1fde58bdbb0>

In [229]:
emb_matrix = model.get_layer("embedding_4").get_weights()[0]

def predict_word(context_words):
    ids = [word_index.get(w, 0) for w in context_words[:context_len]]
    ids = pad_sequences([ids], maxlen=context_len)[0]
    probs = model.predict(np.array([ids]), verbose=0)[0]
    idx = int(np.argmax(probs))
    return index_word.get(idx, "<unk>"), float(probs[idx])

def most_similar(word, top_n=5):
    if word not in word_index: return []
    q = emb_matrix[word_index[word]]
    sims = []
    for w, i in word_index.items():
        if w==word: continue
        sims.append((w, float(np.dot(q, emb_matrix[i]) / (np.linalg.norm(q)*np.linalg.norm(emb_matrix[i]) + 1e-9))))
    return sorted(sims, key=lambda x:-x[1])[:top_n]

print("Predict example:")
ctx = ["median", "period"]
print("Context: ", ctx, "-> Predicted: ", predict_target_word(ctx))

print("\nMost similar to 'virus': ", most_similar("virus"))

Predict example:
Context:  ['median', 'period'] -> Predicted:  ('incubation', 0.9665399789810181)

Most similar to 'virus':  [('both', 0.5945498224649076), ('before', 0.3989975552868747), ('interval', 0.3969363815992646), ('reproductive', 0.39251266033199345), ('first', 0.3919351198677229)]
