In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer

In [11]:
# Example textual document: a short three-line passage comparing how
# influenza and COVID-19 spread. It is the only text the tokenizer is fitted
# on, so the vocabulary and all training pairs come from this string alone.
text = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 
Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 
The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  """

# a) DATA PREPARATION
# Fit a Keras tokenizer on the single document to build the word -> id map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word_index = tokenizer.word_index
# Reverse lookup (id -> word), used later to decode predicted ids.
index_word = dict((idx, word) for word, idx in word_index.items())
# One slot more than the number of words: Keras word ids start at 1, so
# index 0 exists in the embedding/softmax but corresponds to no word.
vocab_size = 1 + len(word_index)

print("Vocabulary Size:", vocab_size)

# texts_to_sequences returns one id list per input text; exactly one was passed.
(sequence,) = tokenizer.texts_to_sequences([text])
print("Tokenized Sequence Sample:", sequence[:20])

Vocabulary Size: 103
Tokenized Sequence Sample: [1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23, 3, 44, 11, 24, 45, 46]


In [12]:
# b. Generate Training Data (CBOW)
# For every position with `window_size` tokens available on both sides, the
# surrounding token ids form the model input (context) and the token at that
# position is the label (target).

window_size = 2
context_width = 2 * window_size

# Centre positions that have a complete window on each side. The range is
# empty when the sequence is shorter than 2 * window_size + 1 tokens.
center_positions = range(window_size, len(sequence) - window_size)

# Slice the context by window_size rather than hard-coding the +/-1 and +/-2
# offsets, so changing window_size changes the context width consistently
# with the loop bounds.
X_train = np.array(
    [
        list(sequence[i - window_size:i]) + list(sequence[i + 1:i + window_size + 1])
        for i in center_positions
    ],
    dtype=int,
).reshape(-1, context_width)  # keeps a 2-D (0, context_width) shape when empty
y_train = np.array([sequence[i] for i in center_positions], dtype=int)

print("\nContexts:", X_train[:5])
print("Targets:", y_train[:5])


Contexts: [[ 1 38  8  9]
 [38  2  9 39]
 [ 2  8 39 40]
 [ 8  9 40 41]
 [ 9 39 41  2]]
Targets: [ 2  8  9 39 40]


In [13]:
# c. Train CBOW Model
# Architecture: context ids -> shared embedding -> mean over the context
# positions -> softmax over the vocabulary (predicts the centre word).

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable; without this the embeddings and predictions
# differ on every run of the notebook.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

embedding_dim = 8
# Take the context width from the training data instead of hard-coding 4,
# so the model always matches the pairs generated in the previous step.
context_len = X_train.shape[1]

input_layer = Input(shape=(context_len,))
emb = Embedding(vocab_size, embedding_dim)(input_layer)     # (batch, context_len, embedding_dim)
ctx_vector = GlobalAveragePooling1D()(emb)                  # (batch, embedding_dim): averaged context, i.e. CBOW
output_layer = Dense(vocab_size, activation='softmax')(ctx_vector)

cbow_model = Model(input_layer, output_layer)
# Targets are integer word ids (not one-hot), hence the sparse loss.
cbow_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)
print("\nCBOW Training Complete!")


CBOW Training Complete!


In [14]:
# d. Output – Show Word Embeddings

# In the functional model built above, layers[0] is the input layer and
# layers[1] is the Embedding layer; its first (only) weight is the
# (vocab_size, embedding_dim) lookup table.
embedding_layer = cbow_model.layers[1]
embedding_matrix = embedding_layer.get_weights()[0]

print("\nWord Embeddings:")
# Show the vectors of the first 15 vocabulary entries only.
for word in list(word_index)[:15]:
    print(word, "→", embedding_matrix[word_index[word]])


Word Embeddings:
the → [-0.12777582 -0.18269342  0.54291904  0.1741451   0.69108635  0.07480668
 -1.1810809   1.2484272 ]
of → [-0.6583774   0.79729635 -0.42675275 -0.81752974  0.7836856   0.27928406
  0.22826065  0.9524344 ]
influenza → [-0.84088594  0.22523804 -0.06240618  0.8385332  -0.2678879  -0.09841077
 -0.45663124 -0.20454293]
covid → [-1.2026259  -0.1838603   0.9550191   1.2049483   0.14064023  1.1465019
 -0.6935503  -1.0576988 ]
19 → [-1.1463654   0.6823199   0.39818513  1.0626802  -1.0415082   0.8855158
 -1.4066536  -0.2655181 ]
virus → [-0.8999294   0.417879   -0.01532158  0.73093164  0.1622683   0.919327
 -1.5176464  -0.16578682]
for → [-0.7755549  -0.0398257   0.9015526   1.2233901  -0.2459772   0.3613577
 -1.2156577  -0.35635933]
transmission → [-0.3408475   0.19490416  0.8438161  -0.58046335  0.21236275  0.875527
 -0.15226471  0.83671117]
is → [-0.05789379 -1.0906037  -0.46255812 -0.06941149  1.2127653   0.03675452
 -0.55367285  0.22381172]
to → [-0.18452455  1.0023977

In [15]:
def predict_missing(w1, w2, w3, w4):
    """Predict the missing centre word from its four surrounding words.

    The words are looked up in the module-level ``word_index`` and fed to
    ``cbow_model`` in the same order used for the training contexts: two
    words before the gap, then two words after it.

    Args:
        w1: Word two positions before the missing word.
        w2: Word immediately before the missing word.
        w3: Word immediately after the missing word.
        w4: Word two positions after the missing word.

    Returns:
        str: The predicted vocabulary word. If any context word is missing
        from ``word_index``, a ``"Word not found in vocabulary: ..."``
        message naming the first unknown word is returned instead.
    """
    try:
        context_ids = [word_index[word] for word in (w1, w2, w3, w4)]
    except KeyError as e:
        return f"Word not found in vocabulary: {e}"

    seq = np.array(context_ids).reshape(1, 4)
    pred = cbow_model.predict(seq, verbose=0)

    # The softmax has one output per index 0..vocab_size-1, but index 0 maps
    # to no word (word ids start at 1). Exclude it from the argmax so the
    # lookup below can never fail with KeyError: 0.
    word_scores = np.asarray(pred)[0, 1:]
    pred_id = int(np.argmax(word_scores)) + 1
    return index_word[pred_id]

In [21]:
print("\nPredictions:")
# Each tuple is (two words before the gap, two words after the gap).
for context_words in (
    ("the", "speed", "transmission", "is"),
    ("the", "serial", "for", "virus"),
):
    print(predict_missing(*context_words))


Predictions:
of
the
