In [22]:
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

In [23]:
# Read the entire corpus file into a single string and echo it for inspection.
with open("dataset/CBOW(Ass5)/CBOW.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

print(text)

The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 

In [24]:
# Tokenize text: fit a vocabulary on the corpus and encode the corpus as word ids.

tok = Tokenizer()
tok.fit_on_texts([text])

# Lookup tables in both directions: word -> id and id -> word.
word_index = tok.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))

# The corpus is passed as one document, so keep the first (only) id sequence.
seq = tok.texts_to_sequences([text])[0]

# One slot more than the number of words, so the largest word id is still a
# valid index (the ids in the printed sequence start at 1, not 0).
vocab_size = len(word_index) + 1

print("Vocabulary size:", vocab_size)
print("Sequences:", seq)

Vocabulary size: 103
Sequences: [1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23, 3, 44, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 5, 6, 1, 15, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 29, 20, 30, 53, 31, 3, 32, 54, 55, 17, 4, 5, 56, 8, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 7, 3, 33, 65, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 2, 8, 1, 83, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 12, 37, 19, 7, 4, 5, 6, 91, 17, 7, 3, 92, 93, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100, 101, 102]


In [25]:
# GENERATE TRAINING DATA (CBOW)
#
# For every position that has `window` words on both sides, the input is the
# surrounding 2 * window word ids and the target is the word id in the middle.
# Positions closer than `window` to either end of the sequence are skipped.

window = 2
centers = range(window, len(seq) - window)

x = [seq[c - window:c] + seq[c + 1:c + window + 1] for c in centers]
y = [seq[c] for c in centers]

In [26]:
# Convert the context/target lists to NumPy arrays and preview the first rows.
x, y = np.array(x), np.array(y)

print("\nContext samples:\n", x[0:5])
print("\nTarget samples:\n", y[0:5])


Context samples:
 [[ 1 38  8  9]
 [38  2  9 39]
 [ 2  8 39 40]
 [ 8  9 40 41]
 [ 9 39 41  2]]

Target samples:
 [ 2  8  9 39 40]


In [27]:
# TRAIN CBOW MODEL
#
# CBOW architecture:
#   1. Embedding              - one trainable vector per word id.
#   2. GlobalAveragePooling1D - averages the context-word vectors over the
#                               sequence axis (same result as a mean over
#                               axis=1). The built-in layer is used instead of
#                               a Lambda wrapper so the model stays serialisable
#                               and does not depend on Keras backend helpers.
#   3. Dense + softmax        - scores every id in [0, vocab_size) as the
#                               candidate centre word.

embedding_dim = 8  # size of each learned word vector

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(vocab_size, activation='softmax'),
])

In [28]:
# Targets are plain integer word ids, hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Fit silently on the (context, target) pairs.
model.fit(x, y, epochs=200, verbose=0)

model.summary()
print("Model Trained")

Model Trained


In [30]:
# Weight matrix of the model's first layer; rows are looked up by word id below.
emb = model.layers[0].get_weights()[0]

print("\n Word Embedding")
for word in word_index:
    idx = word_index[word]
    print(f"{word} --> {emb[idx]}")


 Word Embedding
the --> [-0.9554629   0.7423102  -0.9001153  -0.36666992 -0.4878943   0.5838243
  0.9133314   0.9924052 ]
of --> [-0.9681605   0.31655404 -0.64119965 -0.0829096   0.19651125  1.0559859
  0.2937089  -0.93572366]
influenza --> [-0.16699165  0.8529881  -0.46283615 -0.00659552  0.71289563 -0.49013436
 -0.29138064  0.96847934]
covid --> [-0.82776356  1.052623   -1.1652381  -0.75029945  0.05280452  0.10206936
 -1.6513042   0.35223618]
19 --> [-1.2217395  -0.20738818 -1.3668352  -1.2556393   0.08545034  0.6325189
 -1.3685169  -1.5409791 ]
virus --> [-9.3321311e-01  3.7536308e-02 -1.1754440e+00 -1.3747227e+00
 -9.9219602e-01  1.1100518e+00  1.2814663e-03 -1.4356995e-02]
for --> [-0.70713216 -0.05229075 -1.3437215  -0.77843946 -0.3656204  -0.35689586
 -0.57592374  0.01130927]
transmission --> [-0.95630133  1.1458284  -0.08716225 -1.1939806  -0.08574095  0.7594289
  0.5343564   0.3544207 ]
is --> [-0.43379596  0.23533723 -0.03473659 -0.11649913 -1.1979414   0.8112048
 -0.0362157

In [31]:
def predict_missing(w1, w2, w3, w4):
    """Predict the centre word for four context words with the trained model.

    Args:
        w1, w2: The two words to the left of the missing word.
        w3, w4: The two words to the right of the missing word.

    Returns:
        The predicted word, or an explanatory message string when one of the
        context words is not a key of ``word_index``.
    """
    try:
        context_ids = np.array(
            [word_index[w] for w in (w1, w2, w3, w4)]
        ).reshape(1, 4)
    except KeyError as e:
        return f"Word not found in vocabulary: {e}"

    scores = np.asarray(model.predict(context_ids, verbose=0)).reshape(-1)
    # The model scores vocab_size = len(word_index) + 1 ids, but id 0 maps to
    # no word (index_word only holds the ids found in word_index). Leave it out
    # of the argmax so the lookup below can never fail on id 0.
    pred_id = int(np.argmax(scores[1:])) + 1
    return index_word[pred_id]

# Try the predictor on two example contexts (two words left, two words right).
print("\nPredictions:")
for context_words in (
    ("the", "speed", "transmission", "is"),
    ("the", "serial", "for", "virus"),
):
    print(predict_missing(*context_words))


Predictions:
of
the
