In [1]:
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:
# Read the whole corpus file into memory as one string, then echo it so the
# raw training text is visible in the notebook.
with open("dataset/CBOW(Ass5)/CBOW.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(text)

The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 

In [3]:
# Tokenize text

# Fit a word-level tokenizer on the corpus and encode the corpus as a single
# list of integer word ids.
tok = Tokenizer()
tok.fit_on_texts([text])
encoded = tok.texts_to_sequences([text])
seq = encoded[0]

# Forward (word -> id) and reverse (id -> word) lookup tables.
word_index = tok.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))

# One more than the number of distinct words: the ids in the printed sequence
# start at 1, so this leaves room for index 0 as well.
vocab_size = len(word_index) + 1

print("Vocabulary size:", vocab_size)
print("Sequences:", seq)

Vocabulary size: 103
Sequences: [1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23, 3, 44, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 5, 6, 1, 15, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 29, 20, 30, 53, 31, 3, 32, 54, 55, 17, 4, 5, 56, 8, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 7, 3, 33, 65, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 2, 8, 1, 83, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 12, 37, 19, 7, 4, 5, 6, 91, 17, 7, 3, 92, 93, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100, 101, 102]


In [4]:
# GENERATE TRAINING DATA (CBOW)

# Every position that has `window` ids on both sides becomes one sample:
#   x -> the ids before the position followed by the ids after it
#   y -> the id at the position itself
window = 2
centers = range(window, len(seq) - window)

x = [seq[c - window:c] + seq[c + 1:c + window + 1] for c in centers]
y = [seq[c] for c in centers]

In [6]:
# Convert the Python lists to NumPy arrays and preview the first five
# context rows together with their target ids.
x, y = np.array(x), np.array(y)

for heading, samples in (("\nContext samples:\n", x), ("\nTarget samples:\n", y)):
    print(heading, samples[:5])


Context samples:
 [[ 1 38  8  9]
 [38  2  9 39]
 [ 2  8 39 40]
 [ 8  9 40 41]
 [ 9 39 41  2]]

Target samples:
 [ 2  8  9 39 40]


In [7]:
# TRAIN CBOW MODEL

# Seed Python, NumPy and TensorFlow before any weights are created so that the
# trained embeddings and the predictions are the same on every
# Restart & Run All.
SEED = 42
set_random_seed(SEED)

# CBOW network:
#   1. Embedding  - maps every context word id to an 8-dimensional vector
#   2. Pooling    - averages the context vectors into a single vector
#   3. Dense      - softmax score for every id in the vocabulary
model = Sequential([
    Embedding(vocab_size, 8),
    # Built-in replacement for Lambda(lambda x: K.mean(x, axis=1)): it takes
    # the same mean over the context axis, but is a regular layer, so the
    # model can be saved and reloaded without deserialising a Python lambda.
    GlobalAveragePooling1D(),
    Dense(vocab_size, activation='softmax')
])

In [8]:
# The targets in `y` are plain integer word ids, hence the sparse variant of
# categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train without per-epoch log output.
model.fit(x, y, epochs=200, verbose=0)

model.summary()
print("Model Trained")




Model Trained


In [9]:
# The first weight array of the first layer is the embedding matrix; it is
# indexed below by word id.
embedding_layer = model.layers[0]
emb = embedding_layer.get_weights()[0]

print("\n Word Embedding")
for word in word_index:
    print(f"{word} --> {emb[word_index[word]]}")


 Word Embedding
the --> [ 0.88846433 -0.23621991  1.6989975  -0.6649959  -0.01140946 -0.83470446
  1.0666364   0.41906703]
of --> [ 1.2752314   1.1848305   0.4633136  -0.11279809  0.41935343  0.08258112
  0.3050054  -0.24593022]
influenza --> [-0.81381786  0.74782056 -0.20416345  0.41900694  0.4486996   1.5685575
  0.23043913  0.16766183]
covid --> [-1.3495474   0.22067639  0.32874063  1.0060319   1.1197304   1.5652163
  0.24383879  1.4582778 ]
19 --> [ 0.6568432  -0.32586598  1.1052196   1.1122506   1.4157788   1.2079442
  0.87872535  0.7984697 ]
virus --> [ 0.83610827 -0.44932467  1.2610612   0.31070226  1.1324084   0.46216914
  0.9829549   1.136391  ]
for --> [ 0.18832429 -0.8050696   0.28078374  0.4705151  -0.35836053  1.0219985
  1.3191063   0.46262753]
transmission --> [ 0.45574415  1.0777373   0.9675641  -0.36646038  0.28192878  0.0662162
  0.40997517 -0.08606846]
is --> [ 0.9882401  -0.4017404   0.08341212 -0.01304766  0.35218456 -0.07764236
  0.3962567  -0.52173954]
to --> [-

In [10]:
def predict_missing(w1, w2, w3, w4):
    """Predict the missing centre word for a four-word context.

    The context is passed to the model in the same order as the training
    samples: the two words before the gap, then the two words after it.

    Args:
        w1: Second word before the gap.
        w2: Word immediately before the gap.
        w3: Word immediately after the gap.
        w4: Second word after the gap.

    Returns:
        The predicted word, or the string
        ``"Word not found in vocabulary: <word>"`` when one of the context
        words is not in ``word_index``.
    """
    context_ids = []
    for word in (w1, w2, w3, w4):
        idx = word_index.get(word)
        if idx is None and isinstance(word, str):
            # The vocabulary keys are lower-case, so retry case-insensitively
            # instead of rejecting e.g. "The".
            idx = word_index.get(word.lower())
        if idx is None:
            # Same text the original produced from the KeyError.
            return f"Word not found in vocabulary: {word!r}"
        context_ids.append(idx)

    batch = np.array(context_ids).reshape(1, 4)
    pred = np.asarray(model.predict(batch, verbose=0))

    # Output index 0 has no entry in index_word (word ids start at 1), so it is
    # excluded from the argmax; otherwise a top score at 0 raises KeyError.
    pred_id = int(np.argmax(pred[0, 1:])) + 1
    return index_word[pred_id]

# Try the model on two hand-picked contexts (two words before the gap, two after).
print("\nPredictions:")
for context_words in (
    ("the", "speed", "transmission", "is"),
    ("the", "serial", "for", "virus"),
):
    print(predict_missing(*context_words))


Predictions:
of
the
