In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Raw training corpus: a short passage comparing influenza and COVID-19 transmission.
# Every later cell (sentence splitting, tokenising, context/target pairs) is derived from this string.
text = "The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."

In [None]:
# Split `text` into lower-cased sentences.
# Split on ". " (period followed by a space) instead of every ".", so that a
# decimal such as "2.5" does not break a sentence in two. The final sentence
# keeps its closing period after the split, hence the rstrip('.').
sentences = [sentence.strip().rstrip('.').lower() for sentence in text.split('. ')]
# Drop empty fragments so no empty sentence (and empty sequence) reaches the tokenizer.
sentences = [sentence for sentence in sentences if sentence]
print(sentences)

['the speed of transmission is an important point of difference between the two viruses', ' influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than covid-19 virus', ' the serial interval for covid-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days', ' this means that influenza can spread faster than covid-19', ' further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza', ' in contrast, while we are learning that there are people who can shed covid-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission', ' the reproductive number – the number of secondary infections generated from one infected individual – is understood 

In [None]:
# Split `text` into lower-cased sentences.
# Split on ". " (period followed by a space) instead of every ".", so that a
# decimal such as "2.5" does not break a sentence in two. The final sentence
# keeps its closing period after the split, hence the rstrip('.').
sentences = [sentence.strip().rstrip('.').lower() for sentence in text.split('. ')]
# Drop empty fragments so no empty sentence (and empty sequence) reaches the tokenizer.
sentences = [sentence for sentence in sentences if sentence]
sentences


['the speed of transmission is an important point of difference between the two viruses',
 ' influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than covid-19 virus',
 ' the serial interval for covid-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days',
 ' this means that influenza can spread faster than covid-19',
 ' further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza',
 ' in contrast, while we are learning that there are people who can shed covid-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission',
 ' the reproductive number – the number of secondary infections generated from one infected individual – is under

In [None]:
# Build the vocabulary from the sentences, then encode each sentence as a
# list of integer word indices (one inner list per sentence).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23], [3, 44, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 5, 6], [1, 15, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 29, 20], [30, 53, 31, 3, 32, 54, 55, 17, 4, 5], [56, 8, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 7, 3], [33, 65, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 2, 8], [1, 83, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 12, 37], [19, 7, 4, 5, 6, 91, 17, 7, 3], [92, 93, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100, 101, 102], []]


In [None]:
# Lookup tables exposed by the fitted tokenizer: word -> index and index -> word.
word2idx, idx2word = tokenizer.word_index, tokenizer.index_word
# One mapping per output line.
print(word2idx, idx2word, sep="\n")

{'the': 1, 'of': 2, 'influenza': 3, 'covid': 4, '19': 5, 'virus': 6, 'for': 7, 'transmission': 8, 'is': 9, 'to': 10, 'a': 11, 'and': 12, 'between': 13, 'time': 14, 'serial': 15, 'interval': 16, 'than': 17, 'be': 18, '5': 19, 'days': 20, '–': 21, 'are': 22, 'viruses': 23, 'shorter': 24, 'from': 25, 'appearance': 26, 'symptoms': 27, 'while': 28, '3': 29, 'this': 30, 'that': 31, 'can': 32, 'in': 33, 'major': 34, 'driver': 35, 'number': 36, '2': 37, 'speed': 38, 'an': 39, 'important': 40, 'point': 41, 'difference': 42, 'two': 43, 'has': 44, 'median': 45, 'incubation': 46, 'period': 47, 'infection': 48, 'successive': 49, 'cases': 50, 'estimated': 51, '6': 52, 'means': 53, 'spread': 54, 'faster': 55, 'further': 56, 'first': 57, 'illness': 58, 'or': 59, 'potentially': 60, 'pre': 61, 'symptomatic': 62, '–transmission': 63, 'before': 64, 'contrast': 65, 'we': 66, 'learning': 67, 'there': 68, 'people': 69, 'who': 70, 'shed': 71, '24': 72, '48': 73, 'hours': 74, 'prior': 75, 'symptom': 76, 'onset

In [None]:
# Word indices start at 1, so the largest index equals len(word2idx);
# the +1 makes every index a valid row/class id.
vocab_len = len(word2idx) + 1
emb_size = 100
# Number of words taken on EACH side of the target word.
# Each context therefore has 2 * context_size entries; the model's
# Embedding(input_length=...) must be kept equal to that value.
context_size = 2

contexts = []
targets = []

# Slide a window over every sentence: the centre word is the target and the
# context_size words on either side form the context (CBOW training pairs).
# Sentences shorter than 2 * context_size + 1 words yield no pairs.
for sequence in sequences:
  for i in range(context_size, len(sequence)-context_size):
    target = sequence[i]
    # Window is derived from context_size instead of hard-coded +/-2 offsets,
    # so it always agrees with the loop bounds above.
    context = sequence[i-context_size:i] + sequence[i+1:i+context_size+1]
    contexts.append(context)
    targets.append(target)

print(targets)
print(contexts)


[2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 31, 3, 32, 54, 55, 17, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 4, 5, 6, 91, 17, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100]
[[1, 38, 8, 9], [38, 2, 9, 39], [2, 8, 39, 40], [8, 9, 40, 41], [9, 39, 41, 2], [39, 40, 2, 42], [40, 41, 42, 13], [41, 2, 13, 1], [2, 42, 1, 43], [42, 13, 43, 23], [3, 44, 24, 45], [44, 11, 45, 46], [11, 24, 46, 47], [24, 45, 47, 1], [45, 46, 1, 14], [46, 47, 14, 25], [47, 1, 25, 48], [1, 14, 48, 10], [14, 25, 10, 26], [25, 48, 26, 2], [48, 10, 2, 27], [10, 26, 27, 12], [26, 2, 12, 11], [2,

In [None]:
# Sanity check: decode the first five (context -> target) pairs back into words.
for i in range(5):
    words = [idx2word.get(j) for j in contexts[i]]
    target = idx2word.get(targets[i])
    print(words, "->", target)


['the', 'speed', 'transmission', 'is'] -> of
['speed', 'of', 'is', 'an'] -> transmission
['of', 'transmission', 'an', 'important'] -> is
['transmission', 'is', 'important', 'point'] -> an
['is', 'an', 'point', 'of'] -> important


In [None]:
# Model inputs (context windows) and labels (target word indices) as NumPy arrays.
x, y = np.array(contexts), np.array(targets)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the (context, target) pairs for evaluation; fixed seed for a repeatable split.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# CBOW-style classifier: embed the four context words, average their
# embeddings into a single vector, and predict the centre word.
model = Sequential([
    Embedding(input_dim=vocab_len,output_dim=emb_size,input_length=4),
    # Average over the context axis -> one emb_size vector per example.
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    # softmax, not sigmoid: exactly one target word per example, and the
    # sparse categorical cross-entropy loss used at compile time expects the
    # outputs to form a probability distribution over the vocabulary.
    Dense(vocab_len, activation='softmax')
])

In [None]:
# Labels in y are integer word indices (not one-hot vectors), hence the sparse cross-entropy loss.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs, evaluating on the held-out split after every epoch.
# H keeps the object returned by fit() for later inspection.
H = model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test))

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.0056 - loss: 4.6357 - val_accuracy: 0.0303 - val_loss: 4.6360
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0666 - loss: 4.6218 - val_accuracy: 0.0303 - val_loss: 4.6362
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1727 - loss: 4.6074 - val_accuracy: 0.0303 - val_loss: 4.6359
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.1970 - loss: 4.5917 - val_accuracy: 0.0606 - val_loss: 4.6363
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1909 - loss: 4.5731 - val_accuracy: 0.0606 - val_loss: 4.6366
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2187 - loss: 4.5476 - val_accuracy: 0.0303 - val_loss: 4.6350
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [None]:
# One row of scores per held-out example, with one score per vocabulary index.
y_pred = model.predict(x_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step


In [None]:
import random

# Pick one held-out example at random and show its context words next to
# the word the model predicts for the centre position.
n = random.choice(range(len(x_test)))
print(x_test[n])
print(np.argmax(y_pred[n]))

# Highest-scoring vocabulary index -> predicted word.
target = idx2word.get(np.argmax(y_pred[n]))
words = []
# Decode the sampled row n, so the printed words match the row printed above.
for idx in x_test[n]:
  words.append(idx2word.get(idx))
print(words, '->', target)

[32 71  5  6]
9
['are', 'learning', 'there', 'are'] -> is


In [None]:
# Display the held-out context windows (one row of four word indices per example).
x_test

array([[12, 14, 98, 99],
       [75, 10, 77, 78],
       [86, 25, 88, 89],
       [30, 53,  3, 32],
       [22, 67, 68, 22],
       [16,  1, 13, 49],
       [32, 71,  5,  6],
       [ 3,  6, 15, 16],
       [70, 32,  4,  5],
       [ 5,  6, 17,  7],
       [25, 48, 26,  2],
       [27, 21, 11, 34],
       [46, 47, 14, 25],
       [29, 19,  2, 58],
       [27, 12, 24, 15],
       [ 1, 14, 49, 50],
       [ 4,  5, 91, 17],
       [ 5,  6, 73, 74],
       [22, 69, 32, 71],
       [47,  1, 25, 48],
       [ 8, 63,  1,  6],
       [14, 25, 10, 26],
       [11, 24, 46, 47],
       [42, 13, 43, 23],
       [14, 13, 50, 17],
       [ 3, 23, 95, 96],
       [68, 22, 70, 32],
       [53, 31, 32, 54],
       [87, 88, 21,  9],
       [ 6, 72, 74, 75],
       [90, 10, 13, 37],
       [ 1,  6,  1, 26],
       [54, 55,  4,  5]])