In [1]:
import numpy as np
import pandas as pd
import keras.backend as K
from tensorflow.keras.preprocessing import text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Lambda,Dense,Embedding
from sklearn.metrics.pairwise import euclidean_distances




In [2]:
data = """The speed of transmission is an important point of difference between the two viruses. 
Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval 
(the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, 
the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, 
or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza.
In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, 
this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected 
individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza 
viruses are very context and time-specific, making direct comparisons more difficult."""
dl_data = data.split()

In [10]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index
word2id['PAD'] = 0
id2word = {v:k for k,v in word2id.items()}

In [11]:
vocab_size = len(word2id)
embed_size = 100
window_size = 2

print(vocab_size)
print(list(word2id.items())[:10])

103
[('the', 1), ('of', 2), ('influenza', 3), ('covid', 4), ('19', 5), ('virus', 6), ('for', 7), ('transmission', 8), ('is', 9), ('to', 10)]


In [12]:
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size,output_dim=embed_size,input_length=window_size*2))
cbow.add(Lambda(lambda x:K.mean(x,axis=1),output_shape=(embed_size,)))
cbow.add(Dense(vocab_size,activation='softmax'))
cbow.compile(loss='categorical_crossentropy',optimizer='rmsprop')
cbow.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            10300     
                                                                 
 lambda_1 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 103)               10403     
                                                                 
Total params: 20703 (80.87 KB)
Trainable params: 20703 (80.87 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights,index=list(word2id.items())[1:]).head()

(102, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
"(of, 2)",-0.027155,0.016629,0.030162,-0.045222,-0.014636,0.021419,-0.040152,0.017723,-0.039187,0.01069,...,-0.020054,-0.03177,-0.024963,-6.8e-05,-0.017733,0.048797,0.018793,-0.041782,0.029082,0.029078
"(influenza, 3)",0.013942,0.022082,-0.031841,-0.033755,0.034786,-0.019104,0.031,0.042883,0.03758,-0.005664,...,0.041989,0.020625,0.001743,-0.030841,0.01798,-0.032029,2.8e-05,0.036878,0.045074,0.029346
"(covid, 4)",-0.003934,-0.030969,-0.032125,-0.003037,-0.02048,-0.018783,0.027192,-0.014669,-0.002525,-0.009622,...,-0.02139,-0.035189,-0.00702,0.019265,0.034532,-0.036243,-0.017553,0.025444,-0.043445,-0.021466
"(19, 5)",-0.033971,-0.022676,-0.047201,0.014664,-0.042441,0.044232,-0.014768,0.009756,-0.023725,0.022874,...,-0.00074,0.005064,-0.039613,-0.013329,0.047222,0.002465,-0.031346,-0.031687,-0.013415,-0.025209
"(virus, 6)",0.005643,-0.041921,-0.025259,-0.031735,0.032451,-0.000526,-0.030992,-0.009853,-0.026423,0.006004,...,0.02822,0.045575,0.035756,-0.021867,0.008646,0.006979,0.044811,0.027222,0.005807,0.038241


In [16]:
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term : [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1]
                    for search_term in ['influenza']}
similar_words

(102, 102)


{'influenza': ['there', 'however', 'learning', 'difference', 'appearance']}