In [25]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer

In [26]:
corpus=["The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19."]

In [27]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

In [28]:
sequences = tokenizer.texts_to_sequences(corpus)
X, y = [], []

for seq in sequences:
    for i, target_word in enumerate(seq):
        for j in range(max(0, i - 2), min(i + 3, len(seq))):
            if i != j:
                X.append([target_word, seq[j]])
                y.append(1)  # Positive context

X = np.array(X)
y = np.array(y)

In [29]:
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=10, input_length=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)

<keras.src.callbacks.History at 0x268851f86a0>

In [50]:
target_word = "interval"
if target_word not in word_index:
    print(f"{target_word} not found in the vocabulary.")
else:
    target_embedding = model.layers[0].get_weights()[0][word_index[target_word]]

    # Calculate cosine similarity
    similarities = [(word, np.dot(target_embedding, model.layers[0].get_weights()[0][index]) /
                    (np.linalg.norm(target_embedding) * np.linalg.norm(model.layers[0].get_weights()[0][index])))
                    for word, index in word_index.items() if word != target_word]

    # Sort by similarity and get top N
    related_words = sorted(similarities, key=lambda x: x[1], reverse=True)[:5]

    print(f"Words related to '{target_word}': {[word for word, _ in related_words]}")

Words related to 'interval': ['from', 'time', 'virus', '3', 'difference']
