In [35]:
import numpy as np
import nltk
import re
import os # Import for file handling
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [37]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shravaninomulwar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shravaninomulwar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [39]:
# --- Model Parameters ---
TEXT_FILE_PATH = '/Users/shravaninomulwar/Desktop/abc/dataset/CBOW.txt' # <-- CHANGE THIS if your file name is different
WINDOW_SIZE = 2                # Number of context words to consider on each side
EMBEDDING_DIM = 100            # Dimension of the final word vector
EPOCHS = 50                    # Number of training epochs (Increase for better results)

In [41]:
#a
def preprocess_text(file_path):

    # 1. Read the text from the file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # 2. Clean and Tokenize
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) # Remove punctuation and numbers
    tokens = word_tokenize(text)

    # 3. Build Vocabulary
    vocabulary = sorted(list(set(tokens)))

    # Create mappings
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    index_to_word = {i: word for i, word in enumerate(vocabulary)}

    VOCAB_SIZE = len(vocabulary)
    print(f"Total vocabulary size: {VOCAB_SIZE} unique words.")

    return tokens, VOCAB_SIZE, word_to_index, index_to_word

In [43]:
# Load and process the data
tokens, VOCAB_SIZE, word_to_index, index_to_word = preprocess_text(TEXT_FILE_PATH)

Total vocabulary size: 92 unique words.


In [65]:
#b
def generate_cbow_data(tokens, word_to_index, vocab_size, window_size):

    data = []

    for i, target_word in enumerate(tokens):
        target_index = word_to_index[target_word]
        context_indices = []

        # Collect context words within the window
        for j in range(1, window_size + 1):
            if i - j >= 0:
                context_indices.append(word_to_index[tokens[i - j]])
            if i + j < len(tokens):
                context_indices.append(word_to_index[tokens[i + j]])

        if context_indices:
            data.append((context_indices, target_index))

    # Convert the context indices into a summed one-hot vector for simplicity in Keras
    X_cbow = np.zeros((len(data), vocab_size), dtype='float32')
    Y_cbow = np.zeros((len(data), vocab_size), dtype='float32')

    for row_idx, (context_indices, target_index) in enumerate(data):
        # Create summed one-hot vector for context (X)
        for index in context_indices:
            X_cbow[row_idx, index] += 1

        # Create one-hot vector for target (Y)
        Y_cbow[row_idx, target_index] = 1

    print(f"Total training samples generated: {len(data)}")
    print(f"Final Input Shape (X): {X_cbow.shape}")
    print(f"Final Output Shape (Y): {Y_cbow.shape}")

    return X_cbow, Y_cbow

In [47]:
# Generate the data

X_cbow, Y_cbow = generate_cbow_data(tokens, word_to_index, VOCAB_SIZE, WINDOW_SIZE)

Total training samples generated: 177
Final Input Shape (X): (177, 92)
Final Output Shape (Y): (177, 92)


In [49]:
#c
print("\nDefining CBOW Model Architecture...")
model = Sequential([
    # Input Layer: One-hot encoded context vector (size: VOCAB_SIZE)
    # This Dense layer is the projection layer (it learns the embeddings)
    Dense(EMBEDDING_DIM, activation='linear', input_shape=(VOCAB_SIZE,), name='Embedding_Projection'),

    # Output Layer: Predicts the target word (size: VOCAB_SIZE)
    Dense(VOCAB_SIZE, activation='softmax', name='Output_Softmax')
])


Defining CBOW Model Architecture...


In [51]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [53]:
# Training the model
print(f"\nStarting CBOW model training for {EPOCHS} epochs...")
model.fit(
    X_cbow, Y_cbow,
    epochs=EPOCHS,
    batch_size=128,
    verbose=1
)
print("CBOW model training complete.")


Starting CBOW model training for 50 epochs...
Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0165 - loss: 4.5499 
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.2638 - loss: 4.2030
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4941 - loss: 3.8862
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5911 - loss: 3.5537
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 554ms/step - accuracy: 0.6166 - loss: 3.2219
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6099 - loss: 2.8881
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.6137 - loss: 2.5543
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.6406 - loss: 2.2506
Epoch 9/50
[1m

In [55]:
#d
# The word embeddings are the weights of the 'Embedding_Projection' layer.
word_embeddings = model.get_layer('Embedding_Projection').get_weights()[0]

print(f"Extracted Embedding Matrix Shape: {word_embeddings.shape}")

Extracted Embedding Matrix Shape: (92, 100)


In [57]:
# Example: Get the embedding vector for the word 'learning' (or any other word from the document)

word_of_interest = 'learning'
if word_of_interest in word_to_index:
    idx = word_to_index[word_of_interest]
    vector = word_embeddings[idx]

    print(f"\nEmbedding vector for '{word_of_interest}' (First 5 dimensions):")
    # Display the first 5 dimensions of the vector
    print(vector[:5], '...')

    # The resulting vector is the semantic representation of the word.
else:
    print(f"Word '{word_of_interest}' not found in vocabulary.")


Embedding vector for 'learning' (First 5 dimensions):
[ 0.43694344 -0.03588932 -0.3820069  -0.19882883  0.22411405] ...


In [59]:
def predict_target_word(context_words, model, word_to_index, index_to_word, vocab_size, window_size):

    # 1. Convert context words to indices and then to a summed one-hot vector
    context_vector = np.zeros((1, vocab_size), dtype='float32')
    for word in context_words:
        if word in word_to_index:
            context_vector[0, word_to_index[word]] += 1
        else:
            print(f"Warning: Context word '{word}' not in vocabulary. Skipping.")

    # 2. Use the model to predict the probability distribution of the target word
    predictions = model.predict(context_vector, verbose=0)[0] # Get the first (and only) sample's predictions

    # 3. Get the index of the word with the highest probability
    predicted_index = np.argmax(predictions)

    # 4. Convert the index back to a word
    predicted_word = index_to_word[predicted_index]

    # You can also get the probability of the predicted word
    predicted_probability = predictions[predicted_index]

    return predicted_word, predicted_probability

In [61]:
# Example usage:

example_context = ['shorter', 'incubation', 'period'] # Example: Predict the word between 'making' and 'important'

predicted_word, probability = predict_target_word(
    example_context, model, word_to_index, index_to_word, VOCAB_SIZE, WINDOW_SIZE
)

In [63]:
print(f"\nGiven the context words: {example_context}")
print(f"Predicted target word: '{predicted_word}' with probability {probability:.4f}")


Given the context words: ['shorter', 'incubation', 'period']
Predicted target word: 'median' with probability 0.8750


In [31]:
# =============================
# CBOW Model Implementation
# =============================

import numpy as np
import re
from collections import defaultdict
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing.sequence import make_sampling_table

# -----------------------------
# a. Data Preparation
# -----------------------------
text = """
The speed of transmission is an important point of difference between the two viruses. 
Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) 
and a shorter serial interval (the time between successive cases) than COVID-19 virus. 
The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, 
the serial interval is 3 days. This means that influenza can spread faster than COVID-19.

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission 
– transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. 
In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, 
at present, this does not appear to be a major driver of transmission.

The reproductive number – the number of secondary infections generated from one infected individual – 
is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. 
However, estimates for both COVID-19 and influenza viruses are very context and time-specific, 
making direct comparisons more difficult.
"""

# Clean and preprocess text
text = text.lower()
text = re.sub(r'[^a-z\s]', '', text)
sentences = text.split('.')

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}
vocab_size = len(word2idx) + 1

print(f"Vocabulary Size: {vocab_size}")
print("Sample word2idx mapping:", dict(list(word2idx.items())[:10]))

# -----------------------------
# b. Generate Training Data (Context → Target)
# -----------------------------
window_size = 2
data = []

# Create word sequences
for sentence in sentences:
    words = [w for w in sentence.split() if w in word2idx]
    for i, target_word in enumerate(words):
        context = []
        for j in range(i - window_size, i + window_size + 1):
            if j != i and 0 <= j < len(words):
                context.append(words[j])
        for w in context:
            data.append((w, target_word))

print(f"Total training pairs: {len(data)}")
print("Sample pairs:", data[:5])

# Prepare input and output for CBOW
def generate_training_data(data, word2idx, vocab_size):
    X, Y = [], []
    for context_word, target_word in data:
        x = np.zeros(vocab_size)
        y = np.zeros(vocab_size)
        x[word2idx[context_word]] = 1
        y[word2idx[target_word]] = 1
        X.append(x)
        Y.append(y)
    return np.array(X), np.array(Y)

X, Y = generate_training_data(data, word2idx, vocab_size)

# -----------------------------
# c. Train the CBOW Model
# -----------------------------
embedding_dim = 10

# Input layer
input_layer = Input(shape=(vocab_size,))
hidden_layer = Dense(embedding_dim, activation='linear')(input_layer)
output_layer = Dense(vocab_size, activation='softmax')(hidden_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print(model.summary())

# Train model
model.fit(X, Y, epochs=100, verbose=1)

# -----------------------------
# d. Output: Word Embeddings
# -----------------------------
weights = model.get_weights()[0]

print("\nWord Embeddings (sample):")
for word, idx in list(word2idx.items())[:10]:
    print(f"{word}: {weights[idx][:5]}")


Vocabulary Size: 93
Sample word2idx mapping: {'the': 1, 'of': 2, 'transmission': 3, 'influenza': 4, 'covid': 5, 'virus': 6, 'for': 7, 'is': 8, 'to': 9, 'a': 10}
Total training pairs: 702
Sample pairs: [('speed', 'the'), ('of', 'the'), ('the', 'speed'), ('of', 'speed'), ('transmission', 'speed')]


None
Epoch 1/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 638us/step - accuracy: 0.0077 - loss: 4.5354    
Epoch 2/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 699us/step - accuracy: 0.0239 - loss: 4.5254  
Epoch 3/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step - accuracy: 0.0274 - loss: 4.5112   
Epoch 4/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671us/step - accuracy: 0.0552 - loss: 4.4959
Epoch 5/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 673us/step - accuracy: 0.0626 - loss: 4.4837
Epoch 6/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 669us/step - accuracy: 0.0725 - loss: 4.4641
Epoch 7/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 668us/step - accuracy: 0.0817 - loss: 4.4530
Epoch 8/100
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 675us/step - accuracy: 0.0712 - loss: 4.4348
Epoch 9/100
[1m22