In [51]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K

In [52]:
# a. DATA PREPARATION
# -------------------------------
nltk.download('stopwords')

# Keep the stop-word set under its own name. Rebinding `stopwords` would shadow
# the imported nltk.corpus reader, so running this cell a second time would fail
# with "AttributeError: 'set' object has no attribute 'words'".
stop_words = set(stopwords.words('english'))

# NOTE(review): the file is read with the platform default encoding — confirm
# the encoding of CBOW.txt and pass it explicitly if it contains non-ASCII text.
with open("CBOW.txt", "r") as file:
    text = file.read().lower()

# Tokenize text
nltk.download("punkt")
tokens = nltk.word_tokenize(text)

# Keep purely alphabetic tokens that are not English stop words
# (this drops punctuation, numbers and mixed tokens).
tokens = [t for t in tokens if t.isalpha() and t not in stop_words]

# Create word index mapping (word -> integer id) and its inverse
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

# One extra slot: pad_sequences fills with 0 later on, so id 0 must exist in
# the embedding / output dimension even though no word maps to it.
vocab_size = len(word2idx) + 1

print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 59


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
# b. GENERATE TRAINING DATA
# -------------------------------

def generate_cbow_data(words, window_size, word_index=None):
    """Build (context, target) training pairs for a CBOW model.

    For every position that has `window_size` neighbours on both sides, the
    ids of those neighbours form the context and the id of the word at that
    position is the target.

    Args:
        words: Sequence of tokens, in corpus order.
        window_size: Number of neighbours taken on each side of the target.
        word_index: Optional mapping from token to integer id. When omitted,
            the notebook-level `word2idx` mapping is used, which keeps
            existing two-argument calls working.

    Returns:
        A list of `(context, target)` tuples, where `context` is a list of
        `2 * window_size` ids (left neighbours first, then right neighbours)
        and `target` is a single id. The list is empty when `words` is
        shorter than `2 * window_size + 1`.

    Raises:
        KeyError: If a token is missing from the mapping.
    """
    if word_index is None:
        # Fall back to the vocabulary built in the preprocessing cell.
        word_index = word2idx

    # Relative positions of the neighbours; computed once instead of per target.
    offsets = [j for j in range(-window_size, window_size + 1) if j != 0]

    data = []
    for i in range(window_size, len(words) - window_size):
        context = [word_index[words[i + j]] for j in offsets]
        data.append((context, word_index[words[i]]))
    return data

window_size = 2
data = generate_cbow_data(tokens, window_size)

# Split the (context, target) pairs into two parallel lists
contexts = [context for context, _ in data]
targets = [target for _, target in data]

# Each context built above already holds 2 * window_size ids (4 when
# window_size=2); pad_sequences enforces that length and returns a 2-D array.
context_length = 2 * window_size
X = pad_sequences(contexts, maxlen=context_length, padding='pre')

# One-hot encode the target ids over the whole vocabulary
y = to_categorical(targets, num_classes=vocab_size)

print("Training samples:", len(X))

Training samples: 83


In [None]:
# c. TRAIN MODEL
# -------------------------------

embedding_dim = 100

# CBOW network: look up the context word vectors, average them, then predict
# the target word with a softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=2 * window_size))
# Average over axis 1 (the context positions) to get one vector per sample
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embedding_dim,)))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot vectors, hence categorical cross-entropy
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

# Train the model; keep the returned History so the loss curve can be
# inspected later and the cell does not echo an object repr.
history = model.fit(X, y, epochs=100, verbose=1, batch_size=64)


In [60]:
# d. OUTPUT
# -------------------------------
# Show a slice of the learned embedding vector for a handful of words

# First weight array of the first layer (the Embedding layer added above);
# its rows are looked up by word id below.
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]

# First ten vocabulary entries, in the order word2idx stores them
sample_words = list(word2idx)[:10]
print("\nWord Embeddings (first 10 words):\n")
for word in sample_words:
    # Only the first 10 of the embedding dimensions are printed per word
    print(f"{word}: {embeddings[word2idx[word]][:10]}")



Word Embeddings (first 10 words):

transmission: [-0.13669029 -0.2356931  -0.23462848  0.22132131 -0.09976947 -0.24282283
 -0.0304919   0.05004571 -0.14603662 -0.10281289]
influenza: [-0.16292754  0.01356544 -0.00779893 -0.11601117 -0.03903944  0.19749321
 -0.00531609 -0.11682897  0.11274537 -0.14728487]
virus: [ 0.10860355 -0.11521085  0.17946602  0.0419849  -0.22062562  0.16980483
  0.21313797 -0.10354988 -0.25543106  0.23004846]
serial: [ 0.20319796 -0.02197481  0.18700454  0.25183597 -0.19920234  0.25551102
  0.12720193 -0.25603944 -0.17428419  0.01812837]
interval: [ 0.18431884  0.2111258   0.13806671  0.0203828  -0.23104802  0.19944033
  0.176656   -0.16870098 -0.06027011 -0.05120527]
days: [ 0.09380592 -0.22281852  0.16495118 -0.05796074 -0.21057105  0.18846947
  0.25903335 -0.16117305 -0.14271003 -0.18910405]
viruses: [ 0.12073462  0.1060987   0.2137282  -0.18995665 -0.07419154  0.19145706
  0.13143004 -0.09222515  0.11901081 -0.18350185]
shorter: [ 0.10283981  0.04082376  0.0

In [61]:
# Function to predict the most likely target word
def predict_target_word(context_words, tokenizer, model, window_size=2):
    """Predict the most likely target word for a list of context words.

    Args:
        context_words: Iterable of context tokens. Tokens that are not in the
            tokenizer's vocabulary are ignored. A token that is not found
            verbatim is retried in lower case, since the training text was
            lower-cased before the vocabulary was built.
        tokenizer: Object exposing a `word_index` mapping (word -> id).
        model: Trained model whose `predict` returns one probability per
            vocabulary id for each input row.
        window_size: Context half-width the model was trained with; the input
            is padded or truncated to `2 * window_size` ids.

    Returns:
        The predicted word, or None when no context word is known (there is
        nothing meaningful to predict from pure padding) or when the
        predicted id has no word attached.
    """
    word_index = tokenizer.word_index

    # Convert words to indices, skipping out-of-vocabulary tokens
    context_indices = []
    for w in context_words:
        idx = word_index.get(w)
        if idx is None and isinstance(w, str):
            idx = word_index.get(w.lower())
        if idx is not None:
            context_indices.append(idx)

    if not context_indices:
        return None

    # Pad to required context size (zeros are added in front)
    X = pad_sequences([context_indices], maxlen=2 * window_size, padding='pre')

    # Predict probabilities for all vocabulary ids; a single row comes back
    prediction = np.asarray(model.predict(X, verbose=0))[0]
    if prediction.shape[0] < 2:
        return None

    # Id 0 is the padding slot and never maps to a word, so exclude it from
    # the argmax instead of letting it turn the result into None.
    target_index = int(np.argmax(prediction[1:])) + 1

    # Convert index back to word
    return next(
        (word for word, index in word_index.items() if index == target_index),
        None,
    )




In [62]:
import numpy as np

# Function to calculate accuracy
def evaluate_accuracy(model, X, y_true):
    """Return the fraction of samples whose top prediction matches the label.

    Args:
        model: Object whose `predict(X, verbose=0)` returns one row of class
            scores per sample.
        X: Model inputs, passed to `predict` unchanged.
        y_true: One-hot (or score-like) label rows; the position of the
            largest value in each row is taken as the true class.

    Returns:
        Mean of the element-wise "predicted class == true class" comparison.
    """
    # Reduce both the predicted scores and the labels to class ids per row
    predicted, expected = (
        np.argmax(scores, axis=1)
        for scores in (model.predict(X, verbose=0), y_true)
    )
    # The mean of the boolean matches is the accuracy
    return np.mean(predicted == expected)

# Score the model on the same samples it was fitted on, so this is a
# training accuracy and not an estimate of generalization.
accuracy = evaluate_accuracy(model=model, X=X, y_true=y)
print(f"Training Accuracy: {accuracy * 100:.2f}%")


Training Accuracy: 61.45%
