In [1]:
# Assignment No. 5: Implement the Continuous Bag of Words (CBOW) Model

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Sample text data (replace with your own text data).
# Each string is treated as one sentence; context windows are built per
# sentence further down, so they never span two entries of this list.
corpus = [
    'this is a sample sentence',
    'continuous bag of words model',
    'implementation of cbow',
    'natural language processing',
    'deep learning for nlp tasks',
]

# Stage a: Data Preparation
# Fit the tokenizer on the corpus to build the word -> integer index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# +1 because the Keras Tokenizer numbers words from 1 and keeps index 0
# reserved, so the embedding/softmax layers need one extra slot.
vocab_size = len(tokenizer.word_index) + 1

# Generate word sequences and context-target pairs for CBOW
# Number of words taken on EACH side of the target word, so every context
# holds 2 * context_window word indices.
# NOTE(review): with a value of 2 a sentence needs at least 5 tokens to yield
# a training pair; the three-word sentences above presumably contribute
# nothing -- confirm this is intended.
context_window = 2

def generate_training_data(corpus, tokenizer, window_size):
    """Build CBOW (context, target) training pairs from a text corpus.

    For every position in every sentence that has ``window_size`` words
    available on both sides, the surrounding ``2 * window_size`` word indices
    form the context and the word at that position is the target.

    Sentences shorter than ``2 * window_size + 1`` tokens have no such
    position and are skipped; words near the start or end of a sentence are
    never used as targets.

    Args:
        corpus: Iterable of sentences (strings) to convert.
        tokenizer: Object exposing ``texts_to_sequences(corpus)`` that returns
            one list of integer word indices per sentence.
        window_size: Non-negative number of context words taken on each side
            of the target.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_pairs, 2 * window_size)`` and ``y`` has shape ``(n_pairs,)``.
        When no pair can be generated, both are empty integer arrays that
        still have those shapes (``n_pairs == 0``).

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    contexts = []
    targets = []
    for sequence in tokenizer.texts_to_sequences(corpus):
        # Only centres with a full window on both sides are used, which keeps
        # every context the same length without needing padding.
        for center in range(window_size, len(sequence) - window_size):
            left = sequence[center - window_size:center]
            right = sequence[center + 1:center + window_size + 1]
            contexts.append(left + right)
            targets.append(sequence[center])

    if not contexts:
        # np.array([]) would be a 1-D float array; return correctly shaped
        # integer arrays so downstream code sees a consistent layout.
        return (
            np.empty((0, 2 * window_size), dtype=int),
            np.empty((0,), dtype=int),
        )
    return np.array(contexts), np.array(targets)

# Stage b: Generate training data
X_train, y_train = generate_training_data(corpus, tokenizer, context_window)

# Stage c: Train CBOW model
# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling -- and therefore the learned embeddings
# and the neighbours printed below -- are repeatable across
# "Restart Kernel -> Run All" (some GPU kernels may still add small jitter).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

embedding_dim = 50

model = Sequential([
    # Look up one embedding_dim-sized vector for each context word index.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=context_window * 2),
    # Average the context word vectors (mean over the sequence axis). The
    # built-in pooling layer computes the same mean as the former
    # Lambda(tf.reduce_mean(x, axis=1)) but, unlike a Python lambda, can be
    # saved and reloaded with the model.
    tf.keras.layers.GlobalAveragePooling1D(),
    # Predict the target word as a distribution over the whole vocabulary.
    Dense(units=vocab_size, activation='softmax')
])

# Targets are integer word indices, hence the sparse cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.fit(X_train, y_train, epochs=100, verbose=2)

# Stage d: Output
# The Embedding layer's weight matrix holds one row per vocabulary index;
# these rows are the learned word vectors.
word_embeddings = model.layers[0].get_weights()[0]

# Function to find the closest words to a given word
def find_closest_words(word_embeddings, target_word, tokenizer, top_n=5):
    """Return the vocabulary words whose embeddings are nearest to a word.

    Closeness is the Euclidean distance between embedding rows. The result
    is ordered from nearest to farthest. The target word itself and any
    embedding row that has no entry in ``tokenizer.word_index`` are never
    returned.

    Args:
        word_embeddings: 2-D array-like with one embedding row per word
            index (row ``i`` belongs to the word whose index is ``i``).
        target_word: Word to look up.
        tokenizer: Object exposing a ``word_index`` dict mapping each word to
            its integer index.
        top_n: Maximum number of neighbours to return.

    Returns:
        List of at most ``top_n`` words, nearest first. Empty if
        ``target_word`` is not in the vocabulary or ``top_n`` is not positive.
    """
    target_word_index = tokenizer.word_index.get(target_word)
    if target_word_index is None or top_n <= 0:
        return []

    embeddings = np.asarray(word_embeddings)
    target_embedding = embeddings[target_word_index]
    distances = np.linalg.norm(embeddings - target_embedding, axis=1)

    # Reverse lookup so results can be emitted in distance order rather than
    # in the tokenizer's own iteration order.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    closest_words = []
    # A stable sort makes tie-breaking deterministic (lowest index first).
    for index in np.argsort(distances, kind='stable'):
        index = int(index)
        # Skip the target explicitly: it is not guaranteed to sort first
        # when another row has an identical embedding.
        if index == target_word_index:
            continue
        word = index_to_word.get(index)
        if word is None:
            # Row without a vocabulary word (e.g. a reserved index).
            continue
        closest_words.append(word)
        if len(closest_words) == top_n:
            break
    return closest_words

# Test the CBOW model by finding closest words
# NOTE(review): 'cbow' only occurs in the three-word sentence
# 'implementation of cbow'. With context_window = 2 that sentence looks too
# short to produce any (context, target) pair, so 'cbow' presumably never
# reaches the training data and its embedding likely stays close to its
# random initialisation -- treat the neighbours printed here as illustrative
# and confirm with a word from one of the five-word sentences.
target_word = 'cbow'
closest_words = find_closest_words(word_embeddings, target_word, tokenizer)
print(f"Closest words to '{target_word}': {closest_words}")



Epoch 1/100
1/1 - 1s - loss: 3.0381 - 558ms/epoch - 558ms/step
Epoch 2/100
1/1 - 0s - loss: 3.0288 - 4ms/epoch - 4ms/step
Epoch 3/100
1/1 - 0s - loss: 3.0195 - 5ms/epoch - 5ms/step
Epoch 4/100
1/1 - 0s - loss: 3.0102 - 5ms/epoch - 5ms/step
Epoch 5/100
1/1 - 0s - loss: 3.0009 - 4ms/epoch - 4ms/step
Epoch 6/100
1/1 - 0s - loss: 2.9916 - 3ms/epoch - 3ms/step
Epoch 7/100
1/1 - 0s - loss: 2.9823 - 5ms/epoch - 5ms/step
Epoch 8/100
1/1 - 0s - loss: 2.9729 - 4ms/epoch - 4ms/step
Epoch 9/100
1/1 - 0s - loss: 2.9635 - 4ms/epoch - 4ms/step
Epoch 10/100
1/1 - 0s - loss: 2.9540 - 4ms/epoch - 4ms/step
Epoch 11/100
1/1 - 0s - loss: 2.9445 - 5ms/epoch - 5ms/step
Epoch 12/100
1/1 - 0s - loss: 2.9350 - 3ms/epoch - 3ms/step
Epoch 13/100
1/1 - 0s - loss: 2.9253 - 5ms/epoch - 5ms/step
Epoch 14/100
1/1 - 0s - loss: 2.9156 - 5ms/epoch - 5ms/step
Epoch 15/100
1/1 - 0s - loss: 2.9058 - 3ms/epoch - 3ms/step
Epoch 16/100
1/1 - 0s - loss: 2.8958 - 4ms/epoch - 4ms/step
Epoch 17/100
1/1 - 0s - loss: 2.8858 - 4ms/ep