<a href="https://colab.research.google.com/github/Sayyadhujefa/DeepLearning/blob/main/Prac5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten
import numpy as np
import collections


In [None]:
# --- a. Data Preparation ---
# Sample corpus
corpus = """
The quick brown fox jumps over the lazy dog.
The dog barks, and the fox runs away.
A quick brown rabbit also jumps.
"""


In [None]:
# Tokenize the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1 # +1 for padding/unknown words
print(f"Vocabulary: {word_index}")
print(f"Vocabulary size: {vocab_size}")
# Convert corpus to sequences of integers
sequences = tokenizer.texts_to_sequences([corpus])[0]
print(f"Sequences: {sequences}")
# --- b. Generate Training Data (Skip-gram is used here for demonstration, CBOW concept is similar) ---
# For CBOW, we'd typically create pairs of (context_words, target_word)
# Here, we use skipgrams for simplicity as it generates pairs from sequences.
# skipgrams returns pairs of (target_word, context_word)
# Parameters for skipgrams
window_size = 2 # Context window size
# Note: For true CBOW, you'd structure this differently, but skipgrams is often used to generate pairs for word2vec-like models.
# We will generate pairs of (target_word, context_word) using skipgrams for now,
# and then adapt it to mimic CBOW's prediction goal.
# A typical CBOW implementation involves averaging context embeddings.
# A simplified approach to get CBOW data structure:

In [None]:
# For each word, gather its context.
# Example: "The quick brown fox" -> target: "brown", context: ["The", "quick", "fox"]
data = []
target = []
context_window = 2
# Iterate through the sequences
for i, word in enumerate(sequences):
      context_start = max(0, i - context_window)
      context_end = min(len(sequences), i + context_window + 1)
      context = sequences[context_start:i] + sequences[i+1:context_end]
# Ensure context is not empty
if context:
# For CBOW, we want to predict the word 'word' from 'context'
# We can represent context by averaging its embeddings later, or use its indices
# Let's prepare data where input is context indices, and target is the word index
# For simplicity in Keras, we might one-hot encode contexts or average embeddings
# A common simplification for Keras is to use a multi-hot encoding of context
# or directly use skipgrams pairs and adapt the model.
# Let's stick to generating data in a way that the model can learn the prediction.
# A typical CBOW structure in Keras: Input (context word indices) -> Embedding ->Average -> Dense -> Output
# For demonstration, let's create pairs where target is 'word' and input is a context word from 'context'
# This is closer to Skip-gram but can be adapted.
# To truly implement CBOW, one would average context embeddings.
# We'll use a simplified model that predicts target from individual context words, then can be modified.
# Let's generate pairs of (context_word_index, target_word_index)
    for context_word_index in context:
        data.append(context_word_index)
        target.append(word)


In [None]:
# Convert lists to numpy arrays
data = np.array(data)
target = np.array(target)
print(f"\nGenerated context-target pairs (simplified): {len(data)} pairs")
# print(f"Example pair (context_word_index, target_word_index): ({data[0]}, {target[0]})")

In [7]:
# --- c. Train Model ---
embedding_dim = 10 # Dimension of the word embeddings
# CBOW Model: Predict target word from context words
# Input layer will take context word index
context_input = Input(shape=(1,), name='context_input')
# Embedding layer: Maps word indices to dense vectors
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
name='word_embedding')
context_embedding = embedding_layer(context_input) # Embeddings for context words
# We need to aggregate context embeddings. In true CBOW, we average them.
# For Keras, this is often done implicitly or by custom layers.


In [8]:
output_layer = Dense(vocab_size, activation='softmax', name='output_layer')
output_probs = output_layer(context_embedding)

In [21]:
import numpy as np
# Re-preparing data for actual CBOW structure:
# For each word, gather its context indices.
# Input: [context_word1_idx, context_word2_idx, ...]
# Target: target_word_idx
data_cbow = []
target_cbow = []
sequences=[]
context_indices=[]
for i, word in enumerate(sequences):
      context_start = max(0, i - context_window)
      context_end = min(len(sequences), i + context_window + 1)
      context_indices = sequences[context_start:i] + sequences[i+1:context_end]
if context_indices:
# Pad context if needed to have a fixed length, or handle variable length
# For simplicity, let's assume a fixed context window size that we can pad
# Here, we'll use the actual context and average.
      data_cbow.append(context_indices)
      target_cbow.append(word)
# Convert to numpy arrays
data_cbow = np.array(data_cbow)
target_cbow = np.array(target_cbow)
print(f"\nCBOW data shape: {data_cbow.shape}") # (num_samples, avg_num_context_words)
print(f"CBOW target shape: {target_cbow.shape}") # (num_samples,)
# Building the CBOW model architecture



CBOW data shape: (0,)
CBOW target shape: (0,)


In [22]:
from tensorflow.keras.layers import Lambda
import tensorflow.keras.backend as K

# --- CBOW model architecture ---
cbow_input = Input(shape=(None,), name='cbow_input')  # variable number of context words

# Embedding layer
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='word_embedding')
embedded_contexts = embedding(cbow_input)  # (batch_size, num_context_words, embedding_dim)

# Average the embeddings
def average_embeddings(x):
    return K.mean(x, axis=1)

averaged_context = Lambda(average_embeddings, output_shape=(embedding_dim,), name='average_context')(embedded_contexts)

# Output layer — THIS is where you made the mistake before
cbow_output = Dense(vocab_size, activation='softmax', name='output_layer')(averaged_context)

# Build and compile the model
cbow_model = Model(inputs=cbow_input, outputs=cbow_output)
cbow_model.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

cbow_model.summary()


In [23]:
# Create the CBOW model
cbow_model = Model(inputs=cbow_input, outputs=cbow_output)
cbow_model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy', # Use sparse because target is integer index
metrics=['accuracy'])

In [15]:

# Example fixed value (you can compute dynamically too)
max_context_len = 4
vocab_size = 5000
embedding_dim = 10

# Define model layers
context_input = Input(shape=(max_context_len,), name='context_input')
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='word_embedding')(context_input)
context_avg = Lambda(lambda x: tf.reduce_mean(x, axis=1))(embedding_layer)
output = Dense(vocab_size, activation='softmax', name='output')(context_avg)

cbow_model = Model(inputs=context_input, outputs=output)
cbow_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
cbow_model.summary()
