In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten
import numpy as np
import collections

In [2]:
# --- a. Data Preparation ---
# Sample corpus
corpus = """
The quick brown fox jumps over the lazy dog.
The dog barks, and the fox runs away.
A quick brown rabbit also jumps.
"""

In [4]:
# Tokenize the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1 # +1 for padding/unknown words
print(f"Vocabulary: {word_index}")
print(f"Vocabulary size: {vocab_size}")
# Convert corpus to sequences of integers
sequences = tokenizer.texts_to_sequences([corpus])[0]
print(f"Sequences: {sequences}")
# --- b. Generate Training Data (Skip-gram is used here for demonstration, CBOW concept is similar) ---
# For CBOW, we'd typically create pairs of (context_words, target_word)
# Here, we use skipgrams for simplicity as it generates pairs from sequences.
# skipgrams returns pairs of (target_word, context_word)
# Parameters for skipgrams
window_size = 2 # Context window size
# Note: For true CBOW, you'd structure this differently, but skipgrams is often used to generate pairs for word2vec-like models.
# We will generate pairs of (target_word, context_word) using skipgrams for now,
# and then adapt it to mimic CBOW's prediction goal.
# A typical CBOW implementation involves averaging context embeddings.
# A simplified approach to get CBOW data structure:

Vocabulary: {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'dog': 6, 'over': 7, 'lazy': 8, 'barks': 9, 'and': 10, 'runs': 11, 'away': 12, 'a': 13, 'rabbit': 14, 'also': 15}
Vocabulary size: 16
Sequences: [1, 2, 3, 4, 5, 7, 1, 8, 6, 1, 6, 9, 10, 1, 4, 11, 12, 13, 2, 3, 14, 15, 5]


In [7]:
# For each word, gather its context.
# Example: "The quick brown fox" -> target: "brown", context: ["The", "quick", "fox"]
data = []
target = []
context_window = 2
# Iterate through the sequences
for i, word in enumerate(sequences):
      context_start = max(0, i - context_window)
      context_end = min(len(sequences), i + context_window + 1)
      context = sequences[context_start:i] + sequences[i+1:context_end]
# Ensure context is not empty
if context:
# For CBOW, we want to predict the word 'word' from 'context'
# We can represent context by averaging its embeddings later, or use its indices
# Let's prepare data where input is context indices, and target is the word index
# For simplicity in Keras, we might one-hot encode contexts or average embeddings
# A common simplification for Keras is to use a multi-hot encoding of context
# or directly use skipgrams pairs and adapt the model.
# Let's stick to generating data in a way that the model can learn the prediction.
# A typical CBOW structure in Keras: Input (context word indices) -> Embedding ->Average -> Dense -> Output
# For demonstration, let's create pairs where target is 'word' and input is a context word from 'context'
# This is closer to Skip-gram but can be adapted.
# To truly implement CBOW, one would average context embeddings.
# We'll use a simplified model that predicts target from individual context words, then can be modified.
# Let's generate pairs of (context_word_index, target_word_index)
    for context_word_index in context:
        data.append(context_word_index)
        target.append(word)

In [8]:
# Convert lists to numpy arrays
data = np.array(data)
target = np.array(target)
print(f"\nGenerated context-target pairs (simplified): {len(data)} pairs")
# print(f"Example pair (context_word_index, target_word_index): ({data[0]}, {target[0]})")


Generated context-target pairs (simplified): 2 pairs


In [9]:
# --- c. Train Model ---
embedding_dim = 10 # Dimension of the word embeddings
# CBOW Model: Predict target word from context words
# Input layer will take context word index
context_input = Input(shape=(1,), name='context_input')
# Embedding layer: Maps word indices to dense vectors
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
name='word_embedding')
context_embedding = embedding_layer(context_input) # Embeddings for context words
# We need to aggregate context embeddings. In true CBOW, we average them.
# For Keras, this is often done implicitly or by custom layers.

In [10]:
# A simpler model might predict target from each context word individually, then average results,
# or use a multi-hot encoding for context and average after embedding.
# Let's use a simplified model: input is ONE context word, predict the target.
# This is more akin to skip-gram, but can illustrate the embedding learning.
# For a true CBOW, you'd process all context words together.
# Output layer: Predicts the target word (softmax over vocabulary)
output_layer = Dense(vocab_size, activation='softmax', name='output_layer')
output_probs = output_layer(context_embedding)
# Create the model
# This simplified model predicts target from a SINGLE context word.
# To make it CBOW, you'd need to:
# 1. input multiple context words (e.g., a sequence of indices)
# 2. pass them through the embedding layer
# 3. average the resulting embeddings
# 4. pass the averaged embedding through a Dense layer
# Let's adapt to predict target from averaging context embeddings.
# This requires a slightly more complex setup or a pre-processing step.
# For a practical Keras CBOW:
# 1. Prepare data where each training instance has multiple context word indices.
# 2. Input shape would be (num_context_words,)
# 3. Use an Embedding layer.
# 4. Use a Lambda layer to average the embeddings of context words.
# 5. Then feed into Dense output.

In [13]:
# Re-preparing data for actual CBOW structure:
# For each word, gather its context indices.
# Input: [context_word1_idx, context_word2_idx, ...]
# Target: target_word_idx
data_cbow = []
target_cbow = []
for i, word in enumerate(sequences):
      context_start = max(0, i - context_window)
      context_end = min(len(sequences), i + context_window + 1)
      context_indices = sequences[context_start:i] + sequences[i+1:context_end]
if context_indices:
# Pad context if needed to have a fixed length, or handle variable length
# For simplicity, let's assume a fixed context window size that we can pad
# Here, we'll use the actual context and average.
      data_cbow.append(context_indices)
      target_cbow.append(word)
# Convert to numpy arrays
data_cbow = np.array(data_cbow)
target_cbow = np.array(target_cbow)
print(f"\nCBOW data shape: {data_cbow.shape}") # (num_samples, avg_num_context_words)
print(f"CBOW target shape: {target_cbow.shape}") # (num_samples,)
# Building the CBOW model architecture


CBOW data shape: (1, 2)
CBOW target shape: (1,)


In [22]:
from tensorflow.keras.layers import Lambda
import tensorflow.keras.backend as K

# --- CBOW model architecture ---
cbow_input = Input(shape=(None,), name='cbow_input')  # variable number of context words

# Embedding layer
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='word_embedding')
embedded_contexts = embedding(cbow_input)  # (batch_size, num_context_words, embedding_dim)

# Average the embeddings
def average_embeddings(x):
    return K.mean(x, axis=1)

averaged_context = Lambda(average_embeddings, output_shape=(embedding_dim,), name='average_context')(embedded_contexts)

# Output layer — THIS is where you made the mistake before
cbow_output = Dense(vocab_size, activation='softmax', name='output_layer')(averaged_context)

# Build and compile the model
cbow_model = Model(inputs=cbow_input, outputs=cbow_output)
cbow_model.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

cbow_model.summary()


In [23]:
# Create the CBOW model
cbow_model = Model(inputs=cbow_input, outputs=cbow_output)
cbow_model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy', # Use sparse because target is integer index
metrics=['accuracy'])

In [24]:
cbow_model.summary()
# Train the CBOW model
print("\nTraining the CBOW model...")
# You might need to pad sequences if using fixed input shapes or handle variable lengths.
# For 'None' input shape, Keras handles variable lengths automatically for this setup.
history_cbow = cbow_model.fit(data_cbow, target_cbow, epochs=100, batch_size=16,
verbose=0) # Increased epochs for better learning
# --- d. Output ---
print("\nTraining finished. Extracting word embeddings.")
# Extract the word embeddings from the embedding layer
word_embeddings = cbow_model.get_layer('word_embedding').get_weights()[0]
print(f"Shape of word embeddings: {word_embeddings.shape}") # (vocab_size, embedding_dim)
# Display embeddings for a few words
print("\nWord Embeddings:")
for word, i in word_index.items():
    print(f"'{word}': {word_embeddings[i][:5]}...") # Print first 5 dimensions
# You can now use these embeddings for similarity calculation, etc.
# For example, finding words similar to 'fox':
# Calculate cosine similarity between 'fox' vector and all other vectors.


Training the CBOW model...

Training finished. Extracting word embeddings.
Shape of word embeddings: (16, 10)

Word Embeddings:
'the': [ 0.02066291  0.00528953  0.01402289 -0.00101294 -0.04050182]...
'quick': [ 0.01566211  0.03506346  0.03470664  0.03439732 -0.03385506]...
'brown': [ 0.01559262 -0.02316748 -0.03885341 -0.01975107 -0.03973562]...
'fox': [-0.00189878  0.02852105  0.03664095  0.01970431 -0.01887131]...
'jumps': [-0.01504601 -0.02598949  0.01867386  0.00542425  0.03774353]...
'dog': [-0.03331993 -0.04652214 -0.02331786  0.01824592  0.03961295]...
'over': [-0.03141618 -0.02075325 -0.04179676 -0.02061243  0.00206099]...
'lazy': [-0.03564785 -0.03857515  0.03767255  0.01030811  0.01026169]...
'barks': [-0.01016742 -0.01193502  0.02322466 -0.00797284 -0.02806881]...
'and': [-0.04502343  0.04186846  0.04536377  0.01505884  0.02160091]...
'runs': [ 0.04338505 -0.01640583  0.01897473  0.00752043  0.00414664]...
'away': [ 0.0410695   0.04887905 -0.04533111 -0.01337685 -0.01348662