In [None]:
# Import Required Libraries for CBOW (Continuous Bag of Words) Implementation
# Purpose: Set up necessary Python libraries for NLP and deep learning

# pandas (pd): Data manipulation library
# - Used for data handling and preprocessing

# numpy (np): Numerical computing library
# - Handles array operations and mathematical computations

# re: Regular expressions library
# - Used for text cleaning and pattern matching

# tensorflow (tf): Deep learning framework
# - Provides tools for building and training neural networks
import pandas as pd
import numpy as np
import re
import tensorflow as tf

In [None]:
# Text Data Loading and Preprocessing
# Purpose: Load the raw corpus and turn it into a list of cleaned sentences
# for CBOW model training.

# Step 1: Load text file
# - Read the whole corpus into memory as a single UTF-8 string.
with open('CBOW.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Step 2: Split text into sentences
# - A period (.) is the only sentence delimiter that is recognised.
sentences = text.split('.')

# Step 3: Clean sentences
# - Every run of characters other than ASCII letters/digits becomes one space.
# - The result is lower-cased and stripped of leading/trailing whitespace.
# - The emptiness check runs AFTER cleaning, so fragments that consist only of
#   punctuation or whitespace (e.g. the pieces between the dots of "...") are
#   dropped instead of being kept as empty strings.
non_alnum = re.compile('[^A-Za-z0-9]+')  # compiled once, reused per sentence
clean_sen = []
for sen in sentences:
    cleaned = non_alnum.sub(' ', sen).lower().strip()
    if cleaned:
        clean_sen.append(cleaned)

# Display cleaned sentences
print(clean_sen)

In [None]:
# Import Tokenizer from Keras
# Purpose: Convert text to numerical sequences

# Tokenizer: Tool for text vectorization
# - Builds vocabulary from text
# - Converts words to unique integers
# - Handles text-to-sequence conversion
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Tokenize Text Data
# Purpose: Map every cleaned sentence to a list of integer word indices.

# Create a tokenizer with its default settings.
tokenizer = Tokenizer()

# fit_on_texts: Build the vocabulary from the cleaned sentences.
# - Populates tokenizer.word_index (word -> integer index), which later cells
#   read to derive the vocabulary size.
tokenizer.fit_on_texts(clean_sen)

# texts_to_sequences: Replace each word by its vocabulary index.
# - Produces one list of integers per sentence in clean_sen.
seq = tokenizer.texts_to_sequences(clean_sen)
# Last expression of the cell: displays the encoded sentences.
seq

In [None]:
# Generate Training Data for CBOW Model
# Purpose: Create (context, target) pairs for training

# Parameters:
vocab_size = len(tokenizer.word_index) + 1  # Total unique words + 1 for padding
emb_size = 10     # Dimensionality of word embeddings
context_size = 2  # Number of words before/after target to use as context

# Training pairs:
# - contexts: for each pair, the context_size words on either side of the target
# - targets: the centre word that the model has to predict
contexts = []
targets = []

for sequence in seq:
    # Only positions with a full window on both sides are used, so sentences
    # shorter than 2*context_size + 1 words contribute no pairs.
    for i in range(context_size, len(sequence) - context_size):
        # Slice the window from context_size instead of hard-coding the
        # offsets -2/-1/+1/+2, so that changing context_size above really
        # changes the window (identical result for context_size = 2).
        context = sequence[i - context_size:i] + sequence[i + 1:i + context_size + 1]
        contexts.append(context)
        targets.append(sequence[i])  # Centre word

print(contexts)
print(targets)

In [None]:
# Convert Training Data to NumPy Arrays
# Purpose: Turn the Python lists of training pairs into arrays for model.fit

# X: model input  - one row of context word indices per training pair
# Y: model output - the target (centre) word index of each pair
X, Y = np.array(contexts), np.array(targets)

In [None]:
# Import Required Keras Components
# Purpose: Import layers for building CBOW model

# Sequential: Linear stack of layers
# Dense: Fully connected neural network layer
# Embedding: Special layer for word embeddings
# Lambda: Custom layer for arbitrary operations
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

In [None]:
# Build and Train CBOW Model
# Purpose: Create and train the CBOW neural network

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling repeat across "Restart Kernel -> Run All".
# NOTE(review): requires TensorFlow >= 2.7; bit-exact repeatability on GPU may
# additionally need op determinism to be enabled - confirm for your setup.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Embedding Layer:
    # - input_dim: Size of vocabulary
    # - output_dim: Size of embedding vector
    # - input_length: Number of context words per sample (2 * context_size)
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=2*context_size),
    
    # Lambda Layer:
    # - Averages the context word embeddings into a single vector
    # - tf.reduce_mean averages along axis 1 (the context-word dimension)
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    
    # Hidden Layers:
    Dense(256, activation='relu'),  # First hidden layer with 256 neurons
    Dense(512, activation='relu'),  # Second hidden layer with 512 neurons
    
    # Output Layer:
    # - vocab_size neurons (one per vocabulary index)
    # - softmax activation for a probability distribution over the vocabulary
    Dense(vocab_size, activation='softmax')
])

# Compile model:
# optimizer='adam': Adaptive learning rate optimization
# loss='sparse_categorical_crossentropy': Targets are integer word indices,
#   not one-hot vectors
# metrics=['accuracy']: Track prediction accuracy
model.compile(optimizer='adam', 
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

# Train model:
# X: Context word indices
# Y: Target word indices
# epochs=80: Number of complete passes through the data
# The returned History object keeps the per-epoch metrics in history.history.
history = model.fit(X, Y, epochs=80)

In [None]:
# Visualize Training Progress
# Purpose: Plot the per-epoch training metrics recorded by model.fit

import seaborn as sns

# history.history maps each metric name (loss, accuracy) to a list with one
# value per epoch. Passed as wide-form data, seaborn draws one line per metric
# with the list position (epoch number, starting at 0) on the x-axis.
# - `data=` is given by keyword because older seaborn releases interpret the
#   first positional argument as `x`.
# - The History object returned by fit is used directly rather than the
#   model.history attribute.
ax = sns.lineplot(data=history.history)

# Label the figure so it can be read on its own; the trailing ';' suppresses
# the repr of the value returned by ax.set.
ax.set(xlabel="Epoch", ylabel="Metric value", title="CBOW training history");

In [None]:
# Test CBOW Model
# Purpose: Predict the centre word of a test sentence from its context words

# Reverse lookup (integer index -> word), derived from the tokenizer's
# word -> index mapping. It is needed to turn the predicted index and the
# context indices back into words; without it this cell raises NameError.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

# Step 1: Prepare test sentence
test_sentence = ["an important point difference"]

# Step 2: Convert to sequence
# - Use the same tokenizer as training
# - Convert words to integer indices
test_seq = tokenizer.texts_to_sequences(test_sentence)
test_seq = test_seq[0]  # Single sentence -> take its (only) index list
print("Test sequence:", test_seq)

# Step 3: Choose the target position
# - context_size is deliberately NOT re-assigned here: the value from the
#   training-data cell is reused so the test window always matches training.
center_index = len(test_seq) // 2  # Choose middle word as target

# Step 4: Extract context words
# - Take up to context_size words before and after the target
# - Skip the target word itself and any position outside the sentence
# NOTE(review): a sentence shorter than 2*context_size + 1 words yields a
# truncated window, so the model is called with fewer context words than it
# was trained on - confirm this is intended.
test_context = []
for j in range(center_index - context_size, center_index + context_size + 1):
    if j == center_index:  # Skip target word
        continue
    if 0 <= j < len(test_seq):  # Check bounds
        test_context.append(test_seq[j])

# Averaging an empty context would produce a meaningless prediction, so fail
# with an explicit message instead.
if not test_context:
    raise ValueError("No usable context words found in the test sentence")

# Step 5: Make prediction
x_test = np.array([test_context])
print("Context indices:", x_test)

# Get prediction and convert back to word
pred = model.predict(x_test)
pred_index = int(np.argmax(pred[0]))  # Most likely word index as a plain int
pred_word = index_to_word.get(pred_index, "<UNK>")  # Index 0 has no word

# Print results
print("\nContext words:", [index_to_word[i] for i in test_context])
print("Predicted target word =", pred_word)