<a href="https://colab.research.google.com/github/Siddharth0317/osl/blob/main/Exp_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Toy corpus used to train the CBOW model
corpus = """Natural language processing enables computers to understand human language.
Deep learning models like CBOW help us learn word representations.
Word embeddings capture semantic meaning in vector space.
This makes NLP applications like translation and sentiment analysis possible.
The CBOW model predicts a word based on its context words."""

# One lowercase sentence per period-delimited chunk; blank chunks are dropped
sentences = [chunk.lower() for chunk in map(str.strip, corpus.split('.')) if chunk]

# Fit the tokenizer on the sentences, then encode each sentence as a list of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Lookup tables: word -> id, and id -> word with id 0 labelled as padding
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)  # one extra slot for id 0
index_word = {**{idx: word for word, idx in word_index.items()}, 0: "<PAD>"}

print("Vocabulary:", word_index)
print("Sequences:", sequences)
print("Vocab Size:", vocab_size)

In [None]:
# Cell 2: Generate training data (context -> target)
def generate_cbow_pairs(sequences, window_size=2):
    """Build (context, target) training pairs for a CBOW model.

    Every token of every sequence becomes one target. Its context is the
    ``window_size`` tokens on each side, in left-to-right order; positions
    that fall outside the sequence are filled with 0 (the padding id).

    Args:
        sequences: Iterable of token-id sequences.
        window_size: Number of context tokens taken on each side of the target.

    Returns:
        Tuple ``(contexts, targets)`` of NumPy arrays with shapes
        ``(n_pairs, 2 * window_size)`` and ``(n_pairs,)``. When no pairs are
        produced the arrays are empty integer arrays that still have those
        shapes, so ``contexts.shape[1]`` remains valid.
    """
    contexts, targets = [], []
    for seq in sequences:
        for i, target in enumerate(seq):
            context = [
                seq[j] if 0 <= j < len(seq) else 0  # 0 pads past either end
                for j in range(i - window_size, i + window_size + 1)
                if j != i  # skip the target position itself
            ]
            contexts.append(context)
            targets.append(target)

    if not targets:
        # np.array([]) would be a 1-D float array; callers index shape[1] and
        # feed the ids to an Embedding layer, so keep the 2-D integer layout.
        context_width = max(2 * window_size, 0)
        return np.empty((0, context_width), dtype=int), np.empty((0,), dtype=int)
    return np.array(contexts), np.array(targets)

# Generate dataset
X, y = generate_cbow_pairs(sequences, window_size=2)

print("Context shape:", X.shape)
print("Target shape:", y.shape)
print("Example:")
# Slice instead of indexing rows 0..4 so the preview still works when the
# corpus yields fewer than five pairs.
for context_ids, target_id in zip(X[:5], y[:5]):
    print([index_word[idx] for idx in context_ids], "->", index_word[target_id])


In [None]:
# Cell 3: Build and train CBOW model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense, Input

embedding_dim = 50
context_len = X.shape[1]

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across Restart & Run All (some GPU kernels may still
# introduce small numerical differences).
tf.keras.utils.set_random_seed(42)

# Define CBOW model
model = Sequential()
model.add(Input(shape=(context_len,)))
# Explicit layer name: Keras otherwise auto-numbers layers ("embedding_1", ...)
# when this cell is re-run in the same kernel, which breaks name-based lookups.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"))
model.add(Lambda(lambda x: tf.reduce_mean(x, axis=1)))  # average embeddings over the context axis
model.add(Dense(vocab_size, activation='softmax'))  # one probability per vocabulary id

# Targets are integer ids (not one-hot), hence the sparse loss
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Train the model
history = model.fit(X, y, epochs=80, batch_size=16, verbose=2)


In [None]:
# Cell 4: Output - Predictions and embeddings

# Get learned embeddings.
# The layer is located by type rather than by the auto-generated name
# "embedding": Keras appends a numeric suffix ("embedding_1", ...) each time
# the model cell is re-run in the same kernel, so a name lookup only works on
# the very first run.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embeddings = embedding_layer.get_weights()[0]
print("Embedding matrix shape:", embeddings.shape)

# Function to predict missing word from context
def predict_word(context_words):
    """Return the model's most likely word for a context, with its probability.

    Words that are missing from ``word_index`` are encoded as 0, the padding id.

    Args:
        context_words: Sequence of context word strings.

    Returns:
        Tuple ``(word, probability)`` for the highest-scoring vocabulary id.
    """
    token_ids = [word_index.get(token, 0) for token in context_words]
    batch = np.array(token_ids).reshape(1, -1)  # single-row batch for the model
    distribution = model.predict(batch, verbose=0)[0]
    best = np.argmax(distribution)
    return index_word[best], float(distribution[best])

# Example prediction
context = ["deep", "models", "cbow", "help"]
# The model's input layer was built for exactly `context_len` ids, so verify
# the example here instead of relying on a hard-coded length in a comment.
assert len(context) == context_len, (
    f"expected {context_len} context words, got {len(context)}"
)
pred_word, prob = predict_word(context)
print("Context:", context)
print("Predicted Word:", pred_word, "with probability:", prob)

# Function to find nearest words in embedding space
def nearest_words(word, top_k=5):
    """Return up to ``top_k`` vocabulary words most similar to ``word``.

    Similarity is the cosine between rows of the module-level ``embeddings``
    matrix. The query word itself and the padding row (id 0) are never
    returned as neighbours.

    Args:
        word: Query word; looked up in ``word_index``.
        top_k: Maximum number of neighbours to return; values <= 0 give ``[]``.

    Returns:
        List of ``(neighbour_word, cosine_similarity)`` tuples, most similar
        first. Empty if ``word`` is not in the vocabulary.
    """
    if word not in word_index:
        return []
    w_idx = word_index[word]
    vec = embeddings[w_idx]
    norms = np.linalg.norm(embeddings, axis=1)
    # Small epsilon keeps the division finite if any row has zero norm
    sims = embeddings.dot(vec) / (norms * np.linalg.norm(vec) + 1e-9)
    # Exclude by id instead of assuming the query sorts first; id 0 is the
    # "<PAD>" placeholder, not a real word.
    excluded = {0, w_idx}
    ranked = [i for i in np.argsort(-sims).tolist() if i not in excluded]
    top = ranked[:max(top_k, 0)]
    return [(index_word[i], float(sims[i])) for i in top]

print("Nearest words to 'learning':", nearest_words("learning"))
