<a href="https://colab.research.google.com/github/SujeetSaxena/AI-ML/blob/main/CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

pip install gensim




In [10]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K

# Toy corpus: three short sentences used to demonstrate CBOW training.
sentences = [
    "the quick brown fox jumped over the lazy dog",
    "I love machine learning",
    "deep learning is a subset of machine learning",
]

# Fit a word-level tokenizer on the corpus, then encode every sentence as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# word -> id lookup built by the tokenizer. The vocabulary size is one larger
# than the number of known words (Keras' Tokenizer numbers words from 1, so
# index 0 stays unassigned).
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

# Generate CBOW data
def generate_cbow_data(sequences, window_size):
    """Build (context, target) training pairs for a CBOW model.

    Every position that has ``window_size`` tokens on both sides becomes a
    target; its context is the ``window_size`` tokens before it followed by
    the ``window_size`` tokens after it. Positions closer than
    ``window_size`` to either end of a sequence are skipped, so a sequence
    shorter than ``2 * window_size + 1`` tokens contributes no pairs.

    Args:
        sequences: Iterable of token-id sequences (one per sentence).
        window_size: Number of tokens taken on each side of the target.

    Returns:
        A list of ``(context, target)`` tuples in corpus order.
    """
    return [
        (
            tokens[center - window_size:center]
            + tokens[center + 1:center + window_size + 1],
            tokens[center],
        )
        for tokens in sequences
        for center in range(window_size, len(tokens) - window_size)
    ]

# Two words on each side of the target form its context (four words in all).
# Only sentences with at least 2 * window_size + 1 tokens yield pairs.
window_size = 2
context_target_pairs = generate_cbow_data(sequences, window_size=window_size)

# Prepare data for training
def prepare_data(context_target_pairs, vocab_size):
    """Turn (context, target) pairs into arrays ready for ``model.fit``.

    Args:
        context_target_pairs: Iterable of ``(context, target)`` tuples, where
            ``context`` is a sequence of token ids and ``target`` is a single
            token id.
        vocab_size: Number of classes used to one-hot encode the targets.

    Returns:
        A tuple ``(contexts, targets)``: ``contexts`` is ``np.array`` of the
        context sequences and ``targets`` is the result of
        ``to_categorical(targets, vocab_size)``.

    Raises:
        ValueError: If ``context_target_pairs`` contains no pairs.
    """
    # Materialise once so generators are supported and emptiness can be checked.
    pairs = list(context_target_pairs)
    if not pairs:
        # Without this guard the unpacking below fails with a cryptic
        # "not enough values to unpack" ValueError.
        raise ValueError(
            "context_target_pairs is empty: there are no training examples to "
            "prepare (are all sequences shorter than 2 * window_size + 1 tokens?)"
        )

    contexts, targets = zip(*pairs)
    contexts = np.array(contexts)
    targets = to_categorical(targets, vocab_size)
    return contexts, targets

contexts, targets = prepare_data(context_target_pairs, vocab_size)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Build the CBOW model
embedding_dim = 50
context_size = 2 * window_size  # context words fed to the model per example

# The embedding layer gets an explicit name. Keras auto-names layers
# "embedding", "embedding_1", ... within a session, so looking the layer up by
# its default name fails once this cell is executed a second time in the same
# kernel.
embedding_layer_name = "word_embedding"

model = Sequential([
    # Declaring the input up front replaces the deprecated `input_length`
    # argument of Embedding and lets model.summary() report concrete shapes.
    tf.keras.Input(shape=(context_size,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name=embedding_layer_name),
    # Average the context-word embeddings over axis 1 (the context positions).
    Lambda(lambda x: K.mean(x, axis=1)),
    # Predict the target word as a distribution over the vocabulary.
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(contexts, targets, epochs=100, verbose=2)

# Get the word embeddings: the first weight array of the embedding layer,
# indexed by word id along its first axis.
embeddings = model.get_layer(embedding_layer_name).get_weights()[0]

# Print the embedding for a specific word
word = "learning"
word_idx = word_index[word]
print(f"Embedding for '{word}': {embeddings[word_idx]}")


Epoch 1/100
1/1 - 1s - 1s/step - accuracy: 0.0000e+00 - loss: 2.8923
Epoch 2/100
1/1 - 0s - 182ms/step - accuracy: 0.1111 - loss: 2.8857
Epoch 3/100
1/1 - 0s - 62ms/step - accuracy: 0.1111 - loss: 2.8791
Epoch 4/100
1/1 - 0s - 51ms/step - accuracy: 0.1111 - loss: 2.8725
Epoch 5/100
1/1 - 0s - 45ms/step - accuracy: 0.2222 - loss: 2.8659
Epoch 6/100
1/1 - 0s - 61ms/step - accuracy: 0.2222 - loss: 2.8593
Epoch 7/100
1/1 - 0s - 46ms/step - accuracy: 0.3333 - loss: 2.8526
Epoch 8/100
1/1 - 0s - 56ms/step - accuracy: 0.4444 - loss: 2.8460
Epoch 9/100
1/1 - 0s - 59ms/step - accuracy: 0.4444 - loss: 2.8393
Epoch 10/100
1/1 - 0s - 46ms/step - accuracy: 0.5556 - loss: 2.8326
Epoch 11/100
1/1 - 0s - 57ms/step - accuracy: 0.5556 - loss: 2.8259
Epoch 12/100
1/1 - 0s - 46ms/step - accuracy: 0.5556 - loss: 2.8191
Epoch 13/100
1/1 - 0s - 60ms/step - accuracy: 0.5556 - loss: 2.8123
Epoch 14/100
1/1 - 0s - 128ms/step - accuracy: 0.5556 - loss: 2.8054
Epoch 15/100
1/1 - 0s - 57ms/step - accuracy: 0.6667 