In [1]:
# a. Data Preparation

import numpy as np
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:
# Replace with your own text/document
# Training corpus, one literal per line. The value starts and ends with a
# newline, exactly as a triple-quoted block with the quotes on their own lines.
text = (
    "\n"
    "natural language processing is a field of artificial intelligence\n"
    "that focuses on understanding and generating human language\n"
)

In [3]:
# Tokenization: learn the vocabulary from the single-document corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Lookup tables in both directions: word -> id and id -> word
word_index = tokenizer.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))
# One more than the number of distinct words; the printed sequence shows ids
# starting at 1, so the extra slot corresponds to the unused id 0.
vocab_size = 1 + len(word_index)

# Convert to sequence of integers (one encoded list per input text; take the only one)
encoded_texts = tokenizer.texts_to_sequences([text])
sequence = encoded_texts[0]

print("Vocabulary Size:", vocab_size)
print("Sequence:", sequence)


Vocabulary Size: 17
Sequence: [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1]


In [4]:
# b. Generate Training Data (CBOW)
#
# For every position that has a full window on both sides, the surrounding
# words become one input row (context) and the centre word becomes the label.

window_size = 2   # number of context words taken on EACH side of the target
X_train = []
y_train = []

for i in range(window_size, len(sequence) - window_size):
    # Offsets are derived from window_size, so changing the value above
    # really changes the context width (previously they were fixed at +/-2).
    context = [
        *sequence[i - window_size:i],          # words before the target
        *sequence[i + 1:i + window_size + 1],  # words after the target
    ]
    target = sequence[i]

    X_train.append(context)
    y_train.append(target)

# Fail here with a clear message rather than later inside model.fit().
if not X_train:
    raise ValueError(
        f"Need at least {2 * window_size + 1} tokens to build a CBOW sample, "
        f"got {len(sequence)}"
    )

X_train = np.array(X_train)   # shape: (num_samples, 2 * window_size)
y_train = np.array(y_train)   # shape: (num_samples,)

print("\nContext samples:", X_train[:5])
print("Target samples:", y_train[:5])


Context samples: [[2 1 4 5]
 [1 3 5 6]
 [3 4 6 7]
 [4 5 7 8]
 [5 6 8 9]]
Target samples: [3 4 5 6 7]


In [5]:
# c. Train CBOW Model
#
# Context word ids -> embedding lookup -> per-position linear projection ->
# flatten -> softmax over the vocabulary to predict the centre word.

# Seed the Python, NumPy and backend RNGs so weight initialisation and batch
# shuffling (and therefore the learned embeddings) repeat across fresh runs.
SEED = 42
set_random_seed(SEED)

embedding_dim = 8
# Take the context width from the data instead of hard-coding 4, so the model
# stays in sync with whatever window size produced X_train.
context_size = X_train.shape[1]

input_layer = Input(shape=(context_size,))
embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)

# NOTE(review): canonical CBOW averages the context embeddings. This model
# projects each position and then concatenates them via Flatten, so it is
# sensitive to word order. Kept as-is to preserve the original architecture.
hidden = Dense(embedding_dim, activation="linear")(embedding_layer)
hidden = Flatten()(hidden)

output_layer = Dense(vocab_size, activation="softmax")(hidden)

cbow_model = Model(inputs=input_layer, outputs=output_layer)
# Targets are integer word ids (not one-hot), hence the sparse loss.
cbow_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)

print("\nCBOW Model Training Completed!")


CBOW Model Training Completed!


In [6]:
# d. Output – Word Embeddings

# Find the Embedding layer by type rather than by position, so the lookup
# keeps working if layers are added or reordered in the model.
trained_embedding = next(
    layer for layer in cbow_model.layers if isinstance(layer, Embedding)
)
# The first weight array of the Embedding layer is the lookup table
# (one row per word id).
embedding_weights = trained_embedding.get_weights()[0]

print("\nWord Embeddings:")
# Row `idx` of the table is the learned vector for the word with that id.
for word, idx in word_index.items():
    print(f"{word} → {embedding_weights[idx]}")


Word Embeddings:
language → [ 0.38464203 -0.33371705 -0.0047535  -0.37300074  0.3403622  -0.16027053
 -0.28299788 -0.10906445]
natural → [-0.18785885 -0.15964565 -0.16568713  0.23478054 -0.22599702 -0.17954488
  0.23203214  0.20932727]
processing → [ 0.02806833 -0.31626514  0.05289735  0.16024962  0.12087699  0.20712394
 -0.28223246  0.26863754]
is → [-0.18179853  0.08407958  0.0227987  -0.29677325  0.03737898 -0.3778105
  0.31323785 -0.16042428]
a → [-0.02766507  0.33193403  0.37663785 -0.39043316 -0.02056454 -0.0142524
 -0.14991665  0.15437092]
field → [ 0.19335154 -0.04754332 -0.3191517   0.3003838  -0.24197793 -0.02036203
  0.32898247 -0.05636626]
of → [ 0.3691816  -0.05308909 -0.40678275  0.27489027  0.11875176  0.3214613
 -0.32495138 -0.36266047]
artificial → [ 0.36808577  0.35232455  0.20068914 -0.29358196  0.20266928  0.05653012
 -0.14582041 -0.006607  ]
intelligence → [ 0.31048533  0.06969741  0.12221466 -0.16836351  0.39578664  0.04059293
 -0.02663809 -0.11524902]
that → [-0

In [7]:
# Sanity check: predict the centre word for the first training context.
# X_train[:1] keeps the batch dimension; verbose=0 suppresses the Keras
# progress bar so only the predicted word is printed.
first_context_probs = cbow_model.predict(X_train[:1], verbose=0)[0]
predicted_id = int(np.argmax(first_context_probs))

# The softmax has vocab_size outputs, but index_word has no entry for id 0,
# so fall back to a placeholder instead of raising KeyError.
print(index_word.get(predicted_id, "<unknown>"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
processing
