In [1]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
# REPLACE THIS WITH YOUR TEXTUAL DOCUMENT 3
text = """
    Artificial intelligence and machine learning are transforming industries. 
    Companies use data to train models that can understand language, 
    make predictions, and automate complex tasks.
"""
# PUT YOUR DOCUMENT 3 ABOVE

In [9]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
vocab_size = len(word_index) + 1

sequence = tokenizer.texts_to_sequences([text])[0]

print("Vocabulary Size:", vocab_size)
print("Sequence:", sequence)

Vocabulary Size: 24
Sequence: [2, 3, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 21, 22, 23]


In [4]:
# b. Generate Training Data (CBOW)

window_size = 2
X_train = []
y_train = []

for i in range(window_size, len(sequence) - window_size):
    context = [
        sequence[i - 2],
        sequence[i - 1],
        sequence[i + 1],
        sequence[i + 2]
    ]
    target = sequence[i]

    X_train.append(context)
    y_train.append(target)

X_train = np.array(X_train)
y_train = np.array(y_train)

print("\nContext samples:\n", X_train[:5])
print("\nTarget samples:\n", y_train[:5])


Context samples:
 [[2 3 4 5]
 [3 1 5 6]
 [1 4 6 7]
 [4 5 7 8]
 [5 6 8 9]]

Target samples:
 [1 4 5 6 7]


In [5]:
# c. Train Model (CBOW)

embedding_dim = 8

input_layer = Input(shape=(4,))
embedding_layer = Embedding(vocab_size, embedding_dim)(input_layer)

hidden = Dense(embedding_dim, activation="linear")(embedding_layer)
hidden = Flatten()(hidden)

output_layer = Dense(vocab_size, activation="softmax")(hidden)

cbow_model = Model(inputs=input_layer, outputs=output_layer)
cbow_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

cbow_model.summary()

cbow_model.fit(X_train, y_train, epochs=200, verbose=0)

print("\nCBOW Model Training Complete!")


CBOW Model Training Complete!


In [6]:
# d. Output – Word Embeddings

from tensorflow.keras.layers import Embedding

embedding_layer_obj = [layer for layer in cbow_model.layers if isinstance(layer, Embedding)][0]
weights = embedding_layer_obj.get_weights()[0]

print("\nWord Embeddings:")
for word, idx in word_index.items():
    print(f"{word} → {weights[idx]}")


Word Embeddings:
and → [ 0.15621     0.32053098 -0.18528996  0.30319571  0.4884645   0.30342087
 -0.10042603  0.09858076]
artificial → [-0.2318643   0.25763404  0.03483338  0.28602898  0.18428507  0.17300233
  0.22551322  0.17132953]
intelligence → [-0.29399934  0.05936664  0.2795036  -0.12943694 -0.36929822  0.1802812
  0.02413907  0.2698054 ]
machine → [-0.08952197 -0.22685188 -0.08893734 -0.27162734 -0.0663601  -0.35951588
 -0.3438871  -0.19163887]
learning → [-0.1648227   0.4646425  -0.14915682 -0.17060828  0.1622629  -0.00453295
  0.02649874  0.00562825]
are → [ 0.28259823  0.36420196  0.389856    0.29321522  0.24533466 -0.13921128
  0.29195583 -0.44840622]
transforming → [-0.42768753  0.17902027 -0.4127369   0.27064502  0.17534049  0.2998569
 -0.3512172   0.35093302]
industries → [ 0.28268582  0.38657907  0.19856858 -0.4175635   0.03208609 -0.3190877
  0.18284974 -0.4132808 ]
companies → [-0.1584337   0.34233493  0.24463023  0.17779505 -0.09825984  0.29114214
  0.35129318  0.246

In [7]:
# EXTRA SECTION: CHECK IF CBOW MODEL IS WORKING

from numpy import dot
from numpy.linalg import norm

# Similarity function
def similarity(w1, w2):
    v1 = weights[word_index[w1]]
    v2 = weights[word_index[w2]]
    return dot(v1, v2) / (norm(v1) * norm(v2))

print("\nChecking Word Similarity:")
print("Similarity(learning, intelligence):", similarity("learning", "intelligence"))
print("Similarity(models, predictions):", similarity("models", "predictions"))
print("Similarity(data, companies):", similarity("data", "companies"))


# Predict the missing target word from context
context_words = ["machine", "learning", "are", "transforming"]

context_ids = np.array([[word_index[w] for w in context_words]])

pred = cbow_model.predict(context_ids)
predicted_word = index_word[np.argmax(pred)]

print("\nContext:", context_words)
print("Predicted Missing Word:", predicted_word)


Checking Word Similarity:
Similarity(learning, intelligence): -0.0057946863
Similarity(models, predictions): -0.09324876
Similarity(data, companies): -0.5541615
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step

Context: ['machine', 'learning', 'are', 'transforming']
Predicted Missing Word: to
