In [21]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [14]:
corpus = [
    'this is a sample sentence',
    'another example for training this',
    'one more sentence to use'
]

In [15]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
word_index

{'this': 1,
 'sentence': 2,
 'is': 3,
 'a': 4,
 'sample': 5,
 'another': 6,
 'example': 7,
 'for': 8,
 'training': 9,
 'one': 10,
 'more': 11,
 'to': 12,
 'use': 13}

In [16]:
window_size = 2
vocab_size = len(word_index) + 1
sequences = tokenizer.texts_to_sequences(corpus)
sequences

[[1, 3, 4, 5, 2], [6, 7, 8, 9, 1], [10, 11, 2, 12, 13]]

In [None]:
x = []
y = []
for sequence in sequences:
    for i in range(window_size, len(sequence) - window_size):
        context = sequence[i-window_size : i] + sequence[i + 1: i+window_size + 1]
        target = sequence[i]
        x.append(context)
        y.append(target)
y

[4, 8, 2]

In [27]:
x = pad_sequences(x, maxlen=window_size * 2)
y = to_categorical(y, num_classes=vocab_size)

In [28]:
x, y

(array([[ 1,  3,  5,  2],
        [ 6,  7,  9,  1],
        [10, 11, 12, 13]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]))

In [29]:
embedding_dim = 100

cbow_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

In [30]:
cbow_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cbow_model.fit(x, y, epochs=50, batch_size=16)

Epoch 1/50

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 726ms/step - accuracy: 0.0000e+00 - loss: 2.6442
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.0000e+00 - loss: 2.6302
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0000e+00 - loss: 2.6162
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6667 - loss: 2.6022
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 1.0000 - loss: 2.5882
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 2.5741
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 2.5601
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 1.0000 - loss: 2.5460
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x1bc622a9010>

In [32]:
embedding_weights = cbow_model.layers[0].get_weights()[0]
embedding_weights

array([[ 0.02034006,  0.02028139, -0.00767187, ...,  0.01988966,
         0.00768151,  0.04152813],
       [-0.0165398 , -0.10300218,  0.00352357, ..., -0.04473084,
         0.02086291,  0.06858039],
       [-0.03676633, -0.0352583 , -0.09978631, ..., -0.02732353,
        -0.09604567,  0.01697694],
       ...,
       [ 0.02351741,  0.00710142, -0.01570841, ..., -0.03511023,
         0.01374175, -0.04615513],
       [ 0.07754136,  0.08705931, -0.01785648, ..., -0.05017237,
         0.06452472, -0.06641962],
       [ 0.06263565,  0.08489707, -0.02448575, ..., -0.0762534 ,
         0.09344567, -0.0021745 ]], dtype=float32)

In [33]:
for word, index in word_index.items():
    print(f'word: {word}, embedding: {embedding_weights[index]}')

word: this, embedding: [-0.0165398  -0.10300218  0.00352357  0.08929167 -0.08683573 -0.02687878
  0.09003623 -0.02927724  0.00042969  0.10610063  0.09181257  0.09040575
 -0.02917653  0.01803196  0.03734789 -0.0067294   0.0266013  -0.01091461
 -0.09764872 -0.01108759 -0.06140157  0.09965797 -0.05650276  0.02297851
  0.01204668  0.08688913 -0.10037839  0.03155684 -0.11560455 -0.02746874
 -0.10044873  0.00446534  0.04945168 -0.07362105 -0.00176368  0.05969793
  0.0816465  -0.02665667  0.02867062 -0.02575292 -0.0613834   0.04748626
 -0.07207133  0.0174475   0.06124455  0.04743626 -0.02872558 -0.06806921
  0.01825881  0.03421443  0.05353619  0.00813287 -0.04954229 -0.00944204
 -0.05853362 -0.06568462  0.07945254  0.0508744  -0.1060519  -0.00828628
  0.06593474  0.00989058  0.02690917 -0.06929731  0.05021228 -0.02242985
  0.09814024 -0.06261732 -0.00508583 -0.07304005 -0.02715414  0.09467049
 -0.03456342 -0.03928822 -0.02872525 -0.0095884  -0.01622641  0.05052416
  0.02979689 -0.0529125  -0.