# Word Embedding

In [92]:
from keras.src.layers import Embedding
from keras.src.models import Sequential
from keras.src.ops import one_hot
from keras.src.utils import pad_sequences
from keras.src.legacy.preprocessing.text import Tokenizer

In [93]:
### sentences
sent = ['the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good', ]

In [94]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [95]:
## Define the vocabulary size
voc_size = 10000

In [96]:
# Tokenization
tokenizer = Tokenizer(num_words=voc_size)

tokenizer.fit_on_texts(sent)

word_index = tokenizer.word_index

In [97]:
# Convert sentences to sequences of integers
sequences = tokenizer.texts_to_sequences(sent)

In [99]:
### One Hot Representation
one_hot_repr = [one_hot(words, voc_size) for words in sequences]

one_hot_repr

[<tf.Tensor: shape=(4, 10000), dtype=float32, numpy=
 array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(4, 10000), dtype=float32, numpy=
 array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(4, 10000), dtype=float32, numpy=
 array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(5, 10000), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(5, 10000), dtype=float32, numpy=


In [100]:
# Padding

sent_length = 8

embedded_docs = pad_sequences(one_hot_repr, padding = 'pre', maxlen = sent_length)

print(embedded_docs)

[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]


In [101]:
## feature representation
dim = 10

In [102]:
model = Sequential()

model.add(Embedding(voc_size, dim))

model.compile('adam', 'mse')

In [103]:
model.summary()

In [104]:
model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


array([[[[-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         ...,
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376]],

        [[-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         ...,
         [-0.0164068 ,  0.04342392

In [105]:
embedded_docs[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [106]:
import numpy as np

# Prediction for a single sequence
single_sequence = embedded_docs[0]

single_sequence_reshaped = np.expand_dims(single_sequence, axis=0) #Reshape to 2d array

single_prediction = model.predict(single_sequence_reshaped)

print(single_prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[[[[-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   ...
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]]

  [[-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   ...
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04561868
     0.02118376]
   [-0.0164068   0.04342392  0.0326358  ...  0.01285896  0.04

In [107]:
# Prediction on a batch of sequences

model.predict(embedded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


array([[[[-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         ...,
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376]],

        [[-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         [-0.0164068 ,  0.04342392,  0.0326358 , ...,  0.01285896,
           0.04561868,  0.02118376],
         ...,
         [-0.0164068 ,  0.04342392