In [1]:
# first lets look for the one hot representation for the specific word
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# trying this one_hot on below sentences
sentences = [
             "the glass of milk",
             "the glass of juice",
             "the cup of tea",
             "I am good boy",
             "I am good developer",
             "understand the meaning of words",
             "your videos are good",
             ]

In [3]:
sentences

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am good boy',
 'I am good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Define the vocabulary size first!
voc_size = 10000

In [5]:
# One hot representation for every word
one_hot_repr = [one_hot(words, voc_size) for words in sentences]


In [6]:
one_hot_repr

[[5446, 8133, 5391, 8076],
 [5446, 8133, 5391, 1545],
 [5446, 1662, 5391, 9186],
 [7970, 7092, 4671, 9137],
 [7970, 7092, 4671, 4439],
 [8505, 5446, 3394, 5391, 1745],
 [2220, 4733, 3325, 4671]]

In [7]:
# so the o/p means that for 1st list - the words are at posn as follows
# at 9783 = the
# at 5873 = glass
# at 7728 = of
# at 5298 = milk
# and same for the others also!

In [8]:
# So if we use this in our embedding layer then instead of 1s and 0s we use this form that represents the index of the words
# One more point to note is in 1st 2 sentences we hve glass of milk and glass of juice
# So both sentences have the same word hence it is written same verctors for the words which are same in both
# So now we take all the words sentence by sentence and pass it to word embedding layer and will create word embedding representation
# So along with the dense layer we will have the embedding layer also

In [9]:
# word embedding representation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [10]:
# Now we see sentences length are varying, so first thing is to make these sentences of equal size, because we cant train in rnn if not handled
# because every word goes in rnn in fixed timestamp
sent_length = 8
embeded_docs = pad_sequences(one_hot_repr, padding='pre', maxlen=sent_length) #adds zeros before or after in order to make sentences of same length
print(embeded_docs)

[[   0    0    0    0 5446 8133 5391 8076]
 [   0    0    0    0 5446 8133 5391 1545]
 [   0    0    0    0 5446 1662 5391 9186]
 [   0    0    0    0 7970 7092 4671 9137]
 [   0    0    0    0 7970 7092 4671 4439]
 [   0    0    0 8505 5446 3394 5391 1745]
 [   0    0    0    0 2220 4733 3325 4671]]


In [11]:
# feature representation
dim = 10 # we want 10 features wrt dimensions in word embedding
model = Sequential()
model.add(Embedding(voc_size, dim, input_shape=(sent_length,)))
model.compile('adam', 'mse')

  super().__init__(**kwargs)


In [12]:
model.summary()

In [13]:
model.predict(embeded_docs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 506ms/step


array([[[-0.04988834,  0.03828622,  0.0026867 ,  0.02479657,
          0.02648352,  0.03695536,  0.01237201, -0.04516041,
          0.02589697,  0.04723257],
        [-0.04988834,  0.03828622,  0.0026867 ,  0.02479657,
          0.02648352,  0.03695536,  0.01237201, -0.04516041,
          0.02589697,  0.04723257],
        [-0.04988834,  0.03828622,  0.0026867 ,  0.02479657,
          0.02648352,  0.03695536,  0.01237201, -0.04516041,
          0.02589697,  0.04723257],
        [-0.04988834,  0.03828622,  0.0026867 ,  0.02479657,
          0.02648352,  0.03695536,  0.01237201, -0.04516041,
          0.02589697,  0.04723257],
        [ 0.00164769, -0.01747533, -0.02779658,  0.01387708,
          0.00039252, -0.03910379,  0.02104744,  0.00829311,
          0.03576732, -0.00435557],
        [-0.03795994,  0.00831016,  0.03209284,  0.04497489,
         -0.02341609,  0.01411721,  0.03143645,  0.02434466,
         -0.01466987,  0.01090251],
        [-0.01484128, -0.02885677, -0.04790437,  0.0