In [1]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
# Toy corpus: the short example sentences encoded in the cells below.

sen = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Echo the corpus to check its contents.
sen

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Vocabulary size: number of entries in the dictionary that word indices
# are drawn from.

voc_size = 10_000

### One Hot Representation of the sentence

In [5]:
from tensorflow.keras.preprocessing.text import one_hot

# Encode each sentence as a list of integer word indices taken from a
# vocabulary of voc_size entries (one index per word).
# NOTE(review): Keras documents one_hot as hashing-based, so distinct words
# may collide on the same index — confirm this is acceptable here.
onehot = [
    one_hot(sentence, voc_size)
    for sentence in sen
]
print(onehot)

[[2693, 8565, 1821, 6042], [2693, 8565, 1821, 4027], [2693, 1972, 1821, 2776], [4337, 7830, 7824, 2318, 1848], [4337, 7830, 7824, 2318, 9187], [786, 2693, 7454, 1821, 7628], [5022, 3206, 7656, 2318]]


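Each word in a sentence has been replaced by its integer index in a dictionary of size `voc_size`; identical words share the same index (e.g. "the" is 2693 in every sentence where it appears).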

### Word Embedding Representation

We will pass <b>onehot</b> (the one-hot index representation of the sentences) through an Embedding layer from Keras to form an embedding matrix.

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Common length every encoded sentence is padded to; tune it to the corpus.
sen_len = 8

# Left-pad ('pre') each index list with zeros up to sen_len so all rows have
# the same length; padding='post' is the alternative setting.
embedded_docs = pad_sequences(onehot, maxlen=sen_len, padding='pre')
print(embedded_docs)

[[   0    0    0    0 2693 8565 1821 6042]
 [   0    0    0    0 2693 8565 1821 4027]
 [   0    0    0    0 2693 1972 1821 2776]
 [   0    0    0 4337 7830 7824 2318 1848]
 [   0    0    0 4337 7830 7824 2318 9187]
 [   0    0    0  786 2693 7454 1821 7628]
 [   0    0    0    0 5022 3206 7656 2318]]


In [9]:
# Width of the dense vector produced for every vocabulary index.
dim = 10

# A model made of a single Embedding layer: each of the sen_len indices of a
# padded sentence is mapped to a dim-dimensional vector, giving an output of
# shape (batch, sen_len, dim).
model = Sequential()
# Pass `dim` instead of repeating the literal 10, so that changing the value
# above actually changes the embedding width.
model.add(Embedding(voc_size, dim, input_length=sen_len))
model.compile('adam', 'mse')

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [10]:
# The model contains only an Embedding layer, so predict() returns one
# vector per index of every padded sentence.
print(model.predict(embedded_docs))

[[[-4.04459238e-03 -4.51572053e-02  1.86618418e-03  2.03719772e-02
    8.66150856e-03  1.46572702e-02 -3.57058048e-02  3.18227522e-02
    4.39305231e-03  1.07787848e-02]
  [-4.04459238e-03 -4.51572053e-02  1.86618418e-03  2.03719772e-02
    8.66150856e-03  1.46572702e-02 -3.57058048e-02  3.18227522e-02
    4.39305231e-03  1.07787848e-02]
  [-4.04459238e-03 -4.51572053e-02  1.86618418e-03  2.03719772e-02
    8.66150856e-03  1.46572702e-02 -3.57058048e-02  3.18227522e-02
    4.39305231e-03  1.07787848e-02]
  [-4.04459238e-03 -4.51572053e-02  1.86618418e-03  2.03719772e-02
    8.66150856e-03  1.46572702e-02 -3.57058048e-02  3.18227522e-02
    4.39305231e-03  1.07787848e-02]
  [ 3.04269530e-02 -4.01014313e-02  3.85929458e-02 -3.71327996e-02
   -2.03336831e-02  3.47872637e-02 -4.58647124e-02 -2.25541834e-02
   -1.20015368e-02 -1.30778439e-02]
  [-4.17716429e-03 -2.09396482e-02  3.40023898e-02  2.88313963e-02
   -4.55485955e-02  5.89333475e-04  1.24856085e-03  2.88786776e-02
    4.38852198e-

In [11]:
# Padded index sequence of the first sentence.
embedded_docs[0]

array([   0,    0,    0,    0, 2693, 8565, 1821, 6042])

Each of the 8 positions (`sen_len`) in the padded sentence is converted into a 10-dimensional vector.

In [12]:
# Embedding vectors for the first padded sentence only: one row per index
# in embedded_docs[0].
print(model.predict(embedded_docs)[0])

[[-0.00404459 -0.04515721  0.00186618  0.02037198  0.00866151  0.01465727
  -0.0357058   0.03182275  0.00439305  0.01077878]
 [-0.00404459 -0.04515721  0.00186618  0.02037198  0.00866151  0.01465727
  -0.0357058   0.03182275  0.00439305  0.01077878]
 [-0.00404459 -0.04515721  0.00186618  0.02037198  0.00866151  0.01465727
  -0.0357058   0.03182275  0.00439305  0.01077878]
 [-0.00404459 -0.04515721  0.00186618  0.02037198  0.00866151  0.01465727
  -0.0357058   0.03182275  0.00439305  0.01077878]
 [ 0.03042695 -0.04010143  0.03859295 -0.0371328  -0.02033368  0.03478726
  -0.04586471 -0.02255418 -0.01200154 -0.01307784]
 [-0.00417716 -0.02093965  0.03400239  0.0288314  -0.0455486   0.00058933
   0.00124856  0.02887868  0.04388522  0.03230511]
 [ 0.0258062  -0.00557094  0.02942469 -0.02628375  0.01583444 -0.03148772
   0.03973449 -0.02018503 -0.03576378  0.02351857]
 [-0.00154265  0.00150223 -0.02068273 -0.04085366  0.03259684 -0.02558907
  -0.04721032  0.01557017  0.02145462  0.02814858]]