<a href="https://colab.research.google.com/github/Sudheendra-RD/NLP/blob/main/WordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import keras
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus of seven short sentences; several words ('the', 'of', 'good', ...)
# repeat across sentences.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Vocabulary size: passed to one_hot as the upper bound for the integer
# index assigned to each word.
voc_size = 10_000

In [4]:
# Encode each sentence as a list of integer word indices (one index per word).
# Despite its name, one_hot returns hashed indices, not one-hot vectors: every
# word is hashed into the range 1..voc_size-1 (0 is never assigned), so two
# different words can collide -- in the saved output 'your' and 'good' both
# map to 5948.
onehot_repr = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
print(onehot_repr)

[[5771, 9952, 7767, 906], [5771, 9952, 7767, 3731], [5771, 4973, 7767, 7518], [1144, 4822, 96, 5948, 8916], [1144, 4822, 96, 5948, 3865], [4991, 5771, 952, 7767, 1254], [5948, 7795, 7430, 5948]]


In [5]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [6]:
import numpy as np

In [7]:
# The Embedding layer (and an LSTM placed after it) expects every sentence in a batch to have the same length
# But here some sentences have 4 words and others have 5
# so we use a function called pad_sequences to make all inputs the same length
# This is like zero-padding in a CNN: 0's are added to fill the missing positions

In [None]:
# pad_sequences supports 2 types of padding, 'pre' and 'post'
# 'pre' adds the padding at the beginning of each sentence
# 'post' adds the padding at the end of each sentence
# 'maxlen' is the length of each sentence after padding (longer sentences are truncated to it)

In [8]:
# Pad every encoded sentence with leading zeros ('pre') up to a fixed length
# of 8, giving a 2-D integer array with one row per sentence.
# Note: despite its name, `embedded` holds padded word indices, not embedding
# vectors.
embedded = pad_sequences(
    onehot_repr,
    maxlen=8,
    padding='pre',
)
print(embedded)

[[   0    0    0    0 5771 9952 7767  906]
 [   0    0    0    0 5771 9952 7767 3731]
 [   0    0    0    0 5771 4973 7767 7518]
 [   0    0    0 1144 4822   96 5948 8916]
 [   0    0    0 1144 4822   96 5948 3865]
 [   0    0    0 4991 5771  952 7767 1254]
 [   0    0    0    0 5948 7795 7430 5948]]


In [9]:
# Embedding dimension: each word index is mapped to ONE dense vector, and
# `dim` is the number of components (features) in that vector.
dim = 8

In [13]:
# We need to create a Sequential model with an Embedding layer.
# The layer takes the vocabulary size, the size of each word vector, and the
# length of the padded input sequences.

model = Sequential()
model.add(Embedding(
    input_dim=voc_size,
    output_dim=dim,
    # Read the sequence length from the padded data instead of repeating the
    # literal 8, so it cannot drift from the `maxlen` given to pad_sequences.
    input_length=embedded.shape[1],
))
# NOTE(review): `input_length` is reported as deprecated by Keras 3 -- confirm
# against the installed Keras version.
# In the cells shown, the model is only used for predict(); no fit() call is visible.
model.compile(optimizer='adam', loss='mse')

In [14]:
# Show the layer table; the Embedding layer holds
# voc_size * dim = 10000 * 8 = 80,000 trainable weights.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 8, 8)              80000     
                                                                 
Total params: 80,000
Trainable params: 80,000
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Look up the vector of every token in every padded sentence; the result has
# shape (sentences, tokens, dim) = (7, 8, 8). All padding positions (index 0)
# map to the same vector.
# NOTE(review): this cell's execution count (12) precedes the model cell (13),
# so its saved output appears to come from an earlier model instance --
# restart the kernel and run all cells to refresh it.
model.predict(embedded)

array([[[ 0.01401231,  0.03912989,  0.02971024,  0.03558644,
          0.01310122, -0.04958609, -0.04426647,  0.00912289],
        [ 0.01401231,  0.03912989,  0.02971024,  0.03558644,
          0.01310122, -0.04958609, -0.04426647,  0.00912289],
        [ 0.01401231,  0.03912989,  0.02971024,  0.03558644,
          0.01310122, -0.04958609, -0.04426647,  0.00912289],
        [ 0.01401231,  0.03912989,  0.02971024,  0.03558644,
          0.01310122, -0.04958609, -0.04426647,  0.00912289],
        [ 0.00949375, -0.02182547, -0.03836508,  0.02201133,
         -0.04275142,  0.03388841, -0.02342689, -0.0284763 ],
        [-0.0471159 , -0.01218945, -0.02967845,  0.02938415,
          0.02552232, -0.0348387 , -0.00067619, -0.01852181],
        [ 0.04766543, -0.01343378, -0.0151849 , -0.04734063,
         -0.02773576, -0.02108033,  0.00668893,  0.00026109],
        [-0.0474693 ,  0.02347339,  0.00990988, -0.00746365,
          0.04408156,  0.01141564, -0.04468708,  0.02919081]],

       [[ 0.01

In [15]:
# Padded index sequence of the first sentence ('the glass of milk')
embedded[0]

array([   0,    0,    0,    0, 5771, 9952, 7767,  906], dtype=int32)

In [17]:
# Embedding vectors of the first sentence: one row of `dim` values per token
# position (the leading padding rows are identical)
model.predict(embedded)[0]

array([[ 0.02440865, -0.0339918 , -0.03517089,  0.03403086, -0.00674397,
         0.0354887 ,  0.03872419,  0.03369043],
       [ 0.02440865, -0.0339918 , -0.03517089,  0.03403086, -0.00674397,
         0.0354887 ,  0.03872419,  0.03369043],
       [ 0.02440865, -0.0339918 , -0.03517089,  0.03403086, -0.00674397,
         0.0354887 ,  0.03872419,  0.03369043],
       [ 0.02440865, -0.0339918 , -0.03517089,  0.03403086, -0.00674397,
         0.0354887 ,  0.03872419,  0.03369043],
       [ 0.04996114,  0.01930262, -0.04301338,  0.04000736,  0.02467145,
        -0.00122646, -0.01657243,  0.03436524],
       [-0.04873593, -0.00293987, -0.0480734 ,  0.01699766,  0.0212618 ,
         0.04317651, -0.04058989, -0.03932529],
       [-0.04560199,  0.03740123, -0.01367122, -0.02418545,  0.00885201,
        -0.01395698,  0.02960006,  0.049202  ],
       [-0.03373505,  0.01925254, -0.04943389, -0.02050625,  0.04770286,
        -0.02562121, -0.02955829, -0.02621728]], dtype=float32)