<a href="https://colab.research.google.com/github/VenkateshSoni/Word-Embedding/blob/main/Word_Embedding_Techniques_using_Embedding_Layer_in_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embedding Techniques using Embedding Layer in Keras

In [18]:
#if not tensorflow>2.0
from keras.preprocessing.text import one_hot

In [19]:
#in tensorflow > 2.0 you get keras integrated in the tensorflow itself
#for tensorflow > 2.0
from tensorflow.keras.preprocessing.text import one_hot

In [20]:
# Toy corpus: nine short sentences used throughout the notebook.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'english is easy',
    'mathematics is easy',
    'geography is difficult',
]

In [22]:
# Echo the corpus so the cell output shows exactly which sentences are encoded below.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'english is easy',
 'mathematics is easy',
 'geography is difficult']

In [24]:
# Vocabulary (dictionary) size: passed to one_hot as the hashing range and to
# the Embedding layer as its input dimension.
voc_size = 10_000

# One Hot Representation

In [25]:
# Encode each sentence as a list of integer word indices, one index per word.
# The same word always receives the same index within a run (e.g. 'the' is
# 6477 in every sentence of the output below).
# NOTE(review): one_hot is hashing-based, so distinct words may collide on the
# same index -- confirm against the Keras documentation if uniqueness matters.
onehot_representation = [
    one_hot(sentence, n=voc_size)
    for sentence in sent
]
print(onehot_representation)

[[6477, 3805, 3598, 8148], [6477, 3805, 3598, 7935], [6477, 3275, 3598, 7166], [7538, 4396, 1254, 7086, 433], [7538, 4396, 1254, 7086, 2731], [6213, 6477, 2454, 3598, 2818], [7914, 6491, 7792], [1596, 6491, 7792], [3568, 6491, 7038]]


# Word Embedding Representation

In [26]:
# Import everything from tensorflow.keras so the notebook does not mix the
# standalone `keras` package with the TensorFlow-bundled one.
from tensorflow.keras.layers import Embedding
# The module is spelled `sequence`; the earlier attempt failed only because it
# was written as "squence".
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential  # Sequential container that hosts the Embedding layer

In [27]:
import numpy as np

In [28]:
# Every encoded sentence is brought to the same length so the batch forms a
# rectangular integer array.
sent_length = 8

# padding='pre'  -> zeros are inserted in front of the sentence (used here)
# padding='post' -> zeros would be appended after the sentence
embedded_docs = pad_sequences(
    onehot_representation,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0    0 6477 3805 3598 8148]
 [   0    0    0    0 6477 3805 3598 7935]
 [   0    0    0    0 6477 3275 3598 7166]
 [   0    0    0 7538 4396 1254 7086  433]
 [   0    0    0 7538 4396 1254 7086 2731]
 [   0    0    0 6213 6477 2454 3598 2818]
 [   0    0    0    0    0 7914 6491 7792]
 [   0    0    0    0    0 1596 6491 7792]
 [   0    0    0    0    0 3568 6491 7038]]


In [29]:
# Number of features in each word's embedding vector.
dimension = 10

In [30]:
# Build a single-layer model whose only job is to map each integer word index
# to a dense, featurized vector of size `dimension`.
model = Sequential()
model.add(
    Embedding(
        input_dim=voc_size,       # number of distinct word indices
        output_dim=dimension,     # use the configured size instead of a hard-coded 10
        input_length=sent_length, # every padded sentence has this many tokens
    )
)
# Adam optimizer with mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [31]:
# Print the layer table (output shape and parameter count) for the model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Run every padded sentence through the Embedding layer and show the result:
# one vector per token position for each sentence.
all_sentence_vectors = model.predict(embedded_docs)
print(all_sentence_vectors)

[[[-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
   -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
   -2.7282907e-02  1.6198289e-02]
  [-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
   -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
   -2.7282907e-02  1.6198289e-02]
  [-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
   -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
   -2.7282907e-02  1.6198289e-02]
  [-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
   -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
   -2.7282907e-02  1.6198289e-02]
  [-7.4841082e-05 -4.9949180e-02 -4.6701171e-02 -4.6553791e-02
    3.6232661e-02 -4.6381056e-02 -4.9087096e-02  9.3529820e-03
   -7.4069276e-03 -1.0302197e-02]
  [ 2.9061500e-02  1.8357638e-02  1.2882020e-02  3.6871884e-02
   -2.1024287e-02  4.1192200e-02 -1.2389503e-02  1.7648529e-02
   -4.4204559e-02 -4.3961033e-03]
  [ 2.4099674e-02 -2.4795676e-02  1.0108

In [33]:
# First padded sentence: leading zeros are padding, followed by the word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 6477, 3805, 3598, 8148], dtype=int32)

In [34]:
# Embedded form of the first sentence only: one row per token position, each
# row being that token's embedding vector. This is a slice of the layer's
# output, not the layer's full weight (embedding) matrix.
first_sentence_vectors = model.predict(embedded_docs)[0]
print(first_sentence_vectors)

[[-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
  -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
  -2.7282907e-02  1.6198289e-02]
 [-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
  -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
  -2.7282907e-02  1.6198289e-02]
 [-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
  -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
  -2.7282907e-02  1.6198289e-02]
 [-1.8569231e-03 -4.5436632e-02  3.3192519e-02  4.6544150e-04
  -5.4703467e-03  7.0324317e-03 -3.3323668e-02  4.0849220e-02
  -2.7282907e-02  1.6198289e-02]
 [-7.4841082e-05 -4.9949180e-02 -4.6701171e-02 -4.6553791e-02
   3.6232661e-02 -4.6381056e-02 -4.9087096e-02  9.3529820e-03
  -7.4069276e-03 -1.0302197e-02]
 [ 2.9061500e-02  1.8357638e-02  1.2882020e-02  3.6871884e-02
  -2.1024287e-02  4.1192200e-02 -1.2389503e-02  1.7648529e-02
  -4.4204559e-02 -4.3961033e-03]
 [ 2.4099674e-02 -2.4795676e-02  1.0108449e-02  1.2847807e