<a href="https://colab.research.google.com/github/SaurabhSRP/POC-NLP-conceptProjects/blob/main/NLP_Word_Embedding_idea_explained_with_simple_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Recommended reading on word embeddings: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [1]:
# Requires TensorFlow 2.x.
# one_hot is a function, not a dictionary: per the Keras docs it hashes each
# word of a text into an integer index in the range [1, n).
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences used by the cells that follow.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Echo the corpus so its contents are visible in the cell output.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
# Vocabulary size: handed to one_hot as the size of the index space and to
# the Embedding layer as its input dimension.
voc_size = 10_000

# **One-Hot Representation**

In [6]:
# Turn every sentence into a list of integer word indices, one index per
# word, each drawn from an index space of size voc_size.
# NOTE: per the Keras docs one_hot assigns indices by hashing, so two
# different words are not guaranteed to receive different indices.
onehot_repr = [
    one_hot(sentence, voc_size)
    for sentence in sent
]
onehot_repr

[[3305, 6572, 2329, 5943],
 [3305, 6572, 2329, 6245],
 [3305, 4792, 2329, 9216],
 [2180, 6652, 7516, 6860, 7753],
 [2180, 6652, 7516, 6860, 49],
 [2667, 3305, 1559, 2329, 9754],
 [4886, 8867, 4580, 6860]]

# **Word Embedding Representation**

In [7]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences ##used to bring all sentences in common dimensions
from tensorflow.keras.models import Sequential

In [8]:
import numpy as np

In [9]:
# The sentences contain different numbers of words, but the Embedding layer
# below is given a fixed input length. pad_sequences brings every index list
# to sent_length entries; padding='pre' puts the zeros at the front.
sent_length = 8
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

# Each row of the printed array now has exactly eight entries.

[[   0    0    0    0 3305 6572 2329 5943]
 [   0    0    0    0 3305 6572 2329 6245]
 [   0    0    0    0 3305 4792 2329 9216]
 [   0    0    0 2180 6652 7516 6860 7753]
 [   0    0    0 2180 6652 7516 6860   49]
 [   0    0    0 2667 3305 1559 2329 9754]
 [   0    0    0    0 4886 8867 4580 6860]]


In [10]:
# Embedding dimension: how many values are used to represent each word.
dim = 10

In [11]:
# Build a model consisting of a single Embedding layer:
#   voc_size     -> number of distinct word indices the layer can look up
#   dim          -> length of the vector produced for each word (defined in
#                   the preceding cell; previously hard-coded as 10 here)
#   input_length -> number of indices per padded sentence
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
# Optimizer and loss are set here, although no training call appears in the
# visible cells of this notebook.
model.compile('adam', 'mse')

In [12]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Run the padded index matrix through the model to get one vector per
# position of every sentence. No training call appears in the visible cells,
# so these values are presumably the layer's initial weights.
print(model.predict(x=embedded_docs))

[[[ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034
   -0.01722948  0.02113691  0.01599536  0.034605   -0.02533269]
  [ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034
   -0.01722948  0.02113691  0.01599536  0.034605   -0.02533269]
  [ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034
   -0.01722948  0.02113691  0.01599536  0.034605   -0.02533269]
  [ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034
   -0.01722948  0.02113691  0.01599536  0.034605   -0.02533269]
  [-0.03573041  0.03165773  0.00456451  0.001142   -0.03596716
   -0.01891023 -0.0100862  -0.01542521 -0.02220854 -0.01651694]
  [-0.01588271  0.03508813 -0.04255356 -0.01058732 -0.04496762
   -0.00358163  0.03663936 -0.03691731  0.02633878  0.04447743]
  [ 0.01181983  0.01429582  0.03265263  0.02338098 -0.01742543
   -0.04166339 -0.02116196 -0.0027596   0.01745507  0.00632004]
  [-0.02448467 -0.01265135 -0.01422969 -0.04933825 -0.02244031
    0.01846318  0.0416587  -0.00886778 -0.013826

In [14]:
# First padded sentence: eight integer entries, four leading zeros of
# padding followed by the four word indices.
embedded_docs[0]

array([   0,    0,    0,    0, 3305, 6572, 2329, 5943], dtype=int32)

In [16]:
# Embedding output for the first sentence only: eight rows (one per
# position), each a vector of 10 values, matching the embedding dimension.
# The four padding positions all hold index 0, so their rows are identical.
print(model.predict(x=embedded_docs)[0])

[[ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034 -0.01722948
   0.02113691  0.01599536  0.034605   -0.02533269]
 [ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034 -0.01722948
   0.02113691  0.01599536  0.034605   -0.02533269]
 [ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034 -0.01722948
   0.02113691  0.01599536  0.034605   -0.02533269]
 [ 0.03828437 -0.0307813   0.02732508  0.04613396  0.01720034 -0.01722948
   0.02113691  0.01599536  0.034605   -0.02533269]
 [-0.03573041  0.03165773  0.00456451  0.001142   -0.03596716 -0.01891023
  -0.0100862  -0.01542521 -0.02220854 -0.01651694]
 [-0.01588271  0.03508813 -0.04255356 -0.01058732 -0.04496762 -0.00358163
   0.03663936 -0.03691731  0.02633878  0.04447743]
 [ 0.01181983  0.01429582  0.03265263  0.02338098 -0.01742543 -0.04166339
  -0.02116196 -0.0027596   0.01745507  0.00632004]
 [-0.02448467 -0.01265135 -0.01422969 -0.04933825 -0.02244031  0.01846318
   0.0416587  -0.00886778 -0.01382625  0.01050168]]