### Word Embedding Techniques using Embedding Layers in Keras

In [3]:
from tensorflow.keras.preprocessing.text import one_hot

In [4]:
#sentences
sentences = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'Understand the meaning of words',
    'Your videos are good'
]

In [5]:
sentences

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'Understand the meaning of words',
 'Your videos are good']

In [6]:
# Vocabulary size
vocab_size = 10000

### One Hot Representation

In [7]:
onehot_repr = [one_hot(sent,vocab_size)for sent in sentences]
print(onehot_repr)

[[5582, 9050, 15, 2054], [5582, 9050, 15, 3497], [5582, 2066, 15, 1762], [8090, 9910, 8481, 9935, 3076], [8090, 9910, 8481, 9935, 7868], [2987, 5582, 3061, 15, 8320], [4672, 480, 3980, 9935]]


### Word Embedding Representation

In [8]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### We have Embedding as layer, Sequential as model

Also,we have pad_sequences bcz in order to generate better feature vector,all sentences should have same no. of words,so padding is applied to do so

In [9]:
import numpy as np

In [10]:
sent_length = 8 #defining padded sentence length (8 words)
## 1st argument is vector to be padded, 2nd is padding technique to be applied,
## 3rd is max_length of padded sentence
## pre menas padding is done pre(before words)
##post would have meant padding after words
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)

print(embedded_docs)


[[   0    0    0    0 5582 9050   15 2054]
 [   0    0    0    0 5582 9050   15 3497]
 [   0    0    0    0 5582 2066   15 1762]
 [   0    0    0 8090 9910 8481 9935 3076]
 [   0    0    0 8090 9910 8481 9935 7868]
 [   0    0    0 2987 5582 3061   15 8320]
 [   0    0    0    0 4672  480 3980 9935]]


In [11]:
## no. of features to be considered for each word
dim = 15

In [15]:
#initialise a sequential model
model = Sequential()

## add an embedding layer to our model
## we have to specify vocab_size, no. of features and input sentence length
model.add(Embedding(vocab_size,dim,input_length=sent_length))

## then we compile with adam optimiser considering MSE performance
model.compile('adam','mse')

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 15)             150000    
                                                                 
Total params: 150,000
Trainable params: 150,000
Non-trainable params: 0
_________________________________________________________________


In [17]:
## to see fetaure vector of our sentences
model.predict(embedded_docs)

array([[[ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,
          0.00923711, -0.02231016,  0.0452461 ,  0.00674402,
          0.04629923,  0.01603715,  0.04306186, -0.03126876,
          0.04457512,  0.01926822,  0.02409092],
        [ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,
          0.00923711, -0.02231016,  0.0452461 ,  0.00674402,
          0.04629923,  0.01603715,  0.04306186, -0.03126876,
          0.04457512,  0.01926822,  0.02409092],
        [ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,
          0.00923711, -0.02231016,  0.0452461 ,  0.00674402,
          0.04629923,  0.01603715,  0.04306186, -0.03126876,
          0.04457512,  0.01926822,  0.02409092],
        [ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,
          0.00923711, -0.02231016,  0.0452461 ,  0.00674402,
          0.04629923,  0.01603715,  0.04306186, -0.03126876,
          0.04457512,  0.01926822,  0.02409092],
        [ 0.02024851, -0.03414328,  0.04631389, -0.01571766,
         -0

In [20]:
#feature rep of 1st sentence's words(padded matrix)

model.predict(embedded_docs)[0]   #each word is changed to a vector of size= no. of features

array([[ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,  0.00923711,
        -0.02231016,  0.0452461 ,  0.00674402,  0.04629923,  0.01603715,
         0.04306186, -0.03126876,  0.04457512,  0.01926822,  0.02409092],
       [ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,  0.00923711,
        -0.02231016,  0.0452461 ,  0.00674402,  0.04629923,  0.01603715,
         0.04306186, -0.03126876,  0.04457512,  0.01926822,  0.02409092],
       [ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,  0.00923711,
        -0.02231016,  0.0452461 ,  0.00674402,  0.04629923,  0.01603715,
         0.04306186, -0.03126876,  0.04457512,  0.01926822,  0.02409092],
       [ 0.03178154, -0.02516034,  0.0425708 , -0.02583668,  0.00923711,
        -0.02231016,  0.0452461 ,  0.00674402,  0.04629923,  0.01603715,
         0.04306186, -0.03126876,  0.04457512,  0.01926822,  0.02409092],
       [ 0.02024851, -0.03414328,  0.04631389, -0.01571766, -0.04976541,
         0.03997058,  0.00209851, -0.04012892, 