<a href="https://colab.research.google.com/github/Shivank-thapa/Keras_Test/blob/main/WordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Word embedding techniques using the Embedding layer in Keras**

In [1]:
from tensorflow.keras.preprocessing.text import one_hot

In [2]:
# Toy corpus: seven short sentences that the rest of the notebook encodes.
sentences = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]

In [3]:
# Size of the index space that words are encoded into.
vocab_size = 10_000

##### One-hot representation (integer word indices)

In [5]:
# Encode every sentence as a list of integer word indices bounded by
# vocab_size. Keras' one_hot does not guarantee a unique index per word,
# so two different words may end up sharing an index.
onehot_repr = list(map(lambda text: one_hot(text, vocab_size), sentences))

# one list of indices per sentence, in the original sentence order
print(onehot_repr)

[[7217, 8021, 9952, 2121], [7217, 8021, 9952, 2494], [7217, 9634, 9952, 6981], [1973, 9029, 39, 1246, 4036], [1973, 9029, 39, 1246, 9541], [8665, 7217, 3508, 9952, 8069], [4566, 104, 9188, 1246]]


##### Word embedding representation

In [6]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

In [7]:
# Common length every encoded sentence is brought to.
sent_length = 8
# Left-pad ('pre') each index list with zeros so that all sentences share the
# same length and the batch forms a rectangular array.
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[   0    0    0    0 7217 8021 9952 2121]
 [   0    0    0    0 7217 8021 9952 2494]
 [   0    0    0    0 7217 9634 9952 6981]
 [   0    0    0 1973 9029   39 1246 4036]
 [   0    0    0 1973 9029   39 1246 9541]
 [   0    0    0 8665 7217 3508 9952 8069]
 [   0    0    0    0 4566  104 9188 1246]]


In [8]:
# number of dimensions (features) intended for each word's embedding vector
dim = 10

In [9]:
# Build a model made of a single Embedding layer that maps every word index
# of a padded sentence to a dense feature vector.
model = Sequential()
# input_dim   -> size of the index space (vocab_size)
# output_dim  -> taken from `dim`, so the embedding size is configured in one place
# input_length -> fixed number of tokens per (padded) sentence
model.add(Embedding(input_dim=vocab_size, output_dim=dim, input_length=sent_length))

In [10]:
# Configure the model with the Adam optimizer and mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [11]:
# show the model's layers with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Look up the embedding vector for every token of every padded sentence and
# print the whole result (one 8 x 10 matrix per sentence).
print(model.predict(embedded_docs))

In [13]:
# padded index sequence of the first sentence
embedded_docs[0]

array([   0,    0,    0,    0, 7217, 8021, 9952, 2121], dtype=int32)

In [14]:
# Embedding vectors for the first sentence only: one row per token position.
# The leading rows are identical because they all correspond to the padding
# index 0. No training step appears in this notebook, so presumably these are
# the layer's initial weights.
print(model.predict(embedded_docs)[0])

[[ 0.02055169  0.00684401 -0.0043185   0.02364429 -0.02539853 -0.03732651
  -0.0246421  -0.01053581  0.0397105  -0.00724469]
 [ 0.02055169  0.00684401 -0.0043185   0.02364429 -0.02539853 -0.03732651
  -0.0246421  -0.01053581  0.0397105  -0.00724469]
 [ 0.02055169  0.00684401 -0.0043185   0.02364429 -0.02539853 -0.03732651
  -0.0246421  -0.01053581  0.0397105  -0.00724469]
 [ 0.02055169  0.00684401 -0.0043185   0.02364429 -0.02539853 -0.03732651
  -0.0246421  -0.01053581  0.0397105  -0.00724469]
 [-0.03332315  0.02101848  0.04549983  0.0036564  -0.03582742  0.00945202
   0.03817279 -0.0057795  -0.02406625 -0.00071912]
 [-0.00845493  0.0167082  -0.04144489  0.01511389 -0.03229548  0.04999883
  -0.04209101  0.04976979 -0.01102878 -0.00756145]
 [ 0.04504846 -0.01315738  0.03152053 -0.02955397  0.04453244 -0.04164732
   0.0173764   0.02215953 -0.03403151  0.00422977]
 [-0.03094018 -0.01310843 -0.02114939 -0.01808636  0.0221723  -0.00789423
  -0.0244459  -0.02609177  0.00913771 -0.03091261]]