In [1]:
from tensorflow.keras.preprocessing.text import one_hot




In [2]:
### sentences
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good',]

In [3]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [4]:
## Define the vocabulary size
voc_size = 10000

In [5]:
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [6]:
### One Hot Representation
one_hot_repr = [one_hot(words,voc_size)for words in sent]   
one_hot_repr

# Converts words of each sentence into one hot encoded integers
# The output represents the index of each word in the vocabulary. For example, the first sentence "the glass of milk" is represented as [6186, 6775, 637, 4895] where each number corresponds to a unique word in the vocabulary. 6186 means in a vector of size 10000, the word "the" is represented by the index 6186 i.e 6186th index will be 1 and all other indices will be 0.

# This creates a sparse matrux which could lead to overfitting.
# To avoid this, we can use embedding layer which will convert these sparse vectors into dense vectors

[[8924, 2573, 5464, 3358],
 [8924, 2573, 5464, 2070],
 [8924, 6241, 5464, 7311],
 [9217, 8067, 2526, 7023, 8245],
 [9217, 8067, 2526, 7023, 9842],
 [2769, 8924, 7778, 5464, 3119],
 [1361, 8829, 3574, 7023]]

In [7]:
## word Embedding Representation

from tensorflow.keras.layers import Embedding
#from tensorflow.keras.processing.sequence import pad_sequences
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential

In [8]:
import numpy as np

In [None]:
sent_length = 8
embedded_docs = pad_sequences(one_hot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

# In each sentence there are a different number of words. To make them equal, we use padding. Here, we have taken the maximum length of a sentence as 8. If a sentence has less than 8 words, it will be padded with zeros at the beginning (pre-padding). If it has more than 8 words, it will be truncated to fit.

[[   0    0    0    0 8924 2573 5464 3358]
 [   0    0    0    0 8924 2573 5464 2070]
 [   0    0    0    0 8924 6241 5464 7311]
 [   0    0    0 9217 8067 2526 7023 8245]
 [   0    0    0 9217 8067 2526 7023 9842]
 [   0    0    0 2769 8924 7778 5464 3119]
 [   0    0    0    0 1361 8829 3574 7023]]


In [11]:
## feature representation
dim = 10

In [12]:
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile('adam', 'mse')

# model.add(Embedding(voc_size, dim, input_length=sent_length)) : Adds an Embedding layer, which converts word indices (integers) into dense word vectors. 
# voc_size is the size of the vocabulary, 	
# dim is the Embedding dimension — each word will be represented by a vector of length dim (e.g., 10 here).
# input_length is The length of each padded sequence (e.g., 8 here) — needed so the model knows the shape of its input.

# model.compile('adam', 'mse') : Compiles the model with the Adam optimizer and Mean Squared Error loss function.




In [13]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model.predict(embedded_docs)

# This gives the dense vector representation of each word in the sentences. Each word is now represented by a vector of size 10 (as specified by `dim`), which captures the semantic meaning of the words in a more compact form compared to one-hot encoding.



array([[[-0.01715923,  0.01986555,  0.00510715, -0.01948509,
          0.00451503,  0.03158509, -0.01334063, -0.00300102,
          0.03776613, -0.00146614],
        [-0.01715923,  0.01986555,  0.00510715, -0.01948509,
          0.00451503,  0.03158509, -0.01334063, -0.00300102,
          0.03776613, -0.00146614],
        [-0.01715923,  0.01986555,  0.00510715, -0.01948509,
          0.00451503,  0.03158509, -0.01334063, -0.00300102,
          0.03776613, -0.00146614],
        [-0.01715923,  0.01986555,  0.00510715, -0.01948509,
          0.00451503,  0.03158509, -0.01334063, -0.00300102,
          0.03776613, -0.00146614],
        [ 0.02685023, -0.02638841,  0.04074228,  0.00441874,
         -0.0208487 ,  0.02411045, -0.0335707 ,  0.0363767 ,
          0.02465141, -0.01287327],
        [-0.00339707,  0.03555569, -0.04781895,  0.01279496,
          0.01662549,  0.02712948,  0.02542276,  0.00889934,
          0.04381249,  0.03135285],
        [-0.03874917,  0.01256781, -0.0172983 , -0.0

In [None]:
embedded_docs[0]

# This is the padded output of first sentence " the glass of milk".

array([   0,    0,    0,    0, 8924, 2573, 5464, 3358])

In [None]:
model.predict(embedded_docs[0])

# This gives the dense vector representation of the first sentence. Each word in the sentence is now represented by a vector of size 10, which captures the semantic meaning of the words in a more compact form compared to one-hot encoding.



array([[ 0.01453609,  0.03697893,  0.02267558,  0.01149287, -0.03695335,
         0.01416664,  0.03655917,  0.00734384,  0.03028754,  0.00339943],
       [ 0.01453609,  0.03697893,  0.02267558,  0.01149287, -0.03695335,
         0.01416664,  0.03655917,  0.00734384,  0.03028754,  0.00339943],
       [ 0.01453609,  0.03697893,  0.02267558,  0.01149287, -0.03695335,
         0.01416664,  0.03655917,  0.00734384,  0.03028754,  0.00339943],
       [ 0.01453609,  0.03697893,  0.02267558,  0.01149287, -0.03695335,
         0.01416664,  0.03655917,  0.00734384,  0.03028754,  0.00339943],
       [-0.03792738,  0.01958679, -0.04232483, -0.03475742,  0.02182527,
         0.01143194, -0.03125288,  0.02584182,  0.0050171 ,  0.04725457],
       [-0.02213118,  0.00730393,  0.02797868, -0.02386508, -0.0024281 ,
         0.04419583, -0.02011771, -0.00502002, -0.03373672, -0.04126013],
       [-0.02629154,  0.02487988, -0.02824695,  0.0302802 , -0.01835672,
        -0.00683415,  0.01606056, -0.04426531