In [26]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

Word Embedding
There are two techniques for creating word embeddings
1. supervised learning
2. self supervised learning techniques such as word2vec, glove
In this notebook we will do first technique of supervised learning using simple food review classification and see how word embeddings are calculated while solving that problem

In [27]:
#let's create a simple embedding for food reviews
#Why Embedding ?
#what are the advantages of Embedding ?
reviews = ['nice food',
          'amazing restaurant',
          'too good',
          'just loved it',
          'will go again',
          'horrible food',
          'never go there',
          'poor service',
          'poor quality',
          'needs improvement']
sentiment = np.array([1,1,1,1,1,0,0,0,0,0])

In [28]:
#step-1: convert words into one-hot encoding

#for that we need the size of the vocabulary
all_words = []
for review in reviews:
    all_words.extend(review.split())
    
#extract unique words
unique_words = list(set(all_words))

#get the length of the unique words
vocab_size = len(unique_words)

print(vocab_size)

# encoded_reviews = [one_hot(review, vocab_size) for review in reviews]
# encoded_reviews

#create the Tokenizer
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews)

#get the encoded sequences
encoded_reviews = tokenizer.texts_to_sequences(reviews)
encoded_reviews

20


[[4, 1],
 [5, 6],
 [7, 8],
 [9, 10, 11],
 [12, 2, 13],
 [14, 1],
 [15, 2, 16],
 [3, 17],
 [3, 18],
 [19]]

In [29]:
max_length = 3
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
padded_reviews

array([[ 4,  1,  0],
       [ 5,  6,  0],
       [ 7,  8,  0],
       [ 9, 10, 11],
       [12,  2, 13],
       [14,  1,  0],
       [15,  2, 16],
       [ 3, 17,  0],
       [ 3, 18,  0],
       [19,  0,  0]], dtype=int32)

In [30]:
embedded_vector_size = 5

model = Sequential()
model.add(Embedding(vocab_size, embedded_vector_size, input_length = max_length, name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [31]:
X = padded_reviews
y = sentiment

In [32]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 5)              100       
                                                                 
 flatten_1 (Flatten)         (None, 15)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 16        
                                                                 
Total params: 116
Trainable params: 116
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x7ff3131bb6d0>

In [34]:
#evaluate the accuracy
loss, accuracy = model.evaluate(X, y)



In [35]:
weights = model.get_layer('embedding').get_weights()[0]
print(len(weights))
print(weights)

20
[[ 0.0716628  -0.09954268 -0.03390084  0.09419224  0.05571979]
 [ 0.01435196  0.01547963 -0.00504182 -0.02935691 -0.00208431]
 [ 0.00291872 -0.00704472 -0.04038826  0.01525257 -0.03483064]
 [-0.00639649 -0.09466038 -0.07559213 -0.0681598   0.07046977]
 [ 0.05348936  0.02487477  0.07024904  0.07723635 -0.02511389]
 [ 0.06718577  0.04029544  0.02720521  0.02474062 -0.02246781]
 [-0.08267525  0.0319889   0.03807167 -0.03400251  0.08244995]
 [ 0.0649171   0.07585292  0.05543591  0.00757449 -0.06787428]
 [-0.02492717  0.06892437  0.08207031 -0.0721574   0.09007613]
 [ 0.02570036  0.08238337  0.08103435  0.03068156 -0.03278252]
 [-0.07485533  0.05879176  0.09615874 -0.01727545  0.01173386]
 [ 0.00785985  0.02833569 -0.07220946  0.00291598 -0.00085939]
 [ 0.02416662  0.0099695   0.00347817  0.03642908 -0.08789387]
 [ 0.06555734  0.04334445 -0.06891023  0.05799097 -0.03539651]
 [-0.06509853 -0.09145979 -0.09027926 -0.05491392  0.0018347 ]
 [-0.0228709  -0.10464235 -0.09699249 -0.02976553  0

In [36]:
weights[4]

array([ 0.05348936,  0.02487477,  0.07024904,  0.07723635, -0.02511389],
      dtype=float32)

In [37]:
weights[5]

array([ 0.06718577,  0.04029544,  0.02720521,  0.02474062, -0.02246781],
      dtype=float32)

In [None]:
#next:: add pca and see how it classifies the word embedding