In [30]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
import numpy as np
from numpy.linalg import norm

# Toy sentiment corpus: each entry pairs a short review with its label
# (1 = positive, 0 = negative).
labelled_reviews = [
    ('nice service', 1),
    ('amazing restaurant', 1),
    ('too good', 1),
    ('just loved it!', 1),
    ('will go again', 1),
    ('horrible service', 0),
    ('never go there', 0),
    ('poor service', 0),
    ('poor quality', 0),
    ('needs improvement', 0),
]

# Split the pairs into two aligned arrays: the texts and their labels.
reviews = np.array([text for text, _ in labelled_reviews])

sentiment = np.array([label for _, label in labelled_reviews])

In [14]:
# Fit a word-level tokenizer on the review corpus.
# vocab_size is passed as num_words and is reused later as the number of
# rows of the embedding layer.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews)

In [16]:
# Encode each review as a list of integer word indices taken from the fitted tokenizer.
encoded_review = tokenizer.texts_to_sequences(reviews)
# Display the encoded reviews (one list of indices per review).
encoded_review

[[4, 1],
 [5, 6],
 [7, 8],
 [9, 10, 11],
 [12, 2, 13],
 [14, 1],
 [15, 2, 16],
 [3, 1],
 [3, 17],
 [18, 19]]

In [17]:
# Bring every encoded review to the same length by appending zeros
# (padding='post') so the reviews can be stacked into one 2-D array.
max_length = 4
padded_review = pad_sequences(
    encoded_review,
    maxlen=max_length,
    padding='post',
)
# Display the padded matrix (one row per review).
padded_review

array([[ 4,  1,  0,  0],
       [ 5,  6,  0,  0],
       [ 7,  8,  0,  0],
       [ 9, 10, 11,  0],
       [12,  2, 13,  0],
       [14,  1,  0,  0],
       [15,  2, 16,  0],
       [ 3,  1,  0,  0],
       [ 3, 17,  0,  0],
       [18, 19,  0,  0]])

In [18]:
# Local import so this cell stays runnable on its own; Input comes from the
# same tensorflow.keras package the notebook already uses.
from tensorflow.keras import Input

# Number of components in each learned word vector.
embeded_vector_size = 6

# Model: token indices -> embedding lookup -> flatten -> single sigmoid unit
# producing the probability that a review is positive.
model = Sequential()
# Declare the input shape with an explicit Input layer. Keras 3 deprecates the
# Embedding `input_length` argument (it only triggers a warning there), and
# without a declared input shape the model would stay unbuilt until fit(),
# leaving the pre-training model.summary() without shapes or parameter counts.
model.add(Input(shape=(max_length,)))
model.add(Embedding(vocab_size, output_dim=embeded_vector_size, name="embedding"))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))



In [19]:
# Conventional names for the training inputs (padded reviews) and targets (labels).
X, y = padded_review, sentiment

In [20]:
# Configure training: binary cross-entropy loss for the 0/1 labels,
# the Adam optimizer, and accuracy reported as a metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary of the model.
model.summary()

In [21]:
# Train on the whole toy corpus for 50 epochs; verbose=2 logs one line per epoch.
model.fit(
    x=X,
    y=y,
    epochs=50,
    verbose=2,
)

Epoch 1/50
1/1 - 1s - 701ms/step - accuracy: 0.5000 - loss: 0.6899
Epoch 2/50
1/1 - 0s - 26ms/step - accuracy: 0.5000 - loss: 0.6883
Epoch 3/50
1/1 - 0s - 35ms/step - accuracy: 0.5000 - loss: 0.6868
Epoch 4/50
1/1 - 0s - 24ms/step - accuracy: 0.6000 - loss: 0.6852
Epoch 5/50
1/1 - 0s - 18ms/step - accuracy: 0.7000 - loss: 0.6836
Epoch 6/50
1/1 - 0s - 27ms/step - accuracy: 0.9000 - loss: 0.6820
Epoch 7/50
1/1 - 0s - 19ms/step - accuracy: 0.9000 - loss: 0.6805
Epoch 8/50
1/1 - 0s - 50ms/step - accuracy: 0.9000 - loss: 0.6789
Epoch 9/50
1/1 - 0s - 30ms/step - accuracy: 0.9000 - loss: 0.6773
Epoch 10/50
1/1 - 0s - 24ms/step - accuracy: 0.9000 - loss: 0.6757
Epoch 11/50
1/1 - 0s - 24ms/step - accuracy: 0.9000 - loss: 0.6741
Epoch 12/50
1/1 - 0s - 27ms/step - accuracy: 0.9000 - loss: 0.6725
Epoch 13/50
1/1 - 0s - 26ms/step - accuracy: 0.9000 - loss: 0.6709
Epoch 14/50
1/1 - 0s - 24ms/step - accuracy: 0.9000 - loss: 0.6693
Epoch 15/50
1/1 - 0s - 24ms/step - accuracy: 0.9000 - loss: 0.6677
Epo

<keras.src.callbacks.history.History at 0x252ce0b9c50>

In [22]:
# Print the model summary again, this time after training has run.
model.summary()

In [23]:
# Score the model on the same reviews it was trained on, so the reported
# accuracy describes fit to the training data rather than generalization.
loss, accuracy = model.evaluate(
    x=padded_review,
    y=sentiment,
    verbose=2,
)

1/1 - 0s - 233ms/step - accuracy: 1.0000 - loss: 0.6035


In [24]:
# Fetch the layer named "embedding" and take its first weight array,
# i.e. the lookup matrix with one row per token index.
embedding_layer = model.get_layer('embedding')
embedding_weights = embedding_layer.get_weights()[0]
# Number of rows in the matrix.
len(embedding_weights)

10000

In [10]:
# Inspect a single, arbitrary row of the embedding matrix.
# NOTE(review): the encoded reviews shown earlier only use indices 1-19, so row
# 6179 does not appear to correspond to any word of this corpus.
# NOTE(review): the recorded output has 5 components although
# embeded_vector_size is 6, so it looks stale (left over from an earlier run);
# re-run the notebook top to bottom to refresh it.
embedding_weights[6179]

array([-0.09574086,  0.10009015,  0.10751449, -0.0836001 ,  0.01544824],
      dtype=float32)

In [25]:
# Build a word -> embedding-vector lookup: each word's tokenizer index selects
# the matching row of the embedding matrix.
# tokenizer.word_index lists every word seen while fitting, even those ranked
# beyond num_words, so on a larger corpus an index can fall outside the
# embedding matrix. Those words are skipped instead of raising IndexError; the
# tokenizer also leaves them out of the encoded sequences, so they have no
# trained vector to export anyway.
w2v_my = {
    word: embedding_weights[index]
    for word, index in tokenizer.word_index.items()
    if index < len(embedding_weights)
}

In [26]:
# Display the full word -> embedding-vector dictionary.
w2v_my

{'service': array([-0.05742276, -0.00824009,  0.03412312,  0.06828044,  0.02239429,
        -0.04595489], dtype=float32),
 'go': array([-0.00414472,  0.01964854, -0.02402994, -0.03045799,  0.03911931,
        -0.01932709], dtype=float32),
 'poor': array([-0.0481586 , -0.04493286, -0.05551243,  0.10930685,  0.02118309,
        -0.0944111 ], dtype=float32),
 'nice': array([ 0.01163845,  0.07250951,  0.0027564 , -0.06915941, -0.08880052,
         0.0388923 ], dtype=float32),
 'amazing': array([ 0.02783069,  0.09771395,  0.08737823, -0.0732104 , -0.02370714,
         0.03525441], dtype=float32),
 'restaurant': array([ 0.0364366 ,  0.01414193, -0.05506751, -0.08960474,  0.01263289,
         0.09589031], dtype=float32),
 'too': array([ 0.03779981,  0.02774276,  0.02058887, -0.09975199, -0.02492266,
         0.06192296], dtype=float32),
 'good': array([ 0.07828788,  0.10118476, -0.09585537, -0.04963742,  0.07117011,
         0.02387343], dtype=float32),
 'just': array([ 0.04325438,  0.0993533

In [27]:
# Print the learned embedding vector for the word 'good'.
print(w2v_my['good'])

[ 0.07828788  0.10118476 -0.09585537 -0.04963742  0.07117011  0.02387343]


In [28]:
# Print the learned embedding vector for the word 'amazing'.
print(w2v_my['amazing'])

[ 0.02783069  0.09771395  0.08737823 -0.0732104  -0.02370714  0.03525441]


In [33]:
# Cosine similarity between the learned vectors of 'good' and 'loved':
# their dot product divided by the product of their Euclidean norms.
good_vec = w2v_my['good']
loved_vec = w2v_my['loved']
cosine = np.dot(good_vec, loved_vec) / (norm(good_vec) * norm(loved_vec))
# Display the similarity value.
cosine

0.8365706

In [35]:
## Save weights of embeddings
# Local import so this cell stays runnable on its own.
import os

# np.savez does not create missing parent folders, so make sure the target
# directory exists first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs('./saved_weights', exist_ok=True)

# Each word becomes the name of one array inside the .npz archive.
# NOTE(review): words are passed as keyword arguments, so a vocabulary word
# equal to one of np.savez's own parameter names (e.g. "file") would clash.
np.savez('./saved_weights/embeddings.npz', **w2v_my)

In [38]:
# Re-open the archive written by the save cell to check that the embeddings
# round-trip through disk.
# NOTE(review): the object returned by np.load for an .npz archive is left open
# here; presumably later cells keep using it — otherwise consider closing it.
load_embeddings = np.load('./saved_weights/embeddings.npz')

# check existing words (keys): .files lists the array names stored in the archive
print(load_embeddings.files)

# get embeddings for some specific word (arrays are looked up by word)
print(load_embeddings['amazing'])

['service', 'go', 'poor', 'nice', 'amazing', 'restaurant', 'too', 'good', 'just', 'loved', 'it', 'will', 'again', 'horrible', 'never', 'there', 'quality', 'needs', 'improvement']
[ 0.02783069  0.09771395  0.08737823 -0.0732104  -0.02370714  0.03525441]
