> ## Sentement Analysis - Embeddings

In [1]:
# Importing necessary libraries
import numpy as np
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding

In [2]:
# Downloading the dataset
from keras.datasets import imdb
(X_train,y_train),(X_test,y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


> *Make sure to `pad the documents` before passing it into the `embeddings`*

In [3]:
# Padding
X_train = pad_sequences(X_train,padding='post',maxlen=50)
X_test = pad_sequences(X_test,padding='post',maxlen=50)

X_train.shape

(25000, 50)

> When you set input_dim=10000 in your Embedding layer, it means:
<br>
> - The Embedding layer is designed to look up embeddings for integer indices from 0 up to 9999 (i.e., input_dim - 1). It allocates a unique embedding vector for each of these 10,000 possible indices.
<br>
> - It will NOT automatically create embeddings for words with indices 10000 or higher.

> So, if your dictionary has 17,000 words, and your input_dim is 10,000, it means you are intentionally or unintentionally limiting the vocabulary that your model will learn embeddings for.

In [4]:
model = Sequential()

# The Embedding layer implicitly acts as your input layer when you don't explicitly add an Input layer beforehand.
model.add(Input(shape=(50,), dtype='int32')) # 'int32' because embeddings take integer indices
model.add(Embedding(10000, 2))
model.add(SimpleRNN(32, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [5]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train,epochs=5,validation_data=(X_test,y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - acc: 0.5549 - loss: 0.6627 - val_acc: 0.7792 - val_loss: 0.4863
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - acc: 0.8240 - loss: 0.4009 - val_acc: 0.7933 - val_loss: 0.4559
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - acc: 0.8595 - loss: 0.3419 - val_acc: 0.7988 - val_loss: 0.4374
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - acc: 0.8795 - loss: 0.3044 - val_acc: 0.7982 - val_loss: 0.4656
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - acc: 0.8927 - loss: 0.2764 - val_acc: 0.7846 - val_loss: 0.4959


In [6]:
# Model Testing
test_review = "The storyline of the movie is pretty good but the performance of the actress is not that effective"

# Encoding
test_review_encoded, i = [], 0
dictonary = imdb.get_word_index(path="imdb_word_index.json")
for word in test_review.split():
    if i == 50: break
    if word in dictonary.keys():
        test_review_encoded.append(dictonary[word])
    else:
        test_review_encoded.append(0) # oov_word
    i += 1

test_review_encoded

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


[0, 766, 4, 1, 17, 6, 181, 49, 18, 1, 236, 4, 1, 521, 6, 21, 12, 1131]

In [7]:
# Padding
test_review_encoded = pad_sequences(np.array(test_review_encoded).reshape(1, -1), maxlen = 50, padding = "post") # Make sure to pass 2D list/array into pad_sequences
test_review_encoded

array([[   0,  766,    4,    1,   17,    6,  181,   49,   18,    1,  236,
           4,    1,  521,    6,   21,   12, 1131,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]], dtype=int32)

In [8]:
# Classification
probab_positive_class = model.predict(test_review_encoded)
if (probab_positive_class > 0.5):
    print("Positive Sentement: ", probab_positive_class)
else:
    print("Negative Sentement: ", 1 - probab_positive_class)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 508ms/step
Positive Sentement:  [[0.8667216]]
