In [17]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1: Load word index mapping
word_index = imdb.get_word_index()

# Step 2: Shift indices to account for special tokens
word_index = {k: (v + 3) for k, v in word_index.items()}

word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

# Step 3: Reverse index to get words from IDs
reverse_word_index = {value: key for key, value in word_index.items()}

# Step 4: Load GloVe vectors
embedding_index = {}
embedding_dim = 100

with open("../GloVe/glove.6B.100d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

# Step 5: Create embedding matrix for the words in the IMDb dataset
vocab_size = 10000
embedding_matrix = np.zeros((vocab_size, embedding_dim))
# print("embedding_matrix: ", embedding_matrix)
for i in range(4, vocab_size):  # skip special tokens
    word = reverse_word_index.get(i, None)
    if word:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=500,
                    trainable=False))  # Freeze embeddings

model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1117377 (4.26 MB)
Trainable params: 117377 (458.50 KB)
Non-trainable params: 1000000 (3.81 MB)
_________________________________________________________________


2025-05-23 12:25:21.901549: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [19]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Pad sequences
x_train = pad_sequences(x_train, maxlen=500, padding='post')
x_test = pad_sequences(x_test, maxlen=500, padding='post')

Each review have different length. So we are going to use `pad_sequences()` function to ensure that all sequences are of the same length (`max_length=200`) by adding zeros at the end (`padding='post'`).

In [20]:
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=128,
                    validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.5186


First we are getting `word and index` by using function `get_word_index()`. Those are the top 10000 words from imbd reviews. So we can use each word for getting that perticular words vectors from glove embedding and store it as key and vector as value in `embedding_matrix`. Because we are going to pass that embedding_matrix dictionary to the `Embedding()` layer.


#### Why do we pass embedding_matrix to Embedding()?
The `Embedding()` layer in Keras is responsible for converting each word (represented by an integer index) into a dense vector of fixed size (like 100 dimensions). But instead of learning these word vectors from scratch, we can use pre-trained word vectors like GloVe `(embedding_matrix in our case)` — which are already trained on a huge dataset and capture word meanings well.


When you pass `weights=[embedding_matrix]` into the Embedding layer, you're telling the model:
"Use these pre-trained GloVe vectors instead of learning them from scratch."