In [None]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# 1. Load and Prepare the Dataset

# Vocabulary cap passed to imdb.load_data(num_words=...); the Embedding layer
# is sized with the same value.
max_words = 10000

# Target length passed to pad_sequences; the padded arrays come out with
# shape (samples, maxlen).
maxlen = 256

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=max_words,
)
print(f'{len(train_data)} train sequences')
print(f'{len(test_data)} test sequences')

print('Pad sequences (samples x time)')
train_data = pad_sequences(train_data, maxlen=maxlen)
test_data = pad_sequences(test_data, maxlen=maxlen)
print(f'train_data shape: {train_data.shape}')
print(f'test_data shape: {test_data.shape}')

# Cast the labels to float32 arrays. They are the targets for the binary
# cross-entropy loss, not inputs to the network.
train_labels = np.asarray(train_labels, dtype='float32')
test_labels = np.asarray(test_labels, dtype='float32')


# 2. Build the Model

embedding_dim = 16  # Size of each learned word vector

model = Sequential()
# Embedding layer: maps each word index (vocabulary size max_words) to a dense
# vector of length embedding_dim. The deprecated `input_length` argument is
# intentionally omitted: Keras 3 warns about it, and no layer below needs a
# static sequence length because the pooling layer collapses the time axis.
model.add(Embedding(max_words, embedding_dim))
# Global average pooling: averages the embeddings across the sequence
# dimension, producing one embedding_dim-sized vector per review.
# NOTE(review): no mask is configured, so padded positions presumably count
# toward the average -- consider Embedding(..., mask_zero=True); confirm.
model.add(GlobalAveragePooling1D())
# Dense layers for classification
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# 3. Compile the Model

# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# the metric reported alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 4. Train the Model

# Hold out the first `val_size` training samples for validation and fit on
# the remainder. Using one constant for all four slices keeps the data and
# label splits aligned.
val_size = 10000

x_val = train_data[:val_size]
partial_x_train = train_data[val_size:]

y_val = train_labels[:val_size]
partial_y_train = train_labels[val_size:]


print('Training the model...')
# `history` keeps the object returned by fit() for later inspection.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=10,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)  # Show training progress

# 5. Evaluate the Model

print('Evaluating the model...')
# evaluate() yields the loss followed by the compiled metric (accuracy here);
# verbose=0 suppresses the progress bar.
loss, accuracy = model.evaluate(x=test_data, y=test_labels, verbose=0)

print(f'Test Loss: {loss:.4f}', f'Test Accuracy: {accuracy:.4f}', sep='\n')


25000 train sequences
25000 test sequences
Pad sequences (samples x time)
train_data shape: (25000, 256)
test_data shape: (25000, 256)
Training the model...
Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.5484 - loss: 0.6916 - val_accuracy: 0.7109 - val_loss: 0.6837
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.6866 - loss: 0.6794 - val_accuracy: 0.6856 - val_loss: 0.6667
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7227 - loss: 0.6571 - val_accuracy: 0.7525 - val_loss: 0.6367
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7659 - loss: 0.6224 - val_accuracy: 0.7788 - val_loss: 0.5949
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.7891 - loss: 0.5760 - val_accuracy: 0.7901 - val_loss: 0.5499
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━