# DAY 20 — Attention Mechanisms

## Overview

This notebook demonstrates how Attention helps a model identify useful words in a sequence.
We apply attention on top of a Bidirectional LSTM for IMDB sentiment classification.

## Load & Prepare Data

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models, datasets, preprocessing, callbacks
import numpy as np
import matplotlib.pyplot as plt

tf.random.set_seed(42)


## Load & Prepare Data

In [2]:
vocab_size = 10000
max_len = 200

(x_train, y_train), (x_test, y_test) = datasets.imdb.load_data(num_words=vocab_size)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=max_len)

print("Train shape:", x_train.shape)


Train shape: (25000, 200)


## Define Custom Attention Layer

In [6]:
class AttentionLayer(layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.score_dense = layers.Dense(1)

    def call(self, lstm_output, mask=None):
        score = self.score_dense(lstm_output)          # (batch, timesteps, 1)
        weights = tf.nn.softmax(score, axis=1)        # normalize across timesteps
        context = tf.reduce_sum(weights * lstm_output, axis=1)  # (batch, features)
        return context


## Build Attention-Based Model

In [7]:
inputs = layers.Input(shape=(max_len,))
x = layers.Embedding(vocab_size, 128)(inputs)
lstm_out = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)

context = AttentionLayer()(lstm_out)
outputs = layers.Dense(1, activation='sigmoid')(context)

model = models.Model(inputs, outputs)
model.summary()


## Train the Model

In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

cb = [callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)]

history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=10, batch_size=128, callbacks=cb
)


Epoch 1/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 1s/step - accuracy: 0.7956 - loss: 0.4147 - val_accuracy: 0.8748 - val_loss: 0.2997
Epoch 2/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 777ms/step - accuracy: 0.9099 - loss: 0.2259 - val_accuracy: 0.8698 - val_loss: 0.3006
Epoch 3/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 751ms/step - accuracy: 0.9303 - loss: 0.1851 - val_accuracy: 0.8648 - val_loss: 0.3341
Epoch 4/10
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 750ms/step - accuracy: 0.9404 - loss: 0.1543 - val_accuracy: 0.8625 - val_loss: 0.4012


## Evaluate

In [9]:
loss, acc = model.evaluate(x_test, y_test)
print(f"\nTest Accuracy: {acc*100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 58ms/step - accuracy: 0.8748 - loss: 0.2997

Test Accuracy: 87.48%
