# Week 5: Residual Connections and Layer Normalization

In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import pad_sequences 

# Dataset configuration.
max_features = 10000  # vocabulary size, passed to the loader as num_words
max_len = 250  # number of tokens kept per review after padding

# Load both splits first, then unpack features and labels from each.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

# Bring every review to max_len tokens; padding is appended at the end ('post').
pad_kwargs = dict(maxlen=max_len, padding='post')
x_train = pad_sequences(x_train, **pad_kwargs)
x_test = pad_sequences(x_test, **pad_kwargs)

2025-04-22 15:00:49.461893: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745323249.630467    1372 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745323249.674224    1372 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745323250.034232    1372 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745323250.034251    1372 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745323250.034252    1372 computation_placer.cc:177] computation placer alr

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


In [2]:
from tensorflow.keras.layers import Layer, Embedding
import tensorflow as tf

class TokenAndPositionEmbedding(Layer):
    """Embed token ids and add a learned embedding of each token's position.

    Args:
        seq_len: Size of the position table (``input_dim`` of the position
            ``Embedding``); inputs must not be longer than this.
        vocab_size: Size of the token table (``input_dim`` of the token
            ``Embedding``).
        emb_dim: Output width shared by both embeddings.
        **kwargs: Standard ``Layer`` keyword arguments such as ``name`` or
            ``dtype``, forwarded to the base class.
    """

    def __init__(self, seq_len, vocab_size, emb_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=emb_dim)
        self.pos_emb = Embedding(input_dim=seq_len, output_dim=emb_dim)

    def call(self, x_input):
        """Return token embeddings plus position embeddings.

        Args:
            x_input: Integer token ids whose last axis is the sequence axis.

        Returns:
            The element-wise sum of the token embeddings and the position
            embeddings for positions ``0 .. length - 1``.
        """
        # Use the runtime length of the last axis, not the configured seq_len.
        seq_len = tf.shape(x_input)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x_input = self.token_emb(x_input)
        # positions has no batch axis; the addition broadcasts it over x_input.
        return x_input + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "seq_len": self.seq_len,
                "vocab_size": self.vocab_size,
                "emb_dim": self.emb_dim,
            }
        )
        return config

## Residual connection and layer normalization
I added a residual connection that adds the input of the attention layer to its output, followed by layer normalization. This follows the structure shown in Figure 11.9 (Section 11.4.3) of Chollet’s *Deep Learning with Python*.

In [10]:
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D, Dropout, LayerNormalization, Add, MultiHeadAttention
from tensorflow.keras.models import Model 

# Hyperparameters of the attention block.
embed_dim = 32
num_heads = 2
key_dim = embed_dim // num_heads  # per-head size, so all heads together span embed_dim

inputs = Input(shape=(max_len,))

# Token + position embedding of the padded reviews.
embedding_layer = TokenAndPositionEmbedding(max_len, max_features, embed_dim)
embedded = embedding_layer(inputs)

# Self-attention: the embedded sequence is passed as both query and value.
self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
attention_output = self_attention(embedded, embedded)

# Residual connection around the attention layer, then layer normalization.
residual_sum = Add()([embedded, attention_output])
normalized = LayerNormalization()(residual_sum)

# Classification head: average over the sequence axis, dropout, sigmoid unit.
pooled = GlobalAveragePooling1D()(normalized)
regularized = Dropout(0.5)(pooled)
outputs = Dense(1, activation='sigmoid')(regularized)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [11]:
# Keep the History object returned by fit(): per-epoch loss/accuracy stay
# available through history.history, and its bare repr is no longer echoed as
# the cell's output.
history = model.fit(x_train, y_train, epochs=5, batch_size=32, verbose=0)

<keras.src.callbacks.history.History at 0x7fd3d86e08d0>

In [12]:
# The model is compiled with a single metric ('accuracy'), so evaluate()
# yields the loss followed by the accuracy.
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f'Test accuracy = {test_accuracy:.4f}')

782/782 ━━━━━━━━━━━━━━━━━━━━ 3s 3ms/step - accuracy: 0.8527 - loss: 0.4654
Test accuracy = 0.8502
