# DAY 21 — Transformer From Scratch

## Overview

This notebook builds a Transformer Encoder completely from scratch and applies it to IMDB sentiment classification.

## Import Libraries

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models, datasets, preprocessing, callbacks
import numpy as np
tf.random.set_seed(42)

## Load and Prepare Dataset

In [2]:
vocab_size = 10000
max_len = 200

(x_train, y_train), (x_test, y_test) = datasets.imdb.load_data(num_words=vocab_size)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=max_len)

x_train.shape, x_test.shape

((25000, 200), (25000, 200))

## Define Positional Encoding

In [3]:
class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model):
        super().__init__()
        pos = tf.range(max_len, dtype=tf.float32)[:, tf.newaxis]
        i = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
        angles = pos / tf.pow(10000.0, (2*(i//2))/d_model)
        pe = tf.where(i % 2 == 0, tf.sin(angles), tf.cos(angles))
        self.pos_encoding = pe[tf.newaxis, ...]

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]


## Implement Transformer Encoder Block

In [4]:
def transformer_encoder(embed_dim, num_heads, ff_dim):
    inputs = layers.Input(shape=(None, embed_dim))
    
    attn_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    x = layers.LayerNormalization(epsilon=1e-6)(inputs + attn_out)

    ffn = layers.Dense(ff_dim, activation='relu')(x)
    ffn = layers.Dense(embed_dim)(ffn)
    outputs = layers.LayerNormalization(epsilon=1e-6)(x + ffn)

    return models.Model(inputs, outputs)


## Build and Train Model

In [5]:
embed_dim = 64
num_heads = 4
ff_dim = 128

inputs = layers.Input(shape=(max_len,))
x = layers.Embedding(vocab_size, embed_dim)(inputs)
x = PositionalEncoding(max_len, embed_dim)(x)

encoder = transformer_encoder(embed_dim, num_heads, ff_dim)
x = encoder(x)

x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs, outputs)
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cb = [callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)]

history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=8, batch_size=128,
    callbacks=cb
)





Epoch 1/8
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 729ms/step - accuracy: 0.6704 - loss: 0.5630 - val_accuracy: 0.8484 - val_loss: 0.3474
Epoch 2/8
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 816ms/step - accuracy: 0.8828 - loss: 0.2776 - val_accuracy: 0.8752 - val_loss: 0.2992
Epoch 3/8
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 1s/step - accuracy: 0.9144 - loss: 0.2146 - val_accuracy: 0.8680 - val_loss: 0.3213
Epoch 4/8
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 989ms/step - accuracy: 0.9275 - loss: 0.1866 - val_accuracy: 0.8613 - val_loss: 0.3769
Epoch 5/8
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 944ms/step - accuracy: 0.9372 - loss: 0.1596 - val_accuracy: 0.8518 - val_loss: 0.4291


## Evaluate

In [6]:
loss, acc = model.evaluate(x_test, y_test)
acc

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 65ms/step - accuracy: 0.8752 - loss: 0.2992


0.8751999735832214