In [2]:
# 🚀 Transformer from Scratch - TensorFlow Implementation
# Based on Vaswani et al.'s "Attention is All You Need"

# 📦 Imports
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# ## 🔢 Positional Encoding
def get_angles(pos, i, d_model):
    """Return the raw (pre-sin/cos) positional-encoding angles.

    Evaluates ``pos / 10000 ** (2 * (i // 2) / d_model)``. The floor division
    ``i // 2`` makes each consecutive even/odd pair of dimension indices share
    the same rate.

    Args:
        pos: Position index, or array of position indices.
        i: Dimension index, or array of dimension indices.
        d_model: Model width used to normalise the exponent.

    Returns:
        ``pos`` scaled by the per-dimension rate; array inputs are combined
        with ordinary NumPy broadcasting.
    """
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength


def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Angles come from ``get_angles``; even-indexed columns are then replaced by
    their sine and odd-indexed columns by their cosine.

    Args:
        position: Number of positions (rows) to encode.
        d_model: Encoding width (columns).

    Returns:
        A ``tf.float32`` tensor of shape ``(1, position, d_model)``.
    """
    pos_column = np.arange(position)[:, np.newaxis]   # shape (position, 1)
    dim_row = np.arange(d_model)[np.newaxis, :]       # shape (1, d_model)
    table = get_angles(pos_column, dim_row, d_model)

    # Overwrite the angle table in place: sine on even columns, cosine on odd.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Prepend a length-1 leading axis before converting to a float32 tensor.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [5]:
# Visualize Positional Encoding
def visualize_positional_encoding():
    """Show a heatmap of the positional encoding for 50 positions x 128 dims.

    Rows of the plotted grid are positions (y axis) and columns are encoding
    dimensions (x axis). The figure is displayed with ``plt.show()``; nothing
    is returned.
    """
    # Drop the leading length-1 axis so a 2-D grid is passed to pcolormesh.
    encoding_grid = positional_encoding(50, 128)[0]

    fig, ax = plt.subplots(figsize=(10, 6))
    mesh = ax.pcolormesh(encoding_grid, cmap='viridis')
    ax.set_xlabel('Depth')
    ax.set_ylabel('Position')
    ax.set_title('Positional Encoding')
    fig.colorbar(mesh, ax=ax)
    plt.show()

In [6]:
# ## 🎯 Scaled Dot-Product Attention
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: Query tensor; its last dimension must match the last dimension of
            ``k`` for the matrix product.
        k: Key tensor; ``d_k`` is the size of its last dimension.
        v: Value tensor; its second-to-last dimension must match that of ``k``.
        mask: Optional tensor broadcastable to the attention logits. It is
            multiplied by ``-1e9`` and added to the logits, so entries equal
            to 1 are driven to (near) zero weight by the softmax. Boolean or
            integer masks are accepted and cast to the logits' dtype.

    Returns:
        Tuple ``(output, attention_weights)``: the weighted sum of ``v`` and
        the softmax weights taken over the last (key) axis.
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    # Cast d_k to the logits' own dtype instead of a hard-coded float32:
    # TensorFlow does not promote dtypes in binary ops, so a float32 scale
    # would make the division fail for float64 / float16 inputs.
    dk = tf.cast(tf.shape(k)[-1], matmul_qk.dtype)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        # Same reasoning as above; this also lets bool/int masks be used.
        mask = tf.cast(mask, scaled_attention_logits.dtype)
        # NOTE(review): -1e9 is outside the float16 range (max ~65504), so
        # half-precision callers should verify masking behaviour - TODO confirm.
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)

    return output, attention_weights


In [7]:

# ## 🤹 Multi-Head Attention Layer
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are each projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features, attended
    per head, re-concatenated and passed through a final dense projection.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        """Create the projection layers.

        Args:
            d_model: Total feature width of the projections and of the output.
            num_heads: Number of attention heads; must divide ``d_model``.
            **kwargs: Standard ``tf.keras.layers.Layer`` keyword arguments
                (for example ``name`` or ``dtype``), forwarded to the base
                class.

        Raises:
            AssertionError: If ``d_model`` is not divisible by ``num_heads``.
        """
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Per-head feature width.
        self.depth = d_model // num_heads

        # Input projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied after the heads are concatenated.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).

        The result is transposed to ``(batch_size, num_heads, seq_len, depth)``.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Apply multi-head attention.

        Args:
            v: Value tensor.
            k: Key tensor.
            q: Query tensor; its first dimension is taken as the batch size.
            mask: Optional mask passed through unchanged to
                ``scaled_dot_product_attention``. Defaults to ``None``
                (no masking), matching that function's own default.

        Returns:
            Tensor of shape ``(batch_size, seq_len_q, d_model)``. The attention
            weights are computed but not returned.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, _ = scaled_dot_product_attention(q, k, v, mask)
        # Back to (batch_size, seq_len_q, num_heads, depth) so the heads can be
        # merged into one feature axis.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        return self.dense(concat_attention)

# Uncomment the line below to plot the positional-encoding heatmap (calls visualize_positional_encoding, defined in an earlier cell).
# visualize_positional_encoding()


In [11]:
# Smoke test: run one batch of 60 random 512-dim vectors through the layer
# and check that the output keeps the input's shape.
multih = MultiHeadAttention(d_model=512, num_heads=8)
# Three independent draws, assigned in q, k, v order.
dummy_q, dummy_k, dummy_v = (tf.random.uniform((1, 60, 512)) for _ in range(3))
# The layer's call signature is (v, k, q, mask); no mask is used here.
output = multih(dummy_v, dummy_k, dummy_q, None)
print("Output shape:", output.shape)  # Should be (1, 60, 512)

Output shape: (1, 60, 512)


In [15]:
# Alias the attention layer under the generic name `model` for further use.
model = multih
# A bare expression on the last line of a cell displays the layer's repr
# (its name and build state), not a Keras model summary.
model

<MultiHeadAttention name=multi_head_attention_3, built=True>