In [None]:
'''


GPT-2 consists of multiple Transformer decoder blocks stacked together. The main components are:

(i) Input Embeddings
Token Embedding: Converts input tokens into dense vectors.
Positional Embedding: Adds positional information since transformers don't have inherent sequential understanding.
(ii) Transformer Blocks (Repeated N Times)
Each transformer block consists of:

Multi-Head Self-Attention (MHSA) – Computes attention scores between all tokens in a sequence.
Feedforward Network (FFN) – Applies two dense layers with activation functions.
Layer Normalization – Normalizes activations to prevent instability.
Dropout – Regularization to avoid overfitting.
(iii) Output Layer
A final dense layer projects outputs to vocabulary size for token prediction.
Softmax converts logits to probabilities.


GPT-2 Architecture Diagram

Input Token Sequence → [Token Embedding] + [Positional Embedding]  
    ↓  
[Transformer Block 1] → [Transformer Block 2] → ... → [Transformer Block N]  
    ↓  
Final Dense Layer → Output Probabilities (Next Token Prediction)
Each Transformer Block consists of:

[Input] → [Multi-Head Self Attention] → [Layer Normalization] → [Feed Forward NN] → [Layer Normalization] → [Output]



GPT-2 Model Hyperparameters (Different Variants)

Model	      Layers	Heads	Embedding Size	Parameters
GPT-2 Small	   12	    12	      768	          124M
GPT-2 Medium   24	    16	     1024	          355M
GPT-2 Large	   36	    20	     1280 	          774M
GPT-2 XL	   48	    25	     1600             1.5B'''



# Mask: invalid (future-token) attention scores are suppressed by adding a large negative value to their logits, so their softmax weight becomes ~0.


In [16]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras import Model

# MultiHead Self Attention 
class MultiHeadSelfAttention(Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected by a ``Dense(embed_dim)``
    layer, split into ``num_heads`` heads of size ``embed_dim // num_heads``,
    attended per head, concatenated back to ``embed_dim`` and passed through
    a final ``Dense(embed_dim)`` projection.

    Args:
        embed_dim: Width of every projection and of the returned tensor.
        num_heads: Number of attention heads; must divide ``embed_dim`` evenly.

    Raises:
        ValueError: If ``embed_dim`` is not a multiple of ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Without this check the floor division below silently truncates and
        # split_heads() reshapes to the wrong size: it either fails at run
        # time or, when the element count happens to divide, returns a tensor
        # whose sequence axis has the wrong length.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.att_head_size = embed_dim // num_heads

        self.wq = Dense(embed_dim)
        self.wk = Dense(embed_dim)
        self.wv = Dense(embed_dim)
        self.dense = Dense(embed_dim)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to ``(batch_size, num_heads, seq, att_head_size)``.

        The last axis of ``x`` is split into ``(num_heads, att_head_size)`` and
        the head axis is moved in front of the sequence axis.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.att_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask=None):
        """Apply attention and return a ``(batch, seq, embed_dim)`` tensor.

        Args:
            q, k, v: Query, key and value tensors; the leading axis of ``q`` is
                taken as the batch size.
            mask: Optional tensor broadcastable to the attention logits
                ``(batch, num_heads, seq_q, seq_k)``. Entries equal to 1 mark
                positions that must NOT be attended to; 0 leaves them visible.
        """
        batch_size = tf.shape(q)[0]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # Scale the dot products by sqrt(head size) before the softmax.
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            # Masked entries get a large negative logit, i.e. ~0 softmax weight.
            scaled_attention_logits += (mask * -1e9)

        attention_w = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_w, v)

        # Undo split_heads: (batch, heads, seq, head) -> (batch, seq, embed_dim).
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.embed_dim))

        return self.dense(concat_attention)


class FeedForwardNN(Layer):
    """Position-wise feed-forward sub-layer: Dense(dff, GELU) then Dense(embed_dim)."""

    def __init__(self, embed_dim, dff):
        """Create the expansion (``dff`` units) and projection (``embed_dim`` units) layers."""
        super().__init__()
        self.dense1 = Dense(dff, activation='gelu')
        self.dense2 = Dense(embed_dim)

    def call(self, x):
        """Expand ``x`` to ``dff`` features with GELU, then project back to ``embed_dim``."""
        hidden = self.dense1(x)
        projected = self.dense2(hidden)
        return projected


class Transformer(Layer):
    """One transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output passes through dropout, is added to that
    sub-layer's input (residual connection), and the sum is layer-normalised.

    Args:
        embed_dim: Feature width of the block's input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForwardNN(embed_dim, dff)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, mask=None, training=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor; used as query, key and value of the attention.
            mask: Optional attention mask forwarded to the attention layer
                (1 = blocked position, 0 = visible).
            training: Optional bool forwarded to both dropout layers so that
                dropout is active only in training mode. ``None`` lets Keras
                decide from the surrounding call.

        Returns:
            Tensor with the same shape as ``x``.
        """
        att_output = self.att(x, x, x, mask)
        # Pass the mode explicitly instead of relying on implicit propagation
        # from whichever outer layer happens to be calling this block.
        att_output = self.dropout1(att_output, training=training)
        out1 = self.norm1(x + att_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)

        return self.norm2(out1 + ffn_output)


class GPT2(Model):
    """Decoder-only transformer language model in the style of GPT-2.

    Token embeddings and learned positional embeddings are summed, passed
    through ``num_layers`` ``Transformer`` blocks under a causal mask,
    layer-normalised and projected to ``vocab_size`` logits. No softmax is
    applied to the output.

    Args:
        vocab_size: Number of token ids; also the width of the output logits.
        max_length: Number of rows in the positional embedding table, i.e. the
            longest sequence that has a position embedding.
        embed_dim: Embedding / hidden width.
        num_heads: Attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        num_layers: Number of stacked transformer blocks.
        dropout_rate: Dropout rate used inside every block.
    """

    def __init__(self, vocab_size, max_length, embed_dim=768, num_heads=12, dff=3072, num_layers=12, dropout_rate=0.1):
        super().__init__()
        self.token_emb = Embedding(vocab_size, embed_dim)
        self.pos_emb = Embedding(max_length, embed_dim)

        self.transformer_blocks = [Transformer(embed_dim, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        self.norm = LayerNormalization(epsilon=1e-6)
        self.out = Dense(vocab_size)

    def create_causal_mask(self, seq_len):
        """Return a ``(seq_len, seq_len)`` float mask for causal attention.

        Entry ``[i, j]`` is 1.0 when ``j > i`` (a future position, to be
        blocked) and 0.0 otherwise (the position itself and everything before
        it stay visible).
        """
        # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
        visible = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        # Invert it: the strict upper triangle becomes 1 (blocked), the rest 0.
        return 1 - visible

    def call(self, x, training=None):
        """Compute next-token logits.

        Args:
            x: Integer token ids, assumed shaped ``(batch, seq_len)``.
            training: Optional bool forwarded to the transformer blocks to
                select dropout behaviour.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        seq_len = tf.shape(x)[1]
        mask = self.create_causal_mask(seq_len)

        token_emb = self.token_emb(x)
        # Position ids are shaped (1, seq_len), so the embedding is
        # (1, seq_len, embed_dim) and broadcasts over the batch axis. A
        # (seq_len, 1) id tensor would instead line positions up with the
        # batch axis and give every token in a sequence the same position
        # vector.
        positions = tf.range(seq_len)[tf.newaxis, :]
        pos_emb = self.pos_emb(positions)
        x = token_emb + pos_emb

        for transformer in self.transformer_blocks:
            x = transformer(x, mask, training=training)

        x = self.norm(x)
        return self.out(x)


# Model Initialization
VOCAB_SIZE = 3027
MAX_LENGTH = 1027

# Symbolic input: one row of MAX_LENGTH int32 token ids per example.
inputs = tf.keras.layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32)

# Instantiate the network with its default hyperparameters and wire it to the input.
gpt2_network = GPT2(vocab_size=VOCAB_SIZE, max_length=MAX_LENGTH)
output = gpt2_network(inputs)
gpt2 = Model(inputs=inputs, outputs=output)

# Build for a single-example batch and print the layer/parameter summary.
gpt2.build(input_shape=(1, MAX_LENGTH))
gpt2.summary()
