### GPT-2 Architecture

In [2]:
# GPT COnfiguration
GPT_CONFIG_124M ={
    "Vocab_size": 50527,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim":768,          # Embedding dimension
    "n_heads":12,           # Number of attention heads
    "n_layers":12,          # NUmber of layers
    "drop_rate":0.1,         # Dropout rate
    "qkv_bias":False        # Query-key-value bias
}

### GPT ARCHITECTURE : DUMMY GPT MODEL CLASS

In [8]:
import torch
import torch.nn as nn

# Dummy GPT Model
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["Vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Use a placeholder for Transformer block
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        # Use a placeholder for LayerNorm
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["Vocab_size"] , bias = False
        )
    
    # forward method
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


# Transformer Block
class DummyTransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        # simple placeholder

    # Forward Method
    def forward(self,x):
        # this block return input x .
        return x

# Dummy Layer Norm Block
class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps = 1e-5):
        super().__init__()
        # simple placeholder
    
    # Forward Method
    def forward(self, x):
        # this block return input x .
        return x

### Initalize Dummpy GPT Class

In [12]:
import tiktoken as tk

tokenizer = tk.get_encoding("gpt2")
batch = []
txt1 ="your every effort moves"
txt2 = "your day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch= torch.stack(batch, dim=0)

# print shape of batch
print(batch.shape) # (2, 20)

torch.Size([2, 4])


In [13]:
torch.manual_seed(123)

# define the model
model = DummyGPTModel(GPT_CONFIG_124M)

# print model structure
print(model)


DummyGPTModel(
  (tok_emb): Embedding(50527, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): DummyTransformerBlock()
    (1): DummyTransformerBlock()
    (2): DummyTransformerBlock()
    (3): DummyTransformerBlock()
    (4): DummyTransformerBlock()
    (5): DummyTransformerBlock()
    (6): DummyTransformerBlock()
    (7): DummyTransformerBlock()
    (8): DummyTransformerBlock()
    (9): DummyTransformerBlock()
    (10): DummyTransformerBlock()
    (11): DummyTransformerBlock()
  )
  (final_norm): DummyLayerNorm()
  (out_head): Linear(in_features=768, out_features=50527, bias=False)
)


In [14]:
logits = model(batch)
# print logits shape
print(logits.shape) # (2, 20, 50527)
# print logits
print(logits) # tensor of shape (2, 20, 50527)

torch.Size([2, 4, 50527])
tensor([[[-1.2256,  0.7756, -0.0776,  ..., -0.5314, -0.7036, -1.0176],
         [-0.3171, -0.7252, -0.3411,  ..., -0.3467, -1.6318, -0.5773],
         [ 0.5597,  0.0986,  0.4047,  ..., -0.0595, -0.8096, -1.8774],
         [ 1.5765, -0.0267,  0.5045,  ..., -0.0574, -1.3881, -1.0802]],

        [[-1.3013,  0.7430,  0.1117,  ..., -0.0726,  0.1091, -1.4815],
         [-0.4063, -0.6273,  0.9828,  ..., -0.6344,  0.0545, -0.0326],
         [-0.2660, -1.7609,  0.6074,  ..., -1.1383, -0.7710, -0.9709],
         [ 0.5479, -1.3437,  1.2603,  ...,  0.6285, -0.2088, -2.5608]]],
       grad_fn=<UnsafeViewBackward0>)
