<a href="https://colab.research.google.com/github/SantoshB-Github/Build_LLM/blob/main/LLM_Chapter3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Hyperparameters consumed by the model classes defined in this notebook.
GPT_CONFIG_124M = dict(
  vocab_size=50257,  # Vocabulary size (rows of the token-embedding table)
  ctx_len=1024,      # Context length (rows of the positional-embedding table)
  emb_dim=768,       # Embedding dimension
  n_heads=12,        # Number of attention heads
  n_layers=12,       # Number of layers (transformer blocks stacked in the model)
  drop_rate=0.1,     # Dropout rate
  qkv_bias=False,    # Query-Key-Value bias
)

In [36]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
  """Simplified GPT-like model skeleton built on PyTorch's ``nn.Module``.

  The model wires together token and positional embeddings, dropout, a stack
  of placeholder transformer blocks, a placeholder final normalization and a
  bias-free linear output head.

  Args:
    cfg: Mapping that provides "vocab_size", "ctx_len", "emb_dim",
      "drop_rate" and "n_layers".
  """

  def __init__(self, cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
    print(self.tok_emb)
    self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
    print(self.pos_emb)
    self.drop_emb = nn.Dropout(cfg["drop_rate"])

    # A series of placeholder transformer blocks (DummyTransformerBlock).
    self.trf_blocks = nn.Sequential(
        *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )
    # A placeholder for the final layer normalization (DummyLayerNorm).
    self.final_norm = DummyLayerNorm(cfg["emb_dim"])
    # Linear output layer projecting embeddings to vocabulary-sized logits.
    self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

  def forward(self, in_idx):
    """Compute logits for a batch of token indices.

    Token and positional embeddings are summed, passed through dropout, the
    transformer blocks and the final normalization, then projected by the
    output head.

    Args:
      in_idx: 2-D tensor of token indices, one row per sequence.

    Returns:
      Tensor of logits with one vocabulary-sized vector per input position.

    Raises:
      ValueError: If the sequence length exceeds the number of positions in
        the positional-embedding table.
    """
    # Only the sequence length is needed; unpacking still enforces 2-D input.
    _, seq_len = in_idx.shape

    # Fail early with a readable message instead of an out-of-range embedding
    # lookup further down.
    max_len = self.pos_emb.num_embeddings
    if seq_len > max_len:
      raise ValueError(
          f"sequence length {seq_len} exceeds the context length {max_len}"
      )

    tok_embeds = self.tok_emb(in_idx)
    # Positions 0..seq_len-1, created on the same device as the input.
    pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
    print ("tok_embeds, size\n", tok_embeds, tok_embeds.shape)
    print ("pos_embeds, size\n", pos_embeds, pos_embeds.shape)
    print ("in_idx\n", in_idx)
    print ("seq_len \n", seq_len)
    # pos_embeds has no batch dimension and is broadcast over it.
    x = tok_embeds + pos_embeds
    print("x = tok_embeds + pos_embeds, size\n", x, x.shape)
    x = self.drop_emb(x)
    x = self.trf_blocks(x)
    x = self.final_norm(x)
    logits = self.out_head(x)
    return logits

class DummyTransformerBlock(nn.Module):
  """Placeholder that will be replaced by a real TransformerBlock later."""

  def __init__(self, cfg):
    # cfg is accepted but not used by this placeholder.
    super().__init__()

  def forward(self, x):
    """Do nothing: hand the input back unchanged."""
    return x

class DummyLayerNorm(nn.Module):
  """Placeholder that will be replaced by a real LayerNorm later."""

  def __init__(self, normalized_shape, eps = 1e-5):
    # The parameters only mimic the LayerNorm interface; neither is used here.
    super().__init__()

  def forward (self, x):
    """Do nothing: hand the input back unchanged."""
    return x

In [5]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [37]:
import tiktoken
import torch

# Encode two example prompts with the GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# One 1-D tensor of token IDs per prompt.
batch = [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)]
print(batch)
# Stack the rows into a single 2-D tensor: one row per prompt.
batch = torch.stack(batch, dim=0)
print(batch)

# The resulting token IDs for the two texts are as follows
# (first row = first text, second row = second text):
# tensor([[6109, 3626, 6100,  345],
#         [6109, 1110, 6622,  257]])

# Next, build a DummyGPTModel from GPT_CONFIG_124M and feed it the tokenized batch.
# The seed makes the randomly initialized weights repeatable.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("output shape:", logits.shape)
print(logits)

[tensor([6109, 3626, 6100,  345]), tensor([6109, 1110, 6622,  257])]
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Embedding(50257, 768)
Embedding(1024, 768)
tok_embeds, size
 tensor([[[ 1.6146,  2.1622,  1.2770,  ...,  0.0111, -1.1353, -0.2450],
         [-1.6020, -1.3996,  0.4308,  ...,  0.0913, -0.0614, -0.0538],
         [ 0.7265,  1.0869, -0.2251,  ..., -0.0742,  0.1081,  0.9774],
         [-0.4047, -0.0914, -1.5747,  ..., -1.5130,  0.4232,  0.5841]],

        [[ 1.6146,  2.1622,  1.2770,  ...,  0.0111, -1.1353, -0.2450],
         [-0.7858,  0.0101,  1.2537,  ..., -0.2667, -0.7853,  1.3047],
         [ 1.8873,  0.7559, -0.1797,  ..., -0.0418, -0.4365, -0.3167],
         [ 1.2758, -0.2896,  0.9539,  ...,  0.3405, -1.0636, -0.0674]]],
       grad_fn=<EmbeddingBackward0>) torch.Size([2, 4, 768])
pos_embeds, size
 tensor([[ 0.8769,  0.2550,  0.8441,  ..., -1.0354,  1.3085,  1.7957],
        [-1.0029,  0.0995,  1.2459,  ...,  1.5453, -0.1126, -1.5197],
        [ 

In [43]:
class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with a learnable scale and shift.

  Args:
    emb_dim: Size of the last dimension of the inputs; also the length of the
      ``scale`` and ``shift`` parameter vectors.
    eps: Small constant added to the variance before the square root so the
      division stays finite when the variance is zero. Defaults to 1e-5.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    # Initialized to ones / zeros, so the layer starts as a plain normalization.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then apply scale and shift.

    Args:
      x: Tensor whose last dimension has size ``emb_dim``.

    Returns:
      Tensor with the same shape as ``x``.
    """
    mean = x.mean(dim=-1, keepdim=True)
    # unbiased=False divides by n rather than n - 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    norm_x = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift

In [48]:
# Print plain decimals rather than scientific notation, so means that are
# numerically ~0 display as 0.0000.
torch.set_printoptions(sci_mode=False)
# Seed the generator so the random example is identical on every run.
torch.manual_seed(123)

ln = LayerNorm(emb_dim=5)
batch_example = torch.rand(2, 5)
print("batch_example = \n", batch_example)
out_ln = ln (batch_example)
print("normalized batch_example = \n", out_ln)
# After normalization each row should have mean ~0 and variance ~1.
print("mean \n", out_ln.mean(dim = -1, keepdim=True))
print("var \n", out_ln.var(dim = -1, keepdim=True, unbiased=False))

batch_example = 
 tensor([[0.6978, 0.4475, 0.9203, 0.7511, 0.5432],
        [0.2719, 0.9347, 0.7308, 0.9974, 0.9793]])
normalized batch_example = 
 tensor([[ 0.1566, -1.3630,  1.5079,  0.4803, -0.7819],
        [-1.8743,  0.5572, -0.1909,  0.7872,  0.7209]], grad_fn=<AddBackward0>)
mean 
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
var 
 tensor([[0.9996],
        [0.9999]], grad_fn=<VarBackward0>)


In [60]:
class GELU(nn.Module):
  """GELU activation computed with the tanh-based formula."""

  def forward(self, x):
    """Apply 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))) element-wise."""
    coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
    cubic_term = x + 0.044715 * torch.pow(x, 3)
    gate = 1 + torch.tanh(coeff * cubic_term)
    return 0.5 * x * gate

In [None]:
#Next, to get an idea of what this GELU function looks like and how it compares to the ReLU function, let's plot these functions side by side:
import matplotlib.pyplot as plt
gelu, relu = GELU(), nn.ReLU()
x = torch.linspace(-3, 3, 100)  # 100 evenly spaced sample points in [-3, 3]
y_gelu, y_relu = gelu(x), relu(x)

# One panel per activation, drawn through explicit Axes objects instead of the
# implicit "current subplot" state.
fig, axes = plt.subplots(1, 2, figsize=(8, 3))
for ax, y, label in zip(axes, [y_gelu, y_relu], ["GELU", "ReLU"]):
  ax.plot(x, y)
  ax.set_title(f"{label} activation function")
  ax.set_xlabel("x")
  ax.set_ylabel(f"{label}(x)")
  ax.grid(True)
fig.tight_layout()
plt.show()



In [63]:
class FeedForward(nn.Module):
  """Feed-forward sub-network: Linear -> GELU -> Linear -> Dropout.

  The first linear layer expands the embedding dimension by a factor of 4 and
  the second projects back to the original size.

  Args:
    cfg: Mapping that provides "emb_dim" and "drop_rate".
  """

  def __init__(self, cfg):
    # nn.Module must be initialized before any submodule is assigned.
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
        GELU(),
        nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        nn.Dropout(cfg["drop_rate"]),
    )

  def forward(self, x):
    """Run ``x`` through the layer stack; the last dimension keeps its size."""
    return self.layers(x)

In [2]:
from LLM_Chapter3 import MultiHeadAttention

ModuleNotFoundError: No module named 'LLM_Chapter3'