### GPT-2 Architecture

In [61]:
# GPT COnfiguration
GPT_CONFIG_124M ={
    "Vocab_size": 50527,    # Vocabulary size
    "context_length": 256, # Context length
    "emb_dim":768,          # Embedding dimension
    "n_heads":12,           # Number of attention heads
    "n_layers":12,          # NUmber of layers
    "drop_rate":0.1,         # Dropout rate
    "qkv_bias":False        # Query-key-value bias
}

### GPT ARCHITECTURE : DUMMY GPT MODEL CLASS

In [2]:
import torch
import torch.nn as nn

In [3]:
# Dummy GPT Model
class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["Vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Use a placeholder for Transformer block
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        # Use a placeholder for LayerNorm
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["Vocab_size"] , bias = False
        )
    
    # forward method
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [4]:
import tiktoken as tk

tokenizer = tk.get_encoding("gpt2")
batch = []
txt1 ="your every effort moves"
txt2 = "your day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch= torch.stack(batch, dim=0)

# print shape of batch
print(batch.shape) # (2, 20)

torch.Size([2, 4])


### Coding 124-M Parameter GPT-2

In [5]:
class LayerNorm(nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim= -1, keepdim = True)
        norm_x  = (x-mean) / torch.sqrt(var +self.eps)
        return self.scale * norm_x + self.shift

In [6]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        return 0.5 * x *(1+torch.tanh(
            torch.sqrt(torch.tensor(2.0/torch.pi)) *
            (x + 0.044715*torch.pow(x,3))
        ))

In [7]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4*cfg["emb_dim"]),
            GELU(),
            nn.Linear(4*cfg["emb_dim"], cfg["emb_dim"])
        )
    
    def forward(self, x):
        return self.layers(x)

In [8]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_in :int,d_out:int,num_of_heads:int,
                 context_length:int,dropout:float,qkv_bias:bool=False):
        super().__init__()
        self.d_out = d_out
        assert (d_out % num_of_heads == 0),\
              "d_out is not divisible by num_of_heads"
        self.head_dim = d_out // num_of_heads
        self.num_of_heads = num_of_heads
        self.w_query = nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_Key = nn.Linear(d_in,d_out,bias=qkv_bias) 
        self.w_value = nn.Linear(d_in,d_out,bias=qkv_bias) 
        self.projLayer = nn.Linear(d_out,d_out,bias=qkv_bias) 
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length,context_length),
                       diagonal=1)
        )   


    def forward(self,x):
        b, num_of_token , d_in = x.shape
        
        Query = self.w_query(x) 
        Key = self.w_Key(x) 
        Value = self.w_value(x) 

        #(b,num_of_token,d_in) ---> (b,num_of_token,num_of_heads,head_dim) 
        Query = Query.view(b,num_of_token,self.num_of_heads,self.head_dim)
        Key = Key.view(b,num_of_token,self.num_of_heads,self.head_dim)
        Value = Value.view(b,num_of_token,self.num_of_heads,self.head_dim)

        # Grouping By heads ( transpose 1 and 2 shape)
        #shape : (b,num_of_heads,num_of_token,head_dim) 
        Query = Query.transpose(1,2)
        Key = Key.transpose(1,2)
        Value = Value.transpose(1,2)

        # attaention score
        atten_score  = Query @ Key.transpose(2,3)

        # create mask
        mask_bool  = self.mask.bool()[:num_of_token,:num_of_token]

        #apply mask
        atten_score = atten_score.masked_fill(
                            mask_bool,
                            -torch.inf
                        )

        # apply softmax
        attn_weight = torch.softmax(
            atten_score / Key.shape[-1] ** 0.5 , dim = -1
        )

        #context vector
        context_vec  = (attn_weight @ Value).transpose(1,2)

        #Merge Head_dim (b,num_of_token,num_of_heads,head_dim) --> (b,num_of_token,d_out)
        context_vec = context_vec.contiguous().view(b,num_of_token,self.d_out)

        # apply projection layer
        context_vec = self.projLayer(context_vec)
        
        # apply dropout and return
        return self.dropout(context_vec)


In [9]:
class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.attention = MultiHeadAttention(d_in=cfg["emb_dim"],
                                            d_out=cfg["emb_dim"],
                                            num_of_heads=cfg["n_heads"],
                                            context_length=cfg["context_length"],
                                            dropout = cfg["drop_rate"],
                                            qkv_bias=cfg["qkv_bias"])
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])

        self.feedforward = FeedForward(cfg)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x 
        x = self.norm1(x)
        x = self.attention(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.feedforward(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x

In [10]:
# Dummy GPT Model
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["Vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Use a placeholder for Transformer block
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        # Use a placeholder for LayerNorm
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["Vocab_size"] , bias = False
        )
    
    # forward method
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [11]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

In [12]:
### Initalize Dummpy GPT Class
import tiktoken as tk

tokenizer = tk.get_encoding("gpt2")
batch = []
txt1 ="your every effort moves"
txt2 = "your day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))

batch= torch.stack(batch, dim=0)

In [13]:
out = model(batch)
print("Input Batch:\n",batch)
print("Input shape:\n",batch.shape)
print("Output shape:\n",out.shape)
print(out)

Input Batch:
 tensor([[14108,   790,  3626,  6100],
        [14108,  1110,  6622,   257]])
Input shape:
 torch.Size([2, 4])
Output shape:
 torch.Size([2, 4, 50527])
tensor([[[ 0.2335, -0.0799,  0.4891,  ...,  0.7267,  0.0185,  0.1434],
         [ 0.2745,  0.3663, -0.0066,  ..., -0.1622,  0.6408,  0.5155],
         [ 0.7870, -0.8251,  0.6020,  ..., -0.6042,  0.3742,  0.4305],
         [-0.2272, -0.4116,  0.2539,  ..., -0.8637,  0.1731, -0.0040]],

        [[ 0.2679, -0.2616,  0.3264,  ...,  0.5212,  0.0633,  0.7434],
         [ 0.2792, -0.1173,  0.4353,  ...,  0.7961,  0.0773,  1.0298],
         [ 0.4440,  0.1749, -0.3539,  ..., -0.4682, -0.6033,  0.1506],
         [-0.0408, -0.2424,  0.3245,  ..., -0.4561,  0.0220,  0.6345]]],
       grad_fn=<UnsafeViewBackward0>)


In [14]:
# Total parameter count
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameter is:{total_params:,}")

Total number of parameter is:163,415,040


In [15]:
print("Token Embedding layer shape:",model.tok_emb.weight.shape)
print("Output layer shape:",model.out_head.weight.shape)

Token Embedding layer shape: torch.Size([50527, 768])
Output layer shape: torch.Size([50527, 768])


In [16]:
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Total number of parameter in GPT 2 is:{total_params_gpt2:,}")

Total number of parameter in GPT 2 is:124,610,304


In [17]:
total_size_bytes = total_params * 4
total_size_mb = total_size_bytes / (1024 *1024)
print(f"total size of the model:{total_size_mb:.2f} MB")

total size of the model:623.38 MB


In [18]:
def generate_text_simple(model:GPTModel, idx:list[int], max_new_tokens:int,context_size:int):
    for _ in range(max_new_tokens):
        # get last token size of context size
        idx_cond = idx[:,-context_size:]

        # get output
        with torch.no_grad():
            logits = model(idx_cond)
        
        # get last raw from logits tensor
        logits = logits[:,-1,:]

        # apply softmax
        probs = torch.softmax(logits,dim=-1)

        ids_next = torch.argmax(probs, dim = -1, keepdim=True)

        idx = torch.cat((idx, ids_next), dim = 1)

    return idx

In [19]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encode:",encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape",encoded_tensor.shape)

encode: [15496, 11, 314, 716]
encoded_tensor.shape torch.Size([1, 4])


In [20]:
model.eval()
out = generate_text_simple(model = model,
                           idx = encoded_tensor,
                           max_new_tokens=6,
                           context_size=GPT_CONFIG_124M['context_length'])
print("Output:",out)
print("Output length:",len(out[0]))

Output: tensor([[15496,    11,   314,   716, 42009, 18283, 37256, 10358, 28640, 24274]])
Output length: 10


In [21]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist()) # type: ignore
print(decoded_text)

Hello, I amlectic trackedacons Should inflictInternational


In [22]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text , allowed_special={"<|endoftext|>"})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"

token_ids = generate_text_simple(
    model = model ,
    idx = text_to_token_ids(start_context,tokenizer),
    max_new_tokens= 10,
    context_size= GPT_CONFIG_124M['context_length']
)

print("Output text:\n", token_ids_to_text(token_ids=token_ids, tokenizer= tokenizer))

Output text:
 Every effort moves you anywhere sovere fat oct Swordsman Supportsycle Cron virtueism


In [23]:
inputs = torch.tensor([[16833,3626,6100],
                       [40,1107,588]])

targets = torch.tensor([[3626,6100,345],
                        [1107,588,11311]])

In [24]:
with torch.no_grad():
    logits = model(inputs)

probs = torch.softmax(logits, dim = -1)
print(probs.shape)

torch.Size([2, 3, 50527])


In [25]:
token_ids = torch.argmax(probs, dim = -1, keepdim=True)
token_ids

tensor([[[27728],
         [44487],
         [28687]],

        [[25797],
         [21714],
         [ 5485]]])

In [26]:
print(f"Traget Batch 1:{token_ids_to_text(targets[0],tokenizer)}")
print(f"Ouputs Batch 1:{token_ids_to_text(token_ids[0].flatten(),tokenizer)}")

Traget Batch 1: effort moves you
Ouputs Batch 1:298 Sidd 370


### Cross Entropy Loss

In [28]:
text_idx = 0

target_probabs_0 = probs[text_idx, [0,1,2], targets[text_idx]]
print("target 0 prob:\n",target_probabs_0)

text_idx = 1
target_probabs_1 = probs[text_idx, [0,1,2], targets[text_idx]]
print("target 1 prob:\n",target_probabs_1)

target 0 prob:
 tensor([1.0570e-05, 8.3353e-06, 2.2679e-05])
target 1 prob:
 tensor([9.4317e-06, 4.3987e-06, 6.1416e-06])


In [29]:
log_probs = torch.log(torch.cat((target_probabs_0,target_probabs_1)))
print(log_probs)

tensor([-11.4575, -11.6950, -10.6941, -11.5714, -12.3342, -12.0004])


In [30]:
avg_log_probs = torch.mean(log_probs)
print(avg_log_probs)

tensor(-11.6254)


In [31]:
neg_avg_log_probs = avg_log_probs * -1
print(neg_avg_log_probs)

tensor(11.6254)


In [32]:
logits_flat = logits.flatten(0,1)

targets_flat = targets.flatten(0,1)

print("Flatterned logits:",logits_flat.shape)

print("Flatterned targets:",targets_flat.shape)

Flatterned logits: torch.Size([6, 50527])
Flatterned targets: torch.Size([6])


In [33]:
torch.nn.functional.cross_entropy(logits_flat,targets_flat)

tensor(11.6254)

### Perplexity


#### Measure how well the probability distribution predicted by the model matches the actual distribution of words in dataset

#### More interetable way of understanding model uncertanity in pred next token

#### lower score ---> better score

#### preplexity  = torch.exp(corss entropy loss)

In [34]:
perplexity_loss = torch.exp(torch.nn.functional.cross_entropy(logits_flat,targets_flat))
print(perplexity_loss)

tensor(111909.0234)


### Dataset

In [38]:
with open(r'D:\Data_Science_Study\Course_Pracitse_Code\GenerativeAi\LLM_From_Scratch\Building-LLMs-from-Scratch\0. Data\the-verdict.txt') as f:
    data = f.read()

In [40]:
#last 100 char
print(data[-99:])

it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art."


In [42]:
total_characters = len(data)
total_tokens = len(tokenizer.encode(data))

print("Characters:",total_characters)
print("Tokens:",total_tokens)

Characters: 20479
Tokens: 5145


In [43]:
from torch.utils.data import Dataset,DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self,txt, tokenizer,max_length, stride):
        self.input_ids = []
        self.target_ids = []


        token_ids = tokenizer.encode(txt, allowed_special={"|<endoftext>|"})


        for i in range(0,len(token_ids)-max_length,stride):
            input_chunks = token_ids[i:i+max_length]
            target_chunks = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunks))
            self.target_ids.append(torch.tensor(target_chunks))

    
    def __len__(self):
        return len(self.input_ids)
    

    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx] # type: ignore

In [53]:
import tiktoken

In [55]:
def create_dataloader_v1(text,batch_size=4,max_length =256,
                        stride = 128, shuffle = True, drop_last = True,
                        num_workers = 0):


    tokenizer = tiktoken.get_encoding('gpt2')

    # create dataset
    dataset = GPTDatasetV1(text, tokenizer,max_length, stride)

    # create dataloader

    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )

    return dataloader

In [65]:
train_ratio = 0.90
split_idx = int(train_ratio * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

In [66]:
torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride = GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

In [67]:
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride = GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [68]:
if total_tokens *(train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokrns for the train loader"
          "Try with more data")
    
if total_tokens *(1- train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokrns for the train loader"
          "Try with more data")

In [69]:
print("Train Loader:")
for x , y in train_loader:
    print(x.shape, y.shape)

print("\nValidation Loader:")
for x , y in val_loader:
    print(x.shape, y.shape)


print(len(train_loader))

Train Loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation Loader:
torch.Size([2, 256]) torch.Size([2, 256])
9


In [76]:
def calc_loss_batch(input_batch,target_batch, model ,device):
    input_batch ,target_batch = input_batch.to(device),target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
    return loss

def calc_loss_loader(data_loader,model, device,num_batches =None):
    total_loss = 0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    

    for i , (input_batch, target_batch) in enumerate(data_loader):
        if i <num_batches:
            loss = calc_loss_batch(input_batch,target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    
    return total_loss/num_batches


In [72]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [73]:
model = model.to(device)

torch.manual_seed(123)

<torch._C.Generator at 0x21d9d15a210>

In [77]:
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader,model,device)
    val_loss = calc_loss_loader(val_loader,model, device)

print("training loss:", train_loss)
print("validation loss:", val_loss)

training loss: 10.966801431443956
validation loss: 10.966813087463379


### Layer Normalization

In [None]:
import torch
import torch.nn as nn
torch.manual_seed(123) # type: ignore

batch_example = torch.randn(2,5)

layer = nn.Sequential(nn.Linear(5,6),nn.ReLU())

out = layer(batch_example)

print(out.shape) # (2, 6)
print(out)

torch.Size([2, 6])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [None]:
mean = out.mean(dim= -1, keepdim=True)
var = out.var(dim= -1, keepdim=True)

print("Mean:\n",mean)
print("Variance:\n",var) 

Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [None]:
out_norm = (out - mean)/torch.sqrt(var)

mean_norm = out_norm.mean(dim= -1, keepdim=True)
var_norm = out_norm.var(dim= -1, keepdim=True)

print("Normalized Output:\n",out_norm)
print("Mean of Normalized Output:\n",mean_norm)
print("Variance of Normalized Output:\n",var_norm)

Normalized Output:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean of Normalized Output:
 tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
Variance of Normalized Output:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [None]:
torch.set_printoptions(sci_mode = False)
print("Mean of Normalized Output:\n",mean_norm)
print("Variance of Normalized Output:\n",var_norm)

Mean of Normalized Output:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance of Normalized Output:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [None]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    

    def forward(self, x):
        mean = x.mean(dim = -1 ,keepdim = True)
        var = x.var(dim = -1, keepdim = True, unbiased =False)
        norm_x = (x-mean)/ torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [None]:
ln = LayerNorm(emb_dim=5)

out_ln = ln(batch_example)

mean_ln = out_ln.mean(dim = -1,keepdim = True)
var_ln = out_ln.var(dim = -1,keepdim= True,unbiased =False)

print("out \n", out_ln)

print("Mean \n", mean_ln)

print("Variance \n", var_ln)

out 
 tensor([[ 0.5528,  1.0693, -0.0223,  0.2656, -1.8654],
        [ 0.9087, -1.3767, -0.9564,  1.1304,  0.2940]], grad_fn=<AddBackward0>)
Mean 
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance 
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
