In [1]:
import torch
import torch.nn.functional as f
import math
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [2]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, dim_model, num_heads, bias, model_dropout):
        super().__init__()
        self.dim_model = dim_model
        self.bias = bias
        self.num_heads = num_heads
        self.head_dim = dim_model // num_heads
        self.weights_query = torch.nn.Linear(self.dim_model, self.dim_model, bias=self.bias)
        self.weights_value = torch.nn.Linear(self.dim_model, self.dim_model, bias=self.bias)
        self.weights_key = torch.nn.Linear(self.dim_model, self.dim_model, bias=self.bias)
        self.dropout = torch.nn.Dropout(model_dropout)
    def forward(self,inputs):
        query = self.weights_query(inputs)
        key = self.weights_key(inputs)
        value = self.weights_value(inputs)
        query =  torch.reshape(query,(inputs.shape[0], inputs.shape[1], self.num_heads,self.head_dim))
        key =  torch.reshape(key,(inputs.shape[0], inputs.shape[1], self.num_heads,self.head_dim))
        value = torch.reshape(value,(inputs.shape[0], inputs.shape[1], self.num_heads,self.head_dim))
        query = query.transpose(1,2)
        key = key.transpose(1,2)
        value = value.transpose(1,2)
        multi_head_attention_scores = torch.matmul(query, key.transpose(2,3))
        mask = torch.triu(torch.ones(multi_head_attention_scores.shape),diagonal=1) ==1
        result =  multi_head_attention_scores.masked_fill(mask==True, -torch.inf)
        multi_head_attn_weights = torch.softmax(result / key.shape[-1]**0.5, dim=-1)
        dropout = torch.nn.Dropout(0.5)
        multi_head_attn_weights = dropout(multi_head_attn_weights)
        multi_head_context_vectors = torch.matmul(multi_head_attn_weights,value).transpose(1,2)
        multi_head_context_vectors = torch.reshape(multi_head_context_vectors,(inputs.shape[0],inputs.shape[1],self.dim_model))
        return multi_head_context_vectors

In [3]:
class layer_normalisation(torch.nn.Module):
    def __init__(self, dim_model):
        super().__init__()
        self.scale = torch.nn.Parameter(torch.ones(dim_model))
        self.shift = torch.nn.Parameter(torch.zeros(dim_model))
        self.eps = 1e-5
    def forward(self, inputs):
        mean = inputs.mean( dim =-1, keepdim=True)
        var = inputs.var( dim =-1, keepdim=True)
        normalized_inputs = (inputs - mean) / torch.sqrt(var + self.eps)
        normalized_inputs = self.scale * normalized_inputs + self.shift
        return normalized_inputs

In [4]:
class GeLU(torch.nn.Module):
    def __init__(self):
     super().__init__()

    def forward(self, inputs):
       return 0.5 * inputs * (1 + torch.tanh(torch.sqrt(torch.tensor(2 / math.pi)) * (inputs + 0.044715 * torch.pow(inputs, 3))))

In [5]:
class feed_forward(torch.nn.Module):
    def __init__(self,dim_model):
        super().__init__()
        self.layer = torch.nn.Sequential(torch.nn.Linear(dim_model,4*dim_model),GeLU(),
                                         torch.nn.Linear(4*dim_model,dim_model))
    def forward(self, inputs):
        return self.layer(inputs)

In [6]:
class Transformer(torch.nn.Module):
    def __init__(self, dim_model, num_heads, model_dropout):
        super().__init__()
        self.layer_norm = layer_normalisation(dim_model)
        self.layer_norm2 = layer_normalisation(dim_model)
        self.attention = MultiHeadAttention(dim_model, num_heads, False, model_dropout)
        self.feed_forward = feed_forward(dim_model)
        self.dropout = torch.nn.Dropout(model_dropout)
    def forward(self, inputs):
        shortcut_connection = inputs
        layer_norm_output = self.layer_norm(inputs)
        attention_output = self.attention(layer_norm_output)
        attention_output = self.dropout(attention_output)
        attention_output += shortcut_connection
        shortcut_connection = attention_output
        layer_norm_output2 = self.layer_norm2(attention_output)
        feed_forward_output = self.feed_forward(layer_norm_output2)
        feed_forward_output = self.dropout(feed_forward_output)
        feed_forward_output += shortcut_connection
        return feed_forward_output


In [7]:
class gpt2_architecture(torch.nn.Module):
    def __init__(self,config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config["vocab_size"],config["emb_dim"])
        self.positional_embedding = torch.nn.Embedding(config['context_length'],config["emb_dim"])
        self.dropout = torch.nn.Dropout(config['drop_rate'])
        self.transformer = torch.nn.Sequential(*[Transformer(config['emb_dim'],config['n_heads'],config['drop_rate']) for i in range(config['n_layers'])])
        self.final_layer_norm = layer_normalisation(config['emb_dim'])
        self.output_layer = torch.nn.Linear(config["emb_dim"],config["vocab_size"])
    
    def forward(self, inputs):
        x = self.token_embedding(inputs)
        x = x + self.positional_embedding(torch.arange(inputs.shape[-1]))
        x = self.dropout(x)
        x = self.transformer(x)
        x = self.final_layer_norm(x)
        x = self.output_layer(x)
        return x


In [8]:

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}
inputs = torch.LongTensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
torch.manual_seed(123)
model = gpt2_architecture(GPT_CONFIG_124M)
out = model(inputs)

In [9]:
out

tensor([[[-0.1810, -0.1195, -0.5631,  ...,  0.4638, -0.9327,  0.2023],
         [-0.9883, -0.2057, -1.1612,  ...,  1.3426, -0.9906, -0.5683],
         [-0.3225, -0.3010,  0.1254,  ...,  0.8342, -0.9038,  0.2887],
         [-0.8906,  0.0978,  0.0120,  ...,  0.3022, -0.4171,  0.8031]],

        [[-0.2862, -0.7426, -0.8761,  ...,  0.8589, -1.0763, -0.5628],
         [ 0.6452, -0.5032, -0.8414,  ...,  0.7574, -0.7960,  0.8602],
         [-0.3754,  0.8685, -0.2129,  ...,  0.3230, -0.8398,  0.6426],
         [ 0.4806,  0.5054,  0.1545,  ...,  1.0216, -1.2249,  1.7587]]],
       grad_fn=<ViewBackward0>)

Checking the need for softmax for generating the tokens to apply it to logit tensor

In [10]:
out.shape

torch.Size([2, 4, 50257])

In [11]:
torch.argmax(out[0][-1]).unsqueeze(0)

tensor([27383])

In [12]:
out[:,-1,:]
torch.argmax(out[:,-1,:],dim=1,keepdim=True)

tensor([[27383],
        [38152]])

In [13]:
torch.argmax(torch.softmax(out[:,-1,:],dim=1),dim=1,keepdim=True)

tensor([[27383],
        [38152]])

Concludes we dont need softmax as both give the same result with or without softmax probably softmax might be needed during the training as it can be helpful in giving out the probabilities of the index and other indexes proababilites in becoming the next token.

In [None]:
def generate_new_tokens(model, inputs, expected_context_length):
    model.eval()
    for i in range(expected_context_length):
        with torch.no_grad():
            out = model(inputs)
        indexes = torch.argmax(out[:,-1,:],keepdim=True,dim=1)
        inputs = torch.cat((inputs,indexes),dim=1)
    return inputs

In [15]:
inp = torch.LongTensor([[6109, 3626, 6100,  345]])
nu = generate_new_tokens(model,inp,10)
print(nu)
print(nu.shape)

tensor([[ 6109,  3626,  6100,   345, 40204, 45193, 22418, 16115,  3268, 35496,
          9099, 19482, 26846, 28670]])
torch.Size([1, 14])


In [16]:
import re
filename = 'the-verdict.txt'
with open(filename,'r') as f:
    data = f.read()
print("The total length of the filename is:", len(data))

The total length of the filename is: 20479


In [17]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"



token_ids = generate_new_tokens(
    model=model,
    inputs=text_to_token_ids(start_context, tokenizer),expected_context_length=10)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you Kendisal users Maidpart adaptive Elk reconcil ground throughout


Choosing train_test_split and manual split by understanding the difference

In [18]:
from sklearn.model_selection import train_test_split
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(data)
train_data, test_data = train_test_split(encoded_text,test_size=0.1,random_state=123,shuffle = False)


In [19]:
tokenizer.decode(train_data[:5])

'I HAD always thought'

In [20]:
s = encoded_text[:5]
# s.reverse()
tokenizer.decode(s)


'I HAD always thought'

In [21]:
len(train_data)

4630

In [22]:
train_ratio = 0.90
split_idx = int(train_ratio * len(encoded_text))
train_data1 = encoded_text[:split_idx]
test_data1 = encoded_text[split_idx:]
len(train_data1)

4630

In [23]:
tokenizer.decode(train_data1[:5])

'I HAD always thought'

So both gives same result if you want convienece or building pipelines we can go with test train split for simplicity we can go with manaual test split.

In [24]:
train_ratio = 0.90
split_idx = int(train_ratio * len(data))
train_data2 = data[:split_idx]
val_data2 = data[split_idx:]

In [25]:
t = tokenizer.encode(train_data2)
print(len(t))
print(tokenizer.decode(t[:5]))

4612
I HAD always thought
