In [4]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x1064effe0>)

In [5]:
with open(file_path, "r") as f:
    text = f.read()

print(len(text))

20479


In [6]:
import re

In [7]:
preprocessed_text = re.split(r'([,.:;?_!"()\']|--|\s)',text)
preprocessed_text = [x.strip() for x in preprocessed_text if x.strip()]
print((preprocessed_text[:30]))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [8]:
all_words = sorted(set(preprocessed_text))
all_words.extend(["<|endoftext|>", "<|unk|>"])
print(len(all_words))

1132


In [9]:
vocabulary = {word : i for i, word in enumerate(all_words)}

In [10]:
class SimpleTokenizerV2:
    def __init__(self,vocabulary):
        self.str_to_int = vocabulary
        self.int_to_str = {i:word for word,i in vocabulary.items()}

    def encode(self,text):
        preprocessed_text = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed_text = [x.strip() for x in preprocessed_text if x.strip()]
        preprocessed_text = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed_text]
        ids = [self.str_to_int[item] for item in preprocessed_text]
        return ids
    
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
    


In [11]:
import tiktoken

In [12]:
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(tokenizer.decode(enc_text[:30]))

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear


In [13]:
context_size = 4
for i in range(1,context_size + 1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(context ," --->>" ,desired)


[40]  --->> 367
[40, 367]  --->> 2885
[40, 367, 2885]  --->> 1464
[40, 367, 2885, 1464]  --->> 1807


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

In [15]:
class GPTDatasetV1(Dataset):
    def __init__(self,text,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)

        for i in range(0,len(token_ids) - max_length,stride):
            input_chunks = token_ids[i:i+max_length]
            target_chunks = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunks))
            self.target_ids.append(torch.tensor(target_chunks))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self,idx):
        return self.input_ids[idx],self.target_ids[idx]

In [16]:
def create_dataset_loaderv1(txt,batch_size=4,max_length = 256,stride = 128,shuffle = True,drop_last = True,num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle = shuffle,
        num_workers = num_workers,
        drop_last=drop_last
    )

In [17]:
max_length = 5 
dataloader = create_dataset_loaderv1(
    text,
    batch_size=8,
    max_length=5,
    stride=max_length,
    shuffle=False,
    num_workers=0
)
data_iter = iter(dataloader)
x,y = next(data_iter)
len(dataloader) 


128

In [18]:
vocab_size = tokenizer.n_vocab
output_dim = 256
embeddings_layer = torch.nn.Embedding(vocab_size,output_dim)
token_embeddings = embeddings_layer(x)
token_embeddings.shape


torch.Size([8, 5, 256])

In [19]:
context_size = 5
poss_embeddings_layer = torch.nn.Embedding(context_size, output_dim)
poss_embedding = poss_embeddings_layer(torch.arange(context_size))
input_embeddings = token_embeddings + poss_embedding
input_embeddings[1]


tensor([[ 3.2992, -1.7415, -2.1483,  ...,  1.0238,  0.1985,  3.2135],
        [-1.1955, -0.8196,  0.2026,  ..., -1.5924,  2.4118, -1.2677],
        [-3.0836,  0.7119, -0.4596,  ...,  0.1939, -0.2283,  0.4772],
        [-0.0575, -0.2945, -0.1653,  ..., -0.4322, -0.4730, -0.7959],
        [ 1.4317,  2.9960, -1.6659,  ..., -0.5130,  0.3145,  0.0635]],
       grad_fn=<SelectBackward0>)

In [20]:
input_embeddings.shape[0]

8

In [21]:
#simple attention mechanism

inputs = torch.tensor(
    [
        [0.43,0.15,0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.33],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55]
    ]
)


In [22]:
attention_scores = inputs @ inputs.T
attention_scores

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [23]:
attention_weights = torch.nn.functional.softmax(attention_scores,dim=1)
attention_weights

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [24]:
attention_weights.sum(dim=-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [25]:
context = attention_weights @ inputs
context
    
tmp_attn = attention_weights[1]
con_temp = tmp_attn @ inputs
con_temp

tensor([0.4419, 0.6515, 0.5683])

In [26]:
torch.manual_seed(123)
d_in = inputs.shape[-1]
d_out = 2
W_query = torch.nn.Parameter(torch.randn(d_in,d_out),requires_grad=False)
W_key = torch.nn.Parameter(torch.randn(d_in,d_out),requires_grad=False)
W_value = torch.nn.Parameter(torch.randn(d_in,d_out),requires_grad=False)


In [27]:
x_2 = inputs[1]
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

In [28]:
keys = inputs @ W_key
queries = inputs @ W_query
values = inputs @ W_value


In [29]:
keys_2 = keys[1]
attention_scores_22 = query_2.dot(keys_2)
attention_scores_22 


tensor(0.1376)

In [30]:
attention_scores_2 = queries @ keys.T
attention_scores_2

tensor([[ 0.0740, -0.0216,  0.0126, -0.1230,  0.6250, -0.4498],
        [ 0.2172,  0.1376,  0.1730, -0.0491,  0.7616, -0.3809],
        [ 0.2098,  0.1320,  0.1665, -0.0489,  0.7408, -0.3725],
        [ 0.1458,  0.1061,  0.1254, -0.0118,  0.4384, -0.1919],
        [ 0.0175, -0.0071,  0.0017, -0.0321,  0.1580, -0.1153],
        [ 0.2240,  0.1642,  0.1935, -0.0161,  0.6667, -0.2888]])

In [31]:
attention_weights_2 = torch.nn.functional.softmax(attention_scores_2,dim=-1)
context_2 = attention_weights_2 @ values
context_2

tensor([[0.2833, 0.4180],
        [0.2845, 0.4193],
        [0.2846, 0.4183],
        [0.2861, 0.4026],
        [0.2861, 0.3930],
        [0.2855, 0.4126]])

In [32]:
import torch.nn as nn
class SelfAttention_v2(nn.Module):
    def __init__(self,din,dout,qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(din,dout,bias=qkv_bias)
        self.W_key = nn.Linear(din,dout,bias=qkv_bias)
        self.W_value = nn.Linear(din,dout,bias=qkv_bias)

    def forward(self,x):
        keys = self.W_key(x)
        queries =self.W_query(x)
        values = self.W_value(x)
        attention_scores = queries @ keys.T
        attention_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5,dim=-1)
        context_vec = attention_weights @ values
        return context_vec
            



In [33]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
sa_v2(inputs)

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)

In [34]:
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
attention_scores = queries @ keys.T
attention_weights = torch.softmax(attention_scores / inputs.shape[0] **0.5, dim=-1)
context_vec = attention_weights @ values
context_vec

tensor([[0.2846, 0.3766],
        [0.2839, 0.3713],
        [0.2839, 0.3715],
        [0.2853, 0.3782],
        [0.2856, 0.3797],
        [0.2847, 0.3755]], grad_fn=<MmBackward0>)

In [35]:
context_mask = torch.tril(torch.ones(attention_scores.shape[0], attention_scores.shape[0]))
context_mask

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [36]:
masked_context = context_mask * attention_weights
row_sum = masked_context.sum(dim=-1,keepdim=True)
masked_context = masked_context / row_sum
masked_context

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5299, 0.4701, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3599, 0.3199, 0.3202, 0.0000, 0.0000, 0.0000],
        [0.2647, 0.2478, 0.2479, 0.2395, 0.0000, 0.0000],
        [0.2100, 0.1991, 0.1991, 0.1935, 0.1983, 0.0000],
        [0.1818, 0.1666, 0.1667, 0.1595, 0.1667, 0.1587]],
       grad_fn=<DivBackward0>)

In [37]:
mask = torch.triu(torch.ones(6,6),diagonal=1)
masked = attention_scores.masked_fill(mask.bool(),-torch.inf)
masked

tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],
        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],
        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
       grad_fn=<MaskedFillBackward0>)

In [38]:
attention_weights = torch.softmax(masked/attention_scores.shape[-1],dim=-1)
attention_weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5122, 0.4878, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3441, 0.3279, 0.3280, 0.0000, 0.0000, 0.0000],
        [0.2560, 0.2491, 0.2492, 0.2457, 0.0000, 0.0000],
        [0.2040, 0.1996, 0.1997, 0.1973, 0.1993, 0.0000],
        [0.1727, 0.1667, 0.1667, 0.1637, 0.1667, 0.1634]],
       grad_fn=<SoftmaxBackward0>)

In [39]:
dropout = torch.nn.Dropout(p = 0.5 )
dropout(torch.ones(5,5))


tensor([[2., 2., 0., 0., 2.],
        [2., 0., 0., 2., 0.],
        [0., 2., 0., 0., 2.],
        [2., 0., 0., 2., 2.],
        [0., 2., 2., 2., 0.]])

In [40]:
class CasualSelfAttention(nn.Module):
    def __init__(self, d_in,d_out,context_len,dropout,qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in,d_out,qkv_bias)
        self.W_key = nn.Linear(d_in,d_out,qkv_bias)
        self.W_value = nn.Linear(d_in,d_out,qkv_bias)
        self.droupout = dropout
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_len,context_len),diagonal=1)
        )

    def forward(self,x):
        b, num_tokens,d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        attention_scores = queries @ keys.transpose(1,2)
        attention_scores.masked_fill_(
            self.mask.bool()[:num_tokens,:num_tokens],
            -torch.inf
        )        
        attention_weights = torch.softmax(attention_scores/keys.shape[-1]**0.5,dim=-1)
        attention_weights = self.droupout(attention_weights)

        context_vec = attention_weights @ values
        return context_vec


In [41]:
batch = torch.stack((inputs,inputs),dim = 0)


In [42]:
torch.manual_seed(123)
context_len = batch.shape[1]
ca = CasualSelfAttention(d_in = 3, d_out=2,context_len=context_len,dropout=dropout)
context_vecs = ca(batch)
context_vecs[0]

tensor([[ 0.2392, -0.7133],
        [ 0.1156, -0.3447],
        [ 0.2784,  0.4348],
        [ 0.2665,  0.1505],
        [ 0.3112,  0.2781],
        [ 0.0917,  0.1090]], grad_fn=<SelectBackward0>)

In [43]:
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias = False):
        super.__init__()
        self.heads = nn.ModuleList(
            [CasualSelfAttention(d_in,d_out,context_len,dropout,qkv_bias) for _ in range(num_heads)]
        )

    def forward(self,x):
        return torch.cat([head(x) for head in self.heads()],dim=-1)

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_in,d_out,context_len,dropout,num_heads,qkv_bias=False):
        super.__init__()
        assert (d_out % num_heads == 0) , "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in,d_out,qkv_bias)
        self.W_key = nn.Linear(d_in,d_out,qkv_bias)
        self.W_value = nn.Linear(d_in,d_out,qkv_bias)

        self.out_proj = nn.Linear(d_out,d_out)
        self.dropout = dropout

        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_len,context_len),diagonal=1)
        )

    def forward(self,x):
        b,num_tokens,d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b,num_tokens,self.num_heads,self.head_dim)
        values = values.view(b,num_tokens,self.num_heads,self.head_dim)
        queries = queries.view(b,num_tokens,self.num_heads,self.head_dim)

        keys= keys.transpose(1,2)
        values = values.transpose(1,2)
        queries = queries.transpose(1,2)

        attention_scores = keys @ queries.transpose(2,3)

        mask_bool = self.mask.bool()[:num_tokens,:num_tokens]

        attention_scores.masked_fill_(mask_bool,-torch.inf)

        attention_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5,dim=-1)

        attention_weights = self.dropout(attention_weights)

        context_vec = (attention_weights @ values).transpose(1,2)

        context_vec = context_vec.contiguous().view(b,num_tokens,self.d_out)

        context_vec = self.out_proj(context_vec)

        return context_vec


In [45]:
import torch  

x = torch.tensor([[1, 2, 3], [4, 5, 6]])
print("Original Tensor:")
print(x)

x_t = x.T
x_cont = x_t.contiguous()
x_cont

Original Tensor:
tensor([[1, 2, 3],
        [4, 5, 6]])


tensor([[1, 4],
        [2, 5],
        [3, 6]])

In [47]:
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"context_length": 1024, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-Key-Value bias
}

In [46]:
class LayerNorm(nn.Module):
    def __init__(self,emb_dim):
        super.__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        mean = x.mean(keepdim=True, dim=-1,unbiased=False)
        variance = x.var(keepdim=True,dim=-1, unbiased = False)
        x_norm = (x - mean) / torch.sqrt(variance + self.eps)
        return self.scale * x_norm + self.shift
    

In [61]:
class GELU(nn.Module):
    def __init__(self):
        super.__init__()
    
    def forward(self,x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi))* (x + 0.044715 * torch.pow(x,3))))


In [62]:
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"],cfg["emb_dim"])
        )
    
    def forward(self,x):
        return self.layers(x)

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super.__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_len=cfg["context_len"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self,x):

        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x
