In [2]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x10a8a8770>)

In [3]:
with open(file_path, "r") as f:
    text = f.read()

print(len(text))

20479


In [4]:
import re

In [5]:
preprocessed_text = re.split(r'([,.:;?_!"()\']|--|\s)',text)
preprocessed_text = [x.strip() for x in preprocessed_text if x.strip()]
print((preprocessed_text[:30]))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [6]:
all_words = sorted(set(preprocessed_text))
all_words.extend(["<|endoftext|>", "<|unk|>"])
print(len(all_words))

1132


In [7]:
vocabulary = {word : i for i, word in enumerate(all_words)}
vocabulary

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [8]:
class SimpleTokenizerV2:
    def __init__(self,vocabulary):
        self.str_to_int = vocabulary
        self.int_to_str = {i:word for word,i in vocabulary.items()}

    def encode(self,text):
        preprocessed_text = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed_text = [x.strip() for x in preprocessed_text if x.strip()]
        preprocessed_text = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed_text]
        ids = [self.str_to_int[item] for item in preprocessed_text]
        return ids
    
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
    


In [9]:
import tiktoken

In [10]:
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(tokenizer.decode(enc_text[:30]))

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear


In [11]:
context_size = 4
for i in range(1,context_size + 1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(context ," --->>" ,desired)


[40]  --->> 367
[40, 367]  --->> 2885
[40, 367, 2885]  --->> 1464
[40, 367, 2885, 1464]  --->> 1807


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

In [13]:
class GPTDatasetV1(Dataset):
    def __init__(self,text,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)

        for i in range(0,len(token_ids) - max_length,stride):
            input_chunks = token_ids[i:i+max_length]
            target_chunks = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunks))
            self.target_ids.append(torch.tensor(target_chunks))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self,idx):
        return self.input_ids[idx],self.target_ids[idx]

In [14]:
def create_dataset_loaderv1(txt,batch_size=4,max_length = 256,stride = 128,shuffle = True,drop_last = True,num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle = shuffle,
        num_workers = num_workers,
        drop_last=drop_last
    )

In [15]:
max_length = 5 
dataloader = create_dataset_loaderv1(
    text,
    batch_size=8,
    max_length=5,
    stride=max_length,
    shuffle=False,
    num_workers=0
)
data_iter = iter(dataloader)
x,y = next(data_iter)
len(dataloader) 


128

In [16]:
vocab_size = tokenizer.n_vocab
output_dim = 256
embeddings_layer = torch.nn.Embedding(vocab_size,output_dim)
token_embeddings = embeddings_layer(x)
token_embeddings.shape


torch.Size([8, 5, 256])

In [17]:
context_size = 5
poss_embeddings_layer = torch.nn.Embedding(context_size, output_dim)
poss_embedding = poss_embeddings_layer(torch.arange(context_size))
input_embeddings = token_embeddings + poss_embedding
input_embeddings[1]


tensor([[ 0.3333,  0.1342, -0.4348,  ..., -0.4947, -0.2358,  0.7622],
        [ 2.1306,  2.3844, -0.5400,  ..., -0.1953, -0.8761,  1.7609],
        [-1.8320,  0.9510, -2.8271,  ...,  0.6670, -1.6913, -1.5920],
        [ 1.4745, -0.0543, -1.3141,  ..., -0.2592,  1.4522,  0.7471],
        [ 0.9863, -1.7034,  0.8604,  ...,  0.6531,  0.1773,  0.6952]],
       grad_fn=<SelectBackward0>)

In [18]:
input_embeddings.shape[0]

8

In [19]:
#simple attention mechanism

inputs = torch.tensor(
    [
        [0.43,0.15,0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.33],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55]
    ]
)


In [20]:
attention_scores = inputs @ inputs.T
attention_scores

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [21]:
attention_weights = torch.nn.functional.softmax(attention_scores,dim=1)
attention_weights

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [22]:
attention_weights.sum(dim=-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [23]:
context = attention_weights @ inputs
context
    
tmp_attn = attention_weights[1]
con_temp = tmp_attn @ inputs
con_temp

tensor([0.4419, 0.6515, 0.5683])

In [24]:
torch.manual_seed(123)
d_in = inputs.shape[-1]
d_out = 2
W_query = torch.nn.Parameter(torch.randn(d_in,d_out),requires_grad=False)
W_key = torch.nn.Parameter(torch.randn(d_in,d_out),requires_grad=False)
W_value = torch.nn.Parameter(torch.randn(d_in,d_out),requires_grad=False)


In [25]:
x_2 = inputs[1]
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

In [26]:
keys = inputs @ W_key
queries = inputs @ W_query
values = inputs @ W_value


In [27]:
keys_2 = keys[1]
attention_scores_22 = query_2.dot(keys_2)
attention_scores_22 


tensor(0.1376)

In [28]:
attention_scores_2 = queries @ keys.T
attention_scores_2

tensor([[ 0.0740, -0.0216,  0.0126, -0.1230,  0.6250, -0.4498],
        [ 0.2172,  0.1376,  0.1730, -0.0491,  0.7616, -0.3809],
        [ 0.2098,  0.1320,  0.1665, -0.0489,  0.7408, -0.3725],
        [ 0.1458,  0.1061,  0.1254, -0.0118,  0.4384, -0.1919],
        [ 0.0175, -0.0071,  0.0017, -0.0321,  0.1580, -0.1153],
        [ 0.2240,  0.1642,  0.1935, -0.0161,  0.6667, -0.2888]])

In [29]:
attention_weights_2 = torch.nn.functional.softmax(attention_scores_2,dim=-1)
context_2 = attention_weights_2 @ values
context_2

tensor([[0.2833, 0.4180],
        [0.2845, 0.4193],
        [0.2846, 0.4183],
        [0.2861, 0.4026],
        [0.2861, 0.3930],
        [0.2855, 0.4126]])

In [30]:
import torch.nn as nn
class SelfAttention_v2(nn.Module):
    def __init__(self,din,dout,qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(din,dout,bias=qkv_bias)
        self.W_key = nn.Linear(din,dout,bias=qkv_bias)
        self.W_value = nn.Linear(din,dout,bias=qkv_bias)

    def forward(self,x):
        keys = self.W_key(x)
        queries =self.W_query(x)
        values = self.W_value(x)
        attention_scores = queries @ keys.T
        attention_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5,dim=-1)
        context_vec = attention_weights @ values
        return context_vec
            



In [31]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
sa_v2(inputs)

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)

In [32]:
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
attention_scores = queries @ keys.T
attention_weights = torch.softmax(attention_scores / inputs.shape[0] **0.5, dim=-1)
context_vec = attention_weights @ values
context_vec

tensor([[0.2846, 0.3766],
        [0.2839, 0.3713],
        [0.2839, 0.3715],
        [0.2853, 0.3782],
        [0.2856, 0.3797],
        [0.2847, 0.3755]], grad_fn=<MmBackward0>)

In [33]:
context_mask = torch.tril(torch.ones(attention_scores.shape[0], attention_scores.shape[0]))
context_mask

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [34]:
masked_context = context_mask * attention_weights
row_sum = masked_context.sum(dim=-1,keepdim=True)
masked_context = masked_context / row_sum
masked_context

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5299, 0.4701, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3599, 0.3199, 0.3202, 0.0000, 0.0000, 0.0000],
        [0.2647, 0.2478, 0.2479, 0.2395, 0.0000, 0.0000],
        [0.2100, 0.1991, 0.1991, 0.1935, 0.1983, 0.0000],
        [0.1818, 0.1666, 0.1667, 0.1595, 0.1667, 0.1587]],
       grad_fn=<DivBackward0>)

In [35]:
mask = torch.triu(torch.ones(6,6),diagonal=1)
masked = attention_scores.masked_fill(mask.bool(),-torch.inf)
masked

tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],
        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],
        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
       grad_fn=<MaskedFillBackward0>)

In [36]:
attention_weights = torch.softmax(masked/attention_scores.shape[-1],dim=-1)
attention_weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5122, 0.4878, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3441, 0.3279, 0.3280, 0.0000, 0.0000, 0.0000],
        [0.2560, 0.2491, 0.2492, 0.2457, 0.0000, 0.0000],
        [0.2040, 0.1996, 0.1997, 0.1973, 0.1993, 0.0000],
        [0.1727, 0.1667, 0.1667, 0.1637, 0.1667, 0.1634]],
       grad_fn=<SoftmaxBackward0>)

In [37]:
dropout = torch.nn.Dropout(p = 0.5 )
dropout(torch.ones(5,5))


tensor([[2., 2., 0., 0., 2.],
        [2., 0., 0., 2., 0.],
        [0., 2., 0., 0., 2.],
        [2., 0., 0., 2., 2.],
        [0., 2., 2., 2., 0.]])

In [38]:
class CasualSelfAttention(nn.Module):
    def __init__(self, d_in,d_out,context_len,dropout,qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in,d_out,qkv_bias)
        self.W_key = nn.Linear(d_in,d_out,qkv_bias)
        self.W_value = nn.Linear(d_in,d_out,qkv_bias)
        self.droupout = dropout
        self.register_buffer(
            'mask',
            torch.triu(torch.ones(context_len,context_len),diagonal=1)
        )

    def forward(self,x):
        b, num_tokens,d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        attention_scores = queries @ keys.transpose(1,2)
        attention_scores.masked_fill_(
            self.mask.bool()[:num_tokens,:num_tokens],
            -torch.inf
        )        
        attention_weights = torch.softmax(attention_scores/keys.shape[-1]**0.5,dim=-1)
        attention_weights = self.droupout(attention_weights)

        context_vec = attention_weights @ values
        return context_vec


In [39]:
batch = torch.stack((inputs,inputs),dim = 0)


In [40]:
torch.manual_seed(123)
context_len = batch.shape[1]
ca = CasualSelfAttention(d_in = 3, d_out=2,context_len=context_len,dropout=dropout)
context_vecs = ca(batch)
context_vecs[0]

tensor([[ 0.2392, -0.7133],
        [ 0.1156, -0.3447],
        [ 0.2784,  0.4348],
        [ 0.2665,  0.1505],
        [ 0.3112,  0.2781],
        [ 0.0917,  0.1090]], grad_fn=<SelectBackward0>)

In [41]:
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias = False):
        super.__init__()
        self.heads = nn.ModuleList(
            [CasualSelfAttention(d_in,d_out,context_len,dropout,qkv_bias) for _ in range(num_heads)]
        )

    def forward(self,x):
        return torch.cat([head(x) for head in self.heads()],dim=-1)

In [42]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_len, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_len, context_len), diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        attention_scores = queries @ keys.transpose(-2, -1)

        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attention_scores.masked_fill_(mask_bool, float('-inf'))

        attention_weights = torch.softmax(attention_scores / self.head_dim ** 0.5, dim=-1)
        attention_weights = self.dropout(attention_weights)

        context_vec = (attention_weights @ values).transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)

        return self.out_proj(context_vec)

In [43]:
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"context_length": 1024, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-Key-Value bias
}

In [44]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(1, emb_dim))  
        self.shift = nn.Parameter(torch.zeros(1, emb_dim))  

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, correction=0) 
        x_norm = (x - mean) / torch.sqrt(variance + self.eps)
        return self.scale * x_norm + self.shift

In [45]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi))* (x + 0.044715 * torch.pow(x,3))))


In [46]:
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"],cfg["emb_dim"])
        )
    
    def forward(self,x):
        return self.layers(x)

In [47]:
class TransformerBlock(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_len=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self,x):

        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x


In [48]:
torch.manual_seed(1234)
x = torch.rand(2,4,768)
tb = TransformerBlock(GPT_CONFIG_124M)
output = tb(x)
print(x.shape)
print(output.shape)

torch.Size([2, 4, 768])
torch.Size([2, 4, 768])


In [49]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len   = in_idx.shape
        tok_embs = self.tok_emb(in_idx)

        poss_embs = self.pos_emb(torch.arange(seq_len, device=in_idx.device))  

        x = tok_embs + poss_embs
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [50]:
batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
batch

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [51]:
torch.manual_seed(1234)
model = GPTModel(GPT_CONFIG_124M)
logits = model(batch)
logits.shape


torch.Size([2, 4, 50257])

In [52]:
torch.argmax(torch.softmax(logits[:,-1,:],dim=-1),dim=-1,keepdim=True)

tensor([[  105],
        [25930]])

In [53]:
def generate_text_simple(model, idx,
    max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        # idx_cond = idx[:, -context_size:]
        # with torch.no_grad():
        #     logits = model(idx_cond)
        # logits = logits[:, -1, :]
        # probas = torch.softmax(logits, dim=-1)
        # idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        # idx = torch.cat((idx, idx_next), dim=1)
        idx_cond = idx[:,-context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:,-1,:]
        probas = torch.softmax(logits,dim=-1)
        idx_next = torch.argmax(probas,dim=-1,keepdim=True)
        idx = torch.cat((idx,idx_next),dim=1)
    return idx


In [54]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [55]:
model.eval()
out = generate_text_simple(
model=model,
idx=encoded_tensor,
max_new_tokens=6,
context_size=GPT_CONFIG_124M["context_length"]
)
print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716,  2054, 13875, 45984, 37088, 33683,  9791]])
Output length: 10


In [56]:
decoded_text = tokenizer.decode(out[0].tolist())
decoded_text

'Hello, I am tre tangOnt requisite acquisitions beings'

<b style="color:green">TRAINING CHAPTER BEGIN</b>


In [57]:
GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 256,
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate": 0.1,
"qkv_bias": False
}
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [58]:
def text_to_token_ids(text,tokenizer):
    encoded = tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(ids,tokenizer):
    flat = ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

In [59]:
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = generate_text_simple(
model=model,
idx=text_to_token_ids(start_context, tokenizer),
max_new_tokens=10,
context_size=GPT_CONFIG_124M["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [60]:
token_ids.shape

torch.Size([1, 14])

In [61]:
inputs = torch.tensor([[16833, 3626, 6100], # ["every effort moves",
[40, 1107, 588]])

targets = torch.tensor([[3626, 6100, 345 ], # [" effort moves you",
[1107, 588, 11311]])

In [62]:
with torch.no_grad():
    logits = model(inputs)
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [63]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
decoded_text = tokenizer.decode(token_ids.flatten().tolist())
decoded_text

' Armed heNetflix pressuring empoweredfaith'

In [64]:
with open("the-verdict.txt",'r') as file:
    data = file.read()

print(len(tokenizer.encode(data)))

5145


In [65]:
train_ratio = 0.9
split_idx = int(train_ratio*len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]
len(train_data)
len(val_data)

2048

In [66]:
train_loader = create_dataset_loaderv1(
    txt=train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
)

val_loader = create_dataset_loaderv1(
    txt=val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)



In [67]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
    logits.flatten(0, 1), target_batch.flatten()
    )
    return loss

In [68]:
def calc_loss_loader(data_loader,model,device,num_batches= None):
    total_loss = 0
    if(len(data_loader) == 0):
        return float("nan")

    elif num_batches is None:
        num_batches = len(data_loader)

    else:
        num_batches = min(num_batches,len(data_loader))

    for i,(x,y) in enumerate(data_loader):
        if(i < num_batches):
            loss = calc_loss_batch(x,y,model=model,device=device)
            total_loss+=loss.item()
        else:
            break

    return total_loss / num_batches


In [69]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.987583690219456
Validation loss: 10.98110580444336


In [70]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(
        train_loader, model, device, num_batches=eval_iter
        )
        val_loss = calc_loss_loader(
        val_loader, model, device, num_batches=eval_iter
        )
    model.train()
    return train_loss, val_loss

In [71]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
    model=model, idx=encoded,
    max_new_tokens=50, context_size=context_size
    )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))
    model.train()

In [72]:
def train_model_simple(model, train_loader, val_loader,
optimizer, device, num_epochs,
eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1
    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(
            input_batch, target_batch, model, device
            )
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                f"Train loss {train_loss:.3f}, "
                f"Val loss {val_loss:.3f}"
                )
            generate_and_print_sample(
            model, tokenizer, device, start_context
            )
    return train_losses, val_losses, track_tokens_seen

In [74]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(
model.parameters(),
lr=0.0004, weight_decay=0.1
)
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
model, train_loader, val_loader, optimizer, device,
num_epochs=num_epochs, eval_freq=5, eval_iter=5,
start_context="Every effort moves you", tokenizer=tokenizer
)

Ep 1 (Step 000000): Train loss 9.781, Val loss 9.933
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,, the the,,,,,,,,,,,,,,,,,,,
Every effort moves you,.                                                
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,, the,,,,,,,,,,,,,,,,,,,,
Every effort moves you,,,, the the the the the the,,,, the the the,, the the,,,, the the the, the the the the the the the the the the the, the the the the the the,,,
Ep 1 (Step 000005): Train loss 8.111, Val loss 8.339
Every effort moves you, the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Every effort moves you, the, the the the the the the the the the the the the the the the the the the the the the                          
Every effort moves you,,,,,,,,,,,,.                                 

KeyboardInterrupt: 

In [75]:
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = generate_text_simple(
model=model,
idx=text_to_token_ids("It had always been his fate to have women", tokenizer),
max_new_tokens=25,
context_size=GPT_CONFIG_124M["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 It had always been his fate to have womened frame. Gisburn--as such--had not existed till nearly a year after Jack's resolve had been taken.


In [90]:
text = "Amaan is a good boy"
ids = torch.tensor(tokenizer.encode(text=text))

In [None]:
myemb = nn.Embedding(vocab_size,5,)


In [94]:
embs = myemb(ids)
embs

tensor([[-0.8858,  0.5565, -0.2721,  1.1316,  0.4890],
        [ 0.9762,  0.4758, -0.2119, -0.8834, -2.3604],
        [-1.1884,  0.1559,  1.0900, -0.0044,  0.4302],
        [ 1.8207,  0.3162,  0.6851, -0.0147,  0.1794],
        [-0.3821,  0.1976, -0.6816,  0.0609,  0.9262],
        [ 0.4095,  0.2131, -1.9820, -1.3616,  1.9372],
        [ 1.5138,  1.0505, -1.2629, -0.1244, -0.5657]],
       grad_fn=<EmbeddingBackward0>)