In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as f
torch.cuda.empty_cache()

In [2]:
# Read the whole training corpus into memory as one string.
# NOTE: the file handle is deliberately NOT named `f`: `f` is this notebook's alias for
# torch.nn.functional, and rebinding it here would break f.softmax / f.cross_entropy
# in the model cells on a fresh Restart & Run All.
with open('/content/combined_harry_potter_books.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
text[:500]

'M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amoun'

In [4]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
%pip install -q tiktoken



In [5]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
encoding = tokenizer.encode(text)

In [6]:
import torch
data = torch.tensor(encoding,dtype = torch.long)

In [7]:
data[:50]

tensor([   44,   374,    13,   290,  9074,    13,   360,  1834,  1636,    11,
          286,  1271,  1440,    11,  4389, 16809,  9974,    11,   547,  6613,
          284,   910,   326,   484,   547,  7138,  3487,    11,  5875,   345,
          845,   881,    13,  1119,   547,   262,   938,   661,   345,   447,
          247,    67,  1607,   284,   307,  2950,   287,  1997,  6283,   393])

In [8]:
# Hold out the final 10% of the token stream for validation; the first 90% is used for training.
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [9]:
# Pick the compute device that the train_data / val_data batches will be placed on:
# the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
# Apart from increasing the size of the key, query and value matrices (multi-head attention),
# we also add some optimization techniques: layer norm, dropout and residual connections.
# We increase the number of transformer blocks to 6 (as defined by n_layer) and add feed-forward neural networks.
# Keep the overall transformer architecture in mind while going through the cells below.

device(type='cuda')

In [18]:
# defining the hyperparameters again
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000 # total number of optimizer steps in the training loop
eval_interval = 500 # estimate the train/val loss every this many steps
learning_rate = 3e-4 # learning rate passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # rebinds `device` as a plain string
eval_iters = 200 # number of batches averaged per split in estimate_loss()
n_embd = 768 # embedding (channel) width of the model
n_head = 6 # attention heads per block; each head gets n_embd // n_head = 128 channels
n_layer = 6 # number of stacked transformer blocks
dropout = 0.2 # dropout probability for attention weights, the attention projection and the feed-forward output

In [19]:
# data loading
def get_batch(split):
    """Sample one random batch of (input, target) token windows.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each of shape (batch_size, block_size) and
        moved to ``device``. ``y`` is ``x`` shifted one token to the right, i.e.
        the next-token target for every position of ``x``.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [20]:
xb,yb = get_batch('train')
print('inputs',xb)
print('targets',yb)

inputs tensor([[  284,  1064,   503,  ...,  1392,  4697,   532],
        [  587,  1234,   510,  ...,   257,  3091,   286],
        [  290,  6507,    13,  ...,   464,  2356,   287],
        ...,
        [  447,   247,   260,  ..., 31933,   355,   339],
        [  550,   340,  2089,  ...,   329,  1692,  7947],
        [   30,   314,   447,  ..., 32244,   259,    11]], device='cuda:0')
targets tensor([[1064,  503,  644,  ..., 4697,  532, 6575],
        [1234,  510,   11,  ..., 3091,  286, 6701],
        [6507,   13, 6363,  ..., 2356,  287, 5850],
        ...,
        [ 247,  260, 1016,  ...,  355,  339, 5982],
        [ 340, 2089,   11,  ..., 1692, 7947,  341],
        [ 314,  447,  247,  ...,  259,   11, 2263]], device='cuda:0')


In [21]:

# loss evaluation
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Runs ``eval_iters`` random batches per split through ``model`` in eval mode
    (dropout disabled) and without gradient tracking.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    # Remember the current mode so it can be restored afterwards, instead of
    # unconditionally forcing the model back into training mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the mode even if evaluation is interrupted or raises.
        model.train(was_training)
    return out

In [24]:
# import torch
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width (n_embd) to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask; registered as a
        # buffer, so it is part of the module state but not a trainable parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape
        queries = self.query(x)  # (B, T, hs)
        keys = self.key(x)       # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # Scaled dot-product affinities: (B, T, hs) @ (B, hs, T) -> (B, T, T)
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale

        # Block attention to future positions, then normalise each row.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = self.dropout(f.softmax(scores, dim=-1))

        # Weighted aggregation of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return weights @ values

In [25]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, concatenated and projected back to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        attention_heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(attention_heads)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to x and merge the results along the channel axis."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(merged))

In [26]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, apply ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the MLP; the last dimension stays n_embd wide."""
        return self.net(x)


In [27]:
class Block(nn.Module):
    """Transformer block: pre-layer-norm self-attention and feed-forward, each with a residual connection."""

    def __init__(self, n_embd, n_head):
        """
        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads; each head gets n_embd // n_head channels.
        """
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Add the attention output and then the feed-forward output onto the residual stream."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

In [28]:
class GPTModel(nn.Module):
    """GPT-style language model: token + position embeddings, a stack of
    ``n_layer`` transformer blocks, a final layer norm and a linear head that
    produces next-token logits over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        # Learned embeddings for token identity and for position within the context window.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids; T must not exceed ``block_size``
                because positions are looked up in a table of that size.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to (B*T, vocab_size)
            and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the same device as the input rather than on
        # the notebook-global `device`, so the model does not depend on that global.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets, so flatten batch and time.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = f.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens and append them to ``idx``.

        Sampling runs in eval mode (dropout off) and without gradient tracking;
        the module's previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Crop the context to the last block_size tokens (position table limit).
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] # keep only the last time step: (B, C)
                probs = f.softmax(logits, dim=-1) # (B, C)
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx


In [29]:
# Vocabulary size is taken from the tokenizer; GPTModel reads it when it is constructed.
vocab_size = tokenizer.n_vocab
model = GPTModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` rather than `iter` so the builtin iter()
# is not shadowed in the notebook's global namespace.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

119.956561 M parameters
step 0: train loss 10.9829, val loss 10.9793
step 500: train loss 4.1381, val loss 4.4895
step 1000: train loss 3.5645, val loss 4.1473


KeyboardInterrupt: 

In [30]:
# Sample with dropout disabled and without building an autograd graph: after the
# (interrupted) training loop the model is still in training mode.
model.eval()
with torch.no_grad():
    # Start from a single token with id 0 as the initial context.
    idx = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated_text = tokenizer.decode(model.generate(idx, max_new_tokens=1000)[0].tolist())


In [31]:
generated_text

"! Harry pointed at once nervously in mild Defensive Magical Creatures job. Harry was that the lift full to breakfast beside Hedwig’s chair and it had been erected at the twins Filch and Harry gleaned instead.\n\nI know a moment from the horrible thing that would be.\n\n“I hope we could only tomorrow,” said Ron, looking cheerful.\n\n“What did Dumbledore come up and sprinted into the Houses were talking.”\n\nHarry, Ron and Hermione headed to the entrance hall at the portrait door to a drawling-to-light as he pulled it open down.\n\n“Oh yes,” Fred suggested. “I “You’ve been asleep.”\n\nHermione turned a short sensitive and silver tube. She stopped talking to Harry's table, which was the most depressing on Fred usually was standing hidden with the room so that Harry had just stood for it off to something very into the entrance to his path once, waving his arms like him.\n\nHarry stared with his eyes. Malfoy was all glad, pale as Lee Jordan inmoon armish in the black star-shouldered with t

#### Above are the results after training for only 1000 steps


In [None]:
0.783*60

46.980000000000004

#### So we stopped training early, at around 1000 steps, for two major reasons:
##### 1. Reaching 1000 steps took almost an hour, which implies that, even assuming linear scaling, training for 5000 steps would require about 5 hours (far too long!).
##### 2. The second and more important reason is that, by the 1000th step, the gap between the train and validation loss is already growing considerably.
##### 3. This means that if I let it run until 5000 steps, a lot of overfitting would occur, and instead of producing new text the model might simply regenerate the book text, which we definitely don't want.
##### 4. There are strategies such as temperature scaling and top-k sampling to introduce some randomness, which we'll implement later on.