In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as f
torch.cuda.empty_cache()

In [10]:
# Read the whole corpus into memory as a single string.
# The handle is deliberately NOT called `f`: `f` is this notebook's alias for
# torch.nn.functional, and rebinding it here would break every later
# f.softmax / f.cross_entropy call when the notebook is run top to bottom.
with open('/content/combined_harry_potter_books.txt','r',encoding= 'utf-8') as corpus_file:
  text = corpus_file.read()

In [11]:
# Preview the first 500 characters of the loaded corpus.
text[:500]

'M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amoun'

In [12]:
!pip install tiktoken



In [13]:
import tiktoken
# Load the GPT-2 encoding and turn the whole corpus into a sequence of integer token ids.
tokenizer = tiktoken.get_encoding('gpt2')
encoding = tokenizer.encode(text)

In [14]:
import torch
# Store the token ids as an int64 tensor so contiguous windows can be sliced out as batches.
data = torch.tensor(encoding,dtype = torch.long)

In [15]:
# Inspect the first 50 token ids.
data[:50]

tensor([   44,   374,    13,   290,  9074,    13,   360,  1834,  1636,    11,
          286,  1271,  1440,    11,  4389, 16809,  9974,    11,   547,  6613,
          284,   910,   326,   484,   547,  7138,  3487,    11,  5875,   345,
          845,   881,    13,  1119,   547,   262,   938,   661,   345,   447,
          247,    67,  1607,   284,   307,  2950,   287,  1997,  6283,   393])

In [16]:
# Chronological split: the first 90% of the tokens are used for training,
# the remaining 10% are held out for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [21]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
# Apart from increasing the size of the key, query and value matrices (multi-head attention),
# this version also adds optimization techniques such as layer norm, dropout and residual connections.
# It increases the number of transformer blocks to 6 (set by n_layer) and adds feed-forward neural networks,
# so keep the architecture of the transformer in mind while going through the following cells.

device(type='cuda')

In [22]:
# Hyperparameters for the model and the training run (read as globals by the cells that follow).
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps in the training loop
eval_interval = 500 # estimate the train/val loss every this many steps
learning_rate = 3e-4 # learning rate passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # device used for the batches and the model
eval_iters = 200 # batches averaged per split inside estimate_loss
n_embd = 384 # embedding dimension
n_head = 6 # attention heads per block (head size is n_embd // n_head)
n_layer = 6 # number of transformer blocks
dropout = 0.2 # dropout probability used by the nn.Dropout layers of the model

In [23]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors with shape (batch_size, block_size), moved to
        ``device``. ``y`` holds the same windows as ``x`` shifted one token ahead.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence, leaving room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [24]:
# Draw one training batch as a sanity check; the targets are the inputs shifted by one token.
xb,yb = get_batch('train')
print('inputs',xb)
print('targets',yb)

inputs tensor([[ 3353,   262, 26045,  ...,  2637,   198,   198],
        [  257, 17707,  9158,  ..., 14802,  1986,   284],
        [ 6504,   276,  1752,  ...,  4656,   717,    13],
        ...,
        [  251,   198,   198,  ...,   561,   307,   286],
        [ 9190,   597,   640,  ..., 39157,    11, 32379],
        [45648,   326,   345,  ...,  2368,   614,    11]], device='cuda:0')
targets tensor([[  262, 26045,   286,  ...,   198,   198, 18308],
        [17707,  9158,  4272,  ...,  1986,   284,   262],
        [  276,  1752,   517,  ...,   717,    13,   770],
        ...,
        [  198,   198,   447,  ...,   307,   286,   502],
        [  597,   640,   783,  ...,    11, 32379,  1350],
        [  326,   345,   547,  ...,   614,    11,   780]], device='cuda:0')


In [28]:

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and the losses returned by the global ``model`` are averaged. Gradients are
    disabled by the decorator and the model is evaluated in eval mode.

    Returns:
        dict: maps 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    # Remember the current mode so it can be restored even if evaluation is
    # interrupted or raises; otherwise an aborted evaluation would leave the
    # model in eval mode and later training would silently run without dropout.
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [29]:

class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding space into this head's subspace.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

        # Lower-triangular matrix of ones used as the causal mask; stored as a
        # buffer so it moves with the module but is not a trainable parameter.
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

    def forward(self, x):
        """Apply masked self-attention.

        Input is (batch, time-step, channels); output is (batch, time-step, head size).
        """
        _, seq_len, _ = x.shape
        queries = self.query(x)  # (B, T, hs)
        keys = self.key(x)       # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # Scaled dot-product scores: (B, T, hs) @ (B, hs, T) -> (B, T, T)
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale

        # Positions may not attend to later positions.
        future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float('-inf'))

        attention = self.dropout(f.softmax(scores, dim=-1))  # (B, T, T)
        # Weighted aggregation of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return attention @ values

In [30]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected back to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_list)
        # Maps the concatenated head outputs back to the embedding dimension.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, join the results on the last axis, then project and apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [31]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)


In [32]:
class Block(nn.Module):
    """Transformer block: pre-norm self-attention and feed-forward, each with a residual connection."""

    def __init__(self, n_embd, n_head):
        """
        Args:
            n_embd: embedding dimension.
            n_head: number of attention heads; each head gets n_embd // n_head channels.
        """
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Add the attention output, then the feed-forward output, to the residual stream."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [33]:
class GPTModel(nn.Module):
    """Decoder-only transformer language model.

    Token and position embeddings are summed, passed through ``n_layer``
    transformer blocks and a final layer norm, and projected to vocabulary
    logits by ``lm_head``. Sizes are read from the notebook-level
    hyperparameters (``vocab_size``, ``n_embd``, ``block_size``, ``n_head``,
    ``n_layer``) when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        # Lookup tables mapping token ids and positions to n_embd-sized vectors.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialise every Linear / Embedding submodule (see _init_weights).
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the same device as the input rather
        # than on a notebook-level global, so the model works wherever it
        # (and its input) currently lives.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = f.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs without gradient tracking and with the model in eval
        mode (dropout disabled); the previous train/eval mode is restored
        before returning.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        # The usable context is bounded by the model's own positional table.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]   # crop to the last context_len tokens
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]          # keep only the last time step -> (B, C)
                probs = f.softmax(logits, dim=-1)  # (B, C)
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx


In [35]:
# Build the model with the tokenizer's vocabulary size and move it to the selected device.
vocab_size = tokenizer.n_vocab
model = GPTModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `iteration` rather than `iter` so that the
# builtin iter() is not shadowed for the rest of the notebook.
for iteration in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iteration % eval_interval == 0 or iteration == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iteration}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear stale gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

49.386577 M parameters
step 0: train loss 10.8351, val loss 10.8354
step 500: train loss 4.0801, val loss 4.4487
step 1000: train loss 3.5630, val loss 4.1108
step 1500: train loss 3.2359, val loss 4.0209
step 2000: train loss 2.9743, val loss 3.9846
step 2500: train loss 2.7403, val loss 4.0191
step 3000: train loss 2.5341, val loss 4.0936
step 3500: train loss 2.3458, val loss 4.1492
step 4000: train loss 2.1774, val loss 4.2727
step 4500: train loss 2.0008, val loss 4.3763
step 4999: train loss 1.8624, val loss 4.4493


In [38]:
# Sample 1000 new tokens starting from a single token with id 0.
# The training loop leaves the model in train mode, so switch to eval mode
# (disables dropout) and turn off gradient tracking before sampling.
model.eval()
idx = torch.zeros((1,1),dtype = torch.long,device = device)
with torch.no_grad():
    generated_tokens = model.generate(idx,max_new_tokens = 1000)[0].tolist()
generated_text = tokenizer.decode(generated_tokens)


In [39]:
# Display the decoded sample.
generated_text

'! . but what happened?”\n\n“I asked whether you know about the Stone or not,” said Riddle. “Here. Riddle’s diary, Harry. And I’ve never found out what of Hogwarts— Wouldn’t I open it for the book. Riddle told the first one found that is safe to cross, and — if —”\n\n“And — will you speak to me?” Harry said Riddle.\n\n“For the Stone!”\n\n“Yes, is this very important, very closely,” said Riddle. “But, yes, nothing, there doesn’t do even if I have a Horcrux already, not better.”\n\n“So brilliant, toilet, and your soul could on Halloween. I think I knew that — Dumbledore had first entered the forest Vol—”\n\n“Then we ran into your office, you use that hundreds of witches and wizards would have slime came. He-Must-Not-Not-Be-Named, as if ever — Voldemort realized ...”\n\nAuntie hunting screams had never been involved in this — but the piece of scrapped fell.\n\n“Well, I was convinced you, Professor Sages are you and your dad, that Professor Dumbledore must have heard before you, how you kn

In [42]:
# Scratch calculation: 0.783 min * 60 -> seconds (presumably the fractional part of the measured runtime; confirm).
0.783*60

46.980000000000004

#### So finally, after a staggering runtime of 60 minutes and 46 seconds, we get the above results
##### Now there are a few things that we should learn from here
1. In the actual model the dimension of the input (embedding) vector is 768; here we have taken half of that, 384.
2. You can see that the validation loss reaches its minimum (3.9846) at step 2000 and increases from then on, while the training loss keeps falling. This definitely means that the model has started overfitting the dataset, hence we must reduce the number of steps and add some regularization to prevent that.
3. The output, though, is quite good and reads as if a human wrote it, except for some words in between. We can clean the text (render each `\n` as a new line and fix the stray quotation marks in some places) and get an altogether new ending to the series.
4. But we would get a new ending only if we don't overfit; otherwise we would just be regenerating the original text itself.