In [566]:
# when you restart a Jupyter Notebook, even if you see the outputs from the previous session, the variables, functions, and states in memory are lost. You need to re-run the cells to reload everything into memory.
import torch 
import torch.nn as nn 
from torch.nn import functional as F

In [568]:
# Read the contents of the file
# Save them to a var. named file_contents
with open('title.txt', 'r') as file:
    text = file.read()

# Get no. of characters you are dealing with plus it's count
chars = sorted(list(set(text))) # all content in sorted order
vocab_size = len(chars) # no. of characters in file
print(''.join(chars)) # join of these characters, unique ones
print(vocab_size)

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum content length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
#----



 !"#$%&'()+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz £°Éáãäåéëíïñóöøüāćń​–‘’“”…€﻿
111


In [570]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# This code offers a very basic form of tokeniser, wherein the seuqence of integers is long but the integers in the sequence are small, in other words the vocabulary is small

print(encode("hi there"))
print(decode(encode("hi there")))

[63, 64, 1, 75, 63, 60, 73, 60]
hi there


In [572]:
# let's now tokenise/encode our whole set of text
import torch # import PyTorch library
data = torch.tensor(encode(text), dtype=torch.long)

# data.shape returns tuple representing dimensions of tensor
'''
Tensor:
A fundamental data structure in ML.
A multi-dimensional array used to store data. It generalizes matrices to higher dimensions and can be thought of as a container for numerical data.
'''
print(data.shape, data.dtype)

print(data[:1000]) # the first 1000 characters, this is how the GPT will look our words

torch.Size([2186053]) torch.int64
tensor([50, 66, 73, 56, 64, 69, 60, 26,  1, 30, 69, 62, 73, 80,  1, 55, 60, 67,
        60, 69, 74, 66, 80,  1, 77, 70, 78, 74,  1, 75, 70,  1, 71, 76, 69, 64,
        74, 63,  1, 47, 76, 74, 74, 64, 56, 69,  1, 56, 75, 73, 70, 58, 64, 75,
        64, 60, 74,  0, 52, 56, 73,  1, 64, 69,  1, 50, 66, 73, 56, 64, 69, 60,
        26,  1, 49, 56, 66, 64, 69, 62,  1, 58, 70, 77, 60, 73,  1, 64, 69,  1,
        56,  1, 75, 70, 78, 69,  1, 76, 69, 59, 60, 73,  1, 56, 75, 75, 56, 58,
        66,  0, 50, 66, 73, 56, 64, 69, 60,  1, 78, 56, 73,  1,  8, 58, 56, 75,
        56, 74, 75, 73, 70, 71, 63, 64, 58,  1, 61, 70, 73,  1, 62, 67, 70, 57,
        56, 67,  1, 61, 70, 70, 59,  8,  0, 42, 56, 69, 58, 63, 60, 74, 75, 60,
        73,  1, 30, 73, 60, 69, 56,  1, 57, 70, 68, 57, 64, 69, 62, 26,  1, 48,
        56, 61, 61, 64, 60,  1, 47, 70, 76, 74, 74, 70, 74,  8, 74,  1, 71, 56,
        73, 60, 69, 75, 74,  1, 70, 69,  1, 63, 60, 56, 73, 64, 69, 62,  1, 75,
      

In [574]:
# This would be used to check at the end as how well our model is overfitting.
'''
Overfitting:
Overfitting is a common problem in machine learning and statistical modeling where a model learns not just the underlying patterns in the training data but also the noise or random fluctuations. This results in a model that performs very well on the training data but poorly on new, unseen data. 
'''

# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train data, rest would be validation
train_data = data[:n]
val_data = data[n:]

In [576]:
# We will train the transformer on chunks of dataset/text so that it's computationally inexpensive
# block size states the max length of our chunks
block_size = 8
train_data[:block_size+1]
# predictions are made on the basis of relative positions of these tokens

tensor([50, 66, 73, 56, 64, 69, 60, 26,  1])

In [578]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target: {target}")

When input is tensor([50]) the target: 66
When input is tensor([50, 66]) the target: 73
When input is tensor([50, 66, 73]) the target: 56
When input is tensor([50, 66, 73, 56]) the target: 64
When input is tensor([50, 66, 73, 56, 64]) the target: 69
When input is tensor([50, 66, 73, 56, 64, 69]) the target: 60
When input is tensor([50, 66, 73, 56, 64, 69, 60]) the target: 26
When input is tensor([50, 66, 73, 56, 64, 69, 60, 26]) the target: 1


In [580]:
# Using the below code you ensure that any random numbers generated by PyTorch are reproducible, which means when you run the code multiple times, you'll get the same random numbers each time.
# This is useful for debugging or comparing results.
# The specific value doesn't matter, it's just used to initialise the random number generator in a consistent way.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data  = train_data if split == 'train' else val_data
    # The below code generates batch_size (4) nos. in the range 0 to len(data)-block_size (exclusive)
    # These integers, stored in ix, are used as starting indexes to slice the data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # stack up the rows into a tensor
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in {'train', 'val'}:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")


inputs:
torch.Size([4, 8])
tensor([[70, 71, 71,  0, 31, 56, 64, 73],
        [73, 67, 59,  1, 48, 58, 70, 76],
        [57, 64, 59,  1, 61, 70, 73,  1],
        [ 1, 33, 56, 80,  1, 71, 56, 73]])
targets:
torch.Size([4, 8])
tensor([[71, 71,  0, 31, 56, 64, 73, 74],
        [67, 59,  1, 48, 58, 70, 76, 75],
        [64, 59,  1, 61, 70, 73,  1, 17],
        [33, 56, 80,  1, 71, 56, 73, 56]])
----
when input is [70] the target: 71
when input is [70, 71] the target: 71
when input is [70, 71, 71] the target: 0
when input is [70, 71, 71, 0] the target: 31
when input is [70, 71, 71, 0, 31] the target: 56
when input is [70, 71, 71, 0, 31, 56] the target: 64
when input is [70, 71, 71, 0, 31, 56, 64] the target: 73
when input is [70, 71, 71, 0, 31, 56, 64, 73] the target: 74
when input is [73] the target: 67
when input is [73, 67] the target: 59
when input is [73, 67, 59] the target: 1
when input is [73, 67, 59, 1] the target: 48
when input is [73, 67, 59, 1, 48] the target: 58
when input is [73

In [593]:
import torch.nn as nn
# below syntax is such because nn is a submodule of torch, and Python needs the full module path (torch.nn) to find the functional module correctly.
from torch.nn import functional as F
torch.manual_seed(1337)

class Head(nn.Module):
    '''one head of self-attention'''
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (B, T, C)
        q = self.query(x) # (B, T, C)
        # complete attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim = -1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out
        
class MultiHeadAttention(nn.Module):
    '''multiple heads of self-attention in parallel'''

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out 
        
class FeedForward(nn.Module):
    ''' a simple linear layer followed by a non-linearity '''

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
        
class Block(nn.Module):
    '''Transformer block: communication followed by computation'''

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
        
        
# A bigram langauge model is a type of statistical language model that predicts the probability of a word based on the preceding word. It assumes that the occurence of a word depends only on the previous word.
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        # nn.Embedding(vocab_size, vocab_size): vocab_size is the size of the vocabulary; each token is represented by a vector of size vocab_size, effectively creating a look up table of token embeddings.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        # self.blocks = nn.Sequential(
        #     Block(n_embd, n_head = 4),
        #     Block(n_embd, n_head = 4),
        #     Block(n_embd, n_head = 4),
        #     nn.LayerNorm(n_embd),
        # )
        # self.sa_heads = MultiHeadAttention(4, n_embd//4) # i.e. 4 heads of 8-dimensional self-attention
        # self.ffwd = FeedForward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    # docstrings to be placed at same indentation to avoid confusion
    '''
    Batch is the number of sequences in the batch.
    Time is the length of each sequence.
    Channels is the size of the embedding (equal to vocab_size).
    '''

    # In context of neural networks, the "forward pass" refers to the process of passing input data through the network to obtain predictions or outputs.
    def forward(self, idx, targets = None):
        B, T = idx.shape
        
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (Batch, Time, Channels) Batch -> 4 Time -> 8 Channels -> vocab_size
        pos_emb = self.position_embedding_table(torch.arange(T, device = device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        # x = self.sa_heads(x) # apply one head of self attention. (B, T, C)
        # x = self.ffwd(x) # (B, T, C)
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)
    
        # if no targets, there's no loss to compute.
        if targets is None:
            loss = None
        else:
            # reshaping our logits such that they align with the syntax of cross_entropy
            B, T, C = logits.shape 
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
        
            # quality of prediction wrt targets
            # It can be understood as a probability distribution where the correct dimension would be looking like a peak
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    # defines the method to generate new tokens based on the current sequence idx.
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax ro get probabilities 
            probs = F.softmax(logits, dim = -1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)
        return idx
model = BigramLanguageModel()
m = model.to(device)
# logits, loss = m(xb, yb)
# print(logits.shape)
# print(loss)


# print(decode(m.generate(idx = torch.zeros((1, 1), dtype = torch.long), max_new_tokens = 100)[0].tolist()))

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

torch.Size([32, 111])
tensor(4.7075, grad_fn=<NllLossBackward0>)

āāāDi…ä7#Yb00Z‘KJIx@J2åí9D8f5kI?WñE€u!la+L6ö”XoA8WS8ñlï79O6Pāle-”IáåZ#U…s&Wév…bE-…eéK2€Lñó0/féPDHåNä


In [594]:
# m = model.to(device)
# create a PyTorch optimiser
optimiser = torch.optim.AdamW(m.parameters(), lr = learning_rate)

In [597]:
# batch_size = 32 # This specifies that 32 samples will be processed in one training step called batch.
# for steps in range(50000): # This loop will run for 100 steps. Each step is one iteration of training using a batch of data.
#     # sample a batch of data
#     xb, yb = get_batch('train')
#     # evaluate the loss 
#     # logits are the raw output of the model before any activation function, representing the predicted probabilities for each class.
#     logits, loss = m(xb, yb)
#     optimiser.zero_grad(set_to_none = True)
#     loss.backward()
#     optimiser.step()

# print(loss.item())

for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimiser.zero_grad(set_to_none = True)
    loss.backward()
    optimiser.step()



step 0: train loss 4.8684, val loss 4.8683
step 500: train loss 2.7081, val loss 2.6958
step 1000: train loss 2.6286, val loss 2.6257
step 1500: train loss 2.5797, val loss 2.5802
step 2000: train loss 2.5213, val loss 2.5163
step 2500: train loss 2.4878, val loss 2.4834
step 3000: train loss 2.4929, val loss 2.4829
step 3500: train loss 2.4646, val loss 2.4595
step 4000: train loss 2.4378, val loss 2.4708
step 4500: train loss 2.4062, val loss 2.4111


In [None]:
# generate from the model
context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(m.generate(context, max_new_tokens = 500)[0].tolist()))

## The mathematical trick in self-attention

In [531]:
# consider the following toy example:

torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [532]:
# We want x[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        xbow[b, t] = torch.mean(xprev, 0)

In [533]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [534]:
# version 2
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim = True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ---> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [535]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [536]:
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [537]:
# version 4: self-attention!
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)

v = value(x)
out = wei @ v
# out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [538]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [539]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**(-0.5)

In [540]:
k.var()

tensor(1.0449)

In [541]:
q.var()

tensor(1.0700)

In [542]:
wei.var()

tensor(1.0918)

In [543]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim = -1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [544]:
# returns the lower triangular part of the given matrix
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [545]:
# we can be very efficient doing the above thing using matrix multiplication
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
# using the below syntax would get us the sum for every row in a as 1
a = a / torch.sum(a, 1, keepdim = True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a = ', a)
print('____')
print('b = ', b)
print('____')
print('c = ', c)
print('____')

a =  tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
____
b =  tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
____
c =  tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
____
