In [1]:
# read the whole training corpus into memory as a single string
with open('resources/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
# size of the corpus, counted in characters
print("length of text: ", len(text))

length of text:  1115394


In [None]:
# take a look at the first 1000 characters
print(text[:1000])

In [5]:
# every distinct character that occurs in the dataset, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

print("vocab size: ", vocab_size)
print(''.join(chars))

vocab size:  65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [6]:
# lookup tables between characters and integer ids (ids follow the order of `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Turn a string into a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(li):
    """Turn a list of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in li)


print(encode('hello'))
print(decode([46, 43, 50, 50, 53]))

[46, 43, 50, 50, 53]
hello


In [7]:
# sanity check: look up the id of the newline character and the character stored at id 0
stoi['\n'], itos[0]

(0, '\n')

In [8]:
# encode the whole text as a 1-D tensor of integer token ids
import torch

data = torch.tensor(encode(text), dtype=torch.long)

print(data.shape)
print(data[:100])

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# train/validation split: the first 90% of the tokens are for training, the rest for validation
split = int(0.9*len(data))
train_data = data[:split]
val_data = data[split:]

In [10]:
# the context length: number of characters the model will see at a time
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]  # same window shifted one position to the right

# a window of block_size+1 characters yields block_size (context, target) pairs
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"When input is {context} the target: {target}")

When input is tensor([18]) the target: 47
When input is tensor([18, 47]) the target: 56
When input is tensor([18, 47, 56]) the target: 57
When input is tensor([18, 47, 56, 57]) the target: 58
When input is tensor([18, 47, 56, 57, 58]) the target: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [11]:
torch.manual_seed(1337)  # make the random batch below reproducible
batch_size = 4  # number of sequences sampled per batch (read by get_batch)
block_size = 8  # length of each sampled sequence (read by get_batch)

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A tuple ``(x, y, ix)``. ``x`` and ``y`` are ``(batch_size, block_size)``
        tensors, where ``y`` is the window of ``x`` shifted one position to the
        right in the source data. ``ix`` holds the sampled start offsets and is
        returned for debugging purposes.
    """
    source = val_data if split != 'train' else train_data
    # batch_size random start offsets in [0, len(source) - block_size), so the
    # shifted target window never runs past the end of the data
    ix = torch.randint(high=len(source) - block_size, size=(batch_size,))
    inputs, targets = [], []
    for start in ix:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets), ix

# draw one training batch and show what it contains
xb, yb, ix = get_batch('train')

print(f"Random Indices (ix): {ix}\n")
print(f"Shapes: Input {xb.shape} Lable {yb.shape}\n")
print(f"Input: {xb}\n")
print(f"Label: {yb}", end='\n\n')

# spell out every (context, target) pair packed into one row of the batch
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target: {target}")
    break  # only the first row (b == 0) is printed

Random Indices (ix): tensor([ 71565, 224657, 932192, 557350])

Shapes: Input torch.Size([4, 8]) Lable torch.Size([4, 8])

Input: tensor([[ 1, 60, 39, 47, 50,  1, 63, 53],
        [46, 43, 39, 60, 43, 52,  1, 44],
        [ 1, 46, 43, 56, 43,  1, 63, 53],
        [61, 47, 50, 50,  1, 57, 39, 63]])

Label: tensor([[60, 39, 47, 50,  1, 63, 53, 59],
        [43, 39, 60, 43, 52,  1, 44, 53],
        [46, 43, 56, 43,  1, 63, 53, 59],
        [47, 50, 50,  1, 57, 39, 63,  0]])

When input is [1] the target: 60
When input is [1, 60] the target: 39
When input is [1, 60, 39] the target: 47
When input is [1, 60, 39, 47] the target: 50
When input is [1, 60, 39, 47, 50] the target: 1
When input is [1, 60, 39, 47, 50, 1] the target: 63
When input is [1, 60, 39, 47, 50, 1, 63] the target: 53
When input is [1, 60, 39, 47, 50, 1, 63, 53] the target: 59


In [10]:
print(xb) # our input to the transformer

tensor([[ 1, 60, 39, 47, 50,  1, 63, 53],
        [46, 43, 39, 60, 43, 52,  1, 44],
        [ 1, 46, 43, 56, 43,  1, 63, 53],
        [61, 47, 50, 50,  1, 57, 39, 63]])


In [11]:
yb # our target for the transformer

tensor([[60, 39, 47, 50,  1, 63, 53, 59],
        [43, 39, 60, 43, 52,  1, 44, 53],
        [46, 43, 56, 43,  1, 63, 53, 59],
        [47, 50, 50,  1, 57, 39, 63,  0]])

### A Simple Language Model
 - Only has an embedding layer

In [12]:
# starting with a bigram model
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table; row ``i``
    holds the unnormalised scores (logits) for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token id looks up a row of vocab_size logits
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape (B, T, C) and
            ``loss`` is None. With targets, ``logits`` is flattened to (B*T, C),
            the layout passed to ``F.cross_entropy``, and ``loss`` is a scalar tensor.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # reshape instead of view: view raises on non-contiguous tensors
        # (e.g. a transposed targets tensor), reshape copies only when needed
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients, so skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None and is ignored)
            logits, _ = self(idx)
            # focus on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # turn the logits into a probability distribution
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
# build the model and run one forward pass on the batch sampled earlier
m = BigramLanguageModel(vocab_size) # vocab_size = 65
logits, loss = m(xb, yb)
print(logits.shape)

# for reference: predicting all 65 characters with equal probability would give ln(65) ≈ 4.17
print(loss)

# sample 100 characters from the untrained model, starting from a single token with id 0
idx = torch.zeros((1,1), dtype=torch.long)
generated = decode(m.generate(idx, max_new_tokens=100)[0].tolist())
print(generated)

torch.Size([32, 65])
tensor(4.5629, grad_fn=<NllLossBackward0>)

l-QYjt'CL?jLDuQcLzy'RIo;'KdhpV
vLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq


In [13]:
# AdamW over all parameters of the bigram model `m`
optimizer = torch.optim.AdamW(m.parameters(), lr=0.001)

In [14]:
# NOTE: this rebinds the global that get_batch reads, so every batch from here on has 32 rows
batch_size  = 32

for steps in range(1000):
    # sample a batch of data
    xb, yb, _ = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear old gradients, backpropagate, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# printed once, after the loop: `steps` and `loss` hold the values of the last iteration only
print(f"Step: {steps}, Loss: {loss.item()}")

Step: 999, Loss: 3.667173385620117


In [15]:
# prompt the model with a sentence and sample 500 more characters
context = torch.tensor([encode("The meaning of life is")])  # (1, T): a batch holding one sequence
generated = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated)

The meaning of life ishLLkoexMyas:Iocindad.e-NNSqYPso&bFho&$;BQ$dZTMf'fKlf;DRPm'W,esPHyXAzCA$;GunqCEy&Oy;ZxjKVhmrdhxCAbTSp-Rue.HQNNbxF&kwst-s
OMckHNENUZEzlda$BPPtIhe!,epdheaRns:
AqW3E,DXU,NENT n b.u.xIYV&j'nnl;s,ngtoOm ixPLenrXElrPjIU-T'St3PJ
cra3bLGhT ALM-veAYkpr ,erPVhJQNV
P?WN3b?oYxpig;ENTy3q&j.mes; iZ,w..w&yEK
Ona$IyYWi.OU ay;,weP?AqV-XAPig;
OMGBI3Dor,EL.xy
OZ r!Nxx-shz!q pZrAQll'vPkntezN
BPy3motRMqFhoPpCbenYxubek,-Z:qddF
NIgmoP
TEhoXhn:B?ZEMv?nlnsFprzyAgwNodd  bGP EAlprPeQnqDGhdDUP.JIGNaVDwfIxx.c XhoCb&DivRS&fuF


### The mathematical trick in self-attention

In [12]:
# consider the following example: a small random batch to experiment with

torch.manual_seed(1337)

B,T,C = 4,8,2 # batch size, timesteps, channels
x = torch.randn(B,T,C) # random input
x.shape

torch.Size([4, 8, 2])

What we want is some communication across the timestep dimension, so that the nth token knows how it is related to all the previous tokens.

- example (using words for simplicity): 
    - timesteps = ["the", "meaning", "of", "life", "is"]
    - this will get vectorized to, let's say, [0, 1, 2, 3, 4]
    - we want each element from the vector to be something related to all the previous elements (and not the future elements because that would be cheating in case of predicting next token)

In [13]:
# simplest form of communication: replace the token at position t by the
# average of the tokens at positions 0..t (itself and everything before it)

# we want x[b, t] = mean_{i<=t} x[b, i]
torch.manual_seed(42)

xbow = torch.zeros((B,T,C)) # x_bag_of_words
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1, C): positions 0..t of sequence b
        xbow[b,t] = xprev.mean(dim=0)

In [14]:
# the same running average can be written as one matrix multiply; small example:
torch.manual_seed(42)

a = torch.ones(3,3).tril()             # lower triangular matrix of ones
a = a / a.sum(dim=1, keepdim=True)     # each row now sums to 1
b = torch.randint(0, 10, (3,2)).float()
c = a @ b                              # row i of c is the mean of rows 0..i of b

print("a", a)
print("b", b)
print("c", c)

a tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [15]:
# version 2: the same causal average, computed with a single matrix multiply

torch.manual_seed(42)

wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(dim=1, keepdim=True) # row t puts weight 1/(t+1) on positions 0..t
xbow2 = wei @ x # wei (T, T) is broadcast over the batch: (B, T, T) @ (B, T, C) -> (B, T, C)


In [None]:
xbow[0], xbow2[0] # pretty close

In [None]:
import torch.nn.functional as F

# version 3: using softmax
tril = torch.tril(torch.ones(T,T))
# start from all-zero scores and set every future position (upper triangle) to -inf
wei = torch.zeros((T,T)).masked_fill(tril==0, float('-inf'))
# softmax over each row: the zeros become equal probabilities, the -inf entries become 0
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

xbow[0], xbow3[0] # pretty close

Note how the future elements are not used in the calculation:

In [None]:
print(tril) # the mask: zeros (upper triangle) mark the future time steps that get hidden
print(wei) # the resulting weights: each row sums to 1 and is zero for future time steps

# finally, xbow3 is the weighted sum of all time steps
print(xbow3[0])

**SELF-ATTENTION** (from the 3Blue1Brown (3b1b) video on transformers)

![](./resources/self-attention.gif)

In [21]:
import torch.nn as nn

# version 4: self-attention
torch.manual_seed(42)

B,T,C = 4,8,32 # batch_size, timesteps, channels
x = torch.randn(B,T,C)  

# a single self-attention head
head_size = 16 # hyperparameter: output dim of the self-attention head
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)

wei = q @ k.transpose(1, 2) # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0, float('-inf')) # hide the future: scores above the diagonal become -inf
wei = F.softmax(wei, dim=-1) # each row becomes weights over positions <= t that sum to 1
v = value(x) # (B, T, head_size)
out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

out.shape

torch.Size([4, 8, 16])

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.

- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.

- Each example across the batch dimension is of course processed completely independently; examples never "talk" to each other.

- In an "encoder" attention block you would simply delete the single line that does the masking with `tril` (the `masked_fill` call), allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.

- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)

- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size) (i.e. divides it by sqrt(head_size)). This makes it so that when the inputs Q, K have unit variance, `wei` has unit variance too, so softmax stays diffuse and does not saturate too much. Illustration below

In [22]:
# Only difference between LayerNorm1d and Batchnorm is the dimension we normalize.
# In Batchnorm we normalize the columns of the input, while in LayerNorm1d we normalize the rows of the input.
# We also don't need to maintain training and evaluation buffers in LayerNorm1d, since we normalize the rows of the input.

class LayerNorm1d: # (used to be BatchNorm1d)
  """Minimal layer normalisation over the feature (last) axis.

  Every feature vector is shifted to zero mean and scaled to unit variance,
  then multiplied by ``gamma`` and offset by ``beta``. Works for ``(N, dim)``
  input as well as input with extra leading axes such as ``(B, T, dim)``.
  """

  def __init__(self, dim, eps=1e-5):
    self.eps = eps                # added to the variance so the division is never by zero
    self.gamma = torch.ones(dim)  # per-feature scale, starts as identity
    self.beta = torch.zeros(dim)  # per-feature shift, starts at zero

  def __call__(self, x):
    """Normalise ``x`` along its last axis; the result is also stored in ``self.out``."""
    # Reduce over the last axis rather than a hard-coded axis 1: identical for
    # 2-D (N, dim) input, and it keeps the statistics per feature vector when x
    # has extra leading axes. gamma/beta (shape (dim,)) broadcast along that same axis.
    xmean = x.mean(-1, keepdim=True)
    # NOTE: Tensor.var defaults to the unbiased estimator, whereas torch.nn.LayerNorm
    # is documented to use the biased one; kept as-is so existing outputs do not change.
    xvar = x.var(-1, keepdim=True)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the scale and shift tensors as a list."""
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [23]:
# column-wise (one feature across the whole batch) x is NOT normalized
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [25]:
# row-wise (all features of one example) x IS normalized
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))