Content of this file based on [Andrej Karpathy](https://www.youtube.com/watch?v=kCc8FmEb1nY)'s youtube video on autoregressive models

## Data acquisition and processing

In [21]:
# Download dataset to train on
!curl.exe --output shakespeare.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
import torch

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  3 1089k    3 39962    0     0   182k      0  0:00:05 --:--:--  0:00:05  183k
100 1089k  100 1089k    0     0  3354k      0 --:--:-- --:--:-- --:--:-- 3361k


In [22]:
# read it in to inspect it
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
# unique chars 
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [23]:
# Create in a mapping which will be used as a dummy tokenizer for character level encoding
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [24]:
# We can now encode the whole dataset and store it in a tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

# The tensor will be useful for random sampling during training and also for creating train and test split
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

torch.Size([1115394]) torch.int64


## Autoregressive training and generation

In [25]:
# This code block demonstrates what the model actually learns during training
#  - given a set of characters as input, the model will learn to predict the characters that come afterwards
#  => it learns to generate new text given a text input (hurray!)

# The block_size determines how many text units (characters in our case) are passed in one sample, e.g. 8 in this case
block_size = 8

# We can use one sample to actually generate n = block_size - 1 samples for training
# (This way the model learns to generate text given variable sized inputs)

def describe_sample(x,y):
    for t in range(block_size):
        context = x[:t+1]
        target = y[t]
        print(f"when input is {context} try to predict: {target}")
        
x = train_data[:block_size]
y = train_data[1:block_size+1]
describe_sample(x,y)


when input is tensor([18]) try to predict: 47
when input is tensor([18, 47]) try to predict: 56
when input is tensor([18, 47, 56]) try to predict: 57
when input is tensor([18, 47, 56, 57]) try to predict: 58
when input is tensor([18, 47, 56, 57, 58]) try to predict: 1
when input is tensor([18, 47, 56, 57, 58,  1]) try to predict: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) try to predict: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) try to predict: 58


In [26]:
# Mathematically formulated we are trying to maximize the likelihood of the observed sequences in the training data:
# P(x_1, x_2, ..., x_n) = P(x_1) * P(x_2|x_1) * P(x_3|x_1, x_2) * ... * P(x_n|x_1, ..., x_{n-1})

# For optimization purposes (log numerically more stable + TODO: come up with better explanation) we use the negative log-likelihod:
# -log(P(x_1,x_2,...,x_n)) = -\sum_i log(P(x_i|x_1, ..., x_{i-1}))

In [27]:
# During training we actually don't just want to pass a single sample, but train with multiple samples in parallel
# This will speed up training tremendously because we (when given) leverage the capabilities of gpu to process data in parallel
# In parallel does not mean that the samples interact with each other, it is just fast but in essence the same intuition as passing all samples iteratively

torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    idxs = torch.randint(len(data) - block_size, size=(batch_size,))
    x = torch.stack([data[i:i + block_size] for i in idxs])
    y = torch.stack([data[i+1:i+block_size+1] for i in idxs])
    return x,y  

In [28]:
x_b, y_b = get_batch('train)')

# We can check now every sample in the batch as before:
for i, (x_s, y_s) in enumerate(zip(x_b, y_b)):
    print(f"Sample number: {i}")
    describe_sample(x_s, y_s)

Sample number: 0
when input is tensor([6]) try to predict: 1
when input is tensor([6, 1]) try to predict: 52
when input is tensor([ 6,  1, 52]) try to predict: 53
when input is tensor([ 6,  1, 52, 53]) try to predict: 58
when input is tensor([ 6,  1, 52, 53, 58]) try to predict: 1
when input is tensor([ 6,  1, 52, 53, 58,  1]) try to predict: 58
when input is tensor([ 6,  1, 52, 53, 58,  1, 58]) try to predict: 47
when input is tensor([ 6,  1, 52, 53, 58,  1, 58, 47]) try to predict: 50
Sample number: 1
when input is tensor([6]) try to predict: 1
when input is tensor([6, 1]) try to predict: 54
when input is tensor([ 6,  1, 54]) try to predict: 50
when input is tensor([ 6,  1, 54, 50]) try to predict: 39
when input is tensor([ 6,  1, 54, 50, 39]) try to predict: 52
when input is tensor([ 6,  1, 54, 50, 39, 52]) try to predict: 58
when input is tensor([ 6,  1, 54, 50, 39, 52, 58]) try to predict: 43
when input is tensor([ 6,  1, 54, 50, 39, 52, 58, 43]) try to predict: 58
Sample number: 

In [29]:
# This is a simple bigram language model that demonstrates the core concepts of autoregressive models.
# A bigram model predicts the next token based only on the previous token, making it the simplest
# form of an autoregressive model.

import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
        # The embedding table learns a mapping from each token to a vector of scores.
        # For a bigram model, this vector represents the "logits" (raw scores) for
        # predicting the next token. Shape: (vocab_size, vocab_size)
        # Example: row i contains scores for what token should follow token i
        
    def forward(self, inputs, targets=None):
        # inputs: tensor of token indices, shape (B,T) where:
        #   B = batch size (number of sequences)
        #   T = sequence length (number of tokens per sequence)
        # targets: tensor of next-token indices, shape (B,T), shifted one position right
        #   For input sequence "hello", targets would be "ello<end>"
        logits = self.token_embedding_table(inputs) # (B,T,C)
        # For each position in each sequence, look up the corresponding row in the embedding table
        # This gives us scores for what token should follow each input token
        # C = vocab_size (number of possible next tokens)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # Reshape logits and targets to treat each position as an independent prediction
            logits = logits.view(B*T, C)    # shape: (B*T, C)
            targets = targets.view(B*T)      # shape: (B*T)
            # Cross entropy loss measures how well our predictions match the actual next tokens
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss
    def generate(self, inputs, max_new_tokens):
        # Generate new tokens one at a time, using only the last token to predict the next
        
        for _ in range(max_new_tokens):
            logits, _ = self(inputs)  # shape: (B,T,C)
            # For generation, we only need the predictions from the last position
            # since a bigram model only uses the previous token
            probs = F.softmax(logits[:, -1, :], dim=-1)  # shape: (B,C)
            
            # Sample from the probability distribution to get the next token
            inputs_next = torch.multinomial(probs, num_samples=1)  # shape: (B,1)
            # Append the new token to our sequence
            inputs = torch.cat((inputs, inputs_next), dim=1)  # shape: (B,T+1)
        return inputs

In [30]:
# If we now generate text with the untrained model we get glibberish
print(decode(m.generate(inputs=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


JOHulche; h co.
CouCKI:
LABe y, crd bo tarreror thindrariathitot.
Thathest
INOWe hin t 's ve het
LEL


In [31]:
# So let's train it and see how the output changes when starting from an empty sequence

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)  # We use a relatively large learning rate, because the model is fairly small
batch_size = 32

# Train
for steps in range(10000): # increase number of steps for good results...

    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4584102630615234


In [33]:
# It is still glibberish, but not that random, we can see fractions of real language! (yippieh)
print(decode(m.generate(inputs = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


MENG y e msbe shes, d th, h youre w ag mur ore irt
Ano and t wis, cl

Thof ty dsuran n: d athe hor
TUncall sprame I INIsatsooruraumendeleave? 'TINGisthe ing'de atioprd at b'ls lalllod ut
Orice wau e inor Lishiste,

LENTel s, S:
Au torsuth or urren tharit amigl macte'd ipr har ircr
PEREveay qust ty DWathende IN ndyond toust; canikill, UTO wickes we l f,

Yonovey ou Rod wey penmean thie bye hof ARKI:
Bu chey whimazes
The a m ssorersast CUCE:
Twilee nd:
Heewisongengay-'l hend, lfofor moue outy:

AR
