Content of this file based on [Andrej Karpathy](https://www.youtube.com/watch?v=kCc8FmEb1nY)'s youtube video on autoregressive models

## Data acquisition and processing

In [1]:
# Download dataset to train on
!curl.exe --output shakespeare.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
import torch

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1089k  100 1089k    0     0  6128k      0 --:--:-- --:--:-- --:--:-- 6188k
  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
# read it in to inspect it
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
# unique chars 
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [3]:
# Create in a mapping which will be used as a dummy tokenizer for character level encoding
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [4]:
# We can now encode the whole dataset and store it in a tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

# The tensor will be useful for random sampling during training and also for creating train and test split
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

torch.Size([1115394]) torch.int64


## Autoregressive training and generation

In [5]:
# This code block demonstrates what the model actually learns during training
#  - given a set of characters as input, the model will learn to predict the characters that come afterwards
#  => it learns to generate new text given a text input (hurray!)

# The block_size determines how many text units (characters in our case) are passed in one sample, e.g. 8 in this case
block_size = 8

# We can use one sample to actually generate n = block_size - 1 samples for training
# (This way the model learns to generate text given variable sized inputs)

def describe_sample(x,y):
    for t in range(block_size):
        context = x[:t+1]
        target = y[t]
        print(f"when input is {context} try to predict: {target}")
        
x = train_data[:block_size]
y = train_data[1:block_size+1]
describe_sample(x,y)


when input is tensor([18]) try to predict: 47
when input is tensor([18, 47]) try to predict: 56
when input is tensor([18, 47, 56]) try to predict: 57
when input is tensor([18, 47, 56, 57]) try to predict: 58
when input is tensor([18, 47, 56, 57, 58]) try to predict: 1
when input is tensor([18, 47, 56, 57, 58,  1]) try to predict: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) try to predict: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) try to predict: 58


In [6]:
# Mathematically formulated we are trying to maximize the likelihood of the observed sequences in the training data:
# P(x_1, x_2, ..., x_n) = P(x_1) * P(x_2|x_1) * P(x_3|x_1, x_2) * ... * P(x_n|x_1, ..., x_{n-1})

# For optimization purposes (log numerically more stable + TODO: come up with better explanation) we use the negative log-likelihod:
# -log(P(x_1,x_2,...,x_n)) = -\sum_i log(P(x_i|x_1, ..., x_{i-1}))

In [7]:
# During training we actually don't just want to pass a single sample, but train with multiple samples in parallel
# This will speed up training tremendously because we (when given) leverage the capabilities of gpu to process data in parallel
# In parallel does not mean that the samples interact with each other, it is just fast but in essence the same intuition as passing all samples iteratively

torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    idxs = torch.randint(len(data) - block_size, size=(batch_size,))
    x = torch.stack([data[i:i + block_size] for i in idxs])
    y = torch.stack([data[i+1:i+block_size+1] for i in idxs])
    return x,y  

In [9]:
x_b, y_b = get_batch('train)')

# We can check now every sample in the batch as before:
for i, (x_s, y_s) in enumerate(zip(x_b, y_b)):
    print(f"Sample number: {i}")
    describe_sample(x_s, y_s)

Sample number: 0
when input is tensor([1]) try to predict: 47
when input is tensor([ 1, 47]) try to predict: 57
when input is tensor([ 1, 47, 57]) try to predict: 1
when input is tensor([ 1, 47, 57,  1]) try to predict: 58
when input is tensor([ 1, 47, 57,  1, 58]) try to predict: 46
when input is tensor([ 1, 47, 57,  1, 58, 46]) try to predict: 43
when input is tensor([ 1, 47, 57,  1, 58, 46, 43]) try to predict: 1
when input is tensor([ 1, 47, 57,  1, 58, 46, 43,  1]) try to predict: 44
Sample number: 1
when input is tensor([0]) try to predict: 32
when input is tensor([ 0, 32]) try to predict: 46
when input is tensor([ 0, 32, 46]) try to predict: 47
when input is tensor([ 0, 32, 46, 47]) try to predict: 57
when input is tensor([ 0, 32, 46, 47, 57]) try to predict: 1
when input is tensor([ 0, 32, 46, 47, 57,  1]) try to predict: 44
when input is tensor([ 0, 32, 46, 47, 57,  1, 44]) try to predict: 39
when input is tensor([ 0, 32, 46, 47, 57,  1, 44, 39]) try to predict: 60
Sample numb

In [107]:
# Mr. Karpathy built this simple autoregressive language model, which is completely sufficient to explain the point of this

import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
        # nn.Embedding is in essence just a minimal wrapper around a tensor in our case of size vocab_size x vocab_size
        # and when forwardinga number through the embedding we essentially return the ith vector of the embedding tensor
        
    def forward(self, inputs, targets=None):
        
        # inputs and targets are both tensors of shape (batch_size/B, num_characters/T)
        # Mr. Karpathy revered to num_characters also as the time dimension T, which is also a very intuitive way of thinking about this
        logits = self.token_embedding_table(inputs) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # stretches out the second dimension, required for input format of cross_entropy
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss
    
    def generate(self, inputs, max_new_tokens):
        
        for _ in range(max_new_tokens):
            logits, _ = self(inputs) # (B,T, C)
            # In the Bigram model we are only interested in the last time step 
            # (We perform our prediction of the next character only based on the character that is directly preceding, hence "bi"-gram model)
            # We apply softmax to transform logits into probabilities (so the values sum up to one)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            
            # Then sample from the probability distribution
            inputs_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to running sequence
            inputs = torch.cat((inputs,inputs_next), dim=1) # (B, T+1)
        return inputs

In [108]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(yb.shape)
print(logits.shape)
print(logits)
print(yb)

print(loss)

torch.Size([4, 8])
torch.Size([32, 65])
tensor([[-0.5201,  0.2831,  1.0847,  ..., -0.0198,  0.7959,  1.6014],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [-0.1679,  0.5602,  0.6467,  ...,  0.1522,  0.5109,  0.0990],
        ...,
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761]],
       grad_fn=<ViewBackward0>)
tensor([[43, 60, 43, 52,  1, 63, 43, 39],
        [43, 42,  8,  0, 25, 63,  1, 45],
        [42,  5, 57,  1, 57, 39, 49, 43],
        [57, 58, 63,  6,  1, 58, 46, 47]])
tensor(4.4150, grad_fn=<NllLossBackward0>)


In [109]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))



Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [110]:
# create PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [111]:
batch_size = 32
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5727508068084717


In [112]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scace, tridesar, wnl'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN t l-t E:
I hisgothers je are!-e!
QLYotouciullle'z
