In [15]:
# https://stackabuse.com/creating-a-neural-network-from-scratch-in-python/
import numpy as np


def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_der(x):
    return sigmoid(x)*(1-sigmoid(x))

feature_set = np.array([[0,1,0], [0,0,1], [1,0,0], [1,1,0], [1,1,1]])
labels = np.array([[1,0,0,1,1]])
labels = labels.reshape(5,1)

np.random.seed(42)
weights = np.random.rand(3,1)
bias = np.random.rand(1)
lr = 0.09

# forward & backward
for epoch in range(500):
    x = feature_set

    # feedforward
    z = sigmoid(np.dot(x, weights) + bias)

    # backpropagation step 1
    error = z - labels
    if epoch % 50 == 0: print(error.sum())
    # backpropagation step 2
    dpred_dz = sigmoid_der(z)

    z_delta = error * dpred_dz
    x = feature_set.T
    weights -= lr * np.dot(x, z_delta)

    for num in z_delta:
        bias -= lr * num


single_point = np.array([1,0,0])
result = sigmoid(np.dot(single_point, weights) + bias)
print(result)

single_point = np.array([0,1,0])
result = sigmoid(np.dot(single_point, weights) + bias)
print(result)

1.1484765089981492
0.3826105602496096
0.17185738520077565
0.11189389859934662
0.08484081908566549
0.06873187737629899
0.05780303777302742
0.049851715200578295
0.04380145114126244
0.03904595454490739
[0.14480521]
[0.95041035]


## Andrej Karpathy: Let's build GPT: from scratch, in code, spelled out

followed the code from the vid with added comments and explanations

In [5]:
# https://www.youtube.com/watch?v=kCc8FmEb1nY
# https://colab.research.google.com/drive/1JMLa53HDuA-i7ZBmqV7ZnA3c_fvtXnx-?usp=sharing#scrollTo=M5CvobiQ0pLr

text = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book."
chars = sorted(list(set(text)))

stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[e] for e in l])

print(encode('hi there'))
print(decode(encode('hi there')))

# tiktoken for subword encoding
# tokens are sometimes referred to as nodes


[16, 17, 0, 26, 16, 13, 24, 13]
hi there


In [6]:
import torch
torch.cuda.is_available()

True

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input_sp0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]  # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l])  # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)

print(data.shape, data.dtype)
print(data[:200])

n = int(0.9*len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59])


In [3]:
# we just train the transformer with chunks at a time (blocksize, context length,...)
block_size = 8  # multiple examples packed into this (7 here) as transformer trains on each position
train_data[:block_size+1]  # chunks are used not only for computational efficiency but also to make the transformer see context over various lengths

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [4]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):  # time dimension
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [9]:
# batch dimension: batches are for efficiency to use parallel GPU architecture
torch.manual_seed(1337)
batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))  # random offsets into the training set
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y  # create batch_size rows of block_size data

# sample a batch of data
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
print('---------')

for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context = xb[b :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target is {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
---------
when input is [[24, 43, 58, 5, 57, 1, 46, 43]] the target is 43
when input is [[24, 43, 58, 5, 57, 1, 46, 43], [44, 53, 56, 1, 58, 46, 39, 58]] the target is 58
when input is [[24, 43, 58, 5, 57, 1, 46, 43], [44, 53, 56, 1, 58, 46, 39, 58], [52, 58, 1, 58, 46, 39, 58, 1]] the target is 5
when input is [[24, 43, 58, 5, 57, 1, 46, 43], [44, 53, 56, 1, 58, 46, 39, 58], [52, 58, 1, 58, 46, 39, 58, 1], [25, 17, 27, 10, 0, 21, 1, 54]] the target is 57
when input is [[24, 43, 58, 5, 57, 1, 46, 43], [44, 53, 56, 1, 58, 46, 39, 58], [52, 58, 1, 58, 46, 39, 58, 1], [

In [10]:
print(xb)  # our input to the transformer

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')


In [16]:

import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size).to('cuda:0')  # had to add .to('cuda:0') here to ensure all is on the same device

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C) = (batch, time, channel) - just plugs out the idx-row from the embedding table; channel = vocab_size
        return logits  # logits are the score for the next character in the sequence

m = BigramLanguageModel(vocab_size)
out = m(xb, yb)
#out = m(xb.to('cuda:0'), yb.to('cuda:0'))  # with device casting - seems to be not necessary as xb, yb are on cuda here already
print(out.shape)



torch.Size([4, 8, 65])


In [22]:
vocab_size

65

In [24]:

import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size).to('cuda:0')  # had to add .to('cuda:0') here to ensure all is on the same device

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C) = (batch, time, channel) - just plugs out the idx-row from the embedding table; channel = vocab_size

        if targets is None:  # targets optional
            loss = None
        else:  # pytorch wants the channel to be the 2. dimension...
            B, T, C = logits.shape  # ...so we have to repack them
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)  # do the same for targets
            loss = F.cross_entropy(logits, targets)  # how well are we predicting the next character?
            
        return logits, loss  # logits are the score for the next character in the sequence

    def generate(self, idx, max_new_tokens):  # generates in the time dimension for all batch dimensions, till max_new_tokens
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)  # the model
logits, loss = m(xb, yb)
#out = m(xb.to('cuda:0'), yb.to('cuda:0'))  # with device casting - seems to be not necessary as xb, yb are on cuda here already
print(logits)
print(loss)  # compare to -ln(1/65) with a vocab size of 65 = 4.174...

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # a 0-tensor to start generation
print(decode(m.generate(context, max_new_tokens=100)[0].tolist()))  # also unplug the batch dimension

tensor([[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        ...,
        [-2.1910, -0.7574,  1.9656,  ..., -0.3580,  0.8585, -0.6161],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor(4.8786, device='cuda:0', grad_fn=<NllLossBackward0>)

pYCXxfRkRZd
wc'wfNfT;OLlTEeC K
jxqPToTb?bXAUG:C-SGJO-33SM:C?YI3a
hs:LVXJFhXeNuwqhObxZ.tSVrddXlaSZaNe


In [26]:
# now to training the model
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


In [33]:
# train
batch_size = 32  # increase batch size
for iter in range(10000):

    # every once in a while evaluate the loss on train and val sets
    #if iter % eval_interval == 0:
    #    losses = estimate_loss()
    #    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.403348922729492


In [36]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # a 0-tensor to start generation
print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))  # also unplug the batch dimension


Wid the wllo INour id, mersed
Fourd?
TI idurd po venond, d Cad ty
K:
BIUSoou tiund thornofen e sutan wiporthare whanot, sitthers, spe Bllellke, on s h O, t pan, ce wat d tive wout ir f; u;

Feknen oue


In [39]:
# train with an averaging loss estimator

@torch.no_grad()
def estimate_loss():  # making loss estimates less noisy
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out
    
batch_size = 32  # increase batch size
for iter in range(10000):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

step 0: train loss 2.4620, val loss 2.4799
step 500: train loss 2.4509, val loss 2.4872
step 1000: train loss 2.4454, val loss 2.4797
step 1500: train loss 2.4448, val loss 2.4900
step 2000: train loss 2.4579, val loss 2.4778
step 2500: train loss 2.4469, val loss 2.4761
step 3000: train loss 2.4451, val loss 2.4793
step 3500: train loss 2.4570, val loss 2.4901
step 4000: train loss 2.4614, val loss 2.4829
step 4500: train loss 2.4543, val loss 2.4756
step 5000: train loss 2.4587, val loss 2.4904
step 5500: train loss 2.4529, val loss 2.4916
step 6000: train loss 2.4576, val loss 2.4889
step 6500: train loss 2.4656, val loss 2.4859
step 7000: train loss 2.4537, val loss 2.4917
step 7500: train loss 2.4452, val loss 2.4891
step 8000: train loss 2.4580, val loss 2.4900
step 8500: train loss 2.4545, val loss 2.4920
step 9000: train loss 2.4504, val loss 2.4915
step 9500: train loss 2.4512, val loss 2.4837
2.434276819229126


In [40]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # a 0-tensor to start generation
print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))  # also unplug the batch dimension


MPrrtano iru forealoiroret HEnk;
CUCaden tck in, d ser t ftanofallon bay ho s, agallen, meseveminds s; te worimyoin ie--
ARUSThe Whou wowhedichea blare aned hy senonirstha theint co mas, the an be ke 


In [46]:
# a mathematical trick

torch.manual_seed(1337)

B, T, C = 4, 8, 2  # batch, time, channels (channels = the number of information components at each time step in a batch)
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [66]:
# we want the token to talk to each other, but not into the future

# simplest case in communicating with the past: just the average
# we want x[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))  # x-bag-of-words
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]  # x[t, C] everything up to and including t
        xbow[b, t] = torch.mean(xprev, 0)

print(x[0])
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [51]:
# loops are very inefficient, so going for matrix algebra

torch.manual_seed(42)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # matrix multiplication
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")

a: tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
b: tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c: tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


In [53]:
torch.tril(torch.ones(3, 3))  # lower triangular matrix

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [55]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # matrix multiplication
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")

# this now gives the sums over rows

a: tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
b: tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c: tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [58]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # matrix multiplication
print(f"a: {a}")
print(f"b: {b}")
print(f"c: {c}")

# this now gives the column averages up to the given row

a: tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b: tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c: tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [67]:
# so turn this into a more efficient weight calculation for a simple average value lookback
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)

wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [68]:
xbow2 = wei @ x  # (T, T) @ (B, T, C) -> (B, T, T) @ (B, T, C) since dims do not match so -> (B, T, C)
# i.e. for each batch element multiply (T,C) by (T,T) in parallel

In [69]:
xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [72]:
torch.allclose(xbow, xbow2, 1e-3)

True

In [77]:
# 3. version
tril = torch.tril(torch.ones(T, T))  # lower triang matrix
wei = torch.zeros((T, T))  # all 0s: initially the interaction strength (between the tokens) is 0
wei = wei.masked_fill( tril == 0, float('-inf'))  # set to -inf wherever tril has 0 entry -> tokens from the past can't talk to the future
wei = F.softmax(wei, dim=-1)  # dim=-1 -> along rows -> averages the values over the number of cols: 1 / n_cols

print(wei)

xbow3 = wei @ x

torch.allclose(xbow, xbow3, 1e-3)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [86]:
# now adding a true embedding via a simple linear layer

import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input_sp0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)  # added: a true embedding via linear layer

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C) = (batch, time, channel), channel = vocab_size
        logits = self.lm_head(tok_emb)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        for layer in model.children():  # show some of the model weights
            if isinstance(layer, nn.Linear):
                print('weight:', layer.weight[0])
                #print('bias:', layer.bias)

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.3886, val loss 4.3734
weight: tensor([ 0.0048,  0.1580,  0.1353,  0.1497,  0.1003,  0.1122,  0.0204,  0.0371,
         0.1236,  0.1255, -0.0915, -0.0697, -0.0009, -0.1504,  0.0219,  0.0610,
        -0.0391,  0.1378, -0.1737, -0.0799, -0.0614,  0.0013, -0.0683,  0.0843,
         0.0443, -0.1751, -0.1476, -0.0838, -0.1292, -0.1188, -0.0336,  0.0750],
       device='cuda:0', grad_fn=<SelectBackward0>)
step 300: train loss 2.5267, val loss 2.5399
weight: tensor([ 0.3775,  0.0602,  0.6586,  0.4240,  0.1560,  0.0591,  0.4988, -0.0598,
         0.1781,  0.0452, -0.1271, -0.1626, -0.0275, -0.5549, -0.2615,  0.2745,
        -0.5423, -0.3858, -0.3118,  0.2559, -0.5134, -0.2357, -0.2948,  0.2617,
         0.2396, -0.2776, -0.5593, -0.0584,  0.4738, -0.3698, -0.0105, -0.3609],
       device='cuda:0', grad_fn=<SelectBackward0>)
step 600: train loss 2.4998, val loss 2.5315
weight: tensor([ 0.2866,  0.0867,  0.6215,  0.3985,  0.0940, -0.0382,  0.5687, -0.0393,
         0.2368,  0

In [1]:
# we do not only want to encode the identity of the tokens, but also their position

import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input_sp0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)  # embedding token
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # embedding position
        self.lm_head = nn.Linear(n_embd, vocab_size)  # adding a true embedding layer: map from embedding to token vocab

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C) = (batch, time, channel), channel = vocab_size
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C), arange: all ints up to t-1
        x = tok_emb + pos_emb  # (B,T,C) will be the addition of token and position embedding
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        B, T = idx.shape
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (idx also has the PE so we need to crop)
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

    def own_softmax(self, x):
        eps = 1.e-10
        means = torch.mean(x, 1, keepdim=True)[0] + eps
        x_exp = torch.exp(x - means)
        x_exp_sum = torch.sum(x_exp, 1, keepdim=True) + eps
        return x_exp / x_exp_sum + eps

model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


step 0: train loss 4.4801, val loss 4.4801
step 300: train loss 2.5404, val loss 2.5566
step 600: train loss 2.5160, val loss 2.5335
step 900: train loss 2.4967, val loss 2.5149
step 1200: train loss 2.5106, val loss 2.5254
step 1500: train loss 2.4853, val loss 2.5109
step 1800: train loss 2.4966, val loss 2.5198
step 2100: train loss 2.4949, val loss 2.5100
step 2400: train loss 2.4937, val loss 2.5102
step 2700: train loss 2.5040, val loss 2.5114



CExthantrid owindikis s, bll

HAPen bube t e.
S:
O:
IS:
Folatangs:
Wanthar u qurthe. bar dilasoate awice my.

Hastatom o mup
Yowhthatof isth ble mil; dilll,

W:

Ye s, hain latisttid ov ts, and Wh pomano.
Swanous l lind me l.
MIshe ce hiry ptupr aisspllw y. w'stoul noroopetelaves
Momy ll, d mothake o Windo wh t eiibys the m douris TENGByore s poo mo th; te

AN ad nthrupt f s ar irist m:

Thin maleronth, af Pre?

Whio myr f-
LI har,
S:


Thardsal this ghesthidin cour ay aney Iry ts I f my ce hy


In [4]:
import torch

torch.multinomial(torch.tensor([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0'), num_samples=1)

RuntimeError: invalid multinomial distribution (sum of probabilities <= 0)

In [10]:
# self-attention

torch.manual_seed(1337)

B, T, C = 4, 8, 32  # batch, time, channels = the information in each token is 32-dim (for 4 batches of 8 contextlength)
x = torch.randn(B, T, C)

# single-head self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)  # bias=flase: essentially just a matrix multiplication
query = nn.Linear(C, head_size, bias=False)
k = key(x)  # now apply simple linear model to x (B, T, 16)
q = query(x)  # (B, T, 16)
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T, T))  # we do not want this to be all 0s: some tokens might find certain other ones more interesting
# self-attention solves this: every single token in the past emits two vectors: a query Q and a key K
# Q: what am I looking for?
# K: what do I contain?
# dot-product my query Q with all other keys k -> wei (higher correlates -> larger dot-product -> larger weight)

wei = wei.masked_fill( tril == 0, float('-inf'))  # not looking into the future again
wei = F.softmax(wei, dim=-1)  # make a nice distribution out of the rather rough dot-product-values (softmax = exp and normalize)
out = wei @ x

out.shape

torch.Size([4, 8, 32])

In [11]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [13]:
# self-attention - a look under the hood

torch.manual_seed(1337)

B, T, C = 4, 8, 32  # batch, time, channels = the information in each token is 32-dim (for 4 batches of 8 contextlength)
x = torch.randn(B, T, C)

# single-head self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)  # bias=flase: essentially just a matrix multiplication
query = nn.Linear(C, head_size, bias=False)
k = key(x)  # now apply simple linear model to x (B, T, 16)
q = query(x)  # (B, T, 16)
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T, T))  # we do not want this to be all 0s: some tokens might find certain other ones more interesting
# self-attention solves this: every single token in the past emits two vectors: a query Q and a key K
# Q: what am I looking for?
# K: what do I contain?
# dot-product my query Q with all other keys k -> wei (higher correlates -> larger dot-product -> larger weight)

#wei = wei.masked_fill( tril == 0, float('-inf'))
#wei = F.softmax(wei, dim=-1)
out = wei @ x

out.shape

torch.Size([4, 8, 32])

In [14]:
wei[0]  # the raw affinities between the tokens

tensor([[-1.7629, -1.3011,  0.5652,  2.1616, -1.0674,  1.9632,  1.0765, -0.4530],
        [-3.3334, -1.6556,  0.1040,  3.3782, -2.1825,  1.0415, -0.0557,  0.2927],
        [-1.0226, -1.2606,  0.0762, -0.3813, -0.9843, -1.4303,  0.0749, -0.9547],
        [ 0.7836, -0.8014, -0.3368, -0.8496, -0.5602, -1.1701, -1.2927, -1.0260],
        [-1.2566,  0.0187, -0.7880, -1.3204,  2.0363,  0.8638,  0.3719,  0.9258],
        [-0.3126,  2.4152, -0.1106, -0.9931,  3.3449, -2.5229,  1.4187,  1.2196],
        [ 1.0876,  1.9652, -0.2621, -0.3158,  0.6091,  1.2616, -0.5484,  0.8048],
        [-1.8044, -0.4126, -0.8306,  0.5899, -0.7987, -0.5856,  0.6433,  0.6303]],
       grad_fn=<SelectBackward0>)

In [31]:
# self-attention now with value V added
# attention is essentailly a communication mechanism
# can be seen as a directed graph: a node (token) collects info from nodes (tokens) that point to it
# positional encoding (PE) needed as there is no notion of space; PE tells the tokens where they are
# not like convolution which contains location information

torch.manual_seed(1337)

B, T, C = 4, 8, 32  # batch, time, channels = the information in each token is 32-dim (for 4 batches of 8 contextlength)
x = torch.randn(B, T, C)

# single-head self-attention
head_size = 16
# self-attention solves this: every single token in the past emits two vectors: a query Q and a key K
# Q: what am I looking for?
# K: what do I contain? (publicly, I also have some private info)
# dot-product my query Q with all other keys k -> wei (higher correlates -> larger dot-product -> larger weight)
# V: value, the private information of the token: if you find me interesting, here is what I will give you
key = nn.Linear(C, head_size, bias=False)  # bias=flase: essentially just a matrix multiplication
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)  # now apply simple linear model to x (B, T, 16)
q = query(x)  # (B, T, 16)
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill( tril == 0, float('-inf'))  # not looking into the future again - delete this for an encoder (here we have a decoder)
wei = F.softmax(wei, dim=-1)  # make a nice distribution out of the rather rough dot-product-values (softmax = exp and normalize)

v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [17]:
out[0]

tensor([[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007, -0.5239,
         -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,  0.2862,  0.5710],
        [ 0.6764, -0.5477, -0.2478,  0.3143, -0.1280, -0.2952, -0.4296, -0.1089,
         -0.0493,  0.7268,  0.7130, -0.1164,  0.3266,  0.3431, -0.0710,  1.2716],
        [ 0.4823, -0.1069, -0.4055,  0.1770,  0.1581, -0.1697,  0.0162,  0.0215,
         -0.2490, -0.3773,  0.2787,  0.1629, -0.2895, -0.0676, -0.1416,  1.2194],
        [ 0.1971,  0.2856, -0.1303, -0.2655,  0.0668,  0.1954,  0.0281, -0.2451,
         -0.4647,  0.0693,  0.1528, -0.2032, -0.2479, -0.1621,  0.1947,  0.7678],
        [ 0.2510,  0.7346,  0.5939,  0.2516,  0.2606,  0.7582,  0.5595,  0.3539,
         -0.5934, -1.0807, -0.3111, -0.2781, -0.9054,  0.1318, -0.1382,  0.6371],
        [ 0.3428,  0.4960,  0.4725,  0.3028,  0.1844,  0.5814,  0.3824,  0.2952,
         -0.4897, -0.7705, -0.1172, -0.2541, -0.6892,  0.1979, -0.1513,  0.7666],
        [ 0.1866, -0.0

Notes:

- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [20]:
# demo: scale to variance 1
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5


In [21]:
k.var()

tensor(0.9006)

In [22]:
# softmax sharpens values, i.e. makes the data more peaky
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [24]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot; then we just aggregate info from a single node

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [25]:
class LayerNorm1d: # (used to be BatchNorm1d)
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [26]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [27]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))

In [28]:
# French to English translation example:

# <--------- ENCODE ------------------><--------------- DECODE ----------------->
# les réseaux de neurones sont géniaux! <START> neural networks are awesome!<END>

# we just implemented a decoder here
# enc-dec: encoder looks at all tokens, forward and backward; K+V passes from the encoder to the decoder
# GPT just a decoder

### Full finished code, for reference
You may want to refer directly to the git repo instead though.

In [30]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16  # how many independent sequences will we process in parallel?
block_size = 32  # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input_sp0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):  # the heads handle the attention - the communication between the nodes (tokens)
    """ one head of self-attention """

    def __init__(self, head_size):  # takes headsize
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)  # k,q,v like above
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))  # tril is not a parameter of the model, a buffer in pytorch lingo
        # (so it does not get picked up by autograd)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x)  # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])  # pytorch makes this easy for us, passing just a list of heads
        self.proj = nn.Linear(n_embd, n_embd)  # project back into the original pathway
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # just concatinate the results, concat over channel dimension (dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):  # this handles the per-token (node) computation
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  # the inner ff layer has 4 times the embedding dimension (to grow the computation on the residual pathway)
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),  # this is the projection layer going back into the original pathway (for the skip-connection) and going down from 4x
            nn.Dropout(dropout),  # added right before the residual pathway comes back (dropout trains subnetworks by randomly shutting off neurons)
        )

    def forward(self, x):
        return self.net(x)  # this is at a per token level: they think on all the attention data they got individually

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head  # watch out that it works out channelwise (e.g. 32 embedding, 4 heads, 8 headsize)
        self.sa = MultiHeadAttention(n_head, head_size)  # the heads running in parallel - the communication
        self.ffwd = FeedFoward(n_embd)  # simple feedforward: at token level, all tokens do this independently: gathered all info from attention, think on it
        # the ff does the computations
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):  # with skip connection, forking off direct gradient pass-through and block adjustments
        x = x + self.sa(self.ln1(x))  # the x + is the skip connection: it lets the gradient pass through (particularly initially), then the blocks adjust
        x = x + self.ffwd(self.ln2(x))  # note that the layer norms are applied before it goes into self-attention or ff, per token
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # now the network starts to get really deep: suffers from computation problems, two ways to address this: (see Add & Norm in the figure)
        # 1. skip (residual) connections, see https://github.com/KaimingHe/deep-residual-networks (lets gradients pass through and lets blocks adjust them gradually)
        # 2. layer norms = Norm: similar to batch normalization: ensure each neuron has unit Gaussian distribution accross batch dimension
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm (now it is more common to norm before the computations unlike in the trafo paper - called prenorm)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (idx also has the PE so we need to crop) (not doing this causes a CUDA error due to index out of bounds)
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


0.209729 M parameters
step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5090, val loss 2.5058
step 300: train loss 2.4198, val loss 2.4339
step 400: train loss 2.3502, val loss 2.3563
step 500: train loss 2.2965, val loss 2.3128
step 600: train loss 2.2411, val loss 2.2498
step 700: train loss 2.2062, val loss 2.2199
step 800: train loss 2.1640, val loss 2.1870
step 900: train loss 2.1244, val loss 2.1507
step 1000: train loss 2.1035, val loss 2.1302
step 1100: train loss 2.0693, val loss 2.1174
step 1200: train loss 2.0380, val loss 2.0798
step 1300: train loss 2.0248, val loss 2.0646
step 1400: train loss 1.9916, val loss 2.0361
step 1500: train loss 1.9699, val loss 2.0306
step 1600: train loss 1.9635, val loss 2.0488
step 1700: train loss 1.9406, val loss 2.0144
step 1800: train loss 1.9078, val loss 1.9939
step 1900: train loss 1.9095, val loss 1.9888
step 2000: train loss 1.8847, val loss 1.9950
step 2100: train loss 1.

In [3]:
# train loss > validation loss: check for overfitting
# now we scale the model up

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64  # how many independent sequences will we process in parallel?
block_size = 256  # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input_sp0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):  # the heads handle the attention - the communication between the nodes (tokens)
    """ one head of self-attention """

    def __init__(self, head_size):  # takes headsize
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)  # k,q,v like above
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))  # tril is not a parameter of the model, a buffer in pytorch lingo
        # (so it does not get picked up by autograd)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x)  # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,C)
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])  # pytorch makes this easy for us, passing just a list of heads
        self.proj = nn.Linear(n_embd, n_embd)  # project back into the original pathway
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # just concatinate the results, concat over channel dimension (dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):  # this handles the per-token (node) computation
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  # the inner ff layer has 4 times the embedding dimension (to grow the computation on the residual pathway)
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),  # this is the projection layer going back into the original pathway (for the skip-connection) and going down from 4x
            nn.Dropout(dropout),  # added right before the residual pathway comes back (dropout trains subnetworks by randomly shutting off neurons)
        )

    def forward(self, x):
        return self.net(x)  # this is at a per token level: they think on all the attention data they got individually

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head  # watch out that it works out channelwise (e.g. 32 embedding, 4 heads, 8 headsize)
        self.sa = MultiHeadAttention(n_head, head_size)  # the heads running in parallel - the communication
        self.ffwd = FeedFoward(n_embd)  # simple feedforward: at token level, all tokens do this independently: gathered all info from attention, think on it
        # the ff does the computations
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):  # with skip connection, forking off direct gradient pass-through and block adjustments
        x = x + self.sa(self.ln1(x))  # the x + is the skip connection: it lets the gradient pass through (particularly initially), then the blocks adjust
        x = x + self.ffwd(self.ln2(x))  # note that the layer norms are applied before it goes into self-attention or ff, per token
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # now the network starts to get really deep: suffers from computation problems, two ways to address this: (see Add & Norm in the figure)
        # 1. skip (residual) connections, see https://github.com/KaimingHe/deep-residual-networks (lets gradients pass through and lets blocks adjust them gradually)
        # 2. layer norms = Norm: similar to batch normalization: ensure each neuron has unit Gaussian distribution accross batch dimension
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm (now it is more common to norm before the computations unlike in the trafo paper - called prenorm)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (idx also has the PE so we need to crop) (not doing this causes a CUDA error due to index out of bounds)
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

10.788929 M parameters
step 0: train loss 4.2849, val loss 4.2823
step 500: train loss 2.0013, val loss 2.0882
step 1000: train loss 1.5956, val loss 1.7720
step 1500: train loss 1.4395, val loss 1.6371
step 2000: train loss 1.3406, val loss 1.5667
step 2500: train loss 1.2803, val loss 1.5329
step 3000: train loss 1.2261, val loss 1.5056
step 3500: train loss 1.1831, val loss 1.4855
step 4000: train loss 1.1454, val loss 1.4823
step 4500: train loss 1.1098, val loss 1.4797
step 4999: train loss 1.0763, val loss 1.4900

Clarence and enemy digsing.

Here's abroad.

ELBOW:
If it be resternoed, might he consul, these he spents ale
that this hand to drivine of substance
which his purbling and rusts rise his fight,
or his subject in a kingdom and all:
God boots it some could win him harlibly,
And knows no more blackvace.

FRIAR LAURENCE:
Be it as adoing thy grue to tyrer.

BUCKINGHAM:
Poor horse! and cross we did it so, mady young,
What's heart? what, his pathyst? loveliament!

FRIAR LAUREN

# how would we train GPT?
two steps:
1. pretraining (what we have done here, but larger with a lot of text from the internet): just enable it to babble text (does not answer questions)
2. finetuning stage: 2.1 collect question-answer data and feed it to model  2.2 let human raters evaluate the model output to train a reward model 2.3 run PPO reinforcement to optimize the model

our dataset was Shakespear (1 MM tokens), with subword (GPT) this corresponds to about 300.000 tokens - 300 billion for GPT large model