# Makemore Manual Backprop

In [20]:
import torch
import copy
import math
import random
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

In [21]:
words = open('names.txt', 'r').read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [22]:
chars = sorted(set(''.join(words)))

stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0

itos = {i:s for s,i in stoi.items()}

In [23]:
context_len = 3
context = [0] * context_len

X, Y = [], []
random.shuffle(words)

for w in words:    
    for char in w + '.':
        X.append(context)
        Y.append(stoi[char])  
        context = context[1:] + [stoi[char]]
        
X, Y = torch.tensor(X), torch.tensor(Y)

n1, n2 = int(len(X)*0.8), int(len(X)*0.9)
Xtr, Ytr = X[:n1], Y[:n1]
Xval, Yval = X[n1:n2], Y[n1:n2]
Xtest, Ytest = X[n2:], Y[n2:]

In [24]:
def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [25]:
# reset params to retrain
vocab_size = len(itos)
n_embed = 10
n_hidden = 64

C = torch.randn((vocab_size, n_embed))
W1 = torch.randn((n_embed * context_len, n_hidden)) * (5/3)/((n_embed * context_len)**0.5)
b1 = torch.randn(n_hidden)                          * 0.1 # useless bc of batchnorm

W2 = torch.randn((n_hidden, vocab_size))            * 0.1
b2 = torch.randn(vocab_size)                        * 0.1

bngain = torch.ones((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.ones((1, n_hidden)) * 0.1

bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]

n_params = sum(p.numel() for p in parameters)
print(f"{n_params} parameters")

for p in parameters:
    p.requires_grad = True

4137 parameters


In [29]:
# minibatch training loop
NUM_ITERS = 1000 # 200000
batch_size = 32

for i in range(NUM_ITERS):
    
    # minibatch indices
    idxs = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[idxs], Ytr[idxs]
    
    # forward pass
    emb = C[Xb].view(-1, n_embed * context_len)
    hpreact = emb @ W1 + b1
    
    # running calc of stdev and mean for batchnorm
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    
    hpreact = bngain * (hpreact - bnmeani)/(bnstdi) + bnbias
    h = hpreact.tanh()
    logits = h @ W2 + b2
    
    # updating running batchnorm statistics
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

    # calculating loss
    loss = F.cross_entropy(logits, Yb)
    
    # clearing old gradients
    for p in parameters:
        p.grad = None
    
    # backprop & update
    loss.backward()
    
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data -= lr * p.grad
        
    if i % 10000 == 0:
        print(loss.item())

2.7692854404449463


## Chunkated Forward Pass

In [32]:
batch_size = 32

# minibatch indices
idxs = torch.randint(0, Xtr.shape[0], (batch_size,))
Xb, Yb = Xtr[idxs], Ytr[idxs]

In [None]:
# forward pass, "chunkated" into smaller steps that are possible to backward one at a time
emb = C[Xb] # embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
# Linear layer 1
hprebn = embcat @ W1 + b1 # hidden layer pre-activation
# BatchNorm layer
bnmeani = 1/n*hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
bnvar_inv = (bnvar + 1e-5)**-0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias
# Non-linearity
h = torch.tanh(hpreact) # hidden layer
# Linear layer 2
logits = h @ W2 + b2 # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True)
counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

# PyTorch backward pass
for p in parameters:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
    t.retain_grad()
loss.backward()
loss