In [1]:
import torch
import random
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read all words (one name per line)

# A context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# number of lines (names) read from names.txt
len(words)

32033

Vocab

In [4]:
# Character-level vocabulary: the distinct characters of all words get ids
# 1..N in sorted order; '.' gets id 0 (it is used as the padding / end-of-word
# token when the dataset is built).
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# inverse lookup: id -> character
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


Dataset

In [5]:
block_size = 3  # context length: number of preceding character ids used to predict the next one

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with '.', and each character (including the
    terminator) becomes one example whose context is the `block_size`
    preceding character ids, left-padded with 0.

    Reads the module-level `stoi` and `block_size`, and prints the shapes of
    the tensors it returns.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): X is an int64 tensor of shape (N, block_size) with the
        contexts, Y is an int64 tensor of shape (N,) with the target ids.
        An empty `words` gives shapes (0, block_size) and (0,).

    Raises:
        KeyError: if a character is not present in `stoi`.
    """
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # drop the oldest id, append the new one

    # Explicit dtype and shape: torch.tensor([]) would otherwise be a float
    # tensor of shape (0,), which cannot be used to index the embedding table.
    X = torch.tensor(X, dtype=torch.long).view(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y


# Seed Python's RNG so the shuffle, and therefore the split, is repeatable.
random.seed(42)
random.shuffle(words)  # NOTE: shuffles `words` in place; re-running only this cell reshuffles the already-shuffled list
n1 = int(0.8*len(words))  # end index of the training slice (first 80% of the words)
n2 = int(0.9*len(words))  # end index of the dev slice (next 10%); the remainder is the test slice
X_train, Y_train = build_dataset(words[:n1])
X_dev, Y_dev = build_dataset(words[n1:n2])
X_test, Y_test = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


MLP params

In [6]:
# Model hyperparameters
n_embed = 10    # size of each character embedding vector
n_hidden = 200  # number of units in the hidden layer

g = torch.Generator().manual_seed(42)

# Embedding table: one n_embed-dimensional row per vocabulary entry.
C = torch.randn(vocab_size, n_embed, generator=g)

# Hidden layer. W1 is scaled by gain / sqrt(fan_in), with gain 5/3 (the value
# torch.nn.init.calculate_gain recommends for tanh).
W1 = torch.randn(n_embed*block_size, n_hidden, generator=g) * (5/3) / (n_embed*block_size)**0.5
b1 = torch.randn(n_hidden, generator=g) * 0.01

# Output layer: small weights and an all-zero bias keep the initial logits
# close to zero, i.e. close to a uniform prediction.
W2 = torch.randn(n_hidden, vocab_size, generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0

parameters = [C, W1, b1, W2, b2]
n_params = [p.nelement() for p in parameters]
print(sum(n_params))  # total number of trainable scalars
for p in parameters:
    p.requires_grad = True

11897


Optimization

Eval

In [7]:
@torch.no_grad()  # evaluation only: no autograd graph is recorded
def split_loss(split):
    """Print the mean cross-entropy of the plain MLP on one data split.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Reads the module-level datasets and parameters (C, W1, b1, W2, b2),
    prints the split name followed by the loss, and returns None.
    """
    datasets = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test),
    }
    inputs, targets = datasets[split]

    embedded = C[inputs]                            # one embedding per context position
    flat = embedded.view(embedded.shape[0], -1)     # one flattened row per example
    hidden = torch.tanh(flat @ W1 + b1)
    scores = hidden @ W2 + b2
    print(split, F.cross_entropy(scores, targets).item())

# Baseline before any training: the output layer starts near zero, so the
# prediction is close to uniform over the 27 tokens and the loss should be
# close to ln(27), about 3.30.
split_loss('train')
split_loss('val')
    

train 3.2980921268463135
val 3.2999227046966553


Sample from the model

In [8]:
# Draw 20 names from the model, one character at a time.
# NOTE: the generator created here is also the `g` that the training cell
# below draws its minibatch indices from.
g = torch.Generator().manual_seed(42)

for _ in range(20):
    out = []
    context = [0] * block_size  # start from a context made only of id 0 ('.')
    while True:
        # forward pass for the single current context
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)

        # sample the next character id from the predicted distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()

        # record the sample and slide the context window forward
        out.append(ix)
        context = [*context[1:], ix]

        # id 0 is '.', which ends the word
        if ix == 0:
            break

    print(''.join([itos[idx] for idx in out]))


xjuguenvtps.
fabiquedxfmubnwmsflaypglzofmwhwlxoln.
epjccuodsgjdmzu.
knxcmjjobdrggbdlpk.
mnqhqyjfbscvghigeaczalcvjwzajwtphjpdmquotcc.
weltxosvgkohobr.
uklnncvrigmydlsoumf.
pjjiewx.
lxmjuhm.
fsckbirdovhgn.
kgoktfkzuacabxa.
atodr.
bxwqzjzdqvtmdampemaqj.
omtafjiirvqtlfkyeumxuoxtame.
ovzqmywog.
acdtqumkorvdyxxhlsogob.
tnslwkgmnfuyccqendhln.
quehejojixfdirndbgcpvrsczagrtpltqc.
jsnq.
dazxygkihhnynvyfjfzgxlvkqncqgahwkig.


In [9]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss, one entry per step

for i in range(max_steps):

    # draw a random minibatch of training examples
    ix = torch.randint(0, X_train.shape[0], (batch_size,), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    emb = C[Xb]                          # (batch_size, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1)  # flatten each context into one row
    h_pre_act = embcat @ W1 + b1         # hidden-layer pre-activations
    h = torch.tanh(h_pre_act)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)

    # backward pass: drop stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # step learning-rate decay: 0.1 for the first 100000 steps, 0.01 afterwards
    if i < 100000:
        lr = 0.1
    else:
        lr = 0.01

    # plain SGD update, done in place and outside of autograd
    with torch.no_grad():
        for p in parameters:
            p += -lr * p.grad

    # track stats
    lossi.append(loss.log10().item())
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

      0/ 200000: 3.2844
  10000/ 200000: 2.0816
  20000/ 200000: 2.3091
  30000/ 200000: 1.9938
  40000/ 200000: 2.2890
  50000/ 200000: 2.2568
  60000/ 200000: 2.3548
  70000/ 200000: 2.4330
  80000/ 200000: 2.0860
  90000/ 200000: 2.1756
 100000/ 200000: 2.5186
 110000/ 200000: 1.8009
 120000/ 200000: 1.8870
 130000/ 200000: 2.0851
 140000/ 200000: 1.8776
 150000/ 200000: 2.4181
 160000/ 200000: 1.8908
 170000/ 200000: 1.8506
 180000/ 200000: 1.8193
 190000/ 200000: 2.1520


In [10]:
@torch.no_grad()  # no gradients are needed for evaluation
def split_loss(split):
    """Evaluate the plain MLP on one split and print its mean cross-entropy.

    This performs the same computation as the `split_loss` defined before
    training; it is declared again in this cell.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Uses the module-level datasets and parameters; returns None.
    """
    by_name = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test),
    }
    x, y = by_name[split]

    emb = C[x]
    n_examples = emb.shape[0]
    # flatten the per-position embeddings of each example into one row
    features = emb.view(n_examples, -1)
    activations = torch.tanh(features @ W1 + b1)
    loss = F.cross_entropy(activations @ W2 + b2, y)
    print(split, loss.item())

# loss of the trained plain MLP on the training and dev splits
split_loss('train')
split_loss('val')
    

train 2.039062023162842
val 2.1046345233917236


Batch Normalization

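- Batch normalization standardizes the hidden-layer pre-activations so that, across each batch, every hidden unit is roughly unit Gaussian (zero mean, unit standard deviation)
- Introduced by Ioffe & Szegedy, 2015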

In [17]:
# Per-hidden-unit mean (reduced over dim 0) of whatever `h_pre_act` currently holds;
# presumably the pre-activations of the last training minibatch.
# NOTE(review): this cell's execution count (In [17]) is out of order, so the value
# depends on which cell ran last; confirm with Restart & Run All.
h_pre_act.mean(0, keepdim=True)

tensor([[ 1.2005,  0.1994, -0.0738, -0.1525, -0.1722, -1.1817, -0.6340, -0.7247,
         -1.1173, -0.0787,  1.3171, -0.2830, -0.3154,  0.1196,  0.1557, -1.2167,
         -0.0979,  0.7173, -0.2463, -0.8190, -0.0697,  0.2824,  0.2509,  0.1362,
         -0.4909, -0.2047, -0.2319, -0.3635, -0.1256, -0.1480, -0.7942,  0.2213,
          0.5273,  0.2264, -0.2782,  0.6224, -0.3994, -0.3575, -0.3492, -0.2396,
          1.4038, -0.5036,  0.6411, -0.1129,  0.1681, -0.5357,  0.4271,  0.7632,
         -0.2442, -0.5371,  0.2574,  0.0605,  0.5593, -0.2472, -0.5255,  0.0656,
         -0.2850,  0.0920, -0.3704,  0.3840,  0.3378,  1.0413, -0.7199, -0.4037,
          0.2607,  0.0978,  0.5380,  0.8435,  0.7100,  0.5093, -0.8039, -0.9870,
          0.7105, -0.1490,  1.2891,  0.7050, -0.6263, -0.9175,  0.3128, -0.1173,
          0.6793,  0.9945, -0.2574,  0.1931,  0.6269, -0.3658,  0.2987, -0.7211,
         -0.7010,  1.0525, -0.4769, -0.0985,  0.2889,  0.9990, -0.4814, -0.0340,
          1.1239, -0.2605,  

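- Hidden pre-activations should be unit Gaussian at initialization, but during training the network should be free to shift, spread out or sharpen them as the gradient dictates
- Hence a learnable scale (`bngain`) and shift (`bnbias`) are applied after the normalization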

In [15]:
# Model hyperparameters
n_embed = 10    # size of each character embedding vector
n_hidden = 200  # number of units in the hidden layer

g = torch.Generator().manual_seed(42)

# Embedding table and the two linear layers (same initialisation scheme as
# the plain MLP: W1 scaled by (5/3) / sqrt(fan_in), small W2, zero b2).
C = torch.randn(vocab_size, n_embed, generator=g)
W1 = torch.randn(n_embed*block_size, n_hidden, generator=g) * (5/3) / (n_embed*block_size)**0.5
b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn(n_hidden, vocab_size, generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0

# Batch-norm scale and shift, one value per hidden unit. Starting at gain 1
# and bias 0 means the layer initially outputs the standardized values.
bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden)

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
n_params = [p.nelement() for p in parameters]
print(sum(n_params))  # total number of trainable scalars
for p in parameters:
    p.requires_grad = True

12297


In [19]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss, one entry per step

for i in range(max_steps):

    # draw a random minibatch of training examples
    ix = torch.randint(0, X_train.shape[0], (batch_size,), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    emb = C[Xb]                          # (batch_size, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1)  # flatten each context into one row
    h_pre_act = embcat @ W1 + b1

    # batchnorm layer: standardize each hidden unit over the batch, then apply
    # the learnable gain and bias.
    # NOTE(review): no epsilon is added to the std, so a unit whose
    # pre-activation is constant across the batch would divide by zero.
    bnmeani = h_pre_act.mean(0, keepdim=True)
    bnstdi = h_pre_act.std(0, keepdim=True)
    h_pre_act = bngain * (h_pre_act - bnmeani) / bnstdi + bnbias

    h = torch.tanh(h_pre_act)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)

    # backward pass: drop stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # step learning-rate decay: 0.1 for the first 100000 steps, 0.01 afterwards
    if i < 100000:
        lr = 0.1
    else:
        lr = 0.01

    # plain SGD update, done in place and outside of autograd
    with torch.no_grad():
        for p in parameters:
            p += -lr * p.grad

    # track stats
    lossi.append(loss.log10().item())
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

      0/ 200000: 3.2757
  10000/ 200000: 2.7730
  20000/ 200000: 2.3097
  30000/ 200000: 2.3467
  40000/ 200000: 2.3512
  50000/ 200000: 2.2381
  60000/ 200000: 1.9143
  70000/ 200000: 1.9234
  80000/ 200000: 2.4858
  90000/ 200000: 2.3766
 100000/ 200000: 2.2794
 110000/ 200000: 2.2012
 120000/ 200000: 2.4623
 130000/ 200000: 1.8763
 140000/ 200000: 2.2786
 150000/ 200000: 2.2449
 160000/ 200000: 2.3779
 170000/ 200000: 2.0994
 180000/ 200000: 2.3488
 190000/ 200000: 1.8930


In [20]:
@torch.no_grad()  # disable grad tracking
def split_loss(split):
    """Print the mean cross-entropy of the batch-norm MLP on one data split.

    The batch-norm layer normalizes with the mean and std of the evaluated
    split itself, i.e. the whole split is treated as a single batch.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Reads the module-level datasets and parameters (C, W1, b1, W2, b2,
    bngain, bnbias), prints the split name followed by the loss, and
    returns None.
    """
    x, y = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }[split]

    emb = C[x]  # (N, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1)  # concat into (N, block_size*n_embed)
    h_pre_act = embcat @ W1 + b1
    # batchnorm layer: standardize each hidden unit over dim 0, then scale and shift
    h_pre_act = bngain * (h_pre_act - h_pre_act.mean(0, keepdim=True))/h_pre_act.std(0, keepdim=True) + bnbias
    h = torch.tanh(h_pre_act)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

# loss of the batch-norm MLP, normalizing each split with its own statistics
split_loss('train')
split_loss('val')
    

train 2.0682923793792725
val 2.1103756427764893


- The batch mean and std affect every example's logits, so batch norm couples the examples within a batch
- This coupling acts as a form of regularization, but it can also cause strange behaviour
- Other forms of normalization (e.g. layer normalization) avoid this coupling

Calibrate batch norm at end of training

In [24]:
# One-off calibration after training: push the whole training set through the
# first linear layer and record fixed per-hidden-unit statistics, so that
# evaluation no longer depends on the statistics of the evaluated batch.
with torch.no_grad():
    emb = C[X_train]
    embcat = emb.view(emb.shape[0], -1)
    h_pre_act = embcat @ W1 + b1
    # measure the mean and standard deviation over the entire training set
    bnmean = h_pre_act.mean(0, keepdim=True)
    bnstd = h_pre_act.std(0, keepdim=True)


In [26]:
@torch.no_grad()  # disable grad tracking
def split_loss(split):
    """Print the mean cross-entropy of the batch-norm MLP on one data split.

    The batch-norm layer uses the fixed, calibrated statistics `bnmean` and
    `bnstd` instead of statistics of the evaluated data.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Reads the module-level datasets, parameters (C, W1, b1, W2, b2, bngain,
    bnbias) and calibrated statistics, prints the split name followed by
    the loss, and returns None.
    """
    x, y = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }[split]

    emb = C[x]  # (N, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1)  # concat into (N, block_size*n_embed)
    h_pre_act = embcat @ W1 + b1
    # batchnorm layer with fixed statistics
    h_pre_act = bngain * (h_pre_act - bnmean)/bnstd + bnbias
    h = torch.tanh(h_pre_act)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

# loss of the batch-norm MLP when normalizing with the calibrated bnmean / bnstd
split_loss('train')
split_loss('val')
    

train 2.0682923793792725
val 2.11044979095459


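- bnmean and bnstd can also be estimated during training as running (exponential moving) averages updated at every step, which avoids the separate calibration pass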

In [28]:
n_embed = 10 #embedding dimensions
n_hidden = 200 #no of neurons in the hidden layer

g = torch.Generator().manual_seed(42)

C = torch.randn((vocab_size, n_embed), generator=g)
W1 = torch.randn((n_embed*block_size, n_hidden), generator=g) * (5/3)/(n_embed*block_size)**0.5
b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0

# Batch-norm learnable scale and shift (trained by gradient descent).
bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))
# Batch-norm running statistics: buffers updated by a moving average in the
# training loop, deliberately NOT part of `parameters`. They start at the
# identity normalization (mean 0, std 1): a zero-initialised running std
# would make any evaluation early in training divide by (almost) zero.
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # total no of params
for p in parameters:
    p.requires_grad = True

12297


In [29]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss, one entry per step

for i in range(max_steps):

    # draw a random minibatch of training examples
    ix = torch.randint(0, X_train.shape[0], (batch_size,), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass: embed the characters and flatten each context into one row
    emb = C[Xb]
    embcat = emb.view(emb.shape[0], -1)

    # linear layer
    h_pre_act = embcat @ W1 + b1

    # batchnorm layer ------------------------------------------------
    # standardize with the statistics of the current batch ...
    bnmeani = h_pre_act.mean(0, keepdim=True)
    bnstdi = h_pre_act.std(0, keepdim=True)
    h_pre_act = bngain * (h_pre_act - bnmeani) / bnstdi + bnbias
    # ... and fold those statistics into the running estimates
    # (exponential moving average; kept out of the autograd graph)
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    #----------------------------------------------------------------

    # non-linearity and output layer
    h = torch.tanh(h_pre_act)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)

    # backward pass: drop stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # step learning-rate decay: 0.1 for the first 100000 steps, 0.01 afterwards
    if i < 100000:
        lr = 0.1
    else:
        lr = 0.01

    # plain SGD update, done in place and outside of autograd
    with torch.no_grad():
        for p in parameters:
            p += -lr * p.grad

    # track stats
    lossi.append(loss.log10().item())
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

      0/ 200000: 3.3149
  10000/ 200000: 2.3311
  20000/ 200000: 2.4343
  30000/ 200000: 2.0402
  40000/ 200000: 2.1428
  50000/ 200000: 1.8935
  60000/ 200000: 2.1852
  70000/ 200000: 2.4852
  80000/ 200000: 2.2232
  90000/ 200000: 1.9615
 100000/ 200000: 2.0917
 110000/ 200000: 2.2161
 120000/ 200000: 1.9413
 130000/ 200000: 2.1396
 140000/ 200000: 1.8985
 150000/ 200000: 2.2107
 160000/ 200000: 2.2587
 170000/ 200000: 1.7979
 180000/ 200000: 2.4788
 190000/ 200000: 2.1109


In [31]:
# Explicit calibration over the full training set, computed here so that it can
# be compared with the running estimates accumulated during training.
with torch.no_grad():
    emb = C[X_train]
    embcat = emb.view(emb.shape[0], -1)
    h_pre_act = embcat @ W1 + b1
    # measure the mean and standard deviation over the entire training set
    bnmean = h_pre_act.mean(0, keepdim=True)
    bnstd = h_pre_act.std(0, keepdim=True)

In [30]:
# calibrated per-hidden-unit mean, measured over the whole training set
bnmean

tensor([[ 1.5596,  1.3746,  0.1612, -0.7126, -0.4805, -0.6927, -0.9597,  0.2591,
          0.4394,  0.0750,  0.5750, -0.6567, -0.2691,  0.9165, -0.1591, -0.4971,
         -0.2713, -0.5187, -1.0631,  0.0838, -0.7864, -0.2469, -0.9088,  0.4455,
         -0.7526, -1.0427,  0.0875, -0.0486,  1.9949, -2.2137, -1.5284, -0.5154,
          0.3854,  1.5346,  0.0728,  2.4857,  0.3555, -0.6089, -0.0349,  0.8964,
          0.7842, -0.9216, -1.0879, -0.8738,  0.1997,  0.8846, -0.1156,  2.0324,
          0.7689, -0.9879,  0.9862,  0.3457, -0.3167, -0.5485, -1.4005,  0.1756,
          0.4755,  0.7596, -1.0313, -0.3012, -0.6564,  1.9854, -0.2897, -0.0745,
         -0.3472,  0.6168,  1.2853,  1.2498,  0.7799,  0.3863, -1.5962, -0.4313,
          0.4067, -1.6379,  2.2105,  0.9599, -2.2965,  0.5519, -0.9366, -0.7601,
          1.1837,  1.3497, -0.1355,  1.2523,  0.0922,  0.5584,  0.8744,  0.6830,
          0.6281,  0.2624, -0.3442, -0.2258,  1.5220,  1.3437, -0.7063, -1.4222,
          0.6407,  0.0494,  

In [32]:
# running estimate of the per-hidden-unit mean accumulated during training
bnmean_running

tensor([[ 1.7028,  1.3690,  0.1082, -0.7180, -0.5029, -0.6955, -0.9562,  0.2634,
          0.5060,  0.0681,  0.5670, -0.6369, -0.3027,  0.8649, -0.0796, -0.4696,
         -0.3063, -0.5173, -1.0338,  0.0725, -0.7233, -0.2495, -0.9057,  0.4795,
         -0.7502, -0.9779,  0.0770, -0.0637,  1.9454, -2.1977, -1.5491, -0.5451,
          0.3939,  1.5844,  0.0463,  2.5346,  0.3493, -0.5991, -0.0171,  0.9137,
          0.8166, -0.9994, -1.0877, -0.8019,  0.1964,  0.8519, -0.0754,  2.0263,
          0.7935, -1.0282,  0.9782,  0.3363, -0.3334, -0.5922, -1.4083,  0.1574,
          0.3032,  0.7818, -0.7913, -0.3773, -0.6603,  1.9914, -0.3210, -0.0724,
         -0.3683,  0.6269,  1.2812,  1.2794,  0.8071,  0.4454, -1.5811, -0.4124,
          0.4245, -1.6147,  2.2329,  0.9830, -2.1373,  0.5673, -0.9126, -0.7798,
          1.1946,  1.3530, -0.0958,  1.2758,  0.1475,  0.5680,  0.8562,  0.6815,
          0.6453,  0.2292, -0.3478, -0.2553,  1.4505,  1.4308, -0.7449, -1.3968,
          0.6376,  0.0129,  

In [33]:
@torch.no_grad()  # disable grad tracking
def split_loss(split):
    """Print the mean cross-entropy of the batch-norm MLP on one data split.

    The batch-norm layer uses the running estimates `bnmean_running` and
    `bnstd_running` collected during training.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Reads the module-level datasets, parameters (C, W1, b1, W2, b2, bngain,
    bnbias) and running statistics, prints the split name followed by the
    loss, and returns None.
    """
    x, y = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }[split]

    emb = C[x]  # (N, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1)  # concat into (N, block_size*n_embed)
    h_pre_act = embcat @ W1 + b1
    # batchnorm layer with the running statistics
    h_pre_act = bngain * (h_pre_act - bnmean_running)/bnstd_running + bnbias
    h = torch.tanh(h_pre_act)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

# loss of the batch-norm MLP when normalizing with the running statistics
split_loss('train')
split_loss('val')
    

train 2.0680298805236816
val 2.111121892929077


- The loss obtained with the running statistics is very close to the one obtained with the calibrated statistics

- `b1` can be removed: batch norm subtracts the per-unit batch mean, which cancels any constant bias added before it; `bnbias` takes over that role