In [1]:
import torch
import random
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read all words

# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded, instead of being left open for the kernel's lifetime.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()  # one word per line, newlines stripped
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# number of words read from names.txt
len(words)

32033

Vocab

In [4]:
# Character-level vocabulary: every distinct character that occurs in the corpus.
chars = sorted(set(''.join(words)))

# Characters get ids 1..len(chars); id 0 is kept for '.', the marker that pads
# the start of a context and terminates every word.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Reverse lookup (id -> character), used to decode sampled ids back to text.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


Dataset

In [5]:
block_size = 3  # context length: how many preceding characters feed each prediction

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with '.', and a window of the previous
    `block_size` character ids (starting as all zeros, i.e. '.') slides over
    it. One example is emitted per character, including the terminator.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): X is an integer tensor of shape (N, block_size) holding the
        contexts, Y an integer tensor of shape (N,) holding the id of the
        character that follows each context.

    Side effects:
        Prints the shapes of X and Y.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size  # start-of-word padding
        for ch in word + '.':
            target = stoi[ch]
            contexts.append(window)
            targets.append(target)
            # slide the window: drop the oldest id, append the newest
            window = [*window[1:], target]

    X, Y = torch.tensor(contexts), torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y


# Shuffle reproducibly, then split the words 80% / 10% / 10% into train / dev / test.
# NOTE: random.shuffle reorders `words` in place, so re-running this cell
# without reloading `words` first produces a different split.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(frac*len(words)) for frac in (0.8, 0.9))
X_train, Y_train = build_dataset(words[:n1])  # first 80% of the shuffled words
X_dev, Y_dev = build_dataset(words[n1:n2])    # next 10%
X_test, Y_test = build_dataset(words[n2:])    # remaining 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


MLP params

In [6]:
n_embed = 10   # size of each character embedding vector
n_hidden = 200 # number of units in the hidden layer

g = torch.Generator().manual_seed(42)

# The draw order from `g` (C, W1, b1, W2, b2) fixes the initial values.
fan_in = n_embed * block_size  # inputs per hidden unit: the concatenated context embeddings

C = torch.randn(vocab_size, n_embed, generator=g)  # embedding table, one row per character
# Scale W1 by gain / sqrt(fan_in); 5/3 is the gain PyTorch's calculate_gain gives for tanh.
W1 = torch.randn(fan_in, n_hidden, generator=g) * (5/3) / fan_in**0.5
b1 = 0.01 * torch.randn(n_hidden, generator=g)
# Small output weights and a zero output bias keep the initial logits close to zero.
W2 = 0.01 * torch.randn(n_hidden, vocab_size, generator=g)
b2 = 0 * torch.randn(vocab_size, generator=g)

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True
print(sum(p.nelement() for p in parameters)) # total number of trainable scalars

11897


Optimization

Eval

In [7]:
@torch.no_grad()  # evaluation only, no autograd graph needed
def split_loss(split):
    """Print the cross-entropy loss of the current MLP on one data split.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Side effects:
        Prints the split name followed by the loss over the whole split.
    """
    splits = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }
    x, y = splits[split]

    emb = C[x]  # (N, block_size, n_embed)
    flat = emb.view(emb.shape[0], -1)  # (N, block_size*n_embed)
    hidden = torch.tanh(flat @ W1 + b1)
    logits = hidden @ W2 + b2
    print(split, F.cross_entropy(logits, y).item())

split_loss('train')
split_loss('val')
    

train 3.2980921268463135
val 3.2999227046966553


Sample from the model

In [8]:
g = torch.Generator().manual_seed(42)

# Sampling is inference only. The parameters have requires_grad=True, so without
# no_grad every step would build an autograd graph that is never used.
with torch.no_grad():
    for _ in range(20):
        out = []
        context = [0]*block_size  # start from the all-'.' context
        while True:
            # forward pass for a single context: (1, block_size, n_embed)
            emb = C[torch.tensor([context])]
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)

            #sample the next character id from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()

            #shift the context window
            context = context[1:] + [ix]
            out.append(ix)

            # id 0 is '.', the end-of-word marker
            if ix == 0:
                break

        print(''.join(itos[i] for i in out))


xjuguenvtps.
fabiquedxfmubnwmsflaypglzofmwhwlxoln.
epjccuodsgjdmzu.
knxcmjjobdrggbdlpk.
mnqhqyjfbscvghigeaczalcvjwzajwtphjpdmquotcc.
weltxosvgkohobr.
uklnncvrigmydlsoumf.
pjjiewx.
lxmjuhm.
fsckbirdovhgn.
kgoktfkzuacabxa.
atodr.
bxwqzjzdqvtmdampemaqj.
omtafjiirvqtlfkyeumxuoxtame.
ovzqmywog.
acdtqumkorvdyxxhlsogob.
tnslwkgmnfuyccqendhln.
quehejojixfdirndbgcpvrsczagrtpltqc.
jsnq.
dazxygkihhnynvyfjfzgxlvkqncqgahwkig.


In [9]:
# Train the MLP with minibatch SGD and a two-stage learning rate.
max_steps = 200000  # total number of SGD steps
batch_size = 32  # examples per minibatch
lossi = []  # log10 of the minibatch loss, one entry per step

# NOTE(review): `g` is not created in this cell; the minibatch sampling below
# reuses whichever generator was assigned last (the sampling cell, when the
# notebook is run top to bottom).
for i in range(max_steps):

    #minibatch: draw batch_size random row indices (repeats possible) from the training set
    ix = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix] #batch X, Y

    #forward pass
    emb = C[Xb] #embed the characters into vectors: (batch_size, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1) #concatenate the context embeddings: (batch_size, block_size*n_embed)
    h_pre_act = embcat @ W1 + b1 #hidden layer pre-activation (also inspected by a later cell)
    h = torch.tanh(h_pre_act) #hidden layer activation
    logits = h @ W2 + b2 #one score per vocabulary entry
    loss = F.cross_entropy(logits, Yb)

    #backward pass
    for p in parameters:
        p.grad = None #drop the gradient left over from the previous step
    loss.backward()

    #learning rate
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay

    #update: plain SGD, written through .data so the step itself is not tracked by autograd
    for p in parameters:
        p.data += -lr * p.grad

    #track stats: print every 10000th step, record every step
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/ 200000: 3.2844
  10000/ 200000: 2.0816
  20000/ 200000: 2.3091
  30000/ 200000: 1.9938
  40000/ 200000: 2.2890
  50000/ 200000: 2.2568
  60000/ 200000: 2.3548
  70000/ 200000: 2.4330
  80000/ 200000: 2.0860
  90000/ 200000: 2.1756
 100000/ 200000: 2.5186
 110000/ 200000: 1.8009
 120000/ 200000: 1.8870
 130000/ 200000: 2.0851
 140000/ 200000: 1.8776
 150000/ 200000: 2.4181
 160000/ 200000: 1.8908
 170000/ 200000: 1.8506
 180000/ 200000: 1.8193
 190000/ 200000: 2.1520


In [10]:
# Re-evaluate after training. `split_loss` is already defined earlier in the
# notebook (the Eval cell) with exactly the body that used to be repeated here,
# so it is reused rather than redefined.
split_loss('train')
split_loss('val')
    

train 2.039062023162842
val 2.1046345233917236


Batch Normalization

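- Standardize the hidden-layer pre-activations so that each one is roughly unit Gaussian (zero mean, unit variance) across the batch
- Introduced by Ioffe & Szegedy, 2015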

In [17]:
# Mean of the hidden pre-activations over dim 0 (the batch dimension): one value per hidden unit
h_pre_act.mean(0, keepdim=True)

tensor([[ 1.2005,  0.1994, -0.0738, -0.1525, -0.1722, -1.1817, -0.6340, -0.7247,
         -1.1173, -0.0787,  1.3171, -0.2830, -0.3154,  0.1196,  0.1557, -1.2167,
         -0.0979,  0.7173, -0.2463, -0.8190, -0.0697,  0.2824,  0.2509,  0.1362,
         -0.4909, -0.2047, -0.2319, -0.3635, -0.1256, -0.1480, -0.7942,  0.2213,
          0.5273,  0.2264, -0.2782,  0.6224, -0.3994, -0.3575, -0.3492, -0.2396,
          1.4038, -0.5036,  0.6411, -0.1129,  0.1681, -0.5357,  0.4271,  0.7632,
         -0.2442, -0.5371,  0.2574,  0.0605,  0.5593, -0.2472, -0.5255,  0.0656,
         -0.2850,  0.0920, -0.3704,  0.3840,  0.3378,  1.0413, -0.7199, -0.4037,
          0.2607,  0.0978,  0.5380,  0.8435,  0.7100,  0.5093, -0.8039, -0.9870,
          0.7105, -0.1490,  1.2891,  0.7050, -0.6263, -0.9175,  0.3128, -0.1173,
          0.6793,  0.9945, -0.2574,  0.1931,  0.6269, -0.3658,  0.2987, -0.7211,
         -0.7010,  1.0525, -0.4769, -0.0985,  0.2889,  0.9990, -0.4814, -0.0340,
          1.1239, -0.2605,  

- The hidden states should be Gaussian at initialization, but afterwards they should be free to shift, spread out or sharpen as the gradient dictates
- Hence a learnable scale (gain) and shift (bias) are applied after the normalization

In [15]:
n_embed = 10   # size of each character embedding vector
n_hidden = 200 # number of units in the hidden layer

g = torch.Generator().manual_seed(42)

# The draw order from `g` (C, W1, b1, W2, b2) fixes the initial values.
fan_in = n_embed * block_size  # inputs per hidden unit: the concatenated context embeddings

C = torch.randn(vocab_size, n_embed, generator=g)  # embedding table, one row per character
# Scale W1 by gain / sqrt(fan_in); 5/3 is the gain PyTorch's calculate_gain gives for tanh.
W1 = torch.randn(fan_in, n_hidden, generator=g) * (5/3) / fan_in**0.5
b1 = 0.01 * torch.randn(n_hidden, generator=g)
# Small output weights and a zero output bias keep the initial logits close to zero.
W2 = 0.01 * torch.randn(n_hidden, vocab_size, generator=g)
b2 = 0 * torch.randn(vocab_size, generator=g)

# Batchnorm scale and shift, one per hidden unit. Gain 1 and bias 0 make the
# layer a pure standardization at initialization.
bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden)

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True
print(sum(p.nelement() for p in parameters)) # total number of trainable scalars

12297


In [19]:
# Train the MLP with a batchnorm layer on the hidden pre-activations,
# using minibatch SGD and a two-stage learning rate.
max_steps = 200000  # total number of SGD steps
batch_size = 32  # examples per minibatch
lossi = []  # log10 of the minibatch loss, one entry per step

# NOTE(review): `g` is not created in this cell; the minibatch sampling below
# reuses whichever generator was assigned last (the batchnorm parameter cell,
# when the cells are run in document order).
for i in range(max_steps):

    #minibatch: draw batch_size random row indices (repeats possible) from the training set
    ix = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix] #batch X, Y

    #forward pass
    emb = C[Xb] #embed the characters into vectors: (batch_size, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1) #concatenate the context embeddings: (batch_size, block_size*n_embed)
    h_pre_act = embcat @ W1 + b1 #hidden layer pre-activation
    #batchnorm layer: standardize each hidden unit over the batch (dim 0), then
    #apply the learnable gain and bias
    #NOTE(review): no epsilon is added to the std, so a unit whose pre-activation
    #is constant across the batch would divide by zero
    #NOTE(review): the mean subtraction cancels any per-unit constant, so b1 has
    #no effect on the result of this line
    h_pre_act = bngain * (h_pre_act - h_pre_act.mean(0, keepdim=True))/h_pre_act.std(0, keepdim=True) + bnbias
    h = torch.tanh(h_pre_act) #hidden layer activation
    logits = h @ W2 + b2 #one score per vocabulary entry
    loss = F.cross_entropy(logits, Yb)

    #backward pass
    for p in parameters:
        p.grad = None #drop the gradient left over from the previous step
    loss.backward()

    #learning rate
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay

    #update: plain SGD, written through .data so the step itself is not tracked by autograd
    for p in parameters:
        p.data += -lr * p.grad

    #track stats: print every 10000th step, record every step
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/ 200000: 3.2757
  10000/ 200000: 2.7730
  20000/ 200000: 2.3097
  30000/ 200000: 2.3467
  40000/ 200000: 2.3512
  50000/ 200000: 2.2381
  60000/ 200000: 1.9143
  70000/ 200000: 1.9234
  80000/ 200000: 2.4858
  90000/ 200000: 2.3766
 100000/ 200000: 2.2794
 110000/ 200000: 2.2012
 120000/ 200000: 2.4623
 130000/ 200000: 1.8763
 140000/ 200000: 2.2786
 150000/ 200000: 2.2449
 160000/ 200000: 2.3779
 170000/ 200000: 2.0994
 180000/ 200000: 2.3488
 190000/ 200000: 1.8930


In [20]:
@torch.no_grad() #disable grad tracking
def split_loss(split):
    """Print the cross-entropy loss of the batchnorm MLP on one data split.

    The batchnorm mean and std are computed over all examples of the
    requested split, so the normalization differs from split to split.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Side effects:
        Prints the split name followed by the loss over the whole split.
    """
    x, y = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }[split]

    emb = C[x] #(N, block_size, n_embed)
    embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size*n_embed)
    h_pre_act = embcat @ W1 + b1
    #batchnorm layer: standardize over the N examples of this split, then scale and shift
    h_pre_act = bngain * (h_pre_act - h_pre_act.mean(0, keepdim=True))/h_pre_act.std(0, keepdim=True) + bnbias
    h = torch.tanh(h_pre_act)  # computed once; the repeated tanh line was redundant
    logits = h @ W2 +b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')
    

train 2.0682923793792725
val 2.1103756427764893


- The mean and std of the batch affect every logit, so batch normalization couples the examples within a batch
- This acts as a form of regularization, but it can also cause strange behaviour
- Other forms of normalization (e.g. layer normalization) avoid this coupling