In [7]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
words = open('names.txt', 'r').read().splitlines()
print(len(words))
print(words[:8])

32033
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [9]:
chars = sorted(list(set(''.join(words))))
stoi = {c: i+1 for i, c in enumerate(chars)}
stoi['.'] = 0
itos = {i+1: c for i, c in enumerate(chars)}
vocab_size = len(stoi)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
27


In [10]:
def buildDataset(words, block_size):
    x, y = [], []
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            x.append(context)
            y.append(ix)
            #print(''.join ([itos[i] for i in context]), '-->', itos[ix])
            context = context[1:] + [ix]
            
    x = torch.tensor(x)
    y = torch.tensor(y)
    print(x.shape, y.shape)
    return x, y
import random
block_size = 3
random.seed(42)
random.shuffle(words)
n1 = int (len(words)*.8)    
n2 = int (len(words)*.9)    
x_train, y_train = buildDataset(words[:n1], block_size)
x_val, y_val = buildDataset(words[n1:n2], block_size)
x_test, y_test = buildDataset(words[n2:], block_size)


torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [11]:
def cmp(s, dt, t):
    ex = torch.all(dt == torch.grad).item()
    app = torch.allclose(dt, t.grad())
    max_diff = (dt - t.grad()).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approx: {str(app):5s} | max_diff: {max_diff:.5f}')

In [13]:
n_embed = 10
n_hidden = 64

C = torch.randn(vocab_size, n_embed)
#Layer one
W1 = torch.randn(n_embed * block_size, n_hidden) * (5/3) /((n_embed * block_size)**.5)
B1 = torch.randn(n_hidden) * .1
#Layer two
W2 = torch.randn(n_hidden, vocab_size) * .1
B2 = torch.randn(vocab_size) * .1
#Batch normalization
bngain = torch.randn(1, n_hidden) * 0.1 + 1
bnbias = torch.randn(1, n_hidden) * 0.1

params = [C, W1, B1, W2, B2, bngain, bnbias]
for p in params:
    p.requires_grad = True

In [14]:
batch_size = 32
n = batch_size
ix = torch.randint(0, x_train.shape[0], (n,))
Xb, Yb = x_train[ix], y_train[ix]

In [15]:
emb = C[Xb]
embcat = emb.view(emb.shape[0], -1)
hprebn = embcat @ W1 + B1
bnmeani = 1/n*hprebn.mean(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1)*bndiff2.sum(0, keepdim=True)
bnvar_inv = 1/(bnvar + 1e-5)**.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias
h = torch.tanh(hpreact)
logits = h @ W2 + B2
logitmaxes = logits.max(1, keepdim=True)[0]
normlogits = logits - logitmaxes
counts = normlogits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = 1/counts_sum
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

for p in params:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, normlogits,logitmaxes,  logits, h, hpreact, hprebn, bnraw, bnvar, bnvar_inv, bndiff, bndiff2, bnmeani ,embcat, emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.5167, grad_fn=<NegBackward0>)