In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read in all the words: names.txt is split into one list entry per line.
# The context manager closes the file handle as soon as the contents are read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# Number of entries in `words` (one per line of names.txt)
len(words)

32033

In [6]:
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> integer id. Letters are numbered from 1 so that id 0 is free
# for the special '.' token, which marks both the start and the end of a word.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Integer id -> character (the inverse mapping), used to decode ids for display.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [22]:

block_size = 3  # context length: number of preceding characters used to predict the next one

# X collects the context windows (lists of character ids), Y the id of the character that follows.
X, Y = [], []
for w in words[:5]:
    print(w)
    # Every word starts from a window made entirely of id 0, i.e. '.' padding.
    context = [0 for _ in range(block_size)]
    # The trailing '.' makes the model also learn where a word ends.
    for ix in (stoi[ch] for ch in w + '.'):
        X.append(context)
        Y.append(ix)
        window_text = ''.join(itos[i] for i in context)
        print(window_text, '---->', itos[ix])
        # Slide the window: drop the oldest id, append the one just seen.
        # A new list is built each time, so windows already stored in X are not mutated.
        context = [*context[1:], ix]

# Convert the Python lists to tensors (one row of X per training example).
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [103]:
# One forward pass over the whole dataset: embed -> tanh hidden layer -> logits -> loss.
emb = C[X]                       # look up the embedding vector of every context character
emb_flat = emb.view(-1, 6)       # put the embeddings of one example side by side (6 values per row)
h = (emb_flat @ W1 + b1).tanh()  # hidden layer activations, (32, 100)
logits = h @ W2 + b2             # unnormalised scores over the 27 possible next characters
# F.cross_entropy goes from logits to the mean negative log-likelihood in one call.
# Written out by hand, the equivalent computation is:
#   counts = logits.exp()
#   prob = counts / counts.sum(1, keepdims=True)
#   loss = -prob[torch.arange(32), Y].log().mean()
loss = F.cross_entropy(logits, Y)
loss

tensor(14.5032)

In [23]:
# Inspect the shape and dtype of the context tensor X and the target tensor Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [73]:
torch.cat(torch.unbind(emb, 1), 1).shape # unbind splits emb along dim 1 into one (32, 2) tensor per context position; concatenating the 3 pieces along dim 1 gives 6 columns (3 * 2)
# Gives the same values as emb.view(32, 6): for a contiguous emb (as returned by C[X]) the last dim varies fastest in memory,
# so merging dims 1 and 2 lays the 3 embeddings of each example side by side, and view does it without copying any data

torch.Size([32, 6])

In [105]:
# Dedicated, seeded random generator: re-running this cell (or the whole notebook)
# now produces the same initial parameters instead of a fresh random draw each time.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g) # Embedding lookup table: one 2-dim vector per character id (27 ids)
# input layer
W1 = torch.randn((6, 100), generator=g) # 6 inputs = 3 context characters * 2 embedding dims, refer to emb shape
b1 = torch.randn(100, generator=g)
# Output layer
W2 = torch.randn((100, 27), generator=g) # one output score per character id
b2 = torch.randn(27, generator=g)
# Parameters: every tensor that is updated during training
parameters = [C, W1, b1, W2, b2]

In [106]:
# Switch on gradient tracking for every trainable tensor, so that a later
# loss.backward() populates each tensor's .grad.
for p in parameters:
    p.requires_grad_(True)

In [109]:
steps = 1000         # number of gradient-descent updates
learning_rate = 0.1  # step size applied to every parameter

# Full-batch gradient descent: every step uses all of X / Y.
for i in range(steps):
    # Forward Pass
    emb = C[X] # Char embedding lookup-table
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer activations, 100 per example
    logits = h @ W2 + b2 # Calculate logits of model
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    # Backward pass
    # Reset the gradients first: backward() accumulates into .grad rather than overwriting it.
    for p in parameters:
        p.grad = None
    loss.backward()
    # Update
    # The in-place step runs under no_grad so autograd does not record it.
    # This replaces writing through p.data, which modifies the tensor behind
    # autograd's back and hides in-place-modification errors.
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

2.5148651599884033
2.347205877304077
2.2006313800811768
2.0504367351531982
1.9096455574035645
1.763210654258728
1.6172363758087158
1.4761760234832764
1.3571674823760986
1.2705419063568115
1.217061161994934
1.2350564002990723
1.3521555662155151
1.9583319425582886
1.0929925441741943
1.0064247846603394
0.9658892154693604
0.9349462985992432
0.9097537994384766
0.8903946280479431
0.8822121620178223
0.9261757135391235
1.0773528814315796
1.706147313117981
0.8609830737113953
0.805966317653656
0.7831040024757385
0.7689955830574036
0.7629497051239014
0.766759991645813
0.8238166570663452
0.905944287776947
1.3264614343643188
0.7563217282295227
0.7114620208740234
0.6958705186843872
0.6874038577079773
0.6812167763710022
0.684677004814148
0.6891300678253174
0.737021267414093
0.7573749423027039
0.9574797749519348
0.7307043075561523
0.7565016746520996
0.6808151602745056
0.6782613396644592
0.6427730917930603
0.6340352892875671
0.6155622005462646
0.6088401675224304
0.5975707769393921
0.5932472944259644
0.