In [1]:
import torch
import torch.nn.functional as F

In [2]:
# reading the words
words = open('names.txt', 'r').read().splitlines()
words[:6]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte']

In [3]:
chars = list(sorted(set(''.join(words))))
stoi = {ch: i+1 for i, ch in enumerate(chars)}
stoi['.'] = 0
itos = {i: s  for s, i in stoi.items()}

In [4]:
# Building the dataset from the words

block_size = 3  # the input length for the NN or the user
X, Y = [], []

for w in words[:5]:

    print(w)
    context = [0]*block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)

        print(''.join(itos[ind] for ind in context), '-->', itos[ix])
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)    

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .
isabella
... --> i
..i --> s
.is --> a
isa --> b
sab --> e
abe --> l
bel --> l
ell --> a
lla --> .
sophia
... --> s
..s --> o
.so --> p
sop --> h
oph --> i
phi --> a
hia --> .


In [5]:
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [44]:
# Embedding Space
C = torch.randn((27, 2)) # embed in 2-dim space
emb = C[X] # pytorch allows for a tensor to be used as index 

In [45]:
C[torch.tensor([[1,2,5],[4,1,9]])]

tensor([[[ 1.6306,  0.4048],
         [-1.0262, -0.5962],
         [ 0.4233, -1.5632]],

        [[-0.0548, -0.7221],
         [ 1.6306,  0.4048],
         [-0.5366, -1.2592]]])

In [46]:
emb.shape

torch.Size([32, 3, 2])

In [47]:
# Hidden Layer
W1 = torch.randn((6, 100)) # 6 is the input size, 100 is the hidden size
b1 = torch.randn((100))

In [48]:
torch.cat([emb[:,0,:], emb[:,1,:], emb[:,2,:]], dim=1).shape
# this method doesn't generalize to different block sizes

torch.Size([32, 6])

In [49]:
torch.cat(torch.unbind(emb, dim=1)).shape

torch.Size([96, 2])

In [52]:
# .view allows to reshape the tensor without changing the data so no memory is wasted
emb.view(32, 6) == torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [None]:
# so to the correct matrix multiplication
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h.shape

# the matrix mult and addition works because
# 32, 100
#  1, 100

torch.Size([32, 100])

In [61]:
# Output Layer
W2 = torch.randn((100, 27))
b2 = torch.randn((27))

In [62]:
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [63]:
# softmax
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
probs.shape

torch.Size([32, 27])

In [69]:
loss = -probs[torch.arange(32), Y].log().mean()

In [70]:
loss

tensor(18.4280)

### Cleaning up the Code

In [15]:
import torch
import torch.nn.functional as F

In [49]:
# reading the words
words = open('names.txt', 'r').read().splitlines()
words[:6]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte']

In [50]:
chars = list(sorted(set(''.join(words))))
stoi = {ch: i+1 for i, ch in enumerate(chars)}
stoi['.'] = 0
itos = {i: s  for s, i in stoi.items()}

In [74]:
# Building the dataset from the words

block_size = 3  # the input length for the NN or the user
X, Y = [], []

for w in words:

    # print(w)
    context = [0]*block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)

        # print(''.join(itos[ind] for ind in context), '-->', itos[ix])
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)    

In [112]:
# Cleaning up the code

emb_dim = 2
input_len = 3
hidden_size = 100

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, emb_dim), generator=g)
W1 = torch.randn((emb_dim * input_len, hidden_size), generator=g)
b1 = torch.randn((hidden_size), generator=g)
W2 = torch.randn((hidden_size, 27), generator=g)
b2 = torch.randn((27), generator=g)

parameters = [C, W1, b1, W2, b2]

In [113]:
sum([p.nelement() for p in parameters])

3481

In [114]:
for p in parameters:
    p.requires_grad = True

In [None]:
lre = torch.linspace(-3, 0, 1000)
lr = 10**lre
iterations = 100
min_batch_size = 32

for i in range(iterations):

    # minibatch
    ix = torch.randint(0, X.shape[0], (min_batch_size,))

    # forward pass
    emb = C[X[ix]]  # selects the (min_batch_size, 3, 2) embeddings from X
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data -= p.grad * lr[i]

# print(loss.item())

16.521610260009766


In [106]:
emb = C[X] 
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss

tensor(2.7254, grad_fn=<NllLossBackward0>)