In [135]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read (a bare open() leaves it open).
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
print(words[-3:])
print(len(words))

['zyrie', 'zyron', 'zzyzx']
32033


In [136]:
# Vocabulary: every distinct character that occurs in the names, in sorted order.
letters = sorted(set(''.join(words)))
# One extra slot is reserved for '.', the special start/end marker.
num_of_unique_letters = 1 + len(letters)
# char -> int lookup used to index into tensors; real characters start at 1 ...
stoi = {ch: idx for idx, ch in enumerate(letters, start=1)}
# ... because index 0 belongs to the '.' marker.
stoi['.'] = 0
# int -> char lookup, the inverse of stoi.
itos = {idx: ch for ch, idx in stoi.items()}
print(num_of_unique_letters, letters, sep='\n')
itos

27
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [137]:
def build_data(words, nChars):
    """Turn names into (context, next-character) training pairs.

    Each word is padded with `nChars` leading '.' markers and one trailing '.',
    converted to integer indices through the module-level `stoi` mapping, and
    then scanned with a sliding window of width `nChars`.

    Args:
        words: iterable of strings to encode.
        nChars: number of preceding characters used as context.

    Returns:
        A tuple `(x, y)` of plain Python lists: `x[k]` is a list of `nChars`
        indices and `y[k]` is the index of the character that follows it.

    Raises:
        KeyError: if a word contains a character that is missing from `stoi`.
    """
    contexts, targets = [], []
    for word in words:
        padded = '.' * nChars + word + '.'
        encoded = [stoi[symbol] for symbol in padded]
        for start in range(len(encoded) - nChars):
            stop = start + nChars
            contexts.append(encoded[start:stop])  # window of nChars indices
            targets.append(encoded[stop])         # the index right after the window
    return contexts, targets

# Worked example with a context of 3 characters: a name such as 'zzyzx' becomes the
# pairs (... -> z), (..z -> z), (.zz -> y), (zzy -> z), (zyz -> x), (yzx -> .),
# with every character replaced by its integer index.

nChars = 3
print(words[-1:], build_data(words[-1:], nChars), sep='\n')

['zzyzx']
([[0, 0, 0], [0, 0, 26], [0, 26, 26], [26, 26, 25], [26, 25, 26], [25, 26, 24]], [26, 26, 25, 26, 24, 0])


In [138]:
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [139]:
# Dedicated, seeded torch RNG for reproducible tensor sampling.
g = torch.Generator().manual_seed(2147483647)

# Shuffle with a privately seeded RNG so the train/val/test split is identical on
# every Restart & Run All (an unseeded shuffle yields a different split each run)
# without touching the global `random` state.
# NOTE: `words` is still reordered in place, so re-running only this cell
# permutes the already-shuffled list again.
random.Random(42).shuffle(words)
trainIdx = int(0.8*len(words))
valIdx = int(0.9*len(words))


def _split_to_tensors(split_words):
    """Encode `split_words` with build_data and return (inputs, targets) as tensors."""
    xs, ys = build_data(split_words, nChars)
    return torch.tensor(xs), torch.tensor(ys)


# 80% train / 10% validation / 10% test, split by word.
xTrain, yTrain = _split_to_tensors(words[0:trainIdx])
xVal, yVal = _split_to_tensors(words[trainIdx:valIdx])
xTest, yTest = _split_to_tensors(words[valIdx:])


nC = 10 # Width of each row of the look up table C, which can also be seen as the first layer of our network
nHiddenN = 200 # Size of hidden layer

# torch.nn.Linear draws its initial weights from the global torch RNG and cannot be
# handed the generator `g`, so seed the global RNG here; otherwise the network
# starts from different weights on every run. A seed different from the one used
# for `g` keeps the two random streams from mirroring each other.
torch.manual_seed(42)

# Look up table: one nC-dimensional vector per symbol (letters plus '.').
C = torch.randn((num_of_unique_letters, nC), generator=g)

# Five Linear -> BatchNorm -> Tanh blocks followed by a Linear -> BatchNorm output head.
# The input width is nC*nChars because the nChars context embeddings are concatenated.
layers = [
  torch.nn.Linear(nC*nChars, nHiddenN, bias=False), torch.nn.BatchNorm1d(nHiddenN), torch.nn.Tanh(),
  torch.nn.Linear(nHiddenN, nHiddenN, bias=False), torch.nn.BatchNorm1d(nHiddenN), torch.nn.Tanh(),
  torch.nn.Linear(nHiddenN, nHiddenN, bias=False), torch.nn.BatchNorm1d(nHiddenN), torch.nn.Tanh(),
  torch.nn.Linear(nHiddenN, nHiddenN, bias=False), torch.nn.BatchNorm1d(nHiddenN), torch.nn.Tanh(),
  torch.nn.Linear(nHiddenN, nHiddenN, bias=False), torch.nn.BatchNorm1d(nHiddenN), torch.nn.Tanh(),
  torch.nn.Linear(nHiddenN, num_of_unique_letters, bias=False), torch.nn.BatchNorm1d(num_of_unique_letters)
]

with torch.no_grad():
  # Scale every Linear weight by 5/3 (the gain recommended for tanh). The slice
  # only drops the final BatchNorm, so the output Linear is scaled as well.
  for layer in layers[:-1]:
    if isinstance(layer, torch.nn.Linear):
      layer.weight *= 5/3

# Everything the optimizer updates: the look up table plus all layer parameters.
parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True # needed for C, which is a plain tensor; a no-op for the nn parameters

trL = [] # log10 of the training loss, appended once per optimization step

173724


In [141]:
# Mini-batch SGD with a step learning-rate decay.
iters = 200_000
batch_size = 32
# Per-step update-to-data ratios (log10), one value per parameter tensor.
# Defined here because nothing else in the notebook creates it; without this the
# first iteration fails with a NameError on a fresh kernel.
ud = []

for i in range(iters):

    # minibatch construct: sample row indices with the seeded generator so the
    # sequence of batches is reproducible
    mIdx = torch.randint(0, xTrain.shape[0], (batch_size,), generator=g)
    Yb = yTrain[mIdx] # targets for this batch

    # forward pass
    emb = C[xTrain[mIdx]] # embed the characters into vectors
    x = emb.view(emb.shape[0], -1) # concatenate the vectors

    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb) # loss function

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = 0.1 if i < 150_000 else 0.01 # step learning rate decay
    with torch.no_grad():
        # in-place SGD step without going through the discouraged `.data` attribute
        for p in parameters:
            p -= lr * p.grad

    # track stats
    if i % 10_000 == 0: # print every once in a while
        print(f'{i:7d}/{iters:7d}: {loss.item():.4f}')
    trL.append(loss.log10().item())
    with torch.no_grad():
        # size of the step just taken relative to the (already updated) parameter values
        ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])



AttributeError: 'Linear' object has no attribute 'retain_grad'

In [None]:
# Theoretical first loss: a model that spreads probability uniformly over all
# symbols scores -log(1/num_of_unique_letters). Using the vocabulary size computed
# earlier keeps this baseline correct if the dataset's character set changes.
-torch.tensor([1/num_of_unique_letters]).log()

In [None]:
# trL holds log10 of the minibatch loss for every optimization step; label the
# axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(trL)
ax.set_title('Training loss per minibatch')
ax.set_xlabel('optimization step')
ax.set_ylabel('log10(cross-entropy loss)')
plt.show()