In [209]:
import torch
import torch.nn.functional as F

In [210]:
# Read in the names, one per line.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('../names.txt', 'r') as names_file:
  words = names_file.read().splitlines()

In [211]:
# Vocabulary: every distinct character that occurs in the names, plus the
# special '.' marker used for word boundaries, in sorted order.
chars = sorted({'.'} | {ch for word in words for ch in word})

# Look-up tables for the alphabet
  # stoi = string to index
  # itos = index to string
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

In [212]:
# Build the trigram training set
# input xs: indices of (ch1, ch2)
# prediction ys: index of ch3
xs, ys = [], []

for word in words:

  # Pad with two leading and one trailing special character so that the first
  # letter has a full two-character context and the end of the word is a target.
  padded = '..' + word + '.'
  indices = [stoi[ch] for ch in padded]

  # Example for 'anna': the sliding window of width three yields
  # ('.', '.', 'a'), ('.', 'a', 'n'), ('a', 'n', 'n'), ('n', 'n', 'a'), ('n', 'a', '.')
  for start in range(len(indices) - 2):
    xs.append((indices[start], indices[start + 1]))
    ys.append(indices[start + 2])

num = len(xs)
print('number of examples: ', num)

# Convert the collected Python lists to tensors for training
xs = torch.tensor(xs)
ys = torch.tensor(ys)

number of examples:  228146


In [213]:
# Seeded generator so the initial weights are reproducible
g = torch.Generator()
g.manual_seed(2147483647)
# Weight matrix of the single linear layer: 54 input nodes (two concatenated
# 27-way one-hot vectors) and 27 output nodes (one logit per character)
W = torch.randn(2 * 27, 27, generator=g).requires_grad_()

In [214]:
from math import floor

# gradient descent
iterations = 100
learning_rate = 50

# The inputs never change during training, so one-hot encode and flatten them
# once here instead of rebuilding the same tensor on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()
xenc_flat = xenc.flatten(1) # each row: the two one-hot vectors side by side

# Print the loss every 10% of iterations. Clamp to at least 1 so that runs with
# fewer than 10 iterations do not fail with a modulo-by-zero.
log_every = max(1, floor(iterations / 10))

for k in range(iterations):

  # forward pass
  logits = xenc_flat @ W # predict log-counts
  # loss function (cross-entropy) + regularization (L2)
  # F.cross_entropy applies a numerically stable log-softmax and averages the
  # negative log-likelihood of the targets; unlike exponentiating the logits by
  # hand, it does not overflow when the logits become large.
  loss = F.cross_entropy(logits, ys) + 0.01*(W**2).mean()
  if k % log_every == 0:
    print(f'loss at step {k}: {loss.item():.3f}')

  # backward pass
  W.grad = None # flush the gradients
  loss.backward()

  # update step
  # the step size decays slowly as training progresses
  cool_down = 1.0 / (1 + 0.001 * k)
  # update the parameter in place without recording the operation in autograd
  with torch.no_grad():
    W -= learning_rate * cool_down * W.grad

loss at step 0: 4.242
loss at step 10: 2.633
loss at step 20: 2.511
loss at step 30: 2.466
loss at step 40: 2.441
loss at step 50: 2.426
loss at step 60: 2.415
loss at step 70: 2.408
loss at step 80: 2.402
loss at step 90: 2.397


In [215]:
# Sample names from the trained model, one character at a time.
# A freshly seeded generator makes the sampled names reproducible.
g = torch.Generator().manual_seed(2147483647)
name_count = 20
sampled_words = []

for name_idx in range(name_count):

  letters = []
  # indices of the two most recent characters; every name starts from (0, 0)
  context = [0, 0]

  while True:
    # encode the context the same way as in training: two one-hot vectors, flattened
    onehot = F.one_hot(torch.tensor(context), num_classes=27).float()
    logits = onehot.flatten() @ W # predict log-counts
    # softmax
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(0, keepdims=True) # probabilities for next character

    # draw the next character and slide the context window forward by one
    next_ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    context = [context[1], next_ix]
    letters.append(itos[next_ix])

    # stop if we reach the end of the word
    # 0 is the index of the special character '.'
    if next_ix == 0:
      break

  sampled_words.append(''.join(letters))

print(sampled_words)

['lon.', 'ays.', 'len.', 'veroydbraisemiabrada.', 'lanthizarie.', 'na.', 'nah.', 'edakaulaicherierielah.', 'yonk.', 'da.', 'ta.', 'aliyn.', 'eman.', 'kilon.', 'brien.', 'ah.', 'ca.', 'glena.', 'aausts.', 'kalirnitaey.']


# Evaluation of trigram neural network

loss: 2.397

The neural-network trigram model performs slightly worse than the count-based trigram model, but it still outperforms the bigram model.