E02: Split the dataset randomly into an 80% training set, a 10% dev set, and a 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on the dev and test splits. What can you see?

In [33]:
import torch

In [34]:
# Read in the names, one per line.
# The context manager closes the file handle as soon as the read finishes.
with open('../names.txt', 'r') as f:
  words = f.read().splitlines()

In [35]:
# Trigram count table: N[ix1, ix2, ix3] holds how many times the character
# with index ix3 was seen right after the pair (ix1, ix2).
# NOTE(review): 27 is presumably 26 letters plus the '.' marker; it has to
# match len(chars), which is only computed in a later cell.
N = torch.zeros((27, 27, 27), dtype=torch.int32)

In [36]:
# Alphabet: every distinct character that occurs in the words, plus the
# special '.' marker, in sorted order.
chars = sorted(set(''.join(['.'] + words)))

# Lookup tables for the alphabet
#   stoi: character -> integer index (its position in `chars`)
#   itos: integer index -> character (the inverse mapping)
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

In [37]:
# Build the trigram counts from the words.
# N is updated in place, so clear it first: without this, re-running the
# cell would add a second copy of every count on top of the previous run.
N.zero_()
num = 0

for word in words:

  # Prepend two special characters and append one special character to each
  # word, so the first letter has a two-character context and the end of the
  # word is itself a prediction target.
  chs = ['.'] * 2 + list(word) + ['.']
  # Example for 'anna':
  # zip(chs, chs[1:], chs[2:]) =
  # [('.', '.', 'a'), ('.', 'a', 'n'), ('a', 'n', 'n'), ('n', 'n', 'a'), ('n', 'a', '.')]
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    # (ix1, ix2) is the context, ix3 is the character that followed it
    N[ix1, ix2, ix3] += 1
    num += 1

print('number of examples: ', num)

number of examples:  228146


In [38]:
# Convert counts to probabilities.
# Every count is incremented by one before normalizing, so no trigram ends up
# with probability zero. Dividing by the sum over the last axis makes each
# P[ix1, ix2, :] a distribution over the next character.
smoothed = (N + 1).float()
P = smoothed / smoothed.sum(dim=2, keepdim=True)

In [39]:
# Fixed seed so the sampled names are reproducible
g = torch.Generator().manual_seed(2147483647)

# Sample names from the trigram model
name_count = 20
sampled_words = []
for _ in range(name_count):

  # Every name starts from the context ('.', '.'); index 0 is '.'
  ix1 = ix2 = 0
  out = []
  finished = False

  while not finished:
    # draw the next character from the distribution for the current context
    next_ix = torch.multinomial(P[ix1, ix2], num_samples=1, replacement=True, generator=g).item()
    # slide the two-character context window forward by one
    ix1, ix2 = ix2, next_ix
    out.append(itos[next_ix])
    # drawing '.' (index 0) marks the end of the word; it is kept in the output
    finished = next_ix == 0

  sampled_words.append(''.join(out))
print(sampled_words)

['miq.', 'axx.', 'mereyannyaar.', 'knooraen.', 'el.', 'marviovania.', 'odarimalabelon.', 'hamirelslen.', 'elyn.', 'rae.', 'bra.', 'ceevlainacelonikaireil.', 'bech.', 'amilleia.', 'trutandennimsaby.', 'crewina.', 'lanoxvkyrina.', 'khine.', 'trise.', 'koberseberryslot.']


In [40]:
# Negative log likelihood of `words` under the trigram probabilities P,
# reported both as a total and as an average per trigram.

log_likelihood = 0.0
n = 0

for word in words:
  # same padding as in the counting loop: two start markers, one end marker
  chs = ['.', '.', *word, '.']
  # slide a window of three characters over the padded word
  for k in range(len(chs) - 2):
    ix1, ix2, ix3 = stoi[chs[k]], stoi[chs[k + 1]], stoi[chs[k + 2]]
    log_likelihood = log_likelihood + torch.log(P[ix1, ix2, ix3])
    n = n + 1

nll = -log_likelihood
print(f'{nll=:.2f}')
print(f'{nll/n:.2f}')

nll=504653.00
2.21


# Evaluation of trigram counts

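average loss: 2.21 (negative log likelihood per trigram, computed over all of `words`, i.e. the same data the counts were built from, so this is a training loss rather than a dev/test loss)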

Improvement compared to bigram model: 2.45-2.21 = 0.24
Improvement compared to trigram nn model: 2.40-2.21 = 0.19