E06: meta-exercise! Think of a fun/interesting exercise and complete it.

My idea is to use the model on a different data set, and see how it performs. I will use German street names, from the [OpenAdresses dataset of Germany](https://www.kaggle.com/datasets/openaddresses/openaddresses-europe?resource=download&select=germany.csv)

In [39]:
import torch
import torch.nn.functional as F
from math import floor

In [40]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [41]:
# Read in the dataset
streetnames = open('../part1/street-names.txt', 'r', encoding='utf-8').read().splitlines()

In [42]:
streetnames[:10]

['Krümpelstr.',
 'Alfred-Seepe-Straße',
 'Eitorfer Str.',
 'Bürgeweg',
 'Lager Str.',
 'Maternusstraße',
 'Am Schnellert',
 'Hönde',
 'Kleybergstraße',
 'Haselwiese']

In [43]:
# Create training, dev, and test sets
train_index = floor(len(streetnames) * 0.8)
dev_index = floor(len(streetnames) * 0.9)

train = streetnames[:train_index]
dev = streetnames[train_index:dev_index]
test = streetnames[dev_index:]

In [44]:
chars = sorted(list(set(''.join(['.'] + streetnames))))

# Create look up tables for the alphabet
  # stoi = string to index
  # itos = index to string
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}

In [45]:
alphabet_size = len(chars)
alphabet_size

90

In [46]:
xs_train,  ys_train = [], []

# Create the training data
# input xs: (ch1, ch2) 
# prediction ys: ch3
for streetname in train:

  # prepend two special characters and append one special characters to each streetname
  chs = ['.'] * 2 + list(streetname) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs_train.append((ix1, ix2))
    ys_train.append(ix3)

num = len(xs_train)
print('number of training examples: ', num)
xs_train = torch.tensor(xs_train, device=device)
ys_train = torch.tensor(ys_train, device=device)

number of training examples:  1692547


In [47]:
xs_dev,  ys_dev = [], []
for word in dev:
  chs = ['.'] * 2 + list(word) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs_dev.append((ix1, ix2))
    ys_dev.append(ix3)

xs_dev = torch.tensor(xs_dev)
ys_dev = torch.tensor(ys_dev)
print('number of training examples: ', xs_dev.nelement())

number of training examples:  421578


In [48]:
# Creat the test data
xs_test,  ys_test = [], []
for word in test:
  chs = ['.'] * 2 + list(word) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    xs_test.append((ix1, ix2))
    ys_test.append(ix3)

xs_test = torch.tensor(xs_test)
ys_test = torch.tensor(ys_test)

In [49]:
g = torch.Generator(device=device).manual_seed(2147483647)
W = torch.randn((alphabet_size*2, alphabet_size), generator=g, device=device, requires_grad=True)

In [None]:
# gradient descent
iterations = 100
learning_rate = 50
smoothing_strenth = 0.01

for k in range(iterations):

  # forward pass
  xenc= F.one_hot(xs_train, num_classes=alphabet_size).float()
  xenc_flat = xenc.flatten(1) # flatten the one-hot encoded input vector
  logits = xenc_flat @ W # predict log-counts
  loss = F.cross_entropy(logits, ys_train) + smoothing_strenth * (W**2).mean() # compute loss
  # print loss every 10% of iterations
  if k % floor(iterations/10) == 0:
    print(f'loss at step {k}: {loss.item():.3f}')

  # backward pass
  W.grad = None # flush the gradients
  loss.backward()

  # update step
  cool_down = 1.0 / (1 + 0.001 * k)
  W.data += -learning_rate * W.grad
print(f'final training loss: {loss.item():.3f}, with smoothing strength {smoothing_strenth}')

In [None]:
# Evaluate the model on the dev set
xenc= F.one_hot(xs_dev, num_classes=alphabet_size).float()
xenc_flat = xenc.flatten(1) # flatten the one-hot encoded input vector
logits = xenc_flat @ W # predict log-counts
# softmax
counts = logits.exp() # counts
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# loss function (cross-entropy) + regularization (L2)
loss = F.cross_entropy(logits, ys_dev)
print(f'loss on dev set: {loss.item():.3f}, with smoothing strength {smoothing_strenth}')

In [None]:
# Evaluate the model on the test set
xenc= F.one_hot(xs_test, num_classes=alphabet_size).float()
xenc_flat = xenc.flatten(1) # flatten the one-hot encoded input vector
logits = xenc_flat @ W # predict log-counts
# softmax
counts = logits.exp() # counts
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# loss function (cross-entropy) + regularization (L2)
loss = F.cross_entropy(logits, ys_test) + smoothing_strenth * (W**2).mean() # compute loss
print(f'loss on test set: {loss.item():.3f}')

In [None]:
g = torch.Generator(device=device).manual_seed(2147483647)
# sample street names
name_count = 20
sampled_street_names = []
for i in range(name_count):

  out = []
  ix1 = 0
  ix2 = 0

  while True:
    xenc = F.one_hot(torch.tensor([ix1, ix2], device=device), num_classes=alphabet_size).float()
    xenc_flat = xenc.flatten()
    logits = xenc_flat @ W # predict log-counts
    # softmax
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(0, keepdims=True) # probabilities for next character
    
    # move index to next character
    ix1 = ix2
    ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix2])
    if ix2 == 0:
      # stop if we reach the end of the word
      # 0 is the index of the special character '.'
      break

  sampled_street_names.append(''.join(out))
print(sampled_street_names)