E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [9]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
import random

In [10]:
# Read one name per line; the context manager closes the file handle
# as soon as the contents have been read.
with open('../names.txt', 'r') as f:
  words = f.read().splitlines()

# Shuffle with a fixed seed so the train/dev/test split below is reproducible.
random.seed(2147483647)
random.shuffle(words)

In [11]:
# Vocabulary: every character that occurs in the names, plus the '.' boundary token
chars = sorted({'.'} | {ch for w in words for ch in w})

# Look-up tables for the alphabet
#   stoi: character -> integer index
#   itos: integer index -> character
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))

In [12]:
# split into train (first 80%), dev (next 10%) and test (remaining 10%)
n_words = len(words)
train_index = int(n_words * 0.8)  # end of the training slice
dev_index = int(n_words * 0.9)    # end of the dev slice

train, dev, test = (
  words[:train_index],
  words[train_index:dev_index],
  words[dev_index:],
)

In [13]:
# create the training data: one (current char, next char) index pair per bigram,
# with '.' marking both the start and the end of every word
train_pairs = [
  (stoi[prev], stoi[nxt])
  for w in train
  for prev, nxt in zip('.' + w, w + '.')
]
xs_train = torch.tensor([prev_ix for prev_ix, _ in train_pairs])
ys_train = torch.tensor([next_ix for _, next_ix in train_pairs])
num = xs_train.nelement()
print('number of training examples: ', num)

number of training examples:  182546


In [14]:
# initialize the 'network': a single 27x27 weight matrix sampled from a
# seeded generator so that every run starts from the same weights
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g).requires_grad_()

In [15]:
# create the dev data: one (current char, next char) index pair per bigram,
# with '.' marking both the start and the end of every word
dev_pairs = [
  (stoi[prev], stoi[nxt])
  for w in dev
  for prev, nxt in zip('.' + w, w + '.')
]
xs_dev = torch.tensor([prev_ix for prev_ix, _ in dev_pairs])
ys_dev = torch.tensor([next_ix for _, next_ix in dev_pairs])

In [16]:
# create the test data: one (current char, next char) index pair per bigram,
# with '.' marking both the start and the end of every word
test_pairs = [
  (stoi[prev], stoi[nxt])
  for w in test
  for prev, nxt in zip('.' + w, w + '.')
]
xs_test = torch.tensor([prev_ix for prev_ix, _ in test_pairs])
ys_test = torch.tensor([next_ix for _, next_ix in test_pairs])

In [17]:
# gradient descent on the regularized negative log-likelihood
NUM_STEPS = 100       # number of full-batch updates
LEARNING_RATE = 50    # step size applied to the gradient
REG_STRENGTH = 0.01   # weight of the mean-squared-weight penalty

# The inputs do not change between steps, so encode them (and build the row
# index used to pick out the target probabilities) once, outside the loop.
xenc = F.one_hot(xs_train, num_classes=27).float() # input to the network: one-hot encoding
rows = torch.arange(xs_train.nelement())

for k in range(NUM_STEPS):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
  loss = -probs[rows, ys_train].log().mean() + REG_STRENGTH*(W**2).mean()

  # backward pass
  W.grad = None # clear the gradient left over from the previous step
  loss.backward()

  # update: change W in place without recording the step in the autograd graph
  with torch.no_grad():
    W -= LEARNING_RATE * W.grad

# loss of the last forward pass, i.e. measured just before the final update
print(f'final training loss: {loss.item():.3f}')

3.7704033851623535
3.3805792331695557
3.1626715660095215
3.028703212738037
2.9360408782958984
2.8688313961029053
2.8182530403137207
2.7787086963653564
2.74676251411438
2.7202842235565186
2.6979095935821533
2.6787352561950684
2.6621346473693848
2.6476519107818604
2.634939670562744
2.6237220764160156
2.613773822784424
2.6049106121063232
2.5969772338867188
2.58984375
2.5834014415740967
2.577559232711792
2.572240114212036
2.567378282546997
2.5629193782806396
2.5588159561157227
2.555028200149536
2.551522731781006
2.5482699871063232
2.5452449321746826
2.542426586151123
2.539795398712158
2.537335157394409
2.5350310802459717
2.532870292663574
2.53084135055542
2.528932571411133
2.5271358489990234
2.525442123413086
2.523843288421631
2.5223331451416016
2.520904541015625
2.5195517539978027
2.5182690620422363
2.517052412033081
2.5158965587615967
2.5147976875305176
2.5137522220611572
2.512755870819092
2.511805772781372
2.510899305343628
2.5100340843200684
2.509207010269165
2.508415460586548
2.507657

In [18]:
# Evaluate the model on the dev set
# No gradients are needed for evaluation, so skip the autograd bookkeeping.
with torch.no_grad():
  xenc_dev = F.one_hot(xs_dev, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc_dev @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
  # Same objective as the training loop (NLL plus the weight penalty), so the
  # reported number is directly comparable to the training loss.
  loss = -probs[torch.arange(len(xs_dev)), ys_dev].log().mean() + 0.01*(W**2).mean()
print(f"loss on dev set: {loss.item():.3f}")

loss on dev set: 2.488


In [19]:
# Evaluate the model on the test set
# No gradients are needed for evaluation, so skip the autograd bookkeeping.
with torch.no_grad():
  xenc_test = F.one_hot(xs_test, num_classes=27).float() # input to the network: one-hot encoding
  logits = xenc_test @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
  # Same objective as the training loop (NLL plus the weight penalty), so the
  # reported number is directly comparable to the training loss.
  loss = -probs[torch.arange(len(xs_test)), ys_test].log().mean() + 0.01*(W**2).mean()
print(f"loss on test set: {loss.item():.3f}")

loss on test set: 2.486


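The model performed marginally better on the dev and test sets than on the training set. The differences are tiny (a few thousandths in the loss), so the three losses are effectively the same: the model generalizes to unseen names and shows no sign of overfitting the training set.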

- loss on training set after $100$ iterations: $2.491$
- loss on dev set: $2.488$
- loss on test set: $2.486$