In [1]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

In [2]:
words = open("./names.txt", "r").read().splitlines()

## Data prep

In [3]:
train_words, temp_words = train_test_split(words, train_size=0.8, random_state=42)
dev_words, test_words = train_test_split(temp_words, test_size=0.5, random_state=42)

In [4]:
len(train_words), len(dev_words), len(test_words)

(25626, 3203, 3204)

## Bigram

In [5]:
train_chars = sorted(list(set(''.join(train_words))))
N_b = torch.zeros((27, 27), dtype=torch.int32)
stoi = {s:i+1 for i,s in enumerate(train_chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

# model smoothing
N_b += 3

# "training"
for w in train_words:
  chs = ['.'] + list(w) + ['.']
  for ch1,ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    N_b[ix1, ix2] += 1

P = N_b.float()
P /= P.sum(1, keepdim=True)

In [6]:
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
  out = []
  ix = 0
  while True:
    p = P[ix]
    ix = torch.multinomial(p, num_samples=1, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print(''.join(out))

junide.
janasah.
p.
cony.
a.
nn.
kohin.
tolian.
juwe.
ksahnaauranilevias.


### Eval

In [7]:
n = 0; log_likelihood = 0.0
for w in train_words:
  chs = ['.'] + list(w) + ['.']
  for ch1,ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    prob = P[ix1, ix2]
    log_prob = torch.log(prob)
    log_likelihood += log_prob
    n += 1

nll = -log_likelihood
print("---Train split scores---")
print(f"{nll=}")
print(f"{nll/n=}")

---Train split scores---
nll=tensor(448501.8438)
nll/n=tensor(2.4576)


In [8]:
n = 0; log_likelihood = 0.0
for w in dev_words:
  chs = ['.'] + list(w) + ['.']
  for ch1,ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    prob = P[ix1, ix2]
    log_prob = torch.log(prob)
    log_likelihood += log_prob
    n += 1

nll = -log_likelihood
print("---Dev split scores---")
print(f"{nll=}")
print(f"{nll/n=}")

---Dev split scores---
nll=tensor(56075.5391)
nll/n=tensor(2.4506)


In [9]:
n = 0; log_likelihood = 0.0
for w in test_words:
  chs = ['.'] + list(w) + ['.']
  for ch1,ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    prob = P[ix1, ix2]
    log_prob = torch.log(prob)
    log_likelihood += log_prob
    n += 1

nll = -log_likelihood
print("---Test split scores---")
print(f"{nll=}")
print(f"{nll/n=}")

---Test split scores---
nll=tensor(56024.1211)
nll/n=tensor(2.4608)


Inuitively, the loss (average negative log likelihood, in other words) should have been lesser on the training dataset, but it turns out here that dev split loss is lesser than the train split loss. In order to keep myself sane, I attribute this "anomaly" to the random seed 42 ;p

I'm pretty sure that I would get the expected result if I were to choose any other seed.

After trying out 2 different seeds, I did get the expected result of the training loss being higher than the dev loss (and equal to the test loss :/) with one of the seeds (69). This inconsistency might be due to the dataset being small or other statistical stuff I don't know about.

## Trigram

In [10]:
two_chars = set()
for c1 in train_chars+["."]:
  for c2 in train_chars+["."]:
    two_chars.add(c1+c2)

two_chars = sorted(list(two_chars))

stoi2 = {s:i for i,s in enumerate(two_chars)}
itos2 = {i:s for i,s in enumerate(two_chars)}

N_t = torch.zeros((729, 27), dtype=torch.int32)

# model smoothing
N_t += 3

# "training"
for w in train_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    N_t[ix1, ix2] += 1

P = N_t.float()
P /= P.sum(1, keepdim=True)

In [11]:
g = torch.Generator().manual_seed(2147483647)
for _ in range(10):
  ix = 1
  out = ["."]
  while True:
    if ix != 1:
      ix = stoi2[''.join(out[-2:])]
    p = P[ix]
    ix = torch.multinomial(p, num_samples=1, generator=g).item()
    out.append(itos[ix])
    if out[-1][-1] == ".":
      break
  
  print(''.join(out))

.luwjdvdianasid.
.ulexay.
.adin.
.vairritoper.
.maree.
.viameiaurinileniassdbduinrwibtlyssiyanaylarte.
.unvmumthyfodtumj.
.nonnslenarsani.
.rose.
.yae.


### Eval

In [12]:
n = 0
log_likelihood = 0.0
for w in train_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

nll = -log_likelihood
print("---Train split scores---")
print(f"{nll=}")
print(f"{nll/n=}")

---Train split scores---
nll=tensor(336548.5000)
nll/n=tensor(2.1454)


In [13]:
n = 0
log_likelihood = 0.0
for w in dev_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

nll = -log_likelihood
print("---Dev split scores---")
print(f"{nll=}")
print(f"{nll/n=}")

---Dev split scores---
nll=tensor(42471.5547)
nll/n=tensor(2.1582)


In [14]:
n = 0
log_likelihood = 0.0
for w in test_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

nll = -log_likelihood
print("---Test split scores---")
print(f"{nll=}")
print(f"{nll/n=}")

---Test split scores---
nll=tensor(42595.9688)
nll/n=tensor(2.1774)


For the trigram model, the scores are as expected - train loss is higher than dev and test loss.

## Bigram using NN

In [15]:
import torch.nn.functional as F

In [16]:
xs, ys = [], []
for w in train_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2 in zip(chs,chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)

xs = torch.tensor(xs)    
ys = torch.tensor(ys)

# initialize neural net
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [17]:
# gradient descent
for _ in range(150):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float()
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdim=True)
  loss = -probs[torch.arange(xs.nelement()), ys].log().mean() + 0.01*(W**2).mean()

  print(loss.item())

  # backward pass
  W.grad = None
  loss.backward()
  with torch.no_grad():
    W.data += -75 * W.grad


3.7672510147094727
3.238084077835083
3.015225887298584
2.8891587257385254
2.8101205825805664
2.7558672428131104
2.7155206203460693
2.684164524078369
2.6589431762695312
2.638404369354248
2.6213760375976562
2.6071972846984863
2.595180034637451
2.584972381591797
2.5761351585388184
2.5684926509857178
2.561751365661621
2.555835008621216
2.5505361557006836
2.545834541320801
2.5415725708007812
2.5377626419067383
2.53427791595459
2.531146287918091
2.528259038925171
2.5256547927856445
2.5232367515563965
2.5210483074188232
2.5190019607543945
2.5171456336975098
2.515397310256958
2.513808250427246
2.512301445007324
2.510929584503174
2.50961971282959
2.5084259510040283
2.50727915763855
2.5062332153320312
2.5052218437194824
2.5042994022369385
2.503401279449463
2.502582311630249
2.5017807483673096
2.5010502338409424
2.5003304481506348
2.4996755123138428
2.49902606010437
2.498436450958252
2.4978482723236084
2.4973151683807373
2.4967801570892334
2.4962964057922363
2.4958083629608154
2.495368719100952
2

In [18]:
# sampling from the network
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
  out = []
  ix = 0
  while True:
    xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)

    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print(''.join(out))

junide.
janasah.
prelay.
a.
nn.
kohin.
tolia.
s.
tee.
ksahnaauranilevias.


### Eval

In [19]:
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
avg_log_likelihood = probs[torch.arange(xs.nelement()), ys].log().mean()
      
nll = -avg_log_likelihood
print("---Train split scores---")
print(f"{nll=}")

---Train split scores---
nll=tensor(2.4652, grad_fn=<NegBackward0>)


In [20]:
dev_xs, dev_ys = [], []
for w in dev_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2 in zip(chs,chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    dev_xs.append(ix1)
    dev_ys.append(ix2)

dev_xs = torch.tensor(dev_xs)    
dev_ys = torch.tensor(dev_ys)

xenc = F.one_hot(dev_xs, num_classes=27).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
avg_log_likelihood = probs[torch.arange(dev_xs.nelement()), dev_ys].log().mean()
      
nll = -avg_log_likelihood
print("---Dev split scores---")
print(f"{nll=}")

---Dev split scores---
nll=tensor(2.4578, grad_fn=<NegBackward0>)


In [21]:
test_xs, test_ys = [], []
for w in test_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2 in zip(chs,chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    test_xs.append(ix1)
    test_ys.append(ix2)

test_xs = torch.tensor(test_xs)    
test_ys = torch.tensor(test_ys)

xenc = F.one_hot(test_xs, num_classes=27).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
avg_log_likelihood = probs[torch.arange(test_xs.nelement()), test_ys].log().mean()
      
nll = -avg_log_likelihood
print("---Test split scores---")
print(f"{nll=}")

---Test split scores---
nll=tensor(2.4680, grad_fn=<NegBackward0>)


## Trigram using NN

In [39]:
xs_t, ys_t = [], []
for w in train_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    xs_t.append(ix1)
    ys_t.append(ix2)

xs_t = torch.tensor(xs_t)
ys_t = torch.tensor(ys_t)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [40]:
# gradient descent
for _ in range(150):
  
  # forward pass
  # xenc = F.one_hot(xs_t, num_classes=729).float()
  # logits = xenc @ W
  logits = W[xs_t]
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdim=True)
  loss = -probs[torch.arange(xs_t.nelement()), ys_t].log().mean() + 0.01*(W**2).mean()

  print(loss.item())

  # backward pass
  W.grad = None
  loss.backward()
  with torch.no_grad():
    W.data += -75 * W.grad

3.7336504459381104
3.6212451457977295
3.519554376602173
3.4285521507263184
3.3480427265167236
3.277346134185791
3.215322494506836
3.160595417022705
3.111837863922119
3.0679633617401123
3.028154134750366
2.991804599761963
2.9584553241729736
2.927744150161743
2.8993759155273438
2.873103380203247
2.848714590072632
2.8260245323181152
2.804870367050171
2.7851061820983887
2.766599655151367
2.749232769012451
2.732896566390991
2.7174947261810303
2.7029411792755127
2.6891579627990723
2.676077365875244
2.663638114929199
2.651787281036377
2.640477418899536
2.629666566848755
2.619316577911377
2.6093943119049072
2.599870204925537
2.59071683883667
2.5819106101989746
2.5734288692474365
2.5652520656585693
2.5573620796203613
2.5497424602508545
2.5423777103424072
2.5352542400360107
2.5283591747283936
2.5216808319091797
2.5152084827423096
2.5089316368103027
2.502840995788574
2.4969279766082764
2.491184711456299
2.4856033325195312
2.4801769256591797
2.47489857673645
2.469762086868286
2.4647622108459473
2.

In [50]:
g = torch.Generator().manual_seed(2147483647)
for _ in range(10):
  ix = 1
  out = ["."]
  while True:
    if ix != 1:
      ix = stoi2[''.join(out[-2:])]
    logits = W[ix]
    counts = logits.exp()
    probs = counts / counts.sum() 
      
    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    out.append(itos[ix])
    if out[-1][-1] == ".":
      break
  
  print(''.join(out))

.luwjde.
.ilyasiz.
.ufofyywocnzq.
.di.
.ritoniabraree.
.viameiauriniadvhassdbyainrwibwlassiyanaylartleigvmumtrifoetumj.
.nonn.
.lenariani.
.rose.
.yae.


### Eval

In [69]:
log_likelihood = 0.0
logits = W[xs_t]
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)

nll = -probs[torch.arange(xs_t.nelement()), ys_t].log().mean()

print("---Train split scores---")
print(f"{nll=}")

---Train split scores---
nll=tensor(2.2434, grad_fn=<NegBackward0>)


In [70]:
dev_xs_t, dev_ys_t = [], []
for w in dev_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    dev_xs_t.append(ix1)
    dev_ys_t.append(ix2)

dev_xs_t = torch.tensor(dev_xs_t)
dev_ys_t = torch.tensor(dev_ys_t)

logits = W[dev_xs_t]
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)

nll = -probs[torch.arange(dev_xs_t.nelement()), dev_ys_t].log().mean()

print("---Dev split scores---")
print(f"{nll=}")

---Dev split scores---
nll=tensor(2.2487, grad_fn=<NegBackward0>)


In [71]:
test_xs_t, test_ys_t = [], []
for w in test_words:
  chs = ["."] + list(w) + ["."]
  for ch1,ch2,ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi2[ch1+ch2]
    ix2 = stoi[ch3]
    test_xs_t.append(ix1)
    test_ys_t.append(ix2)

test_xs_t = torch.tensor(test_xs_t)
test_ys_t = torch.tensor(test_ys_t)

logits = W[test_xs_t]
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)

nll = -probs[torch.arange(test_xs_t.nelement()), test_ys_t].log().mean()

print("---Test split scores---")
print(f"{nll=}")

---Test split scores---
nll=tensor(2.2715, grad_fn=<NegBackward0>)
