In [1]:
# Load the dataset: one entry per line, with the line terminators stripped.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving an open handle behind for the garbage collector.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
# b = {}  # statistics of the bigrams across the words
# for w in words:
#     chs = ['<S>'] + list(w) + ['<E>']
#     for ch1, ch2 in zip(chs, chs[1:]):
#         bigram = (ch1, ch2)
#         b[bigram] = b.get(bigram, 0) + 1

In [3]:
# sorted(b.items(), key = lambda kv: -kv[1])

## Bigram character-level language model: counting approach

In [1]:
import torch

# Bigram count table, all zeros to start with. The counting cell below increments
# N[first, second] for every adjacent character pair.
# 27 presumably = 26 letters + the '.' boundary token -- confirm against the dataset.
N = torch.zeros(27, 27, dtype=torch.int32)

In [2]:
# Vocabulary: every distinct character that occurs in the dataset, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index. Index 0 is reserved for '.', the word-boundary marker,
# so the real characters are numbered from 1.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

NameError: ignored

In [None]:
# Tally every adjacent character pair in the dataset into N[first, second].
# Each word is wrapped in '.' so that "start of word -> first letter" and
# "last letter -> end of word" are counted as bigrams too.

# Start from a clean table: the loop adds into N in place, so without this reset
# running the cell a second time would silently double every count.
N.zero_()

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    N[stoi[ch1], stoi[ch2]] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Quick look at the raw count table as an image: row = index of the first
# character of the bigram, column = index of the second.
plt.imshow(N)

In [None]:
# Annotated heat map of the count table: each cell shows the bigram itself
# (upper label) and how many times it was seen (lower label).
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
label_style = dict(ha="center", color='gray')
for row in range(27):
  for col in range(27):
    # imshow puts columns on the x axis and rows on the y axis, hence (col, row)
    plt.text(col, row, itos[row] + itos[col], va="bottom", **label_style)
    plt.text(col, row, N[row, col].item(), va="top", **label_style)

plt.axis('off')

In [None]:
# Row 0 belongs to the '.' boundary token (stoi['.'] == 0), so N[0, j] is the number
# of times character j directly follows the start marker, i.e. begins a word.
N[0]

In [None]:
# Turn the start-token row into a probability distribution over first characters:
# cast the integer counts to float, then divide by their total so the row sums to 1.
first_char_counts = N[0].float()
p = first_char_counts / first_char_counts.sum()
p

In [None]:
# Draw one character index from the distribution p and decode it.
# The draw uses its own seeded generator, so the cell shows the same character
# every time it is run (including after a kernel restart).
g = torch.Generator().manual_seed(2147483647)
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
itos[ix]

In [None]:
# torch.multinomial samples from multinomial probability distribution. Takes probabilities, returns integers.
# p = torch.rand(3)
# p = p / p.sum()
# print(p)
# torch.multinomial(p, num_samples=20, replacement=True)

In [None]:
# Add-one smoothing: every bigram gets a pseudo-count of 1, so no transition has
# probability exactly zero (a zero would make the log likelihood infinite).
smoothed_counts = (N + 1).float()
# Normalise each row: P[i, j] = probability that character j follows character i.
# keepdim=True keeps the row sums as a column so the division broadcasts per row.
P = smoothed_counts / smoothed_counts.sum(dim=1, keepdim=True)

In [None]:
# Generate one name by walking the bigram table: start at the '.' token (index 0),
# repeatedly sample the next character from the current character's row of P,
# and stop once '.' is drawn again.

# Seeded generator: the cell prints the same name on every run.
g = torch.Generator().manual_seed(2147483647)

ix = 0
out = []
while True:
  # Deliberately not named `p`: that name holds the first-character distribution
  # built in an earlier cell, and overwriting it here would break re-running that cell.
  next_char_probs = P[ix]
  ix = torch.multinomial(next_char_probs, num_samples=1, replacement=True, generator=g).item()
  out.append(itos[ix])
  if ix == 0:
    break

# The closing '.' is appended before the break, so it is part of the printed string.
print(''.join(out))

In [None]:
# Loss: average negative log likelihood of the dataset under the bigram table P.

log_likelihood = 0.0  # running sum of log P[current, next] over every bigram
n = 0                 # number of bigrams scored
for w in words:  # substitute a short list such as ['bob'] to score a single word
  padded = '.' + w + '.'
  for cur, nxt in zip(padded, padded[1:]):
    log_likelihood += P[stoi[cur], stoi[nxt]].log()
    n += 1

# Negate so that the quantity is one to minimise, then average per bigram.
nll = -log_likelihood
print(f"average negative log likelihood: {nll/n}")

## Bigram character-level language model: neural network approach

In [51]:
# Training pairs for the network: x is the index of a character, y the index of
# the character that follows it. Only the first word is used in this cell.
bigram_ids = [
  (stoi[ch1], stoi[ch2])
  for w in words[:1]
  # '.' + w + '.' zipped with w + '.' yields the same consecutive pairs as
  # zipping the padded word with itself shifted by one
  for ch1, ch2 in zip('.' + w + '.', w + '.')
]

xs = torch.tensor([first for first, _ in bigram_ids])
ys = torch.tensor([second for _, second in bigram_ids])

In [52]:
# print(xs)
# print(ys)
# one hot encoding
import torch.nn.functional as F
# One row per input index with a single 1 in that index's column (27 columns).
# one_hot returns integers; cast to float32 so the rows can be matrix-multiplied
# with float weights.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
# print(xenc.shape)
# print(xenc.dtype)
# plt.imshow(xenc)

In [56]:
# A single linear layer without bias: 27 x 27 weights drawn from a standard normal.
W = torch.randn(27, 27)
# Multiplying a one-hot row by W selects one row of W; the result is read as log-counts.
logits = torch.matmul(xenc, W)
# Softmax by hand: exponentiate to get positive "counts", then normalise each row to sum to 1.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

### Complete version

In [3]:
import torch.nn.functional as F
import torch


# initialization
# Load the dataset: one entry per line, with the line terminators stripped.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving an open handle behind for the garbage collector.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Vocabulary: the distinct characters of the dataset in sorted order.
chars = sorted(set(''.join(words)))
# Map each character to an index starting at 1; index 0 is kept for the '.'
# token that marks both the start and the end of a word.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse mapping, index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

# Training set over the whole dataset: xs[k] is the index of a character and
# ys[k] the index of the character that follows it.
xs, ys = [], []

for w in words:
    # indices of the word wrapped in boundary tokens: '.', c1, ..., cn, '.'
    ids = [stoi[ch] for ch in '.' + w + '.']
    xs.extend(ids[:-1])  # every position except the last is an input ...
    ys.extend(ids[1:])   # ... and its right-hand neighbour is the target

xs = torch.tensor(xs)
ys = torch.tensor(ys)

# number of (input, target) pairs
num = xs.numel()

# Train a single 27x27 linear layer with full-batch gradient descent so that
# softmax(one_hot(x) @ W) approximates the next-character distribution.
SEED = 2147483647    # fixed so the random initialisation, and therefore the printed losses, repeat
NUM_STEPS = 100      # number of gradient-descent iterations
LEARNING_RATE = 50   # step size applied to the gradient

g = torch.Generator().manual_seed(SEED)
W = torch.randn((27, 27), generator=g, requires_grad=True)  # weights

# The inputs never change, so one-hot encode them once instead of on every step.
xenc = F.one_hot(xs, num_classes=27).float()

for i in range(NUM_STEPS):
    # forward pass
    logits = xenc @ W  # log counts
    # cross_entropy(logits, ys) is mean(-log(softmax(logits)[target])): the same
    # average negative log likelihood as the explicit exp/normalise/log chain, but
    # evaluated via log-softmax so that large logits cannot overflow exp().
    loss = F.cross_entropy(logits, ys)
    print("average negative log likelihood, i.e. loss:", loss.item())

    # backward pass
    W.grad = None  # drop the previous step's gradient before accumulating a new one
    loss.backward()

    # update
    # no_grad keeps the in-place step out of the autograd graph (replaces the W.data idiom)
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

# Evaluate once more with the final weights; logits/probs/loss stay available afterwards.
logits = xenc @ W  # log counts
probs = F.softmax(logits, dim=1)
loss = F.cross_entropy(logits, ys)
print("final loss:", loss.item())

FileNotFoundError: ignored