In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import os

# Prefer the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Reading Names Dataset
print(f"Working dir: {os.getcwd()}")
dataset_path = os.path.join(os.getcwd(), "../makemore/names.txt")

# A context manager closes the file handle as soon as the names are read,
# instead of leaving it open until the garbage collector gets to it.
with open(dataset_path, 'r') as dataset_file:
    words = dataset_file.read().splitlines()

In [None]:
# Exploring Dataset: a sample of the names, how many there are, and the
# shortest / longest name length.
word_lengths = [len(name) for name in words]

print(f"first 10 words{words[:10]}")
print(f"length of words: {len(words)}")
print(f"min word length {min(word_lengths)} and max word length {max(word_lengths)}")

In [29]:
# Character vocabulary: every distinct character that occurs in the names, sorted.
chars = sorted(set(''.join(words)))

# Every ordered pair of vocabulary characters (the two-character model inputs).
combinations_in_bigram = [first + second for first in chars for second in chars]

# Two-character context -> index.
# Indices [0, len(chars)) belong to the word-start contexts '.x';
# the character pairs follow, shifted up by len(chars).
bi_stoi = {pair: idx for idx, pair in enumerate(combinations_in_bigram, start=len(chars))}
bi_stoi.update(('.' + ch, idx) for idx, ch in enumerate(chars))

# Single character -> index, with the boundary marker '.' at index 0.
si_stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
si_stoi['.'] = 0

# Reverse lookups (index -> string).
bi_itos = {idx: pair for pair, idx in bi_stoi.items()}
si_itos = {idx: ch for ch, idx in si_stoi.items()}

Statistical approach (counting trigram occurrences):

In [34]:
# Trigram count table.
# N[context, next_char] is how many times `next_char` follows the two-character
# `context` in the dataset; '.' marks the start and the end of a word.
# Rows are indexed through bi_stoi, columns through si_stoi.
vocab_size = len(si_stoi)               # characters plus the '.' marker
num_contexts = vocab_size * vocab_size  # upper bound on the bi_stoi indices

# Collect one flattened (row * vocab_size + column) index per trigram and count
# them all at once. This avoids one tiny tensor update per trigram, which is
# slow on the CPU and far slower when N lives on a GPU.
flat_indices = []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        flat_indices.append(bi_stoi[ch1 + ch2] * vocab_size + si_stoi[ch3])

N = (
    torch.bincount(
        torch.tensor(flat_indices, dtype=torch.long),
        minlength=num_contexts * vocab_size,
    )
    .reshape(num_contexts, vocab_size)
    .to(dtype=torch.int32, device=device)
)

# Add-one smoothing, then normalise every row into a probability distribution
# over the next character.
P = (N + 1).float()
P /= P.sum(1, keepdim=True)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(300,300))
plt.imshow(N, cmap='Blues')

for i in range(len(bi_itos)):
    for j in range(len(si_itos)):
        chstr = bi_itos[i] + si_itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

In [None]:
g = torch.Generator().manual_seed(2147483147)

# Sample on the CPU: `g` is a CPU generator, and torch.multinomial needs the
# generator and the probabilities on the same device.
P_cpu = P.cpu()

# Rows of P are indexed by a two-character context (bi_stoi), not by a single
# character, so the context has to be tracked explicitly while sampling.
# The table has no row for "nothing seen yet", so the first letter is drawn from
# the word-start statistics instead: rows [0, len(chars)) of N are the '.x'
# contexts, and every non-empty word adds exactly one count to one of them.
# Add-one smoothing mirrors how P is derived from N.
first_letter_counts = N[:len(chars)].sum(1).float().cpu() + 1
first_letter_probs = first_letter_counts / first_letter_counts.sum()

for _ in range(5):
    first = torch.multinomial(first_letter_probs, num_samples=1, replacement=True, generator=g).item()
    out = [chars[first]]
    context = '.' + chars[first]

    while True:
        p = P_cpu[bi_stoi[context]]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(si_itos[ix])

        if si_itos[ix] == '.':
            break
        # Slide the window: keep the latest character, append the new one.
        context = context[1] + si_itos[ix]
    print(''.join(out))

In [165]:
# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

In [None]:
# Score the counting model on its own training data: sum the log-probability
# the table assigns to every trigram, then report the total, its negation,
# and the per-trigram average.
log_likelihood = 0.0
n = 0

for w in words:
    chs = ['.'] + list(w) + ['.']

    # Slide a three-character window over the padded word.
    for first, second, target in zip(chs, chs[1:], chs[2:]):
        row = bi_stoi[first + second]
        col = si_stoi[target]

        log_likelihood += torch.log(P[row, col])
        n += 1

nll = -log_likelihood
print(f'{log_likelihood=}')
print(f'{nll=}')
print(f'{nll/n}')

In [None]:
# Second approach - train a one-layer neural net.

# Build a tiny training set from the first word only: every input x is the
# index of a two-character context and every label y is the index of the
# character that follows it.
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']

    for first, second, target in zip(chs, chs[1:], chs[2:]):
        context = first + second
        row = bi_stoi[context]
        col = si_stoi[target]

        print(context, target)
        xs.append(row)
        ys.append(col)

xs, ys = torch.tensor(xs), torch.tensor(ys)

In [None]:
# Inspect the context indices (network inputs) collected for the first word.
xs

In [None]:
# Inspect the next-character indices (labels) collected for the first word.
ys

In [None]:
# One-hot encode every context index: one row per example, one column per
# row of N. one_hot returns integers, so cast to float for the matrix
# multiplications that follow.
num_context_classes = len(N)
xenc = F.one_hot(xs, num_classes=num_context_classes).float()
xenc

In [None]:
# Expected shape: (number of examples, len(N)) - one column per context class.
xenc.shape

In [None]:
# Visualise the one-hot matrix; every row has a single non-zero entry.
plt.imshow(xenc)

In [None]:
# Confirms the .float() cast applied when xenc was built.
xenc.dtype

In [None]:
# A single neuron: one randomly drawn weight per context class, applied to
# every one-hot encoded example at once.
W = torch.randn(len(N), 1)
torch.matmul(xenc, W)

In [None]:
# Turn the raw outputs into a probability distribution per example:
# treat them as log-counts, exponentiate, then normalise every row.
# NOTE(review): if W is still the single-column matrix from the previous cell,
# every row normalises to exactly 1.0 - confirm a (len(N), 27) W was intended.
logits = torch.matmul(xenc, W)   # log-counts
counts = torch.exp(logits)       # plays the role of the count table N
probs = counts / counts.sum(dim=1, keepdim=True)
probs

In [None]:
# Probabilities the model assigns for the first example.
probs[0]

In [None]:
# One entry per output column of W.
probs[0].shape

In [None]:
# Sanity check: a normalised row sums to 1 (up to floating-point rounding).
probs[0].sum()

In [None]:
# (num_examples, len(N)) @ (len(N), 27) -> (num_examples, 27)

In [None]:
# SUMMARY ------------------------------>>>>

In [None]:
# Network inputs: context indices of the training examples.
xs

In [None]:
# Labels: index of the character that follows each context.
ys

In [139]:
# Randomly initialise the weights of 27 neurons (one per output character).
# Each neuron receives len(N) inputs, one per context class.
# The generator is seeded so the draw is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(len(N), 27, generator=g)

In [140]:
# Forward pass of the one-layer network.
xenc = F.one_hot(xs, num_classes=len(N)).float()   # one-hot encoded inputs
logits = torch.matmul(xenc, W)                     # predicted log-counts
# The next two lines together are a softmax: exponentiate to get count-like
# values (the analogue of N), then normalise every row into probabilities
# for the next character.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [None]:
# Expected shape: (number of examples, 27) - one probability per next character.
probs.shape

In [None]:

# Walk through every training example and show how its loss term is computed.
# The buffer is sized from the data: a buffer longer than the number of
# examples would leave zero entries behind and drag the mean below the true
# average.
num_examples = xs.nelement()
nlls = torch.zeros(num_examples)
for i in range(num_examples):
  # i-th example: a two-character context and the character that follows it
  x = xs[i].item() # input context index
  y = ys[i].item() # label character index
  print('--------')
  print(f'bigram example {i+1}: {bi_itos[x]}{si_itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y]
  print('probability assigned by the net to the the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

In [561]:
# --------- !!! OPTIMIZATION !!! yay --------------

In [None]:
# Network inputs used for the optimisation walkthrough below.
xs

In [None]:
# Labels used for the optimisation walkthrough below.
ys

In [146]:
# Randomly initialise the weights of 27 neurons (one per output character).
# Each neuron receives len(N) inputs, one per context class.
# requires_grad=True asks autograd to track gradients with respect to W.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(len(N), 27, generator=g, requires_grad=True)

In [None]:
# forward pass
xenc = F.one_hot(xs, num_classes=len(N)).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
# Pick, for every example, the probability given to its true next character.
# The row index is sized from the data rather than hard-coded, so the cell
# keeps working when the training set has a different number of examples.
loss = -probs[torch.arange(xs.nelement()), ys].log().mean()

In [None]:
# Average negative log likelihood of the current weights, as a Python float.
print(loss.item())

In [150]:
# backward pass
W.grad = None # clear the previous gradient; backward() would otherwise accumulate into it
loss.backward()

In [151]:
# One gradient-descent step with learning rate 0.1. Writing through .data
# changes the weights without recording the update in the autograd graph.
W.data += -0.1 * W.grad

In [606]:
# --------- !!! OPTIMIZATION !!! yay, but this time actually --------------

In [None]:
# Create the full dataset: one (context index, next-character index) pair for
# every trigram of every '.'-padded word.
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for first, second, target in zip(chs, chs[1:], chs[2:]):
        row = bi_stoi[first + second]
        col = si_stoi[target]

        xs.append(row)
        ys.append(col)

xs, ys = torch.tensor(xs), torch.tensor(ys)
num = xs.numel()

print('number of examples: ', num)

# Initialise the 'network': len(N) inputs (context classes) feeding 27 output
# neurons, drawn from a seeded generator so runs are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(len(N), 27, generator=g, requires_grad=True)

In [None]:
# gradient descent
ephocs = 10            # number of full-batch update steps
learning_rate = 50     # step size of every update
weight_decay = 0.01    # strength of the mean-squared-weight penalty

# The inputs do not change between steps, so build the large
# (num, len(N)) one-hot matrix and the row index once instead of every step.
xenc = F.one_hot(xs, num_classes=len(N)).float() # input to the network: one-hot encoding
example_ix = torch.arange(num)

for k in range(ephocs):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
  # average negative log likelihood plus a penalty that pulls weights towards zero
  loss = -probs[example_ix, ys].log().mean() + weight_decay * (W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # clear the previous gradient so it is not accumulated
  loss.backward()

  # update: done under no_grad so the step itself is not tracked by autograd
  with torch.no_grad():
    W -= learning_rate * W.grad

In [None]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

# The network maps a two-character context (indexed through bi_stoi) to the
# next character, so the context has to be tracked while sampling; feeding the
# sampled character index back in as if it were a context index selects the
# wrong row of W.
# The net has no input meaning "nothing seen yet", so the first letter is drawn
# from the training data instead: context indices below len(chars) are the
# word-start contexts '.x', and each occurrence is one word starting with x.
start_contexts = xs[xs < len(chars)]
first_letter_counts = torch.bincount(start_contexts, minlength=len(chars)).float()
first_letter_probs = first_letter_counts / first_letter_counts.sum()

# Sampling needs no gradients.
with torch.no_grad():
  for _ in range(5):

    first = torch.multinomial(first_letter_probs, num_samples=1, replacement=True, generator=g).item()
    out = [chars[first]]
    context = '.' + chars[first]

    while True:
      # Same forward pass as in training, for a single context.
      context_enc = F.one_hot(torch.tensor([bi_stoi[context]]), num_classes=len(N)).float()
      logits = context_enc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdim=True) # probabilities for next character

      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(si_itos[ix])
      if ix == 0:
        break
      # Slide the window: keep the latest character, append the new one.
      context = context[1] + si_itos[ix]
    print(''.join(out))