Trigram Language Model

In [1]:
# Load the corpus: one name per line. The context manager closes the file
# handle as soon as its contents have been read.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()

In [2]:
import torch
import torch.nn.functional as F

In [3]:
# Character vocabulary: every distinct character found in `words`, sorted,
# is numbered from 1 upwards; '.' (the word-boundary marker) takes index 0.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup, index -> character.
itos = dict((idx, ch) for ch, idx in stoi.items())

In [None]:
# --------- !!! OPTIMIZATION !!! yay --------------
# Experiment log kept as a bare string literal; it assigns nothing and no other
# cell reads it. Each entry appears to record a loss value, the step size used
# in the `W.data += -rate * W.grad` update, and the iteration count
# (NOTE(review): confirm whether the losses are the final training losses).
"""
DATA recorded --

will keep iterations to 100, so it doesn't overfit to the data, and andrej kept it to 100 :)

1. 2.098693370819092, -35, 100 iterations 

2. 2.0668435096740723 = -40 rate, 100 iterations

3. NOISE lowest - 2.0696253776550293 , final = 2.075587034225464 = -45 rate, 100 iterations
   - early stopping could work!?

"""

In [None]:
# --------- !!! NETWORK :DD !!! --------------

In [4]:

# Build the trigram dataset: each example pairs the indices of two consecutive
# characters (the context) with the index of the character that follows them.
xs, ys = [], []
for w in words:
  # Two leading '.' markers give the first real character a full two-character
  # context; the single trailing '.' marks the end of the word.
  ixs = [stoi['.'], stoi['.']] + [stoi[c] for c in w] + [stoi['.']]
  for pos in range(len(ixs) - 2):
    xs.append((ixs[pos], ixs[pos + 1]))
    ys.append(ixs[pos + 2])

num = len(xs)
print('number of examples: ', num)
xs = torch.tensor(xs)
ys = torch.tensor(ys)

# Initialise the 'network': one weight matrix mapping the 54-wide input
# (two concatenated 27-wide one-hot vectors) to 27 output logits.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54, 27), generator=g, requires_grad=True)


number of examples:  228146


In [5]:
# gradient descent

# The inputs do not change between iterations, so the one-hot encoding is
# built once here instead of being recomputed on every step.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
xenc = xenc.view(-1, 54) # join the two 27-wide one-hot vectors of each pair into one 54-wide row

for k in range(100):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # mean negative log-likelihood of the targets plus a small L2 penalty on W
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()

  # update: plain gradient-descent step with learning rate 40, applied
  # under no_grad so the in-place change is not recorded by autograd
  with torch.no_grad():
    W -= 40 * W.grad

4.242241382598877
3.5613768100738525
3.2819509506225586
3.051840305328369
2.95176362991333
2.866408109664917
2.82157039642334
2.7637434005737305
2.734311819076538
2.695187568664551
2.674452781677246
2.6452255249023438
2.6305150985717773
2.6073215007781982
2.5970523357391357
2.5777997970581055
2.570871114730835
2.5543055534362793
2.549886465072632
2.5352187156677246
2.532679319381714
2.519402503967285
2.518282175064087
2.506061553955078
2.5060365200042725
2.4946401119232178
2.4954848289489746
2.4847447872161865
2.4862983226776123
2.4760870933532715
2.4782304763793945
2.4684486389160156
2.471090793609619
2.461660861968994
2.4647293090820312
2.4555914402008057
2.4590277671813965
2.450133800506592
2.4538896083831787
2.4452016353607178
2.449237108230591
2.4407246112823486
2.4450063705444336
2.4366438388824463
2.4411444664001465
2.4329111576080322
2.437605619430542
2.429486036300659
2.434354066848755
2.426332950592041
2.431356430053711
2.4234230518341064
2.4285857677459717
2.4207301139831543

In [7]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

# Sampling is inference only: no_grad stops autograd from building a graph
# for every forward pass through W.
with torch.no_grad():
  for i in range(5):

    out = []
    # context = indices of the two previous characters; both start at 0 ('.')
    ix2 = [0, 0]

    while True:
      # hand-built 54-wide input: the first 27 slots one-hot encode ix2[0],
      # the last 27 slots one-hot encode ix2[1] (hence the +27 offset)
      xenc = torch.zeros(1, 54)
      xenc[0][ix2[0]] = 1
      xenc[0][ix2[1] + 27] = 1

      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character

      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

      # slide the context window: drop the oldest index, append the new one
      ix2[0] = ix2[1]
      ix2[1] = ix

      out.append(itos[ix])
      if ix == 0: # index 0 is '.', the end-of-word marker
        break
    print(''.join(out))

mor.
brwanikbxqwwzjavann.
loecodab.
mimpiry.
rle.


Splitting the data into train, dev, and test sets

In [None]:

# create the dataset

# Dedicated, seeded generator for the split: without it the draws come from
# the global RNG, so the splits (and every loss computed on them) would change
# each time this cell is run.
split_g = torch.Generator().manual_seed(2147483647)

# xs / ys             -> the bulk of the data (~90% expected); this is what the training loop consumes
# dev_xs / dev_ys     -> dev split (~5.5% expected)
# train_xs / train_ys -> remaining held-out split (~4.5% expected). Despite the name this
#                        is not fed to the training loop; it appears to serve as the test set.
#                        The name is kept because later cells refer to it.
xs, ys = [], []
dev_xs, dev_ys = [], []
train_xs, train_ys = [], []

for w in words:

  # two leading '.' markers so the first character has a full two-character context
  chs = ['.'] + ['.'] + list(w) + ['.']

  # we zip 3 arrays (trigram)
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):

    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]

    # The split is decided per trigram example, not per word: first draw holds
    # out about 10% of examples, second draw divides those 55% / 45%.
    if torch.rand(1, generator=split_g).item() <= 0.1:
      if torch.rand(1, generator=split_g).item() <= 0.55:
         dev_xs.append((ix1, ix2))
         dev_ys.append(ix3)
      else:
         train_xs.append((ix1, ix2))
         train_ys.append(ix3)
    else:
      xs.append((ix1, ix2))
      ys.append(ix3)

num = len(xs)
total_samples = num + len(dev_xs) + len(train_xs)

# size and share of each split, in the order: xs, dev_xs, train_xs
print('number of examples: ', num, 'xs', 'percentage = ', (num/total_samples)*100)
print('number of examples: ', len(dev_xs), 'xs', 'percentage = ', (len(dev_xs)/total_samples)*100)
print('number of examples: ', len(train_xs), 'xs', 'percentage = ', (len(train_xs)/total_samples)*100)

# sanity check: the three shares should add up to 100
print( (num/total_samples)*100 +(len(dev_xs)/total_samples)*100 +  (len(train_xs)/total_samples)*100, 'checking total probability')
xs = torch.tensor(xs)
ys = torch.tensor(ys)

dev_xs = torch.tensor(dev_xs)
dev_ys = torch.tensor(dev_ys)

train_xs = torch.tensor(train_xs)
train_ys = torch.tensor(train_ys)

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)

# 54 inputs to 27 neurons: the input is two concatenated 27-wide one-hot vectors
W = torch.randn((54, 27), generator=g, requires_grad=True)

EVALUATING the Dev and Test Set

In [None]:
# EVALUATION
#
# NOTE(review): the dataset cell re-creates `W`, and the training loop for this
# split is in the cell that follows this one. Run that training cell before
# this one; otherwise the losses below describe the untrained weights.

def split_loss(inputs, targets):
  """Return the mean negative log-likelihood of `targets` under the current `W`.

  inputs:  integer tensor holding one (previous, current) character-index pair per row.
  targets: integer tensor with the index of the character following each pair.

  The L2 penalty used in the training loss is deliberately not included here,
  matching the original evaluation.
  """
  xenc = F.one_hot(inputs, num_classes=27).float() # one-hot encode both context characters
  xenc = xenc.view(-1, 54) # join the two 27-wide vectors into one 54-wide row
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  return -probs[torch.arange(len(targets)), targets].log().mean()

num_dev = len(dev_xs)

num_train = len(train_xs)

# Evaluation only: no_grad avoids building an autograd graph through W.
with torch.no_grad():
  print(split_loss(dev_xs, dev_ys).item(), 'dev set')
  # train_xs / train_ys hold the held-out split that the training loop never
  # sees, so this figure is a test loss rather than a training loss.
  print(split_loss(train_xs, train_ys).item(), 'test set')

Using cross-entropy loss (F.cross_entropy computes the same softmax + negative log-likelihood as the manual version above, so the result is the same)

In [None]:
# gradient descent with cross entropy loss

# The inputs do not change between iterations, so the one-hot encoding is
# built once here instead of being recomputed on every step.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
xenc = xenc.view(-1, 54) # join the two 27-wide one-hot vectors of each pair into one 54-wide row

for k in range(100):

  # forward pass
  logits = xenc @ W # predict log-counts

  # cross_entropy takes the raw logits and the target indices directly;
  # a small L2 penalty on W is added on top
  loss = F.cross_entropy(logits, ys) + 0.01*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()

  # update: plain gradient-descent step with learning rate 50, applied
  # under no_grad so the in-place change is not recorded by autograd
  with torch.no_grad():
    W -= 50 * W.grad