In [1]:
names = open('names.txt').read().splitlines()

In [58]:
chars = sorted(list(set(''.join(names))) + ['.'])
n_unique_chars = len(chars)
n_unique_chars


27

In [3]:
stoi = {character:index for index,character in enumerate(chars)}
stoi['.']
itos = {index:character for character,index in stoi.items()}

In [18]:
import torch
[names[i] for i in (torch.arange(-10,-1)).tolist()]

['micaiah',
 'gautam',
 'janya',
 'talea',
 'rhylin',
 'sira',
 'theon',
 'opal',
 'zaylynn']

In [78]:
block_size = 5 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape,Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(names)
n1 = int(0.8*len(names))
n2 = int(0.9*len(names))

Xtr, Ytr = build_dataset(names[:n1])
Xval, Yval = build_dataset(names[n1:n2])
Xte, Yte = build_dataset(names[n2:])

torch.Size([182484, 5]) torch.Size([182484])
torch.Size([22869, 5]) torch.Size([22869])
torch.Size([22793, 5]) torch.Size([22793])


In [75]:
hidden_neurons = 200
vecembedding_dim = 10

W1 = torch.randn((vecembedding_dim *block_size,hidden_neurons)) 
b1 = torch.randn((hidden_neurons))
W2 = torch.randn((hidden_neurons,n_unique_chars))
b2 = torch.randn(n_unique_chars)
C = torch.randn((n_unique_chars,vecembedding_dim))
batchnorm_gain = torch.ones(size=[1,hidden_neurons])
batchnorm_bias = torch.zeros(size=[1,hidden_neurons])

# Individual stats are to keep track of these metrics when validation split 
# is being tested, because then batchnorm is not used, hence we need a global mean and std  
# ALso, these are not learnable parameters, they will be calculated under torch.no_grad()
# decoration to isolate it from the computation graph, since it is just a variable updation, 
# see training code to understand.

individual_mean = torch.zeros(size=[1,hidden_neurons])
individual_std = torch.ones(size=[1,hidden_neurons])

parameters = [C,W1,b1,W2,b2,batchnorm_gain,batchnorm_bias]
for params in parameters:
    params.requires_grad = True
    
y = sum(p.nelement() for p in parameters)
y

16297

In [66]:
C[Xtr[torch.arange(0,512)]].shape


torch.Size([512, 5, 10])

In [77]:

#learning_rates_shop = torch.linspace(0.001,2.000,2000)
#losses = []
#learning_rates = []
minibatch_size = 64

for i in range(100000):
    #Forward Pass
    mini_batch_indices = torch.randint(0,Xtr.shape[0],(minibatch_size,)) 
    embedding = C[Xtr[mini_batch_indices]]
    h_preactivation = embedding.view(minibatch_size,block_size*vecembedding_dim) @ W1 +b1
    batch_mean = h_preactivation.mean(dim=0,keepdim=True)
    batch_std = h_preactivation.std(dim=0,keepdim=True)
    h_preactivation = ((h_preactivation - batch_mean) * (batchnorm_gain))/batch_std  + batchnorm_bias
    
    with torch.no_grad():
        individual_mean = individual_mean*0.90 + batch_mean*0.1
        individual_std  = individual_std *0.90 + batch_std *0.1
    
    h = torch.nn.functional.tanh(h_preactivation)
    logits = h @ W2 + b2
    loss = torch.nn.functional.cross_entropy(logits,Ytr[mini_batch_indices])
    
    
    #Backward Pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    #update
    lr = 0.01 if i<60000 else 0.001
    for params in parameters:
        params.data += -lr * params.grad
    #losses.append(loss.item())
    #learning_rates.append(learning_rates_shop[i])
    
    if (not i%10000):
        print(f"epoch: {(i+1)}/100000 -  loss:{loss.item():.4f}")
    
    

epoch: 1/100000 -  loss:2.1626
epoch: 10001/100000 -  loss:2.1606
epoch: 20001/100000 -  loss:2.2516
epoch: 30001/100000 -  loss:2.1270
epoch: 40001/100000 -  loss:2.2880
epoch: 50001/100000 -  loss:2.4601
epoch: 60001/100000 -  loss:2.2490
epoch: 70001/100000 -  loss:2.2355
epoch: 80001/100000 -  loss:2.2356
epoch: 90001/100000 -  loss:1.9421


In [79]:

val_batch_size = 0

for i in range(Xval.nelement()):
    #Forward Pass
    mini_batch_indices = torch.randint(0,Xval.shape[0],(val_batch_size,)) 
    embedding = C[Xval[i]]
    h_preactivation = embedding.view(val_batch_size,block_size*vecembedding_dim) @ W1 +b1
    
    h_preactivation = ((h_preactivation - individual_mean) /individual_std)
    
    
    
    h = torch.nn.functional.tanh(h_preactivation)
    logits = h @ W2 + b2
    loss = torch.nn.functional.cross_entropy(logits,Yval[i])
    
    
    if(not i%100):
        print(f"val example: {i} / {len(Xval.nelement())} - val loss: {loss:.4f} ")
    
   
    
    
    

ValueError: Expected input batch_size (1) to match target batch_size (0).

In [35]:
%matplotlib inline
import matplotlib.pyplot as plt
#plt.plot(learning_rates_shop,losses)   

In [49]:
# predictions

for i in range(10):
    out = []
    context = [0] * block_size 
    while True:
      emb = C[torch.tensor([context])] 
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = torch.nn.functional.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=None).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

polrigan.
girya.
amerie.
jres.
deesanoen.


cay.
kanarimus.
ginrse.
ziant.
hunven.
