In [89]:
names = open('names.txt').read().splitlines()
import random
random.seed(512)
random.shuffle(names)

False


In [56]:
chars = sorted(list(set(''.join(names))) + ['.'])
n_unique_chars = len(chars)
n_unique_chars


27

In [57]:
stoi = {character:index for index,character in enumerate(chars)}
stoi['.']
itos = {index:character for character,index in stoi.items()}

In [58]:
import torch
import random
random.seed(500)
random.shuffle(names)
[names[i] for i in (torch.arange(-10,-1)).tolist()]

['muzammil',
 'jamieson',
 'kaylina',
 'devi',
 'jazlene',
 'khaidyn',
 'dakota',
 'paiden',
 'kendrik']

In [59]:
block_size = 5 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape,Y.shape)
  return X, Y


n1 = int(0.8*len(names))
n2 = int(0.9*len(names))

Xtr, Ytr = build_dataset(names[:n1])
Xval, Yval = build_dataset(names[n1:n2])
Xte, Yte = build_dataset(names[n2:])

torch.Size([182560, 5]) torch.Size([182560])
torch.Size([22862, 5]) torch.Size([22862])
torch.Size([22724, 5]) torch.Size([22724])


In [60]:
hidden_neurons = 200
vecembedding_dim = 22

W1 = torch.randn((vecembedding_dim *block_size,hidden_neurons)) 
#b1 = torch.randn((hidden_neurons))
W2 = torch.randn((hidden_neurons,n_unique_chars))
b2 = torch.randn(n_unique_chars)
C = torch.randn((n_unique_chars,vecembedding_dim))
batchnorm_gain = torch.ones(size=[1,hidden_neurons])
batchnorm_bias = torch.zeros(size=[1,hidden_neurons])

# Individual stats are to keep track of these metrics when validation split 
# is being tested, because then batchnorm is not used, hence we need a global mean and std  
# ALso, these are not learnable parameters, they will be calculated under torch.no_grad()
# decoration to isolate it from the computation graph, since it is just a variable updation, 
# see training code to understand.

individual_mean = torch.zeros(size=[1,hidden_neurons])
individual_std = torch.ones(size=[1,hidden_neurons])

parameters = [C,W1,W2,b2,batchnorm_gain,batchnorm_bias]
for params in parameters:
    params.requires_grad = True
    
y = sum(p.nelement() for p in parameters)
y

28421

In [61]:
from tqdm import tqdm
#learning_rates_shop = torch.linspace(0.001,2.000,2000)
#losses = []
#learning_rates = []
minibatch_size = 64

for i in tqdm(range(200000)):
    #Forward Pass
    mini_batch_indices = torch.randint(0,Xtr.shape[0],(minibatch_size,)) 
    embedding = C[Xtr[mini_batch_indices]]
    h_preactivation = embedding.view(minibatch_size,block_size*vecembedding_dim) @ W1 #+b1
    batch_mean = h_preactivation.mean(dim=0,keepdim=True)
    batch_std = h_preactivation.std(dim=0,keepdim=True)
    h_preactivation = ((h_preactivation - batch_mean) * (batchnorm_gain))/batch_std  + batchnorm_bias
    
    with torch.no_grad():
        individual_mean = individual_mean*0.99 + batch_mean*0.01
        individual_std  = individual_std *0.99 + batch_std *0.01
    
    h = torch.nn.functional.tanh(h_preactivation)
    logits = h @ W2 + b2
    loss = torch.nn.functional.cross_entropy(logits,Ytr[mini_batch_indices])
    
    
    #Backward Pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    #update
    lr = 0.1 if i<60000 else 0.01
    for params in parameters:
        params.data += -lr * params.grad
    #losses.append(loss.item())
    #learning_rates.append(learning_rates_shop[i])
    
    if (not i%10000):
        print(f"epoch: {(i)}/200000 -  loss:{loss.item():.4f}")
    
    

  0%|          | 98/200000 [00:00<06:30, 511.67it/s]

epoch: 0/200000 -  loss:18.1969


  5%|▌         | 10067/200000 [00:19<05:24, 584.46it/s]

epoch: 10000/200000 -  loss:2.2580


 10%|█         | 20055/200000 [00:45<06:55, 433.17it/s]

epoch: 20000/200000 -  loss:2.5357


 15%|█▌        | 30052/200000 [01:10<08:34, 330.19it/s]

epoch: 30000/200000 -  loss:2.2721


 20%|██        | 40060/200000 [01:34<06:11, 430.06it/s]

epoch: 40000/200000 -  loss:2.5676


 25%|██▌       | 50051/200000 [01:59<06:09, 405.76it/s]

epoch: 50000/200000 -  loss:2.1879


 30%|███       | 60057/200000 [02:24<05:28, 425.68it/s]

epoch: 60000/200000 -  loss:2.1692


 35%|███▌      | 70067/200000 [02:43<03:49, 566.49it/s]

epoch: 70000/200000 -  loss:2.0701


 40%|████      | 80078/200000 [03:01<03:35, 555.48it/s]

epoch: 80000/200000 -  loss:2.3395


 45%|████▌     | 90082/200000 [03:18<03:11, 573.55it/s]

epoch: 90000/200000 -  loss:1.9042


 50%|█████     | 100064/200000 [03:36<03:21, 496.65it/s]

epoch: 100000/200000 -  loss:2.1106


 55%|█████▌    | 110079/200000 [03:54<02:34, 582.30it/s]

epoch: 110000/200000 -  loss:2.0542


 60%|██████    | 120084/200000 [04:11<02:18, 577.35it/s]

epoch: 120000/200000 -  loss:2.3263


 65%|██████▌   | 130098/200000 [04:29<02:04, 560.78it/s]

epoch: 130000/200000 -  loss:2.4381


 70%|███████   | 140084/200000 [04:47<01:42, 584.82it/s]

epoch: 140000/200000 -  loss:2.2697


 75%|███████▌  | 150082/200000 [05:04<01:30, 550.34it/s]

epoch: 150000/200000 -  loss:2.2785


 80%|████████  | 160088/200000 [05:22<01:08, 582.18it/s]

epoch: 160000/200000 -  loss:2.0066


 85%|████████▌ | 170061/200000 [05:39<00:51, 581.88it/s]

epoch: 170000/200000 -  loss:2.2836


 90%|█████████ | 180094/200000 [05:57<00:34, 573.30it/s]

epoch: 180000/200000 -  loss:2.1049


 95%|█████████▌| 190107/200000 [06:15<00:17, 573.79it/s]

epoch: 190000/200000 -  loss:2.3371


100%|██████████| 200000/200000 [06:32<00:00, 509.62it/s]


In [62]:
@torch.no_grad()
def split_loss(split):
    
    x,y = {
    'train': (Xtr, Ytr),
    'val': (Xval, Yval),
    'test': (Xte, Yte),
          }[split]
      
    #Forward Pass
    embedding = C[x]
    h_preactivation = embedding.view(embedding.shape[0],-1) @ W1 #+b1

    h_preactivation = ((h_preactivation - individual_mean)*batchnorm_gain /individual_std) + batchnorm_bias
    
    h = torch.tanh(h_preactivation)
    logits = h @ W2 + b2
    loss = torch.nn.functional.cross_entropy(logits,y)
    print(split,loss.item())

split_loss('val')
split_loss('train')



val 2.1617770195007324
train 2.161390781402588


In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
#plt.plot(learning_rates_shop,losses)   

In [63]:
# predictions

for i in range(10):
    out = []
    context = [0] * block_size 
    while True:
      emb = C[torch.tensor([context])] 
      h = torch.tanh(emb.view(1, -1) @ W1 )
      logits = h @ W2 + b2
      probs = torch.nn.functional.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=None).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

ppqqvihnzvqzqdhqupypogxfogdzlqqavikahufgkuffprxmgdgwivjqsvivqdawhwrvwvkwyvk.
pprpquvibquqqqupyuguzubgurxcqsshfovdfrququvahcdkthftplbmdoxdmwaddzkqqqevddgxwvismwyvquzwzcelmgrqxtismbqdagdovdigmavddgqupzpulbhcdossepzlchbyuchofgfvprvegocktrffrymmgwqwivqtpwvigzzydqucwhmgyvun.
ppqpvibgeldjgh.
ppqqvihnzvqzqdhqququbkdigszpfppfpfpvffdpqqdveqcpypo.
ppqqvysqupwvifzeymmucquqohghgcpjfprusqopllbgwyengzqqqzvinghqx.
ppqqvihnzhfpvjvqukthqd.
ppqqve.
ppqqvysqupwvugozufgpuffprxmgdgwivmavdwghwvwaxfwrlqvanhymdagquqdhwagnxfjvapqjkecqsuklagdogxnviqzjqmaghqugwvcdozvigquptpugblbygqupwpugzwrtffprvmgrxkcifzevdzqxyvqupwygqufwpcfffrvqmawysqdpwptwissppolqdan.
ppqpvbhwfwdvcvengzuszzqxpvigmousqupypogxuffprxmgdgwivqagddgqupzpulbhcxtvifmiyzmahqdaghufzzxpzlly.
brvllogh.
