In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [2]:
words =  open('names.txt','r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
alphabets = sorted(list(set(''.join(words))))

stoi = {s:i+1 for i,s in enumerate(alphabets)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [4]:
#Building Dataset

def build_dataset(words):
    xs = []
    ys = []
    len_ws =[] 

    for w in words:

        chr = '...' + w +'.'
        i = 0

        for chr1,chr2,chr3,chr4 in zip(chr,chr[1:],chr[2:],chr[3:]):
            len_ws.append(i)
            i += 1
            xs.append([stoi[chr1],stoi[chr2],stoi[chr3]])
            ys.append(stoi[chr4])
    len_ws = torch.tensor(len_ws)
    len_ws = F.one_hot(len_ws,num_classes=16)


    xs = torch.tensor(xs)
    ys = torch.tensor(ys)

    return xs,ys,len_ws

import random

random.shuffle(words)

train_ix = int(len(words) * 0.8)
val_ix = int(len(words) * 0.9)

Xtr,ytr,nth_tr = build_dataset(words[:train_ix])
Xval,yval,nth_val = build_dataset(words[train_ix:val_ix])
Xte,yte,nth_te = build_dataset(words[val_ix:])

In [5]:
Xtr.shape

torch.Size([182418, 3])

In [6]:
feature_vec_dim = 10
h_size = 300

g = torch.random.manual_seed(576809309)

C = torch.randn((27,feature_vec_dim),generator=g);  C.requires_grad = True
W1 = torch.randn((46,h_size),generator=g);          W1.requires_grad = True
W2 = torch.randn((h_size,27),generator=g) * 0.01;          W2.requires_grad = True
b1 = torch.randn((1,h_size),generator=g);           b1.requires_grad = True
b2 = torch.randn((1,27),generator=g) * 0;               b2.requires_grad = True

parameters = [C,W1,W2,b1,b2]

In [7]:
running_losses = []
lr = 0.1
patience = 3

strikes = 0

for i in range(300000):

    if i % 1000 != 0:

        ix = torch.randint(0,len(Xtr),(512,))
        #forward pass
        X_enc = C[Xtr[ix]].view((-1,3 * feature_vec_dim))
        X_enc = torch.cat([X_enc,nth_tr[ix]],dim=1)
        act = X_enc @ W1 + b1
        h = act.tanh()
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits,ytr[ix])

        if i%10000 == 0:
            print(f"Loss {i} ----> {loss.item()}")

        #Backward Pass
        for p in parameters:
            p.grad = None

        loss.backward()
        #Update Parameters 
        for p in parameters:
            p.data -= lr * p.grad
    else:
        #Evaluate on full dataset
        X_enc = C[Xtr].view((-1,3 * feature_vec_dim))
        X_enc = torch.cat([X_enc,nth_tr],dim=1)
        act = X_enc @ W1 + b1
        h = act.tanh()
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits,ytr)
        running_losses.append(loss.item())
        if  len(running_losses) > 9 and running_losses[-1] > running_losses[-10]: # Looking 10000 runs back for a drop in loss...
            print(f"Strike {strikes + 1}: {running_losses[-2]} => {running_losses[-1]}")
            strikes += 1
        if strikes == patience:
            strikes = 0
            print(f"Loss on {i-1000}th iteration => {running_losses[-2]}")
            print(f"Loss on {i}th iteration => {running_losses[-1]}")
            print(f"Reducing learning rate from {lr} to {lr*0.5}")
            lr *= 0.5 # Halve the learning rate

        if len(running_losses) > 49 and running_losses[-50] - running_losses[-1] < 0.01: #Exploring over every 10,000 steps
            print("Loss is reducing at too slow a rate. Training has been halted")
            break
    
    if lr < 0.005:
        print("Gradient is vanishing...")
        break #Halt before gradient starts to vanish   



Strike 1: 1.9527976512908936 => 1.9536689519882202
Strike 2: 1.9461408853530884 => 1.94944167137146
Strike 3: 1.94944167137146 => 1.9486933946609497
Loss on 135000th iteration => 1.94944167137146
Loss on 136000th iteration => 1.9486933946609497
Reducing learning rate from 0.1 to 0.05
Strike 1: 1.935076117515564 => 1.936283826828003
Strike 2: 1.936283826828003 => 1.9357365369796753
Strike 3: 1.9322268962860107 => 1.9322967529296875
Loss on 185000th iteration => 1.9322268962860107
Loss on 186000th iteration => 1.9322967529296875
Reducing learning rate from 0.05 to 0.025
Loss is reducing at too slow a rate. Training has been halted


In [8]:
print(i)

186000


In [9]:
X_enc = C[Xtr].view((-1,3 * feature_vec_dim))
X_enc = torch.cat([X_enc,nth_tr],dim=1)
act = X_enc @ W1 + b1
h = act.tanh()
logits = h @ W2 + b2
loss = F.cross_entropy(logits,ytr)
print(loss.item())

1.9322967529296875


In [12]:
X_enc = C[Xte].view((-1,3 * feature_vec_dim))
X_enc = torch.cat([X_enc,nth_te],dim=1)
act = X_enc @ W1 + b1
h = act.tanh()
logits = h @ W2 + b2
loss = F.cross_entropy(logits,yte)
print(loss.item())

2.0584189891815186
