In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import time
from torch.nn import functional as F
from torch.nn import Sequential, Tanh, Parameter
import mlflow as mlflow
%matplotlib inline

In [7]:
# Informational check: report whether PyTorch's MPS backend can be used here.
torch.backends.mps.is_available()

True

In [8]:
# Read the corpus, one name per line. The context manager guarantees the file
# handle is closed even if reading fails (the bare open() call leaked it).
with open("../datasets/llm/names.txt",mode="r",encoding="utf8") as names_file:
    names = names_file.read().splitlines()
# Peek at the first few entries as the cell's displayed value.
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [9]:
# Vocabulary: every distinct character appearing in the names plus the "."
# terminator, in sorted order so the index assignment is deterministic.
vocab = "".join(sorted(set("".join(names) + ".")))
# Character -> integer index.
stoi = {ch: idx for idx, ch in enumerate(vocab)}
# Integer index -> character.
itos = dict(enumerate(vocab))

In [10]:
# Width of each character embedding vector (second argument to Embedding).
e_dim = 10
# Width of the hidden layers (Linear / BatchNorm1d / RNN sizes in the model).
h_dim = 200
# Number of preceding characters used as the context window for a prediction.
n_context = 8

In [11]:
# Make newly created tensors default to the CPU device.
torch.set_default_device("cpu")
def build_dataset(data, n_context):
    """Turn words into (context window, next character) training pairs.

    Each word is extended with a trailing "." and walked character by
    character; for every character the current window of ``n_context``
    previous indices is the input and the character's index is the target.
    The window starts as all zeros and slides by one after each character.
    Character indices are looked up in the module-level ``stoi`` mapping.

    Args:
        data: iterable of strings.
        n_context: length of the context window.

    Returns:
        ``(xs, ys)`` tensors built from the collected windows and targets.
        Their shapes are also printed.
    """
    windows, targets = [], []

    for word in data:
        window = [0] * n_context
        for char in word + ".":
            target = stoi[char]
            windows.append(window)
            targets.append(target)
            # Slide the window: drop the oldest index, append the newest.
            window = [*window[1:], target]

    xs, ys = torch.tensor(windows), torch.tensor(targets)
    print(xs.shape,ys.shape)
    return xs,ys

# Shuffle a *copy* of the names with a fixed seed. This keeps the split
# reproducible across kernel restarts and makes the cell idempotent: the
# original unseeded in-place shuffle re-partitioned train/dev/test every time
# the cell was re-run.
split_rng = np.random.default_rng(42)
shuffled_names = list(names)
split_rng.shuffle(shuffled_names)

data_size = len(shuffled_names)
# 80% / 10% / 10% boundaries, computed once.
train_end = int(data_size * .8)
dev_end = int(data_size * .9)

train_data = shuffled_names[:train_end]
dev_data = shuffled_names[train_end:dev_end]
test_data = shuffled_names[dev_end:]

Xtr,Ytr = build_dataset(train_data,n_context)
Xdev,Ydev = build_dataset(dev_data,n_context)
Xtest,Ytest = build_dataset(test_data,n_context)

torch.Size([182562, 8]) torch.Size([182562])
torch.Size([22771, 8]) torch.Size([22771])
torch.Size([22813, 8]) torch.Size([22813])


In [None]:
class Linear(torch.nn.Module):
    """Dense layer computing ``(x @ W1) * nin**-0.5`` plus an optional bias.

    Args:
        nin: number of input features (rows of ``W1``).
        nout: number of output features (columns of ``W1``).
        bias: when False no bias parameter is created and ``b1`` is None.

    The most recent output is kept on ``self.out`` for inspection.
    Parameters are exposed through ``torch.nn.Module.parameters()``.
    """

    def __init__(self,nin,nout,bias=True):
        super().__init__()
        self.nin = nin
        self.nout = nout
        self.W1 = Parameter(torch.randn(nin, nout))
        self.b1 = Parameter(torch.randn(nout)) if bias else None

    def forward(self,x):
        # Scale by 1/sqrt(fan_in) at call time rather than at initialisation.
        self.out = x @ self.W1 * self.nin**-0.5
        # Identity check: comparing a tensor with `!= None` goes through the
        # tensor comparison operator instead of testing for absence.
        if self.b1 is not None:
            self.out = self.out + self.b1
        return self.out
        
class Flatten(torch.nn.Module):
    """Reshape the input into a 2-D tensor and remember it on ``self.out``.

    * More than two dimensions: the first dimension is kept and everything
      after it is merged, giving ``(x.shape[0], -1)``.
    * Otherwise: the first two dimensions are merged into a single row,
      giving ``(-1, x.shape[0] * x.shape[1])``.
    """

    def forward(self,x):
        if x.dim() > 2:
            flat = x.view(x.size(0), -1)
        else:
            row_width = x.size(0) * x.size(1)
            flat = x.view(-1, row_width)
        self.out = flat
        return flat
        
class Embedding(torch.nn.Module):
    """Trainable lookup table holding one ``edim``-wide row per vocabulary entry.

    Args:
        vocab_size: number of rows in the table.
        edim: width of each row.

    ``forward`` indexes the table with ``x`` and keeps the result on
    ``self.out``. The table is exposed through ``Module.parameters()``.
    """

    def __init__(self,vocab_size,edim):
        super().__init__()
        table = torch.randn(vocab_size, edim)
        self.embedding = Parameter(table)

    def forward(self,x):
        looked_up = self.embedding[x]
        self.out = looked_up
        return looked_up

class BatchNorm1d(torch.nn.Module):
    """Batch normalisation over dimension 0 with a learnable gain and bias.

    In training mode the batch mean and standard deviation normalise the
    input and are folded into running estimates. In eval mode the running
    estimates are used instead.

    Args:
        nout: number of features (size of the last dimension).
        alpha: weight given to the *current batch* statistics when updating
            the running estimates:
            ``running = (1 - alpha) * running + alpha * batch``.
        eps: added to the standard deviation to avoid division by zero.

    The running statistics are registered as buffers so they follow the module
    across ``.to(...)`` calls and are stored in ``state_dict()``. They are not
    returned by ``parameters()``.
    """

    def __init__(self,nout,alpha=0.001,eps=1e-05):
        super().__init__()
        self.nout = nout
        self.bgain = Parameter(torch.ones(1,nout))
        self.bbias = Parameter(torch.zeros(1,nout))
        self.register_buffer("bstdrunning", torch.ones(1,nout))
        self.register_buffer("bmeanrunning", torch.zeros(1,nout))
        self.alpha = alpha
        self.eps = eps

    def forward(self,x):
        if self.training:
            bmeani = x.mean(0,keepdim=True)
            bstdi = x.std(0,keepdim=True)
            self.out = self.bgain*(x - bmeani)/ (bstdi + self.eps) + self.bbias
            # Running statistics are bookkeeping only and must not be tracked
            # by autograd.
            with torch.no_grad():
                # Slow exponential moving average: keep (1 - alpha) of the old
                # estimate and blend in alpha of the new batch. The previous
                # code had the weights swapped and referenced an undefined
                # `bstdrunning` name (NameError on the first training step).
                self.bmeanrunning = (1 - self.alpha) * self.bmeanrunning + self.alpha * bmeani
                self.bstdrunning = (1 - self.alpha) * self.bstdrunning + self.alpha * bstdi
        else:
            self.out = self.bgain*(x - self.bmeanrunning)/ (self.bstdrunning + self.eps) + self.bbias

        return self.out
class RNN(torch.nn.Module):
    """Minimal Elman-style recurrent layer returning the last step's output.

    Recurrence: ``h_t = tanh(wx(x_t) + wh(h_{t-1}))`` starting from
    ``h_0 = 0``. The returned value is ``wy(h_T)``.

    Args:
        nin: size of each input vector.
        nout: size of the hidden state and of the output.

    ``forward`` accepts either
      * ``(batch, nin)``, treated as a single time step, or
      * ``(batch, time, nin)``, iterated over the time dimension,
    and always returns a ``(batch, nout)`` tensor.

    The hidden state is re-initialised on every call. Carrying it (and its
    autograd graph) across unrelated mini-batches forced ``retain_graph=True``
    during training and failed as soon as the batch size changed. The final
    hidden state of the latest call is kept on ``self.h`` for inspection.
    """

    def __init__(self,nin,nout):
        super().__init__()
        self.nin = nin
        self.nout = nout
        self.h = None
        # wh and wy act on the hidden state, so their input size is nout.
        self.wh = nn.Linear(nout,nout)
        self.wx = nn.Linear(nin,nout)
        self.wy = nn.Linear(nout,nout)

    def forward(self,x):
        if x.dim() == 2:
            # A flat (batch, nin) input is a sequence of length one.
            x = x.unsqueeze(1)
        B, T, _ = x.shape
        # Fresh zero state matching the input's dtype and device.
        h = x.new_zeros(B, self.nout)
        for step in range(T):
            # The nn.Linear layers are modules and must be called; the `@`
            # operator is not defined between a tensor and a module.
            h = torch.tanh(self.wx(x[:, step]) + self.wh(h))
        self.h = h
        return self.wy(h)
    
class Tanh1(torch.nn.Module):
    """Element-wise tanh activation that remembers its last output on ``self.out``."""

    def forward(self,x):
        activated = F.tanh(x)
        self.out = activated
        return activated
@torch.no_grad()
def evaluate(X,Y,model):
    """Run ``model`` on ``X`` in eval mode and score it against ``Y``.

    The model is switched to eval mode and left there. Gradients are disabled
    for the whole call.

    Returns:
        ``(out, loss)``: the model output and its cross-entropy against ``Y``.
    """
    model.eval()
    predictions = model(X)
    return predictions, F.cross_entropy(predictions, Y)

@torch.no_grad()
def generate(n_words,model,generator=None):
    """Sample ``n_words`` words from ``model`` and print one per line.

    Sampling starts from an all-zero context of length ``n_context`` (module
    level) and repeatedly draws the next index from the softmax of the model's
    logits until index 0 is drawn, which ends the word. Indices are mapped
    back to characters through the module-level ``itos``.

    Args:
        n_words: how many words to generate.
        model: callable module mapping a ``(1, n_context)`` index tensor to
            ``(1, vocab)`` logits. It is switched to eval mode and left there.
        generator: optional ``torch.Generator`` for reproducible sampling;
            ``None`` uses the global RNG.
    """
    model.eval()
    for _ in range(n_words):
        chars = []
        context = [0] * n_context
        while True:
            # Feed a real tensor: indexing with a nested Python list has
            # version-dependent semantics in PyTorch.
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits,dim=1)
            ix = torch.multinomial(probs,num_samples=1,replacement=True,generator=generator).item()
            if ix == 0:
                # Index 0 is the end-of-word marker.
                break
            context = context[1:] + [ix]
            chars.append(itos[ix])
        print("".join(chars))


In [14]:
# Network layout: embed the context window, flatten it, pass it through a
# dense stage and a recurrent stage (each followed by batch norm and tanh),
# then project to one logit per vocabulary entry.
Model = Sequential(
    Embedding(len(vocab), e_dim),
    Flatten(),
    Linear(n_context * e_dim, h_dim, bias=False),
    BatchNorm1d(h_dim),
    Tanh1(),
    RNN(h_dim, h_dim),
    BatchNorm1d(h_dim),
    Tanh1(),
    Linear(h_dim, len(vocab), bias=False),
)

In [None]:
batch_size = 32
n_steps = 200000
lr_decay_step = 100000  # step at which the learning rate drops from 0.1 to 0.01
lrsloss = []

# Start the wall-clock timer; the summary line below previously referenced an
# undefined `t` and subtracted in the wrong order.
t = time.time()
# Nothing in the loop leaves training mode, so switch once up front.
Model.train()

for i in range(n_steps):
    lr = 0.1 if i < lr_decay_step else 0.01

    # Forward pass on a random mini-batch.
    ix = torch.randint(0,high = Xtr.shape[0],size=(batch_size,))
    out = Model(Xtr[ix])
    loss = F.cross_entropy(out,Ytr[ix])
    if i % 10000 == 0:
        print(F"loss at {i}/{n_steps} {loss.item():.4f}")

    # Backward pass. Each step builds a fresh graph, so the graph is not
    # retained after backward().
    for p in Model.parameters():
        p.grad = None
    loss.backward()

    # Plain SGD step, performed outside autograd instead of through `.data`.
    with torch.no_grad():
        for p in Model.parameters():
            if p.grad is not None:
                p -= lr * p.grad


print(F"Mins taken: {(time.time() - t)/60}")
print(F" Final loss: {loss.item():.4f}")