In [415]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
from torch.nn import functional as F
from torch.nn import Sequential, Tanh, Parameter
import mlflow as mlflow
%matplotlib inline

In [416]:
torch.manual_seed(42);

In [417]:
names = open("../datasets/llm/names.txt",mode="r",encoding="utf8").read().splitlines()
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [418]:
vocab = "".join(sorted(set(x for word in names+["."] for x in word))) # building vocab

In [419]:
len(vocab)

27

In [420]:
stoi = {k:v for v,k in enumerate(vocab)}
itos = {v:k for v,k in enumerate(vocab)}

In [421]:
torch.set_default_device("cpu")
def build_dataset(data, n_context):
    xs = []
    ys = []
    
    for x in data:
        context = [0]* n_context
        for ch in x + ".":
            # print(f"{context} {stoi[ch]}")
            ix = stoi[ch]
            xs.append(context)
            ys.append(ix)
            context = context[1:]+[ix]
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    print(xs.shape,ys.shape)
    return xs,ys

np.random.shuffle(names)

data_size =  len(names)

train_data = names[:int(data_size * .8)]
dev_data = names[int(data_size * .8):int(data_size * .9)]
test_data = names[int(data_size * .9):]

#hyperparameters
n_context = 8
e_dim = 10
h_dim = 200
n_consec = 2

Xtr,Ytr = build_dataset(train_data,n_context) 
Xdev,Ydev = build_dataset(dev_data,n_context)
Xtest,Ytest = build_dataset(test_data,n_context)

torch.Size([182659, 8]) torch.Size([182659])
torch.Size([22534, 8]) torch.Size([22534])
torch.Size([22953, 8]) torch.Size([22953])


In [474]:
class Linear(torch.nn.Module):
    def __init__(self,nin,nout,bias=True,weight=1):
        super().__init__()
        self.nin = nin
        self.nout = nout
        self.weights = Parameter(torch.randn(nin, nout) * weight)
        self.bias = Parameter(torch.randn(nout)) if bias else None

    
    def forward(self,x):
        self.out = x @ self.weights * self.nin**-0.5
        if self.bias != None:
            self.out = self.out + self.bias
        return self.out 

    # def parameters(self):
    #     self.parameters = [self.W1] + [self.b1] if self.b1!=None else [self.W1]
    #     return self.parameters 
        
class Flatten(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self,x):
        self.out = x.view(x.shape[0],-1)
        return self.out
        
class Embedding(torch.nn.Module):
    def __init__(self,vocab_size,edim):
        super().__init__()
        self.weights = Parameter(torch.randn(vocab_size, edim))

    def forward(self,x):
        self.out = self.weights[x]
        return self.out

    # def parameters(self):
    #     self.parameters = [self.embedding]
    #     return self.parameters

class BatchNorm1d(torch.nn.Module):
    def __init__(self,nout,alpha=0.001,eps=1e-05):
        super().__init__()
        self.nout = nout
        self.bgain = Parameter(torch.ones(1,nout))
        self.bbias = Parameter(torch.zeros(1,nout))
        self.bstdrunning = torch.ones(1,nout)
        self.bmeanrunning = torch.zeros(1,nout)
        self.alpha = alpha
        self.eps = eps
        
    def forward(self,x):
        if(self.training):
            dimn = len(x.shape) -1
            # print([*range(dimn)])
            bmeani = x.mean([*range(dimn)],keepdim=True)
            bstdi = x.std([*range(dimn)],keepdim = True)
            self.out = self.bgain*(x - bmeani)/ (bstdi + self.eps) + self.bbias
            with torch.no_grad():
                self.bmeanrunning = self.alpha * self.bmeanrunning + (1 - self.alpha)* bmeani
                self.bstdrunning = self.alpha * self.bstdrunning + (1 - self.alpha) * bstdi
        else:
            self.out = self.bgain*(x - self.bmeanrunning)/ (self.bstdrunning + self.eps) + self.bbias

        return self.out
   
    # def parameters(self):
    #     self.parameters = [self.bgain,self.bbias]
    #     return self.parameters

class FlattenConsecutive(torch.nn.Module):
    def __init__(self,n):
        super().__init__()
        self.n = n

    def forward(self,x):
        B,T,C = x.shape
        x=x.view(B,T//self.n,C * self.n)
        if x.shape[1] == 1:
           x =  x.squeeze(1)
        self.out = x
        return self.out

class Tanh1(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self,x):
        self.out = F.tanh(x)
        return self.out

class RNN(torch.nn.Module):
    def __init__(self,nin,nout):
        super().__init__()
        self.nin = nin
        self.nout = nout
        self.h = None
        self.wh = Parameter(torch.randn(nin,nout)) #200 X 200
        self.wx = Parameter(torch.randn(nin,nout))
        self.wy = Parameter(torch.randn(nin,nout))
    
    def forward(self,x):
        if self.h is None:
            self.h = torch.zeros(1,self.nout) # 1x200
        # print(self.h.shape,self.wy.shape,self.wh.shape,self.wx.shape,x.shape)
        self.h = F.tanh((self.h @ self.wh) + (x @ self.wx)) #32x200 @ 200 X 200
        out = self.h @ self.wy
        print(out.shape)
        return out
        
@torch.no_grad()
def evaluate(X,Y,model):
    model.eval()
    out = model(X)
    loss = F.cross_entropy(out,Y)
    return out,loss

@torch.no_grad()
def generate(n_words,model):
    model.eval()
    for i in range(10):
        #forward pass
        out = []
        context = [0] * 8
        while True:
            logits= model(torch.tensor(context).view(1,-1))
            p = F.softmax(logits,dim=1)
            ix = torch.multinomial(p,num_samples = 1, replacement = True)
            context = context[1:] + [ix.item()]
            if ix.item() == 0:
                break
            out.append(itos[ix.item()])
        print("".join(out))


In [478]:
Model =  Sequential(
    Embedding(len(vocab),e_dim),
    FlattenConsecutive(n_consec),Linear(n_consec*e_dim,h_dim,False),BatchNorm1d(h_dim),Tanh1(),
    # FlattenConsecutive(n_consec),Linear(n_consec*h_dim,h_dim,False),BatchNorm1d(h_dim),Tanh1(),
    # FlattenConsecutive(n_consec),Linear(n_consec*h_dim,h_dim,False),BatchNorm1d(h_dim),Tanh1(),
    RNN(h_dim,h_dim),BatchNorm1d(h_dim),Tanh1(),
    Linear(h_dim,len(vocab),weight=0.1)
    
)

In [479]:
for x in Model.parameters():
    print(x.shape)

torch.Size([27, 10])
torch.Size([20, 200])
torch.Size([1, 200])
torch.Size([1, 200])
torch.Size([200, 200])
torch.Size([200, 200])
torch.Size([200, 200])
torch.Size([1, 200])
torch.Size([1, 200])
torch.Size([200, 27])
torch.Size([27])


In [480]:
batch_size = 32
lrsloss = []
lossi = []

for i in range(20000):
    #forward pass
    # print(i)
    lr = 0.01 if i <100000 else 0.001
    Model.train()
    # optimizer = torch.optim.SGD(Model.parameters(), lr=lr)
    ix = torch.randint(0,high = Xtr.shape[0],size=(batch_size,))
    out= Model.forward(Xtr[ix])
    # print(out.shape)
    loss = F.cross_entropy(out,Ytr[ix])
    lossi.append(loss.log10().item())
    if i % 1000 == 0:
        print(F"loss at {i}/20000 {loss.item():.4f}")
    #backpass
    for p in Model.parameters():
        p.grad = None
    loss.backward(retain_graph=True)
    for p in Model.parameters():
        p.data += -lr*p.grad

    # if i == 1:
    #     break
        

print(F" Final loss: {loss.item():.4f}")

torch.Size([32, 4, 200])


RuntimeError: Expected target size [32, 27], got [32]

In [456]:
evaluate(Xdev,Ydev,Model)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (200x200 and 1x200)

In [None]:
evaluate(Xtest,Ytest,Model)

In [None]:
generate(10,Model)

In [None]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));

In [None]:
nin = 5
nout = 1
x = torch.randn(32,5)
w1 = torch.randn(200,200)
w2 = torch.randn(32,200).view(32,1,200)


In [None]:
r = RNN(5,5)

In [None]:
r(x).shape

In [None]:
r1 = torch.nn.RNN(5,5,bias=False,batch_first = True)

In [None]:
r1(x)[1].shape

In [None]:
r.h.shape

In [None]:
[*Model[0].parameters()][0][Xtr[ix]].shape

In [None]:
x @ torch.zeros(5,5)