In [None]:
import torch
import matplotlib.pyplot as plt 
import torch.nn.functional as F

import requests 

In [None]:
# Download the dataset: one word per line.
url = "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"
# url="https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as the word list.
res.raise_for_status()
words = res.text.splitlines()
# Number of words, then longest and shortest word length.
print(len(words), max(len(w) for w in words), min(len(w) for w in words))

In [None]:
# Vocabulary: every distinct character that occurs in the dataset, in sorted order.
chars = sorted(set("".join(words)))
# char -> int, numbering the characters from 1 so that 0 stays free for "."
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# "." is the special token marking both the start and the end of a word.
stoi["."] = 0
# int -> char (inverse lookup)
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Count bigram frequencies in matrix form: N[i, j] is how often character j
# directly follows character i, with "." (index 0) padding both ends of a word.
#
# The matrix is sized from the vocabulary itself (the other cells assume 27
# symbols). An oversized matrix would leave an index with no entry in `itos`;
# after add-one smoothing that index gets non-zero probability, can be sampled,
# and then fails the `itos` lookup.
vocab_size = len(stoi)
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int64)
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
# Plot the bigram frequencies as a heat map; every cell is annotated with the
# character pair (upper text) and how often it occurs (lower text).
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')
for row in range(27):
    first = itos[row]
    for col in range(27):
        pair = first + itos[col]
        count = N[row, col].item()
        # imshow puts the column on the x axis and the row on the y axis
        plt.text(col, row, pair, ha="center", va='bottom', color="gray")
        plt.text(col, row, count, ha="center", va="top", color="gray")
plt.axis("off")

In [None]:
# Turn the bigram count matrix into a matrix of per-row probabilities.
# Add-one smoothing: no probability is exactly 0, so log(prob) never becomes
# -inf and the loss stays finite.
smoothed = (N + 1).float()
# Each row is divided by its own sum; keepdim keeps the sums as a column so the
# division broadcasts across the row.
row_totals = smoothed.sum(dim=1, keepdim=True)
P = smoothed / row_totals
print(P.shape)

In [None]:
# Sample 20 words from the simple language model defined by the probability matrix
g = torch.Generator().manual_seed(2147483647)
for _ in range(20):
    ix, out = 0, []  # index 0 is ".", the start-of-word token
    while True:
        p = P[ix]  # distribution over the next character given the current one
        # To compare against less informed baselines, swap in one of these:
        # p=N[ix].float()
        # p=p/p.sum()
        # p=torch.ones(27)/27
        drawn = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
        ix = drawn.item()
        out.append(itos[ix])
        if ix == 0:  # drew "." again: the word is finished
            break
    print(''.join(out))

In [None]:
# Loss of the count-based model: average negative log-likelihood per bigram.
# The likelihood of the dataset is the product of all bigram probabilities;
# taking the log turns that product into a sum of log-probabilities.
logLikelihood = 0.0
n = 0  # number of bigrams seen
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        prob = P[stoi[ch1], stoi[ch2]]
        logLikelihood += torch.log(prob)
        n += 1
# Negate once, after the whole dataset has been accumulated.
nll = -logLikelihood

print(f"log {logLikelihood}")
print(f"nll {nll}")
print(f"LOSS nll/n {nll/n}")


In [None]:
# Until now we used the probability matrix directly, which was calculated by counting over the entire dataset.
# Now we will train a model to do that task instead of explicitly building the matrix.

#increasing the probability is the same as increasing log(prob) which is the same as decreasing negative log(prob)

In [None]:
# Create the training set: for every bigram, xs holds the index of the current
# character and ys the index of the character that follows it.
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        # Be careful with prints inside this loop: it runs once per bigram, and
        # a long printing loop is very hard to interrupt (the kernel can hang).
        xs.append(stoi[ch1])
        ys.append(stoi[ch2])

# Convert the index lists to integer tensors for one-hot encoding and indexing.
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
# Weights of the single linear layer: row i holds the logits (log-counts) for
# the character following character i. Seeded so that Restart & Run All
# reproduces the same training run; randn already returns float32, so no cast
# is needed and W stays a leaf tensor that autograd can update.
W = torch.randn(
    (27, 27),
    generator=torch.Generator().manual_seed(2147483647),
    requires_grad=True,
)

In [None]:
# Train the single-layer model with full-batch gradient descent.
lossi = []  # loss value of every step, for plotting

# The inputs never change, so one-hot encode them once outside the loop.
xenc = F.one_hot(xs, num_classes=27).float()
num = xs.nelement()
for k in range(100):
    # forward pass: logits -> softmax probabilities
    logits = xenc @ W        # interpreted as log-counts
    counts = logits.exp()    # analogous to the count matrix N
    probs = counts / counts.sum(1, keepdim=True)
    # loss calculation: mean negative log-probability of the correct next
    # character, plus an L2 penalty on the weights (regularization, i.e.
    # smoothing the weights)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01 * (W**2).mean()
    # The data term can also be computed as F.cross_entropy(logits, ys);
    # note that cross_entropy expects the raw logits, not the probabilities.

    # backward pass
    W.grad = None  # reset the gradient from the previous step
    loss.backward()
    print(k, loss.item())
    lossi.append(loss.item())
    # update: step against the gradient without recording it in the graph
    with torch.no_grad():
        W -= 50 * W.grad
plt.plot(lossi)

In [None]:
# After training, the loss should come out almost the same as for the model
# with the direct probability matrix, as both hold essentially the same
# information.
# Now we sample from the trained single-layer network (linear layer + softmax).
g = torch.Generator().manual_seed(2147647)
with torch.no_grad():  # inference only: do not build an autograd graph
    for i in range(20):
        ix = 0  # start from the "." token
        out = []
        while True:
            # p=P[ix]
            # One-hot encode the current character; a separate name keeps the
            # training inputs in `xenc` intact.
            x_onehot = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            logits = x_onehot @ W
            counts = logits.exp()
            p = counts / counts.sum(1, keepdim=True)
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

            out.append(itos[ix])
            if ix == 0:  # "." drawn again: end of word
                break
        print(''.join(out))

In [None]:
# Yup almost the same samples as well 