In [26]:
import numpy as np
import torch
import torch.nn.functional as F
import sys

In [27]:
inputString = [2,45,30,55,10]
outputString = [45,30,55,10,1]
numFeatures = 100 #Embedding Size
vocabSize = 80 #Words available

In [28]:
embeddings = [] #List of random Embeddings
for i in range(len(inputString)):
    x = np.random.randn(numFeatures,1) #Random Embedding
    embeddings.append(x)

In [29]:
embeddings, embeddings[0].shape, len(embeddings)

([array([[-0.41644167],
         [ 1.4572139 ],
         [-2.09613552],
         [ 0.44993645],
         [ 0.43373268],
         [-0.65288481],
         [ 1.08985742],
         [-1.32731822],
         [-0.85311324],
         [-0.6549107 ],
         [-0.06563781],
         [-0.14035496],
         [-0.14781258],
         [ 1.63290567],
         [ 0.66386207],
         [ 0.49313037],
         [-0.81791479],
         [-0.83306597],
         [-0.66398429],
         [-1.12629319],
         [ 1.25167803],
         [-2.32075385],
         [ 1.03341746],
         [ 0.77049461],
         [ 2.85476441],
         [-0.00637923],
         [ 0.48444024],
         [-0.00570254],
         [ 0.35042009],
         [ 1.05564339],
         [-1.49156967],
         [-0.89166366],
         [-0.15370519],
         [-0.2347041 ],
         [-0.53921484],
         [-0.06306569],
         [-0.78370676],
         [-0.47916896],
         [-0.09174645],
         [-0.19486551],
         [-0.33787608],
         [-0.489

In [30]:
def getOneHot(idx): #One-hot based on library size
    one_hot = np.zeros((vocabSize,1))
    one_hot[idx] = 1 
    return one_hot

In [31]:
print(getOneHot(2)) #one hot vector of word

[[0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [32]:
# lets make out weights and biases
numUnits = 50
h0 = torch.tensor(np.zeros((numUnits,1)))
Wh = torch.tensor(np.random.uniform(0,1,(numUnits,numUnits)),requires_grad=True)
Wx = torch.tensor(np.random.uniform(0,1,(numUnits,numFeatures)),requires_grad=True)
Wy = torch.tensor(np.random.uniform(0,1,(vocabSize,numUnits)),requires_grad=True)

In [33]:
print(Wh.shape,Wx.shape,Wy.shape,h0.shape)

torch.Size([50, 50]) torch.Size([50, 100]) torch.Size([80, 50]) torch.Size([50, 1])


In [34]:
#Define Forward Pass based on a single timestep
def stepForward(xt,Wx,Wh,Wy,prevMemory):
    x_frd = torch.matmul(Wx,torch.from_numpy(xt)) #Wx*X(t)
    h_frd = torch.matmul(Wh,prevMemory)
    ht = torch.tanh(x_frd+h_frd) #Flowing Activation
    yt_hat = F.softmax(torch.matmul(Wy,ht), dim = 0)
    return ht, yt_hat

In [35]:
ht, yt_hat = stepForward(embeddings[0],Wx,Wh,Wy,h0) #single input

In [36]:
ht.shape

torch.Size([50, 1])

In [37]:
yt_hat.shape , yt_hat.sum()

(torch.Size([80, 1]), tensor(1., dtype=torch.float64, grad_fn=<SumBackward0>))

In [38]:
def fullForwardRNN(X,Wx,Wh,Wy,prevMemory):#Unrolling of network
    y_hat = []
    for t in range(len(X)): #List of embeddings
        ht, yt_hat = stepForward(X[t],Wx,Wh,Wy,prevMemory)
        prevMemory = ht
        y_hat.append(yt_hat)
    return y_hat

In [39]:
y_hat = fullForwardRNN(embeddings,Wx,Wh,Wy,h0)

In [40]:
y_hat[0].shape

torch.Size([80, 1])

In [41]:
def computeLoss(y,y_hat):
    loss = 0
    for yi,yi_hat in zip(y,y_hat):
        Li = -torch.log2(yi_hat[yi==1]) #Crossentropy Loss
        loss += Li
    return loss/len(y) #avg loss


In [42]:
y = []
for idx in outputString:
    y.append(getOneHot(idx)) #Onehot the labels to compare to output

In [43]:
print(computeLoss(y,y_hat)) #Loss

tensor([8.0380], dtype=torch.float64, grad_fn=<DivBackward0>)


In [44]:
def updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr):
    with torch.no_grad(): #Does not affect gradient information
        Wx -= lr*dWx
        Wh -= lr*dWh
        Wy -= lr*dWy
    return Wx,Wh,Wy

In [45]:
#Train Function
def trainRNN(X,y,Wx,Wh,Wy,prevMemory,lr,nepoch):
    losses = []
    for epoch in range(nepoch):
        y_hat = fullForwardRNN(embeddings,Wx,Wh,Wy,prevMemory)
        loss = computeLoss(y,y_hat)
        loss.backward() #Compute gradient for all the parameters wrt Loss
        losses.append(loss)
        print("Loss after epoch =%d: %f" %(epoch,loss))
        sys.stdout.flush() 
        dWx = Wx.grad.data
        dWh = Wh.grad.data
        dWy = Wy.grad.data
        Wx,Wh,Wy = updateParams(Wx,Wh,Wy,dWx,dWh,dWy,lr)
        Wx.grad.data.zero_() #Good to reset Gradient for nect compute
        Wh.grad.data.zero_()
        Wy.grad.data.zero_()
    return Wx,Wh,Wy,losses

In [47]:
Wx,Wh,Wy,losses = trainRNN(embeddings,y,Wx,Wh,Wy,h0,0.001,100)

Loss after epoch =0: 8.037961
Loss after epoch =1: 8.006729
Loss after epoch =2: 7.975593
Loss after epoch =3: 7.944553
Loss after epoch =4: 7.913609
Loss after epoch =5: 7.882759
Loss after epoch =6: 7.852004
Loss after epoch =7: 7.821344
Loss after epoch =8: 7.790778
Loss after epoch =9: 7.760307
Loss after epoch =10: 7.729930
Loss after epoch =11: 7.699647
Loss after epoch =12: 7.669459
Loss after epoch =13: 7.639366
Loss after epoch =14: 7.609367
Loss after epoch =15: 7.579464
Loss after epoch =16: 7.549655
Loss after epoch =17: 7.519942
Loss after epoch =18: 7.490324
Loss after epoch =19: 7.460803
Loss after epoch =20: 7.431378
Loss after epoch =21: 7.402049
Loss after epoch =22: 7.372817
Loss after epoch =23: 7.343682
Loss after epoch =24: 7.314645
Loss after epoch =25: 7.285706
Loss after epoch =26: 7.256865
Loss after epoch =27: 7.228123
Loss after epoch =28: 7.199480
Loss after epoch =29: 7.170937
Loss after epoch =30: 7.142493
Loss after epoch =31: 7.114150
Loss after epoch =