Recurrent Neural Networks

In these neural networks the output is influenced by previous inputs too, so useful for sequential data. For example, predicting next word in the sentence. Basically, they have loops which feeds back to the same layer.

$ h_{t} = W_{hh}x_{t-1} + W_{hx}x_{t}$ \
$ y_{t} = W_{yh}h_{t}$

But these Neural Networks are very hard to train because of vanishing and exploding gradient problem. So, we have its updated version LSTM : Long Short Term Memory Networks where we don't consider all the previous inputs but channelize them into short and long channels using Forget Gate.

LSTM

In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim

In [2]:
# LSTM cell

class LSTMcell:

    def __init__(self,m,r):

        # percentage long term to remember
        self.w_input_1 = nn.Parameter(torch.rand(r,m),requires_grad=True)
        self.w_short_1 = nn.Parameter(torch.rand(r,r),requires_grad=True)
        self.b_1 = nn.Parameter(torch.rand(r,1),requires_grad=True)
        # sigma

        # percentage potential memory to remember
        self.w_input_2 = nn.Parameter(torch.rand(r,m),requires_grad=True)
        self.w_short_2 = nn.Parameter(torch.rand(r,r),requires_grad=True)
        self.b_2 = nn.Parameter(torch.rand(r,1),requires_grad=True)
        # sigma

        # Potential Long term memory for current input
        self.w_input_3 = nn.Parameter(torch.rand(r,m),requires_grad=True)
        self.w_short_3 = nn.Parameter(torch.rand(r,r),requires_grad=True)
        self.b_3 = nn.Parameter(torch.rand(r,1),requires_grad=True)
        # tanh

        # New short
        self.w_input_4 = nn.Parameter(torch.rand(r,m),requires_grad=True)
        self.w_short_4 = nn.Parameter(torch.rand(r,r),requires_grad=True)
        self.b_4 = nn.Parameter(torch.rand(r,1),requires_grad=True)
        # sigmoid
        self.parameters = torch.tensor([self.w_input_1,self.w_input_2,self.w_input_3,self.w_input_4,
                           self.w_short_1,self.w_short_2,self.w_short_3,self.w_short_4,
                           self.b_1,self.b_2,self.b_3,self.b_4],requires_grad=True)


    def integrate(self, input, short, long):
        # percentage long term to remember
        
        o1 = torch.matmul(self.w_input_1, input)
        o2 = torch.matmul(self.w_short_1, short)
        o3 = torch.mul(long,torch.sigmoid(o1+o2 + self.b_1))

        # percentage potential memory to remember
        o4 = torch.matmul(self.w_input_2, input)
        o5 = torch.matmul(self.w_short_2, short) 
        o6 = torch.sigmoid(o4+o5 + self.b_2)

        # Potential Long term memory for current input
        o7 = torch.matmul(self.w_input_3, input)
        o8 = torch.matmul(self.w_short_3, short)
        o9 = torch.tanh(o7+o8+ self.b_3) 

        newLong = o3 + torch.mul(o6,o9)
        

        # new short
        o10 = torch.matmul(self.w_input_4, input)
        o11 = torch.matmul(self.w_short_4, short)
        o12 = torch.sigmoid(o11+o10 + self.b_4)
        
        newShort = torch.mul(o12,(torch.tanh(newLong)))

        return newShort,newLong
    

In [19]:
class LSTM:

    def __init__(self):
        self.layers = []
        self.ms = []
        self.rs = []
        self.parameters = []
        self.loss_function = nn.MSELoss()
    
    def add_layer(self,m,r,n): 
        # m : Input dimension 
        # n :number of cells in the layer 
        # r : output dimension
        self.layers.append([])  
        self.ms.append(m)
        self.rs.append(r)   
        for i in range(n):
            newcell = LSTMcell(m,r)
            self.layers[-1].append(newcell)
            self.parameters.append(newcell.parameters)

    def forward(self, input):
        l = len(self.layers)
        short = torch.zeros((l, self.rs[0],1), dtype = torch.float32)
        long = torch.zeros((l,self.rs[0],1),dtype = torch.float32)
        for i in range(len(input)):
            current_input = input[i].unsqueeze(1)
            for j in range(l):
                short_,long_ = short[j],long[j]
                for lstm_cell in self.layers[j]:
                    short_,long_ = lstm_cell.integrate(current_input,short_,long_)
                short[j],long[j] = short_,long_
                current_input = short_
        return short_
    
    def out(self,input):
        out = []
        for i in range(len(input)):
            out.append(self.forward(input[i]))
        return torch.tensor(out,requires_grad=True).unsqueeze(-1)
    
    def train(self,X,y,max_epochs):
        optimizer = optim.Adam(self.parameters)
        for i in range(max_epochs):
            out = self.out(X)
            loss = self.loss_function(out,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [20]:
X = torch.tensor([[0,0],[1,1]],dtype = torch.float32).unsqueeze(-1)
y = torch.tensor([0,1],dtype = torch.float32).unsqueeze(-1)

In [27]:
lst = LSTM()
lst.add_layer(1,1,1)

In [28]:
lst.out(X)

tensor([[0.2411],
        [0.3355]], grad_fn=<UnsqueezeBackward0>)

In [29]:
lst.train(X,y,10000)

In [30]:
y_hat = lst.out(X)
y_hat

tensor([[0.2411],
        [0.3355]], grad_fn=<UnsqueezeBackward0>)

: 

In [9]:
y_hat - y

tensor([[ 0.7510],
        [-0.3948]], grad_fn=<SubBackward0>)