In [1]:
import numpy as np
from numpy import ndarray,dot
from typing import Dict

In [2]:
# LSTM layer - The computational unit of an LSTM is a cell (as in GRUs and vanilla RNNs).
# The cell is a recursive function which computes the various vectors
# associated with the layer, given a timestep. When viewed in this manner, a recurrent layer has feedback connections.

# One can unroll the layer timestep by timestep; the network can then be viewed as a feed-forward network.

# Any LSTM architecture has three gates controlling the flow of data within the cell.
  # 1) Input gate 2) Output gate 3) Forget gate.
  # The three gates output an input gate activation vector, an output gate activation vector and a forget gate activation vector.
  # Other vectors associated with the cell are the cell state vector and the hidden/output vector.
    
  

In [3]:
# LSTM - architecture 1
# The cell 
# 3 gates - input gate, forget gate and the output gate.
# 3 activation vectors associated with three gates.
# The cell takes three vectors as input at time t: the input vector at time t, the hidden vector at time t-1 and the cell
# state vector at time t-1.

# The cell computes 6 different vectors - the input gate activation vector, the output gate activation vector, the forget
# gate activation vector, cell input activation vectors, the cell state vector, and the output/hidden vector.

# For computing these vectors, 4 sets of weight tensors are used.


In [4]:
# Activation Functions used 

def tanh(S:ndarray)->ndarray:
    """Apply the hyperbolic tangent element-wise to ``S`` and return the result."""
    activated = np.tanh(S)
    return activated

def sigmoid(W:ndarray)->ndarray:
    """Logistic sigmoid ``1 / (1 + exp(-W))`` applied element-wise.

    Args:
        W: Array (or scalar) of pre-activation values.

    Returns:
        The sigmoid of each element, with the same shape as ``W``.
    """
    # log(1 + exp(-W)) is evaluated with logaddexp so that large negative
    # inputs do not overflow exp(); exp of its negation is the sigmoid.
    return np.exp(-np.logaddexp(0.0, -W))

In [20]:
# Architecture 1
def LSTM_type_1(S:ndarray,W:Dict[str,ndarray])->Dict[str,list]:
    """Run LSTM architecture 1 over one sequence and record every per-timestep vector.

    The three gates and the cell input are each computed from the current input
    vector and the previous hidden vector. The hidden and cell state vectors
    both start as zero vectors.

    Args:
        S: 2-D array; the loop iterates over its first axis, using each row as
            the input vector of one timestep.
        W: Weights keyed by 'w_x', 'u_x' and 'b_x' for x in 'g' (forget gate),
            'i' (input gate), 'o' (output gate) and 'c' (cell input). 'w_x'
            multiplies the input vector, 'u_x' multiplies the previous hidden
            vector and 'b_x' is added as the bias.

    Returns:
        Dict mapping 'Output', 'Output_gate_activations',
        'input_gate_activations', 'forget_gate_activations',
        'cell_input_activations' and 'cell_state_vectors' to a list holding
        one vector per timestep. Note the capital 'O' in
        'Output_gate_activations'.

    Raises:
        AssertionError: if ``S`` is not 2-dimensional.
    """
    assert S.ndim == 2  # A given data point is a 2-D tensor.

    hidden_size = W['u_i'].shape[1]
    h_t = np.zeros([hidden_size])     # hidden vector before the first timestep
    cell_t = np.zeros([hidden_size])  # cell state vector before the first timestep

    # Per-timestep histories of every vector the cell computes.
    output_gate_2d = []    # output gate activation vectors
    hidden_vector_2d = []  # output/hidden vectors
    input_gate_2d = []     # input gate activation vectors
    forget_gate_2d = []    # forget gate activation vectors
    cell_input_2d = []     # cell input activation vectors
    cell_state_2d = []     # cell state vectors

    for input_t in S:  # one pass through the cell per timestep
        g_t = sigmoid(dot(W['w_g'],input_t) + dot(W['u_g'],h_t) + W['b_g'])  # forget gate activation
        i_t = sigmoid(dot(W['w_i'],input_t) + dot(W['u_i'],h_t) + W['b_i'])  # input gate activation
        o_t = sigmoid(dot(W['w_o'],input_t) + dot(W['u_o'],h_t) + W['b_o'])  # output gate activation

        c_t = tanh(dot(W['w_c'],input_t) + dot(W['u_c'],h_t) + W['b_c'])  # cell input activation

        # The forget gate scales the old state; the input gate scales the new candidate.
        cell_t = g_t*cell_t + i_t*c_t
        h_t = o_t*(tanh(cell_t))  # hidden/output vector

        output_gate_2d.append(o_t)
        hidden_vector_2d.append(h_t)
        input_gate_2d.append(i_t)
        forget_gate_2d.append(g_t)
        cell_input_2d.append(c_t)
        cell_state_2d.append(cell_t)

    Outputs:Dict[str,list] = {'Output':hidden_vector_2d,'Output_gate_activations':output_gate_2d,
                              'input_gate_activations':input_gate_2d, 'forget_gate_activations':forget_gate_2d,
                              'cell_input_activations':cell_input_2d, 'cell_state_vectors':cell_state_2d}

    return Outputs
      

In [7]:
# One data point: 6 rows (timesteps), each a 3-component input vector drawn uniformly from [0, 1).
S = np.random.random(size=(6, 3))

In [8]:
# Input gate weights: w_i acts on the input vector, u_i on the recurrent vector, b_i is the bias.
w_i = np.random.standard_normal((4, 3))
u_i = np.random.standard_normal((4, 4))
b_i = np.random.standard_normal(4)

In [9]:
# Forget gate weights: w_g acts on the input vector, u_g on the recurrent vector, b_g is the bias.
w_g = np.random.standard_normal((4, 3))
u_g = np.random.standard_normal((4, 4))
b_g = np.random.standard_normal(4)

In [10]:
# Output gate weights: w_o acts on the input vector, u_o on the recurrent vector, b_o is the bias.
w_o = np.random.standard_normal((4, 3))
u_o = np.random.standard_normal((4, 4))
b_o = np.random.standard_normal(4)

In [11]:
# Cell input activation weights: w_c acts on the input vector, u_c on the hidden vector, b_c is the bias.
w_c = np.random.standard_normal((4, 3))
u_c = np.random.standard_normal((4, 4))
b_c = np.random.standard_normal(4)

In [13]:
# Bundle the weight tensors under the key names that LSTM_type_1 looks up.
# The annotation uses the type `str` itself rather than the string literal 'str'.
W:Dict[str,ndarray] = {
    'w_i':w_i, 'u_i':u_i, 'b_i':b_i,   # input gate
    'w_g':w_g, 'u_g':u_g, 'b_g':b_g,   # forget gate
    'w_o':w_o, 'u_o':u_o, 'b_o':b_o,   # output gate
    'w_c':w_c, 'u_c':u_c, 'b_c':b_c,   # cell input activation
}

In [21]:
# Forward pass of architecture 1; the returned dict is displayed as the cell output.
# NOTE(review): the saved output below lists 5 vectors per key although S is
# defined above with 6 rows - presumably stale; re-run to confirm.
LSTM_type_1(S, W)

{'Output': [array([0.17439339, 0.02694052, 0.27625327, 0.20628832]),
  array([0.16820564, 0.01714319, 0.46939144, 0.42642058]),
  array([0.15065099, 0.02594592, 0.48241038, 0.45394399]),
  array([0.20277785, 0.03496466, 0.53784283, 0.53583787]),
  array([0.08714051, 0.06191521, 0.54533189, 0.47631811])],
 'Output_gate_activations': [array([0.55438195, 0.16819073, 0.57731005, 0.9725676 ]),
  array([0.42611739, 0.14280374, 0.63957956, 0.98117514]),
  array([0.38611126, 0.15395771, 0.5994467 , 0.98034336]),
  array([0.50265993, 0.17054152, 0.59850067, 0.9881205 ]),
  array([0.6364184 , 0.23961794, 0.56575661, 0.99133956])],
 'input_gate_activations': [array([0.40633609, 0.42544549, 0.93615345, 0.25050981]),
  array([0.30470172, 0.51817046, 0.91715033, 0.36917452]),
  array([0.29032344, 0.70725064, 0.79037803, 0.58823696]),
  array([0.40192294, 0.59791122, 0.92210083, 0.34003067]),
  array([0.6750443 , 0.59266961, 0.96540777, 0.14457964])],
 'forget_gate_activations': [array([0.40879941, 0

In [22]:
# Architecture 2 - LSTM

# Cell.
# 3 gates - output gate, forget gate, input gate.
# 2 input vectors into the cell at time t -> the input vector at time t and the cell state vector at time t-1.
# 5 output vectors are computed -> output gate activation vector, forget gate activation vector, input gate activation vector,
# cell state vector and the hidden/output vector.

# 4 sets of weights are used for the computations in a layer.

In [33]:
def LSTM_type_2(S:ndarray,W:Dict[str,ndarray])->Dict[str,list]:
    """Run LSTM architecture 2 over one sequence and record every per-timestep vector.

    In this variant the three gates are computed from the current input vector
    and the previous cell state vector; the hidden vector is produced at every
    timestep but is not fed back into the cell. The cell state starts as a
    zero vector.

    Args:
        S: 2-D array; the loop iterates over its first axis, using each row as
            the input vector of one timestep.
        W: Weights keyed by 'w_x', 'u_x' and 'b_x' for x in 'g' (forget gate),
            'o' (output gate) and 'i' (input gate), plus 'w_cell' and 'b_cell'
            for the cell state update. 'w_*' multiplies the input vector,
            'u_*' multiplies the previous cell state vector and 'b_*' is added
            as the bias.

    Returns:
        Dict mapping 'Output', 'input_gate_activations',
        'output_gate_activations', 'forget_gate_activations' and
        'cell_state_vectors' to a list holding one vector per timestep.

    Raises:
        AssertionError: if ``S`` is not 2-dimensional.
    """
    assert S.ndim == 2  # S is a 2-D tensor: one data point.

    # Only the cell state is carried between timesteps, so no initial hidden
    # vector is needed.
    cell_t = np.zeros([W['u_i'].shape[1]])  # cell state vector before the first timestep

    # Per-timestep histories of every vector the cell computes.
    input_gate_2d = []     # input gate activation vectors
    output_gate_2d = []    # output gate activation vectors
    forget_gate_2d = []    # forget gate activation vectors
    cell_state_2d = []     # cell state vectors
    hidden_vector_2d = []  # hidden/output vectors

    for input_t in S:  # one pass through the cell per timestep
        # All three gates read the cell state from the previous timestep.
        g_t = sigmoid(dot(W['w_g'],input_t) + dot(W['u_g'],cell_t) + W['b_g'])  # forget gate activation
        o_t = sigmoid(dot(W['w_o'],input_t) + dot(W['u_o'],cell_t) + W['b_o'])  # output gate activation
        i_t = sigmoid(dot(W['w_i'],input_t) + dot(W['u_i'],cell_t) + W['b_i'])  # input gate activation
        cell_t = g_t*cell_t + i_t*tanh(dot(W['w_cell'],input_t) + W['b_cell'])  # cell state vector
        h_t = tanh(o_t*cell_t)  # hidden/output vector

        input_gate_2d.append(i_t)
        output_gate_2d.append(o_t)
        forget_gate_2d.append(g_t)
        cell_state_2d.append(cell_t)
        hidden_vector_2d.append(h_t)

    Outputs:Dict[str,list] = {'Output':hidden_vector_2d,'input_gate_activations':input_gate_2d,
                              'output_gate_activations':output_gate_2d, 'forget_gate_activations':forget_gate_2d,
                              'cell_state_vectors':cell_state_2d}

    return Outputs
        
        

In [25]:
# One data point: 5 rows (timesteps), each a 3-component input vector drawn uniformly from [0, 1).
S = np.random.random(size=(5, 3))

In [26]:
# Input gate weights: w_i acts on the input vector, u_i on the recurrent vector, b_i is the bias.
w_i = np.random.standard_normal((4, 3))
u_i = np.random.standard_normal((4, 4))
b_i = np.random.standard_normal(4)

In [27]:
# Forget gate weights: w_g acts on the input vector, u_g on the recurrent vector, b_g is the bias.
w_g = np.random.standard_normal((4, 3))
u_g = np.random.standard_normal((4, 4))
b_g = np.random.standard_normal(4)

In [28]:
# Output gate weights: w_o acts on the input vector, u_o on the recurrent vector, b_o is the bias.
w_o = np.random.standard_normal((4, 3))
u_o = np.random.standard_normal((4, 4))
b_o = np.random.standard_normal(4)

In [35]:
# Cell state update weights: w_cell acts on the input vector, b_cell is the bias.
w_cell = np.random.standard_normal((4, 3))
b_cell = np.random.standard_normal(4)


In [36]:
# Bundle the weight tensors under the key names that LSTM_type_2 looks up.
W:Dict[str,ndarray] = {
    'w_i':w_i, 'u_i':u_i, 'b_i':b_i,        # input gate
    'w_g':w_g, 'u_g':u_g, 'b_g':b_g,        # forget gate
    'w_o':w_o, 'u_o':u_o, 'b_o':b_o,        # output gate
    'w_cell':w_cell, 'b_cell':b_cell,       # cell state update
}

In [37]:
# Forward pass of architecture 2; the returned dict is displayed as the cell output.
LSTM_type_2(S, W)

{'Output': [array([ 0.043998  , -0.46532108,  0.37986034,  0.17472248]),
  array([ 0.13071077, -0.84398672,  0.52900674,  0.15981191]),
  array([ 0.27257167, -0.89982642,  0.47380328,  0.0332575 ]),
  array([ 0.37563906, -0.96440244,  0.59308615,  0.14129909]),
  array([ 0.44684989, -0.99208196,  0.55544949,  0.12635785])],
 'input_gate_activations': [array([0.1106751 , 0.59922474, 0.69854514, 0.89807662]),
  array([0.17910391, 0.87913883, 0.83274048, 0.83370517]),
  array([0.33358193, 0.93778776, 0.68190019, 0.35213479]),
  array([0.06162075, 0.842106  , 0.43981007, 0.61605649]),
  array([0.02152257, 0.93826267, 0.69557725, 0.71327531])],
 'output_gate_activations': [array([0.3982651 , 0.84811662, 0.80517803, 0.59034432]),
  array([0.53636741, 0.91657649, 0.56168988, 0.70389607]),
  array([0.50934436, 0.96621721, 0.64558847, 0.79744436]),
  array([0.65586699, 0.97877357, 0.7865797 , 0.83442332]),
  array([0.80773249, 0.97370113, 0.44669217, 0.90007516])],
 'forget_gate_activations': [