In [1]:
import numpy as np

#### Activation functions

In [2]:
def softmax(x):
    """
    Softmax normalised along axis 0 (each column is one set of scores).

    Arguments:
    x -- array-like of scores, e.g. shape (n_y, m) with one column per example

    Returns:
    Array with the same shape as x whose entries along axis 0 are
    non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift every column by its OWN maximum. Shifting by the global maximum
    # lets a column that lies far below it underflow to all zeros, which then
    # turns into 0/0 = NaN in the normalisation.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

In [3]:
def sigmoid(x):
    """
    Element-wise logistic function 1 / (1 + exp(-x)).

    Arguments:
    x -- scalar or array-like of real values

    Returns:
    Value(s) in [0, 1] with the same shape as x (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    # exp() is only ever applied to non-positive numbers, so it cannot
    # overflow; the naive form overflows (RuntimeWarning) for large negative x.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same function.
    out = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as-is.
    return out[()]

### Forward Propagation

#### one cell step

In [4]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Run one LSTM cell for a single time step.

    Arguments:
    xt -- input at time step t, shape (n_x, m)
    a_prev -- hidden state from the previous time step, shape (n_a, m)
    c_prev -- cell state from the previous time step (combined element-wise
              with the (n_a, m) gate activations)
    parameters -- python dictionary containing:
                        Wf, bf -- forget gate weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wu, bu -- update gate weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wc, bc -- candidate ("tanh") weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wo, bo -- output gate weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wy, by -- hidden-state-to-output weights (n_y, n_a) and bias (n_y, 1)

    Returns:
    a_next -- next hidden state
    c_next -- next cell state
    yt_pred -- softmax prediction for this time step
    cache -- tuple (a_next, c_next, a_prev, c_prev, ft, ut, cdasht, ot, xt, parameters)
             kept for the backward pass
    """
    # Pull every weight/bias out of the dictionary up front, in a fixed order
    Wf, bf, Wu, bu, Wc, bc, Wo, bo, Wy, by = (
        parameters[key]
        for key in ("Wf", "bf", "Wu", "bu", "Wc", "bc", "Wo", "bo", "Wy", "by")
    )

    n_x, m = xt.shape
    n_a, _ = a_prev.shape

    # Stack a_prev on top of xt: rows [0, n_a) hold a_prev, the rest hold xt
    stacked = np.zeros([n_a + n_x, m])
    stacked[:n_a, :] = a_prev
    stacked[n_a:, :] = xt

    def affine(W, b):
        # Shared pre-activation W . [a_prev; xt] + b used by all four gates
        return np.dot(W, stacked) + b

    # Gates (forget, update, output) and the candidate cell value
    ft = sigmoid(affine(Wf, bf))
    ut = sigmoid(affine(Wu, bu))
    ot = sigmoid(affine(Wo, bo))
    cdasht = np.tanh(affine(Wc, bc))

    # New cell state: keep part of the old memory, write part of the candidate
    c_next = ut * cdasht + ft * c_prev

    # New hidden state and the prediction derived from it
    a_next = ot * np.tanh(c_next)
    yt_pred = softmax(np.dot(Wy, a_next) + by)

    # Everything the backward step needs, in the order it unpacks them
    cache = (a_next, c_next, a_prev, c_prev, ft, ut, cdasht, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache

In [5]:
# Smoke test of a single forward step: n_x=3 inputs, n_a=5 hidden units,
# m=10 examples, n_y=2 outputs.
np.random.seed(1)

# The draw order (xt, a_prev, c_prev, then the parameters) fixes the values
# obtained from the seeded generator, so it must not be reordered.
xt_tmp, a_prev_tmp, c_prev_tmp = (np.random.randn(rows, 10) for rows in (3, 5, 5))

parameters_tmp = {
    name: np.random.randn(*shape)
    for name, shape in (
        ('Wf', (5, 5+3)), ('bf', (5, 1)),
        ('Wu', (5, 5+3)), ('bu', (5, 1)),
        ('Wo', (5, 5+3)), ('bo', (5, 1)),
        ('Wc', (5, 5+3)), ('bc', (5, 1)),
        ('Wy', (2, 5)), ('by', (2, 1)),
    )
}

a_next_tmp, c_next_tmp, yt_tmp, cache_tmp = lstm_cell_forward(
    xt_tmp, a_prev_tmp, c_prev_tmp, parameters_tmp
)

#### Forward steps (all cells)

In [6]:
def lstm_forward(x, a0, parameters):
    """
    Run the LSTM over every time step of the input sequence.

    Arguments:
    x -- input of shape (n_x, m, T_x)
    a0 -- initial hidden state of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf, bf -- forget gate weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wu, bu -- update gate weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wc, bc -- candidate ("tanh") weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wo, bo -- output gate weights (n_a, n_a + n_x) and bias (n_a, 1)
                        Wy, by -- hidden-state-to-output weights (n_y, n_a) and bias (n_y, 1)

    Returns:
    a -- hidden states for every time step, shape (n_a, m, T_x)
    y -- predictions for every time step, shape (n_y, m, T_x)
    c -- cell states for every time step, shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) for the backward pass
    """
    n_x, m, T_x = x.shape
    n_a, _ = a0.shape
    n_y, _ = parameters["Wy"].shape

    # Per-time-step history of hidden states, predictions and cell states
    a = np.zeros([n_a, m, T_x])
    y = np.zeros([n_y, m, T_x])
    c = np.zeros([n_a, m, T_x])

    # The recurrence starts from a0 and an all-zero cell state
    a_t = a0
    c_t = np.zeros([n_a, m])
    step_caches = []

    for t in range(T_x):
        # Advance one cell, feeding back the states it just produced
        a_t, c_t, y_t, step_cache = lstm_cell_forward(x[:, :, t], a_t, c_t, parameters)

        a[:, :, t] = a_t
        c[:, :, t] = c_t
        y[:, :, t] = y_t
        step_caches.append(step_cache)

    # Bundle the per-step caches with the raw input for backpropagation
    return a, y, c, (step_caches, x)

In [7]:
# Smoke test of the full forward pass: n_x=3, m=10, T_x=7, n_a=5, n_y=2.
np.random.seed(1)
x_tmp = np.random.randn(3, 10, 7)
a0_tmp = np.random.randn(5, 10)

# A dict literal evaluates its entries top to bottom, so the parameters are
# drawn from the seeded generator in the listed order.
parameters_tmp = {
    'Wf': np.random.randn(5, 5+3),
    'bf': np.random.randn(5, 1),
    'Wu': np.random.randn(5, 5+3),
    'bu': np.random.randn(5, 1),
    'Wo': np.random.randn(5, 5+3),
    'bo': np.random.randn(5, 1),
    'Wc': np.random.randn(5, 5+3),
    'bc': np.random.randn(5, 1),
    'Wy': np.random.randn(2, 5),
    'by': np.random.randn(2, 1),
}

a_tmp, y_tmp, c_tmp, caches_tmp = lstm_forward(x_tmp, a0_tmp, parameters_tmp)

### Backward Propagation

#### one cell step

In [8]:
def lstm_cell_backward(da_next, dc_next, cache):
    """
    Backward pass through one LSTM cell (a single time step).

    Arguments:
    da_next -- gradient of the next hidden state, of shape (n_a, m)
    dc_next -- gradient of the next cell state, of shape (n_a, m)
    cache -- tuple stored by the forward step:
             (a_next, c_next, a_prev, c_prev, ft, ut, cdasht, ot, xt, parameters)

    Returns:
    gradients -- python dictionary with keys
                 dxt, da_prev, dc_prev  (gradients flowing to the inputs) and
                 dWf, dbf, dWu, dbu, dWc, dbc, dWo, dbo  (parameter gradients)
    """
    (a_next, c_next, a_prev, c_prev, ft, ut, cdasht, ot, xt, parameters) = cache
    n_a, _ = da_next.shape

    tanh_c = np.tanh(c_next)
    # Part of da_next that reaches c_next through a_next = ot * tanh(c_next)
    da_through_c = da_next * ot * (1 - tanh_c ** 2)

    # Gradients w.r.t. the gate pre-activations (sigmoid' = s(1-s), tanh' = 1-t^2)
    d_out = da_next * tanh_c * ot * (1 - ot)
    d_forget = (dc_next * c_prev + da_through_c * c_prev) * ft * (1 - ft)
    d_update = (dc_next * cdasht + da_through_c * cdasht) * (1 - ut) * ut
    d_cand = (dc_next * ut + da_through_c * ut) * (1 - cdasht ** 2)

    # Bias gradients: sum over the example axis, keep the column shape
    dbf = np.sum(d_forget, axis=1, keepdims=True)
    dbu = np.sum(d_update, axis=1, keepdims=True)
    dbc = np.sum(d_cand, axis=1, keepdims=True)
    dbo = np.sum(d_out, axis=1, keepdims=True)

    # Weight gradients: every gate saw the same stacked input [a_prev; xt]
    stacked_T = np.concatenate((a_prev, xt), axis=0).T
    dWf = np.dot(d_forget, stacked_T)
    dWu = np.dot(d_update, stacked_T)
    dWc = np.dot(d_cand, stacked_T)
    dWo = np.dot(d_out, stacked_T)

    gate_grads = (("Wf", d_forget), ("Wu", d_update), ("Wc", d_cand), ("Wo", d_out))

    def push_back(cols):
        # Send the four gate gradients back through the chosen weight columns
        f, u, c, o = (np.dot(parameters[key][:, cols].T, grad) for key, grad in gate_grads)
        return f + u + c + o

    # The first n_a weight columns multiply a_prev, the remaining ones multiply xt
    da_prev = push_back(slice(None, n_a))
    dxt = push_back(slice(n_a, None))

    dc_prev = dc_next * ft + ot * (1 - np.square(tanh_c)) * ft * da_next

    gradients = {
        "dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev,
        "dWf": dWf, "dbf": dbf,
        "dWu": dWu, "dbu": dbu,
        "dWc": dWc, "dbc": dbc,
        "dWo": dWo, "dbo": dbo,
    }

    return gradients
    

In [9]:
# Smoke test of a single backward step: n_x=3, n_a=5, m=10, n_y=2.
np.random.seed(1)

# Inputs first, parameters next, upstream gradients last: this draw order
# determines the values taken from the seeded generator.
xt_tmp, a_prev_tmp, c_prev_tmp = (np.random.randn(rows, 10) for rows in (3, 5, 5))

parameters_tmp = {
    name: np.random.randn(*shape)
    for name, shape in (
        ('Wf', (5, 5+3)), ('bf', (5, 1)),
        ('Wu', (5, 5+3)), ('bu', (5, 1)),
        ('Wo', (5, 5+3)), ('bo', (5, 1)),
        ('Wc', (5, 5+3)), ('bc', (5, 1)),
        ('Wy', (2, 5)), ('by', (2, 1)),
    )
}

# Forward step to obtain the cache consumed by the backward step
a_next_tmp, c_next_tmp, yt_tmp, cache_tmp = lstm_cell_forward(
    xt_tmp, a_prev_tmp, c_prev_tmp, parameters_tmp
)

# Random upstream gradients for the hidden state and the cell state
da_next_tmp, dc_next_tmp = (np.random.randn(5, 10) for _ in range(2))
gradients = lstm_cell_backward(da_next_tmp, dc_next_tmp, cache_tmp)

#### Backward steps (all cells)

In [10]:
def lstm_backward(da, caches):
    """
    Backward pass over the whole sequence, walking the time steps in reverse.

    Arguments:
    da -- gradients w.r.t. the hidden states, of shape (n_a, m, T_x)
    caches -- output of lstm_forward: (list of per-step caches, x)

    Only caches[0 .. T_x-1] are visited, where T_x is taken from da; if da has
    fewer time steps than the forward pass, the later caches are ignored.

    Returns:
    gradients -- python dictionary with keys
                 dx  -- gradient w.r.t. the inputs, shape (n_x, m, T_x)
                 da0 -- gradient w.r.t. the initial hidden state, shape (n_a, m)
                 dWf, dbf, dWu, dbu, dWc, dbc, dWo, dbo -- parameter gradients
                        summed over all time steps
    """
    (caches, x) = caches
    # Only xt of the first step is needed here, to recover n_x (and m)
    (_, _, _, _, _, _, _, _, x1, _) = caches[0]

    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # Running totals, created in the order they appear in the returned dict
    totals = {}
    for gate in "fuco":
        totals["dW" + gate] = np.zeros((n_a, n_a + n_x))
        totals["db" + gate] = np.zeros((n_a, 1))

    dx = np.zeros((n_x, m, T_x))
    # Gradients handed from step t+1 back to step t; nothing follows the last step
    da_prevt = np.zeros((n_a, m))
    dc_prevt = np.zeros((n_a, m))

    for t in reversed(range(T_x)):
        # Hidden-state gradient = direct contribution at t + what flowed back from t+1
        step = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, caches[t])

        dx[:, :, t] = step["dxt"]
        for key in totals:
            totals[key] += step[key]

        da_prevt = step["da_prev"]
        dc_prevt = step["dc_prev"]

    # After the loop da_prevt is the gradient w.r.t. a0. Reading it from the
    # running variable (rather than from the last step's dict) keeps this
    # well-defined -- all zeros -- when da has no time steps at all.
    da0 = da_prevt

    gradients = {"dx": dx, "da0": da0}
    gradients.update(totals)

    return gradients

In [11]:
# Smoke test of the full backward pass: n_x=3, m=10, T_x=7, n_a=5, n_y=2.
np.random.seed(1)
x_tmp = np.random.randn(3,10,7)
a0_tmp = np.random.randn(5,10)

# Parameters are drawn in this fixed order from the seeded generator
parameters_tmp = {}
parameters_tmp['Wf'] = np.random.randn(5, 5+3)
parameters_tmp['bf'] = np.random.randn(5,1)
parameters_tmp['Wu'] = np.random.randn(5, 5+3)
parameters_tmp['bu'] = np.random.randn(5,1)
parameters_tmp['Wo'] = np.random.randn(5, 5+3)
parameters_tmp['bo'] = np.random.randn(5,1)
parameters_tmp['Wc'] = np.random.randn(5, 5+3)
parameters_tmp['bc'] = np.random.randn(5,1)
parameters_tmp['Wy'] = np.random.randn(2,5)
parameters_tmp['by'] = np.random.randn(2,1)

a_tmp, y_tmp, c_tmp, caches_tmp = lstm_forward(x_tmp, a0_tmp, parameters_tmp)

# The upstream gradient needs one slice per hidden state produced by the
# forward pass, i.e. the same shape as a_tmp (n_a, m, T_x). With fewer time
# steps, lstm_backward would only backpropagate through the first few cells
# and silently skip the rest.
da_tmp = np.random.randn(*a_tmp.shape)
gradients = lstm_backward(da_tmp, caches_tmp)