In [1]:
import numpy as np

In [2]:
def get_sample(sample_ix, ix_to_char):
    """
    Turn a sequence of character indices into a display string.

    Arguments:
    sample_ix -- iterable of indices, each one a key of ix_to_char
    ix_to_char -- mapping from index to the string it stands for

    Returns:
    txt -- the mapped strings joined together, with the first character
           upper-cased; an empty string when nothing was sampled
    """
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    # Slicing (instead of txt[0]) keeps an empty sample from raising IndexError.
    return txt[:1].upper() + txt[1:]

### Softmax Activation function

In [3]:
def softmax(x):
    """
    Numerically stable softmax taken along axis 0.

    Arguments:
    x -- array-like of scores; a 1-D vector, a column vector, or a 2-D
         array in which every column is normalised independently

    Returns:
    array of the same shape as x whose entries along axis 0 sum to 1
    """
    x = np.asarray(x)
    # Subtract the per-column maximum before exponentiating so np.exp cannot
    # overflow. np.max with axis=0 (rather than the builtin max, which iterates
    # rows and fails on multi-column input) keeps this consistent with the
    # axis-0 normalisation below.
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0, keepdims=True)

### Parameter Initialization

In [4]:
def initialize_parameters(n_a, n_x, n_y):
    """
    Build the initial RNN parameter dictionary.

    The three weight matrices are drawn from a standard normal distribution and
    scaled by 0.01 so they start out small; both bias vectors start at zero.

    Arguments:
    n_a -- number of rows of Wax, Waa and b (and columns of Waa and Wya)
    n_x -- number of columns of Wax
    n_y -- number of rows of Wya and by

    Returns:
    parameters -- dict with keys "Wax" (n_a, n_x), "Waa" (n_a, n_a),
                  "Wya" (n_y, n_a), "b" (n_a, 1) and "by" (n_y, 1)
    """
    scale = 0.01

    # Dict entries are evaluated in order, so the random draws happen as
    # Wax, then Waa, then Wya.
    return {
        "Wax": scale * np.random.randn(n_a, n_x),
        "Waa": scale * np.random.randn(n_a, n_a),
        "Wya": scale * np.random.randn(n_y, n_a),
        "b": np.zeros((n_a, 1)),
        "by": np.zeros((n_y, 1)),
    }

## RNN 

### RNN Forward Propagation

In [5]:
#Perform one step forward(One RNN cell) for time step t
def rnn_step_forward(parameters, a_prev, x):
    """
    Run a single RNN cell.

    Arguments:
    parameters -- dict holding "Waa", "Wya", "Wax", "b" and "by"
    a_prev -- activation coming from the previous time step
    x -- input for the current time step

    Returns:
    a_next -- tanh(Wax.x + Waa.a_prev + b), the new activation
    y_pred -- softmax(Wya.a_next + by), the output distribution
    """
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    Wax = parameters["Wax"]
    b = parameters["b"]
    by = parameters["by"]

    # Hidden state: combine the input and recurrent contributions, then squash.
    pre_activation = np.dot(Wax, x) + np.dot(Waa, a_prev) + b
    a_next = np.tanh(pre_activation)

    # Output layer: scores turned into probabilities by softmax.
    scores = np.dot(Wya, a_next) + by
    y_pred = softmax(scores)

    return a_next, y_pred

In [6]:
def rnn_forward(X,Y,a0,parameters,vocab_size = 27):
    """
    Run the RNN over a whole sequence and accumulate the cross-entropy loss.

    Arguments:
    X -- sequence of input indices; an entry of None is fed as an all-zero vector
    Y -- sequence of target indices, indexed in step with X
    a0 -- initial activation, stored under key -1 of the activation dict
    parameters -- dict of weights and biases passed on to rnn_step_forward
    vocab_size -- length of the one-hot input vectors (default 27)

    Returns:
    loss -- sum over time steps of -log(probability given to the target index)
    cache -- tuple (y_hat, a, x) of dicts keyed by time step, for backpropagation
    """
    # one-hot inputs, predictions and activations, each keyed by time step
    x, y_hat, a = {}, {}, {}

    loss = 0
    a[-1] = a0

    for t in range(len(X)):
        # build the one-hot column for this step (left all-zero when X[t] is None)
        x[t] = np.zeros((vocab_size, 1))

        # Identity check: `!= None` would invoke __ne__, which is element-wise
        # (and ambiguous in a boolean context) for array-like entries.
        if X[t] is not None:
            x[t][X[t]] = 1

        # one cell forward, reading the previous step's activation
        a[t], y_hat[t] = rnn_step_forward(parameters, a[t-1], x[t])

        # cross-entropy contribution of the target index at this step
        loss -= np.log(y_hat[t][Y[t], 0])

    # keep everything the backward pass needs
    cache = (y_hat, a, x)

    return loss, cache

### RNN Backward Propagation

In [7]:
#Perform one step of backward propagation for time step t
def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """
    Backpropagate through one RNN cell, accumulating into `gradients`.

    Arguments:
    dy -- gradient of the loss with respect to this step's output scores
    gradients -- dict with "dWya", "dby", "db", "dWax", "dWaa" (accumulated in
                 place) and "da_next" (read, then replaced)
    parameters -- dict providing "Wya" and "Waa"
    x -- input of this time step
    a -- activation of this time step
    a_prev -- activation of the previous time step

    Returns:
    gradients -- the same dict that was passed in, updated
    """
    Wya = parameters['Wya']
    Waa = parameters['Waa']

    # Output layer.
    gradients["dWya"] += np.dot(dy, a.T)
    gradients["dby"] += dy

    # Gradient reaching the activation: from this step's output plus whatever
    # flowed back from the following time step.
    d_act = np.dot(Wya.T, dy) + gradients['da_next']

    # tanh'(z) = 1 - tanh(z)^2, and `a` already holds tanh(z).
    d_pre = (1 - np.square(a)) * d_act

    # Hidden layer.
    gradients['db'] += d_pre
    gradients['dWax'] += np.dot(d_pre, x.T)
    gradients['dWaa'] += np.dot(d_pre, a_prev.T)

    # Hand the gradient on to the previous time step.
    gradients['da_next'] = np.dot(Waa.T, d_pre)

    return gradients

In [8]:
#Backward Propagation
def rnn_backward(X, Y, parameters, cache):
    """
    Backpropagate through time over the whole sequence.

    Arguments:
    X -- input sequence; only its length is used here
    Y -- sequence of target indices
    parameters -- dict with "Waa", "Wax", "Wya", "by" and "b"
    cache -- tuple (y_hat, a, x) as returned by rnn_forward

    Returns:
    gradients -- dict with "dWaa", "dWax", "dWya", "dby", "db" and "da_next"
    a -- the activations taken from the cache, returned unchanged
    """
    gradients = {}

    (y_hat, a, x) = cache

    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']

    # every accumulator starts at zero with the shape of its parameter
    gradients["dWaa"], gradients["dWax"], gradients["dWya"] = np.zeros_like(Waa), np.zeros_like(Wax), np.zeros_like(Wya)
    gradients["dby"], gradients["db"] = np.zeros_like(by), np.zeros_like(b)

    # For an empty sequence the forward cache holds only the initial activation
    # under key -1, so a[0] would raise KeyError; use a[-1] as the template then.
    activation_template = a[0] if len(X) > 0 else a[-1]
    gradients['da_next'] = np.zeros_like(activation_template)

    # walk the time steps from last to first
    for t in reversed(range(len(X))):
        # gradient w.r.t. the output scores: prediction minus one-hot target
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t-1])

    return gradients, a

### Update Parameters

In [10]:
def update_parameters(parameters, gradients, lr):
    """
    Apply one gradient-descent step to every parameter.

    Arguments:
    parameters -- dict with "Wax", "Waa", "Wya", "b" and "by"; updated in place
    gradients -- dict with the matching "dWax", "dWaa", "dWya", "db" and "dby"
    lr -- learning rate multiplying each gradient

    Returns:
    parameters -- the same dict that was passed in, after the update
    """
    pairs = (
        ('Wax', 'dWax'),
        ('Waa', 'dWaa'),
        ('Wya', 'dWya'),
        ('b', 'db'),
        ('by', 'dby'),
    )

    for param_key, grad_key in pairs:
        # move against the gradient, scaled by the learning rate
        parameters[param_key] += -lr * gradients[grad_key]

    return parameters