<a href="https://colab.research.google.com/github/Sanim27/DeepL_from_scratch/blob/main/RNN%26LSTM_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's dive into the world of RNNs and LSTMs.

First, let's implement a single RNN cell.

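$$a^{\langle t \rangle} = \tanh\left(W_{ax}\, x^{\langle t \rangle} + W_{aa}\, a^{\langle t-1 \rangle} + b_a\right)$$

$$\hat{y}^{\langle t \rangle} = \mathrm{softmax}\left(W_{ya}\, a^{\langle t \rangle} + b_y\right)$$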

In [18]:
def softmax(z):
    """Column-wise softmax: normalise `z` along axis 0 so each column sums to 1.

    The per-column maximum is subtracted before exponentiating. This leaves
    the result unchanged mathematically but keeps `np.exp` from overflowing
    on large inputs.

    Args:
        z: array of scores; the softmax is taken over axis 0.

    Returns:
        Array with the same shape as `z` holding the normalised exponentials.
    """
    column_peak = np.max(z, axis=0, keepdims=True)
    exponentials = np.exp(z - column_peak)
    column_total = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_total

In [12]:
import numpy as np

In [13]:
def rnn_cell_forward(xt, a_prev, parameters):
  """Run one time step of a vanilla RNN cell.

  Computes
      a_next  = tanh(Wax . xt + Waa . a_prev + ba)
      yt_pred = softmax(Wya . a_next + by)

  Args:
    xt: input for the current time step (presumably shape (n_x, m) -- confirm
      against the caller).
    a_prev: hidden state coming from the previous time step.
    parameters: dict with the keys "Wax", "Waa", "Wya", "ba" and "by".

  Returns:
    A tuple (a_next, yt_pred, cache) where cache is
    (a_next, a_prev, xt, parameters), i.e. the values a backward pass needs.
  """
  Wax, Waa, Wya = (parameters[key] for key in ("Wax", "Waa", "Wya"))
  ba, by = parameters["ba"], parameters["by"]

  # Hidden-state update: affine mix of the input and previous state, then tanh.
  pre_activation = np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba
  a_next = np.tanh(pre_activation)

  # Output head: affine map of the new hidden state, normalised by softmax.
  logits = np.dot(Wya, a_next) + by
  yt_pred = softmax(logits)

  return a_next, yt_pred, (a_next, a_prev, xt, parameters)

Now let's build the full forward loop over all time steps.

In [14]:
def rnn_forward(x,a0,parameters):
  """Run the RNN cell forward over every time step of `x`.

  Args:
    x: input array of shape (n_x, m, T_x); the last axis indexes time steps.
    a0: hidden state fed into the first time step.
    parameters: dict passed through to `rnn_cell_forward`; its "Wya" entry,
      of shape (n_y, n_a), is also read here to size the output buffers.

  Returns:
    a: hidden states for every step, shape (n_a, m, T_x).
    y_pred: predictions for every step, shape (n_y, m, T_x).
    caches: tuple (list of per-step caches, x).
  """
  _, m, T_x = x.shape
  n_y, n_a = parameters["Wya"].shape

  a = np.zeros((n_a, m, T_x))
  y_pred = np.zeros((n_y, m, T_x))
  step_caches = []

  hidden = a0
  for t in range(T_x):
    # Feed the slice for step t together with the state left by step t-1.
    hidden, y_step, step_cache = rnn_cell_forward(x[..., t], hidden, parameters)
    a[..., t] = hidden
    y_pred[..., t] = y_step
    step_caches.append(step_cache)

  return a, y_pred, (step_caches, x)

On to the LSTM now.

In [15]:
def sigmoid(z):
    """Numerically stable logistic sigmoid, 1 / (1 + exp(-z)), applied element-wise.

    The naive formula evaluates exp(-z), which overflows for large negative
    `z`. Here only exp(-|z|) is computed, which always lies in (0, 1]:
        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))
    Both branches are algebraically the same function.

    Args:
        z: scalar or NumPy array.

    Returns:
        Sigmoid of `z`, with the same shape as `z` (a NumPy scalar for scalar
        input).
    """
    decay = np.exp(-np.abs(z))  # never exceeds 1, so it cannot overflow
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for scalar input; [()] unwraps it to a scalar
    # and is a no-op for arrays with one or more dimensions.
    return result[()]

In [16]:
def lstm_cell_forward(xt,a_prev,c_prev,parameters):
  """Run one time step of an LSTM cell.

  With concat = [a_prev; xt] (stacked along axis 0):
      ft     = sigmoid(Wf . concat + bf)    forget gate
      it     = sigmoid(Wi . concat + bi)    update gate
      cct    = tanh(Wc . concat + bc)       candidate cell state
      c_next = ft * c_prev + it * cct
      ot     = sigmoid(Wo . concat + bo)    output gate
      a_next = ot * tanh(c_next)
      yt_pred = softmax(Wy . a_next + by)

  Args:
    xt: input for the current time step; must be 2-D.
    a_prev: hidden state from the previous time step.
    c_prev: cell (memory) state from the previous time step.
    parameters: dict with the keys "Wf", "bf", "Wi", "bi", "Wc", "bc",
      "Wo", "bo", "Wy" and "by".

  Returns:
    A tuple (a_next, c_next, yt_pred, cache) where cache is
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters).
  """
  Wf, bf = parameters["Wf"], parameters["bf"]
  Wi, bi = parameters["Wi"], parameters["bi"]
  Wc, bc = parameters["Wc"], parameters["bc"]
  Wo, bo = parameters["Wo"], parameters["bo"]
  Wy, by = parameters["Wy"], parameters["by"]

  # The sizes are not used below; the unpacking only enforces that xt and Wy
  # are 2-D.
  _n_x, _m = xt.shape
  _n_y, _n_a = Wy.shape

  stacked = np.concatenate([a_prev, xt])

  def affine(weight, bias):
    # Shared pre-activation: every gate sees the same stacked [a_prev; xt].
    return np.dot(weight, stacked) + bias

  ft = sigmoid(affine(Wf, bf))
  it = sigmoid(affine(Wi, bi))
  cct = np.tanh(affine(Wc, bc))
  ot = sigmoid(affine(Wo, bo))

  # Keep part of the old memory (ft) and write part of the candidate (it).
  c_next = ft * c_prev + it * cct
  a_next = ot * np.tanh(c_next)

  yt_pred = softmax(np.dot(Wy, a_next) + by)

  cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
  return a_next, c_next, yt_pred, cache

Now the LSTM forward pass over all time steps.

In [17]:
def lstm_forward(x,a0,parameters):
  """Run the LSTM cell forward over every time step of `x`.

  The cell state starts as zeros of shape (n_a, m); the hidden state starts
  as `a0`.

  Args:
    x: input array of shape (n_x, m, T_x); the last axis indexes time steps.
    a0: hidden state fed into the first time step.
    parameters: dict passed through to `lstm_cell_forward`; its "Wy" entry,
      of shape (n_y, n_a), is also read here to size the output buffers.

  Returns:
    a: hidden states for every step, shape (n_a, m, T_x).
    y: predictions for every step, shape (n_y, m, T_x).
    c: cell states for every step, shape (n_a, m, T_x).
    caches: tuple (list of per-step caches, x).
  """
  _, m, T_x = x.shape
  n_y, n_a = parameters["Wy"].shape

  a = np.zeros((n_a, m, T_x))
  c = np.zeros((n_a, m, T_x))
  y = np.zeros((n_y, m, T_x))
  step_caches = []

  hidden = a0
  memory = np.zeros((n_a, m))
  for t in range(T_x):
    hidden, memory, y_step, step_cache = lstm_cell_forward(
        x[..., t], hidden, memory, parameters)
    a[..., t] = hidden
    c[..., t] = memory
    y[..., t] = y_step
    step_caches.append(step_cache)

  return a, y, c, (step_caches, x)

Backpropagation time!

In [19]:
def rnn_cell_backward(da_next,cache):
  """Backward pass through one RNN cell time step (hidden-state path only).

  The forward step is a_next = tanh(Wax . xt + Waa . a_prev + ba). Given the
  gradient of the loss with respect to a_next, this returns the gradients
  with respect to the step's inputs and the recurrent parameters. Only "Wax"
  and "Waa" are read from the parameters dict; the output-layer entries are
  not needed here and are not looked up.

  Args:
    da_next: gradient of the loss with respect to `a_next`, same shape as
      `a_next`.
    cache: tuple (a_next, a_prev, xt, parameters) as produced by
      `rnn_cell_forward`.

  Returns:
    Dict with the keys:
      "dxt":     gradient with respect to xt.
      "da_prev": gradient with respect to a_prev.
      "dWax":    gradient with respect to Wax.
      "dWaa":    gradient with respect to Waa.
      "dba":     gradient with respect to ba, summed over axis 1 with
                 keepdims (the batch axis, assuming columns are examples).
  """
  (a_next, a_prev, xt, parameters) = cache
  Wax = parameters["Wax"]
  Waa = parameters["Waa"]

  # a_next = tanh(z), so d tanh(z)/dz = 1 - a_next**2; chain it with da_next.
  dtanh = da_next * (1 - a_next ** 2)

  # z = Wax . xt + Waa . a_prev + ba  ->  standard affine-layer gradients.
  dxt = np.dot(Wax.T, dtanh)
  dWax = np.dot(dtanh, xt.T)

  da_prev = np.dot(Waa.T, dtanh)
  dWaa = np.dot(dtanh, a_prev.T)

  dba = np.sum(dtanh, axis=1, keepdims=True)

  gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

  return gradients

In [20]:
def rnn_backward(da,caches):
  """Backpropagate through time over the whole RNN sequence.

  Walks the time steps from last to first. At each step the upstream gradient
  for that step's hidden state, da[:, :, t], is added to the gradient flowing
  back from step t+1, and the sum is passed to `rnn_cell_backward`. Parameter
  gradients are accumulated across steps.

  Args:
    da: gradient of the loss with respect to every hidden state, shape
      (n_a, m, T_x).
    caches: tuple (list of per-step caches, x) as returned by `rnn_forward`.

  Returns:
    Dict with the keys:
      "dx":   gradient with respect to the inputs, shape (n_x, m, T_x).
      "da0":  gradient with respect to the initial hidden state.
      "dWax": accumulated gradient, shape (n_a, n_x).
      "dWaa": accumulated gradient, shape (n_a, n_a).
      "dba":  accumulated gradient, shape (n_a, 1).
  """
  step_caches, _ = caches
  _, _, first_xt, _ = step_caches[0]

  n_a, _, T_x = da.shape
  n_x, m = first_xt.shape  # the batch size is taken from the cached input

  dx = np.zeros((n_x, m, T_x))
  dWax = np.zeros((n_a, n_x))
  dWaa = np.zeros((n_a, n_a))
  dba = np.zeros((n_a, 1))

  # Gradient arriving from the following time step; nothing follows the last.
  carried = np.zeros((n_a, m))
  for t in range(T_x - 1, -1, -1):
    step = rnn_cell_backward(da[:, :, t] + carried, step_caches[t])
    dx[:, :, t] = step["dxt"]
    dWax += step["dWax"]
    dWaa += step["dWaa"]
    dba += step["dba"]
    carried = step["da_prev"]

  # After the first step is processed, the carried gradient is that of a0.
  return {"dx": dx, "da0": carried, "dWax": dWax, "dWaa": dWaa,"dba": dba}