# Recurrent Neural Networks


## Single Layer RNN

$$
\begin{split}
H_t &= \phi(X_t W_{xh} + H_{t-1}W_{hh} + b_{h}) \\
O_t &= \phi(H_t W_{ho} + b_{o}) \\
\end{split}
$$

## Deep RNN

$$
\begin{split}
H_t^{(1)} &= \phi(X_t W_{xh}^{(1)} + H_{t-1}^{(1)}W_{hh}^{(1)} + b_{h}^{(1)}) \\
H_t^{(2)} &= \phi(H_t^{(1)} W_{xh}^{(2)} + H_{t-1}^{(2)}W_{hh}^{(2)} + b_{h}^{(2)}) \\
&\vdots \\
H_t^{(n)} &= \phi(H_t^{(n-1)} W_{xh}^{(n)} + H_{t-1}^{(n)}W_{hh}^{(n)} + b_{h}^{(n)}) \\
O_t &= \phi(H_t^{(n)} W_{ho} + b_{o}) \\
\end{split}
$$

## Mean Cross Entropy

$$
MCE(x_t) = -\frac{1}{n}\sum_{t=1}^n \log P(x_t | x_{t-1}, \dots, x_1)
$$

## Perplexity

$$
PPL(x_t) = e^{MCE(x_t)}
$$

# Gated Recurrent Unit (GRU)

$$
\begin{split}
z_t &= \sigma(X_t W_{xz} + H_{t-1}W_{hz} + b_{z}) \\
r_t &= \sigma(X_t W_{xr} + H_{t-1}W_{hr} + b_{r}) \\
\\
\tilde{H}_t &= \tanh(X_t W_{xh} + (r_t \odot H_{t-1})W_{hh} + b_{h}) \\
H_t &= (1 - z_t) \odot \tilde{H}_t + z_t \odot H_{t-1} = \operatorname{lerp}(\tilde{H}_t, H_{t-1}, z_t) \\
O_t &= \phi(H_t W_{ho} + b_{o}) \\
\\
\tilde{H}_t &- \text{candidate hidden state} \\
\end{split}
$$

# Long Short-Term Memory (LSTM)

$$
\begin{split}
i_t &= \sigma(X_t W_{xi} + H_{t-1}W_{hi} + b_{i}) \\
f_t &= \sigma(X_t W_{xf} + H_{t-1}W_{hf} + b_{f}) \\
o_t &= \sigma(X_t W_{xo} + H_{t-1}W_{ho} + b_{o}) \\
\\
\tilde{C}_t &= \tanh(X_t W_{xc} + H_{t-1}W_{hc} + b_{c}) \\
C_t &= f_t \odot C_{t-1} + i_t \odot \tilde{C}_t \\
H_t &= o_t \odot \tanh(C_t) \\
O_t &= \phi(H_t W_{hq} + b_{q}) \\
\\
\tilde{C}_t &- \text{candidate memory cell} \\
C_{t-1} &- \text{previous memory cell} \\
C_t &- \text{current memory cell} \\
\end{split}
$$

# Sequence to Sequence (Seq2Seq) Model

$$
\begin{split}
H_t &= f(x_t, H_{t-1}) \\
\text{Encoder: } c &= q(H_1, H_2, \dots, H_T) \\
\text{Decoder: } s_{t'} &= g(y_{t'-1}, c, s_{t'-1}) \\
\end{split}
$$

## Bilingual Evaluation Understudy (BLEU)

$$
\begin{split}
\text{BLEU} &= \exp \left( \min \left( 0, 1 - \frac{\text{len}_{\text{label}}}{\text{len}_{\text{pred}}} \right) \right) \prod_{i=1}^{n} p_i^{1/2^i} \\
\end{split}
$$