diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py
index 632569fa4fbe3..85de86a42c0ad 100644
--- a/python/paddle/fluid/layers/rnn.py
+++ b/python/paddle/fluid/layers/rnn.py
@@ -648,7 +648,12 @@ def _switch_grad(x, stop=False):
     return (final_outputs, final_states)
 
 
-def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major,
+def birnn(cell_fw,
+          cell_bw,
+          inputs,
+          initial_states=None,
+          sequence_length=None,
+          time_major=False,
           **kwargs):
     """
     birnn creates a bidirectional recurrent neural network specified by
@@ -686,8 +691,7 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major,
         else the shape is `[batch_size, time_steps, size]`, where size is
         `cell_fw.hidden_size + cell_bw.hidden_size`.
         final_states (tuple): A tuple of the final states of the forward
-            cell and backward cell.
-
+            cell and backward cell.
 
     Examples:
@@ -696,12 +700,22 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major,
 
        .. code-block:: python
 
            import paddle
            paddle.disable_static()
 
-           cell_fw = LSTMCell(16, 32)
-           cell_bw = LSTMCell(16, 32)
-           inputs = paddle.rand((2, 23, 16))
-           outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs)
+           cell_fw = paddle.nn.LSTMCell(16, 32)
+           cell_bw = paddle.nn.LSTMCell(16, 32)
+
+           inputs = paddle.rand((4, 23, 16))
+           hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
+           hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
+           initial_states = ((hf, cf), (hb, cb))
+           outputs, final_states = paddle.nn.functional.birnn(
+               cell_fw, cell_bw, inputs, initial_states)
 
     """
-    states_fw, states_bw = initial_states
+    if initial_states is None:
+        states_fw = cell_fw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0)
+        states_bw = cell_bw.get_initial_states(
+            batch_ref=inputs, batch_dim_idx=1 if time_major else 0)
+    else:
+        states_fw, states_bw = initial_states
     outputs_fw, states_fw = rnn(cell_fw,
                                 inputs,
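Note on the birnn hunks above: `initial_states` is now optional, and when it is omitted each direction's initial state is built by the cell's `get_initial_states` from the input batch (zero-filled by default). A minimal usage sketch of that default path, assuming the patch above is applied; shapes mirror the docstring example:

    import paddle
    paddle.disable_static()

    cell_fw = paddle.nn.LSTMCell(16, 32)
    cell_bw = paddle.nn.LSTMCell(16, 32)
    inputs = paddle.rand((4, 23, 16))  # [batch_size, time_steps, input_size]

    # With initial_states left as None, birnn derives default states
    # from the cells instead of requiring the caller to build them.
    outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs)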
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 2f5756459709a..6f1c5f199ac99 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -280,7 +280,7 @@ class SimpleRNNCell(RNNCellBase):
     Arguments:
         input_size (int): The input size.
         hidden_size (int): The hidden size.
-        nonlinearity (str, optional): The activation in the SimpleRNN cell.
+        activation (str, optional): The activation in the SimpleRNN cell.
             It can be `tanh` or `relu`. Defaults to `tanh`.
         weight_ih_attr (ParamAttr, optional): The parameter attribute for
             `weight_ih`. Default: None.
@@ -342,7 +342,7 @@ class SimpleRNNCell(RNNCellBase):
     def __init__(self,
                  input_size,
                  hidden_size,
-                 nonlinearity="tanh",
+                 activation="tanh",
                  weight_ih_attr=None,
                  weight_hh_attr=None,
                  bias_ih_attr=None,
@@ -371,13 +371,13 @@ def __init__(self,
         self.input_size = input_size
         self.hidden_size = hidden_size
 
-        if nonlinearity not in ["tanh", "relu"]:
+        if activation not in ["tanh", "relu"]:
             raise ValueError(
-                "nonlinearity for SimpleRNNCell should be tanh or relu, "
-                "but get {}".format(nonlinearity))
-        self.nonlinearity = nonlinearity
-        self._nonlinear_fn = paddle.tanh \
-            if nonlinearity == "tanh" \
+                "activation for SimpleRNNCell should be tanh or relu, "
+                "but get {}".format(activation))
+        self.activation = activation
+        self._activation_fn = paddle.tanh \
+            if activation == "tanh" \
             else F.relu
 
     def forward(self, inputs, states=None):
@@ -390,7 +390,7 @@ def forward(self, inputs, states=None):
         h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
         if self.bias_hh is not None:
             h2h += self.bias_hh
-        h = self._nonlinear_fn(i2h + h2h)
+        h = self._activation_fn(i2h + h2h)
         return h, h
 
     @property
@@ -479,9 +479,10 @@ class LSTMCell(RNNCellBase):
 
             x = paddle.randn((4, 16))
             prev_h = paddle.randn((4, 32))
+            prev_c = paddle.randn((4, 32))
 
             cell = paddle.nn.LSTMCell(16, 32)
-            y, h = cell(x, prev_h)
+            y, (h, c) = cell(x, (prev_h, prev_c))
 
     """
 
@@ -758,7 +759,7 @@ class RNN(Layer):
             prev_h = paddle.randn((4, 32))
 
             cell = paddle.nn.SimpleRNNCell(16, 32)
-            rnn = paddle.RNN(cell)
+            rnn = paddle.nn.RNN(cell)
             outputs, final_states = rnn(inputs, prev_h)
 
     """
@@ -848,9 +849,9 @@ class BiRNN(Layer):
            import paddle
            paddle.disable_static()
 
-           cell_fw = LSTMCell(16, 32)
-           cell_bw = LSTMCell(16, 32)
-           rnn = BidirectionalRNN(cell_fw, cell_bw)
+           cell_fw = paddle.nn.LSTMCell(16, 32)
+           cell_bw = paddle.nn.LSTMCell(16, 32)
+           rnn = paddle.nn.BiRNN(cell_fw, cell_bw)
 
            inputs = paddle.rand((2, 23, 16))
            outputs, final_states = rnn(inputs)
@@ -953,7 +954,7 @@ class SimpleRNN(RNNMixin):
         input_size (int): The input size for the first layer's cell.
         hidden_size (int): The hidden size for each layer's cell.
         num_layers (int, optional): Number of layers. Defaults to 1.
-        nonlinearity (str, optional): The activation in each SimpleRNN cell. It can be
+        activation (str, optional): The activation in each SimpleRNN cell. It can be
            `tanh` or `relu`. Defaults to `tanh`.
         direction (str, optional): The direction of the network. It can be "forward",
            "backward" and "bidirectional". Defaults to "forward".
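The rename from `nonlinearity` to `activation` applies to both `SimpleRNNCell` and the multi-layer `SimpleRNN` wrapper below. A short sketch of the renamed keyword, assuming the patch is applied; tensor sizes are illustrative:

    import paddle
    paddle.disable_static()

    x = paddle.randn((4, 16))
    prev_h = paddle.randn((4, 32))

    # "activation" replaces the old "nonlinearity" keyword; values other
    # than "tanh" or "relu" raise the ValueError shown in __init__ above.
    cell = paddle.nn.SimpleRNNCell(16, 32, activation="relu")
    y, h = cell(x, prev_h)  # the cell returns the new hidden state twice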
@@ -1018,7 +1019,7 @@ def __init__(self,
                  input_size,
                  hidden_size,
                  num_layers=1,
-                 nonlinearity="tanh",
+                 activation="tanh",
                  direction="forward",
                  dropout=0.,
                  time_major=False,
@@ -1031,29 +1032,29 @@ def __init__(self,
 
         if direction in ["forward", "backward"]:
             is_reverse = direction == "backward"
-            cell = SimpleRNNCell(input_size, hidden_size, nonlinearity,
+            cell = SimpleRNNCell(input_size, hidden_size, activation,
                                  weight_ih_attr, weight_hh_attr, bias_ih_attr,
                                  bias_hh_attr)
             self.append(RNN(cell, is_reverse, time_major))
             for i in range(1, num_layers):
-                cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity,
+                cell = SimpleRNNCell(hidden_size, hidden_size, activation,
                                      weight_ih_attr, weight_hh_attr,
                                      bias_ih_attr, bias_hh_attr)
                 self.append(RNN(cell, is_reverse, time_major))
         elif direction == "bidirectional":
-            cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity,
+            cell_fw = SimpleRNNCell(input_size, hidden_size, activation,
                                     weight_ih_attr, weight_hh_attr,
                                     bias_ih_attr, bias_hh_attr)
-            cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity,
+            cell_bw = SimpleRNNCell(input_size, hidden_size, activation,
                                     weight_ih_attr, weight_hh_attr,
                                     bias_ih_attr, bias_hh_attr)
             self.append(BiRNN(cell_fw, cell_bw, time_major))
             for i in range(1, num_layers):
                 cell_fw = SimpleRNNCell(
-                    2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr,
+                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
                     weight_hh_attr, bias_ih_attr, bias_hh_attr)
                 cell_bw = SimpleRNNCell(
-                    2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr,
+                    2 * hidden_size, hidden_size, activation, weight_ih_attr,
                     weight_hh_attr, bias_ih_attr, bias_hh_attr)
                 self.append(BiRNN(cell_fw, cell_bw, time_major))
         else:
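`SimpleRNN` threads the renamed `activation` argument into every cell it stacks, in both the unidirectional and bidirectional branches above. A hedged usage sketch against the patched signature; layer sizes are illustrative:

    import paddle
    paddle.disable_static()

    inputs = paddle.rand((4, 23, 16))

    # Positional order after the patch:
    #   input_size, hidden_size, num_layers, activation, direction, ...
    rnn = paddle.nn.SimpleRNN(16, 32, num_layers=2, activation="relu",
                              direction="bidirectional")
    outputs, final_states = rnn(inputs)  # outputs: [4, 23, 64] (fw/bw concat)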