PaddlePaddle · reyoung · Sep 29, 2016 · Sep 27, 2016 · Sep 28, 2016 · Sep 28, 2016
diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst
@@ -142,6 +142,7 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
 The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
 
 .. code-block:: python
+
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
     trg_embedding = embedding_layer(

diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst
@@ -4,6 +4,12 @@ BaseSGDOptimizer
     :members: BaseSGDOptimizer
     :noindex:
 
+MomentumOptimizer
+=================
+..  automodule:: paddle.trainer_config_helpers.optimizers
+    :members: MomentumOptimizer
+    :noindex:
+
 AdamOptimizer
 =============
 ..  automodule:: paddle.trainer_config_helpers.optimizers

diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
@@ -47,7 +47,7 @@ add_test(NAME test_CompareTwoOpts
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
             --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
-            --num_passes=1 --need_high_accuracy=1
+            --num_passes=1 --need_high_accuracy=0
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
 ################# test_CompareSparse ##################

diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list
@@ -0,0 +1 @@
+trainer/tests/mnist_bin_part
diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
@@ -12,32 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
+from paddle.trainer_config_helpers import *
 
 ################################### Data Configuration ###################################
-TrainData(ProtoData(files = "train.list"))
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-Settings(
-    learning_rate_decay_a = 0.0,
-    learning_rate_decay_b = 0.0,
-    learning_rate = 1e-03,
-    batch_size = 1000,
-    algorithm = 'sgd',
-    num_batches_per_send_parameter = 1,
-    num_batches_per_get_parameter = 1,
-    learning_method='sparse_momentum',
-)
-default_momentum(0.5)
+settings(batch_size = 1000,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
-Layer(type = "data", name = "input", size = 784)
-Layer(inputs = [Input("input", parameter_name = "_layer1.w")], name = "layer1", bias = Bias(parameter_name = "_layer1.bias"), active_type = "sigmoid", type = "fc", size = 800)
-Layer(inputs = [Input("layer1", parameter_name = "_layer2.w")], name = "layer2", bias = Bias(parameter_name = "_layer2.bias"), active_type = "sigmoid", type = "fc", size = 800)
-#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "margin", type = "fc", size = 10)
-#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), type = "fc", size = 10)
-Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w")], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "softmax", type = "fc", size = 10)
-Layer(type = "data", name = "label", size = 1)
-Layer(inputs = [Input("output"), Input("label")], type = "multi-class-cross-entropy", name = "cost")
-#Layer(inputs = [Input("output"), Input("label")], type = "huber", name = "cost")
-Evaluator(inputs=["output", "label"], type = "classification_error", name = "classification_error")
-Inputs("input", "label")
-Outputs("cost")
+data = data_layer(name ="input", size=784)
+
+fc1 = fc_layer(input=data, size=800,
+               bias_attr=True,
+               act=SigmoidActivation())
+
+fc2 = fc_layer(input=fc1, size=800,
+               bias_attr=True,
+               act=SigmoidActivation())
+
+output = fc_layer(input=[fc1, fc2], size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=1)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
@@ -12,32 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
+from paddle.trainer_config_helpers import *
 
 ################################### Data Configuration ###################################
-TrainData(ProtoData(files = "train.list"))
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-Settings(
-    learning_rate_decay_a = 0.0,
-    learning_rate_decay_b = 0.0,
-    learning_rate = 1e-03,
-    batch_size = 1000,
-    algorithm = 'sgd',
-    num_batches_per_send_parameter = 1,
-    num_batches_per_get_parameter = 1,
-    learning_method='momentum',
-)
-default_momentum(0.5)
+settings(batch_size = 1000,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
-Layer(type = "data", name = "input", size = 784)
-Layer(inputs = [Input("input", parameter_name = "_layer1.w")], name = "layer1", bias = Bias(parameter_name = "_layer1.bias"), active_type = "sigmoid", type = "fc", size = 800)
-Layer(inputs = [Input("layer1", parameter_name = "_layer2.w")], name = "layer2", bias = Bias(parameter_name = "_layer2.bias"), active_type = "sigmoid", type = "fc", size = 800)
-#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "margin", type = "fc", size = 10)
-#Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w", decay_rate = 0.02)], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), type = "fc", size = 10)
-Layer(inputs = [Input("layer2", parameter_name = "_layer_output.w")], name = "output", bias = Bias(parameter_name = "_layer_output.bias"), active_type = "softmax", type = "fc", size = 10)
-Layer(type = "data", name = "label", size = 1)
-Layer(inputs = [Input("output"), Input("label")], type = "multi-class-cross-entropy", name = "cost")
-#Layer(inputs = [Input("output"), Input("label")], type = "huber", name = "cost")
-Evaluator(inputs=["output", "label"], type = "classification_error", name = "classification_error")
-Inputs("input", "label")
-Outputs("cost")
+data = data_layer(name ="input", size=784)
+
+fc1 = fc_layer(input=data, size=800,
+               bias_attr=True,
+               act=SigmoidActivation())
+
+fc2 = fc_layer(input=fc1, size=800,
+               bias_attr=True,
+               act=SigmoidActivation())
+
+output = fc_layer(input=[fc1, fc2], size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=1)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
@@ -71,16 +71,41 @@ def to_setting_kwargs(self):
 
 
 class MomentumOptimizer(BaseSGDOptimizer):
+    """
+    MomentumOptimizer.
+
+    When sparse=True, the update scheme:
+
+    ..  math::
+
+        \\alpha_t &= \\alpha_{t-1} / k \\\\
+        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
+        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
+        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
+        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
+
+    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
+    :math:`\\gamma_t` is learning rate at the t'th step.
+
+    :param sparse: with sparse support or not.
+    :type sparse: bool
+    """
     def extra_settings(self):
         default_momentum(self.momentum)
 
     def to_setting_kwargs(self):
-        return {
-            'learning_method': 'momentum'
-        }
+        if self.sparse:
+            return {
+                'learning_method': 'sparse_momentum'
+            }
+        else:
+            return {
+                'learning_method': 'momentum'
+            }
 
-    def __init__(self, momentum=None):
+    def __init__(self, momentum=None, sparse=False):
         self.momentum = momentum
+        self.sparse = sparse
 
 
 class AdamOptimizer(BaseSGDOptimizer):