diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst
index 24389c2d8574d..75c1b35246486 100644
--- a/doc/api/v1/trainer_config_helpers/layers.rst
+++ b/doc/api/v1/trainer_config_helpers/layers.rst
@@ -498,6 +498,12 @@ hsigmoid
     :members: hsigmoid
     :noindex:
 
+smooth_l1_cost
+--------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: smooth_l1_cost
+    :noindex:
+
 Check Layer
 ============
 
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 2a02baf17ba0d..154cfe24432f3 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -419,6 +419,11 @@ hsigmoid
 .. autoclass:: paddle.v2.layer.hsigmoid
     :noindex:
 
+smooth_l1_cost
+--------------
+.. autoclass:: paddle.v2.layer.smooth_l1_cost
+    :noindex:
+
 Check Layer
 ============
 
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 4ae5b828707eb..69d5830dd2a1a 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -217,7 +217,7 @@ void SmoothL1CostLayer::forwardImp(Matrix& output,
     targetCpu->copyFrom(target);
     outputCpu->copyFrom(output);
     labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *(labelCpu));
+    targetCpu->smoothL1(*outputCpu, *labelCpu);
     target.copyFrom(*targetCpu);
   } else {
     target.smoothL1(output, *label.value);
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 569a6840f0d44..14c0b33ec1a62 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -164,9 +164,11 @@ class SumOfSquaresCostLayer : public CostLayer {
  * tasks.
  * \f[
  * L =
- * (output - label)^2 * 0.5 / -1 < (output - label) < 1 /
- * (output - label) - 0.5 / otherwise /
+ * 0.5 * x^2    if / |x| < 1 /
+ * |x| - 0.5    / otherwise /
  * \f]
+ *
+ * x = output - label
  */
 class SmoothL1CostLayer : public CostLayer {
  public:
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 0d7bd8c3b8522..e1e8e7fae7ca4 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1679,13 +1679,13 @@ TEST(Layer, smooth_l1) {
   TestConfig config;
   config.layerConfig.set_type("smooth_l1");
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false, 2.0);
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
   }
 }
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 55a7344495f8e..6ac61be0bf1b7 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3616,17 +3616,18 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(label.getWidth(), dim);
   CHECK_EQ(getWidth(), (size_t)1);
-  real* out = output.getData();
+
   real* cost = getData();
+  real* out = output.getData();
   real* lbl = label.getData();
 
-  for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) {
+  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
-      cost[j] = std::fabs(out[j] - lbl[j]);
-      if (cost[j] < 1.0)
-        cost[j] = 0.5 * cost[j] * cost[j];
+      real absVal = std::fabs(out[j] - lbl[j]);
+      if (absVal < 1.0)
+        cost[i] += 0.5 * absVal * absVal;
       else
-        cost[j] = cost[j] - 0.5;
+        cost[i] += absVal - 0.5;
     }
   }
 }
@@ -3640,17 +3641,20 @@ void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
   CHECK_EQ(label.getHeight(), numSamples);
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
+  CHECK_EQ(getWidth(), dim);
+
   real* out = output.getData();
-  real* cost = getData();
   real* lbl = label.getData();
+  real* grad = getData();
 
-  // f'(x) = x if |x| < 1
-  //       = sign(x) otherwise
-  for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) {
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
-      cost[j] = out[j] - lbl[j];
-      if (std::fabs(cost[j]) >= 1) cost[j] = (0 < cost[j]) - (cost[j] < 0);
+      real val = out[j] - lbl[j];
+      if (std::fabs(val) < 1) {
+        grad[j] += val;
+      } else {
+        grad[j] += (real(0) < val) - (val < real(0));
+      }
     }
   }
 }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index dc89419c40f8d..32e31fe2c446f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2119,6 +2119,7 @@ def init(cls, name, inputs, device=None, coeff=1.):
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
 define_cost('HuberTwoClass', 'huber')
 define_cost('SumCost', 'sum_cost')
+define_cost('SmoothL1Cost', 'smooth_l1')
 
 
 @config_layer('hsigmoid')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index f906126d87941..b9e3d26404227 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -116,6 +116,7 @@
     'spp_layer',
     'pad_layer',
     'eos_layer',
+    'smooth_l1_cost',
     'layer_support',
 ]
 
@@ -201,6 +202,7 @@ class LayerType(object):
     SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
     MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
     SUM_COST = "sum_cost"
+    SMOOTH_L1 = "smooth_l1"
 
     @staticmethod
     def is_layer_type(type_name):
@@ -5249,8 +5251,6 @@ def multi_binary_label_cross_entropy(input,
     :type input: LayerOutput
     :param label: The input label.
     :type input: LayerOutput
-    :param type: The type of cost.
-    :type type: basestring
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
     :param coeff: The coefficient affects the gradient in the backward.
@@ -5279,3 +5279,52 @@ def multi_binary_label_cross_entropy(input,
         LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
         parents=[input, label],
         size=1)
+
+
+@wrap_name_default()
+@layer_support()
+def smooth_l1_cost(input, label, name=None, layer_attr=None):
+    """
+    This is an L1 loss, but smoother near zero. It requires that the
+    sizes of input and label are equal. The formula is as follows,
+
+    .. math::
+
+        L = \sum_{i} smooth_{L1}(input_i - label_i)
+
+    in which
+
+    .. math::
+
+        smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
+
+    More details can be found by referring to `Fast R-CNN
+    `_
+
+    .. code-block:: python
+
+       cost = smooth_l1_cost(input=input_layer,
+                             label=label_layer)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is not necessary.
+    :type name: None|basestring
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert isinstance(label, LayerOutput)
+    assert input.size == label.size
+
+    Layer(
+        name=name,
+        type=LayerType.SMOOTH_L1,
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 164d365c15b8a..c5dc8e1aab08d 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad)
+test_seq_concat_reshape test_pad test_smooth_l1)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
new file mode 100644
index 0000000000000..4aa041ea2e173
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
@@ -0,0 +1,40 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__smooth_l1_cost_0__"
+  type: "smooth_l1"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+input_layer_names: "input"
+input_layer_names: "label"
+output_layer_names: "__smooth_l1_cost_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "label"
+  layer_names: "__smooth_l1_cost_0__"
+  input_layer_names: "input"
+  input_layer_names: "label"
+  output_layer_names: "__smooth_l1_cost_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
new file mode 100644
index 0000000000000..66629662dd916
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+lbl = data_layer(name='label', size=300)
+smooth_l1 = smooth_l1_cost(input=data, label=lbl)
+
+outputs(smooth_l1)
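
Note on the math (not part of the patch): the reworked CpuMatrix::smoothL1 accumulates a per-sample cost, cost[i] = sum_j smooth_L1(out[j] - lbl[j]), and smoothL1Bp writes the elementwise gradient, x if |x| < 1 and sign(x) otherwise. Below is a minimal sketch of the same computation for checking the new code by hand; NumPy and the function names here are illustrative assumptions, not Paddle APIs.

    # Illustrative reference only -- assumes NumPy; not part of the patch.
    import numpy as np

    def smooth_l1_forward(output, label):
        # Per-sample cost: sum_j smooth_L1(output_ij - label_ij),
        # mirroring the cost[i] += ... accumulation in CpuMatrix::smoothL1.
        diff = output - label
        a = np.abs(diff)
        return np.where(a < 1.0, 0.5 * a * a, a - 0.5).sum(axis=1)

    def smooth_l1_backward(output, label):
        # Gradient w.r.t. output: x if |x| < 1, else sign(x),
        # mirroring grad[j] += ... in CpuMatrix::smoothL1Bp.
        diff = output - label
        return np.where(np.abs(diff) < 1.0, diff, np.sign(diff))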