From 306cd665bca1e8e5fd48995de0490d4607ea10a5 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Wed, 19 Aug 2020 12:44:44 +0000 Subject: [PATCH 01/18] add norm 2.0 api, test=develop --- python/paddle/nn/layer/norm.py | 205 ++++++++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 1beba62c1809f..26df6908cc8e3 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -22,5 +22,208 @@ from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS __all__ = [ - 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm' + 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', + 'BatchNorm', 'BatchNorm2d' ] + + +class BatchNorm2d(layers.Layer): + """ + Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If the Initializer of the weight_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + track_running_stats(bool, optional): Whether to use global mean and + variance. 
In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: True. + name(str, optional): Default: None. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.dygraph.base import to_variable + import numpy as np + + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + batch_norm = fluid.BatchNorm2d(10) + hidden1 = batch_norm(x) + """ + + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + track_running_stats=False, + name=None): + super(BatchNorm2d, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." + + self._dtype = 'float32' + + param_shape = [num_features] + + # create parameter + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr.learning_rate == 0. + + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + self.bias.stop_gradient = self._param_attr.learning_rate == 0. + + moving_mean_name = None + moving_variance_name = None + + if name is not None: + moving_mean_name = name + "_mean" + moving_variance_name = name + "_variance" + + self._mean = self.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + self._in_place = True + self._momentum = momentum + self._epsilon = epsilon + self._fuse_with_relu = False + self._track_running_stats = track_running_stats + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + if in_dygraph_mode(): + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, + "is_test", not self.training, "data_layout", + self._data_format, "use_mkldnn", False, "fuse_with_relu", + self._fuse_with_relu, "use_global_stats", + self._track_running_stats, 'trainable_statistics', + self._track_running_stats) + batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( + input, self.weight, self.bias, self._mean, self._variance, + mean_out, variance_out, *attrs) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None) + + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm2d') + + attrs = { + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": not self.training, + "data_layout": self._data_format, + "use_mkldnn": False, + "fuse_with_relu": self._fuse_with_relu, + "use_global_stats": self._track_running_stats, + "trainable_statistics": self._track_running_stats, + } + + inputs = { + "X": [input], + "Scale": [self.weight], + "Bias": [self.bias], + "Mean": [self._mean], + "Variance": [self._variance] + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, 
stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + self._dtype) + + outputs = { + "Y": [batch_norm_out], + "MeanOut": [mean_out], + "VarianceOut": [variance_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + + # Currently, we don't support inplace in dygraph mode + return self._helper.append_activation(batch_norm_out, None) From a61dc4d2ba53ed58d6e62a6e545da82ac4e80c75 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 20 Aug 2020 03:14:25 +0000 Subject: [PATCH 02/18] add norm 2.0 api, test=develop --- python/paddle/nn/layer/norm.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 243f6e996c43c..8280c7ffaf929 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -22,10 +22,16 @@ from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS +from ...fluid.dygraph import layers +from ...fluid.framework import in_dygraph_mode + +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid import core, dygraph_utils + __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'BatchNorm2d' - 'SyncBatchNorm' + 'BatchNorm2d', 'SyncBatchNorm' ] @@ -125,14 +131,14 @@ def __init__(self, shape=param_shape, dtype=self._dtype, default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr.learning_rate == 0. + self.weight.stop_gradient = self._weight_attr and self._weight_attr.learning_rate == 0. self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, dtype=self._dtype, is_bias=True) - self.bias.stop_gradient = self._param_attr.learning_rate == 0. + self.bias.stop_gradient = self._bias_attr and self._bias_attr.learning_rate == 0. moving_mean_name = None moving_variance_name = None @@ -160,6 +166,8 @@ def __init__(self, shape=param_shape, dtype=self._dtype) self._variance.stop_gradient = True + + self._data_format = data_format self._in_place = True self._momentum = momentum self._epsilon = epsilon From 27993391822250d6505e2ea0cc2a0c99055340cf Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 20 Aug 2020 05:52:24 +0000 Subject: [PATCH 03/18] add norm 2.0 api, test=develop --- .../tests/unittests/test_batch_norm_2d_op.py | 92 +++++++++++++++++++ python/paddle/nn/__init__.py | 1 + python/paddle/nn/layer/norm.py | 1 + 3 files changed, 94 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py new file mode 100644 index 0000000000000..a3bccd5763fc7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestDygraphBatchNorm2d(unittest.TestCase): + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2d(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + bn = paddle.nn.BatchNorm2d(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 289bf40dce5a7..9be1212b87330 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -107,6 +107,7 @@ from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.norm import BatchNorm2d #DEFINE_ALIAS # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 8280c7ffaf929..3236c5ec3eb0f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -27,6 +27,7 @@ from ...fluid.initializer import Constant from ...fluid.param_attr import ParamAttr +from 
...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid import core, dygraph_utils __all__ = [ From 2337889a145375d29e67c5a2ade7a255ee97e078 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Thu, 20 Aug 2020 13:46:15 +0000 Subject: [PATCH 04/18] add norm 2.0 api, test=develop --- python/paddle/nn/__init__.py | 2 + python/paddle/nn/layer/norm.py | 320 +++++++++++++++++++++++++-------- 2 files changed, 247 insertions(+), 75 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 9be1212b87330..8810a37d86995 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -107,7 +107,9 @@ from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.norm import BatchNorm1d #DEFINE_ALIAS from .layer.norm import BatchNorm2d #DEFINE_ALIAS +from .layer.norm import BatchNorm3d #DEFINE_ALIAS # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 3236c5ec3eb0f..123e057c0c87e 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -32,79 +32,13 @@ __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'BatchNorm2d', 'SyncBatchNorm' + 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'SyncBatchNorm' ] -class BatchNorm2d(layers.Layer): +class _BatchNormBase(layers.Layer): """ - Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - - When track_running_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. - Calculated as follows: - - .. math:: - - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ - - When track_running_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. - They are global or running statistics (moving_mean and moving_variance). It usually got from the - pre-trained model. Calculated as follows: - - .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - - The normalization function formula is as follows: - - .. math:: - - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter - - Parameters: - num_features(int): Indicate the number of channels of the input ``Tensor``. - epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. 
- weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If the Initializer of the weight_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. - track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: True. - name(str, optional): Default: None. - - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - import numpy as np - - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = fluid.BatchNorm2d(10) - hidden1 = batch_norm(x) + BatchNorm base . """ def __init__(self, @@ -116,13 +50,13 @@ def __init__(self, data_format='NCHW', track_running_stats=False, name=None): - super(BatchNorm2d, self).__init__() + super(_BatchNormBase, self).__init__() self._weight_attr = weight_attr self._bias_attr = bias_attr assert bias_attr is not False, "bias_attr should not be False in batch_norm." - self._dtype = 'float32' + #self._dtype = 'float32' param_shape = [num_features] @@ -130,14 +64,14 @@ def __init__(self, self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, - dtype=self._dtype, + #dtype=self._dtype, default_initializer=Constant(1.0)) self.weight.stop_gradient = self._weight_attr and self._weight_attr.learning_rate == 0. self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, - dtype=self._dtype, + #dtype=self._dtype, is_bias=True) self.bias.stop_gradient = self._bias_attr and self._bias_attr.learning_rate == 0. @@ -169,17 +103,21 @@ def __init__(self, self._variance.stop_gradient = True self._data_format = data_format - self._in_place = True + self._in_place = False self._momentum = momentum self._epsilon = epsilon self._fuse_with_relu = False self._track_running_stats = track_running_stats + def _check_input_dim(self, input): + raise NotImplementedError("BatchNorm Base error") + def forward(self, input): # create output # mean and mean_out share the same memory - mean_out = self._mean # variance and variance out share the same memory + self._check_input_dim(input) + mean_out = self._mean variance_out = self._variance if in_dygraph_mode(): @@ -238,3 +176,235 @@ def forward(self, input): # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(batch_norm_out, None) + + +class BatchNorm1d(_BatchNormBase): + """ + Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If the Initializer of the weight_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: True. + name(str, optional): Default: None. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.dygraph.base import to_variable + import numpy as np + + x = np.random.random(size=(3, 10)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + batch_norm = fluid.BatchNorm1d(10) + hidden1 = batch_norm(x) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm2d(_BatchNormBase): + """ + Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If the Initializer of the weight_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: True. + name(str, optional): Default: None. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.dygraph.base import to_variable + import numpy as np + + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + batch_norm = fluid.BatchNorm2d(10) + hidden1 = batch_norm(x) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class BatchNorm3d(_BatchNormBase): + """ + Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When track_running_stats = False, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + When track_running_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable proportional parameter + - :math:`\\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If the Initializer of the weight_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: True. + name(str, optional): Default: None. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.dygraph.base import to_variable + import paddle + import numpy as np + + x = np.random.random(size=(3, 10, 3, 7, 6)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + batch_norm = paddle.nn.BatchNorm3d(10) + hidden1 = batch_norm(x) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) From d966f3158afec2fd865f082bd9b39645766288b0 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 21 Aug 2020 07:35:52 +0000 Subject: [PATCH 05/18] add norm 2.0 api, test=develop --- python/paddle/nn/__init__.py | 3 + python/paddle/nn/functional/__init__.py | 9 +- python/paddle/nn/functional/norm.py | 280 ++++++++++- python/paddle/nn/layer/norm.py | 598 ++++++++++++++++++++++-- 4 files changed, 855 insertions(+), 35 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 8810a37d86995..528ac4cfbc2d2 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -107,6 +107,9 @@ from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.norm import InstanceNorm1d #DEFINE_ALIAS +from .layer.norm import InstanceNorm2d #DEFINE_ALIAS +from .layer.norm import InstanceNorm3d #DEFINE_ALIAS from .layer.norm import BatchNorm1d #DEFINE_ALIAS from .layer.norm import BatchNorm2d #DEFINE_ALIAS from .layer.norm import BatchNorm3d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index e587466e76483..ffdeb0a5f391e 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -144,12 +144,11 @@ from .loss import square_error_cost #DEFINE_ALIAS from .loss import ssd_loss #DEFINE_ALIAS from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS -# from .norm import batch_norm #DEFINE_ALIAS +from .norm import batch_norm #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS -# from .norm import group_norm #DEFINE_ALIAS -# from .norm import instance_norm #DEFINE_ALIAS -from .norm import l2_normalize #DEFINE_ALIAS -# from .norm import layer_norm #DEFINE_ALIAS +# from .norm import group_norm #DEFINE_ALIAS# from .norm import instance_norm #DEFINE_ALIAS +from .norm import instance_norm #DEFINE_ALIAS +from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS from .norm import normalize #DEFINE_ALIAS # from .norm import spectral_norm #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 0b007041b4ab3..7316b8e2b7b21 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -18,16 +18,19 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode, core +from ...framework import create_parameter from ...fluid.layers import l2_normalize #DEFINE_ALIAS from ...fluid.layers import lrn #DEFINE_ALIAS +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid import core, dygraph_utils __all__ = [ - # 'batch_norm', + 'batch_norm', # 'data_norm', - # 'group_norm', - # 'instance_norm', + 'instance_norm', 'l2_normalize', - # 'layer_norm', + 'layer_norm', 'lrn', 'normalize', # 'spectral_norm' @@ -110,3 +113,272 @@ def normalize(x, p=2, axis=1, 
epsilon=1e-12, name=None): eps = out.block.create_var(dtype=out.dtype) paddle.fill_constant([1], out.dtype, epsilon, out=eps) return paddle.elementwise_div(x, paddle.maximum(out, eps), name=name) + + +def batch_norm(x, + running_mean, + running_var, + weight=None, + bias=None, + training=False, + momentum=0.9, + epsilon=1e-05, + data_format="NCHW", + name=None): + """ + Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + see nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d for detail. + + Parameters: + x(Tesnor): input value. + running_mean(Tensor): running mean. + running_var(Tensor): running variance. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight(Tensor, optional): The parameter attribute for Parameter `scale` of batch_norm. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. Default: None. + training(bool, optional): defalut False. + data_format(str, optional): Specify the input data format. Defalut "NCHW". + name(str, optional): Default: None. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + running_mean = np.random.random(size=1).astype('float32') + running_variance = np.random.random(size=1).astype('float32') + rm = to_variable(running_mean) + rv = to_variable(running_variance) + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) + + print(batch_norm_out.numpy) + """ + + assert len(x.shape) >= 2, "input dim must be larger than 1" + param_shape = [x.shape[1]] + if weight is None or weight is False: + weight = create_parameter( + dtype=x.dtype, shape=param_shape, default_initializer=Constant(1.0)) + weight.stop_gradient = True + + if bias is None or bias is False: + bias = create_parameter( + dtype=x.dtype, shape=param_shape, default_initializer=Constant(0.0)) + bias.stop_gradient = True + + mean_out = running_mean + variance_out = running_var + if in_dygraph_mode(): + attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", training, + "data_layout", data_format, "use_mkldnn", False, + "fuse_with_relu", False, "use_global_stats", training, + 'trainable_statistics', training) + batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( + x, weight, bias, running_mean, running_var, mean_out, variance_out, + *attrs) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') + + attrs = { + "momentum": momentum, + "epsilon": epsilon, + "is_test": not training, + "data_layout": data_format, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": training, + "trainable_statistics": training, + } + + inputs = { + "X": [x], + "Scale": [weight], + "Bias": [bias], + "Mean": [running_mean], + "Variance": [running_var] + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + batch_norm_out = self._helper.create_variable_for_type_inference(x.dtype) + + outputs = { + "Y": [batch_norm_out], + "MeanOut": [mean], + 
"VarianceOut": [variance], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + + # Currently, we don't support inplace in dygraph mode + return self._helper.append_activation(batch_norm_out, None) + + +def layer_norm(x, + normalized_shape, + weight=None, + bias=None, + epsilon=1e-05, + name=None): + """ + see more detail in paddle.nn.LayerNorm + + Parameters: + x(Tensor): Input Tensor. + normalized_shape(int or list or tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the learnable + bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default: None. + name(str, optional): parameter name. Default None. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:]) + layer_norm_out = layer_norm(x) + + print(layer_norm_out.numpy) + """ + input_shape = list(x.shape) + input_ndim = len(input_shape) + normalized_ndim = len(normalized_shape) + begin_norm_axis = input_ndim - normalized_ndim + if input_ndim < normalized_ndim or input_shape[ + begin_norm_axis:] != normalized_shape: + str_normalized_shape = str(normalized_shape) + raise ValueError('Given normalized_shape is ' + str_normalized_shape + + ', expected input with shape [*, ' + + str_normalized_shape[ + 1:] + ', but got input shape ' + str(input_shape)) + + if in_dygraph_mode(): + pre_act, _, _ = core.ops.layer_norm(x, weight, bias, 'epsilon', epsilon, + 'begin_norm_axis', begin_norm_axis) + return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], 'LayerNorm') + + inputs = dict() + inputs['X'] = [x] + if weight: + inputs['Scale'] = [weight] + if bias: + inputs['Bias'] = [bias] + attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=x.type, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=x.type, stop_gradient=True) + layer_norm_out = self._helper.create_variable_for_type_inference(x.type) + + self._helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, + "begin_norm_axis": begin_norm_axis}) + + return self._helper.append_activation(layer_norm_out, act=None) + + +def instance_norm(x, + running_mean=None, + running_var=None, + weight=None, + bias=None, + use_input_stats=True, + momentum=0.1, + 
eps=1e-05, + data_format="NCHW", + name=None): + """ + See more detail in nn.layer.InstanceNorm2d. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + running_mean(Tensor): running mean. Default None. + running_var(Tensor): running variance. Default None. + eps(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + use_input_stats(bool): Default True. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCL. + name(str, optional): Default None. + + Returns: + None. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm_out = paddle.nn.functional.instancenorm(x) + + print(instance_norm_out.numpy) + + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 123e057c0c87e..0bd0f1f25e002 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -17,12 +17,14 @@ from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS -from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS -from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS +#from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS + +#from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS from ...fluid.dygraph import layers + from ...fluid.framework import in_dygraph_mode from ...fluid.initializer import Constant @@ -30,12 +32,548 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid import core, dygraph_utils +import numpy as np +import numbers + __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'SyncBatchNorm' + 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', + 'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm' ] +class _InstanceNormBase(layers.Layer): + """ + This class is based class for InstanceNorm1d, 2d, 3d. 
+ """ + + def __init__(self, + num_features, + epsilon=1e-5, + momentum=0.9, + weight_attr=None, + bias_attr=None, + track_running_stats=False, + data_format="NCHW", + name=None): + super(_InstanceNormBase, self).__init__() + + if weight_attr == False or bias_attr == False: + assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if weight_attr != False and bias_attr != False: + self.scale = self.create_parameter( + attr=self._weight_attr, + shape=[num_features], + default_initializer=Constant(1.0), + is_bias=False) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=[num_features], + default_initializer=Constant(0.0), + is_bias=True) + else: + self.scale = None + self.bias = None + + def _check_input_dim(self, input): + raise NotImplementedError("InstanceNorm Base error") + + def forward(self, input): + self._check_input_dim(input) + + if in_dygraph_mode(): + out, _, _ = core.ops.instance_norm(input, self.scale, self.bias, + 'epsilon', self._epsilon) + return out + + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + "InstanceNorm") + + attrs = {"epsilon": self._epsilon} + + if self.scale and self.bias: + inputs = {"X": [input], "Scale": [self.scale], "Bias": [self.bias]} + else: + inputs = {"X": [input]} + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + instance_norm_out = self._helper.create_variable_for_type_inference( + input.dtype) + + outputs = { + "Y": [instance_norm_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return instance_norm_out + + +class InstanceNorm1d(_InstanceNormBase): + """ + Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCL `[batch, in_channels, length]` + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. 
If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCL. + name(str, optional): Default None. + + Returns: + None. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm1d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm2d(_InstanceNormBase): + """ + Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
+ If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCHW. + name(str, optional): Default None. + + Returns: + None. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm2d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm3d(_InstanceNormBase): + """ + Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCDHW. + name(str, optional): Default None. + + Returns: + None. + + Examples: + + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm3d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + +class GroupNorm(layers.Layer): + """ + :alias_main: paddle.nn.GroupNorm + :alias: paddle.nn.GroupNorm,paddle.nn.layer.GroupNorm,paddle.nn.layer.norm.GroupNorm + :old_api: paddle.fluid.dygraph.GroupNorm + + This interface is used to construct a callable object of the ``GroupNorm`` class. + For more details, refer to code examples. + It implements the function of the Group Normalization Layer. + Refer to `Group Normalization `_ . + + Parameters: + num_channels(int): The number of channels of input. + num_groups(int): The number of groups that divided from channels. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr, optional): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the bias is initialized one. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the learnable + bias :math:`b`. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. + name(str, optional): Default None. + + Returns: + None + + Examples: + .. 
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') + x = paddle.to_tensor(x_data) + group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6) + group_norm_out = group_norm(x) + + print(group_norm_out.numpy) + """ + + def __init__(self, + num_channels, + num_groups, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_layout='NCHW', + name=None): + super(GroupNorm, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._epsilon = epsilon + self._num_channels = num_channels + self._num_groups = num_groups + if data_layout != 'NCHW': + raise ValueError("unsupported data layout:" + data_layout) + + param_shape = [self._num_channels] + + self.weight = self.create_parameter( + attr=self._weight_attr or False, + shape=param_shape, + default_initializer=Constant(1.0)) + + self.bias = self.create_parameter( + attr=self._weight_attr or False, shape=param_shape, is_bias=True) + + def forward(self, input): + inputs = {'X': input} + if self.bias is not None: + inputs['Bias'] = self.bias + if self.weight is not None: + inputs['Scale'] = self.weight + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + group_norm_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, + "groups": self._num_groups}) + + return self._helper.append_activation(group_norm_out, None) + + +class LayerNorm(layers.Layer): + """ + :alias_main: paddle.nn.LayerNorm + :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm + :old_api: paddle.fluid.dygraph.LayerNorm + + This interface is used to construct a callable object of the ``LayerNorm`` class. + For more details, refer to code examples. + It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. + Refer to `Layer Normalization `_ + + The formula is as follows: + + .. math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + + \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + + y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + + - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. + - :math:`H`: the number of hidden units in a layers + - :math:`\\epsilon`: the small value added to the variance to prevent division by zero. + - :math:`g`: the trainable scale parameter. + - :math:`b`: the trainable bias parameter. + + Parameters: + normalized_shape(int or list or tuple): Input shape from an expected input of + size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. + If it is a single integer, this module will normalize over the last dimension + which is expected to be of that specific size. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default: None. 
+ bias_attr(ParamAttr, optional): The parameter attribute for the learnable + bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default: None. + name(str, optional): parameter name. Default None. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) + layer_norm_out = layer_norm(x) + + print(layer_norm_out.numpy) + """ + + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = [normalized_shape] + + self._normalized_shape = list(normalized_shape) + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + param_shape = [np.prod(self._normalized_shape)] + + if weight_attr is False: + self.weight = None + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + + def forward(self, input): + input_shape = list(input.shape) + input_ndim = len(input_shape) + normalized_ndim = len(self._normalized_shape) + self._begin_norm_axis = input_ndim - normalized_ndim + if input_ndim < normalized_ndim or input_shape[ + self._begin_norm_axis:] != self._normalized_shape: + str_normalized_shape = str(self._normalized_shape) + raise ValueError( + 'Given normalized_shape is ' + str_normalized_shape + + ', expected input with shape [*, ' + str_normalized_shape[ + 1:] + ', but got input shape ' + str(input_shape)) + + if in_dygraph_mode(): + pre_act, _, _ = core.ops.layer_norm( + input, self.weight, self.bias, 'epsilon', self._epsilon, + 'begin_norm_axis', self._begin_norm_axis) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=None) + + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'LayerNorm') + + inputs = dict() + inputs['X'] = [input] + if self.weight: + inputs['Scale'] = [self.weight] + if self.bias: + inputs['Bias'] = [self.bias] + attrs = { + "epsilon": self._epsilon, + "begin_norm_axis": self._begin_norm_axis + } + + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.type, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.type, stop_gradient=True) + layer_norm_out = self._helper.create_variable_for_type_inference( + input.type) + + self._helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={ + "epsilon": self._epsilon, + "begin_norm_axis": self._begin_norm_axis + }) + + return self._helper.append_activation(layer_norm_out, act=self._act) + + class _BatchNormBase(layers.Layer): """ BatchNorm base . 
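
For reference, a condensed dygraph usage sketch of the ``LayerNorm`` layer implemented in the hunk above, mirroring its docstring example (the random seed and shapes are illustrative only):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    np.random.seed(123)
    x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
    x = paddle.to_tensor(x_data)
    # normalize over every dimension except the batch dimension
    layer_norm = paddle.nn.LayerNorm(x_data.shape[1:])
    layer_norm_out = layer_norm(x)
    print(layer_norm_out.numpy())
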
@@ -51,6 +589,7 @@ def __init__(self, track_running_stats=False, name=None): super(_BatchNormBase, self).__init__() + self._num_features = num_features self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -66,14 +605,16 @@ def __init__(self, shape=param_shape, #dtype=self._dtype, default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr and self._weight_attr.learning_rate == 0. + self.weight.stop_gradient = (self._weight_attr is False) or ( + self._weight_attr and self._weight_attr.learning_rate == 0.) self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, #dtype=self._dtype, is_bias=True) - self.bias.stop_gradient = self._bias_attr and self._bias_attr.learning_rate == 0. + self.bias.stop_gradient = (self._bias_attr is False) or ( + self._bias_attr and self._bias_attr.learning_rate == 0.) moving_mean_name = None moving_variance_name = None @@ -238,15 +779,17 @@ class BatchNorm1d(_BatchNormBase): Examples: .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable + import paddle import numpy as np - x = np.random.random(size=(3, 10)).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = fluid.BatchNorm1d(10) - hidden1 = batch_norm(x) + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm1d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy) """ def _check_input_dim(self, input): @@ -315,15 +858,17 @@ class BatchNorm2d(_BatchNormBase): Examples: .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable + import paddle import numpy as np - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = fluid.BatchNorm2d(10) - hidden1 = batch_norm(x) + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm2d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy) """ def _check_input_dim(self, input): @@ -392,16 +937,17 @@ class BatchNorm3d(_BatchNormBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable import paddle import numpy as np - x = np.random.random(size=(3, 10, 3, 7, 6)).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = paddle.nn.BatchNorm3d(10) - hidden1 = batch_norm(x) + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + batch_norm = paddle.nn.BatchNorm3d(1) + batch_norm_out = batch_norm(x) + + print(batch_norm_out.numpy) """ def _check_input_dim(self, input): From f733ec628658ef366ebbbd7219954c56fa124dba Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 21 Aug 2020 08:59:13 +0000 Subject: [PATCH 06/18] add norm 2.0 api, test=develop --- .../tests/unittests/test_batch_norm_2d_op.py | 92 ---------- .../tests/unittests/test_batch_norm_op_v2.py | 173 ++++++++++++++++++ 2 files changed, 173 insertions(+), 92 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py deleted file mode 100644 index a3bccd5763fc7..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_2d_op.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -import numpy as np -import paddle.fluid.core as core -from paddle.fluid.op import Operator -import paddle.fluid as fluid -from op_test import OpTest, _set_use_system_allocator -from paddle.fluid.framework import grad_var_name -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard -import paddle - - -class TestDygraphBatchNorm2d(unittest.TestCase): - def test_dygraph(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - shape = [4, 10, 4, 4] - - def compute_v1(x, is_test, trainable_statistics): - with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( - shape[1], - is_test=is_test, - trainable_statistics=trainable_statistics) - y = bn(fluid.dygraph.to_variable(x)) - return y.numpy() - - def compute_v2(x): - with fluid.dygraph.guard(p): - bn = paddle.nn.BatchNorm2d(shape[1]) - y = bn(fluid.dygraph.to_variable(x)) - return y.numpy() - - x = np.random.randn(*shape).astype("float32") - y1 = compute_v1(x, False, False) - y2 = compute_v2(x) - self.assertTrue(np.allclose(y1, y2)) - - def test_static(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - exe = fluid.Executor(p) - shape = [4, 10, 16, 16] - - def compute_v1(x_np, is_test, trainable_statistics): - with program_guard(Program(), Program()): - bn = fluid.dygraph.BatchNorm( - shape[1], - is_test=is_test, - trainable_statistics=trainable_statistics) - x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) - y = bn(x) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] - return r - - def compute_v2(x_np): - with program_guard(Program(), Program()): - bn = paddle.nn.BatchNorm2d(shape[1]) - x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) - y = bn(x) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] - return r - - x = np.random.randn(*shape).astype("float32") - y1 = compute_v1(x, False, False) - y2 = compute_v2(x) - self.assertTrue(np.allclose(y1, y2)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py new file mode 100644 index 0000000000000..0c53fefbeed96 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -0,0 +1,173 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
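
The consolidated v2 tests below all rely on the same check: run the legacy ``fluid.dygraph.BatchNorm`` and the new ``paddle.nn.BatchNorm2d`` on identical input and verify the outputs match. A condensed, CPU-only sketch of that pattern (shape and explicit flags mirror the test code; they are illustrative):

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    x = np.random.randn(4, 10, 4, 4).astype("float32")
    with fluid.dygraph.guard(fluid.CPUPlace()):
        # legacy 1.x layer with statistics flags pinned to the test defaults
        bn_old = fluid.dygraph.BatchNorm(
            10, is_test=False, trainable_statistics=False)
        # new 2.0 layer under test
        bn_new = paddle.nn.BatchNorm2d(10)
        y_old = bn_old(fluid.dygraph.to_variable(x)).numpy()
        y_new = bn_new(fluid.dygraph.to_variable(x)).numpy()
    assert np.allclose(y_old, y_new)
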
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestBatchNorm(unittest.TestCase): + def test_functional_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + running_mean = np.random.random(size=1).astype('float32') + running_variance = np.random.random(size=1).astype('float32') + x = fluid.dygraph.to_variable(x_data) + rm = fluid.dygraph.to_variable(running_mean) + rv = fluid.dygraph.to_variable(running_variance) + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) + + def test_functional_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + with program_guard(Program(), Program()): + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + running_mean = np.random.random(size=1).astype('float32') + running_variance = np.random.random(size=1).astype('float32') + x = fluid.data(name='x', shape=x_data.shape, dtype=x_data.dtype) + rm = fluid.data( + name='rm', + shape=running_mean.shape, + dtype=running_mean.dtype) + rv = fluid.data( + name='rv', + shape=running_variance.shape, + dtype=running_variance.dtype) + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={ + 'x': x_data, + 'rm': running_mean, + 'rv': running_variance + }, + fetch_list=[batch_norm_out])[0] + + batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) + + def test_name(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + batch_norm1d = paddle.nn.BatchNorm1d(1, name="test") + + def test_error(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + #paddle.disable_static() + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + + def error1d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1d(1) + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm2d(1) + batch_norm1d(fluid.dygraph.to_variable(x_data_3)) + + def error3d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm3d(1) + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + with fluid.dygraph.guard(p): + self.assertRaises(ValueError, error1d) + self.assertRaises(ValueError, error2d) + self.assertRaises(ValueError, error3d) + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x, is_test, 
trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.BatchNorm2d(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + bn = paddle.nn.BatchNorm2d(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x, False, False) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() From d4a9b028a887f289033f0d0520676930308f9d14 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 21 Aug 2020 09:46:11 +0000 Subject: [PATCH 07/18] add norm 2.0 api, test=develop --- .../tests/unittests/test_batch_norm_op_v2.py | 2 - .../unittests/test_instance_norm_op_v2.py | 141 ++++++++++++++++++ .../tests/unittests/test_layer_norm_op_v2.py | 110 ++++++++++++++ python/paddle/nn/functional/__init__.py | 2 +- python/paddle/nn/functional/norm.py | 64 +++++--- python/paddle/nn/layer/norm.py | 2 +- 6 files changed, 299 insertions(+), 22 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py create mode 100644 python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 0c53fefbeed96..9c91f71b901b5 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -68,8 +68,6 @@ def test_functional_static(self): }, fetch_list=[batch_norm_out])[0] - batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) - def test_name(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py new file mode 100644 index 0000000000000..b1a358512d766 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -0,0 +1,141 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestInstanceNorm(unittest.TestCase): + def test_functional_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = fluid.dygraph.to_variable(x_data) + batch_norm_out = paddle.nn.functional.instance_norm(x) + + def test_functional_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + with program_guard(Program(), Program()): + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = fluid.data(name='x', shape=x_data.shape, dtype=x_data.dtype) + instance_norm_out = paddle.nn.functional.instance_norm(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_data, }, + fetch_list=[instance_norm_out])[0] + + def test_error(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + + def error1d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm1d = paddle.nn.InstanceNorm1d(1) + instance_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + instance_norm2d = paddle.nn.InstanceNorm2d(1) + instance_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm3d = paddle.nn.BatchNorm3d(1) + instance_norm3d(fluid.dygraph.to_variable(x_data_4)) + + with fluid.dygraph.guard(p): + self.assertRaises(ValueError, error1d) + self.assertRaises(ValueError, error2d) + self.assertRaises(ValueError, error3d) + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.InstanceNorm(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + bn = paddle.nn.InstanceNorm2d(shape[1]) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu( + "instance_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = 
fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np): + with program_guard(Program(), Program()): + ins = fluid.dygraph.InstanceNorm(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ins(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + ins = paddle.nn.InstanceNorm2d(shape[1]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ins(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py new file mode 100644 index 0000000000000..4becacb559864 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestDygraphLayerNormv2(unittest.TestCase): + def test_functional_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + with fluid.dygraph.guard(p): + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = fluid.dygraph.to_variable(x_data) + layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) + + def test_functional_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + with program_guard(Program(), Program()): + x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') + x = fluid.data(name='x', shape=x_data.shape, dtype=x_data.dtype) + layer_norm_out = paddle.nn.functional.layer_norm(x, [1, 2, 3]) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_data, }, + fetch_list=[layer_norm_out])[0] + + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [4, 10, 4, 4] + + def compute_v1(x): + with fluid.dygraph.guard(p): + ln = fluid.dygraph.LayerNorm(shape[1:]) + y = ln(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + ln = 
paddle.nn.LayerNorm(shape[1:]) + y = ln(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute_v1(x_np): + with program_guard(Program(), Program()): + ln = fluid.dygraph.LayerNorm(shape[1:]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ln(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + ln = paddle.nn.LayerNorm(shape[1:]) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = ln(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index ffdeb0a5f391e..afb6e7271e13a 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -146,7 +146,7 @@ from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS from .norm import batch_norm #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS -# from .norm import group_norm #DEFINE_ALIAS# from .norm import instance_norm #DEFINE_ALIAS +# from .norm import group_norm #DEFINE_ALIAS from .norm import instance_norm #DEFINE_ALIAS from .norm import layer_norm #DEFINE_ALIAS from .norm import lrn #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 7316b8e2b7b21..5685f45d9f0ee 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -156,8 +156,9 @@ def batch_norm(x, x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') running_mean = np.random.random(size=1).astype('float32') running_variance = np.random.random(size=1).astype('float32') - rm = to_variable(running_mean) - rv = to_variable(running_variance) + x = to_tensor(x_data) + rm = to_tensor(running_mean) + rv = to_tensor(running_variance) batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) print(batch_norm_out.numpy) @@ -211,25 +212,26 @@ def batch_norm(x, "Variance": [running_var] } - saved_mean = self._helper.create_variable_for_type_inference( + helper = LayerHelper('batch_norm', **locals()) + saved_mean = helper.create_variable_for_type_inference( dtype=x.dtype, stop_gradient=True) - saved_variance = self._helper.create_variable_for_type_inference( + saved_variance = helper.create_variable_for_type_inference( dtype=x.dtype, stop_gradient=True) - batch_norm_out = self._helper.create_variable_for_type_inference(x.dtype) + batch_norm_out = helper.create_variable_for_type_inference(x.dtype) outputs = { "Y": [batch_norm_out], - "MeanOut": [mean], - "VarianceOut": [variance], + "MeanOut": [running_mean], + "VarianceOut": [running_var], "SavedMean": [saved_mean], "SavedVariance": [saved_variance] } - self._helper.append_op( + helper.append_op( type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) # Currently, we don't support inplace in dygraph mode - return 
self._helper.append_activation(batch_norm_out, None) + return helper.append_activation(batch_norm_out) def layer_norm(x, @@ -304,13 +306,14 @@ def layer_norm(x, attrs = {"epsilon": epsilon, "begin_norm_axis": begin_norm_axis} # create output - mean_out = self._helper.create_variable_for_type_inference( + helper = LayerHelper('layer_norm', **locals()) + mean_out = helper.create_variable_for_type_inference( dtype=x.type, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( + variance_out = helper.create_variable_for_type_inference( dtype=x.type, stop_gradient=True) - layer_norm_out = self._helper.create_variable_for_type_inference(x.type) + layer_norm_out = helper.create_variable_for_type_inference(x.type) - self._helper.append_op( + helper.append_op( type="layer_norm", inputs=inputs, outputs={ @@ -321,7 +324,7 @@ def layer_norm(x, attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}) - return self._helper.append_activation(layer_norm_out, act=None) + return helper.append_activation(layer_norm_out) def instance_norm(x, @@ -378,7 +381,32 @@ def instance_norm(x, """ - def _check_input_dim(self, input): - if len(input.shape) != 2 and len(input.shape) != 3: - raise ValueError('expected 2D or 3D input (got {}D input)'.format( - len(input.shape))) + if in_dygraph_mode(): + out, _, _ = core.ops.instance_norm(x, weight, bias, 'epsilon', eps) + return out + + check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") + + attrs = {"epsilon": eps} + + if weight and bias: + inputs = {"X": [x], "Scale": [weight], "Bias": [bias]} + else: + inputs = {"X": [x]} + + helper = LayerHelper('instance_norm', **locals()) + saved_mean = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + instance_norm_out = helper.create_variable_for_type_inference(x.dtype) + + outputs = { + "Y": [instance_norm_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + helper.append_op( + type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return instance_norm_out diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 0bd0f1f25e002..0312e368071ca 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -571,7 +571,7 @@ def forward(self, input): "begin_norm_axis": self._begin_norm_axis }) - return self._helper.append_activation(layer_norm_out, act=self._act) + return self._helper.append_activation(layer_norm_out, act=None) class _BatchNormBase(layers.Layer): From 99ec8b9c403e722281b11856a55bb38851926765 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sat, 22 Aug 2020 03:52:32 +0000 Subject: [PATCH 08/18] add norm 2.0 api, test=develop --- .../tests/unittests/test_batch_norm_op_v2.py | 50 +---- .../tests/unittests/test_group_norm_op_v2.py | 86 +++++++++ .../unittests/test_instance_norm_op_v2.py | 26 --- .../tests/unittests/test_layer_norm_op_v2.py | 24 --- python/paddle/nn/functional/norm.py | 78 +++----- python/paddle/nn/layer/norm.py | 174 +++--------------- 6 files changed, 141 insertions(+), 297 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 9c91f71b901b5..c6d3c6e7d0492 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ 
b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -26,48 +26,6 @@ class TestBatchNorm(unittest.TestCase): - def test_functional_dygraph(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - with fluid.dygraph.guard(p): - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - running_mean = np.random.random(size=1).astype('float32') - running_variance = np.random.random(size=1).astype('float32') - x = fluid.dygraph.to_variable(x_data) - rm = fluid.dygraph.to_variable(running_mean) - rv = fluid.dygraph.to_variable(running_variance) - batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) - - def test_functional_static(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - exe = fluid.Executor(p) - with program_guard(Program(), Program()): - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - running_mean = np.random.random(size=1).astype('float32') - running_variance = np.random.random(size=1).astype('float32') - x = fluid.data(name='x', shape=x_data.shape, dtype=x_data.dtype) - rm = fluid.data( - name='rm', - shape=running_mean.shape, - dtype=running_mean.dtype) - rv = fluid.data( - name='rv', - shape=running_variance.shape, - dtype=running_variance.dtype) - batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={ - 'x': x_data, - 'rm': running_mean, - 'rv': running_variance - }, - fetch_list=[batch_norm_out])[0] - def test_name(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): @@ -92,13 +50,13 @@ def error1d(): def error2d(): x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') - batch_norm1d = paddle.nn.BatchNorm2d(1) - batch_norm1d(fluid.dygraph.to_variable(x_data_3)) + batch_norm2d = paddle.nn.BatchNorm2d(1) + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) def error3d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') - batch_norm1d = paddle.nn.BatchNorm3d(1) - batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + batch_norm3d = paddle.nn.BatchNorm3d(1) + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) with fluid.dygraph.guard(p): self.assertRaises(ValueError, error1d) diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py new file mode 100644 index 0000000000000..654e8d6f129e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
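
As with the other v2 tests, the group norm test compares the legacy ``fluid.dygraph.GroupNorm`` against the new ``paddle.nn.GroupNorm``. A minimal dygraph sketch of the new layer on its own follows; the 6-channel/3-group split is illustrative, the only requirement being that ``num_channels`` matches the input's channel dimension and is divisible by ``num_groups``:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.random(size=(2, 6, 2, 2)).astype('float32'))
    # 6 channels normalized in 3 groups of 2 channels each
    group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=3)
    print(group_norm(x).numpy())
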
+ +import os +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + + +class TestDygraphGroupNormv2(unittest.TestCase): + def test_dygraph(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + shape = [2, 6, 2, 2] + + def compute_v1(x): + with fluid.dygraph.guard(p): + gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + y = gn(fluid.dygraph.to_variable(x)) + return y.numpy() + + def compute_v2(x): + with fluid.dygraph.guard(p): + gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + y = gn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + places.append(fluid.CUDAPlace(0)) + for p in places: + exe = fluid.Executor(p) + shape = [2, 6, 2, 2] + + def compute_v1(x_np): + with program_guard(Program(), Program()): + gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = gn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + def compute_v2(x_np): + with program_guard(Program(), Program()): + gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = gn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index b1a358512d766..b02ba1a584b52 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -26,32 +26,6 @@ class TestInstanceNorm(unittest.TestCase): - def test_functional_dygraph(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu( - "instance_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - with fluid.dygraph.guard(p): - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = fluid.dygraph.to_variable(x_data) - batch_norm_out = paddle.nn.functional.instance_norm(x) - - def test_functional_static(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu( - "instance_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - exe = fluid.Executor(p) - with program_guard(Program(), Program()): - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = fluid.data(name='x', shape=x_data.shape, dtype=x_data.dtype) - instance_norm_out = paddle.nn.functional.instance_norm(x) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={'x': x_data, }, - fetch_list=[instance_norm_out])[0] - def test_error(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and 
core.op_support_gpu( diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py index 4becacb559864..f324e4bd377c6 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op_v2.py @@ -26,30 +26,6 @@ class TestDygraphLayerNormv2(unittest.TestCase): - def test_functional_dygraph(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - with fluid.dygraph.guard(p): - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = fluid.dygraph.to_variable(x_data) - layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) - - def test_functional_static(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): - places.append(fluid.CUDAPlace(0)) - for p in places: - exe = fluid.Executor(p) - with program_guard(Program(), Program()): - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = fluid.data(name='x', shape=x_data.shape, dtype=x_data.dtype) - layer_norm_out = paddle.nn.functional.layer_norm(x, [1, 2, 3]) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={'x': x_data, }, - fetch_list=[layer_norm_out])[0] - def test_dygraph(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 5685f45d9f0ee..32743b1a347ad 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -128,7 +128,7 @@ def batch_norm(x, """ Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - see nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d for detail. + nn.functional.batch_norm is uesd for nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d. Please use above API for BatchNorm. Parameters: x(Tesnor): input value. @@ -136,9 +136,9 @@ def batch_norm(x, running_var(Tensor): running variance. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight(Tensor, optional): The parameter attribute for Parameter `scale` of batch_norm. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. Default: None. - training(bool, optional): defalut False. + weight(Tensor, optional): The weight tensor of batch_norm. Default: None. + bias(Tensor, optional): The bias tensor of batch_norm. Default: None. + training(bool, optional): The actual meaning is the opposite of global status. Defalut False. data_format(str, optional): Specify the input data format. Defalut "NCHW". name(str, optional): Default: None. 
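
As the revised parameter list above indicates, the functional form now takes the running statistics and the affine parameters directly as tensors (the ``nn.BatchNorm*d`` layers create and pass them internally). A minimal dygraph sketch under that assumption, using neutral statistics instead of random ones so the call roughly reduces to an identity transform:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.random(size=(2, 1, 2, 3)).astype('float32'))
    # one statistic / affine value per channel (1 channel here)
    running_mean = paddle.to_tensor(np.zeros(1).astype('float32'))
    running_var = paddle.to_tensor(np.ones(1).astype('float32'))
    weight = paddle.to_tensor(np.ones(1).astype('float32'))
    bias = paddle.to_tensor(np.zeros(1).astype('float32'))
    out = paddle.nn.functional.batch_norm(x, running_mean, running_var, weight, bias)
    print(out.numpy())
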
@@ -152,37 +152,32 @@ def batch_norm(x,
           import numpy as np
 
           paddle.disable_static()
-          np.random.seed(123)
-          x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32')
+          np.random.seed(123)
+          x = np.random.random(size=(2, 1, 2, 3)).astype('float32')
           running_mean = np.random.random(size=1).astype('float32')
           running_variance = np.random.random(size=1).astype('float32')
-          rm = to_variable(running_mean)
-          rv = to_variable(running_variance)
+          weight_data = np.random.random(size=1).astype('float32')
+          bias_data = np.random.random(size=1).astype('float32')
+          x = paddle.to_tensor(x)
+          rm = paddle.to_tensor(running_mean)
+          rv = paddle.to_tensor(running_variance)
+          w = paddle.to_tensor(weight_data)
+          b = paddle.to_tensor(bias_data)
+          batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b)
+          print(batch_norm_out)
     """
 
     assert len(x.shape) >= 2, "input dim must be larger than 1"
 
-    param_shape = [x.shape[1]]
-    if weight is None or weight is False:
-        weight = create_parameter(
-            dtype=x.dtype, shape=param_shape, default_initializer=Constant(1.0))
-        weight.stop_gradient = True
-
-    if bias is None or bias is False:
-        bias = create_parameter(
-            dtype=x.dtype, shape=param_shape, default_initializer=Constant(0.0))
-        bias.stop_gradient = True
+    # we use not training means use_global_status, more details see nn._BatchNormBase
+    use_global_stats = not training
+    # input ad out must share the memory
     mean_out = running_mean
     variance_out = running_var
 
     if in_dygraph_mode():
-        attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", training,
-                 "data_layout", data_format, "use_mkldnn", False,
-                 "fuse_with_relu", False, "use_global_stats", training,
-                 'trainable_statistics', training)
+        attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout",
+                 data_format, "use_mkldnn", False, "fuse_with_relu", False,
+                 "use_global_stats", use_global_stats)
         batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
             x, weight, bias, running_mean, running_var, mean_out, variance_out,
             *attrs)
@@ -196,12 +191,10 @@ def batch_norm(x,
     attrs = {
         "momentum": momentum,
         "epsilon": epsilon,
-        "is_test": not training,
         "data_layout": data_format,
         "use_mkldnn": False,
         "fuse_with_relu": False,
-        "use_global_stats": training,
-        "trainable_statistics": training,
+        "use_global_stats": use_global_stats,
     }
 
     inputs = {
@@ -230,7 +223,6 @@ def batch_norm(x,
     helper.append_op(
         type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
 
-    # Currently, we don't support inplace in dygraph mode
     return helper.append_activation(batch_norm_out)
 
 
@@ -245,19 +237,15 @@ def layer_norm(x,
 
     Parameters:
         x(Tensor): Input Tensor.
-        normalized_shape(int or list or tuple): Input shape from an expected input of
+        normalized_shape(int|list|tuple): Input shape from an expected input of
             size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
             If it is a single integer, this module will normalize over the last dimension
             which is expected to be of that specific size.
         epsilon(float, optional): The small value added to the variance to prevent
             division by zero. Default: 1e-05.
-        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
-            gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The
-            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
- bias_attr(ParamAttr, optional): The parameter attribute for the learnable - bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The - :attr:`bias_attr` is initialized as 0 if it is added. Default: None. - name(str, optional): parameter name. Default None. + weight(Tensor, optional): The weight tensor of batch_norm. Default: None. + bias(Tensor, optional): The bias tensor of batch_norm. Default: None. + name(str, optional): Default None. Returns: None @@ -348,17 +336,9 @@ def instance_norm(x, numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. use_input_stats(bool): Default True. - weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` - of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm - will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. - If the Initializer of the weight_attr is not set, the parameter is initialized - one. If it is set to False, will not create weight_attr. Default: None. - bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. - If it is set to None or one attribute of ParamAttr, instance_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. - If it is set to False, will not create bias_attr. Default: None. - data_format(str, optional): Specify the input data format. Default: NCL. + weight(Tensor, optional): The weight tensor of instance_norm. Default: None. + bias(Tensor, optional): The bias tensor of instance_norm. Default: None. + data_format(str, optional): Specify the input data format. Default: NCHW. name(str, optional): Default None. 
Returns: diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 0312e368071ca..eb63e4a9565ca 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -32,6 +32,8 @@ from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid import core, dygraph_utils +from ..functional import batch_norm, layer_norm, instance_norm + import numpy as np import numbers @@ -85,37 +87,8 @@ def _check_input_dim(self, input): def forward(self, input): self._check_input_dim(input) - if in_dygraph_mode(): - out, _, _ = core.ops.instance_norm(input, self.scale, self.bias, - 'epsilon', self._epsilon) - return out - - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - "InstanceNorm") - - attrs = {"epsilon": self._epsilon} - - if self.scale and self.bias: - inputs = {"X": [input], "Scale": [self.scale], "Bias": [self.bias]} - else: - inputs = {"X": [input]} - - saved_mean = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) - saved_variance = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) - instance_norm_out = self._helper.create_variable_for_type_inference( - input.dtype) - - outputs = { - "Y": [instance_norm_out], - "SavedMean": [saved_mean], - "SavedVariance": [saved_variance] - } - - self._helper.append_op( - type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs) - return instance_norm_out + return instance_norm( + input, weight=self.scale, bias=self.bias, eps=self._epsilon) class InstanceNorm1d(_InstanceNormBase): @@ -517,61 +490,12 @@ def __init__(self, attr=self._bias_attr, shape=param_shape, is_bias=True) def forward(self, input): - input_shape = list(input.shape) - input_ndim = len(input_shape) - normalized_ndim = len(self._normalized_shape) - self._begin_norm_axis = input_ndim - normalized_ndim - if input_ndim < normalized_ndim or input_shape[ - self._begin_norm_axis:] != self._normalized_shape: - str_normalized_shape = str(self._normalized_shape) - raise ValueError( - 'Given normalized_shape is ' + str_normalized_shape + - ', expected input with shape [*, ' + str_normalized_shape[ - 1:] + ', but got input shape ' + str(input_shape)) - - if in_dygraph_mode(): - pre_act, _, _ = core.ops.layer_norm( - input, self.weight, self.bias, 'epsilon', self._epsilon, - 'begin_norm_axis', self._begin_norm_axis) - return dygraph_utils._append_activation_in_dygraph( - pre_act, act=None) - - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'LayerNorm') - - inputs = dict() - inputs['X'] = [input] - if self.weight: - inputs['Scale'] = [self.weight] - if self.bias: - inputs['Bias'] = [self.bias] - attrs = { - "epsilon": self._epsilon, - "begin_norm_axis": self._begin_norm_axis - } - - # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=input.type, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=input.type, stop_gradient=True) - layer_norm_out = self._helper.create_variable_for_type_inference( - input.type) - - self._helper.append_op( - type="layer_norm", - inputs=inputs, - outputs={ - "Y": layer_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": self._epsilon, - "begin_norm_axis": self._begin_norm_axis - }) - - return self._helper.append_activation(layer_norm_out, act=None) + return layer_norm( + input, + normalized_shape=self._normalized_shape, + weight=self.weight, + bias=self.bias, + epsilon=self._epsilon) 
class _BatchNormBase(layers.Layer): @@ -603,16 +527,12 @@ def __init__(self, self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, - #dtype=self._dtype, default_initializer=Constant(1.0)) self.weight.stop_gradient = (self._weight_attr is False) or ( self._weight_attr and self._weight_attr.learning_rate == 0.) self.bias = self.create_parameter( - attr=self._bias_attr, - shape=param_shape, - #dtype=self._dtype, - is_bias=True) + attr=self._bias_attr, shape=param_shape, is_bias=True) self.bias.stop_gradient = (self._bias_attr is False) or ( self._bias_attr and self._bias_attr.learning_rate == 0.) @@ -654,69 +574,19 @@ def _check_input_dim(self, input): raise NotImplementedError("BatchNorm Base error") def forward(self, input): - # create output - # mean and mean_out share the same memory - # variance and variance out share the same memory - self._check_input_dim(input) - mean_out = self._mean - variance_out = self._variance - - if in_dygraph_mode(): - attrs = ("momentum", self._momentum, "epsilon", self._epsilon, - "is_test", not self.training, "data_layout", - self._data_format, "use_mkldnn", False, "fuse_with_relu", - self._fuse_with_relu, "use_global_stats", - self._track_running_stats, 'trainable_statistics', - self._track_running_stats) - batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( - input, self.weight, self.bias, self._mean, self._variance, - mean_out, variance_out, *attrs) - - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=None) - - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm2d') - - attrs = { - "momentum": self._momentum, - "epsilon": self._epsilon, - "is_test": not self.training, - "data_layout": self._data_format, - "use_mkldnn": False, - "fuse_with_relu": self._fuse_with_relu, - "use_global_stats": self._track_running_stats, - "trainable_statistics": self._track_running_stats, - } - - inputs = { - "X": [input], - "Scale": [self.weight], - "Bias": [self.bias], - "Mean": [self._mean], - "Variance": [self._variance] - } - - saved_mean = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - saved_variance = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( - self._dtype) - - outputs = { - "Y": [batch_norm_out], - "MeanOut": [mean_out], - "VarianceOut": [variance_out], - "SavedMean": [saved_mean], - "SavedVariance": [saved_variance] - } - self._helper.append_op( - type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + self._check_input_dim(input) - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(batch_norm_out, None) + return batch_norm( + input, + self._mean, + self._variance, + weight=self.weight, + bias=self.bias, + training=self.training or not self._track_running_stats, + momentum=self._momentum, + epsilon=self._epsilon, + data_format=self._data_format) class BatchNorm1d(_BatchNormBase): From 5a7eca6a7aa3aed67ae03c2097533850f133814d Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sat, 22 Aug 2020 13:54:13 +0000 Subject: [PATCH 09/18] add norm 2.0 api, test=develop --- python/paddle/nn/functional/norm.py | 19 ++++++------ python/paddle/nn/layer/norm.py | 48 ++++++++++++++--------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 
32743b1a347ad..bea53affe4bfb 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -131,13 +131,13 @@ def batch_norm(x, nn.functional.batch_norm is uesd for nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d. Please use above API for BatchNorm. Parameters: - x(Tesnor): input value. + x(Tesnor): input value. It's data type should be float32, float64. running_mean(Tensor): running mean. running_var(Tensor): running variance. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight(Tensor, optional): The weight tensor of batch_norm. Default: None. - bias(Tensor, optional): The bias tensor of batch_norm. Default: None. + weight(Tensor, optional): The weight tensor of batch_norm, can not be None. Default: None. + bias(Tensor, optional): The bias tensor of batch_norm can not be None. Default: None. training(bool, optional): The actual meaning is the opposite of global status. Defalut False. data_format(str, optional): Specify the input data format. Defalut "NCHW". name(str, optional): Default: None. @@ -169,6 +169,8 @@ def batch_norm(x, assert len(x.shape) >= 2, "input dim must be larger than 1" + assert weight is not None, "the weight must not be None, please use nn.BatchNorm1d, nn.BatchNorm2d or nn.BatchNorm3d" + assert bias is not None, "the bias must not be None, please use nn.BatchNorm1d, nn.BatchNorm2d or nn.BatchNorm3d" # we use not training means use_global_status, more details see nn._BatchNormBase use_global_stats = not training # input ad out must share the memory @@ -236,7 +238,7 @@ def layer_norm(x, see more detail in paddle.nn.LayerNorm Parameters: - x(Tensor): Input Tensor. + x(Tensor): Input Tensor. It's data type should be float32, float64. normalized_shape(int|list|tuple): Input shape from an expected input of size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. If it is a single integer, this module will normalize over the last dimension @@ -329,15 +331,14 @@ def instance_norm(x, See more detail in nn.layer.InstanceNorm2d. Parameters: - num_features(int): Indicate the number of channels of the input ``Tensor``. + x(Tensor): Input Tensor. It's data type should be float32, float64. running_mean(Tensor): running mean. Default None. running_var(Tensor): running variance. Default None. - eps(float, optional): A value added to the denominator for - numerical stability. Default is 1e-5. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - use_input_stats(bool): Default True. weight(Tensor, optional): The weight tensor of instance_norm. Default: None. bias(Tensor, optional): The bias tensor of instance_norm. Default: None. + eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + use_input_stats(bool): Default True. data_format(str, optional): Specify the input data format. Default: NCHW. name(str, optional): Default None. 
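
To round out the functional API cleaned up above, a minimal dygraph sketch of ``paddle.nn.functional.instance_norm``; the explicit per-channel weight and bias tensors are illustrative (the docstring defaults them to None, in which case only the input statistics are used):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.random(size=(2, 3, 4, 4)).astype('float32'))
    # one scale/shift value per channel (3 channels here)
    weight = paddle.to_tensor(np.ones(3).astype('float32'))
    bias = paddle.to_tensor(np.zeros(3).astype('float32'))
    out = paddle.nn.functional.instance_norm(x, weight=weight, bias=bias)
    print(out.numpy())
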
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index eb63e4a9565ca..5037cf9f73250 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -25,6 +25,7 @@ from ...fluid.dygraph import layers +from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode from ...fluid.initializer import Constant @@ -314,10 +315,10 @@ class GroupNorm(layers.Layer): num_groups(int): The number of groups that divided from channels. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05. - weight_attr(ParamAttr, optional): The parameter attribute for the learnable + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable scale :math:`g`. If it is set to False, no scale will be added to the output units. If it is set to None, the bias is initialized one. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the learnable + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable bias :math:`b`. If it is set to False, no bias will be added to the output units. If it is set to None, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. @@ -434,7 +435,7 @@ class LayerNorm(layers.Layer): weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The :attr:`param_attr` is initialized as 1 if it is added. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the learnable + bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The :attr:`bias_attr` is initialized as 0 if it is added. Default: None. name(str, optional): parameter name. Default None. @@ -517,9 +518,8 @@ def __init__(self, self._weight_attr = weight_attr self._bias_attr = bias_attr - assert bias_attr is not False, "bias_attr should not be False in batch_norm." - - #self._dtype = 'float32' + if get_default_dtype() == 'float16': + set_default_dtype('float32') param_shape = [num_features] @@ -629,14 +629,14 @@ class BatchNorm1d(_BatchNormBase): num_features(int): Indicate the number of channels of the input ``Tensor``. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If the Initializer of the weight_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. 
If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train mode, when setting track_running_stats True, the global mean @@ -708,14 +708,14 @@ class BatchNorm2d(_BatchNormBase): num_features(int): Indicate the number of channels of the input ``Tensor``. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If the Initializer of the weight_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train mode, when setting track_running_stats True, the global mean @@ -787,14 +787,14 @@ class BatchNorm3d(_BatchNormBase): num_features(int): Indicate the number of channels of the input ``Tensor``. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If the Initializer of the weight_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. + will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train mode, when setting track_running_stats True, the global mean From 59d75bc6640dae30a0bb6254914c90f804c5ad5b Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sat, 22 Aug 2020 16:00:14 +0000 Subject: [PATCH 10/18] add norm 2.0 api, test=develop --- python/paddle/nn/functional/norm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index bea53affe4bfb..65543cb042314 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -323,7 +323,7 @@ def instance_norm(x, weight=None, bias=None, use_input_stats=True, - momentum=0.1, + momentum=0.9, eps=1e-05, data_format="NCHW", name=None): @@ -363,12 +363,14 @@ def instance_norm(x, """ if in_dygraph_mode(): - out, _, _ = core.ops.instance_norm(x, weight, bias, 'epsilon', eps) + out, _, _ = core.ops.instance_norm(x, weight, bias, "epsilon", eps, + "momentum", momentum, "data_format", + data_format) return out check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") - attrs = {"epsilon": eps} + attrs = {"epsilon": eps, "momentum": momentum, "data_format": data_format} if weight and bias: inputs = {"X": [x], "Scale": [weight], "Bias": [bias]} From ed45c00681905b5986cf3dc29c51fe8aeadd64f5 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 23 Aug 2020 05:14:45 +0000 Subject: [PATCH 11/18] add norm 2.0 api, test=develop --- python/paddle/fluid/tests/unittests/http.log | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/http.log diff --git a/python/paddle/fluid/tests/unittests/http.log b/python/paddle/fluid/tests/unittests/http.log deleted file mode 100644 index e69de29bb2d1d..0000000000000 From 338e6c36d7edbd0c4af1cc917f215c8d3a6ff3d0 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 23 Aug 2020 12:00:35 +0000 Subject: [PATCH 12/18] add norm 2.0 api, test=develop --- python/paddle/nn/functional/norm.py | 39 ++++++++--------- python/paddle/nn/layer/norm.py | 65 ++++++++++++++++++++++++++--- 2 files changed, 77 insertions(+), 27 deletions(-) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 65543cb042314..82a5ea99d6c5a 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -118,8 +118,8 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): def batch_norm(x, running_mean, running_var, - weight=None, - bias=None, + weight, + bias, training=False, momentum=0.9, epsilon=1e-05, @@ -134,10 +134,10 @@ def batch_norm(x, x(Tesnor): input value. It's data type should be float32, float64. running_mean(Tensor): running mean. running_var(Tensor): running variance. + weight(Tensor): The weight tensor of batch_norm, can not be None. + bias(Tensor): The bias tensor of batch_norm can not be None. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. 
momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - weight(Tensor, optional): The weight tensor of batch_norm, can not be None. Default: None. - bias(Tensor, optional): The bias tensor of batch_norm can not be None. Default: None. training(bool, optional): The actual meaning is the opposite of global status. Defalut False. data_format(str, optional): Specify the input data format. Defalut "NCHW". name(str, optional): Default: None. @@ -169,17 +169,21 @@ def batch_norm(x, assert len(x.shape) >= 2, "input dim must be larger than 1" - assert weight is not None, "the weight must not be None, please use nn.BatchNorm1d, nn.BatchNorm2d or nn.BatchNorm3d" - assert bias is not None, "the bias must not be None, please use nn.BatchNorm1d, nn.BatchNorm2d or nn.BatchNorm3d" # we use not training means use_global_status, more details see nn._BatchNormBase use_global_stats = not training # input ad out must share the memory mean_out = running_mean variance_out = running_var + attrs = { + "momentum": momentum, + "epsilon": epsilon, + "data_layout": data_format, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + if in_dygraph_mode(): - attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", - data_format, "use_mkldnn", False, "fuse_with_relu", False, - "use_global_stats", use_global_stats) batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( x, weight, bias, running_mean, running_var, mean_out, variance_out, *attrs) @@ -190,15 +194,6 @@ def batch_norm(x, check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], 'BatchNorm') - attrs = { - "momentum": momentum, - "epsilon": epsilon, - "data_layout": data_format, - "use_mkldnn": False, - "fuse_with_relu": False, - "use_global_stats": use_global_stats, - } - inputs = { "X": [x], "Scale": [weight], @@ -208,11 +203,13 @@ def batch_norm(x, } helper = LayerHelper('batch_norm', **locals()) + + dtype = x.dtype if x.dtype is not 'float16' else 'float32' saved_mean = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True) saved_variance = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) - batch_norm_out = helper.create_variable_for_type_inference(x.dtype) + dtype=dtype, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference(dtype) outputs = { "Y": [batch_norm_out], diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5037cf9f73250..afb192e86b015 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -47,7 +47,9 @@ class _InstanceNormBase(layers.Layer): """ - This class is based class for InstanceNorm1d, 2d, 3d. + This class is based class for InstanceNorm1d, 2d, 3d. + + See InstaceNorm1d, InstanceNorm2d or InstanceNorm3d for more details. """ def __init__(self, @@ -134,9 +136,18 @@ class InstanceNorm1d(_InstanceNormBase): data_format(str, optional): Specify the input data format. Default: NCL. name(str, optional): Default None. + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: 3-D tensor with same shape as input x. + Returns: None. + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + Examples: .. code-block:: python @@ -204,9 +215,16 @@ class InstanceNorm2d(_InstanceNormBase): data_format(str, optional): Specify the input data format. Default: NCHW. 
name(str, optional): Default None. + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + Returns: None. + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + Examples: .. code-block:: python @@ -273,9 +291,16 @@ class InstanceNorm3d(_InstanceNormBase): data_format(str, optional): Specify the input data format. Default: NCDHW. name(str, optional): Default None. + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). + - output: 5-D tensor with same shape as input x. + Returns: None. + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + Examples: .. code-block:: python @@ -301,10 +326,6 @@ def _check_input_dim(self, input): class GroupNorm(layers.Layer): """ - :alias_main: paddle.nn.GroupNorm - :alias: paddle.nn.GroupNorm,paddle.nn.layer.GroupNorm,paddle.nn.layer.norm.GroupNorm - :old_api: paddle.fluid.dygraph.GroupNorm - This interface is used to construct a callable object of the ``GroupNorm`` class. For more details, refer to code examples. It implements the function of the Group Normalization Layer. @@ -324,6 +345,10 @@ class GroupNorm(layers.Layer): data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. name(str, optional): Default None. + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + Returns: None @@ -426,7 +451,7 @@ class LayerNorm(layers.Layer): - :math:`b`: the trainable bias parameter. Parameters: - normalized_shape(int or list or tuple): Input shape from an expected input of + normalized_shape(int|list|tuple): Input shape from an expected input of size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`. If it is a single integer, this module will normalize over the last dimension which is expected to be of that specific size. @@ -440,6 +465,10 @@ class LayerNorm(layers.Layer): :attr:`bias_attr` is initialized as 0 if it is added. Default: None. name(str, optional): parameter name. Default None. + Shape: + - x: 2-D, 3-D, 4-D or 5-D tensor. + - output: same shape as input x. + Returns: None @@ -643,6 +672,16 @@ class BatchNorm1d(_BatchNormBase): and variance are also used during train period. Default: True. name(str, optional): Default: None. + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: 3-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + When inference period, track_running_stats is not effective and will always use global mean and var. The next version will fix the problem . + Returns: None @@ -722,9 +761,16 @@ class BatchNorm2d(_BatchNormBase): and variance are also used during train period. Default: True. name(str, optional): Default: None. + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + Returns: None + **Note**: + When inference period, track_running_stats is not effective and will always use global mean and var. The next version will fix the problem . + Examples: .. code-block:: python @@ -801,9 +847,16 @@ class BatchNorm3d(_BatchNormBase): and variance are also used during train period. Default: True. name(str, optional): Default: None. + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). 
+ - output: 5-D tensor with same shape as input x. + Returns: None + **Note**: + When inference period, track_running_stats is not effective and will always use global mean and var. The next version will fix the problem . + Examples: .. code-block:: python From 24e2cc7c0543edd8b3bac4419e8f707d22224ae2 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 23 Aug 2020 12:02:04 +0000 Subject: [PATCH 13/18] add norm 2.0 api, test=develop --- python/paddle/nn/layer/norm.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index afb192e86b015..3cb39971a978b 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -178,7 +178,6 @@ class InstanceNorm2d(_InstanceNormBase): DataLayout: NCHW `[batch, in_channels, in_height, in_width]` - :math:`input` is the input features over a mini-batch. .. math:: @@ -252,8 +251,7 @@ class InstanceNorm3d(_InstanceNormBase): """ Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . - DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` - + DataLayout: NCHW `[batch, in_channels, dims, in_height, in_width]` :math:`input` is the input features over a mini-batch. @@ -425,10 +423,6 @@ def forward(self, input): class LayerNorm(layers.Layer): """ - :alias_main: paddle.nn.LayerNorm - :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm - :old_api: paddle.fluid.dygraph.LayerNorm - This interface is used to construct a callable object of the ``LayerNorm`` class. For more details, refer to code examples. It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. 
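The Shape notes added above are easiest to verify with a small sketch in the same style as the docstring examples in this series (the tensor sizes and seed are illustrative only):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    np.random.seed(123)

    # LayerNorm normalizes over the trailing dimensions given by normalized_shape,
    # so the output keeps the input shape.
    x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
    x = paddle.to_tensor(x_data)
    layer_norm = paddle.nn.LayerNorm(x_data.shape[1:])
    print(layer_norm(x).shape)  # [2, 2, 2, 3]

    # InstanceNorm2d takes a 4-D NCHW input and also preserves the shape.
    instance_norm = paddle.nn.InstanceNorm2d(2)
    print(instance_norm(x).shape)  # [2, 2, 2, 3]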
From e563081f5e43b8d871d144704627a77400001f84 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 23 Aug 2020 13:24:27 +0000 Subject: [PATCH 14/18] add norm 2.0 api, test=develop --- python/paddle/nn/functional/norm.py | 22 ++++++++++++++-------- python/paddle/nn/layer/norm.py | 23 +++++++++++++++++++---- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 82a5ea99d6c5a..9699a912ebd47 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -174,16 +174,12 @@ def batch_norm(x, # input ad out must share the memory mean_out = running_mean variance_out = running_var - attrs = { - "momentum": momentum, - "epsilon": epsilon, - "data_layout": data_format, - "use_mkldnn": False, - "fuse_with_relu": False, - "use_global_stats": use_global_stats, - } if in_dygraph_mode(): + # for dygraph need tuple + attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", + data_format, "use_mkldnn", False, "fuse_with_relu", False, + "use_global_stats", use_global_stats) batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( x, weight, bias, running_mean, running_var, mean_out, variance_out, *attrs) @@ -194,6 +190,16 @@ def batch_norm(x, check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], 'BatchNorm') + # for static need dict + attrs = { + "momentum": momentum, + "epsilon": epsilon, + "data_layout": data_format, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + inputs = { "X": [x], "Scale": [weight], diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 3cb39971a978b..3116fbf6b821a 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -37,6 +37,7 @@ import numpy as np import numbers +import warnings __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', @@ -178,6 +179,7 @@ class InstanceNorm2d(_InstanceNormBase): DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + :math:`input` is the input features over a mini-batch. .. math:: @@ -251,7 +253,8 @@ class InstanceNorm3d(_InstanceNormBase): """ Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . - DataLayout: NCHW `[batch, in_channels, dims, in_height, in_width]` + DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` + :math:`input` is the input features over a mini-batch. @@ -423,6 +426,10 @@ def forward(self, input): class LayerNorm(layers.Layer): """ + :alias_main: paddle.nn.LayerNorm + :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm + :old_api: paddle.fluid.dygraph.LayerNorm + This interface is used to construct a callable object of the ``LayerNorm`` class. For more details, refer to code examples. It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. @@ -600,6 +607,11 @@ def forward(self, input): self._check_input_dim(input) + if not self.training and not self._track_running_stats: + warnings.warn( + "If both training and track are false, the bn op will run in train mode." + ) + return batch_norm( input, self._mean, @@ -663,7 +675,8 @@ class BatchNorm1d(_BatchNormBase): data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. 
track_running_stats(bool, optional): Whether to use global mean and variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: True. + and variance are also used during train period. When inference mode, track_running_stats + is not effective and will use train mode. Default: True. name(str, optional): Default: None. Shape: @@ -752,7 +765,8 @@ class BatchNorm2d(_BatchNormBase): data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: True. + and variance are also used during train period. When inference mode, track_running_stats + is not effective and will use train mode. Default: True. name(str, optional): Default: None. Shape: @@ -838,7 +852,8 @@ class BatchNorm3d(_BatchNormBase): data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: True. + and variance are also used during train period. When inference mode, track_running_stats + is not effective and will use train mode. Default: True. name(str, optional): Default: None. Shape: From 67c48d8546bfff7cd27c853c19c82e29c83a8058 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 24 Aug 2020 06:25:33 +0000 Subject: [PATCH 15/18] add norm 2.0 api, test=develop --- python/paddle/nn/functional/norm.py | 2 +- python/paddle/nn/layer/norm.py | 78 ++++++++++++++--------------- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 9699a912ebd47..98964412da375 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -138,7 +138,7 @@ def batch_norm(x, bias(Tensor): The bias tensor of batch_norm can not be None. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - training(bool, optional): The actual meaning is the opposite of global status. Defalut False. + training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False. data_format(str, optional): Specify the input data format. Defalut "NCHW". name(str, optional): Default: None. 
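To make the train/inference semantics of the `training` flag concrete, here is a short sketch at the layer level (an illustration, not part of the patch: it assumes the dygraph `train()`/`eval()` switches on `Layer` and the `BatchNorm2d` layer introduced in this series):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    np.random.seed(123)

    x = paddle.to_tensor(np.random.random((4, 3, 8, 8)).astype('float32'))
    bn = paddle.nn.BatchNorm2d(3)

    bn.train()   # training=True: normalize with batch statistics, update running stats
    y_train = bn(x)

    bn.eval()    # training=False: normalize with the tracked global mean and variance
    y_eval = bn(x)

    print(y_train.shape, y_eval.shape)  # both [4, 3, 8, 8]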
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 3116fbf6b821a..65e2714c0bfd2 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -541,7 +541,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format='NCHW', - track_running_stats=False, + track_running_stats=True, name=None): super(_BatchNormBase, self).__init__() self._num_features = num_features @@ -608,9 +608,12 @@ def forward(self, input): self._check_input_dim(input) if not self.training and not self._track_running_stats: + raise ValueError( + 'When inference, expected track_running_stats is True.') + + if self.training and not self._track_running_stats: warnings.warn( - "If both training and track are false, the bn op will run in train mode." - ) + "When training, we now also track global mean and variance.") return batch_norm( input, @@ -618,7 +621,7 @@ def forward(self, input): self._variance, weight=self.weight, bias=self.bias, - training=self.training or not self._track_running_stats, + training=self.training, momentum=self._momentum, epsilon=self._epsilon, data_format=self._data_format) @@ -665,18 +668,17 @@ class BatchNorm1d(_BatchNormBase): epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. - If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. - If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. - track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. When inference mode, track_running_stats - is not effective and will use train mode. Default: True. + track_running_stats(bool, optional): Whether to use global mean and variance. In train period, + True will track global mean and variance used for inference. When inference, track_running_stats must be + True. Default: True. name(str, optional): Default: None. Shape: @@ -687,10 +689,8 @@ class BatchNorm1d(_BatchNormBase): None. **Note**: - When inference period, track_running_stats is not effective and will always use global mean and var. The next version will fix the problem . 
- - Returns: - None + Now track_running_stats is actually always True. The next version will fix the problem. + Examples: .. code-block:: python @@ -755,18 +755,17 @@ class BatchNorm2d(_BatchNormBase): epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. - If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. - If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. - track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. When inference mode, track_running_stats - is not effective and will use train mode. Default: True. + track_running_stats(bool, optional): Whether to use global mean and variance. In train period, + True will track global mean and variance used for inference. When inference, track_running_stats must be + True. Default: True. name(str, optional): Default: None. Shape: @@ -777,7 +776,7 @@ None **Note**: - When inference period, track_running_stats is not effective and will always use global mean and var. The next version will fix the problem . + Now track_running_stats is actually always True. The next version will fix the problem. Examples: .. code-block:: python @@ -842,18 +841,17 @@ class BatchNorm3d(_BatchNormBase): epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable.
+ If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. - If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. - track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. When inference mode, track_running_stats - is not effective and will use train mode. Default: True. + track_running_stats(bool, optional): Whether to use global mean and variance. In train period, + True will track global mean and variance used for inference. When inference, track_running_stats must be + True. Default: True. name(str, optional): Default: None. Shape: @@ -864,7 +862,7 @@ class BatchNorm3d(_BatchNormBase): None **Note**: - When inference period, track_running_stats is not effective and will always use global mean and var. The next version will fix the problem . + Now track_running_stats is actucal always true. The next version will fix the problem . Examples: .. code-block:: python From 5fa037c7cb4583d508ddf26b89a4a77b15e2e4bb Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 24 Aug 2020 15:29:01 +0000 Subject: [PATCH 16/18] add norm 2.0 api, test=develop --- python/paddle/nn/layer/norm.py | 252 +++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 python/paddle/nn/layer/norm.py diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py new file mode 100644 index 0000000000000..369d462a8089a --- /dev/null +++ b/python/paddle/nn/layer/norm.py @@ -0,0 +1,252 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# TODO: define normalization api + +import warnings +from ...fluid.dygraph.nn import InstanceNorm + +from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS +from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS +from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS +from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS + +from ...fluid.dygraph import layers +from ...fluid.framework import in_dygraph_mode + +from ...fluid.initializer import Constant +from ...fluid.param_attr import ParamAttr +from ...fluid.data_feeder import check_variable_and_dtype, check_type +from ...fluid import core + +__all__ = [ + 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', + 'SyncBatchNorm' +] + + +class SyncBatchNorm(layers.Layer): + """ + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected + operations. + The data is normalized by the mean and variance of the channel based on whole mini-batch + , which including data in all gpus. + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When model in training mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. + Calculated as follows: + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + + - :math:`x` : whole mini-batch data in all gpus + - :math:`m` : the size of the whole mini-batch data + + When model in evaluation mode, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + which usually got from the pre-trained model). Global statistics calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The formula of normalization is as follows: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + - :math:`\\eps` : add a smaller value to the variance to prevent division by zero + - :math:`\\gamma` : trainable scale parameter vector + - :math:`\\beta` : trainable shift parameter vector + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of this layer. If it is set to None or one attribute of ParamAttr, this layerr + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. If it is set to False, + this layer will not have trainable scale parameter. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. 
+ If it is set to None or one attribute of ParamAttr, this layer + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. If it is set to False, this layer will not + have trainable bias parameter. Default: None. + track_running_stats(bool, optional): Whether to compute global stats, which including running mean and + running variance. Default: True. + + Shapes: + input: Tensor that the dimension from 2 to 5. + output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + + x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + paddle.disable_static() + x = paddle.to_tensor(x) + if paddle.fluid.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1.numpy()) + # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] + """ + + def __init__(self, + num_features, + epsilon=1e-05, + momentum=0.9, + track_running_stats=True, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None): + super(SyncBatchNorm, self).__init__() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._num_features = num_features + self._data_layout = data_format + self._momentum = momentum + self._epsilon = epsilon + self._track_running_stats = track_running_stats + + if self._track_running_stats == False: + warnings.warn( + "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version." + ) + + param_shape = [self._num_features] + + # create parameter + if weight_attr == False: + self.weight = self.create_parameter( + attr=None, shape=param_shape, default_initializer=Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. + + if bias_attr == False: + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
+ + self._mean = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(0.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=None, + initializer=Constant(1.0), + trainable=False, + do_model_average=True), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + def forward(self, x): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + ### train mode: use mini-batch stats, eval mode: use global stats + ### use_global_stats only support False in sync_batch_norm + if in_dygraph_mode(): + attrs = ("momentum", self._momentum, "epsilon", self._epsilon, + "is_test", not self.training, "data_layout", + self._data_layout, "use_mkldnn", False, "fuse_with_relu", + False, "use_global_stats", False, 'trainable_statistics', + False) + sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm( + x, self.weight, self.bias, self._mean, self._variance, mean_out, + variance_out, *attrs) + + return sync_batch_norm_out + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'BatchNorm') + + attrs = { + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": not self.training, + "data_layout": self._data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": False, + "trainable_statistics": False, + } + + inputs = { + "X": [x], + "Scale": [self.weight], + "Bias": [self.bias], + "Mean": [self._mean], + "Variance": [self._variance] + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + sync_batch_norm_out = self._helper.create_variable_for_type_inference( + self._dtype) + + outputs = { + "Y": [sync_batch_norm_out], + "MeanOut": [mean_out], + "VarianceOut": [variance_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance] + } + + self._helper.append_op( + type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + return sync_batch_norm_out From d679e7991e498ac2a564704e982c3f5622017e62 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 24 Aug 2020 15:35:56 +0000 Subject: [PATCH 17/18] add norm 2.0 api, test=develop --- python/paddle/nn/layer/norm.py | 875 ++++++++++++++++++++++++++++++++- 1 file changed, 869 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 369d462a8089a..046e55fb7300f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -1,4 +1,17 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,28 +27,878 @@ # TODO: define normalization api -import warnings from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS -from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS -from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS +#from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS + +#from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS +from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS from ...fluid.dygraph import layers + +from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode from ...fluid.initializer import Constant from ...fluid.param_attr import ParamAttr from ...fluid.data_feeder import check_variable_and_dtype, check_type -from ...fluid import core +from ...fluid import core, dygraph_utils + +from ..functional import batch_norm, layer_norm, instance_norm + +import numpy as np +import numbers +import warnings __all__ = [ 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', - 'SyncBatchNorm' + 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', + 'InstanceNorm2d', 'InstanceNorm3d', 'SyncBatchNorm' ] +class _InstanceNormBase(layers.Layer): + """ + This class is based class for InstanceNorm1d, 2d, 3d. + + See InstaceNorm1d, InstanceNorm2d or InstanceNorm3d for more details. + """ + + def __init__(self, + num_features, + epsilon=1e-5, + momentum=0.9, + weight_attr=None, + bias_attr=None, + track_running_stats=False, + data_format="NCHW", + name=None): + super(_InstanceNormBase, self).__init__() + + if weight_attr == False or bias_attr == False: + assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + self._epsilon = epsilon + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + if weight_attr != False and bias_attr != False: + self.scale = self.create_parameter( + attr=self._weight_attr, + shape=[num_features], + default_initializer=Constant(1.0), + is_bias=False) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=[num_features], + default_initializer=Constant(0.0), + is_bias=True) + else: + self.scale = None + self.bias = None + + def _check_input_dim(self, input): + raise NotImplementedError("InstanceNorm Base error") + + def forward(self, input): + self._check_input_dim(input) + + return instance_norm( + input, weight=self.scale, bias=self.bias, eps=self._epsilon) + + +class InstanceNorm1d(_InstanceNormBase): + """ + Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCL `[batch, in_channels, length]` + + :math:`input` is the input features over a mini-batch. + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCL. + name(str, optional): Default None. + + + Shape: + - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length). + - output: 3-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm1d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + + """ + + def _check_input_dim(self, input): + if len(input.shape) != 2 and len(input.shape) != 3: + raise ValueError('expected 2D or 3D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm2d(_InstanceNormBase): + """ + Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCHW `[batch, in_channels, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCHW. + name(str, optional): Default None. + + Shape: + - x: 4-D tensor with shape: (batch, num_features, height, weight). + - output: 4-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm2d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 4: + raise ValueError('expected 4D input (got {}D input)'.format( + len(input.shape))) + + +class InstanceNorm3d(_InstanceNormBase): + """ + Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + + DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` + + + :math:`input` is the input features over a mini-batch. + + .. 
math:: + + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ + \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + Note: + `H` means height of feature map, `W` means width of feature map. + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + track_running_stats(bool, optional): Whether to use global mean and + variance. In train mode, when setting track_running_stats True, the global mean + and variance are also used during train period. Default: False. + weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. + If the Initializer of the weight_attr is not set, the parameter is initialized + one. If it is set to False, will not create weight_attr. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. + If it is set to None or one attribute of ParamAttr, instance_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + If it is set to False, will not create bias_attr. Default: None. + data_format(str, optional): Specify the input data format. Default: NCDHW. + name(str, optional): Default None. + + Shape: + - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). + - output: 5-D tensor with same shape as input x. + + Returns: + None. + + **Note**: + Momentum and track_running_stats is not effective. The next version will fix the problem . + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + np.random.seed(123) + x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') + x = paddle.to_tensor(x_data) + instance_norm = paddle.nn.InstanceNorm3d(2) + instance_norm_out = instance_norm(x) + + print(instance_norm_out.numpy) + """ + + def _check_input_dim(self, input): + if len(input.shape) != 5: + raise ValueError('expected 5D input (got {}D input)'.format( + len(input.shape))) + + +class GroupNorm(layers.Layer): + """ + This interface is used to construct a callable object of the ``GroupNorm`` class. + For more details, refer to code examples. + It implements the function of the Group Normalization Layer. + Refer to `Group Normalization `_ . + + Parameters: + num_channels(int): The number of channels of input. + num_groups(int): The number of groups that divided from channels. + epsilon(float, optional): The small value added to the variance to prevent + division by zero. Default: 1e-05. + weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable + scale :math:`g`. If it is set to False, no scale will be added to the output units. + If it is set to None, the bias is initialized one. Default: None. 
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            bias :math:`b`. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.
+        name(str, optional): Default None.
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, width).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32')
+          x = paddle.to_tensor(x_data)
+          group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6)
+          group_norm_out = group_norm(x)
+
+          print(group_norm_out.numpy())
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_groups,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 name=None):
+        super(GroupNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._epsilon = epsilon
+        self._num_channels = num_channels
+        self._num_groups = num_groups
+        if data_format != 'NCHW':
+            raise ValueError("unsupported data layout:" + data_format)
+
+        param_shape = [self._num_channels]
+
+        self.weight = self.create_parameter(
+            attr=self._weight_attr or False,
+            shape=param_shape,
+            default_initializer=Constant(1.0))
+
+        self.bias = self.create_parameter(
+            attr=self._bias_attr or False, shape=param_shape, is_bias=True)
+
+    def forward(self, input):
+        inputs = {'X': input}
+        if self.bias is not None:
+            inputs['Bias'] = self.bias
+        if self.weight is not None:
+            inputs['Scale'] = self.weight
+
+        # create output
+        mean_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype, stop_gradient=True)
+        variance_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype, stop_gradient=True)
+        group_norm_out = self._helper.create_variable_for_type_inference(
+            dtype=input.dtype)
+
+        self._helper.append_op(
+            type="group_norm",
+            inputs=inputs,
+            outputs={
+                "Y": group_norm_out,
+                "Mean": mean_out,
+                "Variance": variance_out,
+            },
+            attrs={"epsilon": self._epsilon,
+                   "groups": self._num_groups})
+
+        return self._helper.append_activation(group_norm_out, None)
+
+
+class LayerNorm(layers.Layer):
+    """
+    :alias_main: paddle.nn.LayerNorm
+    :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
+    :old_api: paddle.fluid.dygraph.LayerNorm
+
+    This interface is used to construct a callable object of the ``LayerNorm`` class.
+    For more details, refer to code examples.
+    It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
+    Refer to `Layer Normalization `_
+
+    The formula is as follows:
+
+    .. math::
+
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i
+
+        \\sigma & = \\sqrt{\\frac{1}{H}\\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}
+
+        y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)
+
+    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
+    - :math:`H`: the number of hidden units in a layer
+    - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
+    - :math:`g`: the trainable scale parameter.
+    - :math:`b`: the trainable bias parameter.
+
+    Parameters:
+        normalized_shape(int|list|tuple): Input shape from an expected input of
+            size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
+            If it is a single integer, this module will normalize over the last dimension
+            which is expected to be of that specific size.
+        epsilon(float, optional): The small value added to the variance to prevent
+            division by zero. Default: 1e-05.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            gain :math:`g`. If False, weight is None. If is None, a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable
+            bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The
+            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
+        name(str, optional): parameter name. Default None.
+
+    Shape:
+        - x: 2-D, 3-D, 4-D or 5-D tensor.
+        - output: same shape as input x.
+
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data)
+          layer_norm = paddle.nn.LayerNorm(x_data.shape[1:])
+          layer_norm_out = layer_norm(x)
+
+          print(layer_norm_out.numpy())
+    """
+
+    def __init__(self,
+                 normalized_shape,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 name=None):
+        super(LayerNorm, self).__init__()
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = [normalized_shape]
+
+        self._normalized_shape = list(normalized_shape)
+        self._epsilon = epsilon
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        param_shape = [np.prod(self._normalized_shape)]
+
+        if weight_attr is False:
+            self.weight = None
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+
+        if bias_attr is False:
+            self.bias = None
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+
+    def forward(self, input):
+        return layer_norm(
+            input,
+            normalized_shape=self._normalized_shape,
+            weight=self.weight,
+            bias=self.bias,
+            epsilon=self._epsilon)
+
+
+class _BatchNormBase(layers.Layer):
+    """
+    BatchNorm base.
+    """
+
+    def __init__(self,
+                 num_features,
+                 momentum=0.9,
+                 epsilon=1e-05,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 track_running_stats=True,
+                 name=None):
+        super(_BatchNormBase, self).__init__()
+        self._num_features = num_features
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+
+        if get_default_dtype() == 'float16':
+            set_default_dtype('float32')
+
+        param_shape = [num_features]
+
+        # create parameter
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=param_shape,
+            default_initializer=Constant(1.0))
+        self.weight.stop_gradient = (self._weight_attr is False) or (
+            self._weight_attr and self._weight_attr.learning_rate == 0.)
+
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=param_shape, is_bias=True)
+        self.bias.stop_gradient = (self._bias_attr is False) or (
+            self._bias_attr and self._bias_attr.learning_rate == 0.)
+
+        moving_mean_name = None
+        moving_variance_name = None
+
+        if name is not None:
+            moving_mean_name = name + "_mean"
+            moving_variance_name = name + "_variance"
+
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_mean_name,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=moving_variance_name,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+        self._data_format = data_format
+        self._in_place = False
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._fuse_with_relu = False
+        self._track_running_stats = track_running_stats
+
+    def _check_input_dim(self, input):
+        raise NotImplementedError("BatchNorm Base error")
+
+    def forward(self, input):
+
+        self._check_input_dim(input)
+
+        if not self.training and not self._track_running_stats:
+            raise ValueError(
+                'track_running_stats must be True for inference.')
+
+        if self.training and not self._track_running_stats:
+            warnings.warn(
+                "When training, we now always track global mean and variance.")
+
+        return batch_norm(
+            input,
+            self._mean,
+            self._variance,
+            weight=self.weight,
+            bias=self.bias,
+            training=self.training,
+            momentum=self._momentum,
+            epsilon=self._epsilon,
+            data_format=self._data_format)
+
+
+class BatchNorm1d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputs with an additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    .. math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance),
+    which are usually obtained from a pre-trained model. Calculated as follows:
+
+    .. math::
+
+        moving\\_mean = moving\\_mean * momentum + \\mu_{\\beta} * (1. - momentum) \\quad &// global\ mean \\\\
+        moving\\_variance = moving\\_variance * momentum + \\sigma_{\\beta}^{2} * (1. - momentum) \\quad &// global\ variance \\\\
+
+    The normalization function formula is as follows:
+
+    .. math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : a small value added to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter
+    - :math:`\\beta` : trainable shift parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period,
+            True will track global mean and variance used for inference. When inference, track_running_stats must be
+            True. Default: True.
+        name(str, optional): Default: None.
+
+    Shape:
+        - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
+        - output: 2-D or 3-D tensor with same shape as input x.
+
+    Returns:
+        None.
+
+    **Note**:
+        Now track_running_stats is actually always True. The next version will fix the problem.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 3)).astype('float32')
+          x = paddle.to_tensor(x_data)
+          batch_norm = paddle.nn.BatchNorm1d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy())
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 2 and len(input.shape) != 3:
+            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class BatchNorm2d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with an additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    .. math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance),
+    which are usually obtained from a pre-trained model. Calculated as follows:
+
+    .. math::
+
+        moving\\_mean = moving\\_mean * momentum + \\mu_{\\beta} * (1. - momentum) \\quad &// global\ mean \\\\
+        moving\\_variance = moving\\_variance * momentum + \\sigma_{\\beta}^{2} * (1. - momentum) \\quad &// global\ variance \\\\
+
+    The normalization function formula is as follows:
+
+    .. math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : a small value added to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter
+    - :math:`\\beta` : trainable shift parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period,
+            True will track global mean and variance used for inference. When inference, track_running_stats must be
+            True. Default: True.
+        name(str, optional): Default: None.
+
+    Shape:
+        - x: 4-D tensor with shape: (batch, num_features, height, width).
+        - output: 4-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    **Note**:
+        Now track_running_stats is actually always True. The next version will fix the problem.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data)
+          batch_norm = paddle.nn.BatchNorm2d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy())
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 4:
+            raise ValueError('expected 4D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class BatchNorm3d(_BatchNormBase):
+    """
+    Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs with an additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.
+
+    When track_running_stats = False, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+    Calculated as follows:
+
+    .. math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    When track_running_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global or running statistics (moving_mean and moving_variance),
+    which are usually obtained from a pre-trained model. Calculated as follows:
+
+    .. math::
+
+        moving\\_mean = moving\\_mean * momentum + \\mu_{\\beta} * (1. - momentum) \\quad &// global\ mean \\\\
+        moving\\_variance = moving\\_variance * momentum + \\sigma_{\\beta}^{2} * (1. - momentum) \\quad &// global\ variance \\\\
+
+    The normalization function formula is as follows:
+
+    .. math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : a small value added to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter
+    - :math:`\\beta` : trainable shift parameter
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
+            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
+        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        track_running_stats(bool, optional): Whether to use global mean and variance. In train period,
+            True will track global mean and variance used for inference. When inference, track_running_stats must be
+            True. Default: True.
+        name(str, optional): Default: None.
+
+    Shape:
+        - x: 5-D tensor with shape: (batch, num_features, depth, height, width).
+        - output: 5-D tensor with same shape as input x.
+
+    Returns:
+        None
+
+    **Note**:
+        Now track_running_stats is actually always True. The next version will fix the problem.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()
+          np.random.seed(123)
+          x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32')
+          x = paddle.to_tensor(x_data)
+          batch_norm = paddle.nn.BatchNorm3d(1)
+          batch_norm_out = batch_norm(x)
+
+          print(batch_norm_out.numpy())
+    """
+
+    def _check_input_dim(self, input):
+        if len(input.shape) != 5:
+            raise ValueError('expected 5D input (got {}D input)'.format(
+                len(input.shape)))
+
+
+class SyncBatchNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
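The BatchNorm1d/2d/3d docstrings above all share the same two sets of formulas: per-mini-batch statistics when track_running_stats is False, and exponentially averaged moving statistics otherwise. The sketch below is an editor's NumPy illustration of those equations only, not the Paddle implementation; the function name, argument layout, and the assumption of NCHW input are illustrative.

.. code-block:: python

    import numpy as np

    def batch_norm_reference(x, gamma, beta, moving_mean, moving_var,
                             momentum=0.9, epsilon=1e-5, training=True):
        # x: (N, C, H, W); statistics are computed per channel over N, H, W.
        if training:
            # statistics of one mini-batch
            mean = x.mean(axis=(0, 2, 3))
            var = x.var(axis=(0, 2, 3))
            # moving statistics, as in the docstring formulas:
            # moving_mean = moving_mean * momentum + mu * (1 - momentum)
            moving_mean = moving_mean * momentum + mean * (1. - momentum)
            moving_var = moving_var * momentum + var * (1. - momentum)
        else:
            # inference uses the tracked global statistics
            mean, var = moving_mean, moving_var
        shape = (1, -1, 1, 1)
        x_hat = (x - mean.reshape(shape)) / np.sqrt(var.reshape(shape) + epsilon)
        y = gamma.reshape(shape) * x_hat + beta.reshape(shape)
        return y, moving_mean, moving_var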
From 4940e47b1679569fb7fe6d810c49d0fe82be5499 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 24 Aug 2020 16:00:33 +0000 Subject: [PATCH 18/18] add norm 2.0 api, test=develop --- python/paddle/nn/functional/__init__.py | 1 + python/paddle/nn/functional/norm.py | 10 ++++----- python/paddle/nn/layer/norm.py | 27 ++++++++++++------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 019e2ad34b3a2..1e14b1bc34fcf 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -159,6 +159,7 @@ from .loss import ctc_loss #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS # from .norm import group_norm #DEFINE_ALIAS +from .norm import l2_normalize #DEFINE_ALIAS from .norm import batch_norm #DEFINE_ALIAS from .norm import instance_norm #DEFINE_ALIAS from .norm import layer_norm #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 98964412da375..13e86e5712a1c 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -139,8 +139,8 @@ def batch_norm(x, epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False. - data_format(str, optional): Specify the input data format. Defalut "NCHW". - name(str, optional): Default: None. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW". + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Returns: None @@ -250,7 +250,7 @@ def layer_norm(x, division by zero. Default: 1e-05. weight(Tensor, optional): The weight tensor of batch_norm. Default: None. bias(Tensor, optional): The bias tensor of batch_norm. Default: None. - name(str, optional): Default None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Returns: None @@ -342,8 +342,8 @@ def instance_norm(x, eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. use_input_stats(bool): Default True. - data_format(str, optional): Specify the input data format. Default: NCHW. - name(str, optional): Default None. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Returns: None. 
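The functional interface documented in the hunk above is what the layer classes call internally (see _BatchNormBase.forward earlier in this series). A minimal dygraph sketch of that call pattern follows; the per-channel statistics tensors and their values are made up for illustration, and the exact keyword names should be checked against the merged API.

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    np.random.seed(123)
    x = paddle.to_tensor(np.random.random((4, 3, 8, 8)).astype('float32'))

    # one value per channel, mirroring the layer's internal parameters
    running_mean = paddle.to_tensor(np.zeros(3, dtype='float32'))
    running_var = paddle.to_tensor(np.ones(3, dtype='float32'))
    weight = paddle.to_tensor(np.ones(3, dtype='float32'))
    bias = paddle.to_tensor(np.zeros(3, dtype='float32'))

    y = F.batch_norm(
        x, running_mean, running_var,
        weight=weight, bias=bias,
        training=True, momentum=0.9, epsilon=1e-5, data_format='NCHW')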
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 046e55fb7300f..c7855b23bf6e6 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -34,7 +34,6 @@ #from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS -from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS from ...fluid.dygraph import layers @@ -147,8 +146,8 @@ class InstanceNorm1d(_InstanceNormBase): will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. - data_format(str, optional): Specify the input data format. Default: NCL. - name(str, optional): Default None. + data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL". + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: @@ -226,8 +225,8 @@ class InstanceNorm2d(_InstanceNormBase): will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. - data_format(str, optional): Specify the input data format. Default: NCHW. - name(str, optional): Default None. + data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - x: 4-D tensor with shape: (batch, num_features, height, weight). @@ -302,8 +301,8 @@ class InstanceNorm3d(_InstanceNormBase): will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. - data_format(str, optional): Specify the input data format. Default: NCDHW. - name(str, optional): Default None. + data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW. + name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - x: 5-D tensor with shape: (batch, num_features, dims, height, weight). @@ -357,7 +356,7 @@ class GroupNorm(layers.Layer): bias :math:`b`. If it is set to False, no bias will be added to the output units. If it is set to None, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. - name(str, optional): Default None. + name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - x: 4-D tensor with shape: (batch, num_features, height, weight). @@ -477,7 +476,7 @@ class LayerNorm(layers.Layer): bias_attr(ParamAttr|bool, optional): The parameter attribute for the learnable bias :math:`b`. If is False, bias is None. If is None, a default :code:`ParamAttr` would be added as bias. The :attr:`bias_attr` is initialized as 0 if it is added. Default: None. - name(str, optional): parameter name. Default None. + name(str, optional): Name for the LayerNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - x: 2-D, 3-D, 4-D or 5-D tensor. 
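The instance-norm docstrings touched above compute the mean and variance of each feature map separately for every sample, which is what distinguishes them from batch norm. As a reading aid, here is a small NumPy reference of that per-feature-map statistic for the 4-D (NCHW) case; it is an editor's sketch, not code from this patch.

.. code-block:: python

    import numpy as np

    def instance_norm_reference(x, gamma, beta, epsilon=1e-5):
        # x: (N, C, H, W). Statistics are taken over each H*W feature map,
        # independently for every sample and channel, per the docstring formula.
        mean = x.mean(axis=(2, 3), keepdims=True)
        var = x.var(axis=(2, 3), keepdims=True)
        x_hat = (x - mean) / np.sqrt(var + epsilon)
        return gamma.reshape(1, -1, 1, 1) * x_hat + beta.reshape(1, -1, 1, 1)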
@@ -688,11 +687,11 @@ class BatchNorm1d(_BatchNormBase):
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
             If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
-        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        data_format(str, optional): Specify the input data format, which can be "NC" or "NCL". Default: "NCL".
         track_running_stats(bool, optional): Whether to use global mean and variance. In train period,
             True will track global mean and variance used for inference. When inference, track_running_stats must be
             True. Default: True.
-        name(str, optional): Default: None.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
 
     Shape:
         - x: 2-D or 3-D tensor with shape: (batch, num_features) or (batch, num_features, length).
@@ -779,7 +778,7 @@ class BatchNorm2d(_BatchNormBase):
         track_running_stats(bool, optional): Whether to use global mean and variance. In train period,
             True will track global mean and variance used for inference. When inference, track_running_stats must be
             True. Default: True.
-        name(str, optional): Default: None.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
 
     Shape:
         - x: 4-D tensor with shape: (batch, num_features, height, width).
@@ -861,11 +860,11 @@ class BatchNorm3d(_BatchNormBase):
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
             If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
-        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+        data_format(str, optional): Specify the input data format, the data format can be "NCDHW". Default: NCDHW.
         track_running_stats(bool, optional): Whether to use global mean and variance. In train period,
             True will track global mean and variance used for inference. When inference, track_running_stats must be
             True. Default: True.
-        name(str, optional): Default: None.
+        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.
 
     Shape:
         - x: 5-D tensor with shape: (batch, num_features, depth, height, width).
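Taken together, the layer classes in this series differ mainly in the input rank they accept and the data format they document. A short dygraph usage sketch follows, assuming the API as introduced in this series; the variable names (bn1d, x1, ...) and shapes are illustrative only.

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    np.random.seed(123)

    bn1d = paddle.nn.BatchNorm1d(4)   # expects (N, C) or (N, C, L)
    bn2d = paddle.nn.BatchNorm2d(4)   # expects (N, C, H, W)
    bn3d = paddle.nn.BatchNorm3d(4)   # expects (N, C, D, H, W)

    x1 = paddle.to_tensor(np.random.random((2, 4, 8)).astype('float32'))
    x2 = paddle.to_tensor(np.random.random((2, 4, 8, 8)).astype('float32'))
    x3 = paddle.to_tensor(np.random.random((2, 4, 4, 8, 8)).astype('float32'))

    y1, y2, y3 = bn1d(x1), bn2d(x2), bn3d(x3)

    # Switch to evaluation so the tracked moving_mean / moving_variance are used.
    bn2d.eval()
    y2_eval = bn2d(x2)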