From 07bcdccdfc99f56db82ec58caada24b75cd18967 Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Mon, 5 Dec 2022 09:39:57 +0000 Subject: [PATCH 01/15] move BatchNorm from flud.dygraph.nn to paddle.nn.layer.norm --- .../slim/tests/test_imperative_skip_op.py | 1 + python/paddle/fluid/dygraph/nn.py | 306 ----------------- .../unittests/dygraph_to_static/darknet.py | 2 +- .../dygraph_to_static/test_cycle_gan.py | 4 +- .../dygraph_to_static/test_resnet.py | 3 +- .../dygraph_to_static/test_se_resnet.py | 3 +- .../unittests/dygraph_to_static/test_tsm.py | 3 +- .../unittests/mlu/test_batch_norm_op_mlu.py | 8 +- .../mlu/test_batch_norm_op_mlu_v2.py | 8 +- .../unittests/npu/test_batch_norm_op_npu.py | 4 +- .../tests/unittests/test_batch_norm_op_v2.py | 13 +- .../test_imperative_load_static_param.py | 2 +- .../test_imperative_ocr_attention_model.py | 2 +- .../tests/unittests/test_imperative_resnet.py | 3 +- .../unittests/test_imperative_se_resnext.py | 3 +- .../unittests/xpu/test_batch_norm_op_xpu.py | 3 +- python/paddle/nn/layer/norm.py | 310 +++++++++++++++++- 17 files changed, 343 insertions(+), 335 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 131866095ad7b..3b19ee76c3554 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 from paddle.nn import Linear, Conv2D, Softmax, BatchNorm +from paddle.nn.norm import BatchNorm from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 4c8b9d7f555f1..9cced53b6edd9 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -52,7 +52,6 @@ 'Conv3D', 'Pool2D', 'Linear', - 'BatchNorm', 'Dropout', 'Embedding', 'GRUUnit', @@ -1031,311 +1030,6 @@ def forward(self, input): return instance_norm_out -class BatchNorm(layers.Layer): - r""" - - This interface is used to construct a callable object of the ``BatchNorm`` class. - For more details, refer to code examples. - It implements the function of the Batch Normalization Layer and can be used - as a normalizer function for conv2d and fully connected operations. - The data is normalized by the mean and variance of the channel based on the current batch data. - Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift `_ - for more details. - - When use_global_stats = False, the :math:`\mu_{\beta}` - and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. - Calculated as follows: - - .. math:: - - \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & - //\ mini-batch\ mean \\ - \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & - //\ mini-batch\ variance \\ - - - :math:`x` : mini-batch data - - :math:`m` : the size of the mini-batch data - - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. - They are global or running statistics (moving_mean and moving_variance). It usually got from the - pre-trained model. Calculated as follows: - - .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. 
- momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ - - The normalization function formula is as follows: - - .. math:: - - \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ - \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ - y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - - - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\gamma` : trainable proportional parameter - - :math:`\beta` : trainable deviation parameter - - Parameters: - num_channels(int): Indicate the number of channels of the input ``Tensor``. - act(str, optional): Activation to be applied to the output of batch normalization. Default: None. - is_test (bool, optional): A flag indicating whether it is in test phrase or not. - This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``. - Default: False. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. - param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - dtype(str, optional): Indicate the data type of the input ``Tensor``, - which can be float32 or float64. Default: float32. - data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. - in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. - moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. - moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. - do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model - average when model average is enabled. Default: True. - use_global_stats(bool, optional): Whether to use global mean and - variance. In inference or test mode, set use_global_stats to true - or is_test to true, and the behavior is equivalent. - In train mode, when setting use_global_stats True, the global mean - and variance are also used during train period. Default: False. - trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when - setting trainable_statistics True, mean and variance will be calculated by current batch statistics. - Default: False. - - Returns: - None - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - import numpy as np - - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') - with fluid.dygraph.guard(): - x = to_variable(x) - batch_norm = fluid.BatchNorm(10) - hidden1 = batch_norm(x) - """ - - def __init__( - self, - num_channels, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - dtype='float32', - data_layout='NCHW', - in_place=False, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - trainable_statistics=False, - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - - assert ( - bias_attr is not False - ), "bias_attr should not be False in batch_norm." - - if dtype == "float16": - self._dtype = "float32" - else: - self._dtype = dtype - - param_shape = [num_channels] - - # create parameter - self.weight = self.create_parameter( - attr=self._param_attr, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(1.0), - ) - self.weight.stop_gradient = ( - use_global_stats and self._param_attr.learning_rate == 0.0 - ) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=True, - ) - self.bias.stop_gradient = ( - use_global_stats and self._param_attr.learning_rate == 0.0 - ) - - self._mean = self.create_parameter( - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=self._dtype, - ) - self._mean.stop_gradient = True - - self._variance = self.create_parameter( - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=self._dtype, - ) - self._variance.stop_gradient = True - - self._in_place = in_place - self._data_layout = data_layout - self._momentum = momentum - self._epsilon = epsilon - self._is_test = is_test - self._fuse_with_relu = False - self._use_global_stats = use_global_stats - self._trainable_statistics = trainable_statistics - - def forward(self, input): - # create output - # mean and mean_out share the same memory - mean_out = self._mean - # variance and variance out share the same memory - variance_out = self._variance - - if _non_static_mode(): - if in_dygraph_mode(): - batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( - input, - self._mean, - self._variance, - self.weight, - self.bias, - not self.training, - self._momentum, - self._epsilon, - self._data_layout, - self._use_global_stats, - self._trainable_statistics, - ) - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn - ) - - elif _in_legacy_dygraph(): - attrs = ( - "momentum", - self._momentum, - "epsilon", - self._epsilon, - "is_test", - not self.training, - "data_layout", - self._data_layout, - "use_mkldnn", - self._use_mkldnn, - "fuse_with_relu", - self._fuse_with_relu, - "use_global_stats", - self._use_global_stats, - 'trainable_statistics', - self._trainable_statistics, - ) - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, - self.weight, - self.bias, - self._mean, - self._variance, - None, - mean_out, - variance_out, - *attrs - ) - - return 
dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn - ) - - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' - ) - - attrs = { - "momentum": self._momentum, - "epsilon": self._epsilon, - "is_test": self._is_test, - "data_layout": self._data_layout, - "use_mkldnn": False, - "fuse_with_relu": self._fuse_with_relu, - "use_global_stats": self._use_global_stats, - "trainable_statistics": self._trainable_statistics, - } - - inputs = { - "X": [input], - "Scale": [self.weight], - "Bias": [self.bias], - "Mean": [self._mean], - "Variance": [self._variance], - } - - saved_mean = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - saved_variance = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - reserve_space = self._helper.create_variable_for_type_inference( - dtype=self._helper.input_dtype(input), stop_gradient=True - ) - - batch_norm_out = ( - input - if self._in_place - else self._helper.create_variable_for_type_inference(self._dtype) - ) - - outputs = { - "Y": [batch_norm_out], - "MeanOut": [mean_out], - "VarianceOut": [variance_out], - "SavedMean": [saved_mean], - "SavedVariance": [saved_variance], - } - if reserve_space is not None: - outputs["ReserveSpace"] = [reserve_space] - - self._helper.append_op( - type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs - ) - - # Currently, we don't support inplace in dygraph mode - return self._helper.append_activation(batch_norm_out, self._act) - - class Dropout(layers.Layer): """ This interface is used to construct a callable object of the ``Dropout`` class. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index b1cb22c57008d..1b6879e589df9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -17,7 +17,7 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.regularizer import L2Decay -from paddle.fluid.dygraph.nn import BatchNorm +from paddle.nn.norm import BatchNorm class ConvBNLayer(fluid.dygraph.Layer): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index ab79a05796de4..9de875c19f3ef 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -37,7 +37,9 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator -from paddle.fluid.dygraph.nn import Conv2DTranspose, BatchNorm +from paddle.fluid.dygraph.nn import Conv2DTranspose +from paddle.nn.layer.norm import BatchNorm + # Note: Set True to eliminate randomness. # 1. 
For one operation, cuDNN has several algorithms, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 594a3fa71f894..91a0ec083d1d2 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -23,7 +23,8 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import ProgramTranslator -from paddle.fluid.dygraph.nn import BatchNorm, Linear, Pool2D +from paddle.fluid.dygraph.nn import Linear, Pool2D +from paddle.nn.layer.norm import BatchNorm from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index ea87ba5ba68fd..6bfed0f1798d3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -23,7 +23,8 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Linear, Pool2D +from paddle.fluid.dygraph.nn import Linear, Pool2D +from paddle.nn.layer.norm import BatchNorm from paddle.fluid.dygraph import declarative from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index cc307e5a7bb16..4f8b5c4e872cc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -21,7 +21,8 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import declarative, ProgramTranslator, to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Linear, Pool2D +from paddle.fluid.dygraph.nn import Linear, Pool2D +from paddle.nn.layer.norm import BatchNorm from tsm_config_utils import merge_configs, parse_config, print_configs random.seed(0) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py index 53b78e18f8861..25c83440a030d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -753,7 +753,7 @@ def test_errors(self): class TestDygraphBatchNormAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): - batch_norm = fluid.dygraph.BatchNorm(10) + batch_norm = nn.layer.norm.BatchNorm(10) # the input of BatchNorm must be Variable. 
x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() @@ -776,7 +776,7 @@ def test_dygraph(self): def compute(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -799,7 +799,7 @@ def test_static(self): def compute(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -824,7 +824,7 @@ def test_reservespace(self): x = fluid.data(name='x', shape=x.shape, dtype=x.dtype) # Set this FLAG, the BatchNorm API will pass "reserve_space" argument into batch_norm op. os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' - batch_norm = fluid.dygraph.BatchNorm(7, data_layout="NHWC") + batch_norm = nn.layer.norm.BatchNorm(7, data_layout="NHWC") hidden1 = batch_norm(x) os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '0' diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py index 72e7ac89caf36..c3e9a042233bd 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -95,7 +95,7 @@ def test_dygraph(self): def compute_v1(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -111,7 +111,7 @@ def compute_v2(x): def compute_v3(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm( shape[1], is_test=is_test, param_attr=fluid.ParamAttr( @@ -153,7 +153,7 @@ def test_static(self): def compute_v1(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -260,7 +260,7 @@ def test_global_stats(self): for p in self.places: with fluid.dygraph.guard(p): x = paddle.randn([2, 6, 6, 4]) - net1 = paddle.fluid.dygraph.BatchNorm( + net1 = paddle.nn.layer.norm( 6, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(1.0) diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index e39506eed7a9b..2eeacffe27058 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -562,7 +562,7 @@ def test_dygraph(self): def compute(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -583,7 +583,7 @@ def test_static(self): def compute(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 614e058c6dead..0aa8f4cfa3c76 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ 
b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -16,6 +16,7 @@ import numpy as np import paddle.fluid.core as core import paddle.fluid as fluid +import paddle.nn as nn from paddle.fluid.framework import _test_eager_guard import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -81,7 +82,7 @@ def error3d(): def test_large_batch(self): def compute_baseline(x): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm(shape[1]) + bn = nn.layer.norm.BatchNorm(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) @@ -127,7 +128,7 @@ def test_eager_api(self): def compute_v1(x): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm(shape[1]) + bn = nn.layer.norm.BatchNorm() # bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False @@ -161,7 +162,7 @@ def test_dygraph(self): def compute_v1(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -182,7 +183,7 @@ def compute_v2(x): def compute_v3(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, param_attr=fluid.ParamAttr( @@ -224,7 +225,7 @@ def test_static(self): def compute_v1(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -378,7 +379,7 @@ def test_global_stats(self): for p in self.places: with fluid.dygraph.guard(p): x = paddle.randn([2, 6, 6, 4]) - net1 = paddle.fluid.dygraph.BatchNorm( + net1 = nn.layer.norm.BatchNorm( 6, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(1.0) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index c0f6badfe2c97..1277b9f89d364 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -17,7 +17,6 @@ import paddle.fluid as fluid import paddle.fluid.framework as framework from paddle.fluid.dygraph.nn import ( - BatchNorm, Conv3D, Embedding, GroupNorm, @@ -26,6 +25,7 @@ NCE, PRelu, ) +from paddle.nn.layer.norm import BatchNorm import numpy as np import os import tempfile diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 08a32aeaa9971..9a63c5ce700e6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -20,10 +20,10 @@ from paddle.fluid.dygraph.nn import ( Pool2D, Linear, - BatchNorm, Embedding, GRUUnit, ) +from paddle.nn.layer.norm import BatchNorm from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 328245ab9c935..12abd58d8eda6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -19,7 +19,8 @@ import 
paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid import Pool2D, BatchNorm, Linear +from paddle.fluid import Pool2D, Linear +from paddle.nn.layer.norm import BatchNorm from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope from utils import DyGraphProgramDescTracerTestHelper, is_equal_program diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index a0518f7ba7b43..9ec2dc431f58f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -19,7 +19,8 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.dygraph.nn import Pool2D, BatchNorm, Linear +from paddle.fluid.dygraph.nn import Pool2D, Linear +from paddle.nn.layer.norm import BatchNorm from test_imperative_base import new_program_scope from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index bf90dfd870584..bdcf2f5927330 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -20,6 +20,7 @@ import paddle.fluid.core as core import paddle import paddle.fluid as fluid +import paddle.nn as nn import paddle.nn.functional as F from xpu.get_test_cover_info import ( create_test_class, @@ -364,7 +365,7 @@ def test_global_stats(self): for p in self.places: with fluid.dygraph.guard(p): x = paddle.randn([2, 6, 6, 4]) - net1 = paddle.fluid.dygraph.BatchNorm( + net1 = nn.layer.norm.BatchNorm( 6, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(1.0) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 01fd204cab9a4..769c9e13900f0 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -27,17 +27,17 @@ # TODO: define normalization api -from ...fluid.dygraph import BatchNorm # noqa: F401 + from ...fluid.dygraph import SpectralNorm # noqa: F401 from ...framework import get_default_dtype - +from ...fluid import dygraph_utils from ..initializer import Constant from ...framework import ParamAttr from ...fluid.data_feeder import check_variable_and_dtype from ..functional import batch_norm, layer_norm, instance_norm - +from ..framework import _non_static_mode, _global_flags import numpy as np import numbers import warnings @@ -730,6 +730,310 @@ def extra_repr(self): return main_str +class BatchNorm(Layer): + r""" + This interface is used to construct a callable object of the ``BatchNorm`` class. + For more details, refer to code examples. + It implements the function of the Batch Normalization Layer and can be used + as a normalizer function for conv2d and fully connected operations. + The data is normalized by the mean and variance of the channel based on the current batch data. + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. 
math:: + + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & + //\ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & + //\ mini-batch\ variance \\ + + - :math:`x` : mini-batch data + - :math:`m` : the size of the mini-batch data + + When use_global_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + + The normalization function formula is as follows: + + .. math:: + + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + + + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter + + Parameters: + num_channels(int): Indicate the number of channels of the input ``Tensor``. + act(str, optional): Activation to be applied to the output of batch normalization. Default: None. + is_test (bool, optional): A flag indicating whether it is in test phrase or not. + This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``. + Default: False. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + dtype(str, optional): Indicate the data type of the input ``Tensor``, + which can be float32 or float64. Default: float32. + data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. + moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. + moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. + do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model + average when model average is enabled. Default: True. + use_global_stats(bool, optional): Whether to use global mean and + variance. In inference or test mode, set use_global_stats to true + or is_test to true, and the behavior is equivalent. + In train mode, when setting use_global_stats True, the global mean + and variance are also used during train period. Default: False. 
+ trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when + setting trainable_statistics True, mean and variance will be calculated by current batch statistics. + Default: False. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.dygraph.base import to_variable + import numpy as np + + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + with fluid.dygraph.guard(): + x = to_variable(x) + batch_norm = fluid.BatchNorm(10) + hidden1 = batch_norm(x) + """ + + def __init__( + self, + num_channels, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + dtype='float32', + data_layout='NCHW', + in_place=False, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, + trainable_statistics=False, + ): + super().__init__() + self._param_attr = param_attr + self._bias_attr = bias_attr + self._act = act + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] + + assert ( + bias_attr is not False + ), "bias_attr should not be False in batch_norm." + + if dtype == "float16": + self._dtype = "float32" + else: + self._dtype = dtype + + param_shape = [num_channels] + + # create parameter + self.weight = self.create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0), + ) + self.weight.stop_gradient = ( + use_global_stats and self._param_attr.learning_rate == 0.0 + ) + + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True, + ) + self.bias.stop_gradient = ( + use_global_stats and self._param_attr.learning_rate == 0.0 + ) + + self._mean = self.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=self._dtype, + ) + self._mean.stop_gradient = True + + self._variance = self.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=self._dtype, + ) + self._variance.stop_gradient = True + + self._in_place = in_place + self._data_layout = data_layout + self._momentum = momentum + self._epsilon = epsilon + self._is_test = is_test + self._fuse_with_relu = False + self._use_global_stats = use_global_stats + self._trainable_statistics = trainable_statistics + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + if _non_static_mode(): + if in_dygraph_mode(): + batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( + input, + self._mean, + self._variance, + self.weight, + self.bias, + not self.training, + self._momentum, + self._epsilon, + self._data_layout, + self._use_global_stats, + self._trainable_statistics, + ) + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn + ) + + elif _in_legacy_dygraph(): + attrs = ( + "momentum", + self._momentum, + "epsilon", + self._epsilon, + "is_test", + not self.training, + "data_layout", + self._data_layout, + "use_mkldnn", + self._use_mkldnn, + "fuse_with_relu", + self._fuse_with_relu, + "use_global_stats", + self._use_global_stats, + 
'trainable_statistics', + self._trainable_statistics, + ) + batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( + input, + self.weight, + self.bias, + self._mean, + self._variance, + None, + mean_out, + variance_out, + *attrs + ) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn + ) + + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' + ) + + attrs = { + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": self._is_test, + "data_layout": self._data_layout, + "use_mkldnn": False, + "fuse_with_relu": self._fuse_with_relu, + "use_global_stats": self._use_global_stats, + "trainable_statistics": self._trainable_statistics, + } + + inputs = { + "X": [input], + "Scale": [self.weight], + "Bias": [self.bias], + "Mean": [self._mean], + "Variance": [self._variance], + } + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True + ) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True + ) + reserve_space = self._helper.create_variable_for_type_inference( + dtype=self._helper.input_dtype(input), stop_gradient=True + ) + + batch_norm_out = ( + input + if self._in_place + else self._helper.create_variable_for_type_inference(self._dtype) + ) + + outputs = { + "Y": [batch_norm_out], + "MeanOut": [mean_out], + "VarianceOut": [variance_out], + "SavedMean": [saved_mean], + "SavedVariance": [saved_variance], + } + if reserve_space is not None: + outputs["ReserveSpace"] = [reserve_space] + + self._helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) + + # Currently, we don't support inplace in dygraph mode + return self._helper.append_activation(batch_norm_out, self._act) + + class BatchNorm1D(_BatchNormBase): r""" Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . 
From fc5a81eabfc1751de147fc9ac1b645b56e79c5de Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Mon, 5 Dec 2022 11:21:31 +0000 Subject: [PATCH 02/15] modfiy conflict --- .../fluid/tests/unittests/dygraph_to_static/test_resnet.py | 1 - .../tests/unittests/test_imperative_ocr_attention_model.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index b60327c318535..af44406a99afe 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -24,7 +24,6 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.fluid.dygraph.nn import BatchNorm from paddle.jit import ProgramTranslator from paddle.nn.layer.norm import BatchNorm diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 7f17cbf6ef155..29a99d4130124 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GRUUnit, Linear +from paddle.fluid.dygraph.nn import Embedding, GRUUnit, Linear from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear from paddle.nn.layer.norm import BatchNorm From 91f1a4d9cda8eea639c2a7ac02a7232b5b0cec7e Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Mon, 5 Dec 2022 12:46:16 +0000 Subject: [PATCH 03/15] modify pre-commit error --- .../tests/unittests/test_imperative_ocr_attention_model.py | 2 +- .../paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 29a99d4130124..aeac8483ac283 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding, GRUUnit, Linear +from paddle.fluid.dygraph.nn import Embedding, GRUUnit from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear from paddle.nn.layer.norm import BatchNorm diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index a104ee7cd75a4..0da8a62031b6a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -14,8 +14,6 @@ import sys -import paddle - sys.path.append("..") import unittest From f383b37cc4f4adf92a1313bce5143c517e286b2d Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Mon, 5 Dec 2022 14:20:13 +0000 Subject: [PATCH 04/15] modify static-check ci error --- python/paddle/nn/layer/norm.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5e488d1863d1d..5dc2edd40d61e 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -846,7 +846,7 @@ class BatchNorm(Layer): x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) - batch_norm = fluid.BatchNorm(10) + batch_norm = nn.layer.norm.BatchNorm(10) hidden1 = batch_norm(x) """ From c0f88d3aceaa0b520a67bfca9a45c9419ae89ebb Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 01:54:08 +0000 Subject: [PATCH 05/15] fix failed tests --- .../fluid/contrib/slim/tests/test_imperative_skip_op.py | 2 +- .../fluid/tests/unittests/dygraph_to_static/darknet.py | 2 +- .../tests/unittests/dygraph_to_static/test_mobile_net.py | 2 +- .../tests/unittests/dygraph_to_static/test_se_resnet.py | 2 +- .../paddle/fluid/tests/unittests/test_batch_norm_op.py | 9 +++++---- .../fluid/tests/unittests/test_batch_norm_op_v2.py | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 3b19ee76c3554..3d6bc7fa51ce8 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -26,7 +26,7 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 from paddle.nn import Linear, Conv2D, Softmax, BatchNorm -from paddle.nn.norm import BatchNorm +from paddle.nn.layer.norm import BatchNorm from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index bae882f4ebc84..d6003d9103dc8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -64,7 +64,7 @@ def forward(self, inputs): out = self.conv(inputs) out = self.batch_norm(out) if self.act == 'leaky': - out = fluid.layers.leaky_relu(x=out, alpha=0.1) + out = paddle.nn.functional.leaky_relu(x=out, alpha=0.1) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index ff36720594c09..f23d0439ac68f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -18,12 +18,12 @@ import unittest import numpy as np +from nn.layer.norm import BatchNorm from predictor_utils import PredictorTools import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.fluid.dygraph.nn import BatchNorm from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.jit import ProgramTranslator diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 70ee21713c7ed..ad62658fc861c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -26,10 +26,10 @@ import 
paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.fluid.dygraph.nn import BatchNorm from paddle.jit import ProgramTranslator from paddle.jit.api import declarative from paddle.nn import Linear +from paddle.nn.layer.norm import BatchNorm SEED = 2020 np.random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 34b358130219d..ed66c6e7a5282 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.nn as nn from paddle.fluid import Program, program_guard from paddle.fluid.framework import grad_var_name from paddle.fluid.op import Operator @@ -770,7 +771,7 @@ def test_errors(self): class TestDygraphBatchNormAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): - batch_norm = fluid.dygraph.BatchNorm(10) + batch_norm = nn.layer.norm.BatchNorm(10) # the input of BatchNorm must be Variable. x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() @@ -793,7 +794,7 @@ def test_dygraph(self): def compute(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -816,7 +817,7 @@ def test_static(self): def compute(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = fluid.dygraph.BatchNorm( + bn = nn.layer.norm.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -841,7 +842,7 @@ def test_reservespace(self): x = fluid.data(name='x', shape=x.shape, dtype=x.dtype) # Set this FLAG, the BatchNorm API will pass "reserve_space" argument into batch_norm op. 
os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' - batch_norm = fluid.dygraph.BatchNorm(7, data_layout="NHWC") + batch_norm = nn.layer.norm.BatchNorm(7, data_layout="NHWC") hidden1 = batch_norm(x) os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '0' diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 3ca48ddaa5543..d27953d24e83c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -129,7 +129,7 @@ def test_eager_api(self): def compute_v1(x): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm() + bn = nn.layer.norm.BatchNorm(shape[1]) # bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False From 2141dac62d81cbfec566aa4810f576d2d617a9e7 Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 03:18:54 +0000 Subject: [PATCH 06/15] modify conflict --- .../fluid/contrib/slim/tests/test_imperative_skip_op.py | 3 +-- .../fluid/tests/unittests/test_imperative_se_resnext.py | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 3d6bc7fa51ce8..91311b8c69534 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -25,9 +25,8 @@ from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm +from paddle.nn import Linear, Conv2D, Softmax from paddle.nn.layer.norm import BatchNorm -from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger from imperative_test_utils import ( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 0c35855305cea..2e8205d3f86ed 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -104,8 +104,8 @@ def __init__(self, num_channels, reduction_ratio): super().__init__() self._num_channels = num_channels - self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True) - self._squeeze = Linear( + self._pool = paddle.nn.AdaptiveAvgPool2D(1) + self._squeeze = paddle.nn.Linear( num_channels, num_channels // reduction_ratio, param_attr=fluid.ParamAttr( @@ -288,9 +288,7 @@ def __init__(self, layers=50, class_dim=102): self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) import math stdv = 1.0 / math.sqrt(2048 * 1.0) From ffb4a446706d0af78fe6bdfb880965b1533807ee Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 03:22:29 +0000 Subject: [PATCH 07/15] modify conflict --- .../paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py | 2 +- .../paddle/fluid/tests/unittests/test_imperative_se_resnext.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py 
b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 91311b8c69534..e1aec4eeca5ec 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -25,8 +25,8 @@ from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 -from paddle.nn import Linear, Conv2D, Softmax from paddle.nn.layer.norm import BatchNorm +from paddle.nn import Linear, Conv2D, Softmax from paddle.fluid.log_helper import get_logger from imperative_test_utils import ( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 2e8205d3f86ed..65a8fc4f4ed80 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -287,7 +287,6 @@ def __init__(self, layers=50, class_dim=102): num_channels = bottleneck_block._num_channels_out self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) import math From 04824b53b72f0e844051409b768760ee91a1652e Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 03:52:54 +0000 Subject: [PATCH 08/15] delete import modelu GRUUnit --- .../tests/unittests/test_imperative_ocr_attention_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 865c0aa0d7b63..7f40aba3d6488 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding, GRUUnit +from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear from paddle.nn.layer.norm import BatchNorm From 7064e9f89df00f25a62bee581b26077c93fdd8cf Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 09:26:30 +0000 Subject: [PATCH 09/15] fix falied test --- .../paddle/fluid/tests/unittests/dygraph_to_static/darknet.py | 2 +- .../fluid/tests/unittests/dygraph_to_static/test_mobile_net.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index d6003d9103dc8..bf312818f2541 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -64,7 +64,7 @@ def forward(self, inputs): out = self.conv(inputs) out = self.batch_norm(out) if self.act == 'leaky': - out = paddle.nn.functional.leaky_relu(x=out, alpha=0.1) + out = paddle.nn.functional.leaky_relu(out, 0.1) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index c0df924fc43b1..75092c9e102d9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -18,7 +18,6 @@ import unittest import numpy as np -from nn.layer.norm import BatchNorm from predictor_utils import PredictorTools import paddle @@ -29,6 +28,7 @@ from paddle.jit import ProgramTranslator from paddle.jit.api import declarative from paddle.nn import Linear +from paddle.nn.layer.norm import BatchNorm # Note: Set True to eliminate randomness. # 1. For one operation, cuDNN has several algorithms, From 61d48ff143b15c902e1f34c8dcc11a2a31a7cf5d Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 10:55:55 +0000 Subject: [PATCH 10/15] fix failed testes --- python/paddle/nn/layer/norm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5dc2edd40d61e..527bf58f5a42e 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -840,9 +840,11 @@ class BatchNorm(Layer): .. code-block:: python import paddle.fluid as fluid + import paddle.nn as nn from paddle.fluid.dygraph.base import to_variable import numpy as np + x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) From e52ee55b45bd76b02daacb81f5a0913b2f442a64 Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 13:47:22 +0000 Subject: [PATCH 11/15] fix failed tests --- .../dygraph_to_static/test_mobile_net.py | 3 +- .../unittests/test_imperative_se_resnext.py | 50 +++++++++---------- 2 files changed, 24 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 75092c9e102d9..9f28eaa9fb9b6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -28,7 +28,6 @@ from paddle.jit import ProgramTranslator from paddle.jit.api import declarative from paddle.nn import Linear -from paddle.nn.layer.norm import BatchNorm # Note: Set True to eliminate randomness. # 1. 
For one operation, cuDNN has several algorithms, @@ -69,7 +68,7 @@ def __init__( bias_attr=False, ) - self._batch_norm = BatchNorm( + self._batch_norm = paddle.nn.layer.nomr.BatchNorm( num_filters, act=act, param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 65a8fc4f4ed80..353b02fe1b67d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -19,7 +19,7 @@ import paddle import paddle.fluid as fluid -from paddle.fluid import Linear, Pool2D, core +from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard from paddle.fluid.layer_helper import LayerHelper from paddle.nn.layer.norm import BatchNorm @@ -108,25 +108,27 @@ def __init__(self, num_channels, reduction_ratio): self._squeeze = paddle.nn.Linear( num_channels, num_channels // reduction_ratio, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.05) ), - act='relu', ) - self._excitation = Linear( + self.act_1 = paddle.nn.ReLU() + self._excitation = paddle.nn.Linear( num_channels // reduction_ratio, num_channels, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.05) ), - act='sigmoid', ) + self.act_2 = paddle.nn.Softmax() def forward(self, input): y = self._pool(input) - y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) + y = paddle.reshape(y, shape=[-1, self._num_channels]) y = self._squeeze(y) + y = self.act_1(y) y = self._excitation(y) + y = self.act_2(y) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) return y @@ -187,7 +189,7 @@ def forward(self, inputs): else: short = self.short(inputs) - y = fluid.layers.elementwise_add(x=short, y=scale) + y = paddle.add(x=short, y=scale) layer_helper = LayerHelper(self.full_name(), act='relu') y = layer_helper.append_activation(y) @@ -218,9 +220,7 @@ def __init__(self, layers=50, class_dim=102): stride=2, act='relu', ) - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max' - ) + self.pool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) elif layers == 101: cardinality = 32 reduction_ratio = 16 @@ -233,9 +233,7 @@ def __init__(self, layers=50, class_dim=102): stride=2, act='relu', ) - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max' - ) + self.pool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) elif layers == 152: cardinality = 64 reduction_ratio = 16 @@ -262,9 +260,7 @@ def __init__(self, layers=50, class_dim=102): stride=1, act='relu', ) - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max' - ) + self.pool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) self.bottleneck_block_list = [] num_channels = 64 @@ -294,14 +290,14 @@ def __init__(self, layers=50, class_dim=102): self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1 - self.out = Linear( + self.out = paddle.nn.Linear( self.pool2d_avg_output, class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) + self.out_act = paddle.nn.Softmax() def 
forward(self, inputs): if self.layers == 50 or self.layers == 101: @@ -316,9 +312,9 @@ def forward(self, inputs): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) - y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) - return y + return self.out_act(y) class TestImperativeResneXt(unittest.TestCase): @@ -375,7 +371,7 @@ def run_dygraph(): label.stop_gradient = True out = se_resnext(img) - softmax_out = fluid.layers.softmax(out, use_cudnn=False) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy( input=softmax_out, label=label ) @@ -455,7 +451,7 @@ def run_dygraph(): ) label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = se_resnext(img) - softmax_out = fluid.layers.softmax(out, use_cudnn=False) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) From 57bfe4ccb9136ad405a30c9502a79d81e7c6bc61 Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Tue, 6 Dec 2022 14:20:39 +0000 Subject: [PATCH 12/15] fix failed tests --- .../fleet/test_imperative_auto_mixed_precision.py | 2 +- .../fleet/test_imperative_auto_mixed_precision_for_eager.py | 2 +- .../fluid/tests/unittests/dygraph_to_static/darknet.py | 2 +- .../tests/unittests/dygraph_to_static/test_mobile_net.py | 3 ++- .../fluid/tests/unittests/dygraph_to_static/test_op_attr.py | 2 +- .../tests/unittests/dygraph_to_static/test_resnet_v2.py | 2 +- .../fluid/tests/unittests/ir/test_fuse_resnet_unit.py | 2 +- python/paddle/fluid/tests/unittests/test_gradient_clip.py | 2 +- .../fluid/tests/unittests/test_imperative_se_resnext.py | 3 --- .../fluid/tests/unittests/test_inplace_addto_strategy.py | 2 +- python/paddle/fluid/tests/unittests/test_layout_autotune.py | 2 +- .../unittests/xpu/test_fused_resnet_basic_block_op_xpu.py | 6 +++--- python/paddle/vision/models/densenet.py | 2 +- 13 files changed, 15 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py index 3141ed81a458b..1a59db5d504dd 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py @@ -708,7 +708,7 @@ def test_skip_BatchNorm_Layer_norm(self): for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) - model = paddle.nn.BatchNorm(1) + model = paddle.nn.layer.norm.BatchNorm(1) model = paddle.amp.decorate(models=model, level='O2') for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index dbcdf3ac46c9e..8df63a16969d7 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -707,7 +707,7 @@ def test_skip_BatchNorm_Layer_norm(self): for param in model.parameters(): self.assertEqual((param.dtype ==
paddle.float32), True) - model = paddle.nn.BatchNorm(1) + model = paddle.nn.layer.norm.BatchNorm(1) model = paddle.amp.decorate(models=model, level='O2') for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index bf312818f2541..2f776911bdce0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -114,7 +114,7 @@ def __init__(self, ch_in, ch_out, is_test=True): def forward(self, inputs): conv1 = self.conv1(inputs) conv2 = self.conv2(conv1) - out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None) + out = paddle.add(x=inputs, y=conv2) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 9f28eaa9fb9b6..75092c9e102d9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -28,6 +28,7 @@ from paddle.jit import ProgramTranslator from paddle.jit.api import declarative from paddle.nn import Linear +from paddle.nn.layer.norm import BatchNorm # Note: Set True to eliminate randomness. # 1. For one operation, cuDNN has several algorithms, @@ -68,7 +69,7 @@ def __init__( bias_attr=False, ) - self._batch_norm = paddle.nn.layer.nomr.BatchNorm( + self._batch_norm = BatchNorm( num_filters, act=act, param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py index d474d80b63e60..d786ec0ed2d4d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py @@ -31,7 +31,7 @@ def __init__(self, in_num, out_num): super().__init__() self.linear = paddle.nn.Linear(in_num, out_num) - self.bn = paddle.nn.BatchNorm(out_num) + self.bn = paddle.nn.layer.norm.BatchNorm(out_num) self.sub = MySub() def forward(self, x): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index ee01b71e29c78..3ddcd0738007b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -76,7 +76,7 @@ def __init__( bias_attr=False, ) - self._batch_norm = paddle.nn.BatchNorm(num_filters, act=act) + self._batch_norm = paddle.nn.layer.norm.BatchNorm(num_filters, act=act) def forward(self, inputs): y = self._conv(inputs) diff --git a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py index bcadccf5fd671..6a36d78e95e54 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py +++ b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py @@ -43,7 +43,7 @@ def test_fuse_resenet_unit(self): conv2d = paddle.nn.Conv2D( 8, 32, 1, bias_attr=False, data_format='NHWC' ) - batch_norm = paddle.nn.BatchNorm( + batch_norm = paddle.nn.layer.norm.BatchNorm( 32, act='relu', data_layout='NHWC' ) out = batch_norm(conv2d(x)) diff --git 
a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 4aa064921fe5c..c8684cfcc919d 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -531,7 +531,7 @@ class SimpleNet(paddle.nn.Layer): def __init__(self): super().__init__() self.linear = paddle.nn.Linear(5, 5) - self.batch_norm = paddle.nn.BatchNorm(5) + self.batch_norm = paddle.nn.layer.BatchNorm(5) def forward(self, x): x = self.linear(x) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 353b02fe1b67d..e84e1134f036c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -24,9 +24,6 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.nn.layer.norm import BatchNorm -# NOTE(zhiqiu): run with FLAGS_cudnn_deterministic=1 - - batch_size = 8 train_parameters = { "input_size": [3, 224, 224], diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py index 9f448e7f07a47..fb57ff7fb81d7 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -43,7 +43,7 @@ def __init__( data_format=data_format, ) - self._batch_norm = paddle.nn.BatchNorm( + self._batch_norm = paddle.nn.layer.norm.BatchNorm( num_filters, data_layout=data_format ) diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index 70c283a549a09..3512bbca63dcb 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -26,7 +26,7 @@ class SimpleNet(paddle.nn.Layer): def __init__(self, data_format="NCHW", class_num=2): super().__init__() self.conv = paddle.nn.Conv2D(3, 8, (3, 3)) - self.bn = paddle.nn.BatchNorm(num_channels=8) + self.bn = paddle.nn.layer.norm.BatchNorm(num_channels=8) self.relu = paddle.nn.ReLU() self.pool = paddle.nn.AvgPool2D(kernel_size=2, stride=2) self.flatten = paddle.nn.Flatten() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py index 68bf21abd1a06..d3657d7c54c6b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -113,7 +113,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn1 = nn.BatchNorm( + self.bn1 = nn.layer.nomr.BatchNorm( self.out_channels, act='relu', param_attr=bn1_weight, @@ -130,7 +130,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn2 = nn.BatchNorm( + self.bn2 = nn.layer.norm.BatchNorm( self.out_channels, act=None, param_attr=bn2_weight, @@ -147,7 +147,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn3 = nn.BatchNorm( + self.bn3 = nn.layer.norm.BatchNorm( self.out_channels, act=None, param_attr=bn3_weight, diff --git a/python/paddle/vision/models/densenet.py b/python/paddle/vision/models/densenet.py index f620e1d70956b..20747e828c838 100644 --- a/python/paddle/vision/models/densenet.py +++ b/python/paddle/vision/models/densenet.py @@ 
-20,13 +20,13 @@ from paddle.nn import ( AdaptiveAvgPool2D, AvgPool2D, - BatchNorm, Conv2D, Dropout, Linear, MaxPool2D, ) from paddle.nn.initializer import Uniform +from paddle.nn.layer.norm import BatchNorm from paddle.utils.download import get_weights_path_from_url __all__ = [] From b358a3c8e3c59df573eb009e23bcb504d5ddea17 Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Wed, 7 Dec 2022 01:38:23 +0000 Subject: [PATCH 13/15] fix failed test --- python/paddle/fluid/tests/unittests/test_gradient_clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index c8684cfcc919d..ff3f2bfff0afc 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -531,7 +531,7 @@ class SimpleNet(paddle.nn.Layer): def __init__(self): super().__init__() self.linear = paddle.nn.Linear(5, 5) - self.batch_norm = paddle.nn.layer.BatchNorm(5) + self.batch_norm = paddle.nn.layer.norm.BatchNorm(5) def forward(self, x): x = self.linear(x) From 8e06fa48f3d0ea344b33b04be84355bd16695b5a Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Wed, 7 Dec 2022 05:15:21 +0000 Subject: [PATCH 14/15] fix error in test_fused_resnet_basic_block_op_xpu.py --- .../tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py index d3657d7c54c6b..1b9434da600ab 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -113,7 +113,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn1 = nn.layer.nomr.BatchNorm( + self.bn1 = nn.layer.norm.BatchNorm( self.out_channels, act='relu', param_attr=bn1_weight, From 9fb71b4e3ae2582fd9494d69eaa1e2db7a939bb8 Mon Sep 17 00:00:00 2001 From: risemeup1 <515586620@qq.com> Date: Fri, 9 Dec 2022 07:48:59 +0000 Subject: [PATCH 15/15] modify after xiaoguang's review --- .../contrib/slim/tests/test_imperative_skip_op.py | 3 +-- .../fleet/test_imperative_auto_mixed_precision.py | 2 +- ...est_imperative_auto_mixed_precision_for_eager.py | 2 +- .../tests/unittests/dygraph_to_static/darknet.py | 2 +- .../unittests/dygraph_to_static/test_cycle_gan.py | 2 +- .../unittests/dygraph_to_static/test_mobile_net.py | 3 +-- .../unittests/dygraph_to_static/test_op_attr.py | 2 +- .../unittests/dygraph_to_static/test_resnet.py | 2 +- .../unittests/dygraph_to_static/test_resnet_v2.py | 2 +- .../unittests/dygraph_to_static/test_se_resnet.py | 3 +-- .../tests/unittests/dygraph_to_static/test_tsm.py | 3 +-- .../tests/unittests/ir/test_fuse_resnet_unit.py | 2 +- .../tests/unittests/mlu/test_batch_norm_op_mlu.py | 9 ++++----- .../unittests/mlu/test_batch_norm_op_mlu_v2.py | 9 ++++----- .../tests/unittests/npu/test_batch_norm_op_npu.py | 4 ++-- .../fluid/tests/unittests/test_batch_norm_op.py | 9 ++++----- .../fluid/tests/unittests/test_batch_norm_op_v2.py | 13 ++++++------- .../fluid/tests/unittests/test_gradient_clip.py | 2 +- .../unittests/test_imperative_load_static_param.py | 3 +-- .../test_imperative_ocr_attention_model.py | 3 +-- .../fluid/tests/unittests/test_imperative_resnet.py | 2 +-
.../tests/unittests/test_imperative_se_resnext.py | 2 +- .../tests/unittests/test_inplace_addto_strategy.py | 2 +- .../fluid/tests/unittests/test_layout_autotune.py | 2 +- .../tests/unittests/xpu/test_batch_norm_op_xpu.py | 3 +-- .../xpu/test_fused_resnet_basic_block_op_xpu.py | 6 +++--- python/paddle/nn/layer/norm.py | 1 + python/paddle/vision/models/densenet.py | 2 +- 28 files changed, 45 insertions(+), 55 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index e1aec4eeca5ec..9b8ed24af2e55 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -25,8 +25,7 @@ from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 -from paddle.nn.layer.norm import BatchNorm -from paddle.nn import Linear, Conv2D, Softmax +from paddle.nn import Linear, Conv2D, Softmax, BatchNorm from paddle.fluid.log_helper import get_logger from imperative_test_utils import ( diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py index 1a59db5d504dd..3141ed81a458b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py @@ -708,7 +708,7 @@ def test_skip_BatchNorm_Layer_norm(self): for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) - model = paddle.nn.layer.norm.BatchNorm(1) + model = paddle.nn.BatchNorm(1) model = paddle.amp.decorate(models=model, level='O2') for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index 8df63a16969d7..dbcdf3ac46c9e 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -707,7 +707,7 @@ def test_skip_BatchNorm_Layer_norm(self): for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) - model = paddle.nn.layer.norm.BatchNorm(1) + model = paddle.nn.BatchNorm(1) model = paddle.amp.decorate(models=model, level='O2') for param in model.parameters(): self.assertEqual((param.dtype == paddle.float32), True) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index 2f776911bdce0..783dfff262e8f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr from paddle.fluid.regularizer import L2Decay -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm class ConvBNLayer(fluid.dygraph.Layer): diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index a26c65c622315..09117e3054078 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -40,7 +40,7 @@ from paddle.fluid.dygraph import to_variable from paddle.jit import ProgramTranslator from paddle.jit.api import declarative -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm # Note: Set True to eliminate randomness. # 1. For one operation, cuDNN has several algorithms, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 5e9b6c2e8f1c2..d5a4ae996d68b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -27,8 +27,7 @@ from paddle.fluid.param_attr import ParamAttr from paddle.jit import ProgramTranslator from paddle.jit.api import declarative -from paddle.nn import Linear -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm, Linear # Note: Set True to eliminate randomness. # 1. For one operation, cuDNN has several algorithms, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py index d786ec0ed2d4d..d474d80b63e60 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py @@ -31,7 +31,7 @@ def __init__(self, in_num, out_num): super().__init__() self.linear = paddle.nn.Linear(in_num, out_num) - self.bn = paddle.nn.layer.norm.BatchNorm(out_num) + self.bn = paddle.nn.BatchNorm(out_num) self.sub = MySub() def forward(self, x): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 529548f1aee13..1ad5facd5d1ac 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -25,7 +25,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.jit import ProgramTranslator -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm SEED = 2020 IMAGENET1000 = 1281167 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 3ddcd0738007b..ee01b71e29c78 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -76,7 +76,7 @@ def __init__( bias_attr=False, ) - self._batch_norm = paddle.nn.layer.norm.BatchNorm(num_filters, act=act) + self._batch_norm = paddle.nn.BatchNorm(num_filters, act=act) def forward(self, inputs): y = self._conv(inputs) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index a68fe536267b6..b260bbc359ca9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -28,8 +28,7 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.jit import ProgramTranslator from paddle.jit.api import declarative -from paddle.nn import Linear -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm, Linear SEED = 2020 np.random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index e0d6cdcf260d6..4d8ab7612aa4c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -26,8 +26,7 @@ from paddle.fluid.dygraph import to_variable from paddle.jit import ProgramTranslator from paddle.jit.api import declarative -from paddle.nn import Linear -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm, Linear random.seed(0) np.random.seed(0) diff --git a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py index 6a36d78e95e54..bcadccf5fd671 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py +++ b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py @@ -43,7 +43,7 @@ def test_fuse_resenet_unit(self): conv2d = paddle.nn.Conv2D( 8, 32, 1, bias_attr=False, data_format='NHWC' ) - batch_norm = paddle.nn.layer.norm.BatchNorm( + batch_norm = paddle.nn.BatchNorm( 32, act='relu', data_layout='NHWC' ) out = batch_norm(conv2d(x)) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py index 3de7dd924d279..66876ddb79294 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid.core as core from paddle.fluid.op import Operator -import paddle.fluid as fluid import sys sys.path.append('..') @@ -753,7 +752,7 @@ def test_errors(self): class TestDygraphBatchNormAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): - batch_norm = nn.layer.norm.BatchNorm(10) + batch_norm = paddle.nn.BatchNorm(10) # the input of BatchNorm must be Variable. x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() @@ -776,7 +775,7 @@ def test_dygraph(self): def compute(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -799,7 +798,7 @@ def test_static(self): def compute(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -824,7 +823,7 @@ def test_reservespace(self): x = fluid.data(name='x', shape=x.shape, dtype=x.dtype) # Set this FLAG, the BatchNorm API will pass "reserve_space" argument into batch_norm op. 
os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' - batch_norm = nn.layer.norm.BatchNorm(7, data_layout="NHWC") + batch_norm = paddle.nn.BatchNorm(7, data_layout="NHWC") hidden1 = batch_norm(x) os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '0' diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py index c3e9a042233bd..17672d668d38a 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -17,7 +17,6 @@ import numpy as np import paddle.fluid.core as core from paddle.fluid.op import Operator -import paddle.fluid as fluid import sys sys.path.append("..") @@ -95,7 +94,7 @@ def test_dygraph(self): def compute_v1(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -111,7 +110,7 @@ def compute_v2(x): def compute_v3(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, param_attr=fluid.ParamAttr( @@ -153,7 +152,7 @@ def test_static(self): def compute_v1(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = nn.layer.norm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -260,7 +259,7 @@ def test_global_stats(self): for p in self.places: with fluid.dygraph.guard(p): x = paddle.randn([2, 6, 6, 4]) - net1 = paddle.nn.layer.norm( + net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(1.0) diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 2eeacffe27058..353fd250a5e1b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -562,7 +562,7 @@ def test_dygraph(self): def compute(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -583,7 +583,7 @@ def test_static(self): def compute(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 2e92d3d27f5eb..6802a8a9ea995 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -21,7 +21,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.nn as nn from paddle.fluid import Program, program_guard from paddle.fluid.framework import grad_var_name from paddle.fluid.op import Operator @@ -771,7 +770,7 @@ def test_errors(self): class TestDygraphBatchNormAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): - batch_norm = nn.layer.norm.BatchNorm(10) + batch_norm = paddle.nn.BatchNorm(10) # the input of BatchNorm must be Variable. 
x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() @@ -794,7 +793,7 @@ def test_dygraph(self): def compute(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -817,7 +816,7 @@ def test_static(self): def compute(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -842,7 +841,7 @@ def test_reservespace(self): x = fluid.data(name='x', shape=x.shape, dtype=x.dtype) # Set this FLAG, the BatchNorm API will pass "reserve_space" argument into batch_norm op. os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' - batch_norm = nn.layer.norm.BatchNorm(7, data_layout="NHWC") + batch_norm = paddle.nn.BatchNorm(7, data_layout="NHWC") hidden1 = batch_norm(x) os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '0' diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index d27953d24e83c..74edcd61d343e 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.nn as nn from paddle.fluid import Program, program_guard from paddle.fluid.framework import _test_eager_guard @@ -83,7 +82,7 @@ def error3d(): def test_large_batch(self): def compute_baseline(x): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm(shape[1]) + bn = paddle.nn.BatchNorm(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) @@ -129,7 +128,7 @@ def test_eager_api(self): def compute_v1(x): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm(shape[1]) + bn = paddle.nn.BatchNorm(shape[1]) # bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False @@ -163,7 +162,7 @@ def test_dygraph(self): def compute_v1(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -184,7 +183,7 @@ def compute_v2(x): def compute_v3(x, is_test, trainable_statistics): with fluid.dygraph.guard(p): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, param_attr=fluid.ParamAttr( @@ -226,7 +225,7 @@ def test_static(self): def compute_v1(x_np, is_test, trainable_statistics): with program_guard(Program(), Program()): - bn = nn.layer.norm.BatchNorm( + bn = paddle.nn.BatchNorm( shape[1], is_test=is_test, trainable_statistics=trainable_statistics, @@ -380,7 +379,7 @@ def test_global_stats(self): for p in self.places: with fluid.dygraph.guard(p): x = paddle.randn([2, 6, 6, 4]) - net1 = nn.layer.norm.BatchNorm( + net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(1.0) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index f9b377cd2c2c9..0c89e000538d6 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -529,7 +529,7 @@ class SimpleNet(paddle.nn.Layer): def __init__(self): super().__init__() self.linear = 
paddle.nn.Linear(5, 5) - self.batch_norm = paddle.nn.layer.norm.BatchNorm(5) + self.batch_norm = paddle.nn.BatchNorm(5) def forward(self, x): x = self.linear(x) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 12bea8ce1f65b..2477d2e4785f7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -22,8 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.framework as framework from paddle.fluid.dygraph.nn import Embedding -from paddle.nn import Linear -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm, Linear class TestDygraphLoadStatic(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 7f40aba3d6488..58be44dc8ef6d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -23,8 +23,7 @@ from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard -from paddle.nn import Linear -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm, Linear class Config: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 7fc6fca074f5a..54b771134457a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -24,7 +24,7 @@ from paddle.fluid.dygraph.base import to_variable from paddle.fluid.framework import _test_eager_guard from paddle.fluid.layer_helper import LayerHelper -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm # NOTE(zhiqiu): run with FLAGS_cudnn_deterministic=1 diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index e84e1134f036c..5a1d986b8a29b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -22,7 +22,7 @@ from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard from paddle.fluid.layer_helper import LayerHelper -from paddle.nn.layer.norm import BatchNorm +from paddle.nn import BatchNorm batch_size = 8 train_parameters = { diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py index fb57ff7fb81d7..9f448e7f07a47 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -43,7 +43,7 @@ def __init__( data_format=data_format, ) - self._batch_norm = paddle.nn.layer.norm.BatchNorm( + self._batch_norm = paddle.nn.BatchNorm( num_filters, data_layout=data_format ) diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index 3512bbca63dcb..70c283a549a09 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -26,7 +26,7 
@@ class SimpleNet(paddle.nn.Layer): def __init__(self, data_format="NCHW", class_num=2): super().__init__() self.conv = paddle.nn.Conv2D(3, 8, (3, 3)) - self.bn = paddle.nn.layer.norm.BatchNorm(num_channels=8) + self.bn = paddle.nn.BatchNorm(num_channels=8) self.relu = paddle.nn.ReLU() self.pool = paddle.nn.AvgPool2D(kernel_size=2, stride=2) self.flatten = paddle.nn.Flatten() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index 0da8a62031b6a..d3909193cd6ce 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -27,7 +27,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.nn as nn import paddle.nn.functional as F paddle.enable_static() @@ -367,7 +366,7 @@ def test_global_stats(self): for p in self.places: with fluid.dygraph.guard(p): x = paddle.randn([2, 6, 6, 4]) - net1 = nn.layer.norm.BatchNorm( + net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(1.0) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py index 1b9434da600ab..3518083d75678 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -113,7 +113,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn1 = nn.layer.norm.BatchNorm( + self.bn1 = paddle.nn.BatchNorm( self.out_channels, act='relu', param_attr=bn1_weight, @@ -130,7 +130,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn2 = nn.layer.norm.BatchNorm( + self.bn2 = paddle.nn.BatchNorm( self.out_channels, act=None, param_attr=bn2_weight, @@ -147,7 +147,7 @@ def Base(self): bias_attr=None, data_format='NCHW', ) - self.bn3 = nn.layer.norm.BatchNorm( + self.bn3 = paddle.nn.BatchNorm( self.out_channels, act=None, param_attr=bn3_weight, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index d8f4026aadb73..f446970ee0ef6 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -26,6 +26,7 @@ # limitations under the License. # TODO: define normalization api + import numbers import os import warnings diff --git a/python/paddle/vision/models/densenet.py b/python/paddle/vision/models/densenet.py index 20747e828c838..f620e1d70956b 100644 --- a/python/paddle/vision/models/densenet.py +++ b/python/paddle/vision/models/densenet.py @@ -20,13 +20,13 @@ from paddle.nn import ( AdaptiveAvgPool2D, AvgPool2D, + BatchNorm, Conv2D, Dropout, Linear, MaxPool2D, ) from paddle.nn.initializer import Uniform -from paddle.nn.layer.norm import BatchNorm from paddle.utils.download import get_weights_path_from_url __all__ = []
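After this series, the legacy ``BatchNorm`` layer is defined in paddle/nn/layer/norm.py and, as the final patch shows, remains importable as ``paddle.nn.BatchNorm``. A minimal usage sketch of the relocated layer follows; the input shape and the ``act``/``data_layout`` arguments are illustrative values rather than anything taken from these patches.

.. code-block:: python

    import numpy as np
    import paddle
    # After this series, BatchNorm lives in paddle/nn/layer/norm.py but is
    # still re-exported from the paddle.nn namespace.
    from paddle.nn import BatchNorm

    # Example input: NCHW layout with 10 channels (hypothetical shape).
    x = paddle.to_tensor(np.random.random(size=(3, 10, 3, 7)).astype('float32'))
    bn = BatchNorm(10, act='relu', data_layout='NCHW')
    y = bn(x)
    print(y.shape)  # [3, 10, 3, 7]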
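The test updates in patches 11 and 12 also swap a number of legacy fluid calls for their paddle 2.x counterparts. The sketch below restates those substitutions as standalone calls so the mapping is easy to scan; the tensor names and sizes are made up for illustration and do not come from the patches.

.. code-block:: python

    import paddle

    x = paddle.randn([8, 16, 4, 4])

    # fluid.layers.reshape(y, shape=[...])       -> paddle.reshape(y, shape=[...])
    flat = paddle.reshape(x, shape=[-1, 16 * 4 * 4])

    # fluid.layers.elementwise_add(x=a, y=b)     -> paddle.add(x=a, y=b)
    summed = paddle.add(x=x, y=x)

    # fluid.layers.softmax(out, use_cudnn=False) -> paddle.nn.functional.softmax(out)
    probs = paddle.nn.functional.softmax(flat)

    # Pool2D(pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
    #                                            -> paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
    pool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
    pooled = pool(x)

    # Linear(..., act='relu') -> paddle.nn.Linear(...) plus an explicit activation layer
    fc = paddle.nn.Linear(16 * 4 * 4, 10)
    act = paddle.nn.ReLU()
    out = act(fc(flat))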