From d050c188de9a506fe2089488a9eca4743bf2c9e8 Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Tue, 6 Dec 2022 11:09:22 +0800 Subject: [PATCH 01/60] [remove fluid] Pool2D (#48707) --- .../slim/tests/test_imperative_skip_op.py | 1 - python/paddle/fluid/dygraph/nn.py | 233 ------------------ .../fleet/parallel_dygraph_se_resnext.py | 8 +- .../unittests/dygraph_to_static/test_mnist.py | 11 +- .../dygraph_to_static/test_mobile_net.py | 8 +- .../dygraph_to_static/test_resnet.py | 4 +- .../dygraph_to_static/test_resnet_v2.py | 4 +- .../dygraph_to_static/test_se_resnet.py | 8 +- .../unittests/dygraph_to_static/test_tsm.py | 4 +- .../tests/unittests/parallel_dygraph_mnist.py | 11 +- .../unittests/test_dygraph_mnist_fp16.py | 11 +- .../unittests/test_dygraph_multi_forward.py | 11 +- .../tests/unittests/test_imperative_mnist.py | 11 +- .../tests/unittests/test_imperative_resnet.py | 4 +- .../unittests/test_imperative_se_resnext.py | 8 +- python/paddle/tests/test_model.py | 8 +- 16 files changed, 36 insertions(+), 309 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 131866095ad7b..9b8ed24af2e55 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -26,7 +26,6 @@ from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 from paddle.nn import Linear, Conv2D, Softmax, BatchNorm -from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger from imperative_test_utils import ( diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 39da342c38085..4dfb67ab4aa42 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -50,7 +50,6 @@ __all__ = [ 'Conv3D', - 'Pool2D', 'Linear', 'BatchNorm', 'Embedding', @@ -506,238 +505,6 @@ def forward(self, input): return self._helper.append_activation(pre_act, act=self._act) -class Pool2D(layers.Layer): - r""" - - This interface is used to construct a callable object of the ``Pool2D`` class. - For more details, refer to code examples. - The pooling2d operation calculates the output based on the input, pool_type and pool_size, pool_stride, - pool_padding parameters.Input and output are in NCHW format, where N is batch size, C is the number of feature map, - H is the height of the feature map, and W is the width of the feature map. - Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. - The input(X) size and output(Out) size may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C, H_{in}, W_{in})` - - - Output: - - Output shape: :math:`(N, C, H_{out}, W_{out})` - - If ``ceil_mode`` = False: - - .. math:: - - H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\\\ - W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 - - If ``ceil_mode`` = True: - - .. math:: - - H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\\\ - W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 - - If ``exclusive`` = False: - - .. 
math::
-
-            hstart &= i * strides[0] - paddings[0] \\\\
-            hend &= hstart + ksize[0] \\\\
-            wstart &= j * strides[1] - paddings[1] \\\\
-            wend &= wstart + ksize[1] \\\\
-            Output(i, j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
-
-        If ``exclusive`` = True:
-
-        .. math::
-
-            hstart &= max(0, i * strides[0] - paddings[0]) \\\\
-            hend &= min(H, hstart + ksize[0]) \\\\
-            wstart &= max(0, j * strides[1] - paddings[1]) \\\\
-            wend &= min(W, wstart + ksize[1]) \\\\
-            Output(i, j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-
-    Parameters:
-        pool_size (int or list or tuple, optional): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be a square of an int. Default: -1.
-        pool_type (str, optional): The pooling type, can be "max" for max-pooling and "avg" for average-pooling.
-            Default: max.
-        pool_stride (int or list or tuple, optional): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise,
-            the pool stride size will be a square of an int. Default: 1.
-        pool_padding (int or list or tuple, optional): The padding size for pooling operation.
-            If ``pool_padding`` is a tuple,
-            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
-            Otherwise, the padding size for pooling operation will be a square of an int. Default: 0.
-        global_pooling (bool, optional): Whether to use the global pooling. If global_pooling = true,
-            kernel size and paddings will be ignored. Default: False.
-        use_cudnn (bool, optional): Only used in cudnn kernel, need install cudnn. Default: True.
-        ceil_mode (bool, optional): Whether to use the ceil function to calculate output height and width.
-            False is the default. If it is set to False, the floor function will be used. Default: False.
-        exclusive (bool, optional): Whether to exclude padding points in average pooling mode. Default: True.
-        data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
-            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
-            ``[batch_size, input_channels, input_height, input_width]``. When it is `"NHWC"`, the data is
-            stored in the order of: ``[batch_size, input_height, input_width, input_channels]``
-
-    Returns:
-        None
-
-    Raises:
-        ValueError: If ``pool_type`` is not "max" nor "avg".
-        ValueError: If ``global_pooling`` is False and ``pool_size`` is -1.
-        ValueError: If ``use_cudnn`` is not a bool value.
-        ValueError: If ``data_format`` is not "NCHW" nor "NHWC".
-
-    Examples:
-
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph.base import to_variable
-          import numpy as np
-
-          with fluid.dygraph.guard():
-             data = np.random.random((3, 32, 32, 5)).astype('float32')
-             pool2d = fluid.dygraph.Pool2D(pool_size=2,
-                            pool_type='max',
-                            pool_stride=1,
-                            global_pooling=False)
-             pool2d_res = pool2d(to_variable(data))
-
-    """
-
-    def __init__(
-        self,
-        pool_size=-1,
-        pool_type="max",
-        pool_stride=1,
-        pool_padding=0,
-        global_pooling=False,
-        use_cudnn=True,
-        ceil_mode=False,
-        exclusive=True,
-        data_format="NCHW",
-    ):
-        data_format = data_format.upper()  # support NHWC, nhwc, etc.
-        pool_type = pool_type.lower()  # support max, Max, etc.
-        if pool_type not in ["max", "avg"]:
-            raise ValueError(
-                "Unknown pool_type: '%s'. 
It can only be 'max' or 'avg'.", - str(pool_type), - ) - - if global_pooling is False and pool_size == -1: - raise ValueError( - "When the global_pooling is False, pool_size must be passed " - "and be a valid value. Received pool_size: " + str(pool_size) - ) - - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - - self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] - - if data_format not in ["NCHW", "NHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format) - ) - - super().__init__() - - self._pool_type = pool_type - self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') - self._pool_padding = utils.convert_to_list( - pool_padding, 2, 'pool_padding' - ) - self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') - self._global_pooling = global_pooling - self._use_cudnn = use_cudnn - self._ceil_mode = ceil_mode - self._exclusive = exclusive - self._data_format = data_format - self._l_type = 'pool2d' - - def forward(self, input): - if _non_static_mode(): - if not self._use_mkldnn and in_dygraph_mode(): - input = input._use_gpudnn(self._use_cudnn) - return _C_ops.pool2d( - input, - self._pool_size, - self._pool_stride, - self._pool_padding, - self._ceil_mode, - self._exclusive, - self._data_format, - self._pool_type, - self._global_pooling, - False, - "EXPLICIT", - ) - - attrs = ( - 'pooling_type', - self._pool_type, - 'ksize', - self._pool_size, - 'global_pooling', - self._global_pooling, - 'strides', - self._pool_stride, - 'paddings', - self._pool_padding, - 'use_cudnn', - self._use_cudnn, - 'ceil_mode', - self._ceil_mode, - 'use_mkldnn', - self._use_mkldnn, - 'exclusive', - self._exclusive, - 'data_format', - self._data_format, - ) - return _legacy_C_ops.pool2d(input, *attrs) - - check_variable_and_dtype( - input, - 'input', - ['int8', 'uint8', 'float16', 'float32', 'float64'], - 'Pool2D', - ) - - attrs = { - "pooling_type": self._pool_type, - "ksize": self._pool_size, - "global_pooling": self._global_pooling, - "strides": self._pool_stride, - "paddings": self._pool_padding, - "use_cudnn": self._use_cudnn, - "ceil_mode": self._ceil_mode, - "use_mkldnn": self._use_mkldnn, - "exclusive": self._exclusive, - "data_format": self._data_format, - } - inputs = {"X": [input]} - - pool_out = self._helper.create_variable_for_type_inference(self._dtype) - - self._helper.append_op( - type=self._l_type, - inputs={"X": input}, - outputs={"Out": pool_out}, - attrs=attrs, - ) - return pool_out - - class Linear(layers.Layer): """ diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py index 14bb4023b8cc5..f484a7930059b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py @@ -114,9 +114,7 @@ def __init__(self, num_channels, reduction_ratio): super().__init__() self._num_channels = num_channels - self._pool = paddle.fluid.dygraph.nn.Pool2D( - pool_size=0, pool_type='avg', global_pooling=True - ) + self._pool = paddle.nn.AdaptiveAvgPool2D(1) stdv = 1.0 / math.sqrt(num_channels * 1.0) self._squeeze = Linear( num_channels, @@ -295,9 +293,7 @@ def __init__(self, layers=50, class_dim=102): self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - 
pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) stdv = 1.0 / math.sqrt(2048 * 1.0) self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 93f01d165f3c4..d178370f546c4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -69,13 +69,10 @@ def __init__( bias_attr=None, ) - self._pool2d = paddle.fluid.dygraph.nn.Pool2D( - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn, + self._pool2d = paddle.nn.MaxPool2D( + kernel_size=pool_size, + stride=pool_stride, + padding=pool_padding, ) def forward(self, inputs): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index ff36720594c09..06def97195e35 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -256,9 +256,7 @@ def __init__(self, scale=1.0, class_dim=1000): ) self.dwsl.append(dws6) - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) self.out = Linear( int(1024 * scale), @@ -424,9 +422,7 @@ def __init__(self, class_dim=1000, scale=1.0): ) # 4. pool - self._pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_type='avg', global_pooling=True - ) + self._pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) # 5. 
fc tmp_param = ParamAttr(name=self.full_name() + "fc10_weights") diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 6323ba5ee7ed0..821195c2cb0ed 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -184,9 +184,7 @@ def __init__(self, layers=50, class_dim=102): ) self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 00e423d686fab..ee01b71e29c78 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -184,9 +184,7 @@ def __init__(self, layers=50, class_dim=102): ) self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 70ee21713c7ed..bce8c3a742d4a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -127,9 +127,7 @@ def __init__(self, num_channels, reduction_ratio): super().__init__() self._num_channels = num_channels - self._pool = paddle.fluid.dygraph.nn.Pool2D( - pool_size=0, pool_type='avg', global_pooling=True - ) + self._pool = paddle.nn.AdaptiveAvgPool2D(1) stdv = 1.0 / math.sqrt(num_channels * 1.0) self._fc = Linear( num_channels, @@ -309,9 +307,7 @@ def __init__(self, layers=50, class_dim=102): num_channels = bottleneck_block._num_channels_out self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) stdv = 1.0 / math.sqrt(2048 * 1.0) self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index 0be42a27feb70..f1bd4fa4af998 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -185,9 +185,7 @@ def __init__(self, name_scope, config, mode): num_channels = int(bottleneck_block._num_channels_out) self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) import math stdv = 1.0 / math.sqrt(2048 * 1.0) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index 150abe911e501..dd9e995606534 
100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -54,13 +54,10 @@ def __init__( bias_attr=None, ) - self._pool2d = paddle.fluid.dygraph.nn.Pool2D( - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn, + self._pool2d = paddle.nn.MaxPool2D( + kernel_size=pool_size, + stride=pool_stride, + padding=pool_padding, ) def forward(self, inputs): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py index 477db13a701b7..ef7059887b91b 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py @@ -57,13 +57,10 @@ def __init__( bias_attr=bias_attr, ) - self._pool2d = paddle.fluid.dygraph.nn.Pool2D( - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn, + self._pool2d = paddle.nn.MaxPool2D( + kernel_size=pool_size, + stride=pool_stride, + padding=pool_padding, ) def forward(self, inputs): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index fd7f97063b646..e4fd9766a2622 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -61,13 +61,10 @@ def __init__( bias_attr=None, ) - self._pool2d = paddle.fluid.dygraph.nn.Pool2D( - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn, + self._pool2d = paddle.nn.MaxPool2D( + kernel_size=pool_size, + stride=pool_stride, + padding=pool_padding, ) def forward(self, inputs): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 66d7eb19fb403..d4a26eb4cef72 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -59,13 +59,10 @@ def __init__( weight_attr=None, bias_attr=None, ) - self._pool2d = paddle.fluid.dygraph.nn.Pool2D( - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn, + self._pool2d = paddle.nn.MaxPool2D( + kernel_size=pool_size, + stride=pool_stride, + padding=pool_padding, ) def forward(self, inputs): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 0b5ba9a563147..a8cf1fc8ce86a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -215,9 +215,7 @@ def __init__(self, layers=50, class_dim=102, use_cudnn=True): ) self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1 diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 
6eb5ab1874d52..f2c8d285a0439 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -104,9 +104,7 @@ def __init__(self, num_channels, reduction_ratio): super().__init__() self._num_channels = num_channels - self._pool = paddle.fluid.dygraph.nn.Pool2D( - pool_size=0, pool_type='avg', global_pooling=True - ) + self._pool = paddle.nn.AdaptiveAvgPool2D(1) self._squeeze = paddle.nn.Linear( num_channels, num_channels // reduction_ratio, @@ -286,9 +284,7 @@ def __init__(self, layers=50, class_dim=102): num_channels = bottleneck_block._num_channels_out self.bottleneck_block_list.append(bottleneck_block) shortcut = True - self.pool2d_avg = paddle.fluid.dygraph.nn.Pool2D( - pool_size=7, pool_type='avg', global_pooling=True - ) + self.pool2d_avg = paddle.nn.AdaptiveAvgPool2D(1) import math stdv = 1.0 / math.sqrt(2048 * 1.0) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 021c523d210a5..34973bafbde91 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -41,10 +41,10 @@ def __init__(self, num_classes=10): self.features = Sequential( Conv2D(1, 6, 3, stride=1, padding=1), ReLU(), - paddle.fluid.dygraph.Pool2D(2, 'max', 2), + paddle.nn.MaxPool2D(2, 2), Conv2D(6, 16, 5, stride=1, padding=0), ReLU(), - paddle.fluid.dygraph.Pool2D(2, 'max', 2), + paddle.nn.MaxPool2D(2, 2), ) if num_classes > 0: @@ -93,10 +93,10 @@ def __init__(self, num_classes=10): self.features = Sequential( self.cov, ReLU(), - paddle.fluid.dygraph.Pool2D(2, 'max', 2), + paddle.nn.MaxPool2D(2, 2), Conv2D(6, 16, 5, stride=1, padding=0), ReLU(), - paddle.fluid.dygraph.Pool2D(2, 'max', 2), + paddle.nn.MaxPool2D(2, 2), ) if num_classes > 0: From 3125733a1c6154dd4a1b60e3b14500fd775cdc8a Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 6 Dec 2022 11:17:07 +0800 Subject: [PATCH 02/60] rm _disable_legacy_dygraph and disable one mkldnn test file (#48721) * rm _disable_legacy_dygraph * disable test_flags_mkldnn_ops_on_off test --- .../tests/unittests/mkldnn/CMakeLists.txt | 3 +- .../mkldnn/check_flags_mkldnn_ops_on_off.py | 4 +- .../fluid/tests/unittests/test_assign_op.py | 5 - .../tests/unittests/test_egr_python_api.py | 648 ++++++++---------- .../tests/unittests/test_linalg_lstsq_op.py | 1 - 5 files changed, 296 insertions(+), 365 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt index 3290ce5644c12..50062d69bc8c9 100755 --- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -5,6 +5,7 @@ file( string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") list(REMOVE_ITEM TEST_OPS "test_onnx_format_quantization_mobilenetv1") +list(REMOVE_ITEM TEST_OPS "test_flags_mkldnn_ops_on_off") if(WITH_MKLDNN AND NOT WIN32) list(APPEND TEST_OPS "test_onnx_format_quantization_mobilenetv1") @@ -19,4 +20,4 @@ if(WITH_MKLDNN AND NOT WIN32) set_tests_properties(test_onnx_format_quantization_mobilenetv1 PROPERTIES TIMEOUT 300) endif() -set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120) +# set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py index aa9811a94bc3e..7f471307bafa4 100644 --- 
a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py @@ -18,11 +18,9 @@ import paddle import paddle.fluid as fluid -from paddle.fluid.framework import _enable_legacy_dygraph, _global_flags +from paddle.fluid.framework import _global_flags from paddle.fluid.layer_helper import LayerHelper -_enable_legacy_dygraph() - def check(): print( diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index e2325733d1ad7..3c4b50f2f0498 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -22,7 +22,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.fluid.framework as framework import paddle.fluid.layers as layers from paddle.fluid import Program, program_guard from paddle.fluid.backward import append_backward @@ -42,7 +41,6 @@ def test_forward(self): self.check_output(check_eager=True) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() - framework._disable_legacy_dygraph() def test_backward(self): paddle.enable_static() @@ -50,7 +48,6 @@ def test_backward(self): self.check_grad(['X'], 'Out', check_eager=True) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() - framework._disable_legacy_dygraph() class TestAssignFP16Op(op_test.OpTest): @@ -67,7 +64,6 @@ def test_forward(self): self.check_output(check_eager=True) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() - framework._disable_legacy_dygraph() def test_backward(self): paddle.enable_static() @@ -75,7 +71,6 @@ def test_backward(self): self.check_grad(['X'], 'Out', check_eager=True) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.disable_static() - framework._disable_legacy_dygraph() class TestAssignOpWithLoDTensorArray(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index c5ecca10b2e55..247d264efded9 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -22,85 +22,77 @@ from paddle.fluid.framework import ( EagerParamBase, _current_expected_place, - _disable_legacy_dygraph, - _test_eager_guard, in_dygraph_mode, ) class EagerScaleTestCase(unittest.TestCase): def test_scale_base(self): - with _test_eager_guard(): - paddle.set_device("cpu") - arr = np.ones([4, 16, 16, 32]).astype('float32') - tensor = paddle.to_tensor(arr, 'float32', core.CPUPlace()) - print(tensor) + paddle.set_device("cpu") + arr = np.ones([4, 16, 16, 32]).astype('float32') + tensor = paddle.to_tensor(arr, 'float32', core.CPUPlace()) + print(tensor) + tensor = core.eager.scale(tensor, 2.0, 0.9, True, False) + for i in range(0, 100): tensor = core.eager.scale(tensor, 2.0, 0.9, True, False) - for i in range(0, 100): - tensor = core.eager.scale(tensor, 2.0, 0.9, True, False) - print(tensor) - self.assertEqual(tensor.shape, [4, 16, 16, 32]) - self.assertEqual(tensor.stop_gradient, True) + print(tensor) + self.assertEqual(tensor.shape, [4, 16, 16, 32]) + self.assertEqual(tensor.stop_gradient, True) def test_retain_grad_and_run_backward(self): - with _test_eager_guard(): - paddle.set_device("cpu") + paddle.set_device("cpu") - input_data = np.ones([4, 16, 16, 32]).astype('float32') - 
data_eager = paddle.to_tensor( - input_data, 'float32', core.CPUPlace(), False - ) + input_data = np.ones([4, 16, 16, 32]).astype('float32') + data_eager = paddle.to_tensor( + input_data, 'float32', core.CPUPlace(), False + ) - grad_data = np.ones([4, 16, 16, 32]).astype('float32') - grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) + grad_data = np.ones([4, 16, 16, 32]).astype('float32') + grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) - data_eager.retain_grads() + data_eager.retain_grads() - out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertIsNone(data_eager.grad) - out_eager.backward(grad_eager, False) - self.assertIsNotNone(data_eager.grad) - np.testing.assert_array_equal(data_eager.grad.numpy(), input_data) + out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) + self.assertIsNone(data_eager.grad) + out_eager.backward(grad_eager, False) + self.assertIsNotNone(data_eager.grad) + np.testing.assert_array_equal(data_eager.grad.numpy(), input_data) def test_retain_grad_and_run_backward_raises(self): - with _test_eager_guard(): - paddle.set_device("cpu") + paddle.set_device("cpu") - input_data = np.ones([4, 16, 16, 32]).astype('float32') - data_eager = paddle.to_tensor( - input_data, 'float32', core.CPUPlace(), False - ) + input_data = np.ones([4, 16, 16, 32]).astype('float32') + data_eager = paddle.to_tensor( + input_data, 'float32', core.CPUPlace(), False + ) - grad_data = np.ones([4, 16, 16, 32]).astype('float32') - grad_data2 = np.ones([4, 16]).astype('float32') - grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) - grad_eager2 = paddle.to_tensor( - grad_data2, 'float32', core.CPUPlace() - ) + grad_data = np.ones([4, 16, 16, 32]).astype('float32') + grad_data2 = np.ones([4, 16]).astype('float32') + grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) + grad_eager2 = paddle.to_tensor(grad_data2, 'float32', core.CPUPlace()) - data_eager.retain_grads() + data_eager.retain_grads() - out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertIsNone(data_eager.grad) - with self.assertRaisesRegexp( - AssertionError, "The type of grad_tensor must be paddle.Tensor" - ): - out_eager.backward(grad_data, False) + out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) + self.assertIsNone(data_eager.grad) + with self.assertRaisesRegexp( + AssertionError, "The type of grad_tensor must be paddle.Tensor" + ): + out_eager.backward(grad_data, False) - with self.assertRaisesRegexp( - AssertionError, - "Tensor shape not match, Tensor of grad_tensor /*", - ): - out_eager.backward(grad_eager2, False) + with self.assertRaisesRegexp( + AssertionError, + "Tensor shape not match, Tensor of grad_tensor /*", + ): + out_eager.backward(grad_eager2, False) class EagerDtypeTestCase(unittest.TestCase): def check_to_tesnsor_and_numpy(self, dtype, proto_dtype): - with _test_eager_guard(): - arr = np.random.random([4, 16, 16, 32]).astype(dtype) - tensor = paddle.to_tensor(arr, dtype) - self.assertEqual(tensor.dtype, proto_dtype) - np.testing.assert_array_equal(arr, tensor.numpy()) + arr = np.random.random([4, 16, 16, 32]).astype(dtype) + tensor = paddle.to_tensor(arr, dtype) + self.assertEqual(tensor.dtype, proto_dtype) + np.testing.assert_array_equal(arr, tensor.numpy()) def test_dtype_base(self): print("Test_dtype") @@ -315,9 +307,9 @@ def test_constructor(self): place_list = [core.CPUPlace()] if core.is_compiled_with_cuda(): place_list.append(core.CUDAPlace(0)) - with 
_test_eager_guard(): - for p in place_list: - self.constructor(p) + + for p in place_list: + self.constructor(p) def constructor_with_kwargs(self, place): # init Tensor by Python array @@ -639,180 +631,171 @@ def test_constructor_with_kwargs(self): place_list = [core.CPUPlace()] if core.is_compiled_with_cuda(): place_list.append(core.CUDAPlace(0)) - with _test_eager_guard(): - for p in place_list: - self.constructor_with_kwargs(p) + + for p in place_list: + self.constructor_with_kwargs(p) def test_copy_and_copy_to(self): print("Test_copy_and_copy_to") - with _test_eager_guard(): - paddle.set_device("cpu") - arr = np.ones([4, 16, 16, 32]).astype('float32') - arr1 = np.zeros([4, 16]).astype('float32') - arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( - [4, 16, 16, 32] - ).astype('float32') - tensor = paddle.to_tensor( - arr, core.VarDesc.VarType.FP32, core.CPUPlace() - ) - self.assertEqual(tensor.stop_gradient, True) - tensor.stop_gradient = False - print("Set persistable") - tensor.persistable = False - tensor1 = paddle.to_tensor( - arr1, core.VarDesc.VarType.FP32, core.CPUPlace() + + paddle.set_device("cpu") + arr = np.ones([4, 16, 16, 32]).astype('float32') + arr1 = np.zeros([4, 16]).astype('float32') + arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( + [4, 16, 16, 32] + ).astype('float32') + tensor = paddle.to_tensor( + arr, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + self.assertEqual(tensor.stop_gradient, True) + tensor.stop_gradient = False + print("Set persistable") + tensor.persistable = False + tensor1 = paddle.to_tensor( + arr1, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + tensor1.persistable = True + self.assertEqual(tensor1.stop_gradient, True) + np.testing.assert_array_equal(tensor.numpy(), arr) + print("Test copy_") + tensor.copy_(tensor1, True) + self.assertEqual(tensor.persistable, False) + self.assertEqual(tensor.shape, [4, 16]) + self.assertEqual(tensor.dtype, core.VarDesc.VarType.FP32) + np.testing.assert_array_equal(tensor.numpy(), arr1) + + print("Test _copy_to") + tensor2 = paddle.to_tensor( + arr2, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + np.testing.assert_array_equal(tensor2.numpy(), arr2) + self.assertTrue(tensor2.place.is_cpu_place()) + tensor2.persistable = True + tensor2.stop_gradient = False + if core.is_compiled_with_cuda(): + tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) + np.testing.assert_array_equal(tensor3.numpy(), arr2) + self.assertEqual(tensor3.persistable, True) + self.assertEqual(tensor3.stop_gradient, True) + self.assertTrue(tensor3.place.is_gpu_place()) + + tensor4 = tensor2.cuda(0, True) + np.testing.assert_array_equal(tensor4.numpy(), arr2) + self.assertEqual(tensor4.persistable, True) + self.assertEqual(tensor4.stop_gradient, False) + self.assertTrue(tensor4.place.is_gpu_place()) + + tensor5 = tensor4.cpu() + np.testing.assert_array_equal(tensor5.numpy(), arr2) + self.assertEqual(tensor5.persistable, True) + self.assertEqual(tensor5.stop_gradient, False) + self.assertTrue(tensor5.place.is_cpu_place()) + + tensor10 = paddle.to_tensor([1, 2, 3], place='gpu_pinned') + tensor11 = tensor10._copy_to(core.CUDAPlace(0), True) + np.testing.assert_array_equal(tensor10.numpy(), tensor11.numpy()) + else: + tensor3 = tensor2._copy_to(core.CPUPlace(), True) + np.testing.assert_array_equal(tensor3.numpy(), arr2) + self.assertEqual(tensor3.persistable, True) + self.assertEqual(tensor3.stop_gradient, True) + self.assertTrue(tensor3.place.is_cpu_place()) + + tensor4 = tensor2.cpu() + 
np.testing.assert_array_equal(tensor4.numpy(), arr2) + self.assertEqual(tensor4.persistable, True) + self.assertEqual(tensor4.stop_gradient, False) + self.assertTrue(tensor4.place.is_cpu_place()) + + def test_share_buffer_to(self): + arr = np.ones([4, 16, 16, 32]).astype('float32') + arr1 = np.zeros([4, 16]).astype('float32') + arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( + [4, 16, 16, 32] + ).astype('float32') + tensor = None + tensor2 = None + tensor = paddle.to_tensor( + arr, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + tensor3 = core.eager.Tensor(value=tensor, place=core.CPUPlace()) + if core.is_compiled_with_cuda(): + tensor2 = paddle.to_tensor( + arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0) ) - tensor1.persistable = True - self.assertEqual(tensor1.stop_gradient, True) - np.testing.assert_array_equal(tensor.numpy(), arr) - print("Test copy_") - tensor.copy_(tensor1, True) - self.assertEqual(tensor.persistable, False) - self.assertEqual(tensor.shape, [4, 16]) - self.assertEqual(tensor.dtype, core.VarDesc.VarType.FP32) - np.testing.assert_array_equal(tensor.numpy(), arr1) - - print("Test _copy_to") + else: tensor2 = paddle.to_tensor( arr2, core.VarDesc.VarType.FP32, core.CPUPlace() ) - np.testing.assert_array_equal(tensor2.numpy(), arr2) - self.assertTrue(tensor2.place.is_cpu_place()) - tensor2.persistable = True - tensor2.stop_gradient = False - if core.is_compiled_with_cuda(): - tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) - np.testing.assert_array_equal(tensor3.numpy(), arr2) - self.assertEqual(tensor3.persistable, True) - self.assertEqual(tensor3.stop_gradient, True) - self.assertTrue(tensor3.place.is_gpu_place()) - - tensor4 = tensor2.cuda(0, True) - np.testing.assert_array_equal(tensor4.numpy(), arr2) - self.assertEqual(tensor4.persistable, True) - self.assertEqual(tensor4.stop_gradient, False) - self.assertTrue(tensor4.place.is_gpu_place()) - - tensor5 = tensor4.cpu() - np.testing.assert_array_equal(tensor5.numpy(), arr2) - self.assertEqual(tensor5.persistable, True) - self.assertEqual(tensor5.stop_gradient, False) - self.assertTrue(tensor5.place.is_cpu_place()) - - tensor10 = paddle.to_tensor([1, 2, 3], place='gpu_pinned') - tensor11 = tensor10._copy_to(core.CUDAPlace(0), True) - np.testing.assert_array_equal( - tensor10.numpy(), tensor11.numpy() - ) - else: - tensor3 = tensor2._copy_to(core.CPUPlace(), True) - np.testing.assert_array_equal(tensor3.numpy(), arr2) - self.assertEqual(tensor3.persistable, True) - self.assertEqual(tensor3.stop_gradient, True) - self.assertTrue(tensor3.place.is_cpu_place()) - - tensor4 = tensor2.cpu() - np.testing.assert_array_equal(tensor4.numpy(), arr2) - self.assertEqual(tensor4.persistable, True) - self.assertEqual(tensor4.stop_gradient, False) - self.assertTrue(tensor4.place.is_cpu_place()) - - def test_share_buffer_to(self): - with _test_eager_guard(): - arr = np.ones([4, 16, 16, 32]).astype('float32') - arr1 = np.zeros([4, 16]).astype('float32') - arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( - [4, 16, 16, 32] - ).astype('float32') - tensor = None - tensor2 = None - tensor = paddle.to_tensor( - arr, core.VarDesc.VarType.FP32, core.CPUPlace() - ) - tensor3 = core.eager.Tensor(value=tensor, place=core.CPUPlace()) - if core.is_compiled_with_cuda(): - tensor2 = paddle.to_tensor( - arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0) - ) - else: - tensor2 = paddle.to_tensor( - arr2, core.VarDesc.VarType.FP32, core.CPUPlace() - ) - np.testing.assert_array_equal(tensor.numpy(), arr) - 
np.testing.assert_array_equal(tensor2.numpy(), arr2) - tensor2._share_buffer_to(tensor) - np.testing.assert_array_equal(tensor.numpy(), arr2) - np.testing.assert_array_equal(tensor2.numpy(), arr2) - self.assertTrue(tensor._is_shared_buffer_with(tensor2)) - self.assertTrue(tensor2._is_shared_buffer_with(tensor)) - tensor._share_buffer_to(tensor3) - np.testing.assert_array_equal(tensor3.numpy(), arr2) - self.assertTrue(tensor3._is_shared_buffer_with(tensor)) + np.testing.assert_array_equal(tensor.numpy(), arr) + np.testing.assert_array_equal(tensor2.numpy(), arr2) + tensor2._share_buffer_to(tensor) + np.testing.assert_array_equal(tensor.numpy(), arr2) + np.testing.assert_array_equal(tensor2.numpy(), arr2) + self.assertTrue(tensor._is_shared_buffer_with(tensor2)) + self.assertTrue(tensor2._is_shared_buffer_with(tensor)) + tensor._share_buffer_to(tensor3) + np.testing.assert_array_equal(tensor3.numpy(), arr2) + self.assertTrue(tensor3._is_shared_buffer_with(tensor)) def test_share_underline_tensor_to(self): - with _test_eager_guard(): - arr = np.ones([4, 16, 16, 32]).astype('float32') - arr1 = np.zeros([4, 16]).astype('float32') - arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( - [4, 16, 16, 32] - ).astype('float32') - tensor = None - tensor2 = None - tensor = paddle.to_tensor( - arr, core.VarDesc.VarType.FP32, core.CPUPlace() + arr = np.ones([4, 16, 16, 32]).astype('float32') + arr1 = np.zeros([4, 16]).astype('float32') + arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( + [4, 16, 16, 32] + ).astype('float32') + tensor = None + tensor2 = None + tensor = paddle.to_tensor( + arr, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + tensor3 = core.eager.Tensor() + if core.is_compiled_with_cuda(): + tensor2 = paddle.to_tensor( + arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0) ) - tensor3 = core.eager.Tensor() - if core.is_compiled_with_cuda(): - tensor2 = paddle.to_tensor( - arr2, core.VarDesc.VarType.FP32, core.CUDAPlace(0) - ) - else: - tensor2 = paddle.to_tensor( - arr2, core.VarDesc.VarType.FP32, core.CPUPlace() - ) - np.testing.assert_array_equal(tensor.numpy(), arr) - np.testing.assert_array_equal(tensor2.numpy(), arr2) - tensor2._share_underline_tensor_to(tensor) - np.testing.assert_array_equal(tensor.numpy(), arr2) - np.testing.assert_array_equal(tensor2.numpy(), arr2) - self.assertTrue(tensor._is_shared_underline_tensor_with(tensor2)) - self.assertTrue(tensor2._is_shared_underline_tensor_with(tensor)) - tensor._share_underline_tensor_to(tensor3) - np.testing.assert_array_equal(tensor3.numpy(), arr2) - self.assertTrue(tensor3._is_shared_underline_tensor_with(tensor)) + else: + tensor2 = paddle.to_tensor( + arr2, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + np.testing.assert_array_equal(tensor.numpy(), arr) + np.testing.assert_array_equal(tensor2.numpy(), arr2) + tensor2._share_underline_tensor_to(tensor) + np.testing.assert_array_equal(tensor.numpy(), arr2) + np.testing.assert_array_equal(tensor2.numpy(), arr2) + self.assertTrue(tensor._is_shared_underline_tensor_with(tensor2)) + self.assertTrue(tensor2._is_shared_underline_tensor_with(tensor)) + tensor._share_underline_tensor_to(tensor3) + np.testing.assert_array_equal(tensor3.numpy(), arr2) + self.assertTrue(tensor3._is_shared_underline_tensor_with(tensor)) def test_properties(self): print("Test_properties") - with _test_eager_guard(): - paddle.set_device("cpu") - arr = np.ones([4, 16, 16, 32]).astype('float32') - tensor = paddle.to_tensor( - arr, core.VarDesc.VarType.FP32, core.CPUPlace() - ) - 
self.assertEqual(tensor.shape, [4, 16, 16, 32]) - tensor.name = 'tensor_name_test' - self.assertEqual(tensor.name, 'tensor_name_test') - self.assertEqual(tensor.persistable, False) - tensor.persistable = True - self.assertEqual(tensor.persistable, True) - tensor.persistable = False - self.assertEqual(tensor.persistable, False) - self.assertTrue(tensor.place.is_cpu_place()) - self.assertEqual(tensor._place_str, 'Place(cpu)') - self.assertEqual(tensor.stop_gradient, True) - tensor.stop_gradient = False - self.assertEqual(tensor.stop_gradient, False) - tensor.stop_gradient = True - self.assertEqual(tensor.stop_gradient, True) - self.assertEqual(tensor.type, core.VarDesc.VarType.LOD_TENSOR) + paddle.set_device("cpu") + arr = np.ones([4, 16, 16, 32]).astype('float32') + tensor = paddle.to_tensor( + arr, core.VarDesc.VarType.FP32, core.CPUPlace() + ) + self.assertEqual(tensor.shape, [4, 16, 16, 32]) + tensor.name = 'tensor_name_test' + self.assertEqual(tensor.name, 'tensor_name_test') + self.assertEqual(tensor.persistable, False) + tensor.persistable = True + self.assertEqual(tensor.persistable, True) + tensor.persistable = False + self.assertEqual(tensor.persistable, False) + self.assertTrue(tensor.place.is_cpu_place()) + self.assertEqual(tensor._place_str, 'Place(cpu)') + self.assertEqual(tensor.stop_gradient, True) + tensor.stop_gradient = False + self.assertEqual(tensor.stop_gradient, False) + tensor.stop_gradient = True + self.assertEqual(tensor.stop_gradient, True) + self.assertEqual(tensor.type, core.VarDesc.VarType.LOD_TENSOR) def test_global_properties(self): print("Test_global_properties") - _disable_legacy_dygraph() self.assertTrue(in_dygraph_mode()) - with _test_eager_guard(): - self.assertTrue(in_dygraph_mode()) - self.assertFalse(in_dygraph_mode()) def test_place_guard(self): if core.is_compiled_with_cuda(): @@ -829,109 +812,97 @@ def test_place_guard(self): ) def test_value(self): - with _test_eager_guard(): - arr = np.random.rand(4, 16, 16, 32).astype('float64') - - egr_tensor0 = core.eager.Tensor(value=arr) - self.assertEqual(egr_tensor0.persistable, False) - self.assertTrue("generated" in egr_tensor0.name) - self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) - self.assertTrue( - egr_tensor0.place._equals( - paddle.fluid.framework._current_expected_place() - ) - ) - self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP64) - self.assertEqual(egr_tensor0.stop_gradient, True) - self.assertTrue( - egr_tensor0.value().get_tensor()._dtype(), - core.VarDesc.VarType.FP64, - ) - self.assertTrue( - egr_tensor0.value().get_tensor()._place(), - paddle.fluid.framework._current_expected_place(), + arr = np.random.rand(4, 16, 16, 32).astype('float64') + + egr_tensor0 = core.eager.Tensor(value=arr) + self.assertEqual(egr_tensor0.persistable, False) + self.assertTrue("generated" in egr_tensor0.name) + self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) + self.assertTrue( + egr_tensor0.place._equals( + paddle.fluid.framework._current_expected_place() ) - self.assertTrue(egr_tensor0.value().get_tensor()._is_initialized()) + ) + self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP64) + self.assertEqual(egr_tensor0.stop_gradient, True) + self.assertTrue( + egr_tensor0.value().get_tensor()._dtype(), + core.VarDesc.VarType.FP64, + ) + self.assertTrue( + egr_tensor0.value().get_tensor()._place(), + paddle.fluid.framework._current_expected_place(), + ) + self.assertTrue(egr_tensor0.value().get_tensor()._is_initialized()) def test_set_value(self): - with _test_eager_guard(): - ori_arr = 
np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor = core.eager.Tensor(value=ori_arr) - self.assertEqual(egr_tensor.stop_gradient, True) - self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) - np.testing.assert_array_equal(egr_tensor.numpy(), ori_arr) - ori_place = egr_tensor.place - - new_arr = np.random.rand(4, 16, 16, 32).astype('float32') - self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr)) - - egr_tensor.set_value(new_arr) - self.assertEqual(egr_tensor.stop_gradient, True) - self.assertTrue(egr_tensor.place._equals(ori_place)) - self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) - np.testing.assert_array_equal(egr_tensor.numpy(), new_arr) + ori_arr = np.random.rand(4, 16, 16, 32).astype('float32') + egr_tensor = core.eager.Tensor(value=ori_arr) + self.assertEqual(egr_tensor.stop_gradient, True) + self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) + np.testing.assert_array_equal(egr_tensor.numpy(), ori_arr) + ori_place = egr_tensor.place + + new_arr = np.random.rand(4, 16, 16, 32).astype('float32') + self.assertFalse(np.array_equal(egr_tensor.numpy(), new_arr)) + + egr_tensor.set_value(new_arr) + self.assertEqual(egr_tensor.stop_gradient, True) + self.assertTrue(egr_tensor.place._equals(ori_place)) + self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) + np.testing.assert_array_equal(egr_tensor.numpy(), new_arr) def test_sharding_related_api(self): - with _test_eager_guard(): - arr0 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor1 = core.eager.Tensor( - arr0, core.CPUPlace(), True, False, "numpy_tensor1", False - ) - self.assertEqual(egr_tensor1._numel(), 32768) - self.assertEqual(egr_tensor1._slice(0, 2)._numel(), 16384) + arr0 = np.random.rand(4, 16, 16, 32).astype('float32') + egr_tensor1 = core.eager.Tensor( + arr0, core.CPUPlace(), True, False, "numpy_tensor1", False + ) + self.assertEqual(egr_tensor1._numel(), 32768) + self.assertEqual(egr_tensor1._slice(0, 2)._numel(), 16384) def test_copy_gradient_from(self): - with _test_eager_guard(): - np_x = np.random.random((2, 2)) - np_y = np.random.random((2, 2)) - x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) - y = paddle.to_tensor(np_y, dtype="float64") - out = x + x - out.backward() - x._copy_gradient_from(y) - np.testing.assert_array_equal(x.grad.numpy(), np_y) + np_x = np.random.random((2, 2)) + np_y = np.random.random((2, 2)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64") + out = x + x + out.backward() + x._copy_gradient_from(y) + np.testing.assert_array_equal(x.grad.numpy(), np_y) def test_clear(self): - with _test_eager_guard(): - np_x = np.random.random((3, 8, 8)) - x = paddle.to_tensor(np_x, dtype="float64") - self.assertTrue(x._is_initialized()) - x._clear() - self.assertFalse(x._is_initialized()) + np_x = np.random.random((3, 8, 8)) + x = paddle.to_tensor(np_x, dtype="float64") + self.assertTrue(x._is_initialized()) + x._clear() + self.assertFalse(x._is_initialized()) def test_use_gpudnn(self): np_x = np.random.random((3, 8, 8)) - with _test_eager_guard(): - self.assertTrue(in_dygraph_mode()) - x = paddle.to_tensor(np_x, dtype="float64") - y = x._use_gpudnn(False) - np.testing.assert_array_equal(x.numpy(), y.numpy()) - y = x._use_gpudnn(True) - np.testing.assert_array_equal(x.numpy(), y.numpy()) - - self.assertFalse(in_dygraph_mode()) + + self.assertTrue(in_dygraph_mode()) x = paddle.to_tensor(np_x, dtype="float64") - with self.assertRaises(AttributeError): - x = x._use_gpudnn(False) + y = 
x._use_gpudnn(False) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + y = x._use_gpudnn(True) + np.testing.assert_array_equal(x.numpy(), y.numpy()) class EagerParamBaseUsageTestCase(unittest.TestCase): def test_print(self): - with _test_eager_guard(): - linear = paddle.nn.Linear(3, 3, bias_attr=False) - print(linear.weight) + linear = paddle.nn.Linear(3, 3, bias_attr=False) + print(linear.weight) def test_copy(self): - with _test_eager_guard(): - linear = paddle.nn.Linear(1, 3) - linear_copy = copy.deepcopy(linear) - linear_copy2 = linear.weight._copy_to(core.CPUPlace(), True) - np.testing.assert_array_equal( - linear.weight.numpy(), linear_copy.weight.numpy() - ) - np.testing.assert_array_equal( - linear.weight.numpy(), linear_copy2.numpy() - ) + linear = paddle.nn.Linear(1, 3) + linear_copy = copy.deepcopy(linear) + linear_copy2 = linear.weight._copy_to(core.CPUPlace(), True) + np.testing.assert_array_equal( + linear.weight.numpy(), linear_copy.weight.numpy() + ) + np.testing.assert_array_equal( + linear.weight.numpy(), linear_copy2.numpy() + ) def func_fp16_initilaizer(self): paddle.set_default_dtype("float16") @@ -963,18 +934,6 @@ def func_fp16_initilaizer(self): paddle.set_default_dtype("float32") return res - def test_fp16_initializer(self): - res1 = list() - res2 = list() - paddle.seed(102) - paddle.framework.random._manual_program_seed(102) - with _test_eager_guard(): - res1 = self.func_fp16_initilaizer() - res2 = self.func_fp16_initilaizer() - - for i in range(len(res1)): - np.testing.assert_array_equal(res1[i], res2[i]) - def func_layer_helper_base(self, value): base = paddle.fluid.layer_helper_base.LayerHelperBase( "test_layer", "test_layer" @@ -984,53 +943,32 @@ def func_layer_helper_base(self, value): def func_base_to_variable(self, value): paddle.fluid.dygraph.base.to_variable(value) - def test_to_variable(self): - value = np.random.rand(4, 16, 16, 32).astype('float32') - res1 = None - res3 = None - with _test_eager_guard(): - res1 = self.func_layer_helper_base(value) - res3 = self.func_base_to_variable(value) - res2 = self.func_layer_helper_base(value) - res4 = self.func_base_to_variable(value) - np.testing.assert_array_equal(res1, res2) - np.testing.assert_array_equal(res3, res4) - def test_backward_with_single_tensor(self): - with _test_eager_guard(): - arr4 = np.random.rand(4, 16, 16, 32).astype('float32') - egr_tensor12 = core.eager.Tensor(arr4, core.CPUPlace()) - egr_tensor12.retain_grads() - arr = np.ones([4, 16, 16, 32]).astype('float32') - self.assertEqual(egr_tensor12.persistable, False) - self.assertTrue("generated_tensor" in egr_tensor12.name) - self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32]) - self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) - self.assertEqual(egr_tensor12.stop_gradient, True) - self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) - np.testing.assert_array_equal(egr_tensor12.numpy(), arr4) - np.testing.assert_array_equal(egr_tensor12.gradient(), None) - egr_tensor12.stop_gradient = False - egr_tensor12.backward() - np.testing.assert_array_equal(egr_tensor12.gradient(), arr) + arr4 = np.random.rand(4, 16, 16, 32).astype('float32') + egr_tensor12 = core.eager.Tensor(arr4, core.CPUPlace()) + egr_tensor12.retain_grads() + arr = np.ones([4, 16, 16, 32]).astype('float32') + self.assertEqual(egr_tensor12.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor12.name) + self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32]) + self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) + 
self.assertEqual(egr_tensor12.stop_gradient, True) + self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) + np.testing.assert_array_equal(egr_tensor12.numpy(), arr4) + np.testing.assert_array_equal(egr_tensor12.gradient(), None) + egr_tensor12.stop_gradient = False + egr_tensor12.backward() + np.testing.assert_array_equal(egr_tensor12.gradient(), arr) def test_set_value(self): - with _test_eager_guard(): - linear = paddle.nn.Linear(1, 3) - ori_place = linear.weight.place - new_weight = np.ones([1, 3]).astype('float32') - self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) - - linear.weight.set_value(new_weight) - np.testing.assert_array_equal(linear.weight.numpy(), new_weight) - self.assertTrue(linear.weight.place._equals(ori_place)) - - -class EagerGuardTestCase(unittest.TestCase): - def test__test_eager_guard(self): - tracer = paddle.fluid.dygraph.tracer.Tracer() - with _test_eager_guard(tracer): - self.assertTrue(in_dygraph_mode()) + linear = paddle.nn.Linear(1, 3) + ori_place = linear.weight.place + new_weight = np.ones([1, 3]).astype('float32') + self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) + + linear.weight.set_value(new_weight) + np.testing.assert_array_equal(linear.weight.numpy(), new_weight) + self.assertTrue(linear.weight.place._equals(ori_place)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index bae9094a7f74e..82576ab1bd1bf 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -73,7 +73,6 @@ def generate_output(self): def test_eager_dygraph(self): paddle.disable_static() - paddle.fluid.framework._disable_legacy_dygraph() for dev in self.devices: paddle.set_device(dev) place = paddle.CPUPlace() if dev == "cpu" else paddle.CUDAPlace(0) From 39adb22aba6520ac2147ea320411dc94fec55f57 Mon Sep 17 00:00:00 2001 From: Ryan <44900829+DrRyanHuang@users.noreply.github.com> Date: Tue, 6 Dec 2022 11:32:25 +0800 Subject: [PATCH 03/60] Add IntermediateLayerGetter (#47908) --- .../unittests/test_IntermediateLayerGetter.py | 92 +++++++++++++++ python/paddle/vision/models/_utils.py | 108 ++++++++++++++++++ python/paddle/vision/models/mobilenetv2.py | 2 +- python/paddle/vision/models/mobilenetv3.py | 2 +- python/paddle/vision/models/utils.py | 32 ------ 5 files changed, 202 insertions(+), 34 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_IntermediateLayerGetter.py create mode 100644 python/paddle/vision/models/_utils.py delete mode 100644 python/paddle/vision/models/utils.py diff --git a/python/paddle/fluid/tests/unittests/test_IntermediateLayerGetter.py b/python/paddle/fluid/tests/unittests/test_IntermediateLayerGetter.py new file mode 100644 index 0000000000000..90d182bddf8ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_IntermediateLayerGetter.py @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import paddle +from paddle.vision.models._utils import IntermediateLayerGetter + + +class TestBase: + def setUp(self): + + self.init_model() + self.model.eval() + + self.layer_names = [ + (order, name) + for order, (name, _) in enumerate(self.model.named_children()) + ] + # choose two layer children of model randomly + self.start, self.end = sorted( + random.sample(self.layer_names, 2), key=lambda x: x[0] + ) + + self.return_layers_dic = {self.start[1]: "feat1", self.end[1]: "feat2"} + self.new_model = IntermediateLayerGetter( + self.model, self.return_layers_dic + ) + + def init_model(self): + self.model = None + + @paddle.no_grad() + def test_inter_result(self): + + inp = paddle.randn([1, 3, 80, 80]) + inter_oup = self.new_model(inp) + + for layer_name, layer in self.model.named_children(): + + if (isinstance(layer, paddle.nn.Linear) and inp.ndim == 4) or ( + len(layer.sublayers()) > 0 + and isinstance(layer.sublayers()[0], paddle.nn.Linear) + and inp.ndim == 4 + ): + inp = paddle.flatten(inp, 1) + + inp = layer(inp) + if layer_name in self.return_layers_dic: + feat_name = self.return_layers_dic[layer_name] + self.assertTrue((inter_oup[feat_name] == inp).all()) + + +class TestIntermediateLayerGetterResNet18(TestBase, unittest.TestCase): + def init_model(self): + self.model = paddle.vision.models.resnet18(pretrained=False) + + +class TestIntermediateLayerGetterDenseNet121(TestBase, unittest.TestCase): + def init_model(self): + self.model = paddle.vision.models.densenet121(pretrained=False) + + +class TestIntermediateLayerGetterVGG11(TestBase, unittest.TestCase): + def init_model(self): + self.model = paddle.vision.models.vgg11(pretrained=False) + + +class TestIntermediateLayerGetterMobileNetV3Small(TestBase, unittest.TestCase): + def init_model(self): + self.model = paddle.vision.models.MobileNetV3Small() + + +class TestIntermediateLayerGetterShuffleNetV2(TestBase, unittest.TestCase): + def init_model(self): + self.model = paddle.vision.models.shufflenet_v2_x0_25() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/vision/models/_utils.py b/python/paddle/vision/models/_utils.py new file mode 100644 index 0000000000000..a556700801794 --- /dev/null +++ b/python/paddle/vision/models/_utils.py @@ -0,0 +1,108 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from typing import Dict + +import paddle +import paddle.nn as nn + + +def _make_divisible(v, divisor=8, min_value=None): + """ + This function ensures that all layers have a channel number that is divisible by divisor + You can also see at https://github.com/keras-team/keras/blob/8ecef127f70db723c158dbe9ed3268b3d610ab55/keras/applications/mobilenet_v2.py#L505 + + Args: + divisor (int): The divisor for number of channels. Default: 8. 
+        min_value (int, optional): The minimum value of the number of channels.
+            If it is None, the default is ``divisor``. Default: None.
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that rounding down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class IntermediateLayerGetter(nn.LayerDict):
+    """
+    Layer wrapper that returns intermediate layers from a model.
+
+    It has a strong assumption that the layers have been registered into the model in
+    the same order as they are used in the forward pass. This means that the same
+    nn.Layer must **not** be reused twice in the forward pass for this to work.
+
+    Additionally, it is only able to query sublayers that are directly assigned to the
+    model. So if `model` is passed, `model.feature1` can be returned, but not
+    `model.feature1.layer2`.
+
+    Args:
+        model (nn.Layer): the model from which the features will be extracted.
+        return_layers (Dict[name, new_name]): a dict whose keys are the names of the
+            layers whose activations will be returned, and whose values are the names
+            given to the returned activations (which the user can specify).
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            m = paddle.vision.models.resnet18(pretrained=False)
+            # extract layer1 and layer3, giving them the names `feat1` and `feat2`
+            new_m = paddle.vision.models._utils.IntermediateLayerGetter(m,
+                {'layer1': 'feat1', 'layer3': 'feat2'})
+            out = new_m(paddle.rand([1, 3, 224, 224]))
+            print([(k, v.shape) for k, v in out.items()])
+            # [('feat1', [1, 64, 56, 56]), ('feat2', [1, 256, 14, 14])]
+    """
+
+    __annotations__ = {
+        "return_layers": Dict[str, str],
+    }
+
+    def __init__(self, model: nn.Layer, return_layers: Dict[str, str]) -> None:
+        if not set(return_layers).issubset(
+            [name for name, _ in model.named_children()]
+        ):
+            raise ValueError("return_layers are not present in model")
+        orig_return_layers = return_layers
+        return_layers = {str(k): str(v) for k, v in return_layers.items()}
+        layers = OrderedDict()
+        # Keep only the direct children up to (and including) the last requested layer.
+        for name, module in model.named_children():
+            layers[name] = module
+            if name in return_layers:
+                del return_layers[name]
+            if not return_layers:
+                break
+
+        super(IntermediateLayerGetter, self).__init__(layers)
+        self.return_layers = orig_return_layers
+
+    def forward(self, x):
+        out = OrderedDict()
+        for name, module in self.items():
+            # A 4-D feature map cannot be fed to a Linear head directly, so flatten
+            # right before a Linear layer (or a block whose first sublayer is Linear).
+            if (isinstance(module, nn.Linear) and x.ndim == 4) or (
+                len(module.sublayers()) > 0
+                and isinstance(module.sublayers()[0], nn.Linear)
+                and x.ndim == 4
+            ):
+                x = paddle.flatten(x, 1)
+
+            x = module(x)
+            if name in self.return_layers:
+                out_name = self.return_layers[name]
+                out[out_name] = x
+        return out
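The two helpers above are small enough to sanity-check by hand. The following is a
minimal sketch, assuming a Paddle build where ``paddle.vision`` is importable; the
layer names and printed shapes are illustrative for a 224x224 input to ``resnet18``:

.. code-block:: python

    import paddle
    from paddle.vision.models._utils import IntermediateLayerGetter, _make_divisible

    # _make_divisible rounds to the nearest multiple of `divisor`,
    # but never rounds the value down by more than 10%.
    assert _make_divisible(37, 8) == 40   # 37 -> nearest multiple of 8
    assert _make_divisible(20, 16) == 32  # 16 < 0.9 * 20, so bump up by one divisor

    # IntermediateLayerGetter keeps the direct children up to the last
    # requested layer and returns the tagged activations.
    model = paddle.vision.models.resnet18(pretrained=False)
    getter = IntermediateLayerGetter(model, {'layer2': 'mid', 'layer4': 'deep'})
    feats = getter(paddle.rand([1, 3, 224, 224]))
    print({k: v.shape for k, v in feats.items()})
    # e.g. {'mid': [1, 128, 28, 28], 'deep': [1, 512, 7, 7]}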
diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py
index 1f9d04509dd7b..12b5210c7cdb0 100644
--- a/python/paddle/vision/models/mobilenetv2.py
+++ b/python/paddle/vision/models/mobilenetv2.py
@@ -17,7 +17,7 @@
 from paddle.utils.download import get_weights_path_from_url
 
 from ..ops import ConvNormActivation
-from .utils import _make_divisible
+from ._utils import _make_divisible
 
 __all__ = []
 
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py
index 3ca62af7e558f..195578314048c 100644
--- a/python/paddle/vision/models/mobilenetv3.py
+++ b/python/paddle/vision/models/mobilenetv3.py
@@ -19,7 +19,7 @@
 from paddle.utils.download import get_weights_path_from_url
 
 from ..ops import ConvNormActivation
-from .utils import _make_divisible
+from ._utils import _make_divisible
 
 __all__ = []
 
diff --git a/python/paddle/vision/models/utils.py b/python/paddle/vision/models/utils.py
deleted file mode 100644
index f61d0d601a44f..0000000000000
--- a/python/paddle/vision/models/utils.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def _make_divisible(v, divisor=8, min_value=None):
-    """
-    This function ensures that all layers have a channel number that is divisible by divisor
-    You can also see at https://github.com/keras-team/keras/blob/8ecef127f70db723c158dbe9ed3268b3d610ab55/keras/applications/mobilenet_v2.py#L505
-
-    Args:
-        divisor (int): The divisor for number of channels. Default: 8.
-        min_value (int, optional): The minimum value of number of channels, if it is None,
-            the default is divisor. Default: None.
-    """
-    if min_value is None:
-        min_value = divisor
-    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
-    # Make sure that round down does not go down by more than 10%.
-    if new_v < 0.9 * v:
-        new_v += divisor
-    return new_v

From 4ab8a2ae91d8c19fe7504f1c23589496209cbb1a Mon Sep 17 00:00:00 2001
From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com>
Date: Tue, 6 Dec 2022 11:55:05 +0800
Subject: [PATCH 04/60] update dgc download link. (#48700)

* update dgc download link.

* update.

---
 cmake/external/dgc.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
index ef1ee1b0020f0..edd46aca80a9f 100644
--- a/cmake/external/dgc.cmake
+++ b/cmake/external/dgc.cmake
@@ -23,14 +23,14 @@ set(DGC_INCLUDE_DIR
 set(DGC_LIBRARIES
     "${DGC_INSTALL_DIR}/lib/libdgc.a"
     CACHE FILEPATH "dgc library."
FORCE) -set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz") +set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz") include_directories(${DGC_INCLUDE_DIR}) ExternalProject_Add( extern_dgc ${EXTERNAL_PROJECT_LOG_ARGS} URL ${DGC_URL} - URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251" + URL_MD5 "ede459281a0f979da8d84f81287369ff" PREFIX "${DGC_PREFIX_DIR}" CONFIGURE_COMMAND "" BUILD_COMMAND make -j${NPROC} From 1976cc4b6ea8463e73f92e4fcf9c313e918f9e25 Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Tue, 6 Dec 2022 12:19:18 +0800 Subject: [PATCH 05/60] [fluid remove]: remove paddle.fluid.layers.target_assign, paddle.fluid.layers.rpn_target_assign, paddle.fluid.layers.retinanet_target_assign and paddle.fluid.layers.ssd_loss (#48669) * remove paddle.fluid.layers.nn.temporal_shift * code check * rm unittest * remove ssd_loss, target_assigns * remove target_assign, retinanet_target_assign, rpn_target_assign and ssd_loss --- python/paddle/fluid/layers/detection.py | 829 ------------------ python/paddle/fluid/tests/test_detection.py | 149 ---- .../fluid/tests/unittests/test_layers.py | 64 -- .../unittests/test_rpn_target_assign_op.py | 422 --------- 4 files changed, 1464 deletions(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 486daac6092c6..3d277705aa90e 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -43,11 +43,7 @@ 'density_prior_box', 'multi_box_head', 'bipartite_match', - 'target_assign', 'detection_output', - 'ssd_loss', - 'rpn_target_assign', - 'retinanet_target_assign', 'anchor_generator', 'roi_perspective_transform', 'generate_proposal_labels', @@ -69,460 +65,6 @@ ] -def retinanet_target_assign( - bbox_pred, - cls_logits, - anchor_box, - anchor_var, - gt_boxes, - gt_labels, - is_crowd, - im_info, - num_classes=1, - positive_overlap=0.5, - negative_overlap=0.4, -): - r""" - **Target Assign Layer for the detector RetinaNet.** - - This OP finds out positive and negative samples from all anchors - for training the detector `RetinaNet `_ , - and assigns target labels for classification along with target locations for - regression to each sample, then takes out the part belonging to positive and - negative samples from category prediction( :attr:`cls_logits`) and location - prediction( :attr:`bbox_pred`) which belong to all anchors. - - The searching principles for positive and negative samples are as followed: - - 1. Anchors are assigned to ground-truth boxes when it has the highest IoU - overlap with a ground-truth box. - - 2. Anchors are assigned to ground-truth boxes when it has an IoU overlap - higher than :attr:`positive_overlap` with any ground-truth box. - - 3. Anchors are assigned to background when its IoU overlap is lower than - :attr:`negative_overlap` for all ground-truth boxes. - - 4. Anchors which do not meet the above conditions do not participate in - the training process. - - Retinanet predicts a :math:`C`-vector for classification and a 4-vector for box - regression for each anchor, hence the target label for each positive(or negative) - sample is a :math:`C`-vector and the target locations for each positive sample - is a 4-vector. 
As for a positive sample, if the category of its assigned - ground-truth box is class :math:`i`, the corresponding entry in its length - :math:`C` label vector is set to 1 and all other entries is set to 0, its box - regression targets are computed as the offset between itself and its assigned - ground-truth box. As for a negative sample, all entries in its length :math:`C` - label vector are set to 0 and box regression targets are omitted because - negative samples do not participate in the training process of location - regression. - - After the assignment, the part belonging to positive and negative samples is - taken out from category prediction( :attr:`cls_logits` ), and the part - belonging to positive samples is taken out from location - prediction( :attr:`bbox_pred` ). - - Args: - bbox_pred(Variable): A 3-D Tensor with shape :math:`[N, M, 4]` represents - the predicted locations of all anchors. :math:`N` is the batch size( the - number of images in a mini-batch), :math:`M` is the number of all anchors - of one image, and each anchor has 4 coordinate values. The data type of - :attr:`bbox_pred` is float32 or float64. - cls_logits(Variable): A 3-D Tensor with shape :math:`[N, M, C]` represents - the predicted categories of all anchors. :math:`N` is the batch size, - :math:`M` is the number of all anchors of one image, and :math:`C` is - the number of categories (**Notice: excluding background**). The data type - of :attr:`cls_logits` is float32 or float64. - anchor_box(Variable): A 2-D Tensor with shape :math:`[M, 4]` represents - the locations of all anchors. :math:`M` is the number of all anchors of - one image, each anchor is represented as :math:`[xmin, ymin, xmax, ymax]`, - :math:`[xmin, ymin]` is the left top coordinate of the anchor box, - :math:`[xmax, ymax]` is the right bottom coordinate of the anchor box. - The data type of :attr:`anchor_box` is float32 or float64. Please refer - to the OP :ref:`api_fluid_layers_anchor_generator` - for the generation of :attr:`anchor_box`. - anchor_var(Variable): A 2-D Tensor with shape :math:`[M,4]` represents the expanded - factors of anchor locations used in loss function. :math:`M` is number of - all anchors of one image, each anchor possesses a 4-vector expanded factor. - The data type of :attr:`anchor_var` is float32 or float64. Please refer - to the OP :ref:`api_fluid_layers_anchor_generator` - for the generation of :attr:`anchor_var`. - gt_boxes(Variable): A 1-level 2-D LoDTensor with shape :math:`[G, 4]` represents - locations of all ground-truth boxes. :math:`G` is the total number of - all ground-truth boxes in a mini-batch, and each ground-truth box has 4 - coordinate values. The data type of :attr:`gt_boxes` is float32 or - float64. - gt_labels(variable): A 1-level 2-D LoDTensor with shape :math:`[G, 1]` represents - categories of all ground-truth boxes, and the values are in the range of - :math:`[1, C]`. :math:`G` is the total number of all ground-truth boxes - in a mini-batch, and each ground-truth box has one category. The data type - of :attr:`gt_labels` is int32. - is_crowd(Variable): A 1-level 1-D LoDTensor with shape :math:`[G]` which - indicates whether a ground-truth box is a crowd. If the value is 1, the - corresponding box is a crowd, it is ignored during training. :math:`G` is - the total number of all ground-truth boxes in a mini-batch. The data type - of :attr:`is_crowd` is int32. - im_info(Variable): A 2-D Tensor with shape [N, 3] represents the size - information of input images. 
:math:`N` is the batch size, the size - information of each image is a 3-vector which are the height and width - of the network input along with the factor scaling the origin image to - the network input. The data type of :attr:`im_info` is float32. - num_classes(int32): The number of categories for classification, the default - value is 1. - positive_overlap(float32): Minimum overlap required between an anchor - and ground-truth box for the anchor to be a positive sample, the default - value is 0.5. - negative_overlap(float32): Maximum overlap allowed between an anchor - and ground-truth box for the anchor to be a negative sample, the default - value is 0.4. :attr:`negative_overlap` should be less than or equal to - :attr:`positive_overlap`, if not, the actual value of - :attr:`positive_overlap` is :attr:`negative_overlap`. - - Returns: - A tuple with 6 Variables: - - **predict_scores** (Variable): A 2-D Tensor with shape :math:`[F+B, C]` represents - category prediction belonging to positive and negative samples. :math:`F` - is the number of positive samples in a mini-batch, :math:`B` is the number - of negative samples, and :math:`C` is the number of categories - (**Notice: excluding background**). The data type of :attr:`predict_scores` - is float32 or float64. - - **predict_location** (Variable): A 2-D Tensor with shape :math:`[F, 4]` represents - location prediction belonging to positive samples. :math:`F` is the number - of positive samples. :math:`F` is the number of positive samples, and each - sample has 4 coordinate values. The data type of :attr:`predict_location` - is float32 or float64. - - **target_label** (Variable): A 2-D Tensor with shape :math:`[F+B, 1]` represents - target labels for classification belonging to positive and negative - samples. :math:`F` is the number of positive samples, :math:`B` is the - number of negative, and each sample has one target category. The data type - of :attr:`target_label` is int32. - - **target_bbox** (Variable): A 2-D Tensor with shape :math:`[F, 4]` represents - target locations for box regression belonging to positive samples. - :math:`F` is the number of positive samples, and each sample has 4 - coordinate values. The data type of :attr:`target_bbox` is float32 or - float64. - - **bbox_inside_weight** (Variable): A 2-D Tensor with shape :math:`[F, 4]` - represents whether a positive sample is fake positive, if a positive - sample is false positive, the corresponding entries in - :attr:`bbox_inside_weight` are set 0, otherwise 1. :math:`F` is the number - of total positive samples in a mini-batch, and each sample has 4 - coordinate values. The data type of :attr:`bbox_inside_weight` is float32 - or float64. - - **fg_num** (Variable): A 2-D Tensor with shape :math:`[N, 1]` represents the number - of positive samples. :math:`N` is the batch size. **Notice: The number - of positive samples is used as the denominator of later loss function, - to avoid the condition that the denominator is zero, this OP has added 1 - to the actual number of positive samples of each image.** The data type of - :attr:`fg_num` is int32. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - bbox_pred = fluid.data(name='bbox_pred', shape=[1, 100, 4], - dtype='float32') - cls_logits = fluid.data(name='cls_logits', shape=[1, 100, 10], - dtype='float32') - anchor_box = fluid.data(name='anchor_box', shape=[100, 4], - dtype='float32') - anchor_var = fluid.data(name='anchor_var', shape=[100, 4], - dtype='float32') - gt_boxes = fluid.data(name='gt_boxes', shape=[10, 4], - dtype='float32') - gt_labels = fluid.data(name='gt_labels', shape=[10, 1], - dtype='int32') - is_crowd = fluid.data(name='is_crowd', shape=[1], - dtype='int32') - im_info = fluid.data(name='im_info', shape=[1, 3], - dtype='float32') - score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num = \\ - fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box, - anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10) - - """ - - check_variable_and_dtype( - bbox_pred, - 'bbox_pred', - ['float32', 'float64'], - 'retinanet_target_assign', - ) - check_variable_and_dtype( - cls_logits, - 'cls_logits', - ['float32', 'float64'], - 'retinanet_target_assign', - ) - check_variable_and_dtype( - anchor_box, - 'anchor_box', - ['float32', 'float64'], - 'retinanet_target_assign', - ) - check_variable_and_dtype( - anchor_var, - 'anchor_var', - ['float32', 'float64'], - 'retinanet_target_assign', - ) - check_variable_and_dtype( - gt_boxes, 'gt_boxes', ['float32', 'float64'], 'retinanet_target_assign' - ) - check_variable_and_dtype( - gt_labels, 'gt_labels', ['int32'], 'retinanet_target_assign' - ) - check_variable_and_dtype( - is_crowd, 'is_crowd', ['int32'], 'retinanet_target_assign' - ) - check_variable_and_dtype( - im_info, 'im_info', ['float32', 'float64'], 'retinanet_target_assign' - ) - - helper = LayerHelper('retinanet_target_assign', **locals()) - # Assign target label to anchors - loc_index = helper.create_variable_for_type_inference(dtype='int32') - score_index = helper.create_variable_for_type_inference(dtype='int32') - target_label = helper.create_variable_for_type_inference(dtype='int32') - target_bbox = helper.create_variable_for_type_inference( - dtype=anchor_box.dtype - ) - bbox_inside_weight = helper.create_variable_for_type_inference( - dtype=anchor_box.dtype - ) - fg_num = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="retinanet_target_assign", - inputs={ - 'Anchor': anchor_box, - 'GtBoxes': gt_boxes, - 'GtLabels': gt_labels, - 'IsCrowd': is_crowd, - 'ImInfo': im_info, - }, - outputs={ - 'LocationIndex': loc_index, - 'ScoreIndex': score_index, - 'TargetLabel': target_label, - 'TargetBBox': target_bbox, - 'BBoxInsideWeight': bbox_inside_weight, - 'ForegroundNumber': fg_num, - }, - attrs={ - 'positive_overlap': positive_overlap, - 'negative_overlap': negative_overlap, - }, - ) - - loc_index.stop_gradient = True - score_index.stop_gradient = True - target_label.stop_gradient = True - target_bbox.stop_gradient = True - bbox_inside_weight.stop_gradient = True - fg_num.stop_gradient = True - - cls_logits = paddle.reshape(x=cls_logits, shape=(-1, num_classes)) - bbox_pred = paddle.reshape(x=bbox_pred, shape=(-1, 4)) - predicted_cls_logits = paddle.gather(cls_logits, score_index) - predicted_bbox_pred = paddle.gather(bbox_pred, loc_index) - - return ( - predicted_cls_logits, - predicted_bbox_pred, - target_label, - target_bbox, - bbox_inside_weight, - fg_num, - ) - - -def rpn_target_assign( - bbox_pred, - cls_logits, - anchor_box, - anchor_var, - gt_boxes, - is_crowd, - im_info, - 
rpn_batch_size_per_im=256, - rpn_straddle_thresh=0.0, - rpn_fg_fraction=0.5, - rpn_positive_overlap=0.7, - rpn_negative_overlap=0.3, - use_random=True, -): - """ - **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.** - - This layer can be, for given the Intersection-over-Union (IoU) overlap - between anchors and ground truth boxes, to assign classification and - regression targets to each each anchor, these target labels are used for - train RPN. The classification targets is a binary class label (of being - an object or not). Following the paper of Faster-RCNN, the positive labels - are two kinds of anchors: (i) the anchor/anchors with the highest IoU - overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap - higher than rpn_positive_overlap(0.7) with any ground-truth box. Note - that a single ground-truth box may assign positive labels to multiple - anchors. A non-positive anchor is when its IoU ratio is lower than - rpn_negative_overlap (0.3) for all ground-truth boxes. Anchors that are - neither positive nor negative do not contribute to the training objective. - The regression targets are the encoded ground-truth boxes associated with - the positive anchors. - - Args: - bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the - predicted locations of M bounding bboxes. N is the batch size, - and each bounding box has four coordinate values and the layout - is [xmin, ymin, xmax, ymax]. The data type can be float32 or float64. - cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the - predicted confidence predictions. N is the batch size, 1 is the - frontground and background sigmoid, M is number of bounding boxes. - The data type can be float32 or float64. - anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, - each box is represented as [xmin, ymin, xmax, ymax], - [xmin, ymin] is the left top coordinate of the anchor box, - if the input is image feature map, they are close to the origin - of the coordinate system. [xmax, ymax] is the right bottom - coordinate of the anchor box. The data type can be float32 or float64. - anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded - variances of anchors. The data type can be float32 or float64. - gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D - LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth - bboxes of mini-batch input. The data type can be float32 or float64. - is_crowd (Variable): A 1-D LoDTensor which indicates groud-truth is crowd. - The data type must be int32. - im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, - 3 is the height, width and scale. - rpn_batch_size_per_im(int): Total number of RPN examples per image. - The data type must be int32. - rpn_straddle_thresh(float): Remove RPN anchors that go outside the image - by straddle_thresh pixels. The data type must be float32. - rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled - foreground (i.e. class > 0), 0-th class is background. The data type must be float32. - rpn_positive_overlap(float): Minimum overlap required between an anchor - and ground-truth box for the (anchor, gt box) pair to be a positive - example. The data type must be float32. - rpn_negative_overlap(float): Maximum overlap allowed between an anchor - and ground-truth box for the (anchor, gt box) pair to be a negative - examples. The data type must be float32. 
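The thresholding scheme spelled out above is easy to reproduce outside the operator.
A plain-NumPy sketch of the labeling rule, with made-up IoU values (the 0.7/0.3
thresholds correspond to ``rpn_positive_overlap``/``rpn_negative_overlap``):

.. code-block:: python

    import numpy as np

    # IoU of 3 anchors (rows) against 2 ground-truth boxes (columns).
    iou = np.array([[0.75, 0.10],
                    [0.40, 0.25],
                    [0.10, 0.05]])

    labels = np.full(3, -1)          # -1: neither positive nor negative, ignored
    max_iou = iou.max(axis=1)
    labels[max_iou < 0.3] = 0        # negative: low overlap with every gt box
    labels[max_iou >= 0.7] = 1       # positive (ii): overlap above the threshold
    labels[iou.argmax(axis=0)] = 1   # positive (i): best anchor for each gt box
    print(labels)                    # [1 1 0]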
- - Returns: - tuple: - A tuple(predicted_scores, predicted_location, target_label, - target_bbox, bbox_inside_weight) is returned. The predicted_scores - and predicted_location is the predicted result of the RPN. - The target_label and target_bbox is the ground truth, - respectively. The predicted_location is a 2D Tensor with shape - [F, 4], and the shape of target_bbox is same as the shape of - the predicted_location, F is the number of the foreground - anchors. The predicted_scores is a 2D Tensor with shape - [F + B, 1], and the shape of target_label is same as the shape - of the predicted_scores, B is the number of the background - anchors, the F and B is depends on the input of this operator. - Bbox_inside_weight represents whether the predicted loc is fake_fg - or not and the shape is [F, 4]. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - bbox_pred = fluid.data(name='bbox_pred', shape=[None, 4], dtype='float32') - cls_logits = fluid.data(name='cls_logits', shape=[None, 1], dtype='float32') - anchor_box = fluid.data(name='anchor_box', shape=[None, 4], dtype='float32') - anchor_var = fluid.data(name='anchor_var', shape=[None, 4], dtype='float32') - gt_boxes = fluid.data(name='gt_boxes', shape=[None, 4], dtype='float32') - is_crowd = fluid.data(name='is_crowd', shape=[None], dtype='float32') - im_info = fluid.data(name='im_infoss', shape=[None, 3], dtype='float32') - loc, score, loc_target, score_target, inside_weight = fluid.layers.rpn_target_assign( - bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes, is_crowd, im_info) - - """ - - helper = LayerHelper('rpn_target_assign', **locals()) - - check_variable_and_dtype( - bbox_pred, 'bbox_pred', ['float32', 'float64'], 'rpn_target_assign' - ) - check_variable_and_dtype( - cls_logits, 'cls_logits', ['float32', 'float64'], 'rpn_target_assign' - ) - check_variable_and_dtype( - anchor_box, 'anchor_box', ['float32', 'float64'], 'rpn_target_assign' - ) - check_variable_and_dtype( - anchor_var, 'anchor_var', ['float32', 'float64'], 'rpn_target_assign' - ) - check_variable_and_dtype( - gt_boxes, 'gt_boxes', ['float32', 'float64'], 'rpn_target_assign' - ) - check_variable_and_dtype( - is_crowd, 'is_crowd', ['int32'], 'rpn_target_assign' - ) - check_variable_and_dtype( - im_info, 'im_info', ['float32', 'float64'], 'rpn_target_assign' - ) - - # Assign target label to anchors - loc_index = helper.create_variable_for_type_inference(dtype='int32') - score_index = helper.create_variable_for_type_inference(dtype='int32') - target_label = helper.create_variable_for_type_inference(dtype='int32') - target_bbox = helper.create_variable_for_type_inference( - dtype=anchor_box.dtype - ) - bbox_inside_weight = helper.create_variable_for_type_inference( - dtype=anchor_box.dtype - ) - helper.append_op( - type="rpn_target_assign", - inputs={ - 'Anchor': anchor_box, - 'GtBoxes': gt_boxes, - 'IsCrowd': is_crowd, - 'ImInfo': im_info, - }, - outputs={ - 'LocationIndex': loc_index, - 'ScoreIndex': score_index, - 'TargetLabel': target_label, - 'TargetBBox': target_bbox, - 'BBoxInsideWeight': bbox_inside_weight, - }, - attrs={ - 'rpn_batch_size_per_im': rpn_batch_size_per_im, - 'rpn_straddle_thresh': rpn_straddle_thresh, - 'rpn_positive_overlap': rpn_positive_overlap, - 'rpn_negative_overlap': rpn_negative_overlap, - 'rpn_fg_fraction': rpn_fg_fraction, - 'use_random': use_random, - }, - ) - - loc_index.stop_gradient = True - score_index.stop_gradient = True - target_label.stop_gradient = True - target_bbox.stop_gradient = True - 
bbox_inside_weight.stop_gradient = True - - cls_logits = paddle.reshape(x=cls_logits, shape=(-1, 1)) - bbox_pred = paddle.reshape(x=bbox_pred, shape=(-1, 4)) - predicted_cls_logits = paddle.gather(cls_logits, score_index) - predicted_bbox_pred = paddle.gather(bbox_pred, loc_index) - - return ( - predicted_cls_logits, - predicted_bbox_pred, - target_label, - target_bbox, - bbox_inside_weight, - ) - - def detection_output( loc, scores, @@ -1340,377 +882,6 @@ def bipartite_match( return match_indices, match_distance -def target_assign( - input, - matched_indices, - negative_indices=None, - mismatch_value=None, - name=None, -): - """ - - This operator can be, for given the target bounding boxes or labels, - to assign classification and regression targets to each prediction as well as - weights to prediction. The weights is used to specify which prediction would - not contribute to training loss. - - For each instance, the output `out` and`out_weight` are assigned based on - `match_indices` and `negative_indices`. - Assumed that the row offset for each instance in `input` is called lod, - this operator assigns classification/regression targets by performing the - following steps: - - 1. Assigning all outputs based on `match_indices`: - - .. code-block:: text - - If id = match_indices[i][j] > 0, - - out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] - out_weight[i][j] = 1. - - Otherwise, - - out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} - out_weight[i][j] = 0. - - 2. Assigning outputs based on `neg_indices` if `neg_indices` is provided: - - Assumed that i-th instance in `neg_indices` is called `neg_indice`, - for i-th instance: - - .. code-block:: text - - for id in neg_indice: - out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} - out_weight[i][id] = 1.0 - - Args: - input (Variable): This input is a 3D LoDTensor with shape [M, P, K]. - Data type should be int32 or float32. - matched_indices (Variable): The input matched indices - is 2D Tenosr with shape [N, P], If MatchIndices[i][j] is -1, - the j-th entity of column is not matched to any entity of row in - i-th instance. - negative_indices (Variable, optional): The input negative example indices - are an optional input with shape [Neg, 1] and int32 type, where Neg is - the total number of negative example indices. - mismatch_value (float32, optional): Fill this value to the mismatched - location. - name (string): The default value is None. Normally there is no need for - user to set this property. For more information, please refer - to :ref:`api_guide_Name`. - - Returns: - tuple: A tuple(out, out_weight) is returned. - - out (Variable): a 3D Tensor with shape [N, P, K] and same data type - with `input`, N and P is the same as they are in `matched_indices`, - K is the same as it in input of X. - - out_weight (Variable): the weight for output with the shape of [N, P, 1]. - Data type is float32. - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - x = fluid.data( - name='x', - shape=[4, 20, 4], - dtype='float', - lod_level=1) - matched_id = fluid.data( - name='indices', - shape=[8, 20], - dtype='int32') - trg, trg_weight = fluid.layers.target_assign( - x, - matched_id, - mismatch_value=0) - """ - helper = LayerHelper('target_assign', **locals()) - out = helper.create_variable_for_type_inference(dtype=input.dtype) - out_weight = helper.create_variable_for_type_inference(dtype='float32') - helper.append_op( - type='target_assign', - inputs={ - 'X': input, - 'MatchIndices': matched_indices, - 'NegIndices': negative_indices, - }, - outputs={'Out': out, 'OutWeight': out_weight}, - attrs={'mismatch_value': mismatch_value}, - ) - return out, out_weight - - -def ssd_loss( - location, - confidence, - gt_box, - gt_label, - prior_box, - prior_box_var=None, - background_label=0, - overlap_threshold=0.5, - neg_pos_ratio=3.0, - neg_overlap=0.5, - loc_loss_weight=1.0, - conf_loss_weight=1.0, - match_type='per_prediction', - mining_type='max_negative', - normalize=True, - sample_size=None, -): - r""" - :alias_main: paddle.nn.functional.ssd_loss - :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss - :old_api: paddle.fluid.layers.ssd_loss - - **Multi-box loss layer for object detection algorithm of SSD** - - This layer is to compute detection loss for SSD given the location offset - predictions, confidence predictions, prior boxes and ground-truth bounding - boxes and labels, and the type of hard example mining. The returned loss - is a weighted sum of the localization loss (or regression loss) and - confidence loss (or classification loss) by performing the following steps: - - 1. Find matched bounding box by bipartite matching algorithm. - - 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. - - 1.2 Compute matched bounding box by bipartite matching algorithm. - - 2. Compute confidence for mining hard examples - - 2.1. Get the target label based on matched indices. - - 2.2. Compute confidence loss. - - 3. Apply hard example mining to get the negative example indices and update - the matched indices. - - 4. Assign classification and regression targets - - 4.1. Encoded bbox according to the prior boxes. - - 4.2. Assign regression targets. - - 4.3. Assign classification targets. - - 5. Compute the overall objective loss. - - 5.1 Compute confidence loss. - - 5.2 Compute localization loss. - - 5.3 Compute the overall weighted loss. - - Args: - location (Variable): The location predictions are a 3D Tensor with - shape [N, Np, 4], N is the batch size, Np is total number of - predictions for each instance. 4 is the number of coordinate values, - the layout is [xmin, ymin, xmax, ymax].The data type is float32 or - float64. - confidence (Variable): The confidence predictions are a 3D Tensor - with shape [N, Np, C], N and Np are the same as they are in - `location`, C is the class number.The data type is float32 or - float64. - gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D - LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth - bboxes of mini-batch input.The data type is float32 or float64. - gt_label (Variable): The ground-truth labels are a 2D LoDTensor - with shape [Ng, 1].Ng is the total number of ground-truth bboxes of - mini-batch input, 1 is the number of class. The data type is float32 - or float64. - prior_box (Variable): The prior boxes are a 2D Tensor with shape [Np, 4]. 
- Np and 4 are the same as they are in `location`. The data type is - float32 or float64. - prior_box_var (Variable): The variance of prior boxes are a 2D Tensor - with shape [Np, 4]. Np and 4 are the same as they are in `prior_box` - background_label (int): The index of background label, 0 by default. - overlap_threshold (float): If match_type is 'per_prediction', use - 'overlap_threshold' to determine the extra matching bboxes when finding \ - matched boxes. 0.5 by default. - neg_pos_ratio (float): The ratio of the negative boxes to the positive - boxes, used only when mining_type is 'max_negative', 3.0 by default. - neg_overlap (float): The negative overlap upper bound for the unmatched - predictions. Use only when mining_type is 'max_negative', - 0.5 by default. - loc_loss_weight (float): Weight for localization loss, 1.0 by default. - conf_loss_weight (float): Weight for confidence loss, 1.0 by default. - match_type (str): The type of matching method during training, should - be 'bipartite' or 'per_prediction', 'per_prediction' by default. - mining_type (str): The hard example mining type, should be 'hard_example' - or 'max_negative', now only support `max_negative`. - normalize (bool): Whether to normalize the SSD loss by the total number - of output locations, True by default. - sample_size (int): The max sample size of negative box, used only when - mining_type is 'hard_example'. - - Returns: - Variable(Tensor): The weighted sum of the localization loss and confidence loss, \ - with shape [N * Np, 1], N and Np are the same as they are in - `location`.The data type is float32 or float64. - - Raises: - ValueError: If mining_type is 'hard_example', now only support mining \ - type of `max_negative`. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - pb = fluid.data( - name='prior_box', - shape=[10, 4], - dtype='float32') - pbv = fluid.data( - name='prior_box_var', - shape=[10, 4], - dtype='float32') - loc = fluid.data(name='target_box', shape=[10, 4], dtype='float32') - scores = fluid.data(name='scores', shape=[10, 21], dtype='float32') - gt_box = fluid.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - gt_label = fluid.data( - name='gt_label', shape=[1], lod_level=1, dtype='float32') - loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) - """ - - helper = LayerHelper('ssd_loss', **locals()) - if mining_type != 'max_negative': - raise ValueError("Only support mining_type == max_negative now.") - - num, num_prior, num_class = confidence.shape - conf_shape = paddle.shape(confidence) - - def __reshape_to_2d(var): - out = paddle.flatten(var, 2, -1) - out = paddle.flatten(out, 0, 1) - return out - - # 1. Find matched bounding box by prior box. - # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. - iou = iou_similarity(x=gt_box, y=prior_box) - # 1.2 Compute matched bounding box by bipartite matching algorithm. - matched_indices, matched_dist = bipartite_match( - iou, match_type, overlap_threshold - ) - - # 2. Compute confidence for mining hard examples - # 2.1. Get the target label based on matched indices - gt_label = paddle.reshape( - x=gt_label, shape=(len(gt_label.shape) - 1) * (0,) + (-1, 1) - ) - gt_label.stop_gradient = True - target_label, _ = target_assign( - gt_label, matched_indices, mismatch_value=background_label - ) - # 2.2. Compute confidence loss. - # Reshape confidence to 2D tensor. 
- confidence = __reshape_to_2d(confidence) - target_label = tensor.cast(x=target_label, dtype='int64') - target_label = __reshape_to_2d(target_label) - target_label.stop_gradient = True - conf_loss = softmax_with_cross_entropy(confidence, target_label) - # 3. Mining hard examples - actual_shape = paddle.slice(conf_shape, axes=[0], starts=[0], ends=[2]) - actual_shape.stop_gradient = True - # shape=(-1, 0) is set for compile-time, the correct shape is set by - # actual_shape in runtime. - conf_loss = paddle.reshape(x=conf_loss, shape=actual_shape) - conf_loss.stop_gradient = True - neg_indices = helper.create_variable_for_type_inference(dtype='int32') - dtype = matched_indices.dtype - updated_matched_indices = helper.create_variable_for_type_inference( - dtype=dtype - ) - helper.append_op( - type='mine_hard_examples', - inputs={ - 'ClsLoss': conf_loss, - 'LocLoss': None, - 'MatchIndices': matched_indices, - 'MatchDist': matched_dist, - }, - outputs={ - 'NegIndices': neg_indices, - 'UpdatedMatchIndices': updated_matched_indices, - }, - attrs={ - 'neg_pos_ratio': neg_pos_ratio, - 'neg_dist_threshold': neg_overlap, - 'mining_type': mining_type, - 'sample_size': sample_size, - }, - ) - - # 4. Assign classification and regression targets - # 4.1. Encoded bbox according to the prior boxes. - encoded_bbox = box_coder( - prior_box=prior_box, - prior_box_var=prior_box_var, - target_box=gt_box, - code_type='encode_center_size', - ) - # 4.2. Assign regression targets - target_bbox, target_loc_weight = target_assign( - encoded_bbox, updated_matched_indices, mismatch_value=background_label - ) - # 4.3. Assign classification targets - target_label, target_conf_weight = target_assign( - gt_label, - updated_matched_indices, - negative_indices=neg_indices, - mismatch_value=background_label, - ) - - # 5. Compute loss. - # 5.1 Compute confidence loss. - target_label = __reshape_to_2d(target_label) - target_label = tensor.cast(x=target_label, dtype='int64') - - conf_loss = softmax_with_cross_entropy(confidence, target_label) - target_conf_weight = __reshape_to_2d(target_conf_weight) - conf_loss = conf_loss * target_conf_weight - - # the target_label and target_conf_weight do not have gradient. - target_label.stop_gradient = True - target_conf_weight.stop_gradient = True - - # 5.2 Compute regression loss. - location = __reshape_to_2d(location) - target_bbox = __reshape_to_2d(target_bbox) - - smooth_l1_loss = paddle.nn.loss.SmoothL1Loss() - loc_loss = smooth_l1_loss(location, target_bbox) - target_loc_weight = __reshape_to_2d(target_loc_weight) - loc_loss = loc_loss * target_loc_weight - - # the target_bbox and target_loc_weight do not have gradient. - target_bbox.stop_gradient = True - target_loc_weight.stop_gradient = True - - # 5.3 Compute overall weighted loss. - loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss - # reshape to [N, Np], N is the batch size and Np is the prior box number. - # shape=(-1, 0) is set for compile-time, the correct shape is set by - # actual_shape in runtime. 
- loss = paddle.reshape(x=loss, shape=actual_shape) - loss = paddle.sum(loss, axis=1, keepdim=True) - if normalize: - normalizer = paddle.sum(target_loc_weight) - loss = loss / normalizer - - return loss - - def prior_box( input, image, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 7fd3bc2e8b2cb..cf2523947f0d2 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -163,74 +163,6 @@ def test_box_coder_error(self): code_type='encode_center_size', ) - def test_detection_api(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[4], dtype='float32') - y = layers.data(name='y', shape=[4], dtype='float32') - z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1) - iou = layers.iou_similarity(x=x, y=y) - bcoder = layers.box_coder( - prior_box=x, - prior_box_var=y, - target_box=z, - code_type='encode_center_size', - ) - self.assertIsNotNone(iou) - self.assertIsNotNone(bcoder) - - matched_indices, matched_dist = layers.bipartite_match(iou) - self.assertIsNotNone(matched_indices) - self.assertIsNotNone(matched_dist) - - gt = layers.data( - name='gt', shape=[1, 1], dtype='int32', lod_level=1 - ) - trg, trg_weight = layers.target_assign( - gt, matched_indices, mismatch_value=0 - ) - self.assertIsNotNone(trg) - self.assertIsNotNone(trg_weight) - - gt2 = layers.data( - name='gt2', shape=[10, 4], dtype='float32', lod_level=1 - ) - trg, trg_weight = layers.target_assign( - gt2, matched_indices, mismatch_value=0 - ) - self.assertIsNotNone(trg) - self.assertIsNotNone(trg_weight) - - print(str(program)) - - def test_ssd_loss(self): - program = Program() - with program_guard(program): - pb = layers.data( - name='prior_box', - shape=[10, 4], - append_batch_size=False, - dtype='float32', - ) - pbv = layers.data( - name='prior_box_var', - shape=[10, 4], - append_batch_size=False, - dtype='float32', - ) - loc = layers.data(name='target_box', shape=[10, 4], dtype='float32') - scores = layers.data(name='scores', shape=[10, 21], dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32' - ) - gt_label = layers.data( - name='gt_label', shape=[1], lod_level=1, dtype='int32' - ) - loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) - self.assertIsNotNone(loss) - self.assertEqual(loss.shape[-1], 1) - print(str(program)) - class TestPriorBox(unittest.TestCase): def test_prior_box(self): @@ -521,87 +453,6 @@ def test_detection_map(self): print(str(program)) -class TestRpnTargetAssign(unittest.TestCase): - def test_rpn_target_assign(self): - program = Program() - with program_guard(program): - bbox_pred_shape = [10, 50, 4] - cls_logits_shape = [10, 50, 2] - anchor_shape = [50, 4] - - bbox_pred = layers.data( - name='bbox_pred', - shape=bbox_pred_shape, - append_batch_size=False, - dtype='float32', - ) - cls_logits = layers.data( - name='cls_logits', - shape=cls_logits_shape, - append_batch_size=False, - dtype='float32', - ) - anchor_box = layers.data( - name='anchor_box', - shape=anchor_shape, - append_batch_size=False, - dtype='float32', - ) - anchor_var = layers.data( - name='anchor_var', - shape=anchor_shape, - append_batch_size=False, - dtype='float32', - ) - gt_boxes = layers.data( - name='gt_boxes', shape=[4], lod_level=1, dtype='float32' - ) - is_crowd = layers.data( - name='is_crowd', - shape=[1, 10], - dtype='int32', - lod_level=1, - append_batch_size=False, - ) - im_info = layers.data( - 
name='im_info', - shape=[1, 3], - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - outs = layers.rpn_target_assign( - bbox_pred=bbox_pred, - cls_logits=cls_logits, - anchor_box=anchor_box, - anchor_var=anchor_var, - gt_boxes=gt_boxes, - is_crowd=is_crowd, - im_info=im_info, - rpn_batch_size_per_im=256, - rpn_straddle_thresh=0.0, - rpn_fg_fraction=0.5, - rpn_positive_overlap=0.7, - rpn_negative_overlap=0.3, - use_random=False, - ) - pred_scores = outs[0] - pred_loc = outs[1] - tgt_lbl = outs[2] - tgt_bbox = outs[3] - bbox_inside_weight = outs[4] - - self.assertIsNotNone(pred_scores) - self.assertIsNotNone(pred_loc) - self.assertIsNotNone(tgt_lbl) - self.assertIsNotNone(tgt_bbox) - self.assertIsNotNone(bbox_inside_weight) - assert pred_scores.shape[1] == 1 - assert pred_loc.shape[1] == 4 - assert pred_loc.shape[1] == tgt_bbox.shape[1] - print(str(program)) - - class TestGenerateProposals(LayerTest): def test_generate_proposals(self): scores_np = np.random.rand(2, 3, 4, 4).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index dcf9d4d1000bf..2258e3807c423 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3288,70 +3288,6 @@ def test_deform_roi_pooling(self): ) return out - def test_retinanet_target_assign(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - bbox_pred = layers.data( - name='bbox_pred', - shape=[1, 100, 4], - append_batch_size=False, - dtype='float32', - ) - cls_logits = layers.data( - name='cls_logits', - shape=[1, 100, 10], - append_batch_size=False, - dtype='float32', - ) - anchor_box = layers.data( - name='anchor_box', - shape=[100, 4], - append_batch_size=False, - dtype='float32', - ) - anchor_var = layers.data( - name='anchor_var', - shape=[100, 4], - append_batch_size=False, - dtype='float32', - ) - gt_boxes = layers.data( - name='gt_boxes', - shape=[10, 4], - append_batch_size=False, - dtype='float32', - ) - gt_labels = layers.data( - name='gt_labels', - shape=[10, 1], - append_batch_size=False, - dtype='int32', - ) - is_crowd = layers.data( - name='is_crowd', - shape=[1], - append_batch_size=False, - dtype='int32', - ) - im_info = layers.data( - name='im_info', - shape=[1, 3], - append_batch_size=False, - dtype='float32', - ) - return layers.retinanet_target_assign( - bbox_pred, - cls_logits, - anchor_box, - anchor_var, - gt_boxes, - gt_labels, - is_crowd, - im_info, - 10, - ) - def test_addmm(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index 98cad29ac2e9d..d0147d8b700f1 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -23,9 +23,6 @@ _generate_groundtruth, ) -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard - def rpn_target_assign( anchor_by_gt_overlap, @@ -485,424 +482,5 @@ def test_check_output(self): self.check_output() -class TestRetinanetTargetAssignOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - bbox_pred1 = fluid.data( - name='bbox_pred1', shape=[1, 100, 4], dtype='float32' - ) - cls_logits1 = fluid.data( - name='cls_logits1', shape=[1, 100, 10], dtype='float32' - ) - 
anchor_box1 = fluid.data( - name='anchor_box1', shape=[100, 4], dtype='float32' - ) - anchor_var1 = fluid.data( - name='anchor_var1', shape=[100, 4], dtype='float32' - ) - gt_boxes1 = fluid.data( - name='gt_boxes1', shape=[10, 4], dtype='float32' - ) - gt_labels1 = fluid.data( - name='gt_labels1', shape=[10, 1], dtype='int32' - ) - is_crowd1 = fluid.data(name='is_crowd1', shape=[1], dtype='float32') - im_info1 = fluid.data( - name='im_info1', shape=[1, 3], dtype='float32' - ) - - # The `bbox_pred` must be Variable and the data type of `bbox_pred` Tensor - # one of float32 and float64. - def test_bbox_pred_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - [1], - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_bbox_pred_type) - - def test_bbox_pred_tensor_dtype(): - bbox_pred2 = fluid.data( - name='bbox_pred2', shape=[1, 100, 4], dtype='intt32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred2, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_bbox_pred_tensor_dtype) - - # The `cls_logits` must be Variable and the data type of `cls_logits` Tensor - # one of float32 and float64. - def test_cls_logits_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - 2, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_cls_logits_type) - - def test_cls_logits_tensor_dtype(): - cls_logits2 = fluid.data( - name='cls_logits2', shape=[1, 100, 10], dtype='int32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits2, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_cls_logits_tensor_dtype) - - # The `anchor_box` must be Variable and the data type of `anchor_box` Tensor - # one of float32 and float64. - def test_anchor_box_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - [5], - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_anchor_box_type) - - def test_anchor_box_tensor_dtype(): - anchor_box2 = fluid.data( - name='anchor_box2', shape=[100, 4], dtype='int32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box2, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_anchor_box_tensor_dtype) - - # The `anchor_var` must be Variable and the data type of `anchor_var` Tensor - # one of float32 and float64. 
- def test_anchor_var_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - 5, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_anchor_var_type) - - def test_anchor_var_tensor_dtype(): - anchor_var2 = fluid.data( - name='anchor_var2', shape=[100, 4], dtype='int32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var2, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_anchor_var_tensor_dtype) - - # The `gt_boxes` must be Variable and the data type of `gt_boxes` Tensor - # one of float32 and float64. - def test_gt_boxes_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - [4], - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_gt_boxes_type) - - def test_gt_boxes_tensor_dtype(): - gt_boxes2 = fluid.data( - name='gt_boxes2', shape=[10, 4], dtype='int32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes2, - gt_labels1, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_gt_boxes_tensor_dtype) - - # The `gt_label` must be Variable and the data type of `gt_label` Tensor - # int32. - def test_gt_label_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - 9, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_gt_label_type) - - def test_gt_label_tensor_dtype(): - gt_labels2 = fluid.data( - name='label2', shape=[10, 1], dtype='float32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels2, - is_crowd1, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_gt_label_tensor_dtype) - - # The `is_crowd` must be Variable and the data type of `is_crowd` Tensor - # int32. - def test_is_crowd_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - [10], - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_is_crowd_type) - - def test_is_crowd_tensor_dtype(): - is_crowd2 = fluid.data( - name='is_crowd2', shape=[10, 1], dtype='float32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd2, - im_info1, - 10, - ) - - self.assertRaises(TypeError, test_is_crowd_tensor_dtype) - - # The `im_info` must be Variable and the data type of `im_info` Tensor - # must be one of float32 and float64. 
- def test_im_info_type(): - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - 1, - 10, - ) - - self.assertRaises(TypeError, test_im_info_type) - - def test_im_info_tensor_dtype(): - im_info2 = fluid.data( - name='im_info2', shape=[1, 3], dtype='int32' - ) - ( - score_pred, - loc_pred, - score_target, - loc_target, - bbox_inside_weight, - fg_num, - ) = fluid.layers.retinanet_target_assign( - bbox_pred1, - cls_logits1, - anchor_box1, - anchor_var1, - gt_boxes1, - gt_labels1, - is_crowd1, - im_info2, - 10, - ) - - self.assertRaises(TypeError, test_im_info_tensor_dtype) - - if __name__ == '__main__': unittest.main() From 8fb829ba4710ba3845ed5f0e0ec3de411312af63 Mon Sep 17 00:00:00 2001 From: kangguangli Date: Tue, 6 Dec 2022 12:26:53 +0800 Subject: [PATCH 06/60] Remove fluid matmul (#47988) * remove layers.matmul in nets.py * remove layers.matmul in rnn_impl/test_quantization_pass/auto_parallel_gpt_model/test_auto_parallel_completion_gpt * remove layers.matmul in other files * fix * fix * remove layers.matmul itself * remove ref in CMakeLists.txt and tools directory * remove matmul in fluid.layers.nn.py * remove matmul in fluid.dygraph.rnn.py && resotre test_matmul_op.py * replace matmul in fluid.dygraph.rnn.py && clean api_test in test_matmul_op.py * fix error && restore empty test_auto_search_dist_matmul_op.py * fix check in test_auto_parallel_partitioner.py * fix test_dist_matmul && test_flags_mkldnn_ops_on_off * fix test_fused_attention_op_xpu.py && test_matmul_op_xpu.py * remove test_auto_search_dist_matmul_op.py * remove layers.matmul in auto_parallel_gpt_model.py && fix doc in fluid/io.py * fix for matmul_grad * fix codestyle * fix codestyle * resolve conflicts error * restore unit test file but not compiled it for later remove * fix codestyle * fix wrong unittest skip * fix unittest delete * fix scale cost * fix scale cost * resolve conflicts error * resolve conflicts error Co-authored-by: jakpiase --- .../phi/kernels/onednn/matmul_grad_kernel.cc | 8 +- .../paddle/fluid/contrib/layers/rnn_impl.py | 6 +- .../slim/tests/test_quantization_pass.py | 4 +- python/paddle/fluid/dygraph/rnn.py | 24 +-- python/paddle/fluid/layers/nn.py | 149 ------------------ python/paddle/fluid/nets.py | 4 +- .../auto_parallel/test_dist_matmul.py | 28 ++-- .../auto_parallel/test_dist_op_cost.py | 10 +- .../unittests/auto_parallel_gpt_model.py | 6 +- .../fleet/hybrid_parallel_pp_embedding.py | 5 +- .../fleet/hybrid_parallel_pp_recompute.py | 8 +- .../fleet/hybrid_parallel_pp_transformer.py | 8 +- ...allel_pp_transformer_with_virtual_stage.py | 8 +- .../fleet/hybrid_parallel_shared_weight.py | 5 +- .../fleet/parallel_dygraph_transformer.py | 10 +- .../fluid/tests/unittests/dist_transformer.py | 6 +- .../dygraph_to_static/bert_dygraph_model.py | 4 +- .../seq2seq_dygraph_model.py | 6 +- .../unittests/dygraph_to_static/test_bmn.py | 2 +- .../unittests/dygraph_to_static/test_dict.py | 4 +- .../test_program_translator.py | 4 +- .../dygraph_to_static/test_ptb_lm.py | 4 +- .../transformer_dygraph_model.py | 10 +- .../tests/unittests/ipu/test_matmul_op_ipu.py | 12 +- .../unittests/ipu/test_weight_sharing_ipu.py | 2 +- .../test_mkldnn_matmul_op_output_fuse_pass.py | 6 +- .../ir/inference/test_trt_inspector.py | 4 +- .../unittests/ir/inference/test_trt_matmul.py | 13 +- .../test_trt_matmul_quant_dequant.py | 12 +- 
.../mkldnn/check_flags_mkldnn_ops_on_off.py | 2 +- .../mkldnn/test_flags_mkldnn_ops_on_off.py | 6 +- .../parallel_dygraph_sparse_embedding.py | 2 +- .../test_auto_parallel_completion.py | 11 +- .../test_auto_parallel_completion_gpt.py | 5 +- .../test_auto_parallel_partitioner.py | 31 ++-- .../test_auto_parallel_partitioner_gpt.py | 5 +- .../test_auto_search_dist_matmul_op.py | 1 + .../unittests/test_auto_search_dist_op.py | 3 +- .../fluid/tests/unittests/test_cholesky_op.py | 3 +- .../tests/unittests/test_dist_transpiler.py | 4 +- .../test_eager_deletion_padding_rnn.py | 6 +- .../unittests/test_fused_attention_op.py | 6 +- .../test_fused_multi_transformer_int8_op.py | 6 +- .../test_fused_multi_transformer_op.py | 5 +- .../tests/unittests/test_imperative_basic.py | 2 +- .../tests/unittests/test_imperative_gnn.py | 4 +- ..._imperative_lod_tensor_to_selected_rows.py | 2 +- .../unittests/test_imperative_ptb_rnn.py | 4 +- .../unittests/test_imperative_save_load.py | 4 +- .../unittests/test_imperative_save_load_v2.py | 4 +- ..._imperative_selected_rows_to_lod_tensor.py | 4 +- ..._imperative_transformer_sorted_gradient.py | 10 +- .../fluid/tests/unittests/test_layers.py | 6 +- .../fluid/tests/unittests/test_matmul_op.py | 146 ----------------- .../fluid/tests/unittests/test_mul_nn_grad.py | 2 +- .../tests/unittests/test_recurrent_op.py | 8 +- .../tests/unittests/test_rnn_decode_api.py | 4 +- .../tests/unittests/test_static_save_load.py | 4 +- .../tests/unittests/transformer_model.py | 4 +- .../xpu/test_fused_attention_op_xpu.py | 3 +- .../tests/unittests/xpu/test_matmul_op_xpu.py | 95 ----------- 61 files changed, 183 insertions(+), 591 deletions(-) diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc index ceb752f6d41be..fec008e7a106e 100644 --- a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -101,9 +101,13 @@ void MatmulGradKernel(const Context &dev_ctx, if (x_dims.size() != ndims) { x_dims = ExtendDimsWithOnes(x_dims, ndims); - } else if (y_dims.size() != ndims) { + } + if (y_dims.size() != ndims) { y_dims = ExtendDimsWithOnes(y_dims, ndims); } + if (dout_dims.size() != ndims) { + dout_dims = ExtendDimsWithOnes(dout_dims, ndims); + } // in broadcasting scenario new memory is required because // reduce sum must be calculated upon broadcasted dims @@ -150,7 +154,9 @@ void MatmulGradKernel(const Context &dev_ctx, } dx->Resize(x.dims()); + dx->set_mem_desc(x.mem_desc().reshape(vectorize(x.dims()))); dy->Resize(y.dims()); + dy->set_mem_desc(y.mem_desc().reshape(vectorize(y.dims()))); } template diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 4e23057fc4680..9e7034ab66be9 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -151,7 +151,7 @@ def _build_once(self, input, pre_hidden): def forward(self, input, pre_hidden): concat_input_hidden = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) + gate_input = paddle.matmul(x=concat_input_hidden, y=self._gate_weight) gate_input = paddle.add(gate_input, self._gate_bias) @@ -160,7 +160,7 @@ def forward(self, input, pre_hidden): r_hidden = r * pre_hidden - candidate = layers.matmul( + candidate = paddle.matmul( layers.concat([input, r_hidden], 1), self._candidate_weight ) candidate = paddle.add(candidate, self._candidate_bias) @@ -874,7 +874,7 @@ def 
_build_once(self, input, pre_hidden, pre_cell): def forward(self, input, pre_hidden, pre_cell): concat_input_hidden = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) + gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight) gate_input = paddle.add(gate_input, self._bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index 5aabeee119799..7fa95fd13f494 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -76,7 +76,7 @@ def conv_bn_layer( matmul_weight = paddle.create_parameter( shape=[1, 16, 32, 32], dtype='float32' ) - hidden = fluid.layers.matmul(hidden, matmul_weight, True, True) + hidden = paddle.matmul(hidden, matmul_weight, True, True) if quant_skip_pattern: with fluid.name_scope(quant_skip_pattern): pool = fluid.layers.pool2d( @@ -724,7 +724,7 @@ def conv_bn_layer( conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) hidden = paddle.nn.functional.relu(paddle.add(x=conv, y=short)) - hidden = fluid.layers.matmul(hidden, data2, True, True) + hidden = paddle.matmul(hidden, data2, True, True) if isinstance(quant_skip_pattern, str): with fluid.name_scope(quant_skip_pattern): pool1 = fluid.layers.pool2d( diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py index 986d1c562b405..d74e0b1bfee70 100644 --- a/python/paddle/fluid/dygraph/rnn.py +++ b/python/paddle/fluid/dygraph/rnn.py @@ -17,11 +17,11 @@ from ..layers import ( concat, fill_constant, - matmul, elementwise_mul, split, ) import copy +import paddle __all__ = ['LSTMCell', 'GRUCell'] @@ -215,11 +215,12 @@ def __init__( def forward(self, input, pre_hidden, pre_cell): if self._use_cudnn_impl: - igates = matmul(input, y=self._weight_ih, transpose_y=True) + igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True) igates = paddle.add(igates, self._bias_ih) - hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True) + hgates = paddle.matmul( + pre_hidden, self._weight_hh, transpose_y=True + ) hgates = paddle.add(hgates, self._bias_hh) - chunked_igates = split(igates, num_or_sections=4, dim=1) chunked_hgates = split(hgates, num_or_sections=4, dim=1) @@ -241,7 +242,7 @@ def forward(self, input, pre_hidden, pre_cell): else: concat_input_hidden = concat([input, pre_hidden], 1) - gate_input = matmul(x=concat_input_hidden, y=self._weight) + gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight) gate_input = paddle.add(gate_input, self._bias) i, j, f, o = split(gate_input, num_or_sections=4, dim=-1) @@ -461,10 +462,11 @@ def __init__( def forward(self, input, pre_hidden): if self._use_cudnn_impl: - - igates = matmul(input, y=self._weight_ih, transpose_y=True) + igates = paddle.matmul(input, y=self._weight_ih, transpose_y=True) igates = paddle.add(igates, self._bias_ih) - hgates = matmul(pre_hidden, self._weight_hh, transpose_y=True) + hgates = paddle.matmul( + pre_hidden, self._weight_hh, transpose_y=True + ) hgates = paddle.add(hgates, self._bias_hh) chunked_igates = split(igates, num_or_sections=3, dim=1) @@ -486,7 +488,9 @@ def forward(self, input, pre_hidden): concat_input_hidden = concat([input, pre_hidden], 1) - gate_input = matmul(x=concat_input_hidden, y=self._gate_weight) + gate_input = 
paddle.matmul( + x=concat_input_hidden, y=self._gate_weight + ) gate_input = paddle.add(gate_input, self._gate_bias) gate_input = self._gate_activation(gate_input) @@ -494,7 +498,7 @@ def forward(self, input, pre_hidden): r_hidden = r * pre_hidden - candidate = matmul( + candidate = paddle.matmul( concat([input, r_hidden], 1), self._candidate_weight ) candidate = paddle.add(candidate, self._candidate_bias) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 860a5375bf818..39d4d678abd0e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -73,7 +73,6 @@ 'dropout', 'split', 'l2_normalize', - 'matmul', 'row_conv', 'layer_norm', 'spectral_norm', @@ -2589,154 +2588,6 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): return out -@deprecated(since="2.0.0", update_to="paddle.matmul") -def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): - """ - Applies matrix multiplication to two tensors. - - Currently, the input tensors' rank can be any, but when the rank of any - inputs is bigger than 3, this two inputs' rank should be equal. - - The actual behavior depends on the shapes of :math:`x`, :math:`y` and the - flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically: - - - If a transpose flag is specified, the last two dimensions of the tensor - are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for - :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as - :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the - opposite: It is treated as :math:`[D, 1]` in nontransposed form and as - :math:`[1, D]` in transposed form. - - - After transpose, the two tensors are 2-D or n-D and matrix multiplication - performs in the following way. - - - If both are 2-D, they are multiplied like conventional matrices. - - If either is n-D, it is treated as a stack of matrices residing in the - last two dimensions and a batched matrix multiply supporting broadcast - applies on the two tensors. - - Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and - nontransposed, the prepended or appended dimension :math:`1` will be - removed after matrix multiplication. - - Args: - x (Variable): The input variable which is a Tensor or LoDTensor. - y (Variable): The input variable which is a Tensor or LoDTensor. - transpose_x (bool): Whether to transpose :math:`x` before multiplication. - transpose_y (bool): Whether to transpose :math:`y` before multiplication. - alpha (float): The scale of output. Default 1.0. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - Variable: The product Tensor (or LoDTensor) variable. - - Examples: - .. 
code-block:: python - - # Examples to clarify shapes of the inputs and output - # x: [B, ..., M, K], y: [B, ..., K, N] - # fluid.layers.matmul(x, y) # out: [B, ..., M, N] - - # x: [B, M, K], y: [B, K, N] - # fluid.layers.matmul(x, y) # out: [B, M, N] - - # x: [B, M, K], y: [K, N] - # fluid.layers.matmul(x, y) # out: [B, M, N] - - # x: [M, K], y: [K, N] - # fluid.layers.matmul(x, y) # out: [M, N] - - # x: [B, M, K], y: [K] - # fluid.layers.matmul(x, y) # out: [B, M] - - # x: [K], y: [K] - # fluid.layers.matmul(x, y) # out: [1] - - # x: [M], y: [N] - # fluid.layers.matmul(x, y, True, True) # out: [M, N] - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32') - y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32') - out = fluid.layers.matmul(x, y, True, True) - """ - if _non_static_mode(): - out = _varbase_creator(dtype=x.dtype) - _legacy_C_ops.matmul( - x, - y, - out, - 'transpose_X', - transpose_x, - 'transpose_Y', - transpose_y, - 'alpha', - float(alpha), - ) - return out - - def __check_input(x, y): - var_names = {'x': x, 'y': y} - for name, val in var_names.items(): - check_variable_and_dtype( - val, name, ['float16', 'float32', 'float64'], 'matmul' - ) - x_shape = list(x.shape) - y_shape = list(y.shape) - if len(x_shape) == 1: - x_shape = [1] + x_shape - if len(y_shape) == 1: - y_shape = y_shape + [1] - - # check the inner 2 dimensions - if transpose_x: - x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2] - if transpose_y: - y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] - if x_shape[-1] != y_shape[-2]: - assert (x_shape[-1] == -1) or (y_shape[-2] == -1), ( - "After performing an optional transpose, Input X's width should be " - "equal to Y's width for multiplication " - "prerequisites. But received X's shape: %s, Y's shape: %s\n" - % (x_shape, y_shape) - ) - - if len(y_shape) > 2 and len(x_shape) > 2: - for i, dim_x in enumerate(x_shape[:-2]): - # don't check neg shape - if dim_x < 0 or y_shape[i] < 0: - continue - if dim_x != y_shape[i]: - raise ValueError( - "When the matrix is larger than 2 dimensions, the higher " - "dimensional values of the two matrices need to be equal. " - "But received x_shape[%d] != y_shape[%d]. 
X's shape: %s, " - "Y's shape: %s.\n" % (i, i, x_shape, y_shape) - ) - - attrs = { - 'transpose_X': transpose_x, - 'transpose_Y': transpose_y, - 'alpha': float(alpha), - } - - __check_input(x, y) - - helper = LayerHelper('matmul', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='matmul', - inputs={'X': x, 'Y': y}, - outputs={'Out': out}, - attrs=attrs, - ) - return out - - @templatedoc() def row_conv(input, future_context_size, param_attr=None, act=None): """ diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 3d4f187e18f32..0a781e67a82fc 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -621,7 +621,7 @@ def __combine_heads(x): key_dim_per_head = keys.shape[-1] // num_heads scaled_q = paddle.scale(x=q, scale=key_dim_per_head**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + product = paddle.matmul(x=scaled_q, y=k, transpose_y=True) x = paddle.reshape(x=product, shape=[-1, product.shape[-1]]) x = paddle.nn.functional.softmax(x) @@ -631,5 +631,5 @@ def __combine_heads(x): weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False ) - ctx_multiheads = layers.matmul(weights, v) + ctx_multiheads = paddle.matmul(weights, v) return __combine_heads(ctx_multiheads) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py index 5e69d6955af2a..0a07b98de705d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py @@ -84,9 +84,7 @@ def matmul_dp2mp2(init_x, init_y, trans_x, trans_y): y = init_y(trans_y) x.stop_gradient = False y.stop_gradient = False - out = paddle.fluid.layers.matmul( - x, y, transpose_x=trans_x, transpose_y=trans_y - ) + out = paddle.matmul(x, y, transpose_x=trans_x, transpose_y=trans_y) loss = paddle.mean(out) return main_program, start_program, loss @@ -134,22 +132,22 @@ def check_col_program(self, main_program, dist_ctx): # [0, -1] * [-1, 1] --> [0, 1] ref_ops = [ "c_identity", - "matmul", + "matmul_v2", "reduce_mean", "fill_constant", "reduce_mean_grad", - "matmul_grad", + "matmul_v2_grad", ] ops = [] block = main_program.global_block() for op in block.ops: ops.append(op.type) - if op.type == "matmul": + if op.type == "matmul_v2": out_name = op.output('Out')[0] out_var = block.vars[out_name] op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op) assert op_dist_attr.impl_idx == 0 - assert op_dist_attr.impl_type == "matmul" + assert op_dist_attr.impl_type == "matmul_v2" out_dims_mapping = op_dist_attr.get_output_dims_mapping( out_name ) @@ -158,33 +156,33 @@ def check_col_program(self, main_program, dist_ctx): out_var ) assert tensor_dist_attr.dims_mapping == [0, 1] - if op.type == "matmul_grad": + if op.type == "matmul_v2_grad": op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op) assert op_dist_attr.impl_idx == 0 - assert op_dist_attr.impl_type == "matmul" + assert op_dist_attr.impl_type == "matmul_v2" assert ops == ref_ops def check_row_program(self, main_program, dist_ctx): # [0, -1, 1] * [1, -1] --> [0, -1, -1] ref_ops = [ - "matmul", + "matmul_v2", "c_allreduce_sum", "reduce_mean", "fill_constant", "reduce_mean_grad", - "matmul_grad", + "matmul_v2_grad", ] ops = [] block = main_program.global_block() for op in block.ops: ops.append(op.type) - if op.type == "matmul": + if op.type == "matmul_v2": out_name = op.output('Out')[0] 
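                # paddle.matmul lowers to the matmul_v2 operator, so the
                # expected op sequences and dist-op impl_type above list
                # matmul_v2 / matmul_v2_grad in place of the removed
                # matmul / matmul_grad ops.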
out_var = block.vars[out_name] op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op) assert op_dist_attr.impl_idx == 1 - assert op_dist_attr.impl_type == "matmul" + assert op_dist_attr.impl_type == "matmul_v2" out_dims_mapping = op_dist_attr.get_output_dims_mapping( out_name ) @@ -193,10 +191,10 @@ def check_row_program(self, main_program, dist_ctx): out_var ) assert tensor_dist_attr.dims_mapping == [0, -1, -1] - if op.type == "matmul_grad": + if op.type == "matmul_v2_grad": op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op) assert op_dist_attr.impl_idx == 1 - assert op_dist_attr.impl_type == "matmul" + assert op_dist_attr.impl_type == "matmul_v2" assert ops == ref_ops diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py index 163309f3a37e3..c9a4623911101 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py @@ -168,9 +168,7 @@ def make_program(): auto.ProcessMesh([0, 1], dim_names=["x"]), [None, "x"], ) - out1 = paddle.fluid.layers.matmul( - out, param1 - ) # [8, 8] [-1, -1] + out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.create_parameter( [8, 8], paddle.float32 ) # [8, 8] [-1, -1] @@ -179,10 +177,8 @@ def make_program(): auto.ProcessMesh([0, 1], dim_names=["x"]), [None, None], ) - tmp_out = paddle.fluid.layers.matmul(out1, tmp_param) - out2 = paddle.fluid.layers.matmul( - tmp_out, param2 - ) # [8, 4] [-1, 0] + tmp_out = paddle.matmul(out1, tmp_param) + out2 = paddle.matmul(tmp_out, param2) # [8, 4] [-1, 0] out8 = paddle.transpose(out2, [1, 0]) # [4, 8] [0, -1] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index 425f00d12198d..2edb5360e471d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -231,8 +231,10 @@ def gen_cache(self, key, value=None, type=Cache): return self.Cache(key, value) def core_attn(self, q, k, v, attn_mask): - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = paddle.multiply( + product, + paddle.to_tensor(self.head_dim**-0.5, dtype=product.dtype), ) if attn_mask is not None: product = product + attn_mask diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py index 0d1e7084ab94d..3a0afaacb2d12 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py @@ -20,7 +20,6 @@ import paddle import paddle.distributed as dist import paddle.distributed.fleet as fleet -import paddle.fluid as fluid import paddle.nn as nn from paddle.distributed.fleet.meta_parallel import PipelineLayer from paddle.fluid.dygraph.layers import Layer @@ -54,7 +53,7 @@ def __init__(self): def forward(self, x1, x2, y1): x_emb = self.word_embeddings(x1) - fc = fluid.layers.matmul(x_emb, self.softmax_weight) + fc = paddle.matmul(x_emb, self.softmax_weight) fc = paddle.add(fc, self.softmax_bias) projection = paddle.reshape(fc, shape=[-1, vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( @@ -83,7 +82,7 @@ def 
__init__(self): def forward(self, args): x1, x2 = args - fc = fluid.layers.matmul(x1, self.softmax_weight) + fc = paddle.matmul(x1, self.softmax_weight) return fc, x2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_recompute.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_recompute.py index e2690efcb61b3..921ed62fc9a2e 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_recompute.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_recompute.py @@ -24,7 +24,6 @@ import paddle.nn.functional as F from paddle import framework from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer -from paddle.fluid import layers from paddle.fluid.dygraph.layers import Layer @@ -73,13 +72,12 @@ def forward(self, x): q = self.q_proj(x) k = self.k_proj(x) v = self.v_proj(x) - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=d_model**-0.5 - ) + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = paddle.scale(product, scale=d_model**-0.5) weights = F.softmax(product) weights = F.dropout(weights, 0.2) - tgt = layers.matmul(weights, v) + tgt = paddle.matmul(weights, v) residual = tgt tgt = self.norm1(tgt) tgt = residual + tgt diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py index ea0ab82a899cb..2f9e68188c7d0 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer.py @@ -23,7 +23,6 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer -from paddle.fluid import layers from paddle.fluid.dygraph.layers import Layer @@ -82,14 +81,13 @@ def forward(self, x, mask): q = self.q_proj(x) k = self.k_proj(x) v = self.v_proj(x) - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=d_model**-0.5 - ) + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = paddle.scale(product, scale=d_model**-0.5) weights = F.softmax(product + mask) # TODO(shenliang03) For save/load in PipeLineParallel, can’t support dropout temporarily. 
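        # Note: paddle.matmul does not take the `alpha` argument that
        # fluid.layers.matmul fused into the multiply, so the rewrite applies
        # the 1/sqrt(d_model) attention scaling as an explicit paddle.scale
        # op, as in the two lines computing `product` above.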
# weights = F.dropout(weights, 0.2) - tgt = layers.matmul(weights, v) + tgt = paddle.matmul(weights, v) residual = tgt tgt = self.norm1(tgt) tgt = residual + tgt diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py index aa4b9e0a88715..7f3aa674b5c1e 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py @@ -23,7 +23,6 @@ import paddle.nn as nn import paddle.nn.functional as F from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer -from paddle.fluid import layers from paddle.fluid.dygraph.layers import Layer @@ -83,12 +82,11 @@ def forward(self, x, mask): q = self.q_proj(x) k = self.k_proj(x) v = self.v_proj(x) - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=d_model**-0.5 - ) + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = paddle.scale(product, scale=d_model**-0.5) weights = F.softmax(product + mask) - tgt = layers.matmul(weights, v) + tgt = paddle.matmul(weights, v) residual = tgt tgt = self.norm1(tgt) tgt = residual + tgt diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py index 456078921295b..45e20bfad0439 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py @@ -20,7 +20,6 @@ import paddle import paddle.distributed as dist import paddle.distributed.fleet as fleet -import paddle.fluid as fluid import paddle.nn as nn from paddle.distributed.fleet.meta_parallel import ( LayerDesc, @@ -61,7 +60,7 @@ def __init__(self): def forward(self, x1, x2, y1): x_emb = self.word_embeddings(x1) - fc = fluid.layers.matmul(x_emb, self.softmax_weight) + fc = paddle.matmul(x_emb, self.softmax_weight) fc = paddle.add(fc, self.softmax_bias) projection = paddle.reshape(fc, shape=[-1, vocab_size]) @@ -97,7 +96,7 @@ def __init__(self): def forward(self, args): x1, x2 = args - fc = fluid.layers.matmul(x1, self.softmax_weight) + fc = paddle.matmul(x1, self.softmax_weight) return fc, x2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 41c8afd629028..52ec9e5b121b2 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -334,12 +334,12 @@ def forward(self, queries, keys, values, attn_bias): transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) # scale dot product attention - product = fluid.layers.matmul( + product = paddle.matmul( x=transpose_q, y=transpose_k, transpose_y=True, - alpha=self._d_model**-0.5, ) + product = paddle.scale(product, scale=self._d_model**-0.5) if attn_bias is not None: product += attn_bias weights = paddle.nn.functional.softmax(product) @@ -350,9 +350,9 @@ def forward(self, queries, keys, values, attn_bias): seed=ModelHyperParams.dropout_seed, is_test=False, ) - out = fluid.layers.matmul(weights_droped, transpose_v) + out = 
paddle.matmul(weights_droped, transpose_v) else: - out = fluid.layers.matmul(weights, transpose_v) + out = paddle.matmul(weights, transpose_v) # combine heads if len(out.shape) != 4: @@ -839,7 +839,7 @@ def forward(self, dec_inputs=None, enc_output=None): ) if self._weight_sharing: - predict = fluid.layers.matmul( + predict = paddle.matmul( x=dec_output_reshape, y=self._prepare_decoder_layer._input_emb.weight, transpose_y=True, diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index c6165dd753537..e56a632c3de4d 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1174,7 +1174,7 @@ def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): Scaled Dot-Product Attention """ scaled_q = paddle.scale(x=q, scale=d_model**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + product = paddle.matmul(x=scaled_q, y=k, transpose_y=True) if attn_bias: product += attn_bias weights = paddle.nn.functional.softmax(product) @@ -1185,7 +1185,7 @@ def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): seed=ModelHyperParams.dropout_seed, is_test=False, ) - out = layers.matmul(weights, v) + out = paddle.matmul(weights, v) return out q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) @@ -1701,7 +1701,7 @@ def wrap_decoder( ) # Return logits for training and probs for inference. if weight_sharing: - predict = layers.matmul( + predict = paddle.matmul( x=dec_output, y=fluid.framework._get_var(word_emb_param_names[0]), transpose_y=True, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index d45d775829944..43f7f0f6d2b5e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -272,7 +272,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask): emb_out = self.pre_process_layer(emb_out) - self_attn_mask = fluid.layers.matmul( + self_attn_mask = paddle.matmul( x=input_mask, y=input_mask, transpose_y=True ) self_attn_mask = paddle.scale( @@ -401,7 +401,7 @@ def forward( mask_trans_feat = self.pre_process_layer(mask_trans_feat) if self._weight_sharing: - fc_out = fluid.layers.matmul( + fc_out = paddle.matmul( x=mask_trans_feat, y=self.bert_layer._src_emb._w, transpose_y=True, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 5babde40b4355..bbca449bde67a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -70,7 +70,7 @@ def __init__( def forward(self, input, pre_hidden, pre_cell): concat_input_hidden = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) + gate_input = paddle.matmul(x=concat_input_hidden, y=self._weight) gate_input = paddle.add(gate_input, self._bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) @@ -697,14 +697,14 @@ def _gather(self, x, indices, batch_pos): def attention(self, query, enc_output, mask=None): query = fluid.layers.unsqueeze(query, [1]) memory = self.attn_fc(enc_output) - attn = 
fluid.layers.matmul(query, memory, transpose_y=True) + attn = paddle.matmul(query, memory, transpose_y=True) if mask is not None: attn = paddle.transpose(attn, [1, 0, 2]) attn = paddle.add(attn, mask * 1000000000) attn = paddle.transpose(attn, [1, 0, 2]) weight = paddle.nn.functional.softmax(attn) - weight_memory = fluid.layers.matmul(weight, memory) + weight_memory = paddle.matmul(weight, memory) return weight_memory diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index b7461b21aa612..17b5282903dc5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -282,7 +282,7 @@ def forward(self, x): # PEM xp = paddle.nn.functional.relu(self.p_conv1(x)) # BM layer - xp = fluid.layers.matmul(xp, self.sample_mask) + xp = paddle.matmul(xp, self.sample_mask) xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale]) xp = self.p_conv3d1(xp) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py index 597580eedc765..aa5fa35d9c1d4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py @@ -66,9 +66,9 @@ def forward(self, input, cache=None): v = 0.2 * cache_v + v cache["k"], cache["v"] = k, v - weight = fluid.layers.matmul(x=q, y=k, transpose_y=True) + weight = paddle.matmul(x=q, y=k, transpose_y=True) weight = paddle.nn.functional.softmax(weight) - out = fluid.layers.matmul(weight, v) + out = paddle.matmul(weight, v) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index aee91f6de1729..dd581526f4ec5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -42,7 +42,7 @@ def simple_func(x, weight_numpy): x = fluid.dygraph.to_variable(x) w = fluid.dygraph.to_variable(weight_numpy) - y = fluid.layers.matmul(x, w) + y = paddle.matmul(x, w) z = paddle.mean(y) return z @@ -51,7 +51,7 @@ def simple_func(x, weight_numpy): def decorated_simple_func(x, weight_numpy): x = fluid.dygraph.to_variable(x) w = fluid.dygraph.to_variable(weight_numpy) - y = fluid.layers.matmul(x, w) + y = paddle.matmul(x, w) z = paddle.mean(y) return z diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 62c6c18346885..6f821265ca7f0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -94,7 +94,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): bias = self.bias_arr[k] nn = fluid.layers.concat([step_input, pre_hidden], 1) - gate_input = fluid.layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i, j, f, o = fluid.layers.split( @@ -213,7 +213,7 @@ def forward(self, input, label, init_hidden, init_cell): x_emb, init_h, init_c ) - projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = paddle.matmul(rnn_out, self.softmax_weight) projection = paddle.add(projection, 
self.softmax_bias) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 16449d00ae736..88cc415b4bbab 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -148,16 +148,14 @@ def forward(self, queries, keys, values, attn_bias, cache=None): v = layers.concat([cache_v, v], axis=2) cache["k"], cache["v"] = k, v # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5 - ) + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = paddle.scale(product, scale=self.d_model**-0.5) if attn_bias is not None: product += attn_bias weights = paddle.nn.functional.softmax(product) if self.dropout_rate: weights = layers.dropout(weights, dropout_prob=self.dropout_rate) - out = layers.matmul(weights, v) - + out = paddle.matmul(weights, v) out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) @@ -524,7 +522,7 @@ def __init__( postprocess_cmd, ) if share_input_output_embed: - self.linear = lambda x: layers.matmul( + self.linear = lambda x: paddle.matmul( x=x, y=self.word_embedder.word_embedder.weight, transpose_y=True ) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index c6677b31b38cb..77c19ef30fe3a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -44,7 +44,6 @@ def set_op_attrs(self): self.attrs = { "transpose_x": False, "transpose_y": False, - "alpha": 1.0, } @IPUOpTest.static_graph @@ -56,7 +55,7 @@ def build_model(self): name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32' ) - out = paddle.fluid.layers.matmul(x, y, **self.attrs) + out = paddle.matmul(x, y, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): @@ -75,7 +74,6 @@ def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, - "alpha": 1.0, } @@ -84,7 +82,6 @@ def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, - "alpha": 3.14, } def set_atol(self): @@ -141,7 +138,6 @@ def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, - "alpha": 1.0, } @@ -154,7 +150,10 @@ def set_data_feed(self): self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_op_attrs(self): - self.attrs = {"transpose_x": False, "transpose_y": True, "alpha": 0.125} + self.attrs = { + "transpose_x": False, + "transpose_y": True, + } class TestCase8(TestBase): @@ -179,7 +178,6 @@ def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, - "alpha": 1.0, } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py index 65c069c20311b..b08835de54be0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -67,7 +67,7 @@ def build_model(self): input=y, size=768, param_attr=paddle.fluid.ParamAttr(name="fc") ) with paddle.static.ipu_shard_guard(index=0, stage=2): - out = paddle.fluid.layers.matmul( + out = 
paddle.matmul( x=z, y=self.main_prog.global_block().var('word_embedding'), transpose_y=True, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py index 2ef22f90572f3..7b4229e1c360b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py @@ -37,7 +37,7 @@ def make_network(self): y = fluid.data( name='y', shape=[-1] + self.shape_y, dtype=self.d_type ) - out = fluid.layers.matmul(x, y) + out = paddle.matmul(x, y) out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]]) @@ -79,7 +79,7 @@ def make_network(self): y = fluid.data( name='y', shape=[-1] + self.shape_y, dtype=self.d_type ) - out = fluid.layers.matmul(x, y) + out = paddle.matmul(x, y) out = paddle.transpose(out, perm=[0, 1, 2, 3]) out = paddle.reshape(out, [0, 0, 0, 0]) out = fluid.layers.fc(out, size=1) @@ -102,7 +102,7 @@ def make_network(self): y = fluid.data( name='y', shape=[-1] + self.shape_y, dtype=self.d_type ) - out = fluid.layers.matmul(x, y) + out = paddle.matmul(x, y) out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.transpose(out, perm=[0, 1, 2, 3]) # breaks pattern out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]]) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py index 829fc392160d3..9c8e1ee04cc38 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py @@ -30,13 +30,13 @@ def setUp(self): self.set_params() with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 16, 16], dtype="float32") - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=data, y=data, transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 76ebfc9317dc9..038912fbe4cb1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -17,6 +17,7 @@ import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -27,13 +28,13 @@ def setUp(self): self.set_params() with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[24, 24], dtype="float32") - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=data, y=data, transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) self.feeds = { @@ -66,13 +67,13 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 6, 24, 24], dtype="float32" ) - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=data, y=data, 
transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) self.feeds = { @@ -128,13 +129,13 @@ def setUp(self): name="data_x", shape=[-1, 6, 24], dtype="float32" ) data_y = fluid.data(name="data_y", shape=[24, 16], dtype="float32") - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=data_x, y=data_y, transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py index b85f530cb06af..b8566840d2131 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py @@ -32,13 +32,13 @@ def network(): name='data', shape=[1, 28, 28], dtype='float32' ) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=self.data, y=self.data, transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) fc_out = fluid.layers.fc( input=matmul_out, size=10, @@ -128,13 +128,13 @@ def network(): ) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') reshape_out = paddle.reshape(self.data, shape=[1, 4, 14, 14]) - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=reshape_out, y=reshape_out, transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) fc_out = fluid.layers.fc( input=matmul_out, @@ -224,13 +224,13 @@ def network(): name='data', shape=[-1, 28, 28], dtype='float32' ) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - matmul_out = fluid.layers.matmul( + matmul_out = paddle.matmul( x=self.data, y=self.data, transpose_x=self.transpose_x, transpose_y=self.transpose_y, - alpha=self.alpha, ) + matmul_out = paddle.scale(matmul_out, scale=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) fc_out = fluid.layers.fc( input=matmul_out, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py index 7f471307bafa4..7a006e3627c4d 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py @@ -48,7 +48,7 @@ def check(): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) y = paddle.add(x=a, y=b) - y = fluid.layers.matmul(x=y, y=b, transpose_y=True) + y = paddle.matmul(x=y, y=b, transpose_y=True) res1 = func(y) np_res = np.add(a_np, b_np) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py index 2d136dba6ff2a..1c406084105e6 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_mkldnn_ops_on_off.py @@ -87,14 +87,14 @@ def 
test_flags_use_mkl_dnn_on_multiple(self): assert self.not_found(self.matmul_regex, out, err) def test_flags_use_mkl_dnn_off(self): - env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul")} + env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2")} out, err = self.flags_use_mkl_dnn_common(env) assert self.found(self.relu_regex, out, err) assert self.found(self.ew_add_regex, out, err) assert self.not_found(self.matmul_regex, out, err) def test_flags_use_mkl_dnn_off_multiple(self): - env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul,relu")} + env = {str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2,relu")} out, err = self.flags_use_mkl_dnn_common(env) assert self.not_found(self.relu_regex, out, err) assert self.found(self.ew_add_regex, out, err) @@ -103,7 +103,7 @@ def test_flags_use_mkl_dnn_off_multiple(self): def test_flags_use_mkl_dnn_on_off(self): env = { str("FLAGS_tracer_mkldnn_ops_on"): str("elementwise_add"), - str("FLAGS_tracer_mkldnn_ops_off"): str("matmul"), + str("FLAGS_tracer_mkldnn_ops_off"): str("matmul_v2"), } out, err = self.flags_use_mkl_dnn_common(env) assert self.not_found(self.relu_regex, out, err) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py index 7c46efe77556f..e1103c1d595c0 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py @@ -65,7 +65,7 @@ def __init__( def forward(self, input, label): x_emb = self.embedding(input) - fc = fluid.layers.matmul(x_emb, self.softmax_weight) + fc = paddle.matmul(x_emb, self.softmax_weight) fc = paddle.add(fc, self.softmax_bias) projection = paddle.reshape(fc, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py index 113f32d31e12a..1094c1ae8ac93 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py @@ -24,7 +24,6 @@ from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed.fleet import auto -from paddle.fluid import layers paddle.enable_static() _global_parallel_strategy = None @@ -301,9 +300,8 @@ def forward(self, input): v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 - ) + product = tensor.matmul(x=q, y=k, transpose_y=True) + product = tensor.scale(product, scale=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask @@ -568,9 +566,8 @@ def forward(self, input_ids, position_ids): v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 - ) + product = tensor.matmul(x=q, y=k, transpose_y=True) + product = tensor.scale(product, scale=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py index 45dd9bb66ee06..5a5431552b99e 100644 --- 
a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py @@ -210,9 +210,8 @@ def forward( query, key, value, use_cache, cache ) # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 - ) + product = tensor.matmul(x=q, y=k, transpose_y=True) + product = tensor.scale(product, scale=self.head_dim**-0.5) if attn_mask is not None: product = product + attn_mask diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index f745926f5b39b..8300aaa418c2a 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -28,7 +28,6 @@ from paddle.distributed.auto_parallel.process_group import new_process_group from paddle.distributed.auto_parallel.utils import _get_comm_group from paddle.distributed.fleet import auto -from paddle.fluid import layers paddle.enable_static() _global_parallel_strategy = None @@ -695,9 +694,8 @@ def forward(self, input): v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 - ) + product = tensor.matmul(x=q, y=k, transpose_y=True) + product = tensor.scale(product, scale=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask @@ -868,7 +866,8 @@ def test_attn_mp(self): 'transpose2', 'reshape2', 'transpose2', - 'matmul', + 'matmul_v2', + "scale", 'softmax', 'dropout', 'matmul_v2', @@ -976,7 +975,8 @@ def test_attn_dp_mp(self): 'transpose2', 'reshape2', 'transpose2', - 'matmul', + 'matmul_v2', + "scale", 'softmax', 'dropout', 'matmul_v2', @@ -1166,9 +1166,8 @@ def forward(self, input_ids, position_ids): v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 - ) + product = tensor.matmul(x=q, y=k, transpose_y=True) + product = tensor.scale(product, scale=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask @@ -1347,7 +1346,8 @@ def test_decoder_dp_mp(self): 'transpose2', 'reshape2', 'transpose2', - 'matmul', + 'matmul_v2', + "scale", 'softmax', 'dropout', 'matmul_v2', @@ -1399,15 +1399,15 @@ def test_decoder_dp_mp(self): distributed_attr_check_for_program(dist_main_prog, dist_context) ) # check distribured attr - serial_op_idx = [0, 5, 9, 11, 23, 28, 31] + serial_op_idx = [0, 5, 9, 11, 24, 29, 32] dist_op_idx = [ [0, 1], [6, 7], [11, 12], [14, 15], - [27, 28], - [33, 34], - [37, 38], + [28, 29], + [34, 35], + [38, 39], ] self.assertTrue( distributed_attr_check_for_dist_op( @@ -1500,7 +1500,8 @@ def test_decoder_noparallel(self): 'transpose2', 'reshape2', 'transpose2', - 'matmul', + 'matmul_v2', + "scale", 'softmax', 'dropout', 'matmul_v2', diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index cefc98cdb5ff4..a41b79d4effb9 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -256,9 +256,8 @@ def forward( query, key, value, use_cache, cache ) # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, 
alpha=self.head_dim**-0.5 - ) + product = tensor.matmul(x=q, y=k, transpose_y=True) + product = tensor.scale(product, scale=self.head_dim**-0.5) if attn_mask is not None: product = product + attn_mask diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py index a9430ea4aa0c9..145a4a2724b1b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py @@ -103,6 +103,7 @@ def mlp_forward(train_program, start_program): return loss, train_program, start_program +@unittest.skipIf(True, "to delete later") class TestCompatible(unittest.TestCase): def test_matmulv2_matmul_2_compatible(self): valid_op_dist_attr_list = [] diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py index 0c36aa2460454..676883dfd2a1c 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py +++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_op.py @@ -26,7 +26,6 @@ from paddle.distributed.auto_parallel.operators.common import ( get_distributed_operator_impl_container, ) -from paddle.fluid import layers paddle.enable_static() device = "gpu" if core.is_compiled_with_cuda() else "cpu" @@ -85,7 +84,7 @@ def mlp_forward(train_program, start_program): shape=[hidden_size, hidden_size], dtype='float32', ) - input = layers.matmul(x=input, y=matmulinput) + input = paddle.matmul(x=input, y=matmulinput) label = static.data( name="label", shape=[batch_size, 1], dtype='float32' ) diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index 20ab3e73ab3aa..7525e3e5423f7 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -22,7 +22,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -import paddle.fluid.layers as layers @skip_check_grad_ci( @@ -77,7 +76,7 @@ def func(self, place): dtype=root_data.dtype, shape=root_data.shape ) root_t = paddle.transpose(root, self.trans_dims) - x = layers.matmul(x=root, y=root_t) + 1e-05 + x = paddle.matmul(x=root, y=root_t) + 1e-05 out = paddle.cholesky(x, upper=self.attrs["upper"]) grad_check(root, out, x_init=root_data, place=place) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 45cdf97236874..4e4f299db3bbc 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -414,9 +414,7 @@ def net_conf(self): input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size]) - neg_matmul = fluid.layers.matmul( - input_emb_re, neg_emb_w_re, transpose_y=True - ) + neg_matmul = paddle.matmul(input_emb_re, neg_emb_w_re, transpose_y=True) neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num]) neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec) # nce loss diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 921e4a4e43d2e..da0c10f85fc7c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -167,7 +167,7 @@ def 
padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i = paddle.slice( @@ -291,7 +291,7 @@ def encoder_static( bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) @@ -459,7 +459,7 @@ def encoder_static( ), ) - projection = layers.matmul(rnn_out, softmax_weight) + projection = paddle.matmul(rnn_out, softmax_weight) projection = paddle.add(projection, softmax_bias) projection = paddle.reshape(projection, shape=[-1, vocab_size]) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index a73dae327117c..47296e48a2b4d 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -21,7 +21,6 @@ import paddle.incubate.nn.functional as incubate_f import paddle.nn.functional as F from paddle import tensor -from paddle.fluid import layers from paddle.fluid.framework import default_main_program from paddle.nn.layer.common import Dropout, Linear from paddle.nn.layer.norm import LayerNorm @@ -192,9 +191,8 @@ def GetBaselineOut(self): # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # --> [B, n_head, seq_len, out_seq_len] - qk_out = layers.matmul( - x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5 - ) + qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True) + qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5) if attn_mask is not None: attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py index fbbe2d65418af..d0057ac1b7cce 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py @@ -19,7 +19,6 @@ import paddle import paddle.nn.functional as F from paddle import _legacy_C_ops, tensor -from paddle.fluid import layers from paddle.fluid.framework import default_main_program from paddle.nn.layer.common import Dropout from paddle.nn.layer.norm import LayerNorm @@ -388,9 +387,8 @@ def GetBaselineOut(self): # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # --> [B, n_head, seq_len, out_seq_len] - qk_out = layers.matmul( - x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5 - ) + qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True) + qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5) if self.debug: print('qk out is') diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index 8aadeba437f5b..abbfb3f08bff8 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -281,9 +281,8 @@ def GetBaselineOut(self): # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # --> [B, n_head, seq_len, out_seq_len] - qk_out = layers.matmul( - x=q_out, 
y=k_out, transpose_y=True, alpha=self.head_dim**-0.5 - ) + qk_out = paddle.matmul(x=q_out, y=k_out, transpose_y=True) + qk_out = paddle.scale(qk_out, scale=self.head_dim**-0.5) if self.debug: print('qk out is') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index a8aa34eb44b08..375536b8cb684 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -1001,7 +1001,7 @@ def func_without_guard(self): with self.assertRaisesRegexp( TypeError, "Please use `with fluid.dygraph.guard()" ): - y = fluid.layers.matmul(x, x) + y = paddle.matmul(x, x) def test_without_guard(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index f62dfe436a799..db750a5aa11f1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -46,9 +46,9 @@ def __init__(self, name_scope, in_features, out_features): ) def forward(self, features, adj): - support = fluid.layers.matmul(features, self.weight) + support = paddle.matmul(features, self.weight) # TODO(panyx0718): sparse matmul? - return fluid.layers.matmul(adj, support) + self.bias + return paddle.matmul(adj, support) + self.bias class GCN(fluid.Layer): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 76733836ddc56..fe706a78f8fe6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -64,7 +64,7 @@ def __init__( def forward(self, input, label): x_emb = self.embedding(input) - projection = fluid.layers.matmul( + projection = paddle.matmul( x_emb, paddle.transpose(self.embedding.weight, perm=[1, 0]) ) projection = paddle.add(projection, self.softmax_bias) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 3980b0dbb27e6..55f7f1ec31f18 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -109,7 +109,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): bias = self.bias_arr[k] nn = fluid.layers.concat([self._input, pre_hidden], 1) - gate_input = fluid.layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i, j, f, o = fluid.layers.split( @@ -225,7 +225,7 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out = paddle.reshape( rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) - projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = paddle.matmul(rnn_out, self.softmax_weight) projection = paddle.add(projection, self.softmax_bias) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index a386e2113fa99..2e30ea41a18cd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -104,7 +104,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): bias = self.bias_arr[k] nn = fluid.layers.concat([self._input, pre_hidden], 1) - gate_input = fluid.layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i, j, f, o = fluid.layers.split( @@ -221,7 +221,7 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) - projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = paddle.matmul(rnn_out, self.softmax_weight) projection = paddle.add(projection, self.softmax_bias) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 19f4616d92705..4a3c6c64a6f6e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -105,7 +105,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): bias = self.bias_arr[k] nn = fluid.layers.concat([self._input, pre_hidden], 1) - gate_input = fluid.layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i, j, f, o = fluid.layers.split( @@ -222,7 +222,7 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) - projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = paddle.matmul(rnn_out, self.softmax_weight) projection = paddle.add(projection, self.softmax_bias) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index bfba325046ea3..dd490e8d5553b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -72,9 +72,9 @@ def __init__( def forward(self, input, label): x_emb = self.embedding(input) - fc = fluid.layers.matmul(x_emb, self.softmax_weight) + fc = paddle.matmul(x_emb, self.softmax_weight) fc = paddle.add(fc, self.softmax_bias) - projection = fluid.layers.matmul( + projection = paddle.matmul( fc, paddle.transpose(self.embedding.weight, perm=[1, 0]) ) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index a88c31dd3f5ee..0bd69f0359104 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -495,12 +495,12 @@ def forward(self, queries, keys, values, attn_bias): transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) # scale dot product attention - product = fluid.layers.matmul( + product = paddle.matmul( x=transpose_q, y=transpose_k, transpose_y=True, - alpha=self._d_model**-0.5, ) + product = paddle.scale(product, 
scale=self._d_model**-0.5) if attn_bias is not None: product += attn_bias weights = paddle.nn.functional.softmax(product) @@ -511,9 +511,9 @@ def forward(self, queries, keys, values, attn_bias): seed=ModelHyperParams.dropout_seed, is_test=False, ) - out = fluid.layers.matmul(weights_droped, transpose_v) + out = paddle.matmul(weights_droped, transpose_v) else: - out = fluid.layers.matmul(weights, transpose_v) + out = paddle.matmul(weights, transpose_v) # combine heads if len(out.shape) != 4: @@ -1003,7 +1003,7 @@ def forward(self, dec_inputs=None, enc_output=None): ) if self._weight_sharing: - predict = fluid.layers.matmul( + predict = paddle.matmul( x=dec_output_reshape, y=self._prepare_decoder_layer._input_emb.weight, transpose_y=True, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2258e3807c423..9297666eead48 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -290,7 +290,7 @@ def test_matmul(self): with self.static_graph(): t = layers.data(name='t', shape=[3, 3], dtype='float32') t2 = layers.data(name='t2', shape=[3, 3], dtype='float32') - ret = layers.matmul(t, t2) + ret = paddle.matmul(t, t2) static_ret = self.get_static_graph_result( feed={ 't': np.ones([3, 3], dtype='float32'), @@ -303,14 +303,14 @@ def test_matmul(self): with _test_eager_guard(): t = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32') - dy_eager_ret = layers.matmul( + dy_eager_ret = paddle.matmul( base.to_variable(t), base.to_variable(t2) ) dy_eager_ret_value = dy_eager_ret.numpy() t = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32') - dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) + dy_ret = paddle.matmul(base.to_variable(t), base.to_variable(t2)) dy_ret_value = dy_ret.numpy() np.testing.assert_allclose(static_ret, dy_ret_value, rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index a5835fd266e09..1ac71759de572 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid as fluid -from paddle.fluid import Program, program_guard def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): @@ -117,151 +116,6 @@ def test_check_grad_ignore_y(self): ) -class TestMatmulOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # The inputs type of matmul_op must be Variable. - input1 = 12 - self.assertRaises(TypeError, fluid.layers.matmul, input1, input1) - # The inputs dtype of matmul_op must be float32, float64. 
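Note: the rewrite applied throughout this series follows one mechanical rule: fluid.layers.matmul with a fused alpha becomes a plain paddle.matmul followed by an explicit paddle.scale. A minimal sketch of the equivalence (not part of the patch), assuming float32 tensors x and y and a scalar d standing in for head_dim / d_model:

    import paddle
    # legacy fluid form (removed by this series):
    #   out = fluid.layers.matmul(x, y, transpose_y=True, alpha=d**-0.5)
    # replacement: unfused matmul plus an explicit scale
    out = paddle.matmul(x, y, transpose_y=True)
    out = paddle.scale(out, scale=d**-0.5)

Both compute (x @ y^T) * d**-0.5; only the op decomposition changes, which is why the test baselines stay numerically identical.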
- input2 = fluid.layers.data( - name='input2', shape=[10, 10], dtype="int32" - ) - self.assertRaises(TypeError, fluid.layers.matmul, input2, input2) - input3 = fluid.layers.data( - name='input3', shape=[2, 2], dtype="float16" - ) - fluid.layers.matmul(input3, input3) - - -# Negative dimension generation -def generate_negative_dims(in_shape): - from itertools import combinations - - size = len(in_shape) - indexs = list() - shapes = list() - for i in range(size): - indexs.extend(list(combinations([j for j in range(size)], i + 1))) - for idx in indexs: - shapes.append( - [in_shape[i] if i not in idx else -1 for i in range(size)] - ) - return shapes - - -# Build program with inputs sizes that contain negative numbers -def test_negative_dims_program(obj): - for shape_x in generate_negative_dims(obj.shape_X): - for shape_y in generate_negative_dims(obj.shape_Y): - X = np.random.random(obj.shape_X).astype("float32") - Y = np.random.random(obj.shape_Y).astype("float32") - Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y) - with program_guard(Program(), Program()): - x = fluid.data(name='x', shape=shape_x, dtype='float32') - y = fluid.data(name='y', shape=shape_y, dtype='float32') - output = fluid.layers.matmul( - x, y, obj.transpose_X, obj.transpose_Y - ) - obj.assertEqual(len(Ref.shape), len(output.shape)) - for idx in range(len(Ref.shape)): - if output.shape[idx] != -1: - obj.assertEqual(Ref.shape[idx], output.shape[idx]) - exe = fluid.Executor(fluid.CPUPlace()) - (res,) = exe.run( - fluid.default_main_program(), - feed={'x': X, 'y': Y}, - fetch_list=[output], - ) - np.allclose(res, Ref, atol=1e-5) - - -# Generate program api cases for all negative possibilities -def api_test(dim_x, dim_y, trans_x, trans_y): - test_name = 'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( - dim_x, dim_y, trans_x, trans_y - ) - shape_x, shape_y = generate_compatible_shapes( - dim_x, dim_y, trans_x, trans_y - ) - globals()[test_name] = type( - test_name, - (unittest.TestCase,), - { - 'shape_X': shape_x, - 'shape_Y': shape_y, - 'transpose_X': trans_x, - 'transpose_Y': trans_y, - 'test_propram': test_negative_dims_program, - }, - ) - - -# Generate operators cases for all possibilities -def inject_test(dim_x, dim_y, trans_x, trans_y): - test_name = 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( - dim_x, dim_y, trans_x, trans_y - ) - shape_x, shape_y = generate_compatible_shapes( - dim_x, dim_y, trans_x, trans_y - ) - globals()[test_name] = type( - test_name, - (Generator, OpTest), - { - 'shape_X': shape_x, - 'shape_Y': shape_y, - 'transpose_X': trans_x, - 'transpose_Y': trans_y, - }, - ) - - -for dim_X in (1, 2, 3): - for dim_Y in (1, 2, 3): - for transose_x in (False, True): - for transose_y in (False, True): - inject_test(dim_X, dim_Y, transose_x, transose_y) - api_test(dim_X, dim_Y, transose_x, transose_y) - - -# Test case more batch_size and N, M, K -def generate_compatible_shapes_batch( - dim_X, dim_Y, transpose_X, transpose_Y, batch_size -): - BATCH_SIZE = 2 - M = 3 - N = 4 - K = 5 - if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): - K = 1 - if dim_X == 1: - if transpose_X: - shape_X = [M] - else: - shape_X = [K] - if dim_Y == 1: - if transpose_Y: - shape_Y = [N] - else: - shape_Y = [K] - if dim_X >= 2: - if transpose_X: - shape_X = [K, M] - else: - shape_X = [M, K] - if dim_X == 3: - shape_X = [BATCH_SIZE] + shape_X - if dim_Y >= 2: - if transpose_Y: - shape_Y = [N, K] - else: - shape_Y = [K, N] - if dim_Y == 3: - shape_Y = [BATCH_SIZE] + shape_Y - return 
shape_X, shape_Y - - # Test case n-dim def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y): M = 2 diff --git a/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py b/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py index b6c3f03f979a1..e988a875f8e95 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_mul_nn_grad.py @@ -94,7 +94,7 @@ def func(self, place): y = paddle.create_parameter( dtype=typename, shape=self.y_shape, name='y' ) - out = layers.matmul( + out = paddle.matmul( x, y, self.transpose_x, self.transpose_y, name='out' ) diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 2b06de33f2cc0..1cf0f8e3745fd 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -616,13 +616,13 @@ def create_rnn_op(self): rnn = layers.StaticRNN() def dot_attention(query, memory): - attn = layers.matmul(query, memory, transpose_y=True) + attn = paddle.matmul(query, memory, transpose_y=True) weight = paddle.nn.functional.softmax(attn) - weight_memory = layers.matmul(weight, memory) + weight_memory = paddle.matmul(weight, memory) return weight_memory, weight - y = layers.matmul(emb, w1) + y = paddle.matmul(emb, w1) with rnn.step(): pre_h = rnn.memory( shape=(self.sent_len, self.input_dim), @@ -631,7 +631,7 @@ def dot_attention(query, memory): ) step_in = rnn.step_input(x) concat_in = layers.concat([step_in, pre_h], 1) - new_h = layers.matmul(concat_in, w2) + new_h = paddle.matmul(concat_in, w2) new_h = layers.unsqueeze(new_h, [1]) new_h, _ = dot_attention(new_h, y) new_h = paddle.squeeze(new_h, [1]) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 67657071db83f..5a1aaa78338a6 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -71,14 +71,14 @@ def attention(self, hidden, encoder_output, encoder_padding_mask): query = layers.fc( hidden, size=encoder_output.shape[-1], bias_attr=False ) - attn_scores = layers.matmul( + attn_scores = paddle.matmul( layers.unsqueeze(query, [1]), encoder_output, transpose_y=True ) if encoder_padding_mask is not None: attn_scores = paddle.add(attn_scores, encoder_padding_mask) attn_scores = paddle.nn.functional.softmax(attn_scores) attn_out = paddle.squeeze( - layers.matmul(attn_scores, encoder_output), [1] + paddle.matmul(attn_scores, encoder_output), [1] ) attn_out = layers.concat([attn_out, hidden], 1) attn_out = layers.fc(attn_out, size=self.hidden_size, bias_attr=False) diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index a2c44c5fae8fa..a20573edd1716 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -115,7 +115,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): bias = self.bias_arr[k] nn = fluid.layers.concat([self._input, pre_hidden], 1) - gate_input = fluid.layers.matmul(x=nn, y=weight_1) + gate_input = paddle.matmul(x=nn, y=weight_1) gate_input = paddle.add(gate_input, bias) i, j, f, o = fluid.layers.split( @@ -234,7 +234,7 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out = paddle.reshape( rnn_out, shape=[-1, self.num_steps, 
self.hidden_size] ) - projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = paddle.matmul(rnn_out, self.softmax_weight) projection = paddle.add(projection, self.softmax_bias) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 316241caf8f63..a338d31f78fde 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -163,13 +163,13 @@ def __softmax(x, eps=1e-9): return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) scaled_q = paddle.scale(x=q, scale=d_model**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + product = paddle.matmul(x=scaled_q, y=k, transpose_y=True) weights = __softmax(paddle.add(x=product, y=attn_bias)) if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False ) - out = layers.matmul(weights, v) + out = paddle.matmul(weights, v) return out q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py index 64aa6570095fc..3cdb5094f21d4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py @@ -31,7 +31,6 @@ import paddle.incubate.nn.functional as incubate_f import paddle.nn.functional as F from paddle import tensor -from paddle.fluid import layers from paddle.fluid.framework import default_main_program from paddle.nn.layer.common import Dropout, Linear from paddle.nn.layer.norm import LayerNorm @@ -164,7 +163,7 @@ def GetBaselineOut(self): # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] # --> [B, n_head, seq_len, out_seq_len] - qk_out = layers.matmul( + qk_out = tensor.matmul( x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True ) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index d5ad5cb6f76b3..c04cc72be4d6f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -27,7 +27,6 @@ import paddle import paddle.fluid as fluid -from paddle.fluid import Program, program_guard def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): @@ -135,71 +134,11 @@ def generate_compatible_shapes_2(dim, transpose_X, transpose_Y): return shape_X, shape_Y -def generate_negative_dims(in_shape): - from itertools import combinations - - size = len(in_shape) - indexs = list() - shapes = list() - for i in range(size): - indexs.extend(list(combinations([j for j in range(size)], i + 1))) - for idx in indexs: - shapes.append( - [in_shape[i] if i not in idx else -1 for i in range(size)] - ) - return shapes - - -def test_negative_dims_program(obj): - for shape_x in generate_negative_dims(obj.shape_X): - for shape_y in generate_negative_dims(obj.shape_Y): - X = np.random.random(obj.shape_X).astype(obj.in_type) - Y = np.random.random(obj.shape_Y).astype(obj.in_type) - Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y) - with program_guard(Program(), Program()): - x = fluid.data(name='x', shape=shape_x, dtype=obj.in_type_str) - y = fluid.data(name='y', 
shape=shape_y, dtype=obj.in_type_str) - output = fluid.layers.matmul( - x, y, obj.transpose_X, obj.transpose_Y - ) - obj.assertEqual(len(Ref.shape), len(output.shape)) - for idx in range(len(Ref.shape)): - if output.shape[idx] != -1: - obj.assertEqual(Ref.shape[idx], output.shape[idx]) - exe = fluid.Executor(fluid.XPUPlace(0)) - (res,) = exe.run( - fluid.default_main_program(), - feed={'x': X, 'y': Y}, - fetch_list=[output], - ) - np.allclose(res, Ref, atol=1e-3) - - class XPUTestMatmulOpErr(XPUOpTestWrapper): def __init__(self): self.op_name = "matmul" self.use_dynamic_create_class = False - class TestMatmulOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # The inputs type of matmul_op must be Variable. - input1 = 12 - self.assertRaises( - TypeError, fluid.layers.matmul, input1, input1 - ) - # The inputs dtype of matmul_op must be float32, float16 - input2 = fluid.layers.data( - name='input2', shape=[10, 10], dtype="int32" - ) - self.assertRaises( - TypeError, fluid.layers.matmul, input2, input2 - ) - input3 = fluid.layers.data( - name='input3', shape=[2, 2], dtype="float16" - ) - fluid.layers.matmul(input3, input3) - class API_TestMm(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): @@ -399,39 +338,6 @@ def dynamic_create_class(self): return base_class, classes -class XPUTestMatmulOp2(XPUOpTestWrapper): - def __init__(self): - self.op_name = "matmul" - self.use_dynamic_create_class = True - - def dynamic_create_class(self): - base_class = unittest.TestCase - classes = [] - xpu_support_dims_list = [[1, 1], [2, 2], [3, 3]] - batch_size = [2, 4, 5, 10, 50, 100, 300] - for dims in xpu_support_dims_list: - dim_X = dims[0] - dim_Y = dims[1] - for transose_x in [True, False]: - for transose_y in [True, False]: - for batch in batch_size: - class_name = 'TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.format( - dim_X, dim_Y, transose_x, transose_y, batch - ) - shape_x, shape_y = generate_compatible_shapes( - dim_X, dim_Y, transose_x, transose_y, batch - ) - attr_dict = { - 'shape_X': shape_x, - 'shape_Y': shape_y, - 'transpose_X': transose_x, - 'transpose_Y': transose_y, - 'test_propram': test_negative_dims_program, - } - classes.append([class_name, attr_dict]) - return base_class, classes - - class XPUTestMatmulOp3(XPUOpTestWrapper): def __init__(self): self.op_name = "matmul" @@ -464,7 +370,6 @@ def dynamic_create_class(self): for stype in support_types: create_test_class(globals(), XPUTestMatmulOpErr, stype) create_test_class(globals(), XPUTestMatmulOp1, stype) - create_test_class(globals(), XPUTestMatmulOp2, stype) create_test_class(globals(), XPUTestMatmulOp3, stype) if __name__ == "__main__": From 8de336f9768b9312d894ac436e4b4c8bbd971b23 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Tue, 6 Dec 2022 13:35:50 +0800 Subject: [PATCH 07/60] [XPU] add tile_grad op (#48720) --- .../fluid/platform/device/xpu/xpu2_op_list.h | 1 + .../phi/kernels/impl/tile_grad_kernel_impl.h | 4 +- paddle/phi/kernels/xpu/tile_grad_kernel.cc | 99 +++++++++++++++++++ .../tests/unittests/xpu/test_tile_op_xpu.py | 9 ++ 4 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 paddle/phi/kernels/xpu/tile_grad_kernel.cc diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 835d52bdf4eea..59fd27dced779 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h 
@@ -663,6 +663,7 @@ XPUOpMap& get_kl2_ops() {
                     pOpKernelType(vartype::INT64, XPUPlace()),
                     pOpKernelType(vartype::BOOL, XPUPlace()),
                     pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"tile_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"transpose2_grad",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
diff --git a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h
index 05f9139b1485b..d9b97956ce9d1 100644
--- a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h
@@ -97,8 +97,8 @@ void TileGradKernel(const Context& dev_ctx,
   PADDLE_ENFORCE_GE(dims,
                     1,
                     errors::InvalidArgument(
-                        "Th rank of the input 'Out@GRAD' for tile_grad op "
-                        " must be greater than or equal to 1, but "
+                        "The rank of the input 'Out@GRAD' for tile_grad op "
+                        "must be greater than or equal to 1, but "
                         "the value received is %d.",
                         dims));
   PADDLE_ENFORCE_LE(dims,
diff --git a/paddle/phi/kernels/xpu/tile_grad_kernel.cc b/paddle/phi/kernels/xpu/tile_grad_kernel.cc
new file mode 100644
index 0000000000000..c9dce98d19234
--- /dev/null
+++ b/paddle/phi/kernels/xpu/tile_grad_kernel.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/tile_grad_kernel.h"
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TileGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const IntArray& repeat_times,
+                    DenseTensor* x_grad) {
+  auto x_dims = x.dims();
+  auto vec_x_dims = phi::vectorize(x_dims);
+  auto repeat_times_data = repeat_times.GetData();
+  if (repeat_times_data.size() < vec_x_dims.size()) {
+    int diff = vec_x_dims.size() - repeat_times_data.size();
+    repeat_times_data.insert(repeat_times_data.begin(), diff, 1);
+  } else {
+    int diff = repeat_times_data.size() - vec_x_dims.size();
+    vec_x_dims.insert(vec_x_dims.begin(), diff, 1);
+  }
+  // 1. reshape_dims_vec is the broadcast parameter.
+  // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+  //    each dimension expanded, the gradients should be summed to original
+  //    size.
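+  // For example (illustrative values): for x of shape [2, 3] and
+  // repeat_times = [2, 2], out_grad has shape [4, 6]; the loop below builds
+  //   reshape_dims_vec = [2, 2, 2, 3]  (repeat count and original extent,
+  //                                     interleaved per dimension)
+  //   reduce_dims_vec  = [0, 2]        (the repeat axes)
+  // so summing the reshaped out_grad over axes {0, 2} yields the [2, 3]
+  // gradient w.r.t. x.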
+  std::vector<int> reshape_dims_vec;
+  std::vector<int> reduce_dims_vec;
+  for (size_t i = 0; i < repeat_times_data.size(); ++i) {
+    reduce_dims_vec.push_back(reshape_dims_vec.size());
+    reshape_dims_vec.push_back(repeat_times_data[i]);
+    reshape_dims_vec.push_back(vec_x_dims[i]);
+  }
+
+  dev_ctx.template Alloc<T>(x_grad);
+
+  int dims = reduce_dims_vec.size();
+
+  bool just_copy = true;
+  for (size_t i = 0; i < repeat_times_data.size(); i++) {
+    if (repeat_times_data[i] != 1) {
+      just_copy = false;
+      break;
+    }
+  }
+  // no need to reduce, just copy
+  if (just_copy) {
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+    // TensorCopy may change the dims of dx
+    x_grad->Resize(x_dims);
+  } else {
+    PADDLE_ENFORCE_GE(dims,
+                      1,
+                      errors::InvalidArgument(
+                          "The rank of the input 'Out@GRAD' for tile_grad op "
+                          "must be greater than or equal to 1, but "
+                          "the value received is %d.",
+                          dims));
+    PADDLE_ENFORCE_LE(dims,
+                      MAX_RANK_SUPPORTED,
+                      errors::InvalidArgument(
+                          "The rank of the input 'Out@GRAD' for tile_grad op "
+                          "must be less than or equal "
+                          "to %d, but the value received is %d.",
+                          MAX_RANK_SUPPORTED,
+                          dims));
+
+    using XPUType = typename XPUTypeTrait<T>::Type;
+    // int reduce_sum(Context* ctx, const T* x, T* y, const std::vector<int>&
+    // xshape, const std::vector<int>& rdims)
+    const auto* out_data = out_grad.data<T>();
+    auto* x_grad_data = x_grad->data<T>();
+    int r = xpu::reduce_sum<XPUType>(dev_ctx.x_context(),
+                                     out_data,
+                                     x_grad_data,
+                                     reshape_dims_vec,
+                                     reduce_dims_vec);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(tile_grad, XPU, ALL_LAYOUT, phi::TileGradKernel, float) {}
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
index 41fc20daffcdf..106b915dd70aa 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
@@ -59,6 +59,9 @@ def init_data(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+
 
 # with dimension expanding
 class TestTileOpRank2Expanding(TestTileOpRank1):
     def init_data(self):
@@ -126,6 +129,9 @@ def init_data(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+
 
 class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
     def init_data(self):
         self.ori_shape = [12, 14]
@@ -168,6 +174,9 @@ def init_data(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+
 
 class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
     def init_data(self):
         self.ori_shape = [12, 14]

From 7575d37cffe1b6988f54aa7c924eabbf2f66501c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Siwek?=
Date: Tue, 6 Dec 2022 06:43:04 +0100
Subject: [PATCH 08/60] [PHI] Migrate elementwise_(add/mul) kernels (#48625)

* remove fluid code

* init

* typo

* fix merge conflicts
---
 .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc |   2 +-
 .../ir/mkldnn/mkldnn_inplace_pass_tester.cc   |   2 +-
 .../mkldnn/elementwise_add_mkldnn_op.cc       |  27 --
 .../mkldnn/elementwise_mkldnn_op.h            | 415 ------------------
 .../mkldnn/elementwise_mul_mkldnn_op.cc       |  27 --
 .../elementwise/unity_build_rule.cmake        |   2 -
 .../operators/mkldnn/test_mkldnn_caching.cc   |   4 +-
 .../mkldnn/test_mkldnn_op_inplace.cc          |   2 +-
paddle/phi/kernels/elementwise_kernel.cc | 14 - .../phi/kernels/onednn/elementwise_kernel.cc | 69 ++- 10 files changed, 69 insertions(+), 495 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc delete mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h delete mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 1762f638e7de8..7911b125b1215 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(batch_norm, OneDNN, ONEDNN); USE_OP_ITSELF(conv2d_transpose); PD_DECLARE_KERNEL(conv2d_transpose, OneDNN, ONEDNN); USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); +PD_DECLARE_KERNEL(add_raw, OneDNN, ONEDNN); USE_OP_ITSELF(gelu); PD_DECLARE_KERNEL(gelu, OneDNN, ONEDNN); PD_DECLARE_ARG_MAPPING_FN(gelu); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 673f7cd88d6ca..e8116fc47c8a6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -25,7 +25,7 @@ USE_OP_ITSELF(softmax); PD_DECLARE_KERNEL(softmax, OneDNN, ONEDNN); USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); +PD_DECLARE_KERNEL(add_raw, OneDNN, ONEDNN); USE_OP_ITSELF(leaky_relu); PD_DECLARE_KERNEL(leaky_relu, OneDNN, ONEDNN); USE_OP_ITSELF(gelu); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc deleted file mode 100644 index 57996477e38a9..0000000000000 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL( - elementwise_add, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h deleted file mode 100644 index 6c7a8a7a66cf5..0000000000000 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ /dev/null @@ -1,415 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include - -#include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using dnnl::memory; -using dnnl::primitive; -using dnnl::stream; -using phi::DataLayout; -using phi::OneDNNContext; -using phi::funcs::BinaryOneDNNHandler; - -inline std::vector CalculateBroadcastedDims( - const phi::DenseTensor* x, const phi::DenseTensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); - - std::vector dst_tz_ex(src_tz.size(), 1); - - if (src_tz.size() == dst_tz.size()) { - for (size_t i = 0; i < src_tz.size(); i++) { - dst_tz_ex[i] = (src_tz[i] == dst_tz[i]) ? dst_tz[i] : 1; - } - } else { - size_t j = 0; - for (size_t i = 0; i < src_tz.size(); i++) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } - } - - return dst_tz_ex; -} - -inline void AddSubNonBroadcast( - phi::funcs::ReorderOneDNNHandler* reorder_handler, - phi::DenseTensor* grad_tensor, - const std::shared_ptr& src_memory, - const std::shared_ptr& dst_memory, - const std::vector& scales) { - dnnl::primitive_attr reorder_attr; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = - reorder_handler->AcquireReorder(dst_memory, src_memory, reorder_attr); - - reorder_p->execute( - OneDNNContext::tls().get_stream(), *src_memory, *dst_memory); -} - -template -inline void BroadcastReduction(const framework::ExecutionContext& ctx, - const dnnl::engine& onednn_engine, - phi::DenseTensor* grad_tensor, - const phi::DenseTensor* dout, - const std::shared_ptr& src_memory, - std::shared_ptr dst_memory, - const std::vector& scales, - const bool is_sub) { - dnnl::primitive_attr broadcast_reduction_attr; - - // Broadcasting - if (is_sub) { - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0); - broadcast_reduction_attr.set_post_ops(po); - } - - phi::funcs::ReductionOneDNNHandler reduction_handler( - dnnl::algorithm::reduction_sum, - 0.0f, - 0.0f, - onednn_engine, - ctx.GetPlace(), - dout, - grad_tensor, - CalculateBroadcastedDims(dout, grad_tensor), - broadcast_reduction_attr); - dst_memory = reduction_handler.AcquireDstMemory(grad_tensor); - - auto reduction_p = reduction_handler.AcquireForwardPrimitive(); - auto astream = OneDNNContext::tls().get_stream(); - reduction_p->execute(astream, - { - {DNNL_ARG_SRC, *src_memory}, - {DNNL_ARG_DST, *dst_memory}, - }); - astream.wait(); - grad_tensor->set_mem_desc(dst_memory->get_desc().reshape( - phi::vectorize(grad_tensor->dims()))); -} - -template -class EltwiseMKLDNNKernel : public framework::OpKernel { - private: - dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { - dnnl::post_ops post_operations; - platform::AppendActivation(ctx, post_operations); - if (ctx.HasAttr("fused_output_scale")) { - float scale_alpha = ctx.Attr("fused_output_scale"); - 
post_operations.append_eltwise( - 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); - } - return post_operations; - } - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - float scale_x = ctx.Attr("Scale_x"); - float scale_y = ctx.Attr("Scale_y"); - float scale_o = ctx.Attr("Scale_out"); - int axis = ctx.Attr("axis"); - - BinaryOneDNNHandler handler(BINARY_OP, - axis, - mkldnn_engine, - ctx.GetPlace(), - x, - y, - z, - scale_x, - scale_y, - scale_o, - true, - get_post_ops(ctx)); - - // oneDNN's binary is optimized for broadcasting y into x, so in other case - // we have to swap tensors to achieve optimal performance - if (x->numel() < y->numel()) { - std::swap(x, y); - } - - const auto src_x_memory = handler.AcquireSrcMemory(x); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - // (jczaja) For Inplace src and dst should be the same memory object. - // So x should share buffer with z. But UT mechanics is testing inplace - // execution for this op not checking that x can be bradcasted to match in - // shape y tensor. - // This is wrong as when x is to be broadcasted then z(out) will match the - // shape of y which is bigger than x. Hence if x is smaller in shape than z - // and they share a buffer (of - // shape x) then this buffer is not big enough to hold result of elementwise - // operation. - const bool reuse_x_memopry = - x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory; - if (reuse_x_memopry) { - dst_memory = src_x_memory; - // NOTE(chenfeiyu): when the output reuses memory from other tensor rather - // than allocate its own, it's still need to take care of its data type. - // Unfortunately, paddle's operator only infers the output' shape, but not - // the data type. mutable_data takes care of allocation and data type - // normally, but if the memory is already allocated and there is no need - // to re-allocate, it just set the data type. So this it added there to - // get the right data type. 
- z->mutable_data(ctx.GetPlace()); - } else { - dst_memory = handler.AcquireDstMemory(z); - } - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - auto& astream = OneDNNContext::tls().get_stream(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_x_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - if (handler.use_broadcasting_hack == false) { - platform::SetOutMemDescWithLogicalLayoutFusesSupport( - ctx, z, dst_memory->get_desc()); - } else { - auto dims = dst_memory->get_desc().dims(); - dims.insert(dims.begin(), x->dims()[0]); - dims[1] /= dims[0]; - platform::SetOutMemDescWithLogicalLayoutFusesSupport( - ctx, z, dst_memory->get_desc().reshape(dims)); - } - } -}; - -template -class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - // oneDNN's binary is optimized for broadcasting y into x, so in other case - // we have to swap tensors to achieve optimal performance - bool swap_x_y = false; - if (x->numel() < y->numel()) { - std::swap(x, y); - std::swap(dx, dy); - swap_x_y = true; - } - - std::vector scales{1.0}; - if (swap_x_y) { - scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; - } - - int axis = ctx.Attr("axis"); - - auto tz = phi::vectorize(dout->dims()); - auto dout_type = phi::funcs::ToOneDNNDataType(dout->dtype()); - - phi::funcs::ReorderOneDNNHandler reorder_handler( - tz, dout->dtype(), dout_type, onednn_engine); - - auto reorder_src_memory = reorder_handler.AcquireSrcMemory( - dout->mem_desc(), phi::funcs::to_void_cast(dout->data())); - - std::shared_ptr dst_memory; - std::shared_ptr broadcast_src_memory = reorder_src_memory; - - auto& astream = OneDNNContext::tls().get_stream(); - if (dx) { - // elementwise_add & elementwise_sub - if (BINARY_OP == dnnl::algorithm::binary_add || - BINARY_OP == dnnl::algorithm::binary_sub) { - if (dout->dims() == dx->dims()) { - dst_memory = reorder_handler.AcquireDstMemory( - dx, dout->mem_desc(), ctx.GetPlace()); - AddSubNonBroadcast( - &reorder_handler, dx, reorder_src_memory, dst_memory, scales); - } - } else { // elementwise_mul & elementwise_div - BinaryOneDNNHandler binary_handler(BINARY_OP, - axis, - onednn_engine, - ctx.GetPlace(), - dout, - y, - dx, - 1.0f, - 1.0f, - 1.0f, - false); - - const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); - const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); - dst_memory = binary_handler.AcquireDstMemory(dx); - - const auto binary_prim = binary_handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_memory}}; - - binary_prim->execute(astream, args); - } - astream.wait(); - - if (dout->dims() != dx->dims()) { - BroadcastReduction(ctx, - onednn_engine, - dx, - dout, - broadcast_src_memory, - dst_memory, - scales, - BINARY_OP == dnnl::algorithm::binary_sub); - } else { - dx->set_mem_desc(dst_memory->get_desc()); - } - } - - if (dy) { - // elementwise_add & 
elementwise_sub - if (BINARY_OP == dnnl::algorithm::binary_add || - BINARY_OP == dnnl::algorithm::binary_sub) { - if (dout->dims() == dy->dims()) { - dst_memory = reorder_handler.AcquireDstMemory( - dy, dout->mem_desc(), ctx.GetPlace()); - AddSubNonBroadcast( - &reorder_handler, dy, reorder_src_memory, dst_memory, scales); - } - } else { // elementwise_mul & elementwise_div - std::unordered_map args; - std::shared_ptr binary_prim; - std::shared_ptr post_op_memory; - std::shared_ptr src_0_memory; - std::shared_ptr src_1_memory; - - BinaryOneDNNHandler binary_handler(dnnl::algorithm::binary_mul, - axis, - onednn_engine, - ctx.GetPlace(), - dout, - x, - nullptr, - 1.0f, - 1.0f, - 1.0f, - false); - - src_1_memory = binary_handler.AcquireSecondSrcMemory(x); - - if (BINARY_OP == dnnl::algorithm::binary_div) { - BinaryOneDNNHandler post_op_binary_handler( - dnnl::algorithm::binary_div, - axis, - onednn_engine, - ctx.GetPlace(), - y, - y, - nullptr, - 1.0f, - 1.0f, - 1.0f, - false); - - post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, - post_op_memory->get_desc()); - - binary_handler = BinaryOneDNNHandler(dnnl::algorithm::binary_mul, - axis, - onednn_engine, - ctx.GetPlace(), - dout, - out, - nullptr, - -1.0f, - 1.0f, - 1.0f, - false, - po); - - src_1_memory = binary_handler.AcquireSecondSrcMemory(out); - } - - src_0_memory = binary_handler.AcquireSrcMemory(dout); - - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? binary_handler.AcquireDstMemory(dy) - : binary_handler.AcquireDstMemory(); - - binary_prim = binary_handler.AcquireForwardPrimitive(); - args = {{DNNL_ARG_SRC_0, *src_0_memory}, - {DNNL_ARG_SRC_1, *src_1_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - if (BINARY_OP == dnnl::algorithm::binary_div) - args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, - *post_op_memory}); - - binary_prim->execute(astream, args); - broadcast_src_memory = dst_dy_memory; - dst_memory = dst_dy_memory; - } - astream.wait(); - - if (dout->dims() != dy->dims()) { - BroadcastReduction(ctx, - onednn_engine, - dy, - dout, - broadcast_src_memory, - dst_memory, - scales, - BINARY_OP == dnnl::algorithm::binary_sub); - } else { - dy->set_mem_desc(dst_memory->get_desc()); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc deleted file mode 100644 index ba3a0d87f6cf7..0000000000000 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
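Note: the fluid-side registrations deleted in this file (and in elementwise_add_mkldnn_op.cc above) are not lost; the same kernels are re-exposed through PHI in onednn/elementwise_kernel.cc further down. A sketch of the replacement pattern, using the names this patch actually adds:

    // fluid registration (deleted here):
    //   REGISTER_OP_KERNEL(elementwise_mul, MKLDNN,
    //                      ::paddle::platform::CPUPlace, ...);
    // PHI registration (added below):
    PD_REGISTER_KERNEL(multiply_raw, OneDNN, ONEDNN, phi::MultiplyRawKernel,
                       float, phi::dtype::bfloat16, int8_t, uint8_t) {}

This is also why the C++ test declarations in this patch switch from USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN) to PD_DECLARE_KERNEL(multiply_raw, OneDNN, ONEDNN).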
- -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL( - elementwise_mul, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) diff --git a/paddle/fluid/operators/elementwise/unity_build_rule.cmake b/paddle/fluid/operators/elementwise/unity_build_rule.cmake index 060c990ea8712..057e8d700ecbc 100644 --- a/paddle/fluid/operators/elementwise/unity_build_rule.cmake +++ b/paddle/fluid/operators/elementwise/unity_build_rule.cmake @@ -7,14 +7,12 @@ register_unity_group( cc elementwise_add_op.cc - mkldnn/elementwise_add_mkldnn_op.cc elementwise_div_op.cc elementwise_floordiv_op.cc elementwise_max_op.cc elementwise_min_op.cc elementwise_mod_op.cc elementwise_mul_op.cc - mkldnn/elementwise_mul_mkldnn_op.cc elementwise_pow_op.cc elementwise_sub_op.cc) register_unity_group( diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index e005683e24228..cbf0b918e6d72 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -28,9 +28,9 @@ #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); +PD_DECLARE_KERNEL(add_raw, OneDNN, ONEDNN); USE_OP_ITSELF(elementwise_mul); -USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); +PD_DECLARE_KERNEL(multiply_raw, OneDNN, ONEDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 8aa299570443b..2c8ef7f0981dd 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -28,7 +28,7 @@ #include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); +PD_DECLARE_KERNEL(add_raw, OneDNN, ONEDNN); USE_OP_ITSELF(relu); PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN); USE_OP_ITSELF(softmax); diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index d4ffd49b5fc48..c6031b34af249 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -414,17 +414,3 @@ PD_REGISTER_KERNEL(elementwise_pow, float, phi::dtype::float16) {} #endif - -#if defined PADDLE_WITH_MKLDNN -PD_REGISTER_KERNEL(subtract, - OneDNN, - ONEDNN, - phi::SubtractKernel, - float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} - -PD_REGISTER_KERNEL( - divide, OneDNN, ONEDNN, phi::DivideKernel, float, phi::dtype::bfloat16) {} -#endif diff --git a/paddle/phi/kernels/onednn/elementwise_kernel.cc b/paddle/phi/kernels/onednn/elementwise_kernel.cc index 29d527a523fbf..b786da7a31915 100644 --- a/paddle/phi/kernels/onednn/elementwise_kernel.cc +++ b/paddle/phi/kernels/onednn/elementwise_kernel.cc @@ -32,14 +32,14 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, float scale_x = dev_ctx.HasDnnAttr("Scale_x") ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) - : 1; + : 1.0f; float scale_y = dev_ctx.HasDnnAttr("Scale_y") ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) - : 1; + : 1.0f; float scale_out = dev_ctx.HasDnnAttr("Scale_out") ? 
PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) - : 1; + : 1.0f; dnnl::post_ops post_operations; funcs::AppendActivation(dev_ctx, post_operations); @@ -114,12 +114,14 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, astream.wait(); if (handler.use_broadcasting_hack == false) { - out->set_mem_desc(dst_memory->get_desc()); + funcs::SetOutMemDescWithLogicalLayoutFusesSupport( + dev_ctx, out, dst_memory->get_desc()); } else { auto dims = dst_memory->get_desc().dims(); dims.insert(dims.begin(), non_const_x->dims()[0]); dims[1] /= dims[0]; - out->set_mem_desc(dst_memory->get_desc().reshape(dims)); + funcs::SetOutMemDescWithLogicalLayoutFusesSupport( + dev_ctx, out, dst_memory->get_desc().reshape(dims)); } } @@ -131,13 +133,40 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, int axis, \ DenseTensor* out) { \ ElementwiseKernel(dev_ctx, x, y, axis, out); \ + } \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + ElementwiseKernel(dev_ctx, x, y, -1, out); \ } +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Add, dnnl::algorithm::binary_add) DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Subtract, dnnl::algorithm::binary_sub) +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Multiply, dnnl::algorithm::binary_mul) DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Divide, dnnl::algorithm::binary_div) } // namespace phi +PD_REGISTER_KERNEL(add_raw, + OneDNN, + ONEDNN, + phi::AddRawKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(add, + OneDNN, + ONEDNN, + phi::AddKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + PD_REGISTER_KERNEL(subtract_raw, OneDNN, ONEDNN, @@ -147,9 +176,39 @@ PD_REGISTER_KERNEL(subtract_raw, int8_t, uint8_t) {} +PD_REGISTER_KERNEL(subtract, + OneDNN, + ONEDNN, + phi::SubtractKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(multiply_raw, + OneDNN, + ONEDNN, + phi::MultiplyRawKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(multiply, + OneDNN, + ONEDNN, + phi::MultiplyKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + PD_REGISTER_KERNEL(divide_raw, OneDNN, ONEDNN, phi::DivideRawKernel, float, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL( + divide, OneDNN, ONEDNN, phi::DivideKernel, float, phi::dtype::bfloat16) {} From 06a92c5083b522ec09a60331843c59513d8da5a8 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Tue, 6 Dec 2022 14:07:14 +0800 Subject: [PATCH 09/60] cutlass (#48706) --- cmake/external/cutlass.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake index a80a729a13957..c96631206dfd7 100644 --- a/cmake/external/cutlass.cmake +++ b/cmake/external/cutlass.cmake @@ -17,7 +17,7 @@ include(ExternalProject) set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass) set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git) -set(CUTLASS_TAG v2.9.1) +set(CUTLASS_TAG v2.10.0) include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/") include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/") From 06b32b38685c7b0a4ac93de869fe49fd99f51d5a Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Tue, 6 Dec 2022 14:30:43 +0800 Subject: [PATCH 10/60] add xpu_support op function (#48606) *test=kunlun --- paddle/fluid/framework/operator.cc | 45 +- paddle/fluid/imperative/prepared_operator.cc | 16 +- .../fluid/platform/device/xpu/xpu1_op_list.h | 376 
----------- .../fluid/platform/device/xpu/xpu_op_list.cc | 67 +- .../fluid/platform/device/xpu/xpu_op_list.h | 15 +- paddle/fluid/pybind/place.cc | 2 +- paddle/phi/backends/xpu/CMakeLists.txt | 2 +- paddle/phi/backends/xpu/xpu1_op_list.cc | 34 + paddle/phi/backends/xpu/xpu2_op_list.cc | 623 ++++++++++++++++++ paddle/phi/backends/xpu/xpu_op_list.cc | 19 +- paddle/phi/backends/xpu/xpu_op_list.h | 13 +- paddle/phi/core/kernel_factory.cc | 3 +- 12 files changed, 746 insertions(+), 469 deletions(-) delete mode 100644 paddle/fluid/platform/device/xpu/xpu1_op_list.h create mode 100644 paddle/phi/backends/xpu/xpu1_op_list.cc create mode 100644 paddle/phi/backends/xpu/xpu2_op_list.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8e9753e1d7fbd..538a76e738904 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1319,9 +1319,10 @@ bool OperatorWithKernel::SupportXPU() const { op_kernels.end(), [this](OpKernelMap::const_reference kern_pair) { return platform::is_xpu_place(kern_pair.first.place_) && - paddle::platform::is_xpu_support_op(type_, - kern_pair.first) && - !paddle::platform::is_in_xpu_black_list(type_); + paddle::platform::is_xpu_support_op( + type_, + framework::TransToPhiDataType( + kern_pair.first.data_type_)); }); } } @@ -1409,8 +1410,8 @@ bool OperatorWithKernel::SupportsKernelType( #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(kernel_type.place_)) { return kernel_iter != kernels.end() && - paddle::platform::is_xpu_support_op(type_, kernel_type) && - !paddle::platform::is_in_xpu_black_list(type_); + paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type.data_type_)); } #endif @@ -1418,7 +1419,8 @@ bool OperatorWithKernel::SupportsKernelType( if (paddle::platform::is_xpu_place(kernel_type.place_)) { bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, kernel_type); + paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type.data_type_)); bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(type_); bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); @@ -1428,8 +1430,8 @@ bool OperatorWithKernel::SupportsKernelType( return kernels.find(tmp_kernel_type) != kernels.end(); } return kernel_iter != kernels.end() && - paddle::platform::is_xpu_support_op(type_, kernel_type) && - !paddle::platform::is_in_xpu_black_list(type_); + paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type.data_type_)); } #endif @@ -1591,7 +1593,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (paddle::platform::is_xpu_place(kernel_type_->place_)) { bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type_->data_type_)); bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(type_); if (use_xpu_kp_kernel_rt) { @@ -1668,7 +1671,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (paddle::platform::is_xpu_place(kernel_type_->place_)) { bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type_->data_type_)); bool use_xpu_kp_kernel_debug = 
paddle::platform::is_in_xpu_kpwhite_list(type_); if (use_xpu_kp_kernel_rt) { @@ -1709,14 +1713,15 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(kernel_type_->place_) && - !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || - paddle::platform::is_in_xpu_black_list(type_); + !paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type_->data_type_)); #endif #ifdef PADDLE_WITH_XPU_KP bool use_xpu_kp_kernel_rt = paddle::platform::is_xpu_place(kernel_type_->place_) && FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(kernel_type_->data_type_)); bool use_xpu_kp_kernel_debug = paddle::platform::is_xpu_place(kernel_type_->place_) && paddle::platform::is_in_xpu_kpwhite_list(type_); @@ -2051,8 +2056,9 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || - !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(type_))) { + !paddle::platform::is_xpu_support_op( + type_, + framework::TransToPhiDataType(expected_kernel_key.data_type_)))) { VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -2065,7 +2071,9 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + paddle::platform::is_xpu_support_op( + type_, + framework::TransToPhiDataType(expected_kernel_key.data_type_)); bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(type_); if (use_xpu_kp_kernel_rt) { @@ -2093,9 +2101,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { << ", using_kernel_key:" << expected_kernel_key; } } - bool is_xpu_unsupport = - (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(type_)); + bool is_xpu_unsupport = (!paddle::platform::is_xpu_support_op( + type_, framework::TransToPhiDataType(expected_kernel_key.data_type_))); if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "fluid missing XPU kernel: " << type_ diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 136e5c185888a..b0cd6b07a4046 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -255,9 +255,9 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), - expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()); + !paddle::platform::is_xpu_support_op( + op.Type(), + framework::TransToPhiDataType(expected_kernel_key.data_type_)); #endif #ifdef PADDLE_WITH_MLU @@ -292,8 +292,10 @@ PreparedOp PrepareImpl( #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op( - 
op.Type(), expected_kernel_key); + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_support_op( + op.Type(), + framework::TransToPhiDataType(expected_kernel_key.data_type_)); bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(op.Type()); if (use_xpu_kp_kernel_rt) { @@ -368,7 +370,9 @@ PreparedOp PrepareImpl( bool use_xpu_kp_kernel_rt = paddle::platform::is_xpu_place(expected_kernel_key.place_) && FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + paddle::platform::is_xpu_support_op( + op.Type(), + framework::TransToPhiDataType(expected_kernel_key.data_type_)); bool use_xpu_kp_kernel_debug = paddle::platform::is_xpu_place(expected_kernel_key.place_) && paddle::platform::is_in_xpu_kpwhite_list(op.Type()); diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h deleted file mode 100644 index 06794268f04b2..0000000000000 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ /dev/null @@ -1,376 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#ifdef PADDLE_WITH_XPU -#include -#include -#include - -#include "paddle/fluid/framework/op_kernel_type.h" - -namespace paddle { -namespace platform { - -using vartype = paddle::framework::proto::VarType; -using pOpKernelType = paddle::framework::OpKernelType; -using XPUKernelSet = - std::unordered_set; -using XPUOpMap = std::unordered_map; - -XPUOpMap& get_kl1_ops() { - // KL1支持的op,通过op_name, data_type, place来索引 - static XPUOpMap s_xpu1_kernels{ - {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"affine_channel_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"affine_channel", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace())})}, - {"batch_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - 
pOpKernelType(vartype::INT64, XPUPlace())})}, - {"cast", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"clip_by_norm", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"coalesce_tensor", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"deformable_conv", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"deformable_conv_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_allreduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_reduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_add", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_add_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_floordiv", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_mul_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_mul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_pow", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"fill_any_like", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace())})}, - {"fill_constant", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - 
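// Illustrative note, not part of the deleted header: each entry in this
// legacy table keyed its kernel set by a full OpKernelType (data type plus
// place), so a KL1 support query first had to construct a key, roughly:
//
//   pOpKernelType key(vartype::FP32, XPUPlace());
//   bool ok = ops.count("gather") && ops["gather"].count(key);
//
// The phi replacements introduced below key each set by phi::DataType only.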
{"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gaussian_random", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"load", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicaland", - XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalnot", - XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalor", - XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, - {"nearest_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"one_hot", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"split", - 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"truncated_gaussian_random", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"uniform_random", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, 
XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where_index", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, - // AddMore - }; - - return s_xpu1_kernels; -} - -} // namespace platform -} // namespace paddle -#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index c5363e6d6c960..220a41c1b1042 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -15,25 +15,14 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/device/xpu/xpu1_op_list.h" -#include "paddle/fluid/platform/device/xpu/xpu2_op_list.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h" +#include "paddle/phi/backends/xpu/xpu_op_list.h" namespace paddle { namespace platform { -bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { - auto v = get_xpu_version(type.place_.device); - auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() - : get_kl2_ops(); - if (ops.find(op_name) != ops.end() && - ops[op_name].find(type) != ops[op_name].end()) { - return true; - } - return false; -} - // ops_string contains op_list(e.g., 'mul,mul_grad'), parse the op string and // insert op to op set static void tokenize(const std::string& ops, @@ -50,18 +39,6 @@ static void tokenize(const std::string& ops, } #ifdef PADDLE_WITH_XPU_KP -bool is_xpu_kp_support_op(const std::string& op_name, - const pOpKernelType& type) { - auto v = get_xpu_version(type.place_.device); - auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() - : get_kp_ops(); - if (ops.find(op_name) != ops.end() && - ops[op_name].find(type) != ops[op_name].end()) { - return true; - } - return false; -} - bool is_in_xpu_kpwhite_list(const std::string& op_name) { static bool inited = false; static std::unordered_set xpu_kpwhite_list; @@ -88,49 +65,37 @@ bool is_in_xpu_kpwhite_list(const std::string& op_name) { } #endif -#ifdef PADDLE_WITH_XPU_KP -std::vector get_xpu_kp_op_support_type( - const std::string& op_name, phi::backends::xpu::XPUVersion version) { - std::vector res; - auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() - : get_kp_ops(); - if (ops.find(op_name) != ops.end()) { - XPUKernelSet& type_set = ops[op_name]; - for (auto& item : type_set) { - res.push_back(item.data_type_); - } - } - return res; -} -#endif - std::vector get_xpu_op_support_type( const std::string& op_name, phi::backends::xpu::XPUVersion version) { + auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 + ? phi::backends::xpu::get_kl1_ops() + : phi::backends::xpu::get_kl2_ops(); std::vector res; - auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 ? 
get_kl1_ops() - : get_kl2_ops(); if (ops.find(op_name) != ops.end()) { - XPUKernelSet& type_set = ops[op_name]; - for (auto& item : type_set) { - res.push_back(item.data_type_); + auto& dtypes = ops[op_name]; + for (auto& type : dtypes) { + res.push_back(static_cast(phi::TransToProtoVarType(type))); } } return res; } XPUOpListMap get_xpu_op_list(phi::backends::xpu::XPUVersion version) { + auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 + ? phi::backends::xpu::get_kl1_ops() + : phi::backends::xpu::get_kl2_ops(); XPUOpListMap res; - auto& ops = version == phi::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() - : get_kl2_ops(); for (auto& op : ops) { - std::vector op_vartypes; + std::vector op_types; for (auto& item : op.second) { - op_vartypes.push_back(item.data_type_); + op_types.push_back( + static_cast(phi::TransToProtoVarType(item))); } - res[op.first] = std::move(op_vartypes); + res[op.first] = std::move(op_types); } return res; } + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index d701294865d6d..3da4e7b190c41 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -21,22 +21,17 @@ limitations under the License. */ namespace paddle { namespace platform { -using pOpKernelType = paddle::framework::OpKernelType; -using vartype = paddle::framework::proto::VarType; -using XPUOpListMap = - std::unordered_map>; - -bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); using phi::backends::xpu::is_in_xpu_black_list; +using phi::backends::xpu::is_xpu_support_op; #ifdef PADDLE_WITH_XPU_KP -bool is_xpu_kp_support_op(const std::string& op_name, - const pOpKernelType& type); bool is_in_xpu_kpwhite_list(const std::string& op_name); -std::vector get_xpu_kp_op_support_type( - const std::string& op_name, phi::backends::xpu::XPUVersion version); #endif +using vartype = paddle::framework::proto::VarType; +using XPUOpListMap = + std::unordered_map>; + std::vector get_xpu_op_support_type( const std::string& op_name, phi::backends::xpu::XPUVersion version); XPUOpListMap get_xpu_op_list(phi::backends::xpu::XPUVersion version); diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index b39427f8b3962..309fad4273dc2 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -456,7 +456,7 @@ void BindPlace(pybind11::module &m) { // NOLINT #ifdef PADDLE_WITH_XPU_KP m.def("get_xpu_device_op_support_types", [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_kp_op_support_type(op_name, version); + return platform::get_xpu_op_support_type(op_name, version); }); #else m.def("get_xpu_device_op_support_types", diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt index 0e4fa36ea162b..d84e6a63e058a 100644 --- a/paddle/phi/backends/xpu/CMakeLists.txt +++ b/paddle/phi/backends/xpu/CMakeLists.txt @@ -1,4 +1,4 @@ cc_library( phi_xpu_op_list - SRCS xpu_op_list.cc + SRCS xpu_op_list.cc xpu1_op_list.cc xpu2_op_list.cc DEPS glog) diff --git a/paddle/phi/backends/xpu/xpu1_op_list.cc b/paddle/phi/backends/xpu/xpu1_op_list.cc new file mode 100644 index 0000000000000..87c0502abba07 --- /dev/null +++ b/paddle/phi/backends/xpu/xpu1_op_list.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or
+agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+or implied. See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/phi/backends/xpu/xpu_op_list.h"
+
+namespace phi {
+namespace backends {
+namespace xpu {
+
+XPUOpMap& get_kl1_ops() {
+  // Ops supported on KL1, indexed by op_name and data_type.
+  static XPUOpMap s_xpu1_kernels{
+      // AddMore
+  };
+
+  PD_THROW("get_kl1_ops unsupported");
+  return s_xpu1_kernels;
+}
+
+}  // namespace xpu
+}  // namespace backends
+}  // namespace phi
+#endif
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
new file mode 100644
index 0000000000000..b1357b32e38b5
--- /dev/null
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -0,0 +1,623 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or
+agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+or implied. See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include "paddle/phi/backends/xpu/xpu_op_list.h"
+
+namespace phi {
+namespace backends {
+namespace xpu {
+
+XPUOpMap& get_kl2_ops() {
+  // Ops supported on KL2, indexed by op_name, data_type and place.
+  static XPUOpMap s_xpu2_kernels{
+      {"abs", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"abs_grad",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"adadelta", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"adamw", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"adam", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+      {"arg_max", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"argsort_grad",
+       XPUKernelSet({phi::DataType::INT32,
+                     phi::DataType::INT64,
+                     phi::DataType::FLOAT32})},
+      {"argsort",
+       XPUKernelSet({phi::DataType::INT32,
+                     phi::DataType::INT64,
+                     phi::DataType::FLOAT32})},
+      {"assign",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT64,
+                     phi::DataType::INT32,
+                     phi::DataType::INT64,
+                     phi::DataType::BOOL})},
+      {"assign_value", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"batch_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"batch_norm", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"bmm", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"bmm_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"bce_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"bce_loss", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"beam_search",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT64,
+                     phi::DataType::INT32,
+                     phi::DataType::INT64})},
+      {"beam_search_decode",
+       XPUKernelSet({phi::DataType::FLOAT32,
+                     phi::DataType::FLOAT64,
+                     phi::DataType::FLOAT16,
+                     phi::DataType::INT32,
+                     phi::DataType::INT64})},
+      {"bilinear_interp_v2",
XPUKernelSet({phi::DataType::FLOAT32})}, + {"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"broadcast", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_allgather", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"c_allreduce_sum", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::INT32})}, + {"c_embedding", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_identity", + XPUKernelSet({phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"c_sync_calc_stream", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_sync_comm_stream", XPUKernelSet({phi::DataType::FLOAT32})}, + {"cast", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT64, + phi::DataType::BOOL, + phi::DataType::UINT8, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"check_finite_and_unscale", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"clip", XPUKernelSet({phi::DataType::FLOAT32})}, + {"clip_by_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"coalesce_tensor", XPUKernelSet({phi::DataType::FLOAT32})}, + {"concat_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"concat", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64})}, + {"conv2d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv2d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv3d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv3d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"conv2d_transpose_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"conv2d_transpose", XPUKernelSet({phi::DataType::FLOAT32})}, + {"cumsum", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"deformable_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"deformable_conv", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"dropout_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"dropout", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_add_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_add", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_div_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_div", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_floordiv", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_max_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_max", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_min_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_min", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_mul_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_mul", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_pow", + XPUKernelSet({phi::DataType::FLOAT32, 
phi::DataType::FLOAT16})}, + {"elementwise_sub_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"elementwise_sub", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64})}, + {"elementwise_mod", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"empty", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32, + phi::DataType::FLOAT64})}, + {"embedding_sparse_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"exp_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"exp", XPUKernelSet({phi::DataType::FLOAT32})}, + {"expand_as_v2", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"expand_v2", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"fill_any_like", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"fill_constant", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::UINT8, + phi::DataType::BOOL, + phi::DataType::FLOAT64, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::COMPLEX64, + phi::DataType::COMPLEX128})}, + {"flatten2_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flatten2", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flatten_contiguous_range_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"flatten_contiguous_range", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"flatten_grad", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"flatten", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::INT8, + phi::DataType::FLOAT32})}, + {"unfold", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"unfold_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"floor", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gather_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"gather_nd", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"gather", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gelu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"gelu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"generate_proposals_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"grad_add", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"greater_equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"greater_than", + XPUKernelSet({phi::DataType::INT64, + 
phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"grid_sampler", XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_sigmoid", XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_swish_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"hard_swish", XPUKernelSet({phi::DataType::FLOAT32})}, + {"huber_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"huber_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"kldiv_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"kldiv_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"iou_similarity", XPUKernelSet({phi::DataType::FLOAT32})}, + {"index_select", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"instance_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"instance_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"label_smooth", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lamb", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"lars_momentum", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"layer_norm_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"layer_norm", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"leaky_relu_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"leaky_relu", XPUKernelSet({phi::DataType::FLOAT32})}, + {"less_equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"less_than", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"load_combine", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT8, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"log", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log_softmax", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log_softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"masked_select", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"masked_select_grad", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"matmul_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul_v2_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul_v2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"matmul", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mean_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"merged_momentum", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mish_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"mish", XPUKernelSet({phi::DataType::FLOAT32})}, + {"momentum", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mul", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"mul_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"nearest_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + 
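// Illustrative note, not part of the generated table: with phi::DataType as
// the key, a KL2 support query is two hash lookups after the blacklist check
// that is_xpu_support_op() performs first, roughly:
//
//   auto& ops = phi::backends::xpu::get_kl2_ops();
//   bool fp16_pool = ops.count("pool2d") &&
//                    ops.at("pool2d").count(phi::DataType::FLOAT16);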
{"not_equal", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"one_hot", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"one_hot_v2", + XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"p_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"p_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pad3d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pad3d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pool2d_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pool2d", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pow", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pow_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pow2_decay_with_linear_warmup", XPUKernelSet({phi::DataType::FLOAT32})}, + {"prior_box", XPUKernelSet({phi::DataType::FLOAT32})}, + {"range", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT64})}, + {"reciprocal", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reciprocal_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"reduce_max_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_max", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_mean_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_mean", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_min", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_prod", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_sum_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_sum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"relu6", XPUKernelSet({phi::DataType::FLOAT32})}, + {"relu6_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"relu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"relu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"reshape2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"reshape2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::FLOAT16, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"resnet_unit", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"resnet_unit_grad", + XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, + {"rmsprop", XPUKernelSet({phi::DataType::FLOAT32})}, + {"rnn", XPUKernelSet({phi::DataType::FLOAT32})}, + {"rnn_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roi_align", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roi_align_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roll", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"scale", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT64})}, + {"scatter", + XPUKernelSet({phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"sampling_id", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT64})}, + {"sgd", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"sgd_dense_param_sparse_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"silu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"silu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"sigmoid_cross_entropy_with_logits_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"sigmoid_cross_entropy_with_logits", 
+ XPUKernelSet({phi::DataType::FLOAT32})}, + {"shape", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT64})}, + {"sigmoid", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sign", XPUKernelSet({phi::DataType::FLOAT32})}, + {"slice_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32})}, + {"slice", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"softmax", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softmax_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softmax_with_cross_entropy", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"softplus", XPUKernelSet({phi::DataType::FLOAT32})}, + {"softplus_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"split", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32})}, + {"split_with_num", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT32})}, + {"sqrt", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"square_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"square", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"squeeze2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"squeeze2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"squeeze_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"squeeze", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"stack", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"stack_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"strided_slice", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT16, + phi::DataType::INT32})}, + {"strided_slice_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT16, + phi::DataType::INT32})}, + {"sum", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"swish", XPUKernelSet({phi::DataType::FLOAT32})}, + {"swish_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"tanh_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"tanh", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"temporal_shift", XPUKernelSet({phi::DataType::FLOAT32})}, + {"temporal_shift_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"tril_triu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"tril_triu_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"tile", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"transpose2_grad", + 
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"transpose2", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"transpose_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"transpose", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"truncated_gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"top_k", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"top_k_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"update_loss_scaling", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"unsqueeze2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"unsqueeze2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"unsqueeze_with_xshape", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, + {"unsqueeze_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"unsqueeze", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"warpctc_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"warpctc", XPUKernelSet({phi::DataType::FLOAT32})}, + {"where_index", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"where", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + + // AddMore + {"sequence_conv", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sequence_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sequence_unpad", XPUKernelSet({phi::DataType::FLOAT32})}, + // Fused op + {"resnet_basic_block_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"resnet_basic_block", XPUKernelSet({phi::DataType::FLOAT32})}, + {"fused_gemm_epilogue", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_gemm_epilogue_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_attention", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_attention_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_feedforward", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"fused_feedforward_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + }; + + return s_xpu2_kernels; +} + +} // namespace xpu +} // namespace backends +} // namespace phi +#endif diff --git a/paddle/phi/backends/xpu/xpu_op_list.cc b/paddle/phi/backends/xpu/xpu_op_list.cc index 9ec6f67c6a5ec..edcf81183be42 100644 --- a/paddle/phi/backends/xpu/xpu_op_list.cc +++ b/paddle/phi/backends/xpu/xpu_op_list.cc @@ -10,11 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_op_list.h" - #include #include #include #include +#include "paddle/phi/backends/xpu/xpu_info.h" namespace phi { namespace backends { @@ -35,7 +35,7 @@ static void tokenize(const std::string& ops, op_set->insert(ops.substr(beg)); } -bool is_in_xpu_black_list(const std::string& op_name) { +bool is_in_xpu_black_list(const std::string& fluid_op_name) { static bool inited = false; static std::unordered_set xpu_black_list; static std::mutex s_mtx; @@ -54,7 +54,20 @@ bool is_in_xpu_black_list(const std::string& op_name) { } } } - if (xpu_black_list.find(op_name) != xpu_black_list.end()) { + if (xpu_black_list.find(fluid_op_name) != xpu_black_list.end()) { + return true; + } + return false; +} + +bool is_xpu_support_op(const std::string& fluid_op_name, + const phi::DataType type) { + if (is_in_xpu_black_list(fluid_op_name)) return false; + auto v = get_xpu_version(0); + auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() + : get_kl2_ops(); + if (ops.find(fluid_op_name) != ops.end() && + ops[fluid_op_name].find(type) != ops[fluid_op_name].end()) { return true; } return false; diff --git a/paddle/phi/backends/xpu/xpu_op_list.h b/paddle/phi/backends/xpu/xpu_op_list.h index de322f025c3aa..17b2f1c6965a6 100644 --- a/paddle/phi/backends/xpu/xpu_op_list.h +++ b/paddle/phi/backends/xpu/xpu_op_list.h @@ -12,12 +12,23 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include +#include +#include "paddle/phi/common/data_type.h" namespace phi { namespace backends { namespace xpu { -bool is_in_xpu_black_list(const std::string& op_name); +using XPUKernelSet = std::unordered_set; +using XPUOpMap = std::unordered_map; + +XPUOpMap& get_kl1_ops(); +XPUOpMap& get_kl2_ops(); + +bool is_in_xpu_black_list(const std::string& fluid_op_name); +bool is_xpu_support_op(const std::string& fluid_op_name, + const phi::DataType type); } // namespace xpu } // namespace backends diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 3e714d7fe37cc..9b4fc5351111f 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -151,7 +151,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) VLOG(6) << "fluid_op_name: " << TransToFluidOpName(kernel_name); if ((FLAGS_enable_api_kernel_fallback && kernel_iter == iter->second.end()) || - phi::backends::xpu::is_in_xpu_black_list(TransToFluidOpName(kernel_name)) + !phi::backends::xpu::is_xpu_support_op(TransToFluidOpName(kernel_name), + kernel_key.dtype()) #else if ((FLAGS_enable_api_kernel_fallback && kernel_iter == iter->second.end()) #endif From e4ee872cb642613033d30c468cde6a8f451d7802 Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Tue, 6 Dec 2022 14:40:12 +0800 Subject: [PATCH 11/60] update for untrainable params for stage3. 
 (#48577)
---
 .../sharding/group_sharded_stage3.py          | 13 +-
 .../distributed/sharding/group_sharded.py     |  4 +-
 .../dygraph_group_sharded_stage3_eager.py     | 178 ++++++++++++++++++
 ...est_dygraph_group_sharded_api_for_eager.py |  4 +
 4 files changed, 196 insertions(+), 3 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_eager.py

diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
index ac41b4af4c9b0..ad4d53cb08254 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
@@ -346,7 +346,7 @@ def _add_manage_info(trainable_param):
 
         current_params = list()
         for p in current_layer_params:
-            if p.trainable and p._numel() > self._segment_size:
+            if p._numel() > self._segment_size:
                 current_params.append(_add_manage_info(p))
             elif p.trainable:
                 self._unslice_params.add(_UnsliceParam(p))
@@ -430,7 +430,11 @@ def _param_storage(self, param, buffer_size):
         param.status = "part"
 
         # Updata optimizer master weights
-        if param.dtype == Type.fp16.value and not self._offload:
+        if (
+            param.trainable
+            and param.dtype == Type.fp16.value
+            and not self._offload
+        ):
             master_tensor = paddle.cast(param.fw_storage, Type.fp32.value)
             master_tensor.name = param.name
             self._optim._master_weights[param.fw_storage.name] = master_tensor
@@ -599,6 +603,9 @@ def _register_backward_hooks(self):
     def _get_allreduce_fn(self, param):
         @paddle.autograd.no_grad()
         def allreduce_(*_):
+            assert (
+                param.trainable
+            ), "the param must be trainable for grad allreduce"
             if param.name in self._task_flow.full_grad.keys():
                 full_grad = self._task_flow.full_grad[param.name]
                 # Only support sync allreduce current rank's layer now
@@ -962,6 +969,8 @@ def _allgather_buffer(
 @paddle.autograd.no_grad()
 def _create_params_grad(trainable_params, param2buffer_size, task_flow):
     for param in trainable_params:
+        if not param.trainable:
+            continue
         if param.name in task_flow.full_grad.keys():
             continue
         assert isinstance(param2buffer_size[param.name], int)
diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py
index a69718261d909..012008913eee0 100644
--- a/python/paddle/distributed/sharding/group_sharded.py
+++ b/python/paddle/distributed/sharding/group_sharded.py
@@ -140,7 +140,9 @@ def check_dtype(param):
     params_fp16 = list(filter(check_dtype, model.parameters()))
     if scaler is None and len(params_fp16) > 0:
-        raise ValueError("Please enter the correct scaler.")
+        logger_.warning(
+            "the input scaler is None, please ensure the logic of your external scaler is the same as GroupShardedScaler."
+        )
     # convert model/optimizer/scaler
     if level in ['os', 'os_g']:
         logger_.info("*" * 30)
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_eager.py
new file mode 100644
index 0000000000000..efd7a7b1ce70c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage3_eager.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +import paddle +from paddle import nn +from paddle.distributed.sharding import group_sharded_parallel +from paddle.fluid.framework import _test_eager_guard + +paddle.seed(2022) +np.random.seed(2022) + + +class Model(nn.Layer): + def __init__(self): + super(Model, self).__init__() + self.first_stage = nn.Linear(4096, 4096, bias_attr=False) + self.center_stage = nn.Linear(4096, 4096) + self.center_stage.weight.stop_gradient = True + self.center_stage.bias.stop_gradient = True + self.final_stage = nn.Linear(4096, 2, bias_attr=False) + + def forward(self, x): + x = self.first_stage(x) + x = self.center_stage(x) + x = self.final_stage(x) + return x + + +def optimizer_setting(model, use_multi_precision): + optimizer = paddle.optimizer.AdamW( + learning_rate=0.001, + parameters=model.parameters(), + multi_precision=use_multi_precision, + ) + return optimizer + + +def train_mlp( + model, + shard_level="p_g_os", + use_multi_precision=False, + output_dir="", + amp_level='O1', + sync_buffers=False, + use_sharding=True, + data=None, +): + optimizer = optimizer_setting( + model=model, use_multi_precision=use_multi_precision + ) + if use_multi_precision: + model = paddle.amp.decorate(models=model, level=amp_level) + + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + + if use_sharding: + model, optimizer, scaler = group_sharded_parallel( + model=model, + optimizer=optimizer, + level=shard_level, + scaler=scaler, + sync_buffers=sync_buffers, + ) + + res_loss = [] + for i in range(20): + model.train() + img = data[i] + with paddle.amp.auto_cast(use_multi_precision, level=amp_level): + out = model(img) + avg_loss = out.mean() + + res_loss.append(avg_loss.item()) + + if not use_multi_precision: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + + optimizer.clear_grad() + + return res_loss + + +def test_sharding_api(): + paddle.distributed.init_parallel_env() + + # just test warning + model = Model() + model = paddle.amp.decorate(models=model, level="O2") + optimizer = optimizer_setting(model=model, use_multi_precision=True) + model, optimizer, scaler = group_sharded_parallel( + model=model, + optimizer=optimizer, + level="p_g_os", + ) + + data = [paddle.randn([8, 4096]) for i in range(20)] + + model = Model() + sd3_model = Model() + sd3_model.set_state_dict(model.state_dict()) + + # dp fp32 + dp_fp32_loss = train_mlp( + model, use_multi_precision=False, use_sharding=False, data=data + ) + + # stage3 fp32 + sd3_fp32_loss = train_mlp( + sd3_model, + shard_level="p_g_os", + use_multi_precision=False, + use_sharding=True, + data=data, + ) + + print("dp_fp32_loss: ", dp_fp32_loss) + print("sd3_fp32_loss: ", sd3_fp32_loss) + + for i in range(len(dp_fp32_loss)): + np.testing.assert_allclose( + np.array(dp_fp32_loss[i]), + np.array(sd3_fp32_loss[i]), + rtol=1e-8, + atol=1e-8, + ) + + model = Model() + sd3_model = Model() + sd3_model.set_state_dict(model.state_dict()) + + # dp fp16 + dp_fp16_loss = train_mlp( + model, use_multi_precision=True, use_sharding=False, data=data + ) + + # stage3 fp16 + 
sd3_fp16_loss = train_mlp(
+        sd3_model,
+        shard_level="p_g_os",
+        use_multi_precision=True,
+        use_sharding=True,
+        data=data,
+    )
+
+    print("dp_fp16_loss: ", dp_fp16_loss)
+    print("sd3_fp16_loss: ", sd3_fp16_loss)
+
+    for i in range(len(dp_fp16_loss)):
+        np.testing.assert_allclose(
+            np.array(dp_fp16_loss[i]),
+            np.array(sd3_fp16_loss[i]),
+            rtol=1e-5,
+            atol=1e-5,
+        )
+
+
+if __name__ == '__main__':
+    with _test_eager_guard():
+        test_sharding_api()
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py
index a8b9a3229bdd0..ecf864cf806f6 100644
--- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py
+++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py
@@ -27,6 +27,10 @@ class TestDygraphGroupSharded(TestMultipleGpus):
     def test_dygraph_group_sharded(self):
        self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')
 
+    # check stage3 for some functions; the name must differ from the
+    # method above, otherwise this definition shadows it and only one runs.
+    def test_dygraph_group_sharded_stage3(self):
+        self.run_mnist_2gpu('dygraph_group_sharded_stage3_eager.py')
+
 
 if __name__ == "__main__":
     unittest.main()

From 888631b6042899d25c95e8a647cfaded383f0cb7 Mon Sep 17 00:00:00 2001
From: Nyakku Shigure
Date: Tue, 6 Dec 2022 17:02:48 +0800
Subject: [PATCH 12/60] [CodeStyle][isort][Dy2St] sort imports in test_error (#48746)

* [CodeStyle][isort][Dy2St] sort imports in test_error

* update lineno
---
 pyproject.toml                                |  1 -
 .../unittests/dygraph_to_static/test_error.py | 22 ++++++++++---------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2530f0936b03d..926f20aa34bcb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,5 @@ extend_skip_glob = [
     "python/paddle/fluid/tests/unittests/mlu/**",
 
     # These files will be fixed in the future
-    "python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py",
     "python/paddle/jit/**",
 ]
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
index 805970814425d..ac91f9a74efac 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
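+# Note on the reordering below: isort places standard-library imports first,
+# then third-party packages, then paddle-local modules, alphabetized within
+# each group and separated by blank lines.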
-import os
 import inspect
+import os
 import unittest
+
 import numpy as np
+
 import paddle
 import paddle.fluid as fluid
 from paddle.jit.dy2static import error
@@ -254,11 +256,11 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 33, in func_error_in_compile_time'.format(
+            'File "{}", line 35, in func_error_in_compile_time'.format(
                 self.filepath
             ),
             'inner_func()',
-            'File "{}", line 26, in inner_func'.format(self.filepath),
+            'File "{}", line 28, in inner_func'.format(self.filepath),
             'def inner_func():',
             'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
             '<--- HERE',
@@ -285,7 +287,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 44, in func_error_in_compile_time_2'.format(
+            'File "{}", line 46, in func_error_in_compile_time_2'.format(
                 self.filepath
             ),
             'def func_error_in_compile_time_2(x):',
@@ -311,7 +313,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 89, in forward'.format(self.filepath),
+            'File "{}", line 91, in forward'.format(self.filepath),
             '@paddle.jit.to_static',
             'def forward(self):',
             'self.test_func()',
@@ -335,7 +337,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 52, in func_error_in_runtime'.format(
+            'File "{}", line 54, in func_error_in_runtime'.format(
                 self.filepath
             ),
             'x = fluid.dygraph.to_variable(x)',
@@ -352,7 +354,7 @@ def set_func(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 104, in func_error_in_runtime_with_empty_line'.format(
+            'File "{}", line 106, in func_error_in_runtime_with_empty_line'.format(
                 self.filepath
             ),
             'two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")',
@@ -399,7 +401,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 78, in forward'.format(self.filepath),
+            'File "{}", line 80, in forward'.format(self.filepath),
             'def forward(self, x):',
             'y = self._linear(x)',
             'z = fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
@@ -433,9 +435,9 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 116, in forward'.format(self.filepath),
+            'File "{}", line 118, in forward'.format(self.filepath),
             'return self.inner_net.forward(x)',
-            'File "{}", line 125, in forward'.format(self.filepath),
+            'File "{}", line 127, in forward'.format(self.filepath),
             'def forward(self, x):',
             'out = paddle.matmul(self.w, x)',
             '<--- HERE',

From 0a2dfa380332ea38e0df3076cbcc1cac3417202f Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Tue, 6 Dec 2022 18:16:44 +0800
Subject: [PATCH 13/60] Clear extra input (Bias, ResidualData) in OpMaker of
 conv2d (#47579)

* delete Bias and ResidualData in OpMaker of conv2d

* delete extra input of conv3d

* refactor pass of conv_bias_fusion

* fix mkldnn dependency

* fix mkldnn compile

* fix test_conv_bias_mkldnn_fuse_pass

* polish some code

* remove useless log

* fix analyzer_vit_ocr_tester

* fix conv_activation_mkldnn_fuse_pass

* fix test_analyzer_ocr

* add fused_conv_sig

* fix performance regression

* fix performance regression
---
 paddle/fluid/framework/ir/graph_helper.cc     |   2 +-
 .../framework/ir/graph_pattern_detector.cc    |  13 +-
 .../framework/ir/graph_pattern_detector.h     |   2 +-
 .../compute_propagate_scales_mkldnn_pass.cc   |   4 +-
 .../conv_activation_mkldnn_fuse_pass.cc       |  41 +-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc   |  87 +++-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h    |   8 +
.../conv_bias_mkldnn_fuse_pass_tester.cc | 3 +- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 11 +- .../framework/ir/mkldnn/cpu_quantize_pass.h | 4 +- .../ir/mkldnn/cpu_quantize_placement_pass.cc | 29 +- .../mkldnn/params_quantization_mkldnn_pass.cc | 2 +- paddle/fluid/framework/ir/pass.h | 2 +- paddle/fluid/framework/op_desc.cc | 2 +- .../inference/api/mkldnn_quantizer_config.cc | 6 + .../inference/api/paddle_analysis_config.h | 1 + .../tests/api/analyzer_vit_ocr_tester.cc | 2 +- .../fluid/operators/compat/fused_conv2d.pbtxt | 54 +++ .../fluid/operators/compat/fused_conv3d.pbtxt | 54 +++ paddle/fluid/operators/conv_op.cc | 6 - .../fluid/operators/fused/fused_conv2d_op.cc | 98 +++++ paddle/fluid/pybind/pybind.cc | 1 - paddle/phi/kernels/CMakeLists.txt | 1 + .../fusion/onednn/fused_conv_kernel.cc | 147 +++++++ paddle/phi/kernels/onednn/conv_function.h | 408 ++++++++++++++++++ paddle/phi/kernels/onednn/conv_kernel.cc | 381 +--------------- paddle/phi/ops/compat/fused_conv_sig.cc | 56 +++ .../test_conv_bias_mkldnn_fuse_pass.py | 6 +- 28 files changed, 1013 insertions(+), 418 deletions(-) create mode 100644 paddle/fluid/operators/compat/fused_conv2d.pbtxt create mode 100644 paddle/fluid/operators/compat/fused_conv3d.pbtxt create mode 100644 paddle/fluid/operators/fused/fused_conv2d_op.cc create mode 100644 paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc create mode 100644 paddle/phi/kernels/onednn/conv_function.h create mode 100644 paddle/phi/ops/compat/fused_conv_sig.cc diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index fe2c9adf68fec..7e6ef668fb398 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -713,7 +713,7 @@ static void GetGraphOpDesc(const std::vector &nodes, UpdateControlOpSkipEagerDeletionVars(*n, graph, graph_idx, n->Name()); } ops->emplace_back(*n->Op()); - VLOG(4) << n->ToString(); + VLOG(5) << n->ToString(); } // delete no OpDesc op } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 9912cee3838db..475792c5564b3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2068,8 +2068,9 @@ PDNode *patterns::Flatten2Matmul::operator()() { return matmul_out; } -PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { - auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); +PDNode *patterns::ConvResidual::operator()(const std::string &conv_type, + bool with_residual_data) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op(conv_type); if (!with_residual_data) { conv_op->assert_more([&](Node *x) { @@ -2082,22 +2083,22 @@ PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { auto input_var = pattern->NewNode(conv_input_repr()) ->AsInput() - ->assert_is_op_input("conv2d", "Input"); + ->assert_is_op_input(conv_type, "Input"); auto filter_var = pattern->NewNode(conv_filter_repr()) ->AsInput() - ->assert_is_op_input("conv2d", "Filter"); + ->assert_is_op_input(conv_type, "Filter"); auto output_var = pattern->NewNode(conv_output_repr()) ->AsOutput() - ->assert_is_op_output("conv2d", "Output"); + ->assert_is_op_output(conv_type, "Output"); std::vector links_from{input_var, filter_var}; if (with_residual_data) { auto res_conn_var = pattern->NewNode(conv_residual_data_repr()) ->AsInput() - ->assert_is_op_input("conv2d", "ResidualData"); + ->assert_is_op_input(conv_type, 
"ResidualData"); links_from.push_back(res_conn_var); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 8263a19756b1d..1674cac012150 100755 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1057,7 +1057,7 @@ struct ConvResidual : public PatternBase { ConvResidual(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_residual") {} - PDNode* operator()(bool with_residual_data); + PDNode* operator()(const std::string& conv_type, bool with_residual_data); PATTERN_DECL_NODE(conv_op); PATTERN_DECL_NODE(conv_input); diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index f1686e445fc1b..3fb7636f06fd0 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -319,7 +319,7 @@ void ComputePropagateScalesMkldnnPass::ComputeWeightScales( ir::Graph* graph, Scope* scope, StringPairMap* var_quant_scales) const { ComputeVarScales(graph, scope, - {"conv2d", "depthwise_conv2d"}, + {"conv2d", "depthwise_conv2d", "fused_conv2d"}, "Filter", 1, var_quant_scales); @@ -446,7 +446,7 @@ void ComputePropagateScalesMkldnnPass::UpdateReluOutputScales( if (op->Type() == "relu") { is_unsigned = true; } else { - if (op->Type() == "conv2d") { + if (op->Type() == "conv2d" || op->Type() == "fused_conv2d") { act_name = "fuse_activation"; output_name = "Output"; } else if (op->Type() == "fc") { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 3cd3cc8f7b054..a673aafadccfc 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -26,7 +26,7 @@ using string::PrettyLogDetail; void ConvActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { auto act_types = phi::funcs::GetSupportedActivations(); - std::vector conv_types = {"conv2d"}; + std::vector conv_types = {"conv2d", "fused_conv2d"}; for (auto& act_type : act_types) { FuseConvConcatAct(graph, act_type); @@ -218,6 +218,45 @@ ConvActivationMkldnnFusePass::ConvActivationMkldnnFusePass() { .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); + AddOpCompat(OpCompat("fused_conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsOptional() + .IsTensor() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsOptional() + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + AddOpCompat(OpCompat("concat")) .AddInput("X") .End() diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index bf88b82fc30a1..13cd875431603 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ 
-61,6 +61,40 @@ ConvBiasFusePass::ConvBiasFusePass() {
       .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
       .End();
 
+  AddOpCompat(OpCompat("fused_conv2d"))
+      .AddInput("Input")
+      .IsTensor()
+      .End()
+      .AddInput("Filter")
+      .IsTensor()
+      .End()
+      .AddInput("Bias")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Output")
+      .IsTensor()
+      .End()
+      .AddAttr("strides")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("paddings")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("padding_algorithm")
+      .IsOptional()
+      .IsStringIn({"EXPLICIT", "SAME", "VALID"})
+      .End()
+      .AddAttr("groups")
+      .IsNumGE(1)
+      .End()
+      .AddAttr("dilations")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("data_format")
+      .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
+      .End();
+
   AddOpCompat(OpCompat("elementwise_add"))
       .AddInput("X")
       .IsTensor()
@@ -165,6 +199,40 @@ Conv3DBiasFusePass::Conv3DBiasFusePass() {
       .IsStringIn({"NDHWC", "NCDHW"})
       .End();
 
+  AddOpCompat(OpCompat("fused_conv3d"))
+      .AddInput("Input")
+      .IsTensor()
+      .End()
+      .AddInput("Filter")
+      .IsTensor()
+      .End()
+      .AddInput("Bias")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Output")
+      .IsTensor()
+      .End()
+      .AddAttr("strides")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("paddings")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("padding_algorithm")
+      .IsOptional()
+      .IsStringIn({"EXPLICIT", "SAME", "VALID"})
+      .End()
+      .AddAttr("groups")
+      .IsNumGE(1)
+      .End()
+      .AddAttr("dilations")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("data_format")
+      .IsStringIn({"NCHW", "NHWC", "AnyLayout"})
+      .End();
+
   AddOpCompat(OpCompat("elementwise_add"))
       .AddInput("X")
       .IsTensor()
@@ -203,6 +271,16 @@ phi::DenseTensor tensor_apply_eltwise(const phi::DenseTensor& vec_a,
 }
 
 void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
+  FuseConvBias(graph, type(), fused_type());
+  if (type() != fused_type()) {
+    // Is the second pass useful?
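+    // (It appears so: the first sweep rewrites conv2d into fused_conv2d, so
+    // a second trailing elementwise_add, or a fused_conv2d produced by an
+    // earlier pass, is only folded in by this second sweep. This reading is
+    // inferred from the pass structure, not a verified guarantee.)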
+ FuseConvBias(graph, fused_type(), fused_type()); + } +} + +void ConvBiasFusePass::FuseConvBias(ir::Graph* graph, + const std::string& conv_type, + const std::string& fused_conv) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(name_scope_, graph); @@ -216,9 +294,9 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) ->AsInput() - ->assert_is_op_input(type(), "Input"); + ->assert_is_op_input(conv_type, "Input"); patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_); - conv_bias_pattern(conv_input, type()); + conv_bias_pattern(conv_input, conv_type); int found_conv_bias_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -249,7 +327,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { - VLOG(3) << "do not perform " + type() + "+bias fuse"; + VLOG(3) << "do not perform " + conv_type + "+bias fuse"; return; } @@ -294,7 +372,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetInput("Filter", std::vector({conv_weight->Name()})); desc.SetInput("Bias", std::vector({eltwise_bias->Name()})); desc.SetOutput("Output", std::vector({eltwise_out->Name()})); - desc.SetType(type()); + desc.SetType(fused_conv); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); @@ -323,6 +401,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { type()); } } + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index 18e09173491da..d4fb89f091c87 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -32,11 +32,17 @@ class ConvBiasFusePass : public FusePassBase { ConvBiasFusePass(); virtual ~ConvBiasFusePass() {} virtual std::string type() const { return "conv2d"; } + virtual std::string fused_type() const { return "fused_conv2d"; } protected: void ApplyImpl(ir::Graph* graph) const override; + void FuseConvBias(ir::Graph* graph, + const std::string& conv_type, + const std::string& fused_conv) const; + const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; + /* * Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. 
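 * (With this patch the fused result is emitted under the separate
 * fused_conv3d op type instead of reusing conv3d; see the fused_type()
 * overrides below.)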
*/ @@ -44,12 +50,14 @@ class Conv2DTransposeBiasFusePass : public ConvBiasFusePass { public: Conv2DTransposeBiasFusePass(); std::string type() const override { return "conv2d_transpose"; } + std::string fused_type() const override { return "conv2d_transpose"; } }; class Conv3DBiasFusePass : public ConvBiasFusePass { public: Conv3DBiasFusePass(); std::string type() const override { return "conv3d"; } + std::string fused_type() const override { return "fused_conv3d"; } }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 41aea6218b209..c5ee20b4b0162 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -139,7 +139,8 @@ void MainTest(bool convWithExistingBias) { int conv_bias_count = 0; for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "conv2d") { + if (node->IsOp() && (node->Op()->Type() == "conv2d" || + node->Op()->Type() == "fused_conv2d")) { auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index ac509aa604bd6..6995412d055c6 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -388,11 +388,12 @@ void CPUQuantizePass::GetQuantInfo(Graph* graph) const { } void CPUQuantizePass::QuantizeConv(Graph* graph, + const std::string& conv_type, bool with_residual_data) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::ConvResidual conv_pattern{pattern, name_scope_}; - conv_pattern(with_residual_data); + conv_pattern(conv_type, with_residual_data); int quantize_conv_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -510,7 +511,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, AddStatis(quantize_conv_count); LogQuantizedOpsCounter( - "conv2d", + conv_type, quantize_conv_count, ((with_residual_data) ? 
"with residual connection" : "")); } @@ -1247,8 +1248,10 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument("Scope cannot be nullptr.")); GetQuantInfo(graph); - QuantizeConv(graph, false /* with_residual_data */); - QuantizeConv(graph, true /* with_residual_data */); + QuantizeConv(graph, "conv2d", false /* with_residual_data */); + QuantizeConv(graph, "conv2d", true /* with_residual_data */); + QuantizeConv(graph, "fused_conv2d", false /* with_residual_data */); + QuantizeConv(graph, "fused_conv2d", true /* with_residual_data */); QuantizePool(graph); QuantizeConcat(graph); QuantizePriorBox(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index b3c5312197baf..59bf2ab2d4fd0 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -49,7 +49,9 @@ class CPUQuantizePass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; - void QuantizeConv(Graph* graph, bool with_residual_data) const; + void QuantizeConv(Graph* graph, + const std::string& conv_type, + bool with_residual_data) const; void QuantizeFc(Graph* graph, bool with_residual_data) const; void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 70433772ce0c7..08b66d8f2f56e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -25,25 +25,14 @@ class Graph; void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = - std::unordered_set({"concat", - "conv2d", - "depthwise_conv2d", - "elementwise_add", - "elementwise_mul", - "elementwise_sub", - "fc", - "matmul", - "nearest_interp", - "nearest_interp_v2", - "pool2d", - "prior_box", - "reshape2", - "transpose2", - "fusion_gru", - "fusion_lstm", - "multi_gru", - "slice", - "split"}); + std::unordered_set( + {"concat", "conv2d", "depthwise_conv2d", + "fused_conv2d", "fused_conv3d", "elementwise_add", + "elementwise_mul", "elementwise_sub", "fc", + "matmul", "nearest_interp", "nearest_interp_v2", + "pool2d", "prior_box", "reshape2", + "transpose2", "fusion_gru", "fusion_lstm", + "multi_gru", "slice", "split"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = @@ -71,7 +60,6 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern); - if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(), op->id()) != excluded_ids_list.end()) { @@ -81,7 +69,6 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { if (op->Op()->GetAttrIfExists("skip_quant") == 1) { return; } - op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); }; gpd(graph, handler); diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc index e2000708b8a84..5ad1e95cd79c0 100644 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc @@ -120,7 
+120,7 @@ void ParamsQuantizationMkldnnPass::QuantizeConv(ir::Graph* graph, bool with_residual_data) const { GraphPatternDetector gpd; patterns::ConvResidual conv_pattern(gpd.mutable_pattern(), name_scope_); - conv_pattern(with_residual_data); + conv_pattern("conv2d", with_residual_data); int params_to_int8_conv_found = 0; diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index e0315f0b5b741..50d7434c7d97a 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -146,7 +146,7 @@ class Pass { } attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(3) << "deleting " << attr_name; + VLOG(8) << "deleting " << attr_name; delete attr; }; } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index dcc47058b6414..2b84fed6846a8 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -979,7 +979,7 @@ struct SetAttrDescVisitor { }; void OpDesc::Flush() { - VLOG(4) << "Flush " + VLOG(8) << "Flush " << " " << Type() << " " << need_update_; if (need_update_) { this->desc_.mutable_inputs()->Clear(); diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 0beac10903886..646e72fe2885a 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -26,6 +26,12 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL; rules_["conv2d"]["Output"] = ScaleAlgo::KL; + rules_["fused_conv2d"]["Input"] = ScaleAlgo::KL; + rules_["fused_conv2d"]["Filter"] = ScaleAlgo::MAX_CH; + rules_["fused_conv2d"]["Bias"] = ScaleAlgo::NONE; // do not compute scale + rules_["fused_conv2d"]["ResidualData"] = ScaleAlgo::KL; + rules_["fused_conv2d"]["Output"] = ScaleAlgo::KL; + rules_["pool2d"]["X"] = ScaleAlgo::KL; rules_["pool2d"]["Out"] = ScaleAlgo::KL; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f8ddcbdaa8f39..5521caee9f430 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -1172,6 +1172,7 @@ struct PD_INFER_DECL AnalysisConfig { "concat", "conv2d", "depthwise_conv2d", + "fused_conv2d", "elementwise_add", "elementwise_mul", "fc", diff --git a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc index 3870fde8b533a..8180d951050ce 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc @@ -103,7 +103,7 @@ TEST(Analyzer_vit_ocr, fuse_status) { static_cast(predictor.get()), &num_ops); CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33); - CHECK_EQ(fuse_statis.at("conv2d_gelu_mkldnn_fuse_pass"), 2); + CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2); CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16); } #endif diff --git a/paddle/fluid/operators/compat/fused_conv2d.pbtxt b/paddle/fluid/operators/compat/fused_conv2d.pbtxt new file mode 100644 index 0000000000000..c6bdc08f4649b --- /dev/null +++ b/paddle/fluid/operators/compat/fused_conv2d.pbtxt @@ -0,0 +1,54 @@ +type: "fused_conv2d" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + inputs { + name: "ResidualData" + } + outputs { + name: "Output" + } + attrs { + name: "strides" + 
type: INTS
+  }
+  attrs {
+    name: "paddings"
+    type: INTS
+  }
+  attrs {
+    name: "padding_algorithm"
+    type: STRING
+  }
+  attrs {
+    name: "groups"
+    type: INT
+  }
+  attrs {
+    name: "dilations"
+    type: INTS
+  }
+  attrs {
+    name: "data_format"
+    type: STRING
+  }
+  attrs {
+    name: "fuse_activation"
+    type: STRING
+  }
+  attrs {
+    name: "fuse_residual_connection"
+    type: BOOLEAN
+  }
+  attrs {
+    name: "force_fp32_output"
+    type: BOOLEAN
+  }
+}
diff --git a/paddle/fluid/operators/compat/fused_conv3d.pbtxt b/paddle/fluid/operators/compat/fused_conv3d.pbtxt
new file mode 100644
index 0000000000000..038cabf5140de
--- /dev/null
+++ b/paddle/fluid/operators/compat/fused_conv3d.pbtxt
@@ -0,0 +1,54 @@
+type: "fused_conv3d"
+def {
+  inputs {
+    name: "Input"
+  }
+  inputs {
+    name: "Filter"
+  }
+  inputs {
+    name: "Bias"
+  }
+  inputs {
+    name: "ResidualData"
+  }
+  outputs {
+    name: "Output"
+  }
+  attrs {
+    name: "strides"
+    type: INTS
+  }
+  attrs {
+    name: "paddings"
+    type: INTS
+  }
+  attrs {
+    name: "padding_algorithm"
+    type: STRING
+  }
+  attrs {
+    name: "groups"
+    type: INT
+  }
+  attrs {
+    name: "dilations"
+    type: INTS
+  }
+  attrs {
+    name: "data_format"
+    type: STRING
+  }
+  attrs {
+    name: "fuse_activation"
+    type: STRING
+  }
+  attrs {
+    name: "fuse_residual_connection"
+    type: BOOLEAN
+  }
+  attrs {
+    name: "force_fp32_output"
+    type: BOOLEAN
+  }
+}
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 50b90e56c03e0..107e3b5a3de49 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -364,12 +364,6 @@ void Conv3DOpMaker::Make() {
            "is the width of the filter."
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
-  AddInput("ResidualData",
-           "(Tensor) Tensor with residual data "
-           "to which convolution output will be added."
-           "Used with fuse_residual_connection fusion.")
-      .AsDispensable()
-      .AsExtra();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
             "It has same data fromat and data type as the Input.");
diff --git a/paddle/fluid/operators/fused/fused_conv2d_op.cc b/paddle/fluid/operators/fused/fused_conv2d_op.cc
new file mode 100644
index 0000000000000..178c2a963e28f
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_conv2d_op.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/conv_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedConvOpMaker : public Conv2DOpMaker {
+ protected:
+  void Apply() override {
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "int8", "bfloat16"});
+    AddAttr<std::string>("fuse_activation",
+                         "(string, default \"\") Only used in mkldnn kernel")
+        .SetDefault("");
+    AddAttr<bool>("fuse_residual_connection",
+                  "(bool, default false) Only used in mkldnn kernel. Used "
+                  "whenever convolution output is used as an input to the "
+                  "residual connection.")
+        .SetDefault(false);
+    AddAttr<bool>("force_fp32_output",
+                  "(bool, default false) Force INT8 kernel output FP32, only "
+                  "used in MKL-DNN INT8")
+        .SetDefault(false);
+    AddAttr<bool>("use_mkldnn", "(bool, default false) Used in mkldnn kernel")
+        .SetDefault(true);
+    AddComment(R"DOC(
+Convolution Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature.
+Filters(Input) is in MCHW format. Where M is the number of output image channels, C is
+the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+Parameters (strides, paddings, dilations) each contain two elements, representing
+height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+$$
+       H_{out}= \frac{(H_{in} + pad_height_top + pad_height_bottom - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+       W_{out}= \frac{(W_{in} + pad_width_left + pad_width_right - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// fused_conv2d is only used for onednn inference.
+REGISTER_OPERATOR(
+    fused_conv2d,
+    ops::ConvOp,
+    ops::FusedConvOpMaker,
+    ops::ConvOpInferVarType,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+// fused_conv3d is only used for onednn inference.
+REGISTER_OPERATOR(
+    fused_conv3d,
+    ops::ConvOp,
+    ops::FusedConvOpMaker,
+    ops::ConvOpInferVarType,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d85824a9237b2..be8355d023d25 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1197,7 +1197,6 @@ All parameter, weight, gradient are variables in Paddle.
           -> const paddle::framework::AttributeMap & {
         return operators::ExtraInfoUtils::Instance().GetExtraAttrsMap(op_type);
       });
-
   m.def(
       "get_attrtibute_type",
       [](const std::string &op_type,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index ef2231c059ad9..808b18bb02d45 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -118,6 +118,7 @@ if(WITH_MKLDNN)
       "strings/cpu/*.cc"
       "onednn/*.cc"
       "fusion/*.cc"
+      "fusion/onednn/*.cc"
       "fusion/cpu/*.cc")
 else()
   file(
diff --git a/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc
new file mode 100644
index 0000000000000..a49bf03eee6d4
--- /dev/null
+++ b/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/onednn/conv_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void FusedConv2DKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& filter,
+                       const paddle::optional<DenseTensor>& bias,
+                       const paddle::optional<DenseTensor>& residual_param,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const std::string& padding_algorithm,
+                       const std::vector<int>& dilations,
+                       int groups,
+                       const std::string& data_format,
+                       const std::string& mkldnn_data_type,
+                       const std::string& fuse_activation,
+                       bool fuse_residual_conn,
+                       bool force_fp32_output,
+                       DenseTensor* out) {
+  bool is_BFLOAT16 = mkldnn_data_type == "bfloat16";
+
+  ConvOnednn<T>(dev_ctx,
+                &input,
+                &filter,
+                bias.get_ptr(),
+                residual_param.get_ptr(),
+                strides,
+                paddings,
+                padding_algorithm,
+                dilations,
+                groups,
+                data_format,
+                true,
+                is_BFLOAT16,
+                fuse_activation,
+                fuse_residual_conn,
+                force_fp32_output,
+                out);
+}
+
+template <typename T, typename Context>
+void FusedDepthwiseConv2DKernel(
+    const Context& dev_ctx,
+    const DenseTensor& input,
+    const DenseTensor& filter,
+    const paddle::optional<DenseTensor>& bias,
+    const paddle::optional<DenseTensor>& residual_param,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const std::string& padding_algorithm,
+    const std::vector<int>& dilations,
+    int groups,
+    const std::string& data_format,
+    const std::string& mkldnn_data_type,
+    const std::string& fuse_activation,
+    bool fuse_residual_conn,
+    bool force_fp32_output,
+    DenseTensor* out) {
+  bool is_BFLOAT16 = mkldnn_data_type == "bfloat16";
+
+  ConvOnednn<T>(dev_ctx,
+                &input,
+                &filter,
+                bias.get_ptr(),
+                residual_param.get_ptr(),
+                strides,
+                paddings,
+                padding_algorithm,
+                dilations,
+                groups,
+                data_format,
+                true,
+                is_BFLOAT16,
+                fuse_activation,
+                fuse_residual_conn,
+                force_fp32_output,
+                out);
+}
+
+template <typename T, typename Context>
+void FusedConv3DKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& filter,
+                       const paddle::optional<DenseTensor>& bias,
+                       const paddle::optional<DenseTensor>& residual_param,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const std::string& padding_algorithm,
+                       const std::vector<int>& dilations,
+                       int groups,
+                       const std::string& data_format,
+                       const std::string& mkldnn_data_type,
+                       const std::string& fuse_activation,
+                       bool fuse_residual_conn,
+                       bool force_fp32_output,
+                       DenseTensor* out) {
+  bool is_BFLOAT16 = mkldnn_data_type == "bfloat16";
+
+  ConvOnednn<T>(dev_ctx,
+                &input,
+                &filter,
+                bias.get_ptr(),
+                residual_param.get_ptr(),
+                strides,
+                paddings,
+                padding_algorithm,
+                dilations,
+                groups,
+                data_format,
+                true,
+                is_BFLOAT16,
+                fuse_activation,
+                fuse_residual_conn,
+                force_fp32_output,
+                out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(fused_conv2d,
+                   OneDNN,
+                   ONEDNN,
+                   phi::FusedConv2DKernel,
+                   float,
+                   phi::dtype::bfloat16,
+                   uint8_t,
+                   int8_t) {}
+
+PD_REGISTER_KERNEL(
+    fused_conv3d, OneDNN, ONEDNN, phi::FusedConv3DKernel, float) {}
diff --git a/paddle/phi/kernels/onednn/conv_function.h b/paddle/phi/kernels/onednn/conv_function.h
new file mode 100644
index 0000000000000..4b3c4d58895cc
--- /dev/null
+++ 
b/paddle/phi/kernels/onednn/conv_function.h @@ -0,0 +1,408 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" +#include "paddle/phi/kernels/onednn/conv_handler.h" + +namespace phi { + +static dnnl::memory::data_type GetDstType( + bool is_int8, + bool is_bfloat16, + bool force_fp32_output, + std::string fuse_activation, + bool fuse_residual_conn, + const phi::DenseTensor* residual_param) { + auto dst_dt = dnnl::memory::data_type::f32; + if (is_int8) { + dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") + ? dnnl::memory::data_type::u8 + : dnnl::memory::data_type::s8; + if (force_fp32_output) { + dst_dt = dnnl::memory::data_type::f32; + } + if (fuse_residual_conn && residual_param) { + auto residual_dt = funcs::ToOneDNNDataType(residual_param->dtype()); + if (dst_dt != residual_dt) dst_dt = residual_dt; + } + } else { + if (!force_fp32_output && is_bfloat16) { + dst_dt = dnnl::memory::data_type::bf16; + if (fuse_residual_conn && residual_param) { + dst_dt = funcs::ToOneDNNDataType(residual_param->dtype()); + } + } + } + return dst_dt; +} + +#define PD_VISIT_FLOAT_AND_INT8_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +template +void ComputeFP32(const OneDNNContext& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + DenseTensor* output) { + const auto& onednn_engine = dev_ctx.GetEngine(); + const bool is_conv3d = strides.size() == 3U; + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; + PD_VISIT_FLOAT_AND_INT8_TYPES( + filter->dtype(), "ConvOneDNNHandlerT", ([&] { + onednn::ConvOneDNNHandlerT handler(dev_ctx, + onednn_engine, + dev_ctx.GetPlace(), + input, + filter, + bias, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + output, + unique_name); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, is_conv3d, is_test); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + dst_memory_p = + handler.AcquireDstMemoryWithResidual(output, residual_param); + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } + + auto conv_p = handler.AcquireForwardPrimitive(); + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + auto bias_memory_p = + handler.AcquireBiasMemoryWithReorder(bias, is_test); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + output->set_mem_desc(dst_memory_p->get_desc()); + })); +} + +template +void ComputeINT8(const OneDNNContext& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_BFLOAT16, + const std::string& fuse_activation, + bool fuse_residual_conn, + bool force_fp32_output, + DenseTensor* output) { + const auto& onednn_engine = dev_ctx.GetEngine(); + const bool is_conv3d = strides.size() == 3U; + + bool unsigned_output = + (fuse_activation == "relu" || fuse_activation == "relu6"); + bool need_s8_to_u8 = false; + + PADDLE_ENFORCE_NE( + is_conv3d, + true, + phi::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, + false, + phi::errors::Unimplemented( + "residual fusion does not support force output with fp32")); + const std::string& unique_name = + dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; + PD_VISIT_FLOAT_AND_INT8_TYPES( + 
filter->dtype(), "ConvMKLDNNHandlerT", ([&] { + onednn::ConvOneDNNHandlerT handler(dev_ctx, + onednn_engine, + dev_ctx.GetPlace(), + input, + filter, + bias, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + output, + unique_name); + + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); + + const auto& scale_weights_data = + dev_ctx.HasDnnAttr("Scale_weights") + ? PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("Scale_weights")) + : std::vector{1.0f}; + const bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel + ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) + : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, true, scale_weights_data, mask_reorder); + + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + PADDLE_ENFORCE_EQ( + output->dims(), + residual_param->dims(), + phi::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), + residual_param->dims().size())); + dst_memory_p = + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (funcs::OneDNNGetDataType() == + dnnl::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } + + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + std::vector bias_scales; + auto p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), + bias_scales)); + if (dev_ctx.HasDnnAttr("Bias_scales")) { + bias_scales = PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("Bias_scales")); + p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), + bias_scales)); + } else { + p_scales_tuple = handler.get_int8_bias_scales( + filter, groups, scale_weights_data); + } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, + true, + std::get<1>(*p_scales_tuple), + std::get<0>(*p_scales_tuple)); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + + if (need_s8_to_u8) { + dev_ctx.Alloc(output); + } + + output->set_mem_desc(dst_memory_p->get_desc()); + })); +} + +template +void ConvOnednn(const Context& dev_ctx, + const DenseTensor* input, + const DenseTensor* filter, + const DenseTensor* bias, + const DenseTensor* residual_param, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + const std::vector& dilations, + int groups, + const std::string& data_format, + bool is_test, + bool is_bfloat16, + const std::string& fuse_activation, + bool fuse_residual_connection, + bool force_fp32_output, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace")); + + bool is_INT8 = funcs::is_int8(); + + auto dst_dt = GetDstType(is_INT8, + is_bfloat16, + force_fp32_output, + fuse_activation, + fuse_residual_connection, + residual_param); + if (!is_INT8) { + if (dst_dt == dnnl::memory::data_type::f32) { + ComputeFP32(dev_ctx, + input, + 
filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_bfloat16, + fuse_activation, + fuse_residual_connection, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::bf16) { + ComputeFP32(dev_ctx, + input, + filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_bfloat16, + fuse_activation, + fuse_residual_connection, + force_fp32_output, + out); + } + } else { + if (dst_dt == dnnl::memory::data_type::f32) { + ComputeINT8(dev_ctx, + input, + filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_bfloat16, + fuse_activation, + fuse_residual_connection, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::u8) { + ComputeINT8(dev_ctx, + input, + filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_bfloat16, + fuse_activation, + fuse_residual_connection, + force_fp32_output, + out); + } else if (dst_dt == dnnl::memory::data_type::s8) { + ComputeINT8(dev_ctx, + input, + filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_bfloat16, + fuse_activation, + fuse_residual_connection, + force_fp32_output, + out); + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/onednn/conv_kernel.cc b/paddle/phi/kernels/onednn/conv_kernel.cc index e2faaea6b023a..1e54ba0337e22 100644 --- a/paddle/phi/kernels/onednn/conv_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_kernel.cc @@ -17,265 +17,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/data_layout_transform.h" -#include "paddle/phi/kernels/onednn/conv_handler.h" +#include "paddle/phi/kernels/onednn/conv_function.h" namespace phi { -static dnnl::memory::data_type GetDstType( - bool is_int8, - bool is_bfloat16, - bool force_fp32_output, - std::string fuse_activation, - bool fuse_residual_conn, - const phi::DenseTensor* residual_param) { - auto dst_dt = dnnl::memory::data_type::f32; - if (is_int8) { - dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") - ? dnnl::memory::data_type::u8 - : dnnl::memory::data_type::s8; - if (force_fp32_output) { - dst_dt = dnnl::memory::data_type::f32; - } - if (fuse_residual_conn && residual_param) { - auto residual_dt = funcs::ToOneDNNDataType(residual_param->dtype()); - if (dst_dt != residual_dt) dst_dt = residual_dt; - } - } else { - if (!force_fp32_output && is_bfloat16) { - dst_dt = dnnl::memory::data_type::bf16; - if (fuse_residual_conn && residual_param) { - dst_dt = funcs::ToOneDNNDataType(residual_param->dtype()); - } - } - } - return dst_dt; -} - -#define PD_VISIT_FLOAT_AND_INT8_TYPES(TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() - -template -void ComputeFP32(const OneDNNContext& dev_ctx, - const DenseTensor* input, - const DenseTensor* filter, - const DenseTensor* bias, - const DenseTensor* residual_param, - const std::vector& strides, - const std::vector& paddings, - const std::string& padding_algorithm, - const std::vector& dilations, - int groups, - const std::string& data_format, - bool is_test, - bool is_BFLOAT16, - const std::string& fuse_activation, - bool fuse_residual_conn, - bool force_fp32_output, - DenseTensor* output) { - const auto& onednn_engine = dev_ctx.GetEngine(); - const bool is_conv3d = strides.size() == 3U; - const std::string& unique_name = - dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; - PD_VISIT_FLOAT_AND_INT8_TYPES( - filter->dtype(), "ConvOneDNNHandlerT", ([&] { - onednn::ConvOneDNNHandlerT handler(dev_ctx, - onednn_engine, - dev_ctx.GetPlace(), - input, - filter, - bias, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - output, - unique_name); - auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, groups, is_conv3d, is_test); - std::shared_ptr dst_memory_p; - if (fuse_residual_conn) { - dst_memory_p = - handler.AcquireDstMemoryWithResidual(output, residual_param); - } else { - dst_memory_p = handler.template AcquireDstMemory(output); - } - - auto conv_p = handler.AcquireForwardPrimitive(); - std::unordered_map args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - if (bias) { - auto bias_memory_p = - handler.AcquireBiasMemoryWithReorder(bias, is_test); - args.insert({DNNL_ARG_BIAS, *bias_memory_p}); - } - - auto& astream = OneDNNContext::tls().get_stream(); - conv_p->execute(astream, args); - astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc()); - })); -} - -template -void ComputeINT8(const OneDNNContext& dev_ctx, - const DenseTensor* input, - const DenseTensor* filter, - const DenseTensor* bias, - const DenseTensor* residual_param, - const std::vector& strides, - const std::vector& paddings, - const std::string& padding_algorithm, - const std::vector& dilations, - int groups, - const std::string& data_format, - bool is_test, - bool is_BFLOAT16, - const std::string& fuse_activation, - bool fuse_residual_conn, - bool force_fp32_output, - DenseTensor* output) { - const auto& onednn_engine = dev_ctx.GetEngine(); - const bool is_conv3d = strides.size() == 3U; - - bool unsigned_output = - (fuse_activation == "relu" || fuse_activation == "relu6"); - bool need_s8_to_u8 = false; - - PADDLE_ENFORCE_NE( - is_conv3d, - true, - phi::errors::Unimplemented( - "OneDNN int8 convolution does not support 3D inputs currently")); - PADDLE_ENFORCE_EQ( - fuse_residual_conn && force_fp32_output, - false, - phi::errors::Unimplemented( - "residual fusion does not support force output with fp32")); - const std::string& unique_name = - dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0]; - PD_VISIT_FLOAT_AND_INT8_TYPES( - 
filter->dtype(), "ConvMKLDNNHandlerT", ([&] { - onednn::ConvOneDNNHandlerT handler(dev_ctx, - onednn_engine, - dev_ctx.GetPlace(), - input, - filter, - bias, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - output, - unique_name); - - auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - - const auto& scale_weights_data = - dev_ctx.HasDnnAttr("Scale_weights") - ? PADDLE_GET_CONST(std::vector, - dev_ctx.GetDnnAttr("Scale_weights")) - : std::vector{1.0f}; - const bool is_multi_channel = scale_weights_data.size() > 1; - int mask_reorder = is_multi_channel - ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) - : 0; - auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, groups, false, true, scale_weights_data, mask_reorder); - - std::shared_ptr dst_memory_p; - if (fuse_residual_conn) { - PADDLE_ENFORCE_EQ( - output->dims(), - residual_param->dims(), - phi::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - output->dims().size(), - residual_param->dims().size())); - dst_memory_p = - handler.AcquireDstMemoryWithResidual(output, residual_param); - need_s8_to_u8 = (funcs::OneDNNGetDataType() == - dnnl::memory::data_type::s8) && - unsigned_output; - } else { - dst_memory_p = handler.template AcquireDstMemory(output); - } - - auto conv_p = handler.AcquireForwardPrimitive(); - - std::unordered_map args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - if (bias) { - std::vector bias_scales; - auto p_scales_tuple = - std::make_shared>>( - std::make_tuple(static_cast(mask_reorder), - bias_scales)); - if (dev_ctx.HasDnnAttr("Bias_scales")) { - bias_scales = PADDLE_GET_CONST(std::vector, - dev_ctx.GetDnnAttr("Bias_scales")); - p_scales_tuple = - std::make_shared>>( - std::make_tuple(static_cast(mask_reorder), - bias_scales)); - } else { - p_scales_tuple = handler.get_int8_bias_scales( - filter, groups, scale_weights_data); - } - auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( - bias, - true, - std::get<1>(*p_scales_tuple), - std::get<0>(*p_scales_tuple)); - args.insert({DNNL_ARG_BIAS, *bias_memory_p}); - } - - auto& astream = OneDNNContext::tls().get_stream(); - conv_p->execute(astream, args); - astream.wait(); - - if (need_s8_to_u8) { - dev_ctx.Alloc(output); - } - - output->set_mem_desc(dst_memory_p->get_desc()); - })); -} - template void ConvKernel(const Context& dev_ctx, const DenseTensor& input, @@ -287,12 +32,6 @@ void ConvKernel(const Context& dev_ctx, int groups, const std::string& data_format, DenseTensor* out) { - PADDLE_ENFORCE_EQ( - dev_ctx.GetPlace().GetType(), - AllocationType::CPU, - phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace")); - bool is_INT8 = funcs::is_int8(); - bool is_test = dev_ctx.HasDnnAttr("is_test") ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) : false; @@ -320,107 +59,23 @@ void ConvKernel(const Context& dev_ctx, dev_ctx.HasDnnAttr("force_fp32_output") ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) : false; - auto dst_dt = GetDstType(is_INT8, - is_BFLOAT16, - force_fp32_output, - fuse_activation, - fuse_residual_conn, - residual_param); - if (!is_INT8) { - if (dst_dt == dnnl::memory::data_type::f32) { - ComputeFP32(dev_ctx, - &input, - &filter, - bias, - residual_param, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - out); - } else if (dst_dt == dnnl::memory::data_type::bf16) { - ComputeFP32(dev_ctx, - &input, - &filter, - bias, - residual_param, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - out); - } - } else { - if (dst_dt == dnnl::memory::data_type::f32) { - ComputeINT8(dev_ctx, - &input, - &filter, - bias, - residual_param, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - out); - } else if (dst_dt == dnnl::memory::data_type::u8) { - ComputeINT8(dev_ctx, - &input, - &filter, - bias, - residual_param, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - out); - } else if (dst_dt == dnnl::memory::data_type::s8) { - ComputeINT8(dev_ctx, - &input, - &filter, - bias, - residual_param, - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - is_test, - is_BFLOAT16, - fuse_activation, - fuse_residual_conn, - force_fp32_output, - out); - } - } + ConvOnednn(dev_ctx, + &input, + &filter, + bias, + residual_param, + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + is_test, + is_BFLOAT16, + fuse_activation, + fuse_residual_conn, + force_fp32_output, + out); } template diff --git a/paddle/phi/ops/compat/fused_conv_sig.cc b/paddle/phi/ops/compat/fused_conv_sig.cc new file mode 100644 index 0000000000000..0e0f4325232dc --- /dev/null +++ b/paddle/phi/ops/compat/fused_conv_sig.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
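+
+// Argument mappings that translate the legacy fused_conv2d / fused_conv3d
+// op descriptions (Input, Filter, Bias, ResidualData plus the fusion
+// attributes) into the signatures of the phi kernels registered in
+// fused_conv_kernel.cc.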
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FusedConv2dOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("fused_conv2d", + {"Input", "Filter", "Bias", "ResidualData"}, + {"strides", + "paddings", + "padding_algorithm", + "dilations", + "groups", + "data_format", + "mkldnn_data_type", + "fuse_activation", + "fuse_residual_connection", + "force_fp32_output"}, + {"Output"}); +} + +KernelSignature FusedConv3dOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("fused_conv3d", + {"Input", "Filter", "Bias", "ResidualData"}, + {"strides", + "paddings", + "padding_algorithm", + "dilations", + "groups", + "data_format", + "mkldnn_data_type", + "fuse_activation", + "fuse_residual_connection", + "force_fp32_output"}, + {"Output"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fused_conv2d, phi::FusedConv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_conv3d, phi::FusedConv3dOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py index 6e16970dcd2fe..7efea770bfa2a 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py @@ -36,7 +36,7 @@ def sample_predictor_configs(self, program_config): # MKLDNN config = self.create_inference_config(use_gpu=False) config.enable_mkldnn() - yield config, ["conv2d"], (1e-4, 1e-5) + yield config, ["fused_conv2d"], (1e-4, 1e-5) def is_program_valid(self, prog_config): paddings = prog_config.ops[0].attrs["paddings"] @@ -156,8 +156,10 @@ def sample_program_config(self, draw): inputs = dict() weights = dict() use_mkldnn = None + conv_type = "conv2d" if draw(st.booleans()): conv_bias_shape = [f_shape[0]] + conv_type = "fused_conv2d" inputs = { "Input": ["input_x"], "Filter": ["filter"], @@ -181,7 +183,7 @@ def sample_program_config(self, draw): use_mkldnn = False conv2d_op = OpConfig( - "conv2d", + conv_type, inputs=inputs, outputs={"Output": ["conv2d_out"]}, strides=strides, From e1e8bf72031f77adfd44c843bfdc0f4b9541a62e Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 6 Dec 2022 18:48:51 +0800 Subject: [PATCH 14/60] make bilinear interpolate stable. (#48644) * make bilinear interpolate stable. 
* fix code
---
 .../kernels/gpu/interpolate_grad_kernel.cu    | 42 ++++++++++++-------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
index b38cae829680b..cb1d959e30aa0 100644
--- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
@@ -25,6 +25,8 @@
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/primitive/datamover_primitives.h"
 
+DECLARE_bool(cudnn_deterministic);
+
 namespace phi {
 
 template <typename T>
@@ -1034,6 +1036,12 @@ static void Interpolate2DCUDABwd(
 #endif
 
   if (optimize_flag & is_nchw) {
+    if (FLAGS_cudnn_deterministic) {
+      VLOG(2)
+          << "Run grad kernel of bilinear interpolate 2d with single thread.";
+      config.block_per_grid = 1;
+      config.thread_per_block = 1;
+    }
     KeBilinearInterpBwShareMemory<T>
-        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(input_grad_data,
-                                                                              in_h,
-                                                                              in_w,
-                                                                              out_h,
-                                                                              out_w,
-                                                                              n,
-                                                                              c,
-                                                                              ratio_h,
-                                                                              ratio_w,
-                                                                              output_grad_data,
-                                                                              align_type_value);
+        <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
+            input_grad_data,
+            in_h,
+            in_w,
+            out_h,
+            out_w,
+            n,
+            c,
+            ratio_h,
+            ratio_w,
+            output_grad_data,
+            align_type_value);
   } else {
     int64_t cw = c * out_w;
     auto interp_divmods = funcs::FastDivModForInterpolate(c, out_chw, cw);
From c838c1ed99ab68ff1b4b4261bf659780572fc242 Mon Sep 17 00:00:00 2001
From: ceci3
Date: Tue, 6 Dec 2022 19:45:32 +0800
Subject: [PATCH 15/60] clear tmp var in ptq (#48660)

---
 .../contrib/slim/quantization/post_training_quantization.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 68df2a8adcc96..fa57a9bd746ea 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -692,11 +692,10 @@ def _reset_activation_persistable(self):
         '''
         Reset activations to be not persistable.
''' - to_erase = [] for var in self._program.list_vars(): if var.name in self._quantized_act_var_name: var.persistable = False - to_erase.append(var.name) + self._scope.find_var(var.name).get_tensor()._clear() def _sampling(self): ''' From 57ad9b46a93f734633464d21ab259033ba82afe0 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Wed, 7 Dec 2022 09:53:25 +0800 Subject: [PATCH 16/60] [Dy2St] replace deprecated `load_module` with `exec_module` (#48679) --- python/paddle/jit/dy2static/utils.py | 10 ++++++---- python/paddle/utils/cpp_extension/extension_utils.py | 5 ++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py index a57134411ffec..438baef376f4e 100644 --- a/python/paddle/jit/dy2static/utils.py +++ b/python/paddle/jit/dy2static/utils.py @@ -18,6 +18,7 @@ import copy from paddle.utils import gast import inspect +import importlib.util import os import sys import shutil @@ -32,6 +33,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import assign from functools import reduce +from importlib.machinery import SourceFileLoader import warnings @@ -71,9 +73,6 @@ def visit(self, node): return ret -# imp is deprecated in python3 -from importlib.machinery import SourceFileLoader - dygraph_class_to_static_api = { "CosineDecay": "cosine_decay", "ExponentialDecay": "exponential_decay", @@ -586,7 +585,10 @@ def func_prefix(func): DEL_TEMP_DIR = False func_name = dyfunc.__name__ - module = SourceFileLoader(module_name, f.name).load_module() + loader = SourceFileLoader(module_name, f.name) + spec = importlib.util.spec_from_loader(loader.name, loader) + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) # The 'forward' or 'another_forward' of 'TranslatedLayer' cannot be obtained # through 'func_name'. So set the special function name '__i_m_p_l__'. 
 if hasattr(module, '__i_m_p_l__'):
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 09b5492e54180..29a4deeb1cdc3 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -16,6 +16,7 @@
 import collections
 import glob
 import hashlib
+import importlib.util
 import json
 import logging
 import os
@@ -1070,7 +1071,9 @@ def _load_module_from_file(api_file_path, module_name, verbose=False):
 
     # load module with RWLock
     loader = machinery.SourceFileLoader(ext_name, api_file_path)
-    module = loader.load_module()
+    spec = importlib.util.spec_from_loader(loader.name, loader)
+    module = importlib.util.module_from_spec(spec)
+    loader.exec_module(module)
 
     return module
 
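Note: the two hunks above are the whole migration for this patch.
Loader.load_module() has been deprecated since Python 3.4 and is slated for
removal, and the spec_from_loader / module_from_spec / exec_module triple is
the documented replacement. A standalone version of the new pattern (the
module name and file path are placeholders; the real call sites pass f.name
and api_file_path):

    import importlib.util
    from importlib.machinery import SourceFileLoader

    # Placeholder name/path, assumed to exist for the sake of the example.
    loader = SourceFileLoader("generated_module", "/tmp/generated_module.py")
    spec = importlib.util.spec_from_loader(loader.name, loader)
    module = importlib.util.module_from_spec(spec)
    loader.exec_module(module)  # executes the file's code into `module`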
From 0d8ddf9fce71a909fa88f4b6e20b6faf1037beb5 Mon Sep 17 00:00:00 2001
From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com>
Date: Wed, 7 Dec 2022 10:41:52 +0800
Subject: [PATCH 17/60] modify d2d copy to xpu::copy in xpu kernel, test=kunlun (#48710)

---
 .../fluid/operators/reader/buffered_reader.cc |  7 ++++-
 paddle/phi/backends/xpu/xpu_info.cc           | 19 ++++--------
 paddle/phi/kernels/reshape_grad_kernel.cc     | 22 ++++++++++++++
 paddle/phi/kernels/reshape_kernel.cc          | 30 +++++++++++++++++++
 paddle/phi/kernels/xpu/gather_nd_kernel.cc    | 12 ++++----
 .../kernels/xpu/generate_proposals_kernel.cc  | 20 ++++++------
 paddle/phi/kernels/xpu/scatter_kernel.cc      |  5 +++-
 paddle/phi/kernels/xpu/tile_kernel.cc         |  6 +++-
 .../unittests/xpu/get_test_cover_info.py      |  2 ++
 .../unittests/xpu/test_reshape2_op_xpu.py     |  1 +
 10 files changed, 91 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 5bb8a29ce356e..ddb85f3cfbb4c 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -417,8 +417,13 @@ void BufferedReader::ReadAsync(size_t i) {
         // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe
         // KL3
         if ((platform::is_xpu_place(cpu_place))) {
-          memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size);
           platform::XPUStreamSync(stream_.get());
+          char *tmp = new char[size];
+          PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(
+              tmp, cpu_ptr, size, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
+          PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(
+              xpu_ptr, tmp, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
+          delete[] tmp;
         } else {
           memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size);
         }
diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc
index d084afee2285c..89ebce438a464 100644
--- a/paddle/phi/backends/xpu/xpu_info.cc
+++ b/paddle/phi/backends/xpu/xpu_info.cc
@@ -169,19 +169,12 @@ void MemcpySyncD2D(void* dst,
                    const phi::XPUContext& dev_ctx) {
   int dev_id = GetXPUCurrentDeviceId();
   if (dst_place.device == dev_id && src_place.device == dev_id) {
-    dev_ctx.Wait();
-    char* tmp = new char[count];
-    PADDLE_ENFORCE_XPU_SUCCESS(
-        xpu_memcpy(tmp, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
-    PADDLE_ENFORCE_XPU_SUCCESS(
-        xpu_memcpy(dst, tmp, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
-    delete[] tmp;
-    // PADDLE_ENFORCE_XDNN_SUCCESS(
-    //     baidu::xpu::api::copy(dev_ctx.x_context(),
-    //                           static_cast<const int8_t*>(src),
-    //                           static_cast<int8_t*>(dst),
-    //                           count),
-    //     "copy ");
+    PADDLE_ENFORCE_XDNN_SUCCESS(
+        baidu::xpu::api::copy(dev_ctx.x_context(),
+                              static_cast<const int8_t*>(src),
+                              static_cast<int8_t*>(dst),
+                              count),
+        "copy ");
   } else {
     PADDLE_ENFORCE_XPU_SUCCESS(
        xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc
index c4b92c4f760a2..ffd616054c0c3 100644
--- a/paddle/phi/kernels/reshape_grad_kernel.cc
+++ b/paddle/phi/kernels/reshape_grad_kernel.cc
@@ -17,6 +17,9 @@
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#endif
 
 namespace phi {
 
@@ -29,6 +32,25 @@ void ReshapeGradKernel(const Context& dev_ctx,
   x_grad->Resize(x_dims);
 }
 
+#ifdef PADDLE_WITH_XPU
+template <>
+void ReshapeGradKernel(const XPUContext& dev_ctx,
+                       const DenseTensor& out_grad,
+                       DenseTensor* x_grad) {
+  auto x_dims = x_grad->dims();
+  dev_ctx.Alloc(x_grad, out_grad.dtype());
+  auto* src_ptr = out_grad.data();
+  auto* dst_ptr = x_grad->data();
+  auto size = out_grad.numel() * paddle::experimental::SizeOf(out_grad.dtype());
+  int ret = xpu::copy(dev_ctx.x_context(),
+                      reinterpret_cast<const int8_t*>(src_ptr),
+                      reinterpret_cast<int8_t*>(dst_ptr),
+                      size);
+  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
+  x_grad->Resize(x_dims);
+}
+#endif
+
 template <typename Context>
 void ReshapeDoubleGradKernel(const Context& dev_ctx,
                              const DenseTensor& out_grad,
diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc
index 632a63c9ab7ff..a792322a440ad 100644
--- a/paddle/phi/kernels/reshape_kernel.cc
+++ b/paddle/phi/kernels/reshape_kernel.cc
@@ -19,6 +19,9 @@
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#endif
 
 namespace phi {
 
@@ -42,6 +45,33 @@ void ReshapeKernel(const Context& dev_ctx,
   out->ResetLoD(x.lod());
 }
 
+#ifdef PADDLE_WITH_XPU
+template <>
+void ReshapeKernel(const XPUContext& dev_ctx,
+                   const DenseTensor& x,
+                   const IntArray& shape,
+                   DenseTensor* out) {
+  MetaTensor meta_out(out);
+  InferMetaFromVecValue(x, shape.GetData(), &meta_out);
+  if (x.initialized() && x.Holder() == out->Holder()) {
+    dev_ctx.Alloc(out, x.dtype());
+    return;
+  }
+  dev_ctx.Alloc(out, x.dtype());
+  auto dims = out->dims();
+  auto* src_ptr = x.data();
+  auto* dst_ptr = out->data();
+  auto size = x.numel() * paddle::experimental::SizeOf(x.dtype());
+  int ret = xpu::copy(dev_ctx.x_context(),
+                      reinterpret_cast<const int8_t*>(src_ptr),
+                      reinterpret_cast<int8_t*>(dst_ptr),
+                      size);
+  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
+  out->Resize(dims);
+  out->ResetLoD(x.lod());
+}
+#endif
+
 template <typename Context>
 void ReshapeWithXShape(const Context& dev_ctx,
                        const DenseTensor& x,
diff --git a/paddle/phi/kernels/xpu/gather_nd_kernel.cc b/paddle/phi/kernels/xpu/gather_nd_kernel.cc
index d7d23fa17cbb3..8241e5109da33 100644
--- a/paddle/phi/kernels/xpu/gather_nd_kernel.cc
+++ b/paddle/phi/kernels/xpu/gather_nd_kernel.cc
@@ -30,7 +30,10 @@ void GatherNdKernel(const Context &ctx,
   if (x.numel() == 0) return;
 
   if (index.numel() == 0) {
-    phi::Copy(ctx, x, phi::XPUPlace(), true, out);
+    out->Resize(x.dims());
+    ctx.template Alloc<T>(out);
+    int r = xpu::copy(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
     return;
   }
 
@@ -69,12 +72,7 @@ void GatherNdKernel(const Context &ctx,
                     x_vec,
                     index_shape);
   }
-  PADDLE_ENFORCE_EQ(
-      ret,
-      XPU_SUCCESS,
-      phi::errors::External("XPU gather_nd kernel return wrong value[%d %s]",
-                            ret,
-                            XPUAPIErrorMsg[ret]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather_nd");
 }
 
 }  // namespace phi
 
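Note: gather_nd above, and scatter and tile below, all swap phi::Copy for the
same three-step shape. A minimal sketch of that shared idiom, assuming an XPU
context type exposing x_context() (the helper name is illustrative, not from
the patch):

    // Size the output like the input, allocate it on the device, then issue
    // a single device-side copy and check the XDNN status code.
    template <typename T, typename Context>
    void CopyInputToOutput(const Context& ctx,
                           const phi::DenseTensor& x,
                           phi::DenseTensor* out) {
      out->Resize(x.dims());
      ctx.template Alloc<T>(out);
      int r = xpu::copy(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
      PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
    }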
diff --git a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
index bf7f3e90bfd51..f19d19241ebd5 100644
--- a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
+++ b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
@@ -372,16 +372,16 @@ void GenerateProposalsKernel(const Context& dev_ctx,
     DenseTensor& proposals = tensor_pair.first;
     DenseTensor& nscores = tensor_pair.second;
 
-    paddle::memory::Copy(place,
-                         rpn_rois->data<T>() + num_proposals * 4,
-                         place,
-                         proposals.data<T>(),
-                         sizeof(T) * proposals.numel());
-    paddle::memory::Copy(place,
-                         rpn_roi_probs->data<T>() + num_proposals,
-                         place,
-                         nscores.data<T>(),
-                         sizeof(T) * scores.numel());
+    r = xpu::copy(dev_ctx.x_context(),
+                  proposals.data<T>(),
+                  rpn_rois->data<T>() + num_proposals * 4,
+                  proposals.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+    r = xpu::copy(dev_ctx.x_context(),
+                  nscores.data<T>(),
+                  rpn_roi_probs->data<T>() + num_proposals,
+                  nscores.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
 
     if (dev_ctx.x_context()->xpu_stream) {
       dev_ctx.Wait();
diff --git a/paddle/phi/kernels/xpu/scatter_kernel.cc b/paddle/phi/kernels/xpu/scatter_kernel.cc
index 988b8a71568e9..18e4e03dd2787 100644
--- a/paddle/phi/kernels/xpu/scatter_kernel.cc
+++ b/paddle/phi/kernels/xpu/scatter_kernel.cc
@@ -27,7 +27,10 @@ void ScatterKernel(const Context &ctx,
                    const DenseTensor &updates,
                    bool overwrite,
                    DenseTensor *out) {
-  phi::Copy(ctx, x, ctx.GetPlace(), false, out);
+  out->Resize(x.dims());
+  ctx.template Alloc<T>(out);
+  int ret = xpu::copy(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
+  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
   // Apply ScatterUpdate: Out[index] = Updates[:]
   const auto &index_type = index.dtype();
   bool index_type_match =
diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc
index 022e355f4c9c7..b9383f108ebe5 100644
--- a/paddle/phi/kernels/xpu/tile_kernel.cc
+++ b/paddle/phi/kernels/xpu/tile_kernel.cc
@@ -102,7 +102,11 @@ void TileKernel(const Context& dev_ctx,
 
   std::vector<int> temp(repeat_times.size(), 1);
   if (repeat_times == temp) {
-    phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+    out->Resize(x.dims());
+    dev_ctx.template Alloc<T>(out);
+    int r =
+        xpu::copy(dev_ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
     return;
   }
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
index 22131010d91c3..afaf3b2a52fab 100644
--- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
+++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
@@ -94,6 +94,8 @@
     "c_embedding_float32",  # unittests of collective ops do not using xpu testing framework
     "c_sync_comm_stream_float32",
    "c_sync_calc_stream_float32",
+    "reshape2_bool",
+    "reshape2_grad_bool",
 ]
 xpu_test_device_op_white_list = []
 xpu_test_device_op_type_white_list = []
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
index 01773e8a28c5b..e85ccf0cc4456 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
@@ -41,6 +41,7 @@ class TestReshapeOp(XPUOpTest):
     def setUp(self):
         self.init_data()
         self.op_type = "reshape2"
+        self.dtype = self.in_type
         self.init_test_input()
         self.init_test_output()
         self.init_attrs()
From ddd5656aaf01aff9ac1489c19b707bf4e56e87ce Mon Sep 17
00:00:00 2001 From: Weilong Wu Date: Wed, 7 Dec 2022 10:49:18 +0800 Subject: [PATCH 18/60] rm _test_eager_guard (#48767) --- python/paddle/tests/test_async_read_write.py | 64 ++++---------- .../test_callback_reduce_lr_on_plateau.py | 15 +--- python/paddle/tests/test_callback_visualdl.py | 8 +- python/paddle/tests/test_datasets.py | 85 +++---------------- python/paddle/tests/test_dlpack.py | 36 ++------ 5 files changed, 35 insertions(+), 173 deletions(-) diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py index 6605ac6d81ce2..4fc20039881b9 100644 --- a/python/paddle/tests/test_async_read_write.py +++ b/python/paddle/tests/test_async_read_write.py @@ -19,7 +19,6 @@ import paddle from paddle.device import cuda from paddle.fluid import core -from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard class TestAsyncRead(unittest.TestCase): @@ -40,24 +39,14 @@ def func_setUp(self): def func_test_async_read_empty_offset_and_count(self): with cuda.stream_guard(self.stream): - if _in_legacy_dygraph(): - core.async_read( - self.src, - self.dst, - self.index, - self.buffer, - self.empty, - self.empty, - ) - else: - core.eager.async_read( - self.src, - self.dst, - self.index, - self.buffer, - self.empty, - self.empty, - ) + core.eager.async_read( + self.src, + self.dst, + self.index, + self.buffer, + self.empty, + self.empty, + ) array1 = paddle.gather(self.src, self.index) array2 = self.dst[: len(self.index)] @@ -71,14 +60,9 @@ def func_test_async_read_success(self): np.array([5, 10], dtype="int64"), place=paddle.CPUPlace() ) with cuda.stream_guard(self.stream): - if _in_legacy_dygraph(): - core.async_read( - self.src, self.dst, self.index, self.buffer, offset, count - ) - else: - core.eager.async_read( - self.src, self.dst, self.index, self.buffer, offset, count - ) + core.eager.async_read( + self.src, self.dst, self.index, self.buffer, offset, count + ) # index data index_array1 = paddle.gather(self.src, self.index) count_numel = paddle.sum(count).numpy()[0] @@ -101,26 +85,14 @@ def func_test_async_read_only_1dim(self): dst = paddle.empty([40], dtype="float32") buffer_ = paddle.empty([20]).pin_memory() with cuda.stream_guard(self.stream): - if _in_legacy_dygraph(): - core.async_read( - src, dst, self.index, buffer_, self.empty, self.empty - ) - else: - core.eager.async_read( - src, dst, self.index, buffer_, self.empty, self.empty - ) + core.eager.async_read( + src, dst, self.index, buffer_, self.empty, self.empty + ) array1 = paddle.gather(src, self.index) array2 = dst[: len(self.index)] np.testing.assert_allclose(array1.numpy(), array2.numpy(), rtol=1e-05) def test_main(self): - with _test_eager_guard(): - self.func_setUp() - self.func_test_async_read_empty_offset_and_count() - self.func_setUp() - self.func_test_async_read_success() - self.func_setUp() - self.func_test_async_read_only_1dim() self.func_setUp() self.func_test_async_read_empty_offset_and_count() self.func_setUp() @@ -145,10 +117,7 @@ def func_test_async_write_success(self): np.array([40, 60], dtype="int64"), place=paddle.CPUPlace() ) with cuda.stream_guard(self.stream): - if _in_legacy_dygraph(): - core.async_write(self.src, self.dst, offset, count) - else: - core.eager.async_write(self.src, self.dst, offset, count) + core.eager.async_write(self.src, self.dst, offset, count) offset_a = paddle.gather(self.dst, paddle.to_tensor(np.arange(0, 40))) offset_b = paddle.gather(self.dst, paddle.to_tensor(np.arange(60, 120))) @@ -158,9 +127,6 @@ def 
func_test_async_write_success(self): ) def test_async_write_success(self): - with _test_eager_guard(): - self.func_setUp() - self.func_test_async_write_success() self.func_setUp() self.func_test_async_write_success() diff --git a/python/paddle/tests/test_callback_reduce_lr_on_plateau.py b/python/paddle/tests/test_callback_reduce_lr_on_plateau.py index c221c6d48246d..9e98ee484105f 100644 --- a/python/paddle/tests/test_callback_reduce_lr_on_plateau.py +++ b/python/paddle/tests/test_callback_reduce_lr_on_plateau.py @@ -17,7 +17,6 @@ import paddle import paddle.vision.transforms as T from paddle import Model -from paddle.fluid.framework import _test_eager_guard from paddle.metric import Accuracy from paddle.nn.layer.loss import CrossEntropyLoss from paddle.static import InputSpec @@ -32,7 +31,7 @@ def __len__(self): class TestReduceLROnPlateau(unittest.TestCase): - def func_reduce_lr_on_plateau(self): + def test_reduce_lr_on_plateau(self): transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = CustomMnist(mode='train', transform=transform) val_dataset = CustomMnist(mode='test', transform=transform) @@ -57,12 +56,7 @@ def func_reduce_lr_on_plateau(self): callbacks=[callbacks], ) - def test_reduce_lr_on_plateau(self): - with _test_eager_guard(): - self.func_reduce_lr_on_plateau() - self.func_reduce_lr_on_plateau() - - def func_warn_or_error(self): + def test_warn_or_error(self): with self.assertRaises(ValueError): paddle.callbacks.ReduceLROnPlateau(factor=2.0) # warning @@ -113,11 +107,6 @@ def func_warn_or_error(self): callbacks=[callbacks], ) - def test_warn_or_error(self): - with _test_eager_guard(): - self.func_warn_or_error() - self.func_warn_or_error() - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_callback_visualdl.py b/python/paddle/tests/test_callback_visualdl.py index 5ad245a16344a..62a6c08b41923 100644 --- a/python/paddle/tests/test_callback_visualdl.py +++ b/python/paddle/tests/test_callback_visualdl.py @@ -18,7 +18,6 @@ import paddle import paddle.vision.transforms as T -from paddle.fluid.framework import _test_eager_guard from paddle.static import InputSpec from paddle.vision.datasets import MNIST @@ -35,7 +34,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.save_dir) - def func_visualdl_callback(self): + def test_visualdl_callback(self): inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] labels = [InputSpec([None, 1], 'int64', 'label')] @@ -58,11 +57,6 @@ def func_visualdl_callback(self): train_dataset, eval_dataset, batch_size=64, callbacks=callback ) - def test_visualdl_callback(self): - with _test_eager_guard(): - self.func_visualdl_callback() - self.func_visualdl_callback() - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py index 2890a877462b4..422f7729c9f4c 100644 --- a/python/paddle/tests/test_datasets.py +++ b/python/paddle/tests/test_datasets.py @@ -22,7 +22,6 @@ import paddle.vision.transforms as T from paddle.dataset.common import _check_exists_and_download -from paddle.fluid.framework import _test_eager_guard from paddle.vision.datasets import ( MNIST, DatasetFolder, @@ -47,7 +46,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.data_dir) - def func_test_dataset(self): + def test_dataset(self): dataset_folder = DatasetFolder(self.data_dir) for _ in dataset_folder: @@ -60,12 +59,7 @@ def func_test_dataset(self): for _ in dataset_folder: pass - def test_dataset(self): - with _test_eager_guard(): 
- self.func_test_dataset() - self.func_test_dataset() - - def func_test_folder(self): + def test_folder(self): loader = ImageFolder(self.data_dir) for _ in loader: @@ -77,12 +71,7 @@ def func_test_folder(self): assert len(loader) == 4 - def test_folder(self): - with _test_eager_guard(): - self.func_test_folder() - self.func_test_folder() - - def func_test_transform(self): + def test_transform(self): def fake_transform(img): return img @@ -96,12 +85,7 @@ def fake_transform(img): for _ in loader: pass - def test_transform(self): - with _test_eager_guard(): - self.func_test_transform() - self.func_test_transform() - - def func_test_errors(self): + def test_errors(self): with self.assertRaises(RuntimeError): ImageFolder(self.empty_dir) with self.assertRaises(RuntimeError): @@ -110,14 +94,9 @@ def func_test_errors(self): with self.assertRaises(ValueError): _check_exists_and_download('temp_paddle', None, None, None, False) - def test_errors(self): - with _test_eager_guard(): - self.func_test_errors() - self.func_test_errors() - class TestMNISTTest(unittest.TestCase): - def func_test_main(self): + def test_main(self): transform = T.Transpose() mnist = MNIST(mode='test', transform=transform) self.assertTrue(len(mnist) == 10000) @@ -130,14 +109,9 @@ def func_test_main(self): self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) - def test_main(self): - with _test_eager_guard(): - self.func_test_main() - self.func_test_main() - class TestMNISTTrain(unittest.TestCase): - def func_test_main(self): + def test_main(self): transform = T.Transpose() mnist = MNIST(mode='train', transform=transform) self.assertTrue(len(mnist) == 60000) @@ -166,14 +140,9 @@ def func_test_main(self): with self.assertRaises(ValueError): mnist = MNIST(mode='train', transform=transform, backend=1) - def test_main(self): - with _test_eager_guard(): - self.func_test_main() - self.func_test_main() - class TestFASHIONMNISTTest(unittest.TestCase): - def func_test_main(self): + def test_main(self): transform = T.Transpose() mnist = FashionMNIST(mode='test', transform=transform) self.assertTrue(len(mnist) == 10000) @@ -186,14 +155,9 @@ def func_test_main(self): self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) - def test_main(self): - with _test_eager_guard(): - self.func_test_main() - self.func_test_main() - class TestFASHIONMNISTTrain(unittest.TestCase): - def func_test_main(self): + def test_main(self): transform = T.Transpose() mnist = FashionMNIST(mode='train', transform=transform) self.assertTrue(len(mnist) == 60000) @@ -222,26 +186,16 @@ def func_test_main(self): with self.assertRaises(ValueError): mnist = FashionMNIST(mode='train', transform=transform, backend=1) - def test_main(self): - with _test_eager_guard(): - self.func_test_main() - self.func_test_main() - - def func_test_dataset_value(self): + def test_dataset_value(self): fmnist = FashionMNIST(mode='train') value = np.mean([np.array(x[0]) for x in fmnist]) # 72.94035223214286 was getted from competitive products np.testing.assert_allclose(value, 72.94035223214286) - def test_dataset_value(self): - with _test_eager_guard(): - self.func_test_dataset_value() - self.func_test_dataset_value() - class TestFlowersTrain(unittest.TestCase): - def func_test_main(self): + def test_main(self): flowers = Flowers(mode='train') self.assertTrue(len(flowers) == 6149) @@ -254,14 +208,9 @@ def func_test_main(self): self.assertTrue(image.shape[2] == 3) self.assertTrue(label.shape[0] == 1) - def test_main(self): - with _test_eager_guard(): - 
self.func_test_main() - self.func_test_main() - class TestFlowersValid(unittest.TestCase): - def func_test_main(self): + def test_main(self): flowers = Flowers(mode='valid') self.assertTrue(len(flowers) == 1020) @@ -274,14 +223,9 @@ def func_test_main(self): self.assertTrue(image.shape[2] == 3) self.assertTrue(label.shape[0] == 1) - def test_main(self): - with _test_eager_guard(): - self.func_test_main() - self.func_test_main() - class TestFlowersTest(unittest.TestCase): - def func_test_main(self): + def test_main(self): flowers = Flowers(mode='test') self.assertTrue(len(flowers) == 1020) @@ -310,11 +254,6 @@ def func_test_main(self): with self.assertRaises(ValueError): flowers = Flowers(mode='test', backend=1) - def test_main(self): - with _test_eager_guard(): - self.func_test_main() - self.func_test_main() - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_dlpack.py b/python/paddle/tests/test_dlpack.py index 88a831be2268b..77ffdbecedbf6 100644 --- a/python/paddle/tests/test_dlpack.py +++ b/python/paddle/tests/test_dlpack.py @@ -19,11 +19,10 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.framework import _test_eager_guard class TestDLPack(unittest.TestCase): - def func_test_dlpack_dygraph(self): + def test_dlpack_dygraph(self): paddle.disable_static() tensor = paddle.to_tensor(np.array([1, 2, 3, 4]).astype('int')) dlpack = paddle.utils.dlpack.to_dlpack(tensor) @@ -38,12 +37,7 @@ def func_test_dlpack_dygraph(self): np.array(out_from_dlpack), np.array([1, 2, 3, 4]).astype('int') ) - def test_dlpack_dygraph(self): - with _test_eager_guard(): - self.func_test_dlpack_dygraph() - self.func_test_dlpack_dygraph() - - def func_test_dlpack_tensor_larger_than_2dim(self): + def test_dlpack_tensor_larger_than_2dim(self): paddle.disable_static() numpy_data = np.random.randn(4, 5, 6) t = paddle.to_tensor(numpy_data) @@ -52,11 +46,6 @@ def func_test_dlpack_tensor_larger_than_2dim(self): out = paddle.utils.dlpack.from_dlpack(dlpack) np.testing.assert_allclose(numpy_data, out.numpy(), rtol=1e-05) - def test_dlpack_tensor_larger_than_2dim(self): - with _test_eager_guard(): - self.func_test_dlpack_tensor_larger_than_2dim() - self.func_test_dlpack_tensor_larger_than_2dim() - def test_dlpack_static(self): paddle.enable_static() tensor = fluid.create_lod_tensor( @@ -87,7 +76,7 @@ def test_dlpack_static(self): np.array([[1], [2], [3], [4]]).astype('int'), ) - def func_test_dlpack_dtype_conversion(self): + def test_dlpack_dtype_conversion(self): paddle.disable_static() # DLpack does not explicitly support bool data type. 
dtypes = [ @@ -119,11 +108,6 @@ def func_test_dlpack_dtype_conversion(self): self.assertEqual(x.dtype, o.dtype) np.testing.assert_allclose(x.numpy(), o.numpy(), rtol=1e-05) - def test_dlpack_dtype_conversion(self): - with _test_eager_guard(): - self.func_test_dlpack_dtype_conversion() - self.func_test_dlpack_dtype_conversion() - def test_dlpack_deletion(self): # See Paddle issue 47171 if paddle.is_compiled_with_cuda(): @@ -134,23 +118,13 @@ def test_dlpack_deletion(self): class TestRaiseError(unittest.TestCase): - def func_test_from_dlpack_raise_type_error(self): + def test_from_dlpack_raise_type_error(self): self.assertRaises( TypeError, paddle.utils.dlpack.from_dlpack, np.zeros(5) ) - def test_from_dlpack_raise_type_error(self): - with _test_eager_guard(): - self.func_test_from_dlpack_raise_type_error() - self.func_test_from_dlpack_raise_type_error() - - def func_test_to_dlpack_raise_type_error(self): - self.assertRaises(TypeError, paddle.utils.dlpack.to_dlpack, np.zeros(5)) - def test_to_dlpack_raise_type_error(self): - with _test_eager_guard(): - self.func_test_to_dlpack_raise_type_error() - self.func_test_to_dlpack_raise_type_error() + self.assertRaises(TypeError, paddle.utils.dlpack.to_dlpack, np.zeros(5)) if __name__ == '__main__': From c6a2b0fd92c8c6760bfcceabb477036f1be1b887 Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Wed, 7 Dec 2022 11:09:10 +0800 Subject: [PATCH 19/60] delete sampling_id api (#48543) --- python/paddle/fluid/layers/nn.py | 40 ---------- python/paddle/fluid/layers/rnn.py | 34 -------- .../unittests/npu/test_sampling_id_op_npu.py | 52 ------------- .../fluid/tests/unittests/test_layers.py | 15 ---- .../fluid/tests/unittests/test_random_seed.py | 77 ------------------- .../tests/unittests/test_sampling_id_op.py | 45 ----------- 6 files changed, 263 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py delete mode 100644 python/paddle/fluid/tests/unittests/test_sampling_id_op.py diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 39d4d678abd0e..474bccc162e2b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -86,7 +86,6 @@ 'elementwise_sub', 'elementwise_mul', 'gaussian_random', - 'sampling_id', 'clip', 'clip_by_norm', 'mean', @@ -3190,45 +3189,6 @@ def gaussian_random( return out -@templatedoc() -def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): - """ - This op is used for sampling id from multinomial distribution from the input, sampling one id for one sample. - - Parameters: - x (Variable): 2-D tensor, [batch_size, input_feature_dimensions] - min (Float): minimum , default 0.0. - max (Float): maximum, default 1.0. - seed (Float): Random seed, default 0. if seed is not 0, will generate same number every time. - dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc - - Returns: - Variable: sampling tensor. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - x = fluid.data( - name="X", - shape=[13, 11], - dtype='float32') - - out = fluid.layers.sampling_id(x) - """ - - helper = LayerHelper('sampling_id', **locals()) - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='sampling_id', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'min': min, 'max': max, 'seed': seed}, - ) - - return out - - def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 8b5721438d2e5..6786f04292ba4 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -2306,40 +2306,6 @@ def __init__( ) self.seed = seed - def sample(self, time, outputs, states): - r""" - Perform sampling from a categorical distribution, and the distribution - is computed by `softmax(outputs/softmax_temperature)`. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the - caller, representing the current time step number of decoding. - outputs(Variable): A tensor variable. Usually it's data type is float32 - or float64, and it's shape is `[batch_size, vocabulary_size]`, - representing the predicted logits of current step. It is same as - `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. - states(Variable): A (possibly nested structure of) tensor variable[s]. - It is same as `new_states` returned by `BasicDecoder.cell.call()`. - - Returns: - Variable: An `int64` tensor with shape `[batch_size]`, representing \ - the sampled ids. - """ - logits = ( - (outputs / self.softmax_temperature) - if self.softmax_temperature is not None - else outputs - ) - probs = paddle.nn.functional.softmax(logits) - # TODO: remove this stop_gradient. The stop_gradient of sample_ids can - # not pass to probs, since sampling_id op does not have corresponding - # grad op and thus can not pass. - probs.stop_gradient = True - sample_ids = nn.sampling_id( - probs, seed=self.seed, dtype=self.start_tokens.dtype - ) - return sample_ids - class BasicDecoder(Decoder): """ diff --git a/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py deleted file mode 100644 index d354e39bcf08b..0000000000000 --- a/python/paddle/fluid/tests/unittests/npu/test_sampling_id_op_npu.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -import numpy as np -import sys - -sys.path.append("..") - -from op_test import OpTest, _set_use_system_allocator -import paddle.fluid.core as core -import paddle.fluid as fluid -from paddle.fluid.op import Operator -import paddle - -_set_use_system_allocator(False) - - -class TestSamplingIdShape(unittest.TestCase): - def test_shape(self): - paddle.enable_static() - x = fluid.layers.data(name='x', shape=[3], dtype='float32') - output = fluid.layers.sampling_id(x) - - place = fluid.NPUPlace(0) - exe = fluid.Executor(place=place) - exe.run(fluid.default_startup_program()) - - feed = { - 'x': np.array([[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32') - } - output_np = exe.run(feed=feed, fetch_list=[output])[0] - - self.assertEqual(output.shape[0], -1) - self.assertEqual(len(output.shape), 1) - self.assertEqual(output_np.shape[0], 2) - self.assertEqual(len(output_np.shape), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9297666eead48..043321bf566cc 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2308,7 +2308,6 @@ def setUp(self): { "make_gaussian_random", "make_kldiv_loss", - "make_sampling_id", "make_uniform_random_batch_size_like", } ) @@ -2794,20 +2793,6 @@ def make_gaussian_random(self): out = layers.gaussian_random(shape=[20, 30]) return out - def make_sampling_id(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data( - name="X", - shape=[13, 11], - dtype='float32', - append_batch_size=False, - ) - - out = layers.sampling_id(x) - return out - def make_sum(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 5a3e92eb9fdd4..44b368889583d 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -372,83 +372,6 @@ def test_generator_randperm_static(self): np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05) self.assertTrue(not np.allclose(out1_res2, out1_res1)) - def test_generator_sampling_id_dygraph(self): - """Test Generator seed.""" - gen = paddle.seed(12312321111) - - fluid.enable_dygraph() - - gen.manual_seed(12312321111) - x = fluid.layers.uniform_random( - [10, 10], dtype="float32", min=0.0, max=1.0 - ) - y = fluid.layers.sampling_id(x) - - st1 = gen.get_state() - x1 = fluid.layers.uniform_random( - [10, 10], dtype="float32", min=0.0, max=1.0 - ) - y1 = fluid.layers.sampling_id(x) - - gen.set_state(st1) - x2 = fluid.layers.uniform_random( - [10, 10], dtype="float32", min=0.0, max=1.0 - ) - y2 = fluid.layers.sampling_id(x) - - gen.manual_seed(12312321111) - x3 = fluid.layers.uniform_random( - [10, 10], dtype="float32", min=0.0, max=1.0 - ) - y3 = fluid.layers.sampling_id(x) - - x_np = y.numpy() - x1_np = y1.numpy() - x2_np = y2.numpy() - x3_np = y3.numpy() - - if not core.is_compiled_with_cuda(): - print(">>>>>>> sampling id dygraph >>>>>>>") - np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) - np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) - - def test_generator_randperm_static_1(self): - - fluid.disable_dygraph() - - paddle.seed(123123143) - - startup_program = fluid.Program() - train_program = fluid.Program() - with 
fluid.program_guard(train_program, startup_program): - # example 1: - # attr shape is a list which doesn't contain tensor Variable. - x = fluid.layers.uniform_random(shape=[10, 10]) - result_1 = fluid.layers.sampling_id(x) - result_2 = fluid.layers.sampling_id(x) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(startup_program) - out1 = exe.run( - train_program, feed={}, fetch_list=[result_1, result_2] - ) - - paddle.seed(123123143) - out2 = exe.run( - train_program, feed={}, fetch_list=[result_1, result_2] - ) - - out1_res1 = np.array(out1[0]) - out1_res2 = np.array(out1[1]) - out2_res1 = np.array(out2[0]) - out2_res2 = np.array(out2[1]) - - if not core.is_compiled_with_cuda(): - print(">>>>>>> sampling id static >>>>>>>") - np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05) - np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05) - self.assertTrue(not np.allclose(out1_res2, out1_res1)) - def test_gen_TruncatedNormal_initializer(self): fluid.disable_dygraph() diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py deleted file mode 100644 index ae84e98aaa746..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np - -import paddle -import paddle.fluid as fluid - - -class TestSamplingIdShape(unittest.TestCase): - def test_shape(self): - paddle.enable_static() - x = fluid.layers.data(name='x', shape=[3], dtype='float32') - output = fluid.layers.sampling_id(x) - - place = fluid.CPUPlace() - exe = fluid.Executor(place=place) - exe.run(fluid.default_startup_program()) - - feed = { - 'x': np.array([[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32') - } - output_np = exe.run(feed=feed, fetch_list=[output])[0] - - self.assertEqual(output.shape[0], -1) - self.assertEqual(len(output.shape), 1) - self.assertEqual(output_np.shape[0], 2) - self.assertEqual(len(output_np.shape), 1) - - -if __name__ == "__main__": - unittest.main() From e5bc2eec701a26c1fc8211724bee31236ec1c918 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Wed, 7 Dec 2022 11:40:31 +0800 Subject: [PATCH 20/60] [NPU] add FLAGS_npu_storage_format env to enable npu storage format, test=develop (#48774) --- paddle/fluid/platform/flags.cc | 12 ++++++++++++ .../paddle/fluid/dygraph/varbase_patch_methods.py | 7 ++++++- python/paddle/nn/functional/conv.py | 14 ++++++++++++-- python/paddle/nn/layer/norm.py | 7 ++++++- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index e8800cedb376d..b809e026544bb 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1041,6 +1041,18 @@ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "Predictor", "Choose default funciton type in JitLayer."); +#ifdef PADDLE_WITH_CUSTOM_DEVICE +/** + * Custom Device NPU related FLAG + * Name: FLAGS_npu_storage_format + * Since Version: 2.5.0 + * Value Range: bool, default=false + * Example: + * Note: Enable NPU Storage Format for Ascend910 performance improvement. + */ +PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, ""); +#endif + #ifdef PADDLE_WITH_CUDNN_FRONTEND /** * CUDNNv8 related FLAG diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index e9b963a781db9..0d2cd0cbf2db0 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import inspect import numpy as np import warnings @@ -379,7 +380,11 @@ def gradient(self): new_ivar = self._grad_ivar() # TODO(qili93): temporary for ascned npu performance to be removed along with npu_identity op - if 'npu' in get_all_custom_device_type(): + if ( + os.environ.get('FLAGS_npu_storage_format', None) + in [1, '1', True, 'True', 'true'] + and 'npu' in get_all_custom_device_type() + ): new_ivar = paddle.incubate._npu_identity(x=new_ivar, format=-1) new_ivar = new_ivar._copy_to(core.CPUPlace(), True) if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS: diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index d29f91d035f28..face92190c0f5 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + from paddle import _C_ops, _legacy_C_ops, get_flags, in_dynamic_mode from paddle.device import ( get_all_custom_device_type, @@ -149,7 +151,11 @@ def _conv_nd( new_shape[channel_dim] = -1 bias = bias.reshape(new_shape) # TODO(qili93): temporary for ascned npu performance to be removed along with npu_identity op - if 'npu' in get_all_custom_device_type(): + if ( + os.environ.get('FLAGS_npu_storage_format', None) + in [1, '1', True, 'True', 'true'] + and 'npu' in get_all_custom_device_type() + ): with no_grad(): bias_storage = _C_ops.npu_identity( bias, 3 @@ -747,7 +753,11 @@ def conv2d( + [1 for i in range(len(x.shape) - channel_dim - 1)], ) # TODO(qili93): temporary for ascned npu performance to be removed along with npu_identity op - if 'npu' in get_all_custom_device_type(): + if ( + os.environ.get('FLAGS_npu_storage_format', None) + in [1, '1', True, 'True', 'true'] + and 'npu' in get_all_custom_device_type() + ): with no_grad(): bias_storage = _C_ops.npu_identity( bias, 3 diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 64f9f8913313d..c0117560f25e2 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -28,6 +28,7 @@ # TODO: define normalization api import numbers +import os import warnings import numpy as np @@ -681,7 +682,11 @@ def __init__( self._variance.stop_gradient = True # TODO(qili93): temporary for ascned npu performance to be removed along with npu_identity op - if 'npu' in get_all_custom_device_type(): + if ( + os.environ.get('FLAGS_npu_storage_format', None) + in [1, '1', True, 'True', 'true'] + and 'npu' in get_all_custom_device_type() + ): with no_grad(): weight_trans = _C_ops.npu_identity( self.weight, 3 From 17879045f17c762592efaf29d09b565422d2b130 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Wed, 7 Dec 2022 13:09:50 +0800 Subject: [PATCH 21/60] optimize nchw<->nhwc kernel in fp16 model (#48692) --- paddle/phi/kernels/funcs/math_function.cu | 81 ++++++++++++++++++-- paddle/phi/kernels/funcs/math_function.h | 3 + paddle/phi/kernels/transfer_layout_kernel.cc | 26 +++++++ 3 files changed, 105 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index a0e59f8f3fe23..e1ab8922fd2b5 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -27,11 +27,83 @@ limitations under the License. */ namespace phi { namespace funcs { +// The following part of the code refers to NVIDIA-cutlass +// https://github.com/NVIDIA/cutlass/blob/master/tools/util/include/cutlass/util/device_nchw_to_nhwc.h +// Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights +// reserved. 
SPDX-License-Identifier: BSD-3-Clause
+template <typename T>
+__global__ void batch_transpose_kernel(
+    T* output, const T* input, const int batch, const int M, const int N) {
+  const int num = M * N;
+  // "+1" to avoid smem bank conflict
+  __shared__ T shbuf[32 * (32 + 1)];
+  const int32_t tid = threadIdx.y * blockDim.x + threadIdx.x;
+  const int32_t wid = tid / 32;
+  const int32_t lid = tid % 32;
+  const int32_t batch_i = blockIdx.z;
+  const int32_t mi0 = blockIdx.y * 32;
+  const int32_t ni0 = blockIdx.x * 32;
+
+  const size_t input_idx = batch_i * num + (mi0 + wid) * N + ni0;
+  const T* A = input + input_idx;
+  if (ni0 + lid < N) {
+    const int lid_x_33 = lid * 33;
+    if ((mi0 + 32) <= M) {
+      int mi = wid;  // between 0 and 7
+#pragma unroll
+      for (int mLoopIdx = 0; mLoopIdx < 4; mLoopIdx++) {
+        shbuf[lid_x_33 + mi] = A[lid];
+        A = &A[8 * N];
+        mi += 8;
+      }
+    } else {
+      for (int mi = wid; mi < 32; mi += 8) {
+        if ((mi + mi0) < M) {
+          shbuf[lid_x_33 + mi] = A[lid];
+        }
+        A = &A[8 * N];
+      }
+    }
+  }
+  __syncthreads();
+
+  const int32_t miOut = mi0 + lid;
+  output = &output[batch_i * num + miOut];
+  if (miOut < M) {
+    if (ni0 + 32 < N) {
+      int nI = wid;
+#pragma unroll
+      for (int nLoopIdx = 0; nLoopIdx < 4; ++nLoopIdx) {
+        output[(ni0 + nI) * M] = shbuf[(nI)*33 + lid];
+        nI += 8;
+      }
+    } else {
+      for (int nI = wid; nI < 32; nI += 8) {
+        if (ni0 + nI < N) {
+          output[(ni0 + nI) * M] = shbuf[(nI)*33 + lid];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void BatchTranspose(T* output, const T* input, int batch, int m, int n) {
+  dim3 grid((n + 31) / 32, (m + 31) / 32, batch);
+  dim3 block(32, 8);
+  batch_transpose_kernel<<<grid, block>>>(output, input, batch, m, n);
+}
+
 using float16 = phi::dtype::float16;
 using bfloat16 = phi::dtype::bfloat16;
 
-template struct SetConstant<phi::GPUContext, float16>;
-template struct SetConstant<phi::GPUContext, bfloat16>;
+template void BatchTranspose(
+    float16* output, const float16* input, int batch, int m, int n);
+template void BatchTranspose(
+    float* output, const float* input, int batch, int m, int n);
+
+template struct SetConstant<phi::GPUContext, float16>;
+template struct SetConstant<phi::GPUContext, bfloat16>;
 template struct SetConstant<phi::GPUContext, float>;
 template struct SetConstant<phi::GPUContext, double>;
 template struct SetConstant<phi::GPUContext, uint8_t>;
@@ -42,10 +114,9 @@ template struct SetConstant<phi::GPUContext, int16_t>;
 template struct SetConstant<phi::GPUContext, phi::dtype::complex<float>>;
 template struct SetConstant<phi::GPUContext, phi::dtype::complex<double>>;
+template struct SetConstant<phi::GPUPinnedContext, float16>;
 template struct SetConstant<phi::GPUPinnedContext,
-                            float16>;
-template struct SetConstant<phi::GPUPinnedContext, bfloat16>;
+                            bfloat16>;
 template struct SetConstant<phi::GPUPinnedContext, float>;
 template struct SetConstant<phi::GPUPinnedContext, double>;
 template struct SetConstant<phi::GPUPinnedContext, uint8_t>;
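Note: transfer_layout_kernel.cc in the next diff drives BatchTranspose by
treating a layout change as `batch` independent 2-D transposes: NCHW->NHWC
transposes a C x (H*W) row-major matrix per sample, NHWC->NCHW an (H*W) x C
one. A usage sketch under that reading (src and dst are assumed to be device
pointers already allocated with n*c*h*w elements; the dimensions are made up
for illustration):

    // NCHW -> NHWC for a [n, c, h, w] float tensor: n transposes of a
    // [c, h*w] matrix, matching row_len/col_len in transfer_layout below.
    const int n = 8, c = 64, h = 14, w = 14;
    phi::funcs::BatchTranspose(dst,     // out: laid out as [n, h, w, c]
                               src,     // in:  laid out as [n, c, h, w]
                               n,       // batch
                               c,       // M, rows of each matrix
                               h * w);  // N, columns of each matrix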
diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h
index 48649a454ae91..6f1cac49352e3 100644
--- a/paddle/phi/kernels/funcs/math_function.h
+++ b/paddle/phi/kernels/funcs/math_function.h
@@ -29,6 +29,9 @@ limitations under the License. */
 namespace phi {
 namespace funcs {
 
+template <typename T>
+void BatchTranspose(T* output, const T* input, int batch, int m, int n);
+
 template <typename DeviceContext, typename T>
 struct TransposeNormal {
   // for dims >= 7 situation
diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc
index d7b8d55707a6d..f2c57150c6246 100644
--- a/paddle/phi/kernels/transfer_layout_kernel.cc
+++ b/paddle/phi/kernels/transfer_layout_kernel.cc
@@ -70,6 +70,32 @@ void TransferLayoutGeneral(const Context& dev_ctx,
   out->Resize(phi::make_ddim(dst_dim));
   dev_ctx.Alloc(out, x.dtype());
 
+  // In GPU fp16 model, we will insert many transfer_layout ops in
+  // conv2d_fusion_layout_transfer_pass, so we optimize this kernel on GPU
+  if (std::is_same<Context, phi::GPUContext>::value) {
+    std::vector<int> axis_nchw_nhwc = {0, 2, 3, 1};
+    std::vector<int> axis_nhwc_nchw = {0, 3, 1, 2};
+    const int batch = src_dim[0];
+    int row_len = src_dim[1];
+    int col_len = src_dim[2] * src_dim[3];
+    if (axis == axis_nhwc_nchw) {
+      row_len = src_dim[1] * src_dim[2];
+      col_len = src_dim[3];
+    }
+    if (x.dtype() == phi::DataType::FLOAT16) {
+      funcs::BatchTranspose(out->data<phi::dtype::float16>(),
+                            x.data<phi::dtype::float16>(),
+                            batch,
+                            row_len,
+                            col_len);
+      return;
+    } else if (x.dtype() == phi::DataType::FLOAT32) {
+      funcs::BatchTranspose(
+          out->data<float>(), x.data<float>(), batch, row_len, col_len);
+      return;
+    }
+  }
+
   PD_VISIT_ALL_TYPES(x.dtype(), "CastDataLayout", ([&] {
                        CastDataLayout<data_t, Context>(dev_ctx, x, axis, out);
                      }));
From 87fbc5e418b1282b11cb18a32075a1ff2de8d4b8 Mon Sep 17 00:00:00 2001
From: feng_shuai
Date: Wed, 7 Dec 2022 13:12:50 +0800
Subject: [PATCH 22/60] fix: oss just support sm>=75 (#48731)

---
 paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index 0a238eadd95c9..2ae972729f5e1 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -400,7 +400,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
         }
       } else {
         if (input_dims.d[1] <= 384 && !bias_qk_attr &&
-            engine_->precision() != AnalysisConfig::Precision::kFloat32) {
+            engine_->precision() != AnalysisConfig::Precision::kFloat32 &&
+            platform::GetGPUComputeCapability(0) >= 75) {
           /*
            * input_dims.d[0]: batch(-1)
            * input_dims.d[1]: length:256
From 93b7ccf53d1364f49d2901e854b85c671a13b883 Mon Sep 17 00:00:00 2001
From: QingshuChen
Date: Wed, 7 Dec 2022 13:53:30 +0800
Subject: [PATCH 23/60] update kl1 op list and optimize matmul unitest for
 kunlun (#48775)

*test=kunlun
---
 paddle/phi/backends/xpu/xpu1_op_list.cc       | 276 +++++++++++++++++-
 paddle/phi/backends/xpu/xpu2_op_list.cc       |  22 +-
 .../unittests/xpu/test_matmul_v2_op_xpu.py    |  33 +--
 3 files changed, 294 insertions(+), 37 deletions(-)

diff --git a/paddle/phi/backends/xpu/xpu1_op_list.cc b/paddle/phi/backends/xpu/xpu1_op_list.cc
index 87c0502abba07..0a51baad7cf21 100644
--- a/paddle/phi/backends/xpu/xpu1_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu1_op_list.cc
@@ -21,10 +21,284 @@ namespace xpu {
 
 XPUOpMap& get_kl1_ops() {
   // ops supported by KL1, keyed by op_name and data_type
  static XPUOpMap s_xpu1_kernels{
      {"abs", XPUKernelSet({phi::DataType::FLOAT32})},
      {"accuracy", XPUKernelSet({phi::DataType::FLOAT32})},
      {"adam", XPUKernelSet({phi::DataType::FLOAT32})},
      {"adamw", XPUKernelSet({phi::DataType::FLOAT32})},
      {"affine_channel_grad", XPUKernelSet({phi::DataType::FLOAT32})},
      {"affine_channel",
XPUKernelSet({phi::DataType::FLOAT32})}, + {"arg_max", XPUKernelSet({phi::DataType::FLOAT32})}, + {"assign", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL})}, + {"batch_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"batch_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"bilinear_interp", XPUKernelSet({phi::DataType::FLOAT32})}, + {"bilinear_interp_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"bilinear_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"broadcast", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::INT64})}, + {"cast", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT64, + phi::DataType::INT32})}, + {"clip_by_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"coalesce_tensor", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32})}, + {"concat", XPUKernelSet({phi::DataType::FLOAT32})}, + {"concat_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"conv2d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"deformable_conv", XPUKernelSet({phi::DataType::FLOAT32})}, + {"deformable_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"dropout", XPUKernelSet({phi::DataType::FLOAT32})}, + {"dropout_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_allreduce_sum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"c_reduce_sum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_add", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_add_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_div_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_div", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_floordiv", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_max_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_max", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_min_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_min", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_mul_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_mul", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_pow", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_sub_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"elementwise_sub", XPUKernelSet({phi::DataType::FLOAT32})}, + {"equal", XPUKernelSet({phi::DataType::INT64})}, + {"expand_as_v2", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"expand_v2", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::BOOL, + phi::DataType::FLOAT16, + phi::DataType::FLOAT32})}, + {"fill_any_like", XPUKernelSet({phi::DataType::INT64})}, + {"fill_constant", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT64, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"gather_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gather", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gelu_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"gelu", 
XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_switch_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"hard_switch", XPUKernelSet({phi::DataType::FLOAT32})}, + {"iou_similarity", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lamb", XPUKernelSet({phi::DataType::FLOAT32})}, + {"layer_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"layer_norm", XPUKernelSet({phi::DataType::FLOAT32})}, + {"leaky_relu_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"leaky_relu", XPUKernelSet({phi::DataType::FLOAT32})}, + {"load", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT8, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"logicaland", + XPUKernelSet({phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::INT16, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"logicalnot", + XPUKernelSet({phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::INT16, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"logicalor", + XPUKernelSet({phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::INT16, + phi::DataType::INT32, + phi::DataType::INT64, + phi::DataType::FLOAT32})}, + {"log_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"logsumexp", XPUKernelSet({phi::DataType::FLOAT32})}, + {"log", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lookup_table_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"lookup_table_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"matmul_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"matmul_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"matmul_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"matmul", XPUKernelSet({phi::DataType::FLOAT32})}, + {"mean_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"mean", XPUKernelSet({phi::DataType::FLOAT32})}, + {"momentum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"mul_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"mul", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nearest_interp_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nearest_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nearest_interp", XPUKernelSet({phi::DataType::FLOAT32})}, + {"one_hot_v2", + XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"one_hot", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})}, + {"pool2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pool2d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pow", XPUKernelSet({phi::DataType::FLOAT32})}, + {"range", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::FLOAT32})}, + {"reduce_max_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_max", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_mean", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_mean_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_prod", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_sum_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_sum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"relu_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"relu", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reshape2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"reshape2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + 
phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"rmsprop", XPUKernelSet({phi::DataType::FLOAT32})}, + {"rnn_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"rnn", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roi_align_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"roi_align", XPUKernelSet({phi::DataType::FLOAT32})}, + {"scale", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sgd", XPUKernelSet({phi::DataType::FLOAT32})}, + {"shape", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::FLOAT32})}, + {"sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sigmoid", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sign", XPUKernelSet({phi::DataType::FLOAT32})}, + {"slice_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"slice", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"softmax_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"softmax_with_cross_entropy", XPUKernelSet({phi::DataType::FLOAT32})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"softmax", XPUKernelSet({phi::DataType::FLOAT32})}, + {"split", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT32})}, + {"sqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sqrt", XPUKernelSet({phi::DataType::FLOAT32})}, + {"square_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"square", XPUKernelSet({phi::DataType::FLOAT32})}, + {"squeeze2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"squeeze2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"squeeze_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"squeeze", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"stack", XPUKernelSet({phi::DataType::FLOAT32})}, + {"stack_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"sum", XPUKernelSet({phi::DataType::FLOAT32})}, + {"tanh_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"tanh", XPUKernelSet({phi::DataType::FLOAT32})}, + {"top_k", XPUKernelSet({phi::DataType::FLOAT32})}, + {"transpose2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"transpose2", XPUKernelSet({phi::DataType::FLOAT32})}, + {"transpose_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"transpose", XPUKernelSet({phi::DataType::FLOAT32})}, + {"truncated_gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})}, + {"unsqueeze2_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"unsqueeze2", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"unsqueeze_grad", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + 
phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"unsqueeze", + XPUKernelSet({phi::DataType::FLOAT64, + phi::DataType::INT64, + phi::DataType::INT32, + phi::DataType::BOOL, + phi::DataType::INT8, + phi::DataType::UINT8, + phi::DataType::FLOAT32})}, + {"where_index", XPUKernelSet({phi::DataType::BOOL})}, // AddMore }; - PD_THROW("get_kl1_ops unsupported"); return s_xpu1_kernels; } diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index b1357b32e38b5..8b19d3a5c4efa 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -2,10 +2,11 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or -agreed to in writing, software distributed under the License is distributed on -an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -or implied. See the License for the specific language governing permissions and + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU @@ -94,7 +95,8 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"clip", XPUKernelSet({phi::DataType::FLOAT32})}, {"clip_by_norm", XPUKernelSet({phi::DataType::FLOAT32})}, - {"coalesce_tensor", XPUKernelSet({phi::DataType::FLOAT32})}, + {"coalesce_tensor", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"concat_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"concat", @@ -525,6 +527,7 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT64, phi::DataType::BOOL, phi::DataType::FLOAT32})}, + {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"transpose2_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"transpose2", @@ -557,15 +560,6 @@ XPUOpMap& get_kl2_ops() { phi::DataType::UINT8, phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, - {"unsqueeze_with_xshape", - XPUKernelSet({phi::DataType::FLOAT64, - phi::DataType::INT64, - phi::DataType::INT32, - phi::DataType::BOOL, - phi::DataType::INT8, - phi::DataType::UINT8, - phi::DataType::FLOAT32, - phi::DataType::FLOAT16})}, {"unsqueeze_grad", XPUKernelSet({phi::DataType::FLOAT64, phi::DataType::INT64, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index c5d782400b3bb..6ad31224cd2d0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -190,8 +190,8 @@ class TestMatMulOp9(TestMatMulV2Op): """ def config(self): - self.x_shape = (100, 20, 100) - self.y_shape = (100, 100, 100) + self.x_shape = (5, 20, 7) + self.y_shape = (5, 7, 7) self.trans_x = False self.trans_y = True @@ -201,8 +201,8 @@ class TestMatMulOp10(TestMatMulV2Op): """ def config(self): - self.x_shape = (100, 20, 100) - self.y_shape = (100, 20, 100) + self.x_shape = (3, 20, 8) + self.y_shape = (3, 20, 8) self.trans_x = True self.trans_y = False @@ -212,8 +212,8 @@ class 
TestMatMulOp11(TestMatMulV2Op):
         """
 
         def config(self):
-            self.x_shape = (2, 20, 100)
-            self.y_shape = (100, 30)
+            self.x_shape = (2, 20, 11)
+            self.y_shape = (11, 30)
             self.trans_x = False
             self.trans_y = False
 
@@ -245,8 +245,8 @@ class TestMatMulOp14(TestMatMulV2Op):
         """
 
         def config(self):
-            self.x_shape = (100, 2, 100, 10)
-            self.y_shape = (100, 2, 10, 90)
+            self.x_shape = (7, 2, 100, 10)
+            self.y_shape = (7, 2, 10, 90)
             self.trans_x = False
             self.trans_y = False
 
@@ -256,22 +256,11 @@ class TestMatMulOp15(TestMatMulV2Op):
         """
 
         def config(self):
-            self.x_shape = (100, 2, 100, 10)
-            self.y_shape = (100, 2, 100, 10)
+            self.x_shape = (3, 2, 4, 10)
+            self.y_shape = (3, 2, 4, 10)
             self.trans_x = False
             self.trans_y = True
 
-    class TestMatMulOp16(TestMatMulV2Op):
-        """
-        case 16 : to check the big data
-        """
-
-        def config(self):
-            self.x_shape = (1000, 2, 100, 100)
-            self.y_shape = (1000, 2, 100, 900)
-            self.trans_x = False
-            self.trans_y = False
-
     class TestMatMulOp17(TestMatMulV2Op):
         """
         case 17 : to check the gradient for special case
@@ -289,7 +278,7 @@ class TestMatMulOp18(TestMatMulV2Op):
         """
 
         def config(self):
-            self.x_shape = (8, 111, 4, 17)
+            self.x_shape = (8, 11, 4, 17)
             self.y_shape = 17
             self.trans_x = False
             self.trans_y = False

From 693de9f05a5c5d0d5a52cf0102e02cb610451f92 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Wed, 7 Dec 2022 14:09:22 +0800
Subject: [PATCH 24/60] Fix accuracy fp16 kernel return fp32 tensor error
 (#48803)

---
 paddle/phi/kernels/gpu/accuracy_kernel.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
index ef3e5b9af2408..8a4aa2a6397c9 100644
--- a/paddle/phi/kernels/gpu/accuracy_kernel.cu
+++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -26,13 +26,13 @@ namespace phi {
 
 using phi::PADDLE_CUDA_NUM_THREADS;
 
-template <int BlockSize>
+template <int BlockSize, typename T>
 __global__ void AccuracyCudaKernel(const int N,
                                    const int D,
                                    const int64_t* Xdata,
                                    const int64_t* labeldata,
                                    int* correct_data,
-                                   float* accuracy,
+                                   T* accuracy,
                                    int* total_data) {
   int count = 0;
   __shared__ int total[BlockSize];
@@ -64,7 +64,7 @@ __global__ void AccuracyCudaKernel(const int N,
 #endif
   if (threadIdx.x == 0) {
     *correct_data = result;
-    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *accuracy = static_cast<T>(result) / static_cast<T>(N);
     *total_data = N;
   }
 }
@@ -84,18 +84,18 @@ void AccuracyRawKernel(const Context& dev_ctx,
 
   int* correct_data = dev_ctx.template Alloc<int>(correct);
   int* total_data = dev_ctx.template Alloc<int>(total);
-  float* accuracy_data = dev_ctx.template Alloc<float>(accuracy);
+  T* accuracy_data = dev_ctx.template Alloc<T>(accuracy);
 
   int num_samples = static_cast<int>(inference.dims()[0]);
   size_t infer_width = inference.dims()[1];
   auto stream = dev_ctx.stream();
-  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
+  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
 
   if (num_samples == 0) {
     return;
   }
 
-  AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS>
+  AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS, T>
       <<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(num_samples,
                                                   infer_width,
                                                   indices_data,

From 65420271609b8cce860ec8034569292db7d13d71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com>
Date: Wed, 7 Dec 2022 14:36:34 +0800
Subject: [PATCH 25/60] [phi::DenseTensor] Replace Tensor with
 phi::DenseTensor (#48682)

---
 .../fluid/imperative/gradient_accumulator.cc  |  10 +-
 paddle/fluid/operators/abs_op_mlu.cc          |   4 +-
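Before the rest of this commit's 419-file stat list, note that every file below receives the same mechanical rewrite: the file-local `using Tensor = phi::DenseTensor;` alias is deleted and the phi type is spelled out at each remaining use. A minimal sketch of the before/after pattern on a hypothetical kernel (`ExampleKernel` and its "Example" op are illustrative, not part of the patch):

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

// Before this commit, files wrote:
//   using Tensor = phi::DenseTensor;
//   auto* x = ctx.Input<Tensor>("X");
// After, the alias is gone and phi::DenseTensor is named directly.
template <typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    // Identity body, only to keep the sketch self-contained.
    framework::TensorCopy(*x, ctx.GetPlace(), out);
  }
};

}  // namespace operators
}  // namespace paddle

Because the rewrite is purely textual, no kernel behavior changes, which is why the commit can safely touch several hundred operator files at once.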
paddle/fluid/operators/abs_op_npu.cc | 2 - paddle/fluid/operators/activation_op_mlu.cc | 4 +- paddle/fluid/operators/activation_op_npu.cc | 218 +++++++++--------- paddle/fluid/operators/affine_grid_op.cc | 2 - .../amp/alloc_float_status_op_npu.cc | 2 - .../amp/check_finite_and_unscale_op_mlu.cc | 8 +- .../amp/check_finite_and_unscale_op_npu.cc | 10 +- .../check_finite_and_unscale_op_npu_test.cc | 4 +- .../amp/clear_float_status_op_npu.cc | 4 +- .../operators/amp/get_float_status_op_npu.cc | 4 +- .../amp/update_loss_scaling_op_npu.cc | 6 +- paddle/fluid/operators/arg_max_op_npu.cc | 3 +- paddle/fluid/operators/arg_min_op_npu.cc | 1 - paddle/fluid/operators/argsort_op_npu.cc | 35 ++- paddle/fluid/operators/attention_lstm_op.cc | 33 +-- paddle/fluid/operators/attention_lstm_op.h | 2 - paddle/fluid/operators/batch_norm_op.cc | 18 +- paddle/fluid/operators/batch_norm_op.cu | 1 - paddle/fluid/operators/batch_norm_op.h | 1 - paddle/fluid/operators/batch_norm_op_mlu.cc | 12 +- paddle/fluid/operators/batch_norm_op_npu.cc | 2 +- paddle/fluid/operators/bce_loss_op_mlu.cc | 2 - paddle/fluid/operators/bce_loss_op_npu.cc | 2 - paddle/fluid/operators/cast_op.cc | 2 +- paddle/fluid/operators/cast_op_mlu.cc | 2 - paddle/fluid/operators/cast_op_npu.cc | 2 - paddle/fluid/operators/center_loss_op.h | 3 +- paddle/fluid/operators/clip_by_norm_op.h | 1 - paddle/fluid/operators/clip_by_norm_op_npu.cc | 8 +- paddle/fluid/operators/clip_op_mlu.cc | 8 +- paddle/fluid/operators/clip_op_npu.cc | 10 +- paddle/fluid/operators/coalesce_tensor_op.cc | 2 +- .../operators/collective/c_allreduce_op.h | 5 +- .../c_softmax_with_cross_entropy_op.cu | 20 +- paddle/fluid/operators/concat_op.cc | 1 - paddle/fluid/operators/concat_op_mlu.cc | 4 +- .../operators/controlflow/logical_op_mlu.cc | 2 - .../operators/controlflow/logical_op_npu.cc | 2 - paddle/fluid/operators/conv_op.h | 2 - paddle/fluid/operators/conv_op_mlu.cc | 33 ++- paddle/fluid/operators/conv_op_npu.cc | 29 +-- .../fluid/operators/conv_transpose_op_mlu.cc | 17 +- .../fluid/operators/conv_transpose_op_npu.cc | 9 +- paddle/fluid/operators/copy_cross_scope_op.cc | 2 - paddle/fluid/operators/correlation_op.cc | 2 - paddle/fluid/operators/cos_sim_op.h | 6 +- paddle/fluid/operators/crop_op_npu.cc | 6 +- paddle/fluid/operators/cross_entropy_op.h | 6 +- paddle/fluid/operators/ctc_align_op.h | 2 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 49 ++-- paddle/fluid/operators/cumsum_op_mlu.cc | 4 +- paddle/fluid/operators/cumsum_op_npu.cc | 8 +- paddle/fluid/operators/cvm_op.cc | 2 - paddle/fluid/operators/cvm_op.cu | 1 - paddle/fluid/operators/cvm_op.h | 2 - paddle/fluid/operators/data_norm_op.cc | 13 +- paddle/fluid/operators/data_norm_op.cu | 3 +- .../fluid/operators/deformable_conv_op_mlu.cc | 30 ++- .../operators/deformable_psroi_pooling_op.cu | 1 - .../operators/deformable_psroi_pooling_op.h | 4 +- .../fluid/operators/detection/bbox_util.cu.h | 16 +- .../operators/detection/bipartite_match_op.cc | 4 +- .../fluid/operators/detection/box_clip_op.cu | 1 - .../fluid/operators/detection/box_clip_op.h | 9 +- .../operators/detection/box_coder_op_npu.cc | 135 +++++------ .../detection/collect_fpn_proposals_op.cc | 1 - .../detection/collect_fpn_proposals_op.cu | 26 +-- .../detection/density_prior_box_op_npu.cc | 71 +++--- .../detection/generate_mask_labels_op.cc | 58 ++--- .../detection/generate_proposal_labels_op.cc | 62 ++--- .../detection/generate_proposals_op.cc | 42 ++-- .../detection/generate_proposals_op.cu | 40 ++-- .../detection/generate_proposals_v2_op.cc | 2 - 
.../detection/iou_similarity_op_mlu.cc | 50 ++-- .../detection/iou_similarity_op_npu.cc | 50 ++-- .../detection/locality_aware_nms_op.cc | 10 +- .../operators/detection/matrix_nms_op.cc | 2 - .../operators/detection/multiclass_nms_op.cc | 10 +- .../detection/polygon_box_transform_op.cc | 2 - .../detection/polygon_box_transform_op.cu | 1 - .../operators/detection/prior_box_op_npu.cc | 8 +- .../retinanet_detection_output_op.cc | 47 ++-- .../detection/roi_perspective_transform_op.cc | 14 +- .../detection/rpn_target_assign_op.cc | 138 +++++------ .../detection/sigmoid_focal_loss_op.cu | 21 +- .../detection/sigmoid_focal_loss_op.h | 21 +- .../operators/detection/yolo_box_op_mlu.cc | 2 +- paddle/fluid/operators/detection_map_op.cc | 2 - paddle/fluid/operators/dgc_clip_by_norm_op.h | 2 - paddle/fluid/operators/dropout_op_mlu.cc | 8 +- paddle/fluid/operators/dropout_op_npu.cc | 16 +- .../elementwise/elementwise_add_op_mlu.cc | 1 - .../elementwise/elementwise_add_op_npu.cc | 7 +- .../elementwise/elementwise_div_op.h | 1 - .../elementwise/elementwise_div_op_mlu.cc | 8 +- .../elementwise/elementwise_div_op_npu.cc | 22 +- .../elementwise_floordiv_op_npu.cc | 2 - .../elementwise/elementwise_max_op_npu.cc | 18 +- .../elementwise/elementwise_min_op_mlu.cc | 2 - .../elementwise/elementwise_min_op_npu.cc | 16 +- .../operators/elementwise/elementwise_mlu.h | 6 +- .../elementwise/elementwise_mod_op_npu.cc | 4 +- .../elementwise/elementwise_mul_op.h | 1 - .../elementwise/elementwise_mul_op_mlu.cc | 5 +- .../elementwise/elementwise_mul_op_npu.cc | 9 +- .../operators/elementwise/elementwise_npu.h | 9 +- .../operators/elementwise/elementwise_op.h | 6 - .../elementwise/elementwise_pow_op_mlu.cc | 12 +- .../elementwise/elementwise_pow_op_npu.cc | 32 ++- .../elementwise/elementwise_sub_op_mlu.cc | 2 - .../elementwise/elementwise_sub_op_npu.cc | 8 +- paddle/fluid/operators/expand_as_op.h | 1 - paddle/fluid/operators/expand_as_v2_op.h | 1 - paddle/fluid/operators/expand_as_v2_op_mlu.cc | 2 - paddle/fluid/operators/expand_op.h | 1 - paddle/fluid/operators/expand_v2_op_npu.cc | 9 +- paddle/fluid/operators/eye_op_npu.cc | 2 - paddle/fluid/operators/fc_op.h | 1 - .../fill_constant_batch_size_like_op_npu.cc | 4 +- .../fluid/operators/fill_constant_op_mlu.cc | 3 +- paddle/fluid/operators/filter_by_instag_op.cu | 1 - paddle/fluid/operators/filter_by_instag_op.h | 1 - paddle/fluid/operators/flatten_op.cc | 2 - paddle/fluid/operators/flatten_op_npu.cc | 2 - paddle/fluid/operators/fsp_op.h | 2 - paddle/fluid/operators/fused/attn_gemm.h | 1 - paddle/fluid/operators/fused/attn_gemm_int8.h | 1 - .../fluid/operators/fused/conv_fusion_op.cu | 10 +- .../operators/fused/cudnn_bn_add_relu_test.cc | 171 +++++++------- .../fused/cudnn_bn_stats_finalize.cu.h | 21 +- .../operators/fused/cudnn_norm_conv.cu.h | 21 +- .../operators/fused/cudnn_norm_conv_test.cc | 13 +- .../fused/cudnn_scale_bias_add_relu.cu.h | 39 ++-- paddle/fluid/operators/fused/fmha_ref.h | 2 - .../operators/fused/fused_attention_op.cc | 2 - .../operators/fused/fused_attention_op.cu | 8 +- .../operators/fused/fused_attention_op_xpu.cc | 157 +++++++------ ...sed_bias_dropout_residual_layer_norm_op.cc | 2 - ...sed_bias_dropout_residual_layer_norm_op.cu | 2 - .../operators/fused/fused_bn_activation_op.cc | 6 +- .../operators/fused/fused_bn_activation_op.cu | 5 +- .../operators/fused/fused_bn_activation_op.h | 1 - .../fused/fused_bn_add_activation_op.cc | 6 +- .../fused/fused_bn_add_activation_op.cu | 5 +- .../fused/fused_bn_add_activation_op.h | 1 - 
.../fused_embedding_eltwise_layernorm_op.cu | 1 - .../fused/fused_embedding_fc_lstm_op.cc | 21 +- .../fused/fused_embedding_fc_lstm_op.h | 2 - .../fused/fused_embedding_seq_pool_op.h | 5 +- .../operators/fused/fused_feedforward_op.cc | 1 - .../operators/fused/fused_feedforward_op.cu | 2 - .../fused/fused_feedforward_op_xpu.cc | 194 ++++++++-------- .../operators/fused/fused_gate_attention.h | 36 ++- .../fused/fused_gate_attention_op.cc | 1 - .../fused/fused_gate_attention_op.cu | 93 ++++---- .../operators/fused/fused_gemm_epilogue_op.cc | 1 - .../operators/fused/fused_gemm_epilogue_op.cu | 2 - .../fused/fused_gemm_epilogue_op_xpu.cc | 2 - .../fused/fused_multi_transformer_int8_op.cc | 4 +- .../fused/fused_multi_transformer_int8_op.cu | 38 +-- .../fused/fused_multi_transformer_op.cc | 4 +- .../fused/fused_multi_transformer_op.cu | 92 ++++---- .../fused/fused_multi_transformer_op.cu.h | 12 +- .../fused/fusion_conv_inception_op.cu | 1 - paddle/fluid/operators/fused/fusion_gru_op.cc | 29 +-- paddle/fluid/operators/fused/fusion_gru_op.h | 2 - .../fluid/operators/fused/fusion_lstm_op.cc | 21 +- paddle/fluid/operators/fused/fusion_lstm_op.h | 2 - .../fused/fusion_repeated_fc_relu_op.cc | 8 +- .../fused/fusion_repeated_fc_relu_op.h | 2 - .../fused/fusion_seqconv_eltadd_relu_op.cc | 20 +- .../fused/fusion_seqconv_eltadd_relu_op.h | 2 - .../fused/fusion_seqexpand_concat_fc_op.cc | 7 +- .../fused/fusion_seqexpand_concat_fc_op.h | 2 - .../fused/fusion_seqpool_concat_op.h | 2 - .../fused/fusion_seqpool_cvm_concat_op.cc | 3 +- .../fused/fusion_seqpool_cvm_concat_op.h | 2 - .../fused/fusion_squared_mat_sub_op.cc | 12 +- .../fused/fusion_squared_mat_sub_op.h | 2 - .../operators/fused/multihead_matmul_op.cu | 11 +- .../operators/fused/resnet_basic_block_op.cc | 1 - .../fused/resnet_basic_block_op_xpu.cc | 2 - .../fluid/operators/fused/resnet_unit_op.cc | 2 - .../fluid/operators/fused/resnet_unit_op.cu | 133 ++++++----- .../operators/fused/resnet_unit_op_xpu.cc | 101 ++++---- .../operators/fused/skip_layernorm_op.cu | 1 - .../fused/xpu_fused_common_function.h | 15 +- .../fluid/operators/fused/yolo_box_head_op.cu | 1 - .../fluid/operators/fused/yolo_box_post_op.cu | 1 - paddle/fluid/operators/gather_nd_op_mlu.cc | 2 - paddle/fluid/operators/gather_nd_op_npu.cc | 1 - .../fluid/operators/gather_scatter_kernel.cc | 24 +- .../fluid/operators/gather_scatter_kernel.cu | 26 +-- .../fluid/operators/gather_scatter_kernel.h | 50 ++-- paddle/fluid/operators/gaussian_random_op.cc | 2 - .../fluid/operators/gaussian_random_op_mlu.cc | 3 +- .../fluid/operators/gaussian_random_op_npu.cc | 3 +- paddle/fluid/operators/gelu_op_npu.cc | 2 - .../fluid/operators/graph_khop_sampler_op.cu | 2 - .../fluid/operators/graph_khop_sampler_op.h | 2 - paddle/fluid/operators/grid_sampler_op_mlu.cc | 6 +- paddle/fluid/operators/group_norm_op.cc | 15 +- paddle/fluid/operators/group_norm_op.cu | 10 +- paddle/fluid/operators/group_norm_op.h | 1 - paddle/fluid/operators/group_norm_op_npu.cc | 47 ++-- paddle/fluid/operators/gru_op.cc | 14 +- paddle/fluid/operators/gru_op.cu.cc | 9 +- paddle/fluid/operators/gru_op.h | 21 +- paddle/fluid/operators/gru_unit_op.h | 6 +- paddle/fluid/operators/huber_loss_op_mlu.cc | 26 +-- paddle/fluid/operators/huber_loss_op_npu.cc | 6 +- paddle/fluid/operators/im2sequence_op.h | 37 ++- paddle/fluid/operators/index_sample_op_npu.cc | 5 +- paddle/fluid/operators/index_select_op.h | 1 - paddle/fluid/operators/index_select_op_npu.cc | 8 +- paddle/fluid/operators/inplace_abn_op.cc | 10 +- 
paddle/fluid/operators/inplace_abn_op.cu | 6 +- paddle/fluid/operators/inplace_abn_op.h | 1 - paddle/fluid/operators/instance_norm_op.cc | 12 +- paddle/fluid/operators/instance_norm_op.h | 1 - .../fluid/operators/instance_norm_op_npu.cc | 3 +- paddle/fluid/operators/interpolate_op.cu | 16 +- paddle/fluid/operators/interpolate_op.h | 3 +- paddle/fluid/operators/interpolate_op_npu.cc | 5 +- .../fluid/operators/interpolate_v2_op_mlu.cc | 4 +- .../fluid/operators/interpolate_v2_op_npu.cc | 72 +++--- paddle/fluid/operators/jit/benchmark.cc | 35 ++- paddle/fluid/operators/kldiv_loss_op_npu.cc | 4 +- paddle/fluid/operators/label_smooth_op_mlu.cc | 2 - paddle/fluid/operators/label_smooth_op_npu.cc | 10 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 1 - paddle/fluid/operators/layer_norm_op.cc | 7 +- paddle/fluid/operators/layer_norm_op_mlu.cc | 17 +- paddle/fluid/operators/layer_norm_op_npu.cc | 33 ++- paddle/fluid/operators/layout_utils.h | 2 - .../fluid/operators/limit_by_capacity_op.cu | 2 - paddle/fluid/operators/log_loss_op_npu.cc | 2 - paddle/fluid/operators/log_loss_op_xpu.cc | 2 - .../fluid/operators/lookup_table_dequant_op.h | 1 - paddle/fluid/operators/lookup_table_op.h | 1 - paddle/fluid/operators/lookup_table_v2_op.h | 11 +- .../fluid/operators/lookup_table_v2_op_mlu.cc | 4 +- .../fluid/operators/lookup_table_v2_op_npu.cc | 11 +- paddle/fluid/operators/lrn_op.h | 3 - paddle/fluid/operators/lstm_op.h | 40 ++-- paddle/fluid/operators/lstmp_op.h | 47 ++-- .../fluid/operators/masked_select_op_mlu.cc | 14 +- .../fluid/operators/match_matrix_tensor_op.cc | 3 +- .../fluid/operators/match_matrix_tensor_op.h | 1 - paddle/fluid/operators/math/context_project.h | 45 ++-- .../operators/math/eigen_values_vectors.h | 26 +-- paddle/fluid/operators/math/sample_prob.cu | 4 +- paddle/fluid/operators/math/sample_prob.h | 2 - .../fluid/operators/math/sequence_pooling.cc | 5 +- paddle/fluid/operators/math/softmax.cu | 5 +- paddle/fluid/operators/math/tree2col.cu | 9 +- paddle/fluid/operators/matmul_op_mlu.cc | 10 +- paddle/fluid/operators/matmul_op_npu.cc | 27 ++- paddle/fluid/operators/matmul_v2_op_mlu.cc | 10 +- paddle/fluid/operators/matmul_v2_op_npu.cc | 21 +- paddle/fluid/operators/mean_iou_op.h | 7 +- paddle/fluid/operators/mean_op_mlu.cc | 17 +- paddle/fluid/operators/mean_op_npu.cc | 21 +- paddle/fluid/operators/meshgrid_op_mlu.cc | 12 +- .../operators/metrics/accuracy_op_mlu.cc | 14 +- .../operators/metrics/accuracy_op_xpu.cc | 1 - .../operators/metrics/precision_recall_op.h | 1 - .../operators/mkldnn/dequantize_mkldnn_op.cc | 1 - .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 49 ++-- .../operators/mkldnn/quantize_mkldnn_op.cc | 1 - .../operators/mkldnn/requantize_mkldnn_op.cc | 1 - .../operators/mkldnn/reshape_mkldnn_op.cc | 2 +- .../operators/mkldnn/transpose_mkldnn_op.cc | 10 +- paddle/fluid/operators/mlu/mlu_baseop.cc | 128 +++++----- paddle/fluid/operators/mlu/mlu_baseop.h | 13 +- .../fluid/operators/modified_huber_loss_op.cu | 2 - .../fluid/operators/modified_huber_loss_op.h | 1 - paddle/fluid/operators/multi_dot_op.cc | 1 - paddle/fluid/operators/multinomial_op_npu.cc | 2 - paddle/fluid/operators/multiplex_op.cc | 2 - paddle/fluid/operators/nce_op.h | 11 +- paddle/fluid/operators/norm_op_npu.cc | 1 - paddle/fluid/operators/norm_utils.cu.h | 29 ++- paddle/fluid/operators/number_count_op.cu | 2 - paddle/fluid/operators/one_hot_op.h | 1 - paddle/fluid/operators/one_hot_op_npu.cc | 3 +- paddle/fluid/operators/one_hot_op_xpu.cc | 2 - paddle/fluid/operators/one_hot_v2_op_mlu.cc | 13 +- 
paddle/fluid/operators/one_hot_v2_op_npu.cc | 3 +- .../fluid/operators/optimizers/adadelta_op.cc | 2 - .../fluid/operators/optimizers/adagrad_op.cc | 1 - paddle/fluid/operators/optimizers/adam_op.h | 2 - .../fluid/operators/optimizers/adam_op_mlu.cc | 14 +- .../fluid/operators/optimizers/adam_op_npu.cc | 14 +- .../fluid/operators/optimizers/adamax_op.cc | 1 - .../optimizers/decayed_adagrad_op.cc | 1 - paddle/fluid/operators/optimizers/dpsgd_op.cc | 1 - paddle/fluid/operators/optimizers/ftrl_op.cc | 1 - paddle/fluid/operators/optimizers/ftrl_op.h | 1 - .../operators/optimizers/merged_adam_op.cc | 2 - .../optimizers/merged_momentum_op_mlu.cc | 5 +- .../fluid/operators/optimizers/momentum_op.cc | 14 +- .../operators/optimizers/momentum_op_mlu.cc | 4 +- .../optimizers/proximal_adagrad_op.cc | 1 - .../optimizers/proximal_adagrad_op.h | 2 - .../operators/optimizers/proximal_gd_op.cc | 1 - .../operators/optimizers/proximal_gd_op.h | 2 - .../operators/optimizers/rmsprop_op_npu.cc | 14 +- .../optimizers/sparse_momentum_op.cc | 19 +- paddle/fluid/operators/p_norm_op_npu.cc | 21 +- paddle/fluid/operators/pad3d_op_npu.cc | 2 - paddle/fluid/operators/pad_op_npu.cc | 2 - paddle/fluid/operators/partial_concat_op.cc | 1 - paddle/fluid/operators/partial_concat_op.cu | 4 +- paddle/fluid/operators/partial_concat_op.h | 1 - paddle/fluid/operators/partial_sum_op.cc | 1 - paddle/fluid/operators/partial_sum_op.cu | 6 +- paddle/fluid/operators/partial_sum_op.h | 2 - paddle/fluid/operators/pool_op.cc | 8 +- paddle/fluid/operators/pool_op.h | 2 - paddle/fluid/operators/pool_op_mlu.cc | 12 +- .../operators/positive_negative_pair_op.h | 2 - paddle/fluid/operators/prelu_op.cc | 2 - paddle/fluid/operators/prroi_pool_op.cc | 2 - paddle/fluid/operators/prroi_pool_op.cu | 2 - paddle/fluid/operators/pyramid_hash_op.cc | 1 - paddle/fluid/operators/random_routing_op.cu | 2 - paddle/fluid/operators/rank_attention_op.cc | 1 - .../operators/reduce_ops/reduce_any_op_npu.cc | 1 - .../reduce_ops/reduce_any_op_npu_test.cc | 2 - .../operators/reduce_ops/reduce_max_op_mlu.cc | 20 +- .../operators/reduce_ops/reduce_max_op_npu.cc | 15 +- .../reduce_ops/reduce_mean_op_mlu.cc | 2 +- .../reduce_ops/reduce_mean_op_npu.cc | 6 +- .../operators/reduce_ops/reduce_min_op_npu.cc | 5 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 9 +- .../operators/reduce_ops/reduce_op_function.h | 1 - .../reduce_ops/reduce_prod_op_npu.cc | 1 - .../operators/reduce_ops/reduce_sum_op.h | 2 +- .../operators/reduce_ops/reduce_sum_op_mlu.cc | 2 +- .../operators/reduce_ops/reduce_sum_op_npu.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/rnn_op_mlu.cc | 3 +- paddle/fluid/operators/roi_align_op.cc | 2 - paddle/fluid/operators/roi_align_op_mlu.cc | 24 +- paddle/fluid/operators/roi_align_op_npu.cc | 11 +- paddle/fluid/operators/roi_pool_op.cc | 2 - paddle/fluid/operators/sample_logits_op.cu | 15 +- paddle/fluid/operators/sample_logits_op.h | 16 +- paddle/fluid/operators/sampling_id_op.cc | 2 - paddle/fluid/operators/sampling_id_op.h | 2 - paddle/fluid/operators/save_combine_op.cc | 2 - paddle/fluid/operators/scatter_op_mlu.cc | 2 +- paddle/fluid/operators/scatter_op_npu.cc | 10 +- paddle/fluid/operators/search_compute.h | 1 - paddle/fluid/operators/seed_op.cc | 1 - paddle/fluid/operators/seed_op.h | 1 - paddle/fluid/operators/set_value_op.cc | 27 ++- paddle/fluid/operators/set_value_op.h | 1 - paddle/fluid/operators/set_value_op_mlu.cc | 8 +- paddle/fluid/operators/set_value_op_npu.cc | 6 +- paddle/fluid/operators/shape_op_mlu.cc | 3 +- 
paddle/fluid/operators/shape_op_npu.cc | 2 - paddle/fluid/operators/shard_index_op_npu.cc | 11 +- paddle/fluid/operators/shuffle_batch_op.h | 1 - paddle/fluid/operators/shuffle_channel_op.cu | 1 - ...igmoid_cross_entropy_with_logits_op_mlu.cc | 1 - ...igmoid_cross_entropy_with_logits_op_npu.cc | 1 - paddle/fluid/operators/similarity_focus_op.h | 1 - paddle/fluid/operators/slice_op.cc | 6 +- paddle/fluid/operators/slice_op_mlu.cc | 2 - paddle/fluid/operators/slice_op_npu.cc | 3 +- paddle/fluid/operators/smooth_l1_loss_op.h | 7 +- .../fluid/operators/smooth_l1_loss_op_npu.cc | 22 +- .../softmax_with_cross_entropy_op_mlu.cc | 2 - .../softmax_with_cross_entropy_op_npu.cc | 6 +- paddle/fluid/operators/space_to_depth_op.cc | 2 - paddle/fluid/operators/sparse_attention_op.cu | 48 ++-- paddle/fluid/operators/split_op_mlu.cc | 2 - paddle/fluid/operators/split_op_npu.cc | 4 +- .../fluid/operators/squared_l2_distance_op.h | 2 - .../fluid/operators/squared_l2_norm_op_mlu.cc | 6 +- .../fluid/operators/squared_l2_norm_op_npu.cc | 8 +- paddle/fluid/operators/stack_op_mlu.cc | 10 +- paddle/fluid/operators/stack_op_npu.cc | 18 +- paddle/fluid/operators/stft_op.h | 16 +- paddle/fluid/operators/strided_slice_op.cc | 6 +- .../fluid/operators/strided_slice_op_mlu.cc | 9 +- .../fluid/operators/strided_slice_op_npu.cc | 31 ++- paddle/fluid/operators/sum_op_mlu.cc | 3 +- paddle/fluid/operators/sum_op_npu.cc | 3 +- paddle/fluid/operators/svd_helper.h | 38 +-- .../fluid/operators/sync_batch_norm_op_mlu.cc | 29 ++- .../fluid/operators/sync_batch_norm_op_npu.cc | 142 ++++++------ .../fluid/operators/take_along_axis_op_npu.cc | 2 - paddle/fluid/operators/tdm_child_op.h | 1 - paddle/fluid/operators/tdm_sampler_op.h | 1 - .../teacher_student_sigmoid_loss_op.cc | 12 +- .../teacher_student_sigmoid_loss_op.h | 1 - paddle/fluid/operators/temporal_shift_op.h | 1 - paddle/fluid/operators/tile_op_mlu.cc | 2 - paddle/fluid/operators/tile_op_npu.cc | 1 - paddle/fluid/operators/top_k_op.cu | 4 +- paddle/fluid/operators/top_k_op.h | 2 - paddle/fluid/operators/top_k_op_npu.cc | 2 +- paddle/fluid/operators/top_k_op_xpu.cc | 1 - paddle/fluid/operators/tree_conv_op.h | 13 +- .../truncated_gaussian_random_op_npu.cc | 14 +- paddle/fluid/operators/uniform_random_op.cc | 2 +- paddle/fluid/operators/uniform_random_op.cu | 3 +- paddle/fluid/operators/uniform_random_op.h | 1 - .../fluid/operators/uniform_random_op_mlu.cc | 5 +- .../fluid/operators/uniform_random_op_npu.cc | 5 +- paddle/fluid/operators/var_conv_2d_op.cc | 37 +-- paddle/fluid/operators/var_conv_2d_op.h | 1 - paddle/fluid/operators/where_index_op_mlu.cc | 6 +- paddle/fluid/operators/where_index_op_npu.cc | 12 +- 419 files changed, 2450 insertions(+), 2880 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b57c874ceebe0..c1838ee201d45 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -644,11 +644,11 @@ void GradientAccumulator::CallGradientHooks() { true, platform::errors::PreconditionNotMet( "Only can call gradient hooks after sum gradient completed.")); - PADDLE_ENFORCE_EQ( - HasInnerVar(), - true, - platform::errors::PreconditionNotMet( - "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), + true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when " + "call gradient hook.")); PADDLE_ENFORCE_EQ( inner_var_->Var().IsInitialized(), true, diff --git 
a/paddle/fluid/operators/abs_op_mlu.cc b/paddle/fluid/operators/abs_op_mlu.cc
index 9afa4c28e0544..e635b9547b4fc 100644
--- a/paddle/fluid/operators/abs_op_mlu.cc
+++ b/paddle/fluid/operators/abs_op_mlu.cc
@@ -18,8 +18,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class AbsMLUKernel : public framework::OpKernel<T> {
  public:
@@ -54,7 +52,7 @@ class AbsGradMLUKernel : public framework::OpKernel<T> {
     MLUCnnlOpTensorDesc mul_op_desc(
         CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
 
-    Tensor sign_x;
+    phi::DenseTensor sign_x;
     sign_x.mutable_data<T>(x->dims(), ctx.GetPlace());
 
     MLUCnnl::Sign(ctx,
diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc
index a1ca88ae5b572..47c88abb9ede1 100644
--- a/paddle/fluid/operators/abs_op_npu.cc
+++ b/paddle/fluid/operators/abs_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class AbsNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc
index 736b398996b45..f26af0a5b9743 100644
--- a/paddle/fluid/operators/activation_op_mlu.cc
+++ b/paddle/fluid/operators/activation_op_mlu.cc
@@ -21,8 +21,6 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
-
 template <typename T>
 class ActivationMLUKernel : public framework::OpKernel<T> {
  public:
@@ -442,7 +440,7 @@ class ReciprocalGradMLUKernel : public framework::OpKernel<T> {
     auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto place = ctx.GetPlace();
     dx->mutable_data<T>(place);
-    Tensor square_out;
+    phi::DenseTensor square_out;
     square_out.Resize(out->dims());
     square_out.mutable_data<T>(place);
     MLUCnnlTensorDesc out_desc(*out);
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
index 3c6e207b971bc..b471c08d39ce9 100644
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ b/paddle/fluid/operators/activation_op_npu.cc
@@ -24,14 +24,12 @@ limitations under the Licnse.
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PowNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto factor = ctx.Attr("factor"); out->mutable_data(ctx.GetPlace()); @@ -54,9 +52,9 @@ template class PowGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto factor = ctx.Attr("factor"); auto x_dims = x->dims(); @@ -69,7 +67,7 @@ class PowGradNPUKernel : public framework::OpKernel { // NOTE(liym27): dx = dout * factor * x.pow(factor-1) // Step1: Compute x_pow = x.pow(factor-1) - Tensor x_pow(x->type()); + phi::DenseTensor x_pow(x->type()); x_pow.mutable_data(x->dims(), place); const auto& runner_pow = NpuOpRunner( "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); @@ -78,13 +76,13 @@ class PowGradNPUKernel : public framework::OpKernel { // Step 2: Construct a broadcast factor, which has the same shape with x. // 2.1 Get a factor tensor with shape [1]. - Tensor factor_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor factor_tensor(experimental::DataType::FLOAT32); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, factor); // 2.2 Get the factor which has the shape with x and the same value with // factor. - Tensor factor_bc_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor factor_bc_tensor(experimental::DataType::FLOAT32); factor_bc_tensor.mutable_data(x_dims, place); const auto& runner_bc = NpuOpRunner("FillD", {factor_tensor}, @@ -93,7 +91,7 @@ class PowGradNPUKernel : public framework::OpKernel { runner_bc.Run(stream); // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) - Tensor x_power_mul_factor(x->type()); + phi::DenseTensor x_power_mul_factor(x->type()); x_power_mul_factor.mutable_data(x->dims(), place); const auto& runner_mul_1 = NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); @@ -111,8 +109,8 @@ template class ReluNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -134,9 +132,9 @@ template class ReluGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto stream = ctx.template device_context() @@ -153,8 +151,8 @@ template class Relu6NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -176,9 +174,9 @@ 
template class Relu6GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto stream = ctx.template device_context() @@ -195,9 +193,9 @@ template class SqrtNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -216,8 +214,8 @@ template class LeakyReluNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto alpha = ctx.Attr("alpha"); out->mutable_data(ctx.GetPlace()); @@ -236,9 +234,9 @@ template class LeakyReluGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto alpha = ctx.Attr("alpha"); auto stream = @@ -257,10 +255,10 @@ template class SqrtGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -279,9 +277,9 @@ template class LogNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -291,12 +289,12 @@ class LogNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor one(x->type()); + phi::DenseTensor one(x->type()); one.mutable_data(x->dims(), place); const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); runner_one.Run(stream); - Tensor sub(x->type()); + phi::DenseTensor sub(x->type()); sub.mutable_data(x->dims(), place); const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); runner_sub.Run(stream); @@ -310,10 +308,10 @@ template class LogGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -331,9 +329,9 @@ template class TanhNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); 
+ auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -352,10 +350,10 @@ template class TanhGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -374,9 +372,9 @@ template class SquareNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -395,9 +393,9 @@ template class SquareGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto factor = static_cast(2.0); @@ -406,7 +404,7 @@ class SquareGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); // Step 1: Compute x_muls_factor = factor * x - Tensor x_muls_factor(x->type()); + phi::DenseTensor x_muls_factor(x->type()); x_muls_factor.mutable_data(x->dims(), place); const auto& runner_muls_1 = NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); @@ -424,9 +422,9 @@ template class SigmoidNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -445,10 +443,10 @@ template class SigmoidGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -469,8 +467,8 @@ template class SwishNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); float beta = ctx.Attr("beta"); out->mutable_data(ctx.GetPlace()); @@ -494,9 +492,9 @@ template class SwishGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float beta = ctx.Attr("beta"); dx->mutable_data(ctx.GetPlace()); @@ -504,7 +502,7 @@ class SwishGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor beta_x, 
sigmoid_out, swish_out; + phi::DenseTensor beta_x, sigmoid_out, swish_out; beta_x.mutable_data(x->dims(), ctx.GetPlace()); sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); swish_out.mutable_data(x->dims(), ctx.GetPlace()); @@ -543,8 +541,8 @@ template class HardSwishNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); float threshold = ctx.Attr("threshold"); float scale = ctx.Attr("scale"); @@ -558,25 +556,25 @@ class HardSwishNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor tensor_offset(x->type()); + phi::DenseTensor tensor_offset(x->type()); tensor_offset.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - Tensor add_offset_val(x->type()); + phi::DenseTensor add_offset_val(x->type()); add_offset_val.mutable_data(x->dims(), place); const auto& runner_add = NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); runner_add.Run(stream); - Tensor tensor_threshold(x->type()); + phi::DenseTensor tensor_threshold(x->type()); tensor_threshold.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); - Tensor tensor_zero(x->type()); + phi::DenseTensor tensor_zero(x->type()); tensor_zero.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); - Tensor clip_val(x->type()); + phi::DenseTensor clip_val(x->type()); clip_val.mutable_data(x->dims(), place); const auto& runner_clip = NpuOpRunner("ClipByValue", @@ -584,10 +582,10 @@ class HardSwishNPUKernel : public framework::OpKernel { {clip_val}); runner_clip.Run(stream); - Tensor tensor_scale_tmp(x->type()); + phi::DenseTensor tensor_scale_tmp(x->type()); tensor_scale_tmp.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); - Tensor tensor_scale(x->type()); + phi::DenseTensor tensor_scale(x->type()); tensor_scale.mutable_data(x->dims(), place); const auto& runner_fill = NpuOpRunner("FillD", @@ -596,7 +594,7 @@ class HardSwishNPUKernel : public framework::OpKernel { {{"dims", phi::vectorize(x->dims())}}); runner_fill.Run(stream); - Tensor div_val(x->type()); + phi::DenseTensor div_val(x->type()); div_val.mutable_data(x->dims(), place); const auto& runner_div = NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); @@ -611,9 +609,9 @@ template class HardSwishGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float threshold = ctx.Attr("threshold"); float scale = ctx.Attr("scale"); @@ -627,23 +625,23 @@ class HardSwishGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor tensor_offset(x->type()); + phi::DenseTensor tensor_offset(x->type()); tensor_offset.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - Tensor add_offset_val(x->type()); + phi::DenseTensor add_offset_val(x->type()); add_offset_val.mutable_data(x->dims(), place); const auto& runner_add = NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); runner_add.Run(stream); - Tensor 
tmp1(x->type()); + phi::DenseTensor tmp1(x->type()); tmp1.mutable_data(x->dims(), place); const auto& runner_pow1 = NpuOpRunner( "Power", {*x}, {tmp1}, {{"scale", 2.0f}, {"shift", offset}}); runner_pow1.Run(stream); - Tensor tmp2(x->type()); + phi::DenseTensor tmp2(x->type()); tmp2.mutable_data(x->dims(), place); const auto& runner_ht_grad = NpuOpRunner("HardtanhGrad", @@ -652,17 +650,17 @@ class HardSwishGradNPUKernel : public framework::OpKernel { {{"min_val", 0.0f}, {"max_val", threshold}}); runner_ht_grad.Run(stream); - Tensor tmp3(x->type()); + phi::DenseTensor tmp3(x->type()); tmp3.mutable_data(x->dims(), place); const auto& runner_pow2 = NpuOpRunner( "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); runner_pow2.Run(stream); - Tensor tensor_threshold_tmp(x->type()); + phi::DenseTensor tensor_threshold_tmp(x->type()); tensor_threshold_tmp.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_threshold_tmp, static_cast(threshold)); - Tensor tensor_threshold(x->type()); + phi::DenseTensor tensor_threshold(x->type()); tensor_threshold.mutable_data(x->dims(), place); const auto& runner_fill = NpuOpRunner("FillD", @@ -671,12 +669,12 @@ class HardSwishGradNPUKernel : public framework::OpKernel { {{"dims", phi::vectorize(x->dims())}}); runner_fill.Run(stream); - Tensor tmp_bool(experimental::DataType::BOOL); + phi::DenseTensor tmp_bool(experimental::DataType::BOOL); tmp_bool.mutable_data(x->dims(), place); const auto& runner_less = NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); runner_less.Run(stream); - Tensor tmp4(x->type()); + phi::DenseTensor tmp4(x->type()); tmp4.mutable_data(x->dims(), place); auto dst_dtype = ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); @@ -687,7 +685,7 @@ class HardSwishGradNPUKernel : public framework::OpKernel { {{"dst_type", static_cast(dst_dtype)}}); runner_cast.Run(stream); - Tensor tmp5(x->type()); + phi::DenseTensor tmp5(x->type()); tmp5.mutable_data(x->dims(), place); const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); runner_sub.Run(stream); @@ -701,8 +699,8 @@ template class HardSigmoidNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); @@ -724,10 +722,10 @@ template class HardSigmoidGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); @@ -751,8 +749,8 @@ template class ReciprocalNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); auto stream = @@ -767,9 +765,9 @@ template class ReciprocalGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = 
ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); auto stream = @@ -785,8 +783,8 @@ template class CosNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); @@ -804,14 +802,14 @@ template class CosGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); - Tensor sin_out(x->type()); // Temporary Tensor + phi::DenseTensor sin_out(x->type()); // Temporary phi::DenseTensor sin_out.Resize(x->dims()); sin_out.mutable_data(place); @@ -824,7 +822,7 @@ class CosGradNPUKernel : public framework::OpKernel { const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); runner_dx.Run(stream); - Tensor tmp(x->type()); // Temporary Tensor + phi::DenseTensor tmp(x->type()); // Temporary phi::DenseTensor tmp.Resize(phi::make_ddim({1, 1})); tmp.mutable_data(place); float factor = -1.; @@ -840,8 +838,8 @@ template class AtanNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); @@ -856,9 +854,9 @@ template class AtanGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); auto stream = @@ -888,9 +886,9 @@ template class ExpGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() @@ -904,9 +902,9 @@ template class SinNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 2d7eb04f1dba0..b23d3670d5e80 100644 --- 
a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -28,8 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AffineGridOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index 508c51de723c0..6c78a20e2a51a 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AllocFloatStatusKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 5f5415ffd37d0..543b40ee8fcd0 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { using MPDType = typename details::MPTypeTrait::Type; @@ -45,7 +43,7 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); // check is_finite or is_nan - Tensor is_finite(found_inf->type()); + phi::DenseTensor is_finite(found_inf->type()); if (i != 0) { is_finite.Resize(phi::make_ddim({1})); is_finite.mutable_data(ctx.GetPlace()); @@ -78,8 +76,8 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { // out = in/scale, if found_inf = false // But when found_inf is true, the data of Out should not be used. // So, on MLU, we always compute out with in/scale. - Tensor float_x; - Tensor float_out; + phi::DenseTensor float_x; + phi::DenseTensor float_out; if (std::is_same::value) { float_x.Resize(x->dims()); float_out.Resize(out->dims()); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 3b6e2ba7184c0..c65b889618f07 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. 
// On NPU, we do not really check the data of input tensors, // but use NPUGetFloatStatus to check whether the nan/inf occurs on device, @@ -47,13 +45,13 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { .stream(); // step1: inverse scale - Tensor const_tensor; + phi::DenseTensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); // Inverse(1.0/scale) phi::DenseTensor* tmp_inverse_out = const_cast(scale); - Tensor inverse_out(scale->type()); + phi::DenseTensor inverse_out(scale->type()); inverse_out.Resize(scale->dims()); inverse_out.mutable_data(ctx.GetPlace()); const auto& runner_inverse = @@ -62,7 +60,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { tmp_inverse_out = &inverse_out; // NOTE(zhiqiu): - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // tmp is only placeholder. @@ -73,7 +71,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { {{"message", std::string("check_nan_and_inf")}}); runner_float_status.Run(stream); - Tensor sum; + phi::DenseTensor sum; sum.mutable_data({1}, ctx.GetPlace()); const auto& runner_reduce_sum = NpuOpRunner("ReduceSumD", diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index cca370bf95331..bf7272ba8b878 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -31,8 +31,6 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -using Tensor = phi::DenseTensor; - USE_OP_ITSELF(check_finite_and_unscale); USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); @@ -110,7 +108,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) { ctx.Wait(); // out found_inf - Tensor found_inf_tensor; + phi::DenseTensor found_inf_tensor; found_inf_tensor.Resize({1}); bool *found_inf_data = found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc index b5750181139d4..18e68e1ba377f 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ClearFloatStatusKernel : public framework::OpKernel { public: @@ -35,7 +33,7 @@ class ClearFloatStatusKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "The input(FloatStatus) and Output(FloatStatusOut) " "should be the same.")); - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); const auto& runner = NpuOpRunner("NPUClearFloatStatus", {tmp}, {*float_status_out}); diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc index 8befb2df9b835..c6dd6f4e6b968 100644 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/get_float_status_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GetFloatStatusKernel : public framework::OpKernel { public: @@ -35,7 +33,7 @@ class GetFloatStatusKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "The input(FloatStatus) and Output(FloatStatusOut) " "should be the same.")); - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index b1bfcf8edd672..fb5475610ce15 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -25,8 +25,6 @@ DECLARE_int32(min_loss_scaling); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void Update(const platform::NPUDeviceContext& ctx, const std::vector found_inf_vec, @@ -50,7 +48,7 @@ void Update(const platform::NPUDeviceContext& ctx, good_out_tensor->numel() * sizeof(int), stream); // bad_out_data = bad_in_data + 1 - Tensor factor_tensor(bad_out_tensor->dtype()); + phi::DenseTensor factor_tensor(bad_out_tensor->dtype()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); const auto& runner_p2 = NpuOpRunner( @@ -106,7 +104,7 @@ void Update(const platform::NPUDeviceContext& ctx, stream); // good_out_data = good_in_data + 1 - Tensor factor_tensor(good_out_tensor->dtype()); + phi::DenseTensor factor_tensor(good_out_tensor->dtype()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); const auto& runner_p2 = NpuOpRunner( diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 6e5048db47ead..175703eaf9fa5 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -36,7 +35,7 @@ struct VisitDataArgNPUMaxFunctor { auto dtype = ctx.Attr("dtype"); const bool& flatten = ctx.Attr("flatten"); - Tensor transformed_x(x.type()); + phi::DenseTensor transformed_x(x.type()); transformed_x.ShareDataWith(x); if (flatten) { transformed_x.Resize(phi::make_ddim({x.numel()})); diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc index fe917140b7b9f..5132393cd3727 100644 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ b/paddle/fluid/operators/arg_min_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ArgMinNPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 7aedb41c9fde3..d5a42b8228e0a 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -79,16 +78,16 @@ class ArgsortNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr = {{"axis", -1}, {"descending", descending}}; - Tensor indices_tmp(experimental::DataType::INT32); + phi::DenseTensor indices_tmp(experimental::DataType::INT32); indices_tmp.Resize(indices->dims()); if (framework::TransToProtoVarType(input->dtype()) == framework::proto::VarType::INT64) { - Tensor input_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor input_fp32(experimental::DataType::FLOAT32); input_fp32.Resize(input->dims()); CastToFP32(ctx, stream, *input, &input_fp32); - Tensor output_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor output_fp32(experimental::DataType::FLOAT32); output_fp32.Resize(output->dims()); if (axis == -1 || axis + 1 == in_dims.size()) { @@ -112,12 +111,12 @@ class ArgsortNPUKernel : public framework::OpKernel { } auto trans_dims = phi::make_ddim(shape); - Tensor trans_input(input_fp32.type()); + phi::DenseTensor trans_input(input_fp32.type()); trans_input.Resize(trans_dims); TranposeNPU(ctx, stream, &perm, input_fp32, &trans_input); - Tensor trans_output(input_fp32.type()); - Tensor trans_indices(experimental::DataType::INT32); + phi::DenseTensor trans_output(input_fp32.type()); + phi::DenseTensor trans_indices(experimental::DataType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); trans_indices.mutable_data(trans_dims, ctx.GetPlace()); @@ -150,12 +149,12 @@ class ArgsortNPUKernel : public framework::OpKernel { } auto trans_dims = phi::make_ddim(shape); - Tensor trans_input(input->type()); + phi::DenseTensor trans_input(input->type()); trans_input.Resize(trans_dims); TranposeNPU(ctx, stream, &perm, *input, &trans_input); - Tensor trans_output(input->type()); - Tensor trans_indices(experimental::DataType::INT32); + phi::DenseTensor trans_output(input->type()); + phi::DenseTensor trans_indices(experimental::DataType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); trans_indices.mutable_data(trans_dims, ctx.GetPlace()); @@ -183,12 +182,12 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx, phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; - Tensor input_tmp; + phi::DenseTensor input_tmp; input_tmp.ShareDataWith(input); input_tmp.Resize( phi::make_ddim(std::vector{input_height * input_width})); - Tensor indices_tmp; + phi::DenseTensor indices_tmp; indices_tmp.ShareDataWith(indices); indices_tmp.Resize( phi::make_ddim(std::vector{input_height, input_width})); @@ -197,12 +196,12 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx, for (Type i = 0; i < input_height; i++) { indexs_value.push_back(i * input_width); } - Tensor indexs_tmp(indices.type()); + phi::DenseTensor indexs_tmp(indices.type()); framework::TensorFromVector( indexs_value, ctx.device_context(), &indexs_tmp); indexs_tmp.Resize(phi::make_ddim(std::vector{input_height, 1})); - Tensor indices_index(indices.type()); + phi::DenseTensor indices_index(indices.type()); indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); const auto& runner_add = NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); @@ -212,7 +211,7 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx, phi::make_ddim(std::vector{input_height * input_width})); t_out->mutable_data(ctx.GetPlace()); - Tensor 
out_tmp(t_out->type()); + phi::DenseTensor out_tmp(t_out->type()); out_tmp.ShareDataWith(*t_out); const auto& runner = NpuOpRunner("TensorScatterUpdate", @@ -252,15 +251,15 @@ class ArgsortGradNPUKernel : public framework::OpKernel { } auto trans_dims = phi::make_ddim(shape); - Tensor trans_dout(dO->type()); - Tensor trans_ids(indices->type()); + phi::DenseTensor trans_dout(dO->type()); + phi::DenseTensor trans_ids(indices->type()); trans_dout.Resize(trans_dims); trans_ids.Resize(trans_dims); TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); - Tensor trans_dx(dO->type()); + phi::DenseTensor trans_dx(dO->type()); trans_dx.Resize(trans_dims); FullAssignNPU( ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index d3ae66b3c02ff..9dff9a05d73ad 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -212,39 +212,41 @@ void AttentionLSTMOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); AddInput("C0", - "(Tensor) LSTM C0" + "(phi::DenseTensor) LSTM C0" "This is a tensor with shape (N x D), where N is the batch size, D " "is the gate size." "C0 is necessary because of attention."); AddInput("H0", - "(Tensor, optional) LSTM H0" + "(phi::DenseTensor, optional) LSTM H0" "This is a tensor with shape (N x D), where N is the " "batch size and D is the gate size.") .AsDispensable(); AddInput("AttentionWeight", - "(Tensor) the weights of attention fc. Always relu the fc result." + "(phi::DenseTensor) the weights of attention fc. Always relu the fc " + "result." "The shape is ((M+D) x 1), where M is the dim size of x, D is the " "gate size of LSTM."); AddInput("AttentionBias", - "(Tensor, optional) the bias of attention fc." + "(phi::DenseTensor, optional) the bias of attention fc." "The shape is (1 x 1)") .AsDispensable(); AddInput("AttentionScalar", - "(Tensor, optional) the scalar on the result of attentioned fc. " + "(phi::DenseTensor, optional) the scalar on the result of " + "attentioned fc. " "Always relu the Scalar." "The shape is (1 x 1)") .AsDispensable(); AddInput("AttentionScalarBias", - "(Tensor, optional) the scalar bias of attention fc." + "(phi::DenseTensor, optional) the scalar bias of attention fc." "The shape is (1 x 1)") .AsDispensable(); AddInput("LSTMWeight", - "(Tensor) the combined weight of LSTM" + "(phi::DenseTensor) the combined weight of LSTM" " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " "is the dim size of x" " - Weight = {W_forget, W_input, W_output, W_cell}"); AddInput("LSTMBias", - "(Tensor) the combined bias of LSTM, shape (1x4D)." + "(phi::DenseTensor) the combined bias of LSTM, shape (1x4D)." "Note: we should add the bias of hidden and context according to " "the same gate: " "{B_forget, B_input, B_output, B_cell}"); @@ -257,21 +259,22 @@ void AttentionLSTMOpMaker::Make() { "(phi::DenseTensor) (same as LSTMOp) the cell state of LSTM operator. 
" "The shape is (T x D), and lod is the same with the `Input`."); AddOutput("AttentionedX", - "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + "(phi::DenseTensor) shape is (T x 1), the result after X * " + "AttentionWeight," " where T is the total time steps in this mini-batch," " D is the hidden size.") .AsIntermediate(); AddOutput("AttentionFCOut", - "(Tensor) (max_seq_len, 1), compute at each step.") + "(phi::DenseTensor) (max_seq_len, 1), compute at each step.") .AsIntermediate(); AddOutput("LSTMX", - "(Tensor) the input X of LSTM for each step." + "(phi::DenseTensor) the input X of LSTM for each step." "Shape is (1 x M), where M is the x frame size") .AsIntermediate(); - AddOutput( - "LSTMOUT", - "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." - "Shape is (1 x 4D), where M is the x frame size") + AddOutput("LSTMOUT", + "(phi::DenseTensor) the output of LSTM X(1*(D+M))* " + "weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") .AsIntermediate(); AddAttr("gate_activation", "(string, default: sigmoid)" diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h index 41d7d594df207..0ce83be93c6cc 100644 --- a/paddle/fluid/operators/attention_lstm_op.h +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AttentionLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index abf177ee9f9f4..b4a24c84bcc45 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -207,7 +207,7 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( framework::OpKernelType BatchNormOp::GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -265,7 +265,7 @@ void BatchNormOpMaker::Make() { "The global variance (for training) " "or estimated Variance (for testing)"); AddInput("MomentumTensor", - "(Tensor, optional) If provided, batch_norm will " + "(phi::DenseTensor, optional) If provided, batch_norm will " "use this as momentum, this has a higher priority than " "attr(momentum), the shape of this tensor MUST BE [1].") .AsDispensable(); @@ -380,9 +380,9 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::InvalidArgument("can't find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } @@ -397,7 +397,7 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -522,9 +522,9 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::NotFound("cannot find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = 
&var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 29c40f1b41ef8..e643efcb8b9f5 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -34,7 +34,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template using CudnnDataType = platform::CudnnDataType; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index b11deeb49509b..40cdb68329fb2 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index a2ed462b0fe7b..77397552333d4 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -78,8 +78,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { saved_mean->mutable_data(place); saved_variance->mutable_data(place); - Tensor transformed_x; - Tensor transformed_y; + phi::DenseTensor transformed_x; + phi::DenseTensor transformed_y; const int transformed_dim_size = 4; const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C}; MLUCnnlTensorDesc transformed_desc(transformed_dim_size, @@ -116,7 +116,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } @@ -226,9 +226,9 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; - Tensor transformed_d_y; - Tensor transformed_x; - Tensor transformed_d_x; + phi::DenseTensor transformed_d_y; + phi::DenseTensor transformed_x; + phi::DenseTensor transformed_d_x; const int transformed_dim_size = 4; const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C}; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index 244b76ff86be9..94c2f7297b821 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -89,7 +89,7 @@ class NPUBatchNormOpKernel : public framework::OpKernel { // is only used in this training branch if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; diff --git a/paddle/fluid/operators/bce_loss_op_mlu.cc b/paddle/fluid/operators/bce_loss_op_mlu.cc index 99fd402424e7c..6541de153d4be 100644 --- a/paddle/fluid/operators/bce_loss_op_mlu.cc +++ b/paddle/fluid/operators/bce_loss_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class BCELossMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index c6b2d12ac535e..5918bee19453c 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class BCELossNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 5505d3b4e3250..10b25fc478744 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -40,7 +40,7 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { Cast Operator. This Operator casts the input tensor to another data type and -returns the Output Tensor. It's meaningless if the output dtype equals +returns the Output phi::DenseTensor. It's meaningless if the output dtype equals the input dtype, but it's fine if you do so. )DOC"); diff --git a/paddle/fluid/operators/cast_op_mlu.cc b/paddle/fluid/operators/cast_op_mlu.cc index 7e85702eee4b1..cb0bc659fbb0f 100644 --- a/paddle/fluid/operators/cast_op_mlu.cc +++ b/paddle/fluid/operators/cast_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CastMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 9c430fc0ffe30..0e2775efd1328 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -32,8 +32,6 @@ static std::map {framework::proto::VarType::FP64, ACL_DOUBLE}, }; -using Tensor = phi::DenseTensor; - template class CastNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index 989a27f552118..36fe957102bfb 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -81,7 +80,7 @@ class CenterLossKernel : public framework::OpKernel { auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - Tensor centers_diffacc; // used to accumulate all diff + phi::DenseTensor centers_diffacc; // used to accumulate all diff auto centers_diffacc_data = centers_diffacc.mutable_data(centers_dim, ctx.GetPlace()); int numel = centers_diffacc.numel(); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 21658be577ebd..f54e323eefb44 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; // using SelectedRows = phi::SelectedRows; template class NPUClipByNormKernel : public framework::OpKernel { public: @@ -48,7 +46,7 @@ class NPUClipByNormKernel : public framework::OpKernel { "Input(X) of ClipByNormOp should not be null. 
" "Please check if it is created correctly.")); - Tensor square_sum(input->type()); + phi::DenseTensor square_sum(input->type()); square_sum.mutable_data(framework::DDim({1}), place); const auto& x_dims = input->dims(); std::vector axis; @@ -62,12 +60,12 @@ class NPUClipByNormKernel : public framework::OpKernel { {{"axis", axis}, {"keep_dims", false}}); square_sum_runner.Run(stream); - Tensor x_norm(input->type()); + phi::DenseTensor x_norm(input->type()); x_norm.mutable_data(framework::DDim({1}), place); const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {}); x_norm_runner.Run(stream); - Tensor x_norm_t; + phi::DenseTensor x_norm_t; framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t); auto x_norm_v = static_cast(*x_norm_t.data()); if (x_norm_v <= max_norm) { diff --git a/paddle/fluid/operators/clip_op_mlu.cc b/paddle/fluid/operators/clip_op_mlu.cc index daced778a95dc..f84a493d6d399 100644 --- a/paddle/fluid/operators/clip_op_mlu.cc +++ b/paddle/fluid/operators/clip_op_mlu.cc @@ -29,7 +29,7 @@ class ClipMLUKernel : public framework::OpKernel { auto max = static_cast(ctx.Attr("max")); if (ctx.HasInput("Min")) { - Tensor min_cpu; + phi::DenseTensor min_cpu; auto* min_tensor = ctx.Input("Min"); auto* min_data = min_tensor->data(); if (platform::is_mlu_place(min_tensor->place())) { @@ -41,7 +41,7 @@ class ClipMLUKernel : public framework::OpKernel { } if (ctx.HasInput("Max")) { - Tensor max_cpu; + phi::DenseTensor max_cpu; auto* max_tensor = ctx.Input("Max"); auto* max_data = max_tensor->data(); if (platform::is_mlu_place(max_tensor->place())) { @@ -80,7 +80,7 @@ class ClipGradMLUKernel : public framework::OpKernel { auto min_val = ctx.Attr("min"); if (min_tensor) { - Tensor min_data; + phi::DenseTensor min_data; framework::TensorCopy( *min_tensor, platform::CPUPlace(), @@ -91,7 +91,7 @@ class ClipGradMLUKernel : public framework::OpKernel { } auto max_val = ctx.Attr("max"); if (max_tensor) { - Tensor max_data; + phi::DenseTensor max_data; framework::TensorCopy( *max_tensor, platform::CPUPlace(), diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc index 19ae23add0e10..82056ab0acb4a 100644 --- a/paddle/fluid/operators/clip_op_npu.cc +++ b/paddle/fluid/operators/clip_op_npu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ClipNPUKernel : public framework::OpKernel { public: @@ -33,8 +31,8 @@ class ClipNPUKernel : public framework::OpKernel { auto max_tensor = ctx.HasInput("Max") ? 
ctx.Input("Max") : nullptr; - Tensor min_tensor_temp(x->type()); - Tensor max_tensor_temp(x->type()); + phi::DenseTensor min_tensor_temp(x->type()); + phi::DenseTensor max_tensor_temp(x->type()); if (min_tensor == nullptr) { auto min_value = static_cast(ctx.Attr("min")); min_tensor_temp.mutable_data({1}, ctx.GetPlace()); @@ -74,7 +72,7 @@ class ClipGradNPUKernel : public framework::OpKernel { auto min_val = ctx.Attr("min"); if (min_tensor) { - Tensor min_data; + phi::DenseTensor min_data; framework::TensorCopy( *min_tensor, platform::CPUPlace(), @@ -86,7 +84,7 @@ class ClipGradNPUKernel : public framework::OpKernel { auto max_val = ctx.Attr("max"); if (max_tensor) { - Tensor max_data; + phi::DenseTensor max_data; framework::TensorCopy( *max_tensor, platform::CPUPlace(), diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 6bdfe9e8b754f..75e6df4baf82b 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -61,7 +61,7 @@ struct FillConstantVisitor { * = nullptr) const { #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(dev_ctx_.GetPlace())) { - Tensor tensor_tmp(framework::TransToPhiDataType(dtype_)); + phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_)); tensor_tmp.mutable_data({1}, context_.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value_)); diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 8d3af26f0c254..6920d51eb2637 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -151,10 +151,9 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, aclrtStream stream, const phi::DenseTensor* in) { - using Tensor = phi::DenseTensor; - Tensor out(in->type()); + phi::DenseTensor out(in->type()); - Tensor mean(in->type()); + phi::DenseTensor mean(in->type()); mean.Resize({1}); mean.mutable_data(dev_ctx.GetPlace()); std::vector axes; diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 0881b702ec0d8..40a0cb196f3bb 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -126,7 +124,7 @@ struct CSoftmaxWithCrossEntropyFunctor { const int N = phi::funcs::SizeToAxis(axis, logits_dims); const int D = phi::funcs::SizeFromAxis(axis, logits_dims); - Tensor logits_2d, softmax_2d, loss_2d; + phi::DenseTensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); softmax_2d.ShareDataWith(*softmax).Resize({N, D}); loss_2d.ShareDataWith(*loss).Resize({N, 1}); @@ -135,7 +133,7 @@ struct CSoftmaxWithCrossEntropyFunctor { auto eigen_softmax = math::EigenMatrix::From(softmax_2d); // step 1, obtain logit_max - Tensor logits_max; + phi::DenseTensor logits_max; logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* logits_max_buff = logits_max.mutable_data(place); @@ -163,7 +161,7 @@ struct CSoftmaxWithCrossEntropyFunctor { .unaryExpr(math::ValueClip()); // step 3, obtain predict target - Tensor predicted_logits; + phi::DenseTensor predicted_logits; predicted_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); @@ -215,7 +213,7 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); // step 5, obtain sum_exp_logits - Tensor sum_exp_logits; + phi::DenseTensor sum_exp_logits; sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); @@ -278,7 +276,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { const int N = phi::funcs::SizeToAxis(axis, logits_dims); const int D = phi::funcs::SizeFromAxis(axis, logits_dims); - Tensor logits_2d, softmax_2d, loss_2d; + phi::DenseTensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); softmax_2d.ShareDataWith(*softmax).Resize({N, D}); loss_2d.ShareDataWith(*loss).Resize({N, 1}); @@ -287,7 +285,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { auto eigen_softmax = math::EigenMatrix::From(softmax_2d); // step 1, obtain logit_max - Tensor logits_max; + phi::DenseTensor logits_max; logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); auto eigen_logits_max = math::EigenMatrix::From(logits_max); @@ -309,7 +307,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { .unaryExpr(math::ValueClip()); // step 3, obtain predict target - Tensor predicted_logits; + phi::DenseTensor predicted_logits; predicted_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); @@ -355,7 +353,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); // step 5, obtain sum_exp_logits - Tensor sum_exp_logits; + phi::DenseTensor sum_exp_logits; sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); @@ -405,7 +403,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); - Tensor logit_grad_2d; + phi::DenseTensor logit_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); int blocks = NumBlocks(N * D); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index ae65930b86ac0..0c6e7b31c9d2e 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ConcatOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc index b73460f2057e4..ebfd2895e783b 100644 --- a/paddle/fluid/operators/concat_op_mlu.cc +++ b/paddle/fluid/operators/concat_op_mlu.cc @@ -119,7 +119,7 @@ class ConcatGradMLUKernel : public framework::OpKernel { out_grad->dims().size())); // get output tensor that the name is not kEmptyVarName std::vector outputs_vec; - std::vector tmp_outputs_vec; + std::vector tmp_outputs_vec; std::vector output_descs; std::vector descs_vec; for (size_t j = 0; j < outs.size(); ++j) { @@ -129,7 +129,7 @@ class ConcatGradMLUKernel : public framework::OpKernel { output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j])); outputs_vec.push_back(GetBasePtr(outs[j])); } else { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.mutable_data(ins[j]->dims(), ctx.GetPlace()); tmp_outputs_vec.push_back(tmp_tensor); output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j])); diff --git a/paddle/fluid/operators/controlflow/logical_op_mlu.cc b/paddle/fluid/operators/controlflow/logical_op_mlu.cc index 5e1630447b9de..7f63513af7bac 100644 --- a/paddle/fluid/operators/controlflow/logical_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LogicalMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 7c2c11bbfb40e..38ffa202efa92 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -15,8 +15,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LogicalNotNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 924ed1fcf7d35..62bcfb545e00f 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -29,8 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // Base convolution operator definitions for other conv // like operators to reuse the implementation.
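// The helper declared below computes the spatial output size of a
// convolution. As a minimal sketch, assuming the conventional
// zero-padded, dilated convolution arithmetic (the name and signature
// here are illustrative, not part of this header):
//
//   int conv_output_size_sketch(int input_size, int filter_size,
//                               int dilation, int padding, int stride) {
//     const int dkernel = dilation * (filter_size - 1) + 1;  // effective kernel extent
//     return (input_size + 2 * padding - dkernel) / stride + 1;
//   }
//
// e.g. input_size = 32, filter_size = 3, dilation = 1, padding = 1,
// stride = 1 gives (32 + 2 - 3) / 1 + 1 = 32, i.e. a "same"-size output.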
inline int ConvOutputSize( diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index d0067d5c5930a..214af06bbd7c7 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template @@ -56,8 +55,8 @@ class MLUConvOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_tensor(output->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_tensor(output->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; if (channel_last) { input_tensor.ShareDataWith(*input); @@ -78,7 +77,7 @@ class MLUConvOpKernel : public framework::OpKernel { output_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -166,8 +165,8 @@ class MLUConvGradOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_grad_tensor(output_grad->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_grad_tensor(output_grad->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; if (channel_last) { @@ -193,7 +192,7 @@ class MLUConvGradOpKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); auto filter_grad_dims = filter_grad->dims(); - Tensor temp_filter_grad(filter_grad->type()); + phi::DenseTensor temp_filter_grad(filter_grad->type()); temp_filter_grad.mutable_data({filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], @@ -234,7 +233,7 @@ class MLUConvGradOpKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor(input_grad->type()); + phi::DenseTensor input_grad_tensor(input_grad->type()); if (channel_last) { input_grad_tensor.ShareDataWith(*input_grad); } else { @@ -248,7 +247,7 @@ class MLUConvGradOpKernel : public framework::OpKernel { input_grad_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -326,8 +325,8 @@ class MLUDepthwiseConvOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_tensor(output->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_tensor(output->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; if (channel_last) { groups = in_dims[3]; @@ -350,7 +349,7 @@ class MLUDepthwiseConvOpKernel : public framework::OpKernel { output_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -438,8 +437,8 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor 
input_tensor(input->type()); - Tensor output_grad_tensor(output_grad->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_grad_tensor(output_grad->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; const std::vector perm_hwcm_to_mchw = {3, 2, 0, 1}; @@ -469,7 +468,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); auto filter_grad_dims = filter_grad->dims(); - Tensor temp_filter_grad(filter_grad->type()); + phi::DenseTensor temp_filter_grad(filter_grad->type()); // Details about setting diff_w hwcn for better performance, see the CNNL // documentation. temp_filter_grad.mutable_data({filter_grad_dims[perm_mchw_to_hwcm[0]], @@ -512,7 +511,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor(input_grad->type()); + phi::DenseTensor input_grad_tensor(input_grad->type()); if (channel_last) { input_grad_tensor.ShareDataWith(*input_grad); } else { @@ -526,7 +525,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { input_grad_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index f4c7de95483b5..6b8f7118473a5 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; static void CastToFP16(const framework::ExecutionContext& ctx, const aclrtStream& stream, @@ -104,7 +103,7 @@ class DepthwiseConvNPUKernel : public framework::OpKernel { std::vector strides(4, 1); std::vector dilations(4, 1); - Tensor input_tensor, output_tensor; + phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); @@ -125,7 +124,7 @@ class DepthwiseConvNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); // Transform filter (n, 1, h, w) --> (1, n, h, w) - Tensor transformed_filter(filter->type()); + phi::DenseTensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], @@ -189,7 +188,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); // Transform filter (n, 1, h, w) --> (1, n, h, w) - Tensor transformed_filter(filter->type()); + phi::DenseTensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], @@ -204,7 +203,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { std::vector strides(4, 1); std::vector dilations(4, 1); - Tensor input_tensor, output_grad_tensor; + phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { @@ -247,7 +246,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor; + phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { 
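// input_grad_tensor shares its storage with input_grad (ShareDataWith
// above); the NHWC layout tag set below only changes how the NPU runner
// interprets the dimension order, it does not move or convert any data.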
input_grad_tensor.set_layout(DataLayout::kNHWC); @@ -305,7 +304,7 @@ class NPUConvOpKernel : public framework::OpKernel { std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); - Tensor input_tensor, output_tensor; + phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); if (channel_last) { @@ -378,7 +377,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); - Tensor input_tensor, output_grad_tensor; + phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { @@ -400,7 +399,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); - Tensor filter_grad_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor filter_grad_fp32(experimental::DataType::FLOAT32); filter_grad_fp32.Resize(filter_grad->dims()); if (framework::TransToProtoVarType(input->dtype()) == @@ -430,7 +429,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = phi::vectorize(input->dims()); - Tensor input_grad_tensor; + phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); @@ -617,8 +616,9 @@ class NPUConv3dGradKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); - Tensor filter_grad_tensor = ctx.AllocateTmpTensor( - filter_grad->dims(), dev_ctx); + phi::DenseTensor filter_grad_tensor = + ctx.AllocateTmpTensor(filter_grad->dims(), + dev_ctx); filter_grad_tensor.ShareDataWith(*filter_grad); filter_grad_tensor.set_layout(DataLayout::kNCDHW); @@ -638,8 +638,9 @@ class NPUConv3dGradKernel : public framework::OpKernel { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = phi::vectorize(input->dims()); - Tensor input_grad_tensor = ctx.AllocateTmpTensor( - input_grad->dims(), dev_ctx); + phi::DenseTensor input_grad_tensor = + ctx.AllocateTmpTensor(input_grad->dims(), + dev_ctx); input_grad_tensor.ShareDataWith(*input_grad); input_grad_tensor.set_layout(DataLayout::kNCDHW); diff --git a/paddle/fluid/operators/conv_transpose_op_mlu.cc b/paddle/fluid/operators/conv_transpose_op_mlu.cc index c2d68523d48cc..36d0be10575d1 100644 --- a/paddle/fluid/operators/conv_transpose_op_mlu.cc +++ b/paddle/fluid/operators/conv_transpose_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template @@ -61,8 +60,8 @@ class Conv2DTransposeMLUKernel : public framework::OpKernel { phi::UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_tensor(output->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_tensor(output->type()); input_tensor.set_layout(DataLayout::kNHWC); output_tensor.set_layout(DataLayout::kNHWC); const std::vector perm_to_nhwc = {0, 2, 3, 1}; @@ -84,7 +83,7 @@ class Conv2DTransposeMLUKernel : public framework::OpKernel { } // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -168,8 +167,8 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { phi::UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_grad_tensor(output_grad->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_grad_tensor(output_grad->type()); output_grad_tensor.set_layout(DataLayout::kNHWC); const std::vector perm_to_nhwc = {0, 2, 3, 1}; @@ -191,7 +190,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { } // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -217,7 +216,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); - Tensor filter_grad_tensor(filter_grad->type()); + phi::DenseTensor filter_grad_tensor(filter_grad->type()); // filter_grad always MCHW // filter_grad_tensor always MHWC auto filter_grad_dims = filter_grad->dims(); @@ -253,7 +252,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor(input_grad->type()); + phi::DenseTensor input_grad_tensor(input_grad->type()); input_tensor.set_layout(DataLayout::kNHWC); if (channel_last) { diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 2f674de03f7a2..3723a4841af30 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -65,7 +64,7 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { std::vector strides(4, 1); std::vector dilations(4, 1); - Tensor input_tensor, output_tensor; + phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); @@ -148,7 +147,7 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); - Tensor input_tensor, output_grad_tensor; + phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { @@ -182,7 +181,7 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor; + phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); @@ -248,7 +247,7 @@ class Conv3DTransposeNPUKernel : public framework::OpKernel { std::vector strides(5, 1); std::vector dilations(5, 1); - Tensor input_tensor, output_tensor, filter_tensor; + phi::DenseTensor input_tensor, output_tensor, filter_tensor; input_tensor.Resize(input->dims()); input_tensor.ShareDataWith(*input); output_tensor.Resize(output->dims()); diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc index a36e9b73639ba..56f334b66571d 100644 --- a/paddle/fluid/operators/copy_cross_scope_op.cc +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -30,8 +30,6 @@ class OpBase; } // namespace imperative } // namespace paddle -using Tensor = phi::DenseTensor; - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index 5587b595cd470..2b3450d031607 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline std::vector CorrelationOutputSize(int batch, int input_height, int input_width, diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index e1935f0dae2ad..5d4f11a876585 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -21,13 +21,11 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - // get Tensor + // get phi::DenseTensor auto* in_x = context.Input("X"); auto* in_y = context.Input("Y"); auto* out_z = context.Output("Out"); @@ -74,7 +72,7 @@ template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - // get Tensor + // get phi::DenseTensor auto* in_x = context.Input("X"); auto* in_y = context.Input("Y"); auto* in_z = context.Input("Out"); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index 8980e5f73dee7..916ad89f1e72c 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CropNPUKernel : public framework::OpKernel { public: @@ -71,7 +69,7 @@ class CropNPUKernel : public framework::OpKernel { x->dims().size())); // shape memory maybe have gc. - Tensor tmp_shape(*shape); + phi::DenseTensor tmp_shape(*shape); tmp_shape.mutable_data(ctx.GetPlace()); const auto& runner = @@ -90,7 +88,7 @@ class CropNPUKernel : public framework::OpKernel { "(%d) of the Input(X).", shape_size.size(), x->dims().size())); - Tensor tmp_shape(x->dtype()); + phi::DenseTensor tmp_shape(x->dtype()); tmp_shape.Resize(phi::make_ddim(shape_size)); tmp_shape.mutable_data(ctx.GetPlace()); const auto& runner = diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 8ae6f448d24ba..c581d33091c02 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CrossEntropyOpKernel : public framework::OpKernel { public: @@ -36,8 +34,8 @@ class CrossEntropyOpKernel : public framework::OpKernel { int rank = x->dims().size(); auto label_dims = labels->dims(); - Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_2d, y_2d; + phi::DenseTensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + phi::DenseTensor labels_2d, y_2d; if (label_dims.size() < rank) { labels_2d.ShareDataWith(*labels); labels_2d.Resize({phi::product(label_dims), 1}); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 9279cf531d449..c3647d6e8c2d7 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CTCAlignKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index d436a4b5d531d..97e5eae62ab3b 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -26,8 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template bool is_continuous(const Type &weight_list) { bool continuous = true; @@ -41,7 +39,7 @@ bool is_continuous(const Type &weight_list) { return continuous; } -int size_sum(const std::vector &weight_list) { +int size_sum(const std::vector &weight_list) { int size = 0; for (size_t i = 0; i < weight_list.size(); ++i) { auto in_size = weight_list[i]->numel(); @@ -53,8 +51,8 @@ int size_sum(const std::vector &weight_list) { template void weight_to_tensor(const platform::Place &place, gpuStream_t stream, - const std::vector &weight_list, - Tensor *weight) { + const std::vector &weight_list, + phi::DenseTensor *weight) { auto weight_data = weight->data(); int weight_offset = 0; for (size_t i = 0; i < weight_list.size(); ++i) { @@ -72,11 +70,12 @@ void weight_to_tensor(const platform::Place &place, } template -void weight_to_tensor_list(const platform::Place &place, - gpuStream_t stream, - std::vector *weight_grad, - const std::vector &weight_input, - const Tensor *weight) { +void weight_to_tensor_list( + const platform::Place &place, + gpuStream_t stream, + std::vector *weight_grad, + const std::vector &weight_input, + const phi::DenseTensor *weight) { int weight_offset = 0; auto *weight_data = weight->data(); for (size_t i = 0; i < weight_input.size(); ++i) { @@ -204,15 +203,15 @@ template class CudnnLSTMGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const Tensor *x = ctx.Input("Input"); - const Tensor *init_h = ctx.Input("InitH"); - const Tensor *init_c = ctx.Input("InitC"); + const phi::DenseTensor *x = ctx.Input("Input"); + const phi::DenseTensor *init_h = ctx.Input("InitH"); + const phi::DenseTensor *init_c = ctx.Input("InitC"); - Tensor *out = ctx.Output("Out"); - Tensor *last_h = ctx.Output("LastH"); - Tensor *last_c = ctx.Output("LastC"); - Tensor *reserve = ctx.Output("Reserve"); - Tensor *state_out = ctx.Output("StateOut"); + phi::DenseTensor *out = ctx.Output("Out"); + phi::DenseTensor *last_h = ctx.Output("LastH"); + phi::DenseTensor *last_c = ctx.Output("LastC"); + phi::DenseTensor *reserve = ctx.Output("Reserve"); + phi::DenseTensor *state_out = ctx.Output("StateOut"); const T *x_data = x->data(); const T *init_h_data = init_h->data(); @@ -256,7 +255,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { size_t workspace_size; size_t reserve_size; - Tensor weight_whole; + phi::DenseTensor weight_whole; T *w_data = nullptr; int weight_numel; bool w_initialized = false; @@ -272,7 +271,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { if (!w_initialized) { auto weight_list = ctx.MultiInput("WeightList"); bool continuous = - is_continuous>(weight_list); + is_continuous>(weight_list); weight_numel = size_sum(weight_list); if (!continuous) { @@ -288,7 +287,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { for (size_t i = 0; i < weight_list.size(); ++i) { size_t len = weight_list[i]->numel(); auto dim = weight_list[i]->dims(); - const_cast(weight_list[i]) + const_cast(weight_list[i]) ->ShareDataWith( weight_whole.Slice(static_cast(offset), static_cast(offset + len))) @@ -481,12 +480,12 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto place = ctx.GetPlace(); int weight_numel = size_sum(weight_list); bool continuous = - is_continuous>(weight_list); + is_continuous>(weight_list); auto stream = reinterpret_cast(ctx.device_context()) .stream(); - Tensor weight_whole; + phi::DenseTensor 
weight_whole; T *weight_data = nullptr; if (!continuous) { @@ -497,7 +496,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { weight_data = const_cast(weight_list[0]->data()); } - Tensor weight_grad; + phi::DenseTensor weight_grad; phi::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); @@ -559,7 +558,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { SequenceLength, &workspace_size, &reserve_size, - const_cast(state_out)); + const_cast(state_out)); phi::DenseTensor workspace_data_; workspace_data_.mutable_data( diff --git a/paddle/fluid/operators/cumsum_op_mlu.cc b/paddle/fluid/operators/cumsum_op_mlu.cc index 83d9a10af1730..fb586b9585e03 100644 --- a/paddle/fluid/operators/cumsum_op_mlu.cc +++ b/paddle/fluid/operators/cumsum_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CumSumMLUKernel : public framework::OpKernel { public: @@ -34,7 +32,7 @@ class CumSumMLUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); phi::DenseTensor* input_ptr = const_cast(x); - Tensor flat_x(x->type()); + phi::DenseTensor flat_x(x->type()); if (flatten) { PADDLE_ENFORCE_EQ( axis, diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 672a59cf22f59..7126e7ca4cbaf 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static void CumsumImp(const phi::DenseTensor& input, phi::DenseTensor* output, const framework::NPUAttributeMap& attr_input, @@ -30,7 +28,7 @@ static void CumsumImp(const phi::DenseTensor& input, .stream(); if (framework::TransToProtoVarType(input.dtype()) == framework::proto::VarType::INT64) { - Tensor tmp_input; + phi::DenseTensor tmp_input; tmp_input.mutable_data(input.dims(), ctx.GetPlace()); auto dst_acl_dtype = ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type())); @@ -41,7 +39,7 @@ static void CumsumImp(const phi::DenseTensor& input, {{"dst_type", static_cast(dst_acl_dtype)}}); cast_runner_1.Run(stream); - Tensor tmp_output; + phi::DenseTensor tmp_output; tmp_output.mutable_data(output->dims(), ctx.GetPlace()); const auto& runner = NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); @@ -86,7 +84,7 @@ class CumSumNPUKernel : public framework::OpKernel { -1, axis)); - Tensor new_x(x->type()); + phi::DenseTensor new_x(x->type()); new_x.ShareDataWith(*x); new_x.Resize(phi::make_ddim({x->numel()})); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index 153b181b4fd6a..11af33df2f61b 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -21,8 +21,6 @@ limitations under the License. 
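The cuDNN LSTM kernels above copy every entry of WeightList into one contiguous weight_whole buffer whenever the inputs are not already continuous, so the library can consume a single pointer. A CPU sketch of that flattening and its offset bookkeeping, with plain std::vector standing in for phi::DenseTensor (names hypothetical):

#include <cstddef>
#include <cstring>
#include <vector>

// Stand-in for a tensor's flat storage; numel() == data.size().
using FakeTensor = std::vector<float>;

// Copy each weight's elements into one contiguous buffer, back to back,
// mirroring the offset arithmetic in weight_to_tensor.
std::vector<float> FlattenWeights(const std::vector<const FakeTensor*>& ws) {
  std::size_t total = 0;
  for (const auto* w : ws) total += w->size();
  std::vector<float> whole(total);
  std::size_t offset = 0;
  for (const auto* w : ws) {
    std::memcpy(whole.data() + offset, w->data(), w->size() * sizeof(float));
    offset += w->size();
  }
  return whole;
}

int main() {
  FakeTensor w_ih(16, 1.f), w_hh(16, 2.f), bias(8, 3.f);
  auto whole = FlattenWeights({&w_ih, &w_hh, &bias});
  return whole.size() == 40 ? 0 : 1;
}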
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class CVMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 5cac5392f4abb..400e025f82030 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -22,7 +22,6 @@ namespace paddle { namespace operators { using phi::PADDLE_CUDA_NUM_THREADS; -using Tensor = phi::DenseTensor; template __global__ void CvmComputeKernel(const bool use_cvm, diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h index 9bd5a00b3733f..461575d25b75d 100644 --- a/paddle/fluid/operators/cvm_op.h +++ b/paddle/fluid/operators/cvm_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void CvmComputeKernel(const bool use_cvm, const int64_t item_width, diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 36dc93445df59..6770a7e31c1a5 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template @@ -483,9 +482,9 @@ class DataNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Y@GRAD can not be found for computation")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } @@ -523,7 +522,7 @@ class DataNormGradKernel : public framework::OpKernel { (data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); // init output - Tensor *d_x = nullptr; + phi::DenseTensor *d_x = nullptr; if (ctx.HasOutput(framework::GradVarName("X"))) { d_x = ctx.Output(framework::GradVarName("X")); } @@ -587,12 +586,12 @@ class DataNormGradKernel : public framework::OpKernel { EigenVectorArrayMap d_bias_arr(d_bias_data, C); EigenVectorArrayMap d_scale_arr(d_scale_data, C); - Tensor dy_sum; + phi::DenseTensor dy_sum; dy_sum.Resize({C}); dy_sum.mutable_data(ctx.GetPlace()); EigenVectorArrayMap dy_sum_arr( dy_sum.mutable_data(ctx.GetPlace()), C); - Tensor dy_mul_x_sub_mean_mul_invstd_sum; + phi::DenseTensor dy_mul_x_sub_mean_mul_invstd_sum; dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 1b895b0c8daa5..aaccaecc72067 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -26,7 +26,6 @@ limitations under the License. 
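The data_norm backward pass above builds two per-channel accumulators, dy_sum and dy_mul_x_sub_mean_mul_invstd_sum, before producing d_x. Roughly, for NCHW input they reduce over every axis except the channel; a scalar sketch under that assumption (function and argument names are illustrative, not the operator's API):

#include <vector>

// Per-channel sums used by a data-norm style backward pass:
//   dy_sum[c]     = sum over (n, hw) of dy
//   dy_xmu_sum[c] = sum over (n, hw) of dy * (x - mean[c]) * inv_std[c]
void ChannelSums(const std::vector<float>& x, const std::vector<float>& dy,
                 const std::vector<float>& mean,
                 const std::vector<float>& inv_std, int N, int C, int HW,
                 std::vector<float>* dy_sum, std::vector<float>* dy_xmu_sum) {
  dy_sum->assign(C, 0.f);
  dy_xmu_sum->assign(C, 0.f);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int i = 0; i < HW; ++i) {
        int idx = (n * C + c) * HW + i;  // NCHW flat index
        (*dy_sum)[c] += dy[idx];
        (*dy_xmu_sum)[c] += dy[idx] * (x[idx] - mean[c]) * inv_std[c];
      }
}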
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; using phi::PADDLE_CUDA_NUM_THREADS; @@ -166,7 +165,7 @@ class DataNormGradKernel : public framework::OpKernel { const int C = x_dims[1]; // init output - Tensor *d_x = nullptr; + phi::DenseTensor *d_x = nullptr; if (ctx.HasOutput(framework::GradVarName("X"))) { d_x = ctx.Output(framework::GradVarName("X")); } diff --git a/paddle/fluid/operators/deformable_conv_op_mlu.cc b/paddle/fluid/operators/deformable_conv_op_mlu.cc index 08969ba98fcd2..f5814efb3f491 100644 --- a/paddle/fluid/operators/deformable_conv_op_mlu.cc +++ b/paddle/fluid/operators/deformable_conv_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DeformableConvMLUKernel : public framework::OpKernel { public: @@ -58,29 +56,29 @@ class DeformableConvMLUKernel : public framework::OpKernel { im2col_step); const std::vector perm_to_nhwc = {0, 2, 3, 1}; - Tensor trans_input(input->dtype()); + phi::DenseTensor trans_input(input->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/); - Tensor trans_offset(offset->dtype()); + phi::DenseTensor trans_offset(offset->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, offset, &trans_offset, true /*need_reshape_or_alloc*/); - Tensor trans_mask(mask->dtype()); + phi::DenseTensor trans_mask(mask->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/); - Tensor trans_filter(filter->dtype()); + phi::DenseTensor trans_filter(filter->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, true /*need_reshape_or_alloc*/); - Tensor tmp_output(output->dtype()); + phi::DenseTensor tmp_output(output->dtype()); auto output_dims = output->dims(); tmp_output.mutable_data( {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, @@ -167,54 +165,54 @@ class DeformableConvGradMLUKernel : public framework::OpKernel { groups, im2col_step); - Tensor tmp_input_grad; + phi::DenseTensor tmp_input_grad; auto input_dims = input->dims(); tmp_input_grad.mutable_data( {input_dims[0], input_dims[2], input_dims[3], input_dims[1]}, ctx.GetPlace()); - Tensor tmp_filter_grad; + phi::DenseTensor tmp_filter_grad; auto filter_dims = filter->dims(); tmp_filter_grad.mutable_data( {filter_dims[0], filter_dims[2], filter_dims[3], filter_dims[1]}, ctx.GetPlace()); - Tensor tmp_offset_grad; + phi::DenseTensor tmp_offset_grad; auto offset_dims = offset->dims(); tmp_offset_grad.mutable_data( {offset_dims[0], offset_dims[2], offset_dims[3], offset_dims[1]}, ctx.GetPlace()); - Tensor tmp_mask_grad; + phi::DenseTensor tmp_mask_grad; auto mask_dims = mask->dims(); tmp_mask_grad.mutable_data( {mask_dims[0], mask_dims[2], mask_dims[3], mask_dims[1]}, ctx.GetPlace()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; - Tensor trans_output_grad(output_grad->dtype()); + phi::DenseTensor trans_output_grad(output_grad->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, &trans_output_grad, true /*need_reshape_or_alloc*/); - Tensor trans_input(input->dtype()); + phi::DenseTensor trans_input(input->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/); - Tensor trans_offset(offset->dtype()); + phi::DenseTensor trans_offset(offset->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, offset, &trans_offset, true /*need_reshape_or_alloc*/); - Tensor 
trans_mask(mask->dtype()); + phi::DenseTensor trans_mask(mask->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/); - Tensor trans_filter(filter->dtype()); + phi::DenseTensor trans_filter(filter->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 80d248b818b4f..0e8c736431b11 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -39,7 +39,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::PADDLE_CUDA_NUM_THREADS; static inline int GET_BLOCKS(const int N) { diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 231d14e537b54..dabb69b5af8c1 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -33,8 +33,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template T bilinear_interp( const T* data, const T x, const T y, const int width, const int height) { @@ -518,7 +516,7 @@ class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; - Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({num_rois}); int* roi_batch_id_data = roi_batch_id_list.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index a9ad6cdfb659d..adb60a8a8d064 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -30,8 +30,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) int const kThreadsPerBlock = sizeof(uint64_t) * 8; @@ -47,11 +45,11 @@ struct RangeInitFunctor { template static void SortDescending(const phi::GPUContext &ctx, - const Tensor &value, - Tensor *value_out, - Tensor *index_out) { + const phi::DenseTensor &value, + phi::DenseTensor *value_out, + phi::DenseTensor *index_out) { int num = static_cast(value.numel()); - Tensor index_in_t; + phi::DenseTensor index_in_t; int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); platform::ForRange for_range(ctx, num); for_range(RangeInitFunctor{0, 1, idx_in}); @@ -287,10 +285,10 @@ static __global__ void NMSKernel(const int n_boxes, template static void NMS(const phi::GPUContext &ctx, - const Tensor &proposals, - const Tensor &sorted_indices, + const phi::DenseTensor &proposals, + const phi::DenseTensor &sorted_indices, const T nms_threshold, - Tensor *keep_out, + phi::DenseTensor *keep_out, bool pixel_offset = true) { int boxes_num = proposals.dims()[0]; const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index 35368d0034221..583122b473d26 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -18,8 +18,6 @@ limitations under the License. 
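The MLU deformable-conv kernels above transpose every NCHW operand to NHWC with the permutation {0, 2, 3, 1} before the device call, and back afterwards. The index arithmetic that permutation implies, as a framework-free sketch:

#include <vector>

// Rearrange NCHW -> NHWC: element (n, c, h, w) moves to (n, h, w, c).
std::vector<float> ToNHWC(const std::vector<float>& src, int N, int C, int H,
                          int W) {
  std::vector<float> dst(src.size());
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          dst[((n * H + h) * W + w) * C + c] =
              src[((n * C + c) * H + h) * W + w];
  return dst;
}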
 */

 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 class BipartiteMatchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -234,7 +232,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
     auto lod = dist_mat->lod().back();
     for (size_t i = 0; i < lod.size() - 1; ++i) {
       if (lod[i + 1] > lod[i]) {
-        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
+        phi::DenseTensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
         BipartiteMatch(one_ins, indices + i * col, dist + i * col);
         if (type == "per_prediction") {
           ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
index 089f2f5569234..79f3b18b2dfce 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cu
+++ b/paddle/fluid/operators/detection/box_clip_op.cu
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
 using LoDTenso = phi::DenseTensor;

 static constexpr int ImInfoSize = 3;
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
index bb72ca194b54c..cb067f91662ed 100644
--- a/paddle/fluid/operators/detection/box_clip_op.h
+++ b/paddle/fluid/operators/detection/box_clip_op.h
@@ -19,8 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class BoxClipKernel : public framework::OpKernel<T> {
  public:
@@ -42,9 +40,10 @@ class BoxClipKernel : public framework::OpKernel<T> {
     auto box_lod = input_box->lod().back();
     int64_t n = static_cast<int64_t>(box_lod.size() - 1);
     for (int i = 0; i < n; ++i) {
-      Tensor im_info_slice = im_info->Slice(i, i + 1);
-      Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
-      Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]);
+      phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1);
+      phi::DenseTensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]);
+      phi::DenseTensor output_slice =
+          output_box->Slice(box_lod[i], box_lod[i + 1]);
       ClipTiledBoxes(dev_ctx, im_info_slice, box_slice, &output_slice);
     }
   }
diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc
index 865f218170f45..089f58558ae73 100644
--- a/paddle/fluid/operators/detection/box_coder_op_npu.cc
+++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc
@@ -18,8 +18,6 @@ limitations under the License.
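ClipTiledBoxes, invoked once per LoD slice above, clamps box coordinates to the image extent carried in im_info. A minimal scalar version (hypothetical helper; Paddle's implementation also supports a no-pixel-offset mode, approximated here by the flag):

#include <algorithm>

// Clamp one (xmin, ymin, xmax, ymax) box to an im_h x im_w image.
// Under the pixel-offset convention the inclusive upper bound is size - 1.
void ClipBox(float box[4], float im_h, float im_w, bool pixel_offset = true) {
  float off = pixel_offset ? 1.f : 0.f;
  box[0] = std::max(std::min(box[0], im_w - off), 0.f);  // xmin
  box[1] = std::max(std::min(box[1], im_h - off), 0.f);  // ymin
  box[2] = std::max(std::min(box[2], im_w - off), 0.f);  // xmax
  box[3] = std::max(std::min(box[3], im_h - off), 0.f);  // ymax
}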
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct BoxCoderFunction { public: @@ -28,31 +26,31 @@ struct BoxCoderFunction { stream = ctx.template device_context() .stream(); } - Tensor Adds(const phi::DenseTensor& x, float scalar) { - Tensor y; + phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) { + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); runner.Run(stream); return y; } - Tensor Muls(const phi::DenseTensor& x, float scalar) { - Tensor y; + phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) { + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); runner.Run(stream); return y; } - Tensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { - Tensor z; + phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor z; z.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); runner.Run(stream); return z; } - Tensor SubWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; z.mutable_data(shape, place); const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); runner.Run(stream); @@ -66,10 +64,10 @@ struct BoxCoderFunction { const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor DivWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; DivWithBroadCastVoid(x, y, shape, &z); return z; } @@ -81,10 +79,10 @@ struct BoxCoderFunction { const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor MulWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; MulWithBroadCastVoid(x, y, shape, &z); return z; } @@ -96,36 +94,36 @@ struct BoxCoderFunction { const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor AddWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; AddWithBroadCastVoid(x, y, shape, &z); return z; } - Tensor Abs(const phi::DenseTensor& x) { - Tensor y; + phi::DenseTensor Abs(const phi::DenseTensor& x) { + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); runner.Run(stream); return y; } - Tensor Log(const phi::DenseTensor& x) { - Tensor t_x_m1 = Adds(x, -1); - Tensor y; + phi::DenseTensor Log(const phi::DenseTensor& x) { + phi::DenseTensor t_x_m1 = Adds(x, -1); + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); runner.Run(stream); return y; } - Tensor Exp(const phi::DenseTensor& x) { - Tensor y; + phi::DenseTensor Exp(const phi::DenseTensor& x) { + 
phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); runner.Run(stream); return y; } - Tensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { auto dim_x = x.dims(); auto dim_y = y.dims(); PADDLE_ENFORCE_EQ( @@ -145,7 +143,7 @@ struct BoxCoderFunction { "got dim_x[1] = %d, dim_y[0] = %d.", dim_x[1], dim_y[0])); - Tensor z; + phi::DenseTensor z; z.mutable_data({dim_x[0], dim_y[1]}, place); const auto& runner = NpuOpRunner("MatMul", @@ -155,7 +153,7 @@ struct BoxCoderFunction { runner.Run(stream); return z; } - void ConcatVoid(const std::vector& inputs, + void ConcatVoid(const std::vector& inputs, const framework::DDim& shape_out, int axis, phi::DenseTensor* output) { @@ -172,18 +170,18 @@ struct BoxCoderFunction { runner.AddInputNames(names); runner.Run(stream); } - Tensor Concat(const std::vector& inputs, - const framework::DDim& shape_out, - int axis) { - Tensor output; + phi::DenseTensor Concat(const std::vector& inputs, + const framework::DDim& shape_out, + int axis) { + phi::DenseTensor output; ConcatVoid(inputs, shape_out, axis, &output); return output; } - Tensor Slice(const phi::DenseTensor& x, - const std::vector& offsets, - const std::vector& size, - const framework::DDim& shape) { - Tensor y; + phi::DenseTensor Slice(const phi::DenseTensor& x, + const std::vector& offsets, + const std::vector& size, + const framework::DDim& shape) { + phi::DenseTensor y; y.mutable_data(shape, place); const auto& runner = NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); @@ -218,8 +216,8 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, auto M = pb->dims()[0]; auto N = tb->dims()[0]; auto shape_0 = phi::make_ddim({4, 2}); - Tensor m_diff; - Tensor m_aver; + phi::DenseTensor m_diff; + phi::DenseTensor m_aver; std::vector vec_diff = {static_cast(-1), static_cast(0), static_cast(0), @@ -240,10 +238,10 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); BoxCoderFunction F(ctx); - Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - Tensor tb_xy = F.Dot(*tb, m_aver); - Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1)); + phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + phi::DenseTensor tb_xy = F.Dot(*tb, m_aver); + phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); pb_xy.Resize({1, M, 2}); pb_wh.Resize({1, M, 2}); @@ -253,15 +251,16 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, auto shape_half = phi::make_ddim({N, M, 2}); auto shape_full = phi::make_ddim({N, M, 4}); - Tensor out_xy_0 = F.DivWithBroadCast( + phi::DenseTensor out_xy_0 = F.DivWithBroadCast( F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); - Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); - Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); + phi::DenseTensor out_wh_0 = + F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); + phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); if (pbv) { F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); } else { - Tensor t_var; + phi::DenseTensor t_var; std::vector vec_var(4); for (auto i = 0; i < 4; i++) { vec_var[i] = static_cast(variance[i]); @@ -281,8 +280,8 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, int axis, phi::DenseTensor* out) { auto shape_0 = phi::make_ddim({4, 2}); - Tensor m_diff; - Tensor m_aver; + phi::DenseTensor m_diff; + phi::DenseTensor m_aver; std::vector vec_diff = {static_cast(-1), static_cast(0), static_cast(0), @@ -303,8 +302,8 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); BoxCoderFunction F(ctx); - Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2}) : phi::make_ddim({pb->dims()[0], 1, 2}); pb_xy.Resize(pb_resize_shape); @@ -313,18 +312,22 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2}); std::vector tbox_slice_size = { static_cast(tb->dims()[0]), static_cast(tb->dims()[1]), 2}; - Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); - Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); + phi::DenseTensor tbox01 = + F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); + phi::DenseTensor tbox23 = + F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); - Tensor tb_xy; - Tensor tb_wh; + phi::DenseTensor tb_xy; + phi::DenseTensor tb_wh; if (pbv) { auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2}); auto pbvt_resize_shape = axis == 0 ? 
phi::make_ddim({1, pbv->dims()[0], 2}) : phi::make_ddim({pbv->dims()[0], 1, 2}); std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; - Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); - Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); + phi::DenseTensor pbv_t01 = + F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); + phi::DenseTensor pbv_t23 = + F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); pbv_t01.Resize(pbvt_resize_shape); pbv_t23.Resize(pbvt_resize_shape); @@ -345,7 +348,7 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, &tb_xy); F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); } else { - Tensor t_var01, t_var23; + phi::DenseTensor t_var01, t_var23; auto t_var_shape = phi::make_ddim({1, 1, 2}); std::vector vec_var01 = {static_cast(variance[0]), static_cast(variance[1])}; @@ -366,9 +369,9 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, tbox_slice_shape, &tb_wh); } - Tensor obox01 = + phi::DenseTensor obox01 = F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); - Tensor obox23 = + phi::DenseTensor obox23 = F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), (norm ? 0 : -1)); F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index 37dc10df7292a..e07e4034f330f 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -16,7 +16,6 @@ limitations under the License.*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class CollectFpnProposalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b517f2ec1fdd3..29cf8da067f84 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -33,8 +33,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 64; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -74,13 +72,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int real_post_num = min(post_nms_topN, total_roi_num); fpn_rois->mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); - Tensor concat_rois; - Tensor concat_scores; + phi::DenseTensor concat_rois; + phi::DenseTensor concat_scores; T* concat_rois_data = concat_rois.mutable_data( {total_roi_num, kBBoxSize}, dev_ctx.GetPlace()); T* concat_scores_data = concat_scores.mutable_data({total_roi_num, 1}, dev_ctx.GetPlace()); - Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({total_roi_num}); int* roi_batch_id_data = roi_batch_id_list.mutable_data(platform::CPUPlace()); @@ -130,20 +128,20 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { } // copy batch id list to GPU - Tensor roi_batch_id_list_gpu; + phi::DenseTensor roi_batch_id_list_gpu; framework::TensorCopy( roi_batch_id_list, dev_ctx.GetPlace(), &roi_batch_id_list_gpu); - Tensor index_in_t; + phi::DenseTensor index_in_t; int* idx_in = index_in_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); platform::ForRange 
for_range_total(dev_ctx, total_roi_num); for_range_total(RangeInitFunctor{0, 1, idx_in}); - Tensor keys_out_t; + phi::DenseTensor keys_out_t; T* keys_out = keys_out_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); - Tensor index_out_t; + phi::DenseTensor index_out_t; int* idx_out = index_out_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); @@ -175,21 +173,21 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sizeof(T) * 8, dev_ctx.stream()); index_out_t.Resize({real_post_num}); - Tensor sorted_rois; + phi::DenseTensor sorted_rois; sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); - Tensor sorted_batch_id; + phi::DenseTensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); phi::funcs::GPUGather( dev_ctx, roi_batch_id_list_gpu, index_out_t, &sorted_batch_id); - Tensor batch_index_t; + phi::DenseTensor batch_index_t; int* batch_idx_in = batch_index_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); platform::ForRange for_range_post(dev_ctx, real_post_num); for_range_post(RangeInitFunctor{0, 1, batch_idx_in}); - Tensor out_id_t; + phi::DenseTensor out_id_t; int* out_id_data = out_id_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); // Determine temporary device storage requirements @@ -222,7 +220,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); - Tensor length_lod; + phi::DenseTensor length_lod; int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); phi::funcs::SetConstant set_zero; diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc index a6f9170712d96..d1a609ad45de6 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc @@ -15,7 +15,6 @@ limitations under the License. 
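The collect_fpn_proposals kernel above radix-sorts all gathered scores in descending order on the device and then gathers rois and batch ids through the sorted index tensor. The same argsort-then-gather idea, sketched on the CPU with the standard library:

#include <algorithm>
#include <numeric>
#include <vector>

// Sort row indices of `scores` in descending order, then gather rows of a
// [num, 4] roi table in that order: a CPU analogue of the device-side
// sort-by-key followed by GPUGather in the kernel above.
std::vector<float> GatherByScore(const std::vector<float>& scores,
                                 const std::vector<float>& rois) {
  std::vector<int> idx(scores.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::stable_sort(idx.begin(), idx.end(),
                   [&](int a, int b) { return scores[a] > scores[b]; });
  std::vector<float> sorted(rois.size());
  for (int r = 0; r < static_cast<int>(idx.size()); ++r)
    for (int k = 0; k < 4; ++k) sorted[r * 4 + k] = rois[idx[r] * 4 + k];
  return sorted;
}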
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using fp16 = paddle::platform::float16; template @@ -89,7 +88,7 @@ struct DensityPriorBoxFunction { const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Concat(const std::vector& inputs, + void Concat(const std::vector& inputs, int axis, phi::DenseTensor* output) { // output should be init first @@ -131,14 +130,14 @@ struct DensityPriorBoxFunction { platform::Place place; aclrtStream stream; const framework::ExecutionContext& ctx; - Tensor t0; - Tensor t1; - Tensor tn; + phi::DenseTensor t0; + phi::DenseTensor t1; + phi::DenseTensor tn; }; template <> void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { - Tensor x_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor x_fp32(experimental::DataType::FLOAT32); x_fp32.mutable_data(x->dims(), place); FillNpuTensorWithConstant(&tn, static_cast(n)); const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); @@ -149,7 +148,7 @@ void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { template <> void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, phi::DenseTensor* tsr_dst) { - Tensor tsr_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor tsr_fp32(experimental::DataType::FLOAT32); tsr_fp32.mutable_data(tsr_dst->dims(), place); framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); ctx.template device_context().Wait(); @@ -185,9 +184,9 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); DensityPriorBoxFunction F(ctx); - Tensor h(_type); + phi::DenseTensor h(_type); h.mutable_data({layer_h}, place); - Tensor w(_type); + phi::DenseTensor w(_type); w.mutable_data({layer_w}, place); F.Arange(layer_h, &h); F.Arange(layer_w, &w); @@ -203,11 +202,11 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { for (size_t i = 0; i < densities.size(); ++i) { num_priors_per_ratio += densities[i] * densities[i]; } - Tensor di(_type); - Tensor dj(_type); - Tensor shifts(_type); - Tensor box_w_ratio(_type); - Tensor box_h_ratio(_type); + phi::DenseTensor di(_type); + phi::DenseTensor dj(_type); + phi::DenseTensor shifts(_type); + phi::DenseTensor box_w_ratio(_type); + phi::DenseTensor box_h_ratio(_type); di.mutable_data({ratios_size * num_priors_per_ratio}, place); dj.mutable_data({ratios_size * num_priors_per_ratio}, place); shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); @@ -220,19 +219,21 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { // Range = start:start+ratios_size*density_sqr, density = densities[i] int density_sqr = densities[i] * densities[i]; // shifts[Range] = [step_average/density]*ratios_size*density_sqr - Tensor shifts_part = + phi::DenseTensor shifts_part = shifts.Slice(start, start + ratios_size * density_sqr); FillNpuTensorWithConstant(&shifts_part, static_cast(step_average / densities[i])); // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size - Tensor di_part = di.Slice(start, start + ratios_size * density_sqr); - Tensor dj_part = dj.Slice(start, start + ratios_size * density_sqr); + phi::DenseTensor di_part = + di.Slice(start, start + ratios_size * density_sqr); + phi::DenseTensor dj_part = + dj.Slice(start, start + ratios_size * density_sqr); if (densities[i] > 1) { di_part.Resize({ratios_size, densities[i], densities[i]}); dj_part.Resize({ratios_size, densities[i], 
densities[i]}); - Tensor range_n(_type); + phi::DenseTensor range_n(_type); range_n.mutable_data({densities[i]}, place); F.Arange(densities[i], &range_n); range_n.Resize({1, densities[i], 1}); @@ -254,9 +255,9 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { // Range_mini = start_box_ratio:start_box_ratio+density_sqr // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr - Tensor box_h_ratio_part = + phi::DenseTensor box_h_ratio_part = box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - Tensor box_w_ratio_part = + phi::DenseTensor box_w_ratio_part = box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); FillNpuTensorWithConstant(&box_w_ratio_part, static_cast(fixed_sizes[i] * sqrt(ar))); @@ -274,8 +275,8 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts - Tensor c_x(_type); - Tensor c_y(_type); + phi::DenseTensor c_x(_type); + phi::DenseTensor c_y(_type); auto dim0 = phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1}); auto dim1 = @@ -301,17 +302,17 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); - Tensor zero_t(_type); - Tensor one_t(_type); + phi::DenseTensor zero_t(_type); + phi::DenseTensor one_t(_type); zero_t.mutable_data({1}, place); one_t.mutable_data({1}, place); FillNpuTensorWithConstant(&zero_t, static_cast(0)); FillNpuTensorWithConstant(&one_t, static_cast(1)); - Tensor outbox0(_type); - Tensor outbox1(_type); - Tensor outbox2(_type); - Tensor outbox3(_type); + phi::DenseTensor outbox0(_type); + phi::DenseTensor outbox1(_type); + phi::DenseTensor outbox2(_type); + phi::DenseTensor outbox3(_type); outbox0.mutable_data(dim0, place); outbox1.mutable_data(dim1, place); outbox2.mutable_data(dim0, place); @@ -349,17 +350,17 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); boxes->mutable_data(place); vars->mutable_data(place); - Tensor boxes_share(_type); - Tensor vars_share(_type); + phi::DenseTensor boxes_share(_type); + phi::DenseTensor vars_share(_type); boxes_share.ShareDataWith(*boxes); boxes_share.Resize(out_dim); vars_share.ShareDataWith(*vars); vars_share.Resize(out_dim); - Tensor box0(_type); - Tensor box1(_type); - Tensor box2(_type); - Tensor box3(_type); + phi::DenseTensor box0(_type); + phi::DenseTensor box1(_type); + phi::DenseTensor box2(_type); + phi::DenseTensor box3(_type); // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} out_dim[3] = 1; box0.mutable_data(out_dim, place); @@ -377,7 +378,7 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { std::vector multiples = { layer_h, layer_w, ratios_size * num_priors_per_ratio, 1}; - Tensor variances_t(_type); + phi::DenseTensor variances_t(_type); // variances.size() == 4 variances_t.mutable_data({4}, place); F.FloatVec2Tsr(variances, &variances_t); diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index f14768168a425..7ae5ba6ca8f9c 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -25,7 +25,6 @@ limitations 
under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kBoxDim = 4; template @@ -151,16 +150,17 @@ static inline void ExpandMaskTarget(const phi::CPUContext& ctx, } template -std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, - const phi::DenseTensor& im_info, - const phi::DenseTensor& gt_classes, - const phi::DenseTensor& is_crowd, - const phi::DenseTensor& gt_segms, - const phi::DenseTensor& rois, - const phi::DenseTensor& label_int32, - const int num_classes, - const int resolution, - const framework::LoD& segm_length) { +std::vector SampleMaskForOneImage( + const phi::CPUContext& ctx, + const phi::DenseTensor& im_info, + const phi::DenseTensor& gt_classes, + const phi::DenseTensor& is_crowd, + const phi::DenseTensor& gt_segms, + const phi::DenseTensor& rois, + const phi::DenseTensor& label_int32, + const int num_classes, + const int resolution, + const framework::LoD& segm_length) { // Prepare the mask targets by associating one gt mask to each training roi // that has a fg (non-bg) class label. const int64_t gt_size = static_cast(gt_classes.dims()[0]); @@ -218,15 +218,15 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, int gt_num = mask_gt_inds.size(); int fg_num = fg_inds.size(); - Tensor boxes_from_polys; + phi::DenseTensor boxes_from_polys; boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); Poly2Boxes(gt_polys, boxes_from_polys.data()); std::vector roi_has_mask = std::vector(fg_inds.begin(), fg_inds.end()); - Tensor mask_class_labels; - Tensor masks; - Tensor rois_fg; + phi::DenseTensor mask_class_labels; + phi::DenseTensor masks; + phi::DenseTensor rois_fg; auto im_scale = im_info.data()[2]; if (fg_num > 0) { @@ -251,7 +251,7 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, rois_fg_data[k] = rois_fg_data[k] / im_scale; } - Tensor overlaps_bbfg_bbpolys; + phi::DenseTensor overlaps_bbfg_bbpolys; overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); @@ -306,7 +306,7 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); } - Tensor masks_expand; + phi::DenseTensor masks_expand; ExpandMaskTarget( ctx, masks, mask_class_labels, resolution, num_classes, &masks_expand); @@ -315,13 +315,13 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, rois_fg_data[k] = rois_fg_data[k] * im_scale; } - Tensor roi_has_mask_t; + phi::DenseTensor roi_has_mask_t; int roi_has_mask_size = roi_has_mask.size(); int* roi_has_mask_data = roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); - std::vector res; + std::vector res; res.emplace_back(rois_fg); res.emplace_back(roi_has_mask_t); res.emplace_back(masks_expand); @@ -405,23 +405,23 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { lod0.emplace_back(num_mask); continue; } - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor gt_classes_slice = + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); - Tensor is_crowd_slice = + phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor label_int32_slice = + phi::DenseTensor label_int32_slice = label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]); - Tensor rois_slice = 
rois->Slice(rois_lod[i], rois_lod[i + 1]); + phi::DenseTensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]); auto sub_lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0); auto lod_length = sub_lod_and_offset.first; size_t s = sub_lod_and_offset.second.first; size_t e = sub_lod_and_offset.second.second; - Tensor gt_segms_slice = gt_segms->Slice(s, e); + phi::DenseTensor gt_segms_slice = gt_segms->Slice(s, e); - std::vector tensor_output = + std::vector tensor_output = SampleMaskForOneImage(dev_ctx, im_info_slice, gt_classes_slice, @@ -433,9 +433,9 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { resolution, lod_length); - Tensor sampled_mask_rois = tensor_output[0]; - Tensor sampled_roi_has_mask_int32 = tensor_output[1]; - Tensor sampled_mask_int32 = tensor_output[2]; + phi::DenseTensor sampled_mask_rois = tensor_output[0]; + phi::DenseTensor sampled_roi_has_mask_int32 = tensor_output[1]; + phi::DenseTensor sampled_mask_int32 = tensor_output[2]; AppendMask(mask_rois, kBoxDim * num_mask, &sampled_mask_rois); AppendMask( diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 1071641b6bc60..b11030f1d086a 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -25,7 +25,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kBoxDim = 4; template @@ -174,7 +173,7 @@ void Concat(const phi::CPUContext& context, const phi::DenseTensor& in_tensor_b, phi::DenseTensor* out_tensor) { int axis = 0; - std::vector inputs; + std::vector inputs; inputs.emplace_back(in_tensor_a); inputs.emplace_back(in_tensor_b); math::ConcatFunctor concat_functor; @@ -300,7 +299,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, phi::DenseTensor* sampled_max_overlap) { int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); - Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; + phi::DenseTensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); int* gt_box_inds_data = @@ -312,7 +311,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data); std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data); - Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; + phi::DenseTensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); @@ -325,7 +324,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); - Tensor fg_max_overlap, bg_max_overlap; + phi::DenseTensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); @@ -334,7 +333,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, } template -std::vector SampleRoisForOneImage( +std::vector SampleRoisForOneImage( const phi::CPUContext& context, const phi::DenseTensor& rpn_rois_in, const 
phi::DenseTensor& gt_classes, @@ -355,7 +354,7 @@ std::vector SampleRoisForOneImage( const phi::DenseTensor& max_overlap) { // 1.1 map to original image auto im_scale = im_info.data()[2]; - Tensor rpn_rois; + phi::DenseTensor rpn_rois; rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); const T* rpn_rois_in_dt = rpn_rois_in.data(); T* rpn_rois_dt = rpn_rois.data(); @@ -367,10 +366,10 @@ std::vector SampleRoisForOneImage( int proposals_num = 1; if (is_cascade_rcnn) { - Tensor keep; + phi::DenseTensor keep; FilterRoIs(context, rpn_rois, max_overlap, &keep); - Tensor roi_filter; - // Tensor box_filter; + phi::DenseTensor roi_filter; + // phi::DenseTensor box_filter; if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); @@ -389,16 +388,16 @@ std::vector SampleRoisForOneImage( // 1.2 compute overlaps proposals_num += gt_boxes.dims()[0]; - Tensor proposal_to_gt_overlaps; + phi::DenseTensor proposal_to_gt_overlaps; proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); - Tensor boxes; + phi::DenseTensor boxes; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); Concat(context, gt_boxes, rpn_rois, &boxes); BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); - Tensor proposal_with_max_overlap; + phi::DenseTensor proposal_with_max_overlap; proposal_with_max_overlap.mutable_data({proposals_num}, context.GetPlace()); @@ -423,7 +422,8 @@ std::vector SampleRoisForOneImage( std::vector mapped_gt_inds = fg_bg_gt[2]; // mapped_gt_labels // Gather boxes and labels - Tensor sampled_boxes, sampled_labels, sampled_gts, sampled_max_overlap; + phi::DenseTensor sampled_boxes, sampled_labels, sampled_gts, + sampled_max_overlap; int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); int boxes_num = fg_num + bg_num; @@ -446,7 +446,7 @@ std::vector SampleRoisForOneImage( &sampled_max_overlap); // Compute targets - Tensor bbox_targets_single; + phi::DenseTensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); BoxToDelta(fg_num, sampled_boxes, @@ -456,14 +456,14 @@ std::vector SampleRoisForOneImage( &bbox_targets_single); // Scale rois - Tensor sampled_rois; + phi::DenseTensor sampled_rois; sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); sampled_rois_et = sampled_boxes_et * im_scale; // Expand box targets - Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; + phi::DenseTensor bbox_targets, bbox_inside_weights, bbox_outside_weights; framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums}); bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); @@ -500,7 +500,7 @@ std::vector SampleRoisForOneImage( bbox_outside_weights_data[dst_idx + 3] = 1; } } - std::vector res; + std::vector res; res.emplace_back(sampled_rois); res.emplace_back(sampled_labels); res.emplace_back(bbox_targets); @@ -610,16 +610,16 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { lod0.emplace_back(num_rois); continue; } - Tensor rpn_rois_slice = + phi::DenseTensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); - Tensor gt_classes_slice = + phi::DenseTensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); - Tensor is_crowd_slice = + 
phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor gt_boxes_slice = + phi::DenseTensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor max_overlap_slice; + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor max_overlap_slice; if (is_cascade_rcnn) { auto* max_overlap = context.Input("MaxOverlap"); max_overlap_slice = @@ -628,7 +628,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { max_overlap_slice.mutable_data({rpn_rois_slice.dims()[0]}, context.GetPlace()); } - std::vector tensor_output = + std::vector tensor_output = SampleRoisForOneImage(dev_ctx, rpn_rois_slice, gt_classes_slice, @@ -647,12 +647,12 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { is_cascade_rcnn, is_cls_agnostic, max_overlap_slice); - Tensor sampled_rois = tensor_output[0]; - Tensor sampled_labels_int32 = tensor_output[1]; - Tensor sampled_bbox_targets = tensor_output[2]; - Tensor sampled_bbox_inside_weights = tensor_output[3]; - Tensor sampled_bbox_outside_weights = tensor_output[4]; - Tensor sampled_max_overlap = tensor_output[5]; + phi::DenseTensor sampled_rois = tensor_output[0]; + phi::DenseTensor sampled_labels_int32 = tensor_output[1]; + phi::DenseTensor sampled_bbox_targets = tensor_output[2]; + phi::DenseTensor sampled_bbox_inside_weights = tensor_output[3]; + phi::DenseTensor sampled_bbox_outside_weights = tensor_output[4]; + phi::DenseTensor sampled_max_overlap = tensor_output[5]; AppendRois(rois, kBoxDim * num_rois, &sampled_rois); AppendRois(labels_int32, num_rois, &sampled_labels_int32); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 6491c8b8fcece..030b99cd1dbd7 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -27,8 +27,6 @@ limitations under the License. 
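SampleRoisForOneImage feeds its sampled boxes and matched ground truths through BoxToDelta to produce the bbox_targets gathered above. The conventional center/size delta encoding looks roughly like this (a sketch, assuming the +1 pixel-offset convention; Paddle's helper additionally folds in optional per-coordinate weights):

#include <cmath>

// Encode ground-truth box g against proposal p as regression targets:
//   tx = (gx - px) / pw,  ty = (gy - py) / ph,
//   tw = log(gw / pw),    th = log(gh / ph)
// Boxes are (xmin, ymin, xmax, ymax).
void BoxToDeltaSketch(const float p[4], const float g[4], float t[4]) {
  float pw = p[2] - p[0] + 1.f, ph = p[3] - p[1] + 1.f;
  float px = p[0] + 0.5f * pw, py = p[1] + 0.5f * ph;
  float gw = g[2] - g[0] + 1.f, gh = g[3] - g[1] + 1.f;
  float gx = g[0] + 0.5f * gw, gy = g[1] + 0.5f * gh;
  t[0] = (gx - px) / pw;
  t[1] = (gy - py) / ph;
  t[2] = std::log(gw / pw);
  t[3] = std::log(gh / ph);
}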
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class GenerateProposalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -115,7 +113,7 @@ class GenerateProposalsKernel : public framework::OpKernel { context.GetPlace()); rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); - Tensor bbox_deltas_swap, scores_swap; + phi::DenseTensor bbox_deltas_swap, scores_swap; bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, dev_ctx.GetPlace()); scores_swap.mutable_data({num, h_score, w_score, c_score}, @@ -136,14 +134,14 @@ class GenerateProposalsKernel : public framework::OpKernel { int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + phi::DenseTensor scores_slice = scores_swap.Slice(i, i + 1); bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); - std::pair tensor_pair = + std::pair tensor_pair = ProposalForOneImage(dev_ctx, im_info_slice, anchors, @@ -155,8 +153,8 @@ class GenerateProposalsKernel : public framework::OpKernel { nms_thresh, min_size, eta); - Tensor &proposals = tensor_pair.first; - Tensor &scores = tensor_pair.second; + phi::DenseTensor &proposals = tensor_pair.first; + phi::DenseTensor &scores = tensor_pair.second; AppendProposals(rpn_rois, 4 * num_proposals, proposals); AppendProposals(rpn_roi_probs, num_proposals, scores); @@ -179,13 +177,13 @@ class GenerateProposalsKernel : public framework::OpKernel { rpn_roi_probs->Resize({num_proposals, 1}); } - std::pair ProposalForOneImage( + std::pair ProposalForOneImage( const phi::CPUContext &ctx, - const Tensor &im_info_slice, - const Tensor &anchors, - const Tensor &variances, - const Tensor &bbox_deltas_slice, // [M, 4] - const Tensor &scores_slice, // [N, 1] + const phi::DenseTensor &im_info_slice, + const phi::DenseTensor &anchors, + const phi::DenseTensor &variances, + const phi::DenseTensor &bbox_deltas_slice, // [M, 4] + const phi::DenseTensor &scores_slice, // [N, 1] int pre_nms_top_n, int post_nms_top_n, float nms_thresh, @@ -194,7 +192,7 @@ class GenerateProposalsKernel : public framework::OpKernel { auto *scores_data = scores_slice.data(); // Sort index - Tensor index_t; + phi::DenseTensor index_t; index_t.Resize({scores_slice.numel()}); int *index = index_t.mutable_data(ctx.GetPlace()); for (int i = 0; i < scores_slice.numel(); ++i) { @@ -212,7 +210,7 @@ class GenerateProposalsKernel : public framework::OpKernel { index_t.Resize({pre_nms_top_n}); } - Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + phi::DenseTensor scores_sel, bbox_sel, anchor_sel, var_sel; scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,26 +221,26 @@ class GenerateProposalsKernel : public framework::OpKernel { phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); - Tensor proposals; + phi::DenseTensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); ClipTiledBoxes(ctx, 
im_info_slice, proposals, &proposals, false); - Tensor keep; + phi::DenseTensor keep; FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); - Tensor scores_filter; + phi::DenseTensor scores_filter; scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &scores_filter, static_cast(0)); return std::make_pair(bbox_sel, scores_filter); } - Tensor scores_filter; + phi::DenseTensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); @@ -251,7 +249,7 @@ class GenerateProposalsKernel : public framework::OpKernel { return std::make_pair(bbox_sel, scores_filter); } - Tensor keep_nms = + phi::DenseTensor keep_nms = phi::funcs::NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 0890ff493332c..5d7a034c28a8f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -28,24 +28,22 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - namespace { template -static std::pair ProposalForOneImage( +static std::pair ProposalForOneImage( const phi::GPUContext &ctx, - const Tensor &im_info, - const Tensor &anchors, - const Tensor &variances, - const Tensor &bbox_deltas, // [M, 4] - const Tensor &scores, // [N, 1] + const phi::DenseTensor &im_info, + const phi::DenseTensor &anchors, + const phi::DenseTensor &variances, + const phi::DenseTensor &bbox_deltas, // [M, 4] + const phi::DenseTensor &scores, // [N, 1] int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, float eta) { // 1. pre nms - Tensor scores_sort, index_sort; + phi::DenseTensor scores_sort, index_sort; SortDescending(ctx, scores, &scores_sort, &index_sort); int num = scores.numel(); int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() @@ -54,7 +52,7 @@ static std::pair ProposalForOneImage( index_sort.Resize({pre_nms_num, 1}); // 2. box decode and clipping - Tensor proposals; + phi::DenseTensor proposals; proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { @@ -68,7 +66,7 @@ static std::pair ProposalForOneImage( } // 3. filter - Tensor keep_index, keep_num_t; + phi::DenseTensor keep_index, keep_num_t; keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); keep_num_t.mutable_data({1}, ctx.GetPlace()); min_size = std::max(min_size, 1.0f); @@ -90,7 +88,7 @@ static std::pair ProposalForOneImage( ctx.Wait(); keep_index.Resize({keep_num}); - Tensor scores_filter, proposals_filter; + phi::DenseTensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { phi::funcs::SetConstant set_zero; @@ -110,13 +108,13 @@ static std::pair ProposalForOneImage( } // 4. 
nms - Tensor keep_nms; + phi::DenseTensor keep_nms; NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); } - Tensor scores_nms, proposals_nms; + phi::DenseTensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); @@ -171,7 +169,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { int64_t h_bbox = bbox_dim[2]; int64_t w_bbox = bbox_dim[3]; - Tensor bbox_deltas_swap, scores_swap; + phi::DenseTensor bbox_deltas_swap, scores_swap; bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, dev_ctx.GetPlace()); scores_swap.mutable_data({num, h_score, w_score, c_score}, @@ -200,14 +198,14 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { std::vector tmp_num; for (int64_t i = 0; i < num; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + phi::DenseTensor scores_slice = scores_swap.Slice(i, i + 1); bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); - std::pair box_score_pair = + std::pair box_score_pair = ProposalForOneImage(dev_ctx, im_info_slice, anchors, @@ -220,8 +218,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { min_size, eta); - Tensor &proposals = box_score_pair.first; - Tensor &scores = box_score_pair.second; + phi::DenseTensor &proposals = box_score_pair.first; + phi::DenseTensor &scores = box_score_pair.second; memory::Copy(place, rpn_rois_data + num_proposals * 4, diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 08c7a649c1e1f..0445c21b1de3b 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -29,8 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class GenerateProposalsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc index 22bba5c57ffd8..2909c333e16ac 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. 
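ProposalForOneImage above is the whole pipeline in four numbered steps: pre-NMS top-k by score, box decode and clipping, small-box filtering, then NMS. A compact greedy NMS over boxes already sorted by score (CPU sketch; the CUDA kernel reaches the same suppression result with 64-bit overlap masks):

#include <algorithm>
#include <vector>

// Intersection-over-union of two (xmin, ymin, xmax, ymax) boxes.
// Degenerate zero-area pairs are not guarded here; this is a sketch.
float IoU(const float* a, const float* b) {
  float ix = std::max(a[0], b[0]), iy = std::max(a[1], b[1]);
  float ax = std::min(a[2], b[2]), ay = std::min(a[3], b[3]);
  float inter = std::max(ax - ix, 0.f) * std::max(ay - iy, 0.f);
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  return inter / (area_a + area_b - inter);
}

// boxes: flat [n, 4], already sorted by descending score.
std::vector<int> GreedyNMS(const std::vector<float>& boxes, float thresh) {
  int n = static_cast<int>(boxes.size() / 4);
  std::vector<int> keep;
  std::vector<bool> dead(n, false);
  for (int i = 0; i < n; ++i) {
    if (dead[i]) continue;
    keep.push_back(i);  // highest-scoring survivor
    for (int j = i + 1; j < n; ++j)
      if (!dead[j] && IoU(&boxes[i * 4], &boxes[j * 4]) > thresh)
        dead[j] = true;  // suppress overlapping lower-scored boxes
  }
  return keep;
}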
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct IouFunction { public: @@ -182,21 +180,21 @@ class IouSimilarityMLUKernel : public framework::OpKernel { auto M = y->dims()[0]; out->mutable_data({N, M}, place); - Tensor xt(_type); - Tensor yt(_type); + phi::DenseTensor xt(_type); + phi::DenseTensor yt(_type); xt.mutable_data({4, N}, place); yt.mutable_data({4, M}, place); std::vector vec_trans = {1, 0}; F.Transpose(x, &xt, vec_trans); F.Transpose(y, &yt, vec_trans); - Tensor xmin1 = xt.Slice(0, 1); - Tensor ymin1 = xt.Slice(1, 2); - Tensor xmax1 = xt.Slice(2, 3); - Tensor ymax1 = xt.Slice(3, 4); - Tensor xmin2 = yt.Slice(0, 1); - Tensor ymin2 = yt.Slice(1, 2); - Tensor xmax2 = yt.Slice(2, 3); - Tensor ymax2 = yt.Slice(3, 4); + phi::DenseTensor xmin1 = xt.Slice(0, 1); + phi::DenseTensor ymin1 = xt.Slice(1, 2); + phi::DenseTensor xmax1 = xt.Slice(2, 3); + phi::DenseTensor ymax1 = xt.Slice(3, 4); + phi::DenseTensor xmin2 = yt.Slice(0, 1); + phi::DenseTensor ymin2 = yt.Slice(1, 2); + phi::DenseTensor xmax2 = yt.Slice(2, 3); + phi::DenseTensor ymax2 = yt.Slice(3, 4); xmin1.Resize({N, 1}); ymin1.Resize({N, 1}); xmax1.Resize({N, 1}); @@ -206,12 +204,12 @@ class IouSimilarityMLUKernel : public framework::OpKernel { xmax2.Resize({1, M}); ymax2.Resize({1, M}); - Tensor w1(_type); - Tensor h1(_type); - Tensor w2(_type); - Tensor h2(_type); - Tensor area1(_type); - Tensor area2(_type); + phi::DenseTensor w1(_type); + phi::DenseTensor h1(_type); + phi::DenseTensor w2(_type); + phi::DenseTensor h2(_type); + phi::DenseTensor area1(_type); + phi::DenseTensor area2(_type); w1.mutable_data({N, 1}, place); h1.mutable_data({N, 1}, place); w2.mutable_data({1, M}, place); @@ -231,10 +229,10 @@ class IouSimilarityMLUKernel : public framework::OpKernel { F.Mul(&w1, &h1, &area1); F.Mul(&w2, &h2, &area2); - Tensor inter_xmax(_type); - Tensor inter_ymax(_type); - Tensor inter_xmin(_type); - Tensor inter_ymin(_type); + phi::DenseTensor inter_xmax(_type); + phi::DenseTensor inter_ymax(_type); + phi::DenseTensor inter_xmin(_type); + phi::DenseTensor inter_ymin(_type); inter_xmax.mutable_data({N, M}, place); inter_ymax.mutable_data({N, M}, place); inter_xmin.mutable_data({N, M}, place); @@ -244,8 +242,8 @@ class IouSimilarityMLUKernel : public framework::OpKernel { F.Maximum(&xmin1, &xmin2, &inter_xmin); F.Maximum(&ymin1, &ymin2, &inter_ymin); - Tensor inter_w(_type); - Tensor inter_h(_type); + phi::DenseTensor inter_w(_type); + phi::DenseTensor inter_h(_type); inter_w.mutable_data({N, M}, place); inter_h.mutable_data({N, M}, place); F.Sub(&inter_xmax, &inter_xmin, &inter_w); @@ -255,14 +253,14 @@ class IouSimilarityMLUKernel : public framework::OpKernel { F.Adds(&inter_w, 1.0f, &inter_w); F.Adds(&inter_h, 1.0f, &inter_h); } - Tensor zeros(_type); + phi::DenseTensor zeros(_type); zeros.mutable_data({1}, place); FillMLUTensorWithHostValue(ctx, static_cast(0), &zeros); F.Maximum(&inter_w, &zeros, &inter_w); F.Maximum(&inter_h, &zeros, &inter_h); F.Mul(&inter_w, &inter_h, out); - Tensor union_area(_type); + phi::DenseTensor union_area(_type); union_area.mutable_data({N, M}, place); F.Add(&area1, &area2, &union_area); F.Sub(&union_area, out, &union_area); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc index 5708d1ae6460a..7bdd105c37ae0 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc @@ -18,8 +18,6 @@ 
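// The MLU IoU kernel above (and its NPU twin that follows) computes the
// same pairwise overlap: coordinate slices give [N, 1] and [1, M] areas,
// and broadcast max/min produce the intersection. A scalar CPU sketch of
// that math, assuming [xmin, ymin, xmax, ymax] boxes and the kernels'
// "+1" convention for unnormalized coordinates (illustrative code only):
#include <algorithm>
#include <array>
#include <cstddef>
#include <vector>

void IouSimilarity(const std::vector<std::array<float, 4>>& x,
                   const std::vector<std::array<float, 4>>& y,
                   bool normalized, std::vector<float>* out) {
  const float c = normalized ? 0.f : 1.f;  // mirrors F.Adds(..., 1.0f, ...)
  out->assign(x.size() * y.size(), 0.f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    float area1 = (x[i][2] - x[i][0] + c) * (x[i][3] - x[i][1] + c);
    for (std::size_t j = 0; j < y.size(); ++j) {
      float area2 = (y[j][2] - y[j][0] + c) * (y[j][3] - y[j][1] + c);
      float iw = std::min(x[i][2], y[j][2]) - std::max(x[i][0], y[j][0]) + c;
      float ih = std::min(x[i][3], y[j][3]) - std::max(x[i][1], y[j][1]) + c;
      float inter = std::max(iw, 0.f) * std::max(ih, 0.f);  // F.Maximum vs 0
      (*out)[i * y.size() + j] = inter / (area1 + area2 - inter);
    }
  }
}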
limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct IouFunction { public: @@ -108,21 +106,21 @@ class IouSimilarityNPUKernel : public framework::OpKernel { auto M = y->dims()[0]; out->mutable_data({N, M}, place); - Tensor xt(_type); - Tensor yt(_type); + phi::DenseTensor xt(_type); + phi::DenseTensor yt(_type); xt.mutable_data({4, N}, place); yt.mutable_data({4, M}, place); std::vector vec_trans = {1, 0}; F.Transpose(x, &xt, vec_trans); F.Transpose(y, &yt, vec_trans); - Tensor xmin1 = xt.Slice(0, 1); - Tensor ymin1 = xt.Slice(1, 2); - Tensor xmax1 = xt.Slice(2, 3); - Tensor ymax1 = xt.Slice(3, 4); - Tensor xmin2 = yt.Slice(0, 1); - Tensor ymin2 = yt.Slice(1, 2); - Tensor xmax2 = yt.Slice(2, 3); - Tensor ymax2 = yt.Slice(3, 4); + phi::DenseTensor xmin1 = xt.Slice(0, 1); + phi::DenseTensor ymin1 = xt.Slice(1, 2); + phi::DenseTensor xmax1 = xt.Slice(2, 3); + phi::DenseTensor ymax1 = xt.Slice(3, 4); + phi::DenseTensor xmin2 = yt.Slice(0, 1); + phi::DenseTensor ymin2 = yt.Slice(1, 2); + phi::DenseTensor xmax2 = yt.Slice(2, 3); + phi::DenseTensor ymax2 = yt.Slice(3, 4); xmin1.Resize({N, 1}); ymin1.Resize({N, 1}); xmax1.Resize({N, 1}); @@ -132,12 +130,12 @@ class IouSimilarityNPUKernel : public framework::OpKernel { xmax2.Resize({1, M}); ymax2.Resize({1, M}); - Tensor w1(_type); - Tensor h1(_type); - Tensor w2(_type); - Tensor h2(_type); - Tensor area1(_type); - Tensor area2(_type); + phi::DenseTensor w1(_type); + phi::DenseTensor h1(_type); + phi::DenseTensor w2(_type); + phi::DenseTensor h2(_type); + phi::DenseTensor area1(_type); + phi::DenseTensor area2(_type); w1.mutable_data({N, 1}, place); h1.mutable_data({N, 1}, place); w2.mutable_data({1, M}, place); @@ -157,10 +155,10 @@ class IouSimilarityNPUKernel : public framework::OpKernel { F.Mul(&w1, &h1, &area1); F.Mul(&w2, &h2, &area2); - Tensor inter_xmax(_type); - Tensor inter_ymax(_type); - Tensor inter_xmin(_type); - Tensor inter_ymin(_type); + phi::DenseTensor inter_xmax(_type); + phi::DenseTensor inter_ymax(_type); + phi::DenseTensor inter_xmin(_type); + phi::DenseTensor inter_ymin(_type); inter_xmax.mutable_data({N, M}, place); inter_ymax.mutable_data({N, M}, place); inter_xmin.mutable_data({N, M}, place); @@ -170,8 +168,8 @@ class IouSimilarityNPUKernel : public framework::OpKernel { F.Maximum(&xmin1, &xmin2, &inter_xmin); F.Maximum(&ymin1, &ymin2, &inter_ymin); - Tensor inter_w(_type); - Tensor inter_h(_type); + phi::DenseTensor inter_w(_type); + phi::DenseTensor inter_h(_type); inter_w.mutable_data({N, M}, place); inter_h.mutable_data({N, M}, place); F.Sub(&inter_xmax, &inter_xmin, &inter_w); @@ -181,14 +179,14 @@ class IouSimilarityNPUKernel : public framework::OpKernel { F.Adds(&inter_w, 1.0f, &inter_w); F.Adds(&inter_h, 1.0f, &inter_h); } - Tensor zeros(_type); + phi::DenseTensor zeros(_type); zeros.mutable_data({1}, place); FillNpuTensorWithConstant(&zeros, static_cast(0)); F.Maximum(&inter_w, &zeros, &inter_w); F.Maximum(&inter_h, &zeros, &inter_h); F.Mul(&inter_w, &inter_h, out); - Tensor union_area(_type); + phi::DenseTensor union_area(_type); union_area.mutable_data({N, M}, place); F.Add(&area1, &area2, &union_area); F.Sub(&union_area, out, &union_area); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index c2b8833bbd96c..1c5135fc4e8a7 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ 
-19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class LocalityAwareNMSOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -252,7 +250,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { int num_det = 0; int64_t class_num = scores->dims()[0]; - Tensor bbox_slice, score_slice; + phi::DenseTensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { if (c == background_label) continue; @@ -325,7 +323,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); const T* sdata; - Tensor bbox; + phi::DenseTensor bbox; bbox.Resize({scores.dims()[0], box_size}); int count = 0; for (const auto& it : selected_indices) { @@ -370,7 +368,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { int64_t box_dim = boxes.dims()[2]; int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; - Tensor boxes_slice, scores_slice; + phi::DenseTensor boxes_slice, scores_slice; int n = batch_size; for (int i = 0; i < n; ++i) { scores_slice = scores.Slice(i, i + 1); @@ -407,7 +405,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + phi::DenseTensor out = outs->Slice(s, e); LocalityAwareNMSOutput(dev_ctx, scores_slice, boxes_slice, diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 21e52a39c37ab..1beeaf1ba3356 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MatrixNMSOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 5af93551d786f..79077b3086671 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline std::vector GetNmsLodFromRoisNum( const phi::DenseTensor* rois_num) { std::vector rois_lod; @@ -228,7 +226,7 @@ class MultiClassNMSKernel : public framework::OpKernel { int num_det = 0; int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; - Tensor bbox_slice, score_slice; + phi::DenseTensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { if (c == background_label) continue; if (scores_size == 3) { @@ -319,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel { auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); const T* sdata; - Tensor bbox; + phi::DenseTensor bbox; bbox.Resize({scores.dims()[0], box_size}); int count = 0; for (const auto& it : selected_indices) { @@ -373,7 +371,7 @@ class MultiClassNMSKernel : public framework::OpKernel { int64_t box_dim = boxes->dims()[2]; int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; - Tensor boxes_slice, scores_slice; + phi::DenseTensor boxes_slice, scores_slice; int n = 0; if (has_roisnum) { n = score_size == 3 ? 
batch_size : rois_num->numel(); @@ -449,7 +447,7 @@ class MultiClassNMSKernel : public framework::OpKernel { int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + phi::DenseTensor out = outs->Slice(s, e); if (return_index) { int* output_idx = index->mutable_data({num_kept, 1}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index e386465c3bdf6..7135853f9ff8b 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PolygonBoxTransformCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index bbeb9f7f2858a..de43f2d62b455 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::PADDLE_CUDA_NUM_THREADS; #define CUDA_BLOCK_SIZE 16 diff --git a/paddle/fluid/operators/detection/prior_box_op_npu.cc b/paddle/fluid/operators/detection/prior_box_op_npu.cc index 8a3a313be159c..42845ff20f129 100644 --- a/paddle/fluid/operators/detection/prior_box_op_npu.cc +++ b/paddle/fluid/operators/detection/prior_box_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PriorBoxNPUKernel : public framework::OpKernel { public: @@ -50,7 +48,7 @@ class PriorBoxNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); - Tensor out(input->type()); + phi::DenseTensor out(input->type()); auto out_dims = phi::vectorize(boxes->dims()); out_dims.insert(out_dims.begin(), 2); out.Resize(phi::make_ddim(out_dims)); @@ -75,8 +73,8 @@ class PriorBoxNPUKernel : public framework::OpKernel { runner.Run(stream); out.Resize(phi::make_ddim({out.numel()})); - Tensor out_boxes = out.Slice(0, boxes->numel()); - Tensor out_variances = out.Slice(boxes->numel(), out.numel()); + phi::DenseTensor out_boxes = out.Slice(0, boxes->numel()); + phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel()); out_boxes.Resize(boxes->dims()); out_variances.Resize(variances->dims()); diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index a38765e28d786..d2654e086d08d 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -18,8 +18,6 @@ limitations under the License. 
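// MultiClassNMS and LocalityAwareNMS above emit a variable number of rows
// per image, so the kernels keep a prefix-sum "LoD": batch_starts[i] marks
// where image i's detections begin in the flat output, and each image is
// written back through outs->Slice(s, e). A sketch of that bookkeeping:
#include <cstddef>
#include <vector>

std::vector<std::size_t> BuildBatchStarts(
    const std::vector<int>& num_per_image) {
  std::vector<std::size_t> batch_starts = {0};
  for (int n : num_per_image)
    batch_starts.push_back(batch_starts.back() + static_cast<std::size_t>(n));
  return batch_starts;  // image i occupies output rows [starts[i], starts[i+1])
}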
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -409,9 +407,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { } void RetinanetDetectionOutput(const framework::ExecutionContext& ctx, - const std::vector& scores, - const std::vector& bboxes, - const std::vector& anchors, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, const phi::DenseTensor& im_info, std::vector>* nmsed_out, int* num_nmsed_out) const { @@ -425,11 +423,11 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { std::map>> preds; for (size_t l = 0; l < scores.size(); ++l) { // Fetch per level score - Tensor scores_per_level = scores[l]; + phi::DenseTensor scores_per_level = scores[l]; // Fetch per level bbox - Tensor bboxes_per_level = bboxes[l]; + phi::DenseTensor bboxes_per_level = bboxes[l]; // Fetch per level anchor - Tensor anchors_per_level = anchors[l]; + phi::DenseTensor anchors_per_level = anchors[l]; int64_t scores_num = scores_per_level.numel(); int64_t bboxes_num = bboxes_per_level.numel(); @@ -492,9 +490,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { auto* im_info = ctx.Input("ImInfo"); auto* outs = ctx.Output("Out"); - std::vector boxes_list(boxes.size()); - std::vector scores_list(scores.size()); - std::vector anchors_list(anchors.size()); + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); for (size_t j = 0; j < boxes_list.size(); ++j) { boxes_list[j] = *boxes[j]; scores_list[j] = *scores[j]; @@ -512,8 +510,8 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { std::vector batch_starts = {0}; for (int i = 0; i < batch_size; ++i) { int num_nmsed_out = 0; - std::vector box_per_batch_list(boxes_list.size()); - std::vector score_per_batch_list(scores_list.size()); + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); for (size_t j = 0; j < boxes_list.size(); ++j) { const auto& score_dims = scores_list[j].dims(); score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); @@ -521,7 +519,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); box_per_batch_list[j].Resize({score_dims[1], box_dim}); } - Tensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); std::vector> nmsed_out; RetinanetDetectionOutput(ctx, @@ -544,7 +542,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + phi::DenseTensor out = outs->Slice(s, e); MultiClassOutput(dev_ctx, all_nmsed_out[i], &out); } } @@ -563,7 +561,8 @@ class RetinanetDetectionOutputOpMaker void Make() override { AddInput("BBoxes", "(List) A list of tensors from multiple FPN levels. Each " - "element is a 3-D Tensor with shape [N, Mi, 4] represents the " + "element is a 3-D phi::DenseTensor with shape [N, Mi, 4] " + "represents the " "predicted locations of Mi bounding boxes, N is the batch size. " "Mi is the number of bounding boxes from i-th FPN level. 
Each " "bounding box has four coordinate values and the layout is " @@ -571,18 +570,20 @@ class RetinanetDetectionOutputOpMaker .AsDuplicable(); AddInput("Scores", "(List) A list of tensors from multiple FPN levels. Each " - "element is a 3-D Tensor with shape [N, Mi, C] represents the " + "element is a 3-D phi::DenseTensor with shape [N, Mi, C] " + "represents the " "predicted confidence from its FPN level. N is the batch size, " "C is the class number (excluding background), Mi is the number " "of bounding boxes from i-th FPN level. For each bounding box, " "there are total C scores.") .AsDuplicable(); - AddInput("Anchors", - "(List) A list of tensors from multiple FPN levels. Each" - "element is a 2-D Tensor with shape [Mi, 4] represents the " - "locations of Mi anchor boxes from i-th FPN level. Each " - "bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax].") + AddInput( + "Anchors", + "(List) A list of tensors from multiple FPN levels. Each" + "element is a 2-D phi::DenseTensor with shape [Mi, 4] represents the " + "locations of Mi anchor boxes from i-th FPN level. Each " + "bounding box has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax].") .AsDuplicable(); AddInput("ImInfo", "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [N, 3] " diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index ff4c1159119e3..9ba51850ebaaa 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template bool GT_E(T a, T b) { return (a > b) || fabs(a - b) < 1e-4; @@ -600,7 +598,7 @@ class ROIPerspectiveTransformOpMaker public: void Make() override { AddInput("X", - "(Tensor), " + "(phi::DenseTensor), " "the input of ROIPerspectiveTransformOp. " "The format of input tensor is NCHW. Where N is batch size, " "C is the number of input channels, " @@ -617,28 +615,28 @@ class ROIPerspectiveTransformOpMaker "(x4, y4) is the bottom left coordinates."); AddOutput( "Out", - "(Tensor), " + "(phi::DenseTensor), " "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape " "(num_rois, channels, transformed_h, transformed_w)."); AddOutput("Mask", - "(Tensor), " + "(phi::DenseTensor), " "The output mask of ROIPerspectiveTransformOp is a 4-D tensor " "with shape " "(num_rois, 1, transformed_h, transformed_w)."); AddOutput("TransformMatrix", - "(Tensor), " + "(phi::DenseTensor), " "The output transform matrix of ROIPerspectiveTransformOp is a " "1-D tensor with shape " "(num_rois, 9)."); AddOutput("Out2InIdx", - "(Tensor), " + "(phi::DenseTensor), " "An intermediate tensor used to map indexes of input feature map " "and indexes of output feature map." "The shape of the tensor is [out_size, 4] and out_size is the " "number of elements in output feature map.") .AsIntermediate(); AddOutput("Out2InWeights", - "(Tensor), " + "(phi::DenseTensor), " "An intermediate tensor used to record the weights of bilinear " "interpolatein for each element in output. 
The shape of the " "tensor is [out_size, 4] and out_size is the number of elements " diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index f73ddf9a09e6e..ba7fe51383822 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -113,11 +112,12 @@ void AppendRpns(phi::DenseTensor* out, } template -std::vector FilterStraddleAnchor(const phi::CPUContext& context, - const phi::DenseTensor* anchor, - const float rpn_straddle_thresh, - T im_height, - T im_width) { +std::vector FilterStraddleAnchor( + const phi::CPUContext& context, + const phi::DenseTensor* anchor, + const float rpn_straddle_thresh, + T im_height, + T im_width) { std::vector inds_inside; int anchor_num = anchor->dims()[0]; auto* anchor_data = anchor->data(); @@ -138,25 +138,25 @@ std::vector FilterStraddleAnchor(const phi::CPUContext& context, } } int inside_num = inds_inside.size(); - Tensor inds_inside_t; + phi::DenseTensor inds_inside_t; int* inds_inside_data = inds_inside_t.mutable_data({inside_num}, context.GetPlace()); std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); - Tensor inside_anchor_t; + phi::DenseTensor inside_anchor_t; T* inside_anchor_data = inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); Gather( anchor->data(), 4, inds_inside_data, inside_num, inside_anchor_data); - std::vector res; + std::vector res; res.emplace_back(inds_inside_t); res.emplace_back(inside_anchor_t); return res; } template -Tensor FilterCrowdGt(const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* is_crowd) { +phi::DenseTensor FilterCrowdGt(const phi::CPUContext& context, + phi::DenseTensor* gt_boxes, + phi::DenseTensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -166,7 +166,7 @@ Tensor FilterCrowdGt(const phi::CPUContext& context, } } int ncrowd_num = not_crowd_inds.size(); - Tensor ncrowd_gt_boxes; + phi::DenseTensor ncrowd_gt_boxes; T* ncrowd_gt_boxes_data = ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); Gather(gt_boxes->data(), @@ -300,7 +300,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } template -std::vector SampleRpnFgBgGt( +std::vector SampleRpnFgBgGt( const phi::CPUContext& ctx, const phi::DenseTensor& anchor_by_gt_overlap, const int rpn_batch_size_per_im, @@ -322,7 +322,7 @@ std::vector SampleRpnFgBgGt( // Calculate the max IoU between anchors and gt boxes // Map from anchor to gt box that has highest overlap auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; anchor_to_gt_max.mutable_data({anchor_num}, place); int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); gt_to_anchor_max.mutable_data({gt_num}, place); @@ -365,7 +365,8 @@ std::vector SampleRpnFgBgGt( for (int i = 0; i < fg_fake_num; ++i) { gt_inds.emplace_back(argmax[fg_fake[i]]); } - Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; + phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, + bbox_inside_weight_t; int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); int* score_index_data = score_index_t.mutable_data({fg_num + 
bg_num}, place); @@ -381,7 +382,7 @@ std::vector SampleRpnFgBgGt( std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(), bbox_inside_weight_data); - std::vector loc_score_tgtlbl_gt; + std::vector loc_score_tgtlbl_gt; loc_score_tgtlbl_gt.emplace_back(loc_index_t); loc_score_tgtlbl_gt.emplace_back(score_index_t); loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); @@ -455,30 +456,30 @@ class RpnTargetAssignKernel : public framework::OpKernel { auto gt_boxes_lod = gt_boxes->lod().back(); auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor gt_boxes_slice = + phi::DenseTensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor is_crowd_slice = + phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); auto* im_info_data = im_info_slice.data(); auto im_height = im_info_data[0]; auto im_width = im_info_data[1]; auto im_scale = im_info_data[2]; // Filter straddle anchor - std::vector filter_output = FilterStraddleAnchor( + std::vector filter_output = FilterStraddleAnchor( dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); - Tensor inds_inside = filter_output[0]; - Tensor inside_anchor = filter_output[1]; + phi::DenseTensor inds_inside = filter_output[0]; + phi::DenseTensor inside_anchor = filter_output[1]; // Filter crowd gt - Tensor ncrowd_gt_boxes = + phi::DenseTensor ncrowd_gt_boxes = FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); auto ncrowd_gt_boxes_et = framework::EigenTensor::From(ncrowd_gt_boxes); ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - Tensor anchor_by_gt_overlap; + phi::DenseTensor anchor_by_gt_overlap; anchor_by_gt_overlap.mutable_data( {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); @@ -492,16 +493,16 @@ class RpnTargetAssignKernel : public framework::OpKernel { engine, use_random); - Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; - Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; + phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; + phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; int loc_num = sampled_loc_index.dims()[0]; int score_num = sampled_score_index.dims()[0]; // unmap to all anchor - Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; sampled_loc_index_unmap.mutable_data({loc_num}, place); sampled_score_index_unmap.mutable_data({score_num}, place); Gather(inds_inside.data(), @@ -516,7 +517,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { sampled_score_index_unmap.data()); // get target bbox deltas - Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; auto* sampled_anchor_data = sampled_anchor.mutable_data({loc_num, 4}, place); auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); @@ -859,10 +860,11 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel { }; template -std::vector 
FilterCrowdGtBoxLabel(const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* gt_labels, - phi::DenseTensor* is_crowd) { +std::vector FilterCrowdGtBoxLabel( + const phi::CPUContext& context, + phi::DenseTensor* gt_boxes, + phi::DenseTensor* gt_labels, + phi::DenseTensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -872,7 +874,7 @@ std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, } } int ncrowd_num = not_crowd_inds.size(); - Tensor ncrowd_gt_boxes, ncrowd_gt_labels; + phi::DenseTensor ncrowd_gt_boxes, ncrowd_gt_labels; T* ncrowd_gt_boxes_data = ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); int* ncrowd_gt_labels_data = @@ -887,19 +889,20 @@ std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, not_crowd_inds.data(), ncrowd_num, ncrowd_gt_labels_data); - std::vector res; + std::vector res; res.emplace_back(ncrowd_gt_boxes); res.emplace_back(ncrowd_gt_labels); return res; } template -std::vector GetAllFgBgGt(const phi::CPUContext& ctx, - const phi::DenseTensor& anchor_by_gt_overlap, - const phi::DenseTensor& ncrowd_gt_labels, - const float positive_overlap, - const float negative_overlap, - std::minstd_rand engine) { +std::vector GetAllFgBgGt( + const phi::CPUContext& ctx, + const phi::DenseTensor& anchor_by_gt_overlap, + const phi::DenseTensor& ncrowd_gt_labels, + const float positive_overlap, + const float negative_overlap, + std::minstd_rand engine) { auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); int anchor_num = anchor_by_gt_overlap.dims()[0]; int gt_num = anchor_by_gt_overlap.dims()[1]; @@ -913,7 +916,7 @@ std::vector GetAllFgBgGt(const phi::CPUContext& ctx, // Calculate the max IoU between anchors and gt boxes // Map from anchor to gt box that has highest overlap auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; anchor_to_gt_max.mutable_data({anchor_num}, place); int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); gt_to_anchor_max.mutable_data({gt_num}, place); @@ -961,8 +964,9 @@ std::vector GetAllFgBgGt(const phi::CPUContext& ctx, gt_inds.emplace_back(argmax[fg_fake[i]]); } - Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; - Tensor fg_num_t; + phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, + bbox_inside_weight_t; + phi::DenseTensor fg_num_t; int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); int* score_index_data = score_index_t.mutable_data({fg_num + bg_num}, place); @@ -980,7 +984,7 @@ std::vector GetAllFgBgGt(const phi::CPUContext& ctx, bbox_inside_weight.end(), bbox_inside_weight_data); fg_num_data[0] = fg_fake.size() + 1; - std::vector loc_score_tgtlbl_gt; + std::vector loc_score_tgtlbl_gt; loc_score_tgtlbl_gt.emplace_back(loc_index_t); loc_score_tgtlbl_gt.emplace_back(score_index_t); loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); @@ -1065,35 +1069,35 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { auto gt_labels_lod = gt_labels->lod().back(); auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor gt_boxes_slice = + phi::DenseTensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor gt_labels_slice = + phi::DenseTensor gt_labels_slice = gt_labels->Slice(gt_labels_lod[i], gt_labels_lod[i + 1]); - Tensor is_crowd_slice = + 
phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); auto* im_info_data = im_info_slice.data(); auto im_height = im_info_data[0]; auto im_width = im_info_data[1]; auto im_scale = im_info_data[2]; // Filter straddle anchor - std::vector filter_output = + std::vector filter_output = FilterStraddleAnchor(dev_ctx, anchor, -1, im_height, im_width); - Tensor inds_inside = filter_output[0]; - Tensor inside_anchor = filter_output[1]; + phi::DenseTensor inds_inside = filter_output[0]; + phi::DenseTensor inside_anchor = filter_output[1]; // Filter crowd gt - std::vector ncrowd_output = FilterCrowdGtBoxLabel( + std::vector ncrowd_output = FilterCrowdGtBoxLabel( dev_ctx, >_boxes_slice, >_labels_slice, &is_crowd_slice); - Tensor ncrowd_gt_boxes = ncrowd_output[0]; - Tensor ncrowd_gt_labels = ncrowd_output[1]; + phi::DenseTensor ncrowd_gt_boxes = ncrowd_output[0]; + phi::DenseTensor ncrowd_gt_labels = ncrowd_output[1]; auto ncrowd_gt_boxes_et = framework::EigenTensor::From(ncrowd_gt_boxes); ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - Tensor anchor_by_gt_overlap; + phi::DenseTensor anchor_by_gt_overlap; anchor_by_gt_overlap.mutable_data( {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); @@ -1105,17 +1109,17 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { negative_overlap, engine); - Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; - Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - Tensor sampled_fg_num = loc_score_tgtlbl_gt[5]; + phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; + phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; + phi::DenseTensor sampled_fg_num = loc_score_tgtlbl_gt[5]; int loc_num = sampled_loc_index.dims()[0]; int score_num = sampled_score_index.dims()[0]; // unmap to all anchor - Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; sampled_loc_index_unmap.mutable_data({loc_num}, place); sampled_score_index_unmap.mutable_data({score_num}, place); Gather(inds_inside.data(), @@ -1130,7 +1134,7 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { sampled_score_index_unmap.data()); // get target bbox deltas - Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; auto* sampled_anchor_data = sampled_anchor.mutable_data({loc_num, 4}, place); auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 56d28c20dc8e7..6ff2e9c65d856 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -19,8 +19,6 @@ limitations under the License. 
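// FilterStraddleAnchor, patched above in rpn_target_assign_op.cc, keeps
// the anchors lying inside the image up to a straddle threshold; the
// Retinanet path disables the check by passing -1. A container-level
// sketch of the predicate as read from the surrounding code (names and
// the exact comparison form are illustrative assumptions):
#include <array>
#include <vector>

std::vector<int> InsideAnchorIndices(
    const std::vector<std::array<float, 4>>& anchors,  // [xmin,ymin,xmax,ymax]
    float straddle_thresh, float im_height, float im_width) {
  std::vector<int> inds_inside;
  for (int i = 0; i < static_cast<int>(anchors.size()); ++i) {
    const auto& a = anchors[i];
    bool inside =
        straddle_thresh < 0.f ||  // negative threshold keeps every anchor
        (a[0] >= -straddle_thresh && a[1] >= -straddle_thresh &&
         a[2] < im_width + straddle_thresh &&
         a[3] < im_height + straddle_thresh);
    if (inside) inds_inside.push_back(i);
  }
  return inds_inside;
}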
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -123,10 +121,10 @@ template class GPUSigmoidFocalLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - Tensor *Out = context.Output("Out"); + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + phi::DenseTensor *Out = context.Output("Out"); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); auto x_dims = X->dims(); @@ -154,12 +152,13 @@ template class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - const Tensor *dOut = + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + const phi::DenseTensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); + phi::DenseTensor *dX = + context.Output(framework::GradVarName("X")); auto dx_data = dX->mutable_data(context.GetPlace()); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h index b7c77a5e28222..0632e5ab8fab0 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h @@ -22,16 +22,14 @@ limitations under the License. 
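// The GPU focal-loss kernels above are untouched beyond the type rename.
// For reviewers, the per-logit math is the standard sigmoid focal loss,
// normalized by the foreground count FgNum; the clamping details below are
// an assumption of this sketch, not taken from the kernel body:
#include <algorithm>
#include <cmath>

float SigmoidFocalLoss(float logit, bool is_positive, float gamma,
                       float alpha, int fg_num) {
  float p = 1.f / (1.f + std::exp(-logit));  // sigmoid(x)
  float loss =
      is_positive
          ? -alpha * std::pow(1.f - p, gamma) * std::log(std::max(p, 1e-12f))
          : -(1.f - alpha) * std::pow(p, gamma) *
                std::log(std::max(1.f - p, 1e-12f));
  return loss / static_cast<float>(std::max(fg_num, 1));  // FgNum normalizer
}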
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SigmoidFocalLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - Tensor *Out = context.Output("Out"); + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + phi::DenseTensor *Out = context.Output("Out"); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); auto out_data = Out->mutable_data(context.GetPlace()); @@ -79,12 +77,13 @@ template class SigmoidFocalLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - const Tensor *dOut = + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + const phi::DenseTensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); + phi::DenseTensor *dX = + context.Output(framework::GradVarName("X")); auto dx_data = dX->mutable_data(context.GetPlace()); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); diff --git a/paddle/fluid/operators/detection/yolo_box_op_mlu.cc b/paddle/fluid/operators/detection/yolo_box_op_mlu.cc index 739c05805d68a..aac3369381e95 100644 --- a/paddle/fluid/operators/detection/yolo_box_op_mlu.cc +++ b/paddle/fluid/operators/detection/yolo_box_op_mlu.cc @@ -77,7 +77,7 @@ class YoloBoxMLUKernel : public framework::OpKernel { MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType()); MLUOpTensorDesc img_size_desc( *img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType()); - Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32)); anchors_temp.Resize({size}); paddle::framework::TensorFromVector( anchors, ctx.device_context(), &anchors_temp); diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 5d3cccb3a6617..ada4d18eb00c1 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class DetectionMAPOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index d3f55edd8840f..8e362957e46e8 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -21,8 +21,6 @@ limitations under the License. 
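// dgc_clip_by_norm_op.h above likewise only loses the alias. The
// clip-by-norm rule it builds on rescales the whole tensor once its L2
// norm exceeds max_norm, i.e. out = x * max_norm / max(||x||_2, max_norm);
// the DGC variant adds rampup gating that this hunk does not show. A
// sketch of the base rule only:
#include <cmath>
#include <vector>

void ClipByNorm(std::vector<float>* x, float max_norm) {
  float sq = 0.f;
  for (float v : *x) sq += v * v;
  const float norm = std::sqrt(sq);
  if (norm > max_norm)
    for (float& v : *x) v *= max_norm / norm;  // leaves x unchanged otherwise
}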
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DGCClipByNormKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index 7cf98738d073f..0d0686026da4b 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DropoutMLUKernel : public framework::OpKernel { public: @@ -106,8 +104,8 @@ class DropoutMLUKernel : public framework::OpKernel { } // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob). - Tensor scale_tensor(x->dtype()); - Tensor bias_tensor(x->dtype()); + phi::DenseTensor scale_tensor(x->dtype()); + phi::DenseTensor bias_tensor(x->dtype()); scale_tensor.mutable_data({1}, ctx.GetPlace()); bias_tensor.mutable_data({1}, ctx.GetPlace()); MLUCnnlTensorDesc scale_desc(scale_tensor); @@ -157,7 +155,7 @@ class DropoutGradMLUKernel : public framework::OpKernel { } // cast mask from uint8 to float32/float16 - Tensor cast_mask(grad_x->dtype()); + phi::DenseTensor cast_mask(grad_x->dtype()); cast_mask.Resize(mask->dims()); cast_mask.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index a63b6e5e479af..72453bedee399 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DropoutNPUKernel : public framework::OpKernel { public: @@ -56,8 +54,8 @@ class DropoutNPUKernel : public framework::OpKernel { // only achieve the default `upscale_in_train` method if (!is_test) { - Tensor tmp_x(x->dtype()); - Tensor tmp_out(out->dtype()); + phi::DenseTensor tmp_x(x->dtype()); + phi::DenseTensor tmp_out(out->dtype()); tmp_x.ShareDataWith(*x); tmp_out.ShareDataWith(*out); if (x->dims().size() == 1) { @@ -80,7 +78,7 @@ class DropoutNPUKernel : public framework::OpKernel { seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; } - Tensor keep_prob_tensor(x->dtype()); + phi::DenseTensor keep_prob_tensor(x->dtype()); keep_prob_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&keep_prob_tensor, static_cast(keep_prob)); @@ -89,14 +87,14 @@ class DropoutNPUKernel : public framework::OpKernel { // mask used in `DropOutGenMask` NPU OP is different from // the output `Mask`. - Tensor npu_mask(experimental::DataType::UINT8); + phi::DenseTensor npu_mask(experimental::DataType::UINT8); uint32_t length = (x->numel() + 128 - 1) / 128 * 128; npu_mask.Resize(phi::make_ddim({length / 8})); npu_mask.mutable_data(ctx.GetPlace()); // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU // OP must be a scalar with shape[0]. At present, the shape - // of the `prob` Tensor of this OP is forced to be set to 0 + // of the `prob` phi::DenseTensor of this OP is forced to be set to 0 // in `npu_op_runner.cc`, which needs to be optimized later. 
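// The two dropout kernels here differ only in scaling convention:
// "downgrade_in_infer" multiplies by (1 - dropout_prob) at inference,
// while "upscale_in_train" (the NPU default above) divides kept
// activations by keep_prob during training and is an identity at
// inference. A host-side sketch of both conventions, assuming p < 1
// (the real kernels generate the mask on-device):
#include <random>
#include <vector>

void DropoutForward(std::vector<float>* x, float p, bool is_test,
                    bool upscale_in_train, unsigned seed) {
  if (is_test) {
    if (!upscale_in_train)                       // downgrade_in_infer
      for (float& v : *x) v *= (1.f - p);
    return;                                      // upscale_in_train: identity
  }
  std::mt19937 gen(seed);
  std::bernoulli_distribution keep(1.0 - p);
  for (float& v : *x)
    v = keep(gen) ? (upscale_in_train ? v / (1.f - p) : v) : 0.f;
}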
NpuOpRunner runner_gen_mask; runner_gen_mask.SetType("DropOutGenMask") @@ -116,7 +114,7 @@ class DropoutNPUKernel : public framework::OpKernel { runner_dropout.Run(stream); // cast `out` from float/float16 to bool - Tensor cast_mask(experimental::DataType::BOOL); + phi::DenseTensor cast_mask(experimental::DataType::BOOL); cast_mask.Resize(mask->dims()); cast_mask.mutable_data(ctx.GetPlace()); auto dst_dtype_bool = @@ -176,7 +174,7 @@ class DropoutGradNPUKernel : public framework::OpKernel { } // cast mask from uint8 to float32/float16 - Tensor cast_mask(dx->dtype()); + phi::DenseTensor cast_mask(dx->dtype()); cast_mask.Resize(mask->dims()); cast_mask.mutable_data(ctx.GetPlace()); auto dst_dtype = diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc index 456a11f95aaca..7c6cd94782a9c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ElementwiseAddMLUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 17a1736c0871b..7b6683255ea93 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ElementwiseAddNPUKernel : public framework::OpKernel { @@ -53,7 +52,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); runner.Run(dev_ctx.stream()); } else { - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); const auto& runner = @@ -96,7 +95,7 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } } if (!reduce_axes.empty()) { - Tensor tmp; + phi::DenseTensor tmp; tmp.ShareDataWith(*dx); tmp.Resize(phi::make_ddim(dst_dims_vec)); const auto& runner = @@ -128,7 +127,7 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } } if (!reduce_axes.empty()) { - Tensor tmp; + phi::DenseTensor tmp; tmp.ShareDataWith(*dy); tmp.Resize(phi::make_ddim(dst_dims_vec)); const auto& runner = diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 236b40c122204..8c7aa350b4372 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -24,7 +24,6 @@ namespace operators { class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext* ctx) const override { auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc index 27f7281b9fb1e..d3e955cd2fe32 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc @@ -21,8 +21,6 @@ limitations under the 
License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseDivMLUKernel : public framework::OpKernel { public: @@ -66,7 +64,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel { CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); // compute dout/y == 1/y * dout - Tensor dout_div_y(dout->dtype()); + phi::DenseTensor dout_div_y(dout->dtype()); dout_div_y.Resize(dout->dims()); dout_div_y.mutable_data(ctx.GetPlace()); MLUBinary
(ctx, @@ -110,7 +108,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel { if (dy) { // compute dy = -out * (dout/y) = -out/y * dout - Tensor neg_out(out->type()); + phi::DenseTensor neg_out(out->type()); neg_out.mutable_data(out->dims(), ctx.GetPlace()); MLUCnnlTensorDesc out_desc(*out); @@ -121,7 +119,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel { out_desc.get(), GetBasePtr(&neg_out)); - Tensor dy_temp(y->dtype()); + phi::DenseTensor dy_temp(y->dtype()); dy_temp.Resize(dout->dims()); dy_temp.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc index 74a2a5b6ca6eb..6cc37517d4fbe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseDivNPUKernel : public framework::OpKernel { public: @@ -66,38 +64,38 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { if (dx) { dx->mutable_data(place); - Tensor tensor_one(y->type()); + phi::DenseTensor tensor_one(y->type()); tensor_one.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_one, static_cast(1.0)); // Use `Div` CANN OP to achieve `1/y` instead of `Power` CANN OP. // Because `Power` will cause precision overflow, that is, `float_status` // will be set to 1. - Tensor y_div(y->type()); + phi::DenseTensor y_div(y->type()); y_div.mutable_data(y->dims(), place); const auto& runner_one_div_y = NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {}); runner_one_div_y.Run(stream); - Tensor tensor_zeros(x->type()); + phi::DenseTensor tensor_zeros(x->type()); tensor_zeros.mutable_data(x->dims(), place); const auto& runner_tensor_zeros = NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); runner_tensor_zeros.Run(stream); - Tensor x_zero(experimental::DataType::BOOL); + phi::DenseTensor x_zero(experimental::DataType::BOOL); x_zero.mutable_data(x->dims(), place); const auto& runner_x_zero = NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); runner_x_zero.Run(stream); - Tensor x_nozero(experimental::DataType::BOOL); + phi::DenseTensor x_nozero(experimental::DataType::BOOL); x_nozero.mutable_data(x->dims(), place); const auto& runner_x_nonzero = NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); runner_x_nonzero.Run(stream); - Tensor x_nozero_f(x->type()); + phi::DenseTensor x_nozero_f(x->type()); x_nozero_f.mutable_data(x->dims(), place); const auto& runner_x_nonzero_f = NpuOpRunner("Cast", @@ -106,7 +104,7 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { {{"dst_type", static_cast(0)}}); runner_x_nonzero_f.Run(stream); - Tensor x_grad_w(x->type()); + phi::DenseTensor x_grad_w(x->type()); x_grad_w.mutable_data(x->dims(), place); const auto& runner_x_grad_w = NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {}); @@ -120,19 +118,19 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { if (dy) { dy->mutable_data(place); - Tensor neg_out(out->type()); + phi::DenseTensor neg_out(out->type()); neg_out.mutable_data(out->dims(), place); const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {}); runner_neg_out.Run(stream); - Tensor tmp_mul(out->type()); + phi::DenseTensor tmp_mul(out->type()); tmp_mul.mutable_data(out->dims(), place); const auto& runner_mul = NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, 
{}); runner_mul.Run(stream); if (dy->dims() != dout->dims()) { - Tensor reduced_tmp_mul(y->type()); + phi::DenseTensor reduced_tmp_mul(y->type()); reduced_tmp_mul.mutable_data(y->dims(), place); std::vector axes; diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc index 396f1b6f6223a..5f1b84112b2f9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseFloorDivNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc index fe91c28cd1f05..14bfbfb693b06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseMaxNPUKernel : public framework::OpKernel { public: @@ -51,7 +49,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); runner.Run(stream); } else { - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); const auto& runner = @@ -85,7 +83,7 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims = y->dims(); axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); @@ -99,9 +97,9 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { if (dx && dy) { dx->mutable_data(ctx.GetPlace()); dy->mutable_data(ctx.GetPlace()); - Tensor tmp_dx; + phi::DenseTensor tmp_dx; tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); - Tensor tmp_dy; + phi::DenseTensor tmp_dy; tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); const auto& runner = NpuOpRunner("MaximumGrad", @@ -153,12 +151,12 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { } } else if (dx) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx; + phi::DenseTensor tmp_dx; tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); const auto& runner = NpuOpRunner("MaximumGrad", @@ -190,12 +188,12 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { } } else if (dy) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dy->mutable_data(ctx.GetPlace()); - Tensor tmp_dy; + phi::DenseTensor tmp_dy; tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); const auto& runner = NpuOpRunner("MaximumGrad", diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc index 861ed2046c077..43b25b5127c8b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseMinMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index 8014f82ca5742..86c37e0c89020 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. 
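// A pattern repeated across the NPU grad kernels above: when an input was
// broadcast to form the output, its gradient is the output gradient
// reduce-summed over the broadcast axes (the ReduceSumD calls on tmp_dx /
// tmp_dy). A 2-D sketch of that rule for a row vector y of shape [m]
// broadcast against dout of shape [n, m] (illustrative only):
#include <vector>

std::vector<float> ReduceGradToInputShape(const std::vector<float>& dout,
                                          int n, int m) {
  std::vector<float> dy(m, 0.f);
  for (int i = 0; i < n; ++i)          // sum over the broadcast axis
    for (int j = 0; j < m; ++j) dy[j] += dout[i * m + j];
  return dy;
}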
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseMinNPUKernel : public framework::OpKernel { public: @@ -48,7 +46,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel { } else { direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); } - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; if (direct_compute) { transformed_x.ShareDataWith(*x); transformed_y.ShareDataWith(*y); @@ -82,7 +80,7 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { if (dx && dy) { // dx dx->mutable_data(ctx.GetPlace()); - Tensor tmp_x; + phi::DenseTensor tmp_x; tmp_x.ShareDataWith(*dx); if (dx->dims() != dout->dims()) { std::vector dst_dims_vec_x; @@ -105,7 +103,7 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { } // dy dy->mutable_data(ctx.GetPlace()); - Tensor tmp_y; + phi::DenseTensor tmp_y; tmp_y.ShareDataWith(*dy); if (dy->dims() != dout->dims()) { std::vector dst_dims_vec_y; @@ -134,12 +132,12 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { runner.Run(stream); } else if (dx) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(y->dims(), ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); // dx dx->mutable_data(ctx.GetPlace()); - Tensor tmp_x; + phi::DenseTensor tmp_x; tmp_x.ShareDataWith(*dx); if (dx->dims() != dout->dims()) { std::vector dst_dims_vec_x; @@ -168,13 +166,13 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { runner.Run(stream); } else if (dy) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(x->dims(), ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); // dy dy->mutable_data(ctx.GetPlace()); - Tensor tmp_y; + phi::DenseTensor tmp_y; tmp_y.ShareDataWith(*dy); if (dy->dims() != dout->dims()) { std::vector dst_dims_vec_y; diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 57f4b0c057686..9a33d5a26ad54 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -309,7 +309,7 @@ void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { // mask = Logic(x, y) only support min & max cnnlLogicOp_t logic = Functor == MAXIMUM_GRAD ? 
CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE; - Tensor mask(x->dtype()); + phi::DenseTensor mask(x->dtype()); mask.Resize(phi::make_ddim(out_dims_array)); mask.mutable_data(ctx.GetPlace()); @@ -327,7 +327,7 @@ void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { GetBasePtr(&mask)); // dx = Mul(dz, mask) - Tensor dx_temp(x->dtype()); + phi::DenseTensor dx_temp(x->dtype()); dx_temp.Resize(dout->dims()); dx_temp.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc dout_desc(*dout); @@ -344,7 +344,7 @@ void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { data_type); // dy = Sub(dz, dx) - Tensor dy_temp(y->dtype()); + phi::DenseTensor dy_temp(y->dtype()); dy_temp.Resize(dout->dims()); dy_temp.mutable_data(ctx.GetPlace()); MLUCnnlOpTensorDesc sub_op_desc( diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc index bdeef48389b6c..f73fbba0fb496 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseModNPUKernel : public framework::OpKernel { public: @@ -43,7 +41,7 @@ class ElementwiseModNPUKernel : public framework::OpKernel { direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); } - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; if (direct_compute) { transformed_x.ShareDataWith(*x); transformed_y.ShareDataWith(*y); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c7b872af75a44..5aa1b7ed4f1dd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -25,7 +25,6 @@ namespace operators { class ElementwiseMulOp : public ElementwiseOp { public: - using Tensor = phi::DenseTensor; using ElementwiseOp::ElementwiseOp; framework::OpKernelType GetExpectedKernelType( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc index fe2848621c76f..c5f8a0ad711a6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using MLUDeviceContext = platform::MLUDeviceContext; template @@ -73,7 +72,7 @@ class ElementwiseMulGradMLUKernel : public framework::OpKernel { GetBasePtr(dx), ToCnnlDataType()); } else { - Tensor dx_temp(x->dtype()); + phi::DenseTensor dx_temp(x->dtype()); dx_temp.Resize(dout->dims()); dx_temp.mutable_data(ctx.GetPlace()); MLUCnnl::OpTensor(ctx, @@ -121,7 +120,7 @@ class ElementwiseMulGradMLUKernel : public framework::OpKernel { GetBasePtr(dy), ToCnnlDataType()); } else { - Tensor dy_temp(y->dtype()); + phi::DenseTensor dy_temp(y->dtype()); dy_temp.Resize(dout->dims()); dy_temp.mutable_data(ctx.GetPlace()); MLUCnnl::OpTensor(ctx, diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 4fc3be1b29cc7..d9bf2adeee72c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -78,7 +77,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); runner.Run(stream); } else { - Tensor trans_x, trans_y; + phi::DenseTensor trans_x, trans_y; NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); runner.Run(stream); @@ -101,7 +100,7 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); auto stream = ctx.template device_context().stream(); - Tensor trans_x, trans_y; + phi::DenseTensor trans_x, trans_y; NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); if (dx) { @@ -110,7 +109,7 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); runner_dx.Run(stream); } else { - Tensor dx_temp(x->type()); + phi::DenseTensor dx_temp(x->type()); dx_temp.Resize(trans_x.dims()); dx_temp.mutable_data(ctx.GetPlace()); const auto& runner_dx = @@ -126,7 +125,7 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); runner_dy.Run(stream); } else { - Tensor dy_temp(y->type()); + phi::DenseTensor dy_temp(y->type()); dy_temp.Resize(trans_y.dims()); dy_temp.mutable_data(ctx.GetPlace()); const auto& runner_dy = diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h index b7e85c45f4c7c..d8ee104c66b99 100644 --- a/paddle/fluid/operators/elementwise/elementwise_npu.h +++ b/paddle/fluid/operators/elementwise/elementwise_npu.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, @@ -32,12 +31,12 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, // 1. 
expand the axis with dim 1 auto src_dims = src->dims(); - Tensor tmp_src; + phi::DenseTensor tmp_src; tmp_src.ShareDataWith(*src); tmp_src.Resize(src_dims); for (int i = 0; i < src_dims.size(); ++i) { if (src_dims[i] == 1 && dst_dims[i + axis] > 1) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto tmp_tensor_dims = tmp_src.dims(); tmp_tensor_dims[i] = dst_dims[i + axis]; tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); @@ -56,7 +55,7 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, // 2.expand the ahead axis auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis)); if (prev > 1) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto tmp_tensor_dims = phi::slice_ddim(dst_dims, 0, axis + src_dims.size()); tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); const auto& runner = @@ -79,7 +78,7 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, src_dims_vec.push_back(1); tmp_src.Resize(phi::make_ddim(src_dims_vec)); - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.mutable_data(dst_dims, dev_ctx.GetPlace()); const auto& runner = NpuOpRunner("TileWithAxis", diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6bc9c345fcd4e..1ed8f4eb012a2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -36,8 +36,6 @@ class ElementwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; - void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ElementwiseOp"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ElementwiseOp"); @@ -282,7 +280,6 @@ For example: class ElementwiseOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { auto out_grad_name = framework::GradVarName("Out"); @@ -330,7 +327,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { auto x_grad_name = framework::GradVarName("X"); @@ -376,7 +372,6 @@ class ElementwiseOpDoubleGradWithoutDXDY : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { if (ctx->HasOutput("DDOut")) { @@ -427,7 +422,6 @@ class ElementwiseOpDoubleGradWithoutDXDY class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { if (ctx->HasOutput("D_DDX")) { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc index 6942377049b47..77d1160e4ce16 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwisePowMLUKernel : public framework::OpKernel { public: @@ -64,11 +62,11 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { auto dout_dims = dout->dims(); if (dx) { // dx = dout * y * pow(x, y - 1); - Tensor one_dx(y->type()); + phi::DenseTensor one_dx(y->type()); one_dx.mutable_data(phi::make_ddim(y_dims_array), place); FillMLUTensorWithHostValue(ctx, static_cast(1), &one_dx); - Tensor sub_dx(y->type()); + phi::DenseTensor sub_dx(y->type()); sub_dx.mutable_data(phi::make_ddim(y_dims_array), place); MLUCnnlOpTensorDesc op_tensor_desc( CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN); @@ -82,7 +80,7 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { GetBasePtr(&sub_dx), data_type); - Tensor tmp_dx(x->type()); + phi::DenseTensor tmp_dx(x->type()); tmp_dx.mutable_data(phi::make_ddim(out_dims_array), place); MLUCnnl::Pow(ctx, CNNL_COMPUTATION_HIGH_PRECISION, @@ -134,7 +132,7 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { } if (dy) { // dy = dout * log(x) * pow(x, y) - Tensor tmp_dy(y->type()); + phi::DenseTensor tmp_dy(y->type()); tmp_dy.mutable_data(phi::make_ddim(out_dims_array), place); MLUCnnl::Pow(ctx, CNNL_COMPUTATION_HIGH_PRECISION, @@ -145,7 +143,7 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { out_desc.get(), GetBasePtr(&tmp_dy)); - Tensor log_x(x->type()); + phi::DenseTensor log_x(x->type()); log_x.mutable_data(x->dims(), place); MLUCnnl::Log(ctx, CNNL_COMPUTATION_HIGH_PRECISION, diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc index 18853222ba6b7..b0b1b37c4f78d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwisePowNPUKernel : public framework::OpKernel { public: @@ -56,7 +54,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); runner.Run(stream); } else { - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); const auto& runner = @@ -84,7 +82,7 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel { auto y_dims = y->dims(); axis = (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); @@ -93,34 +91,34 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel { // Reshape info vector. 
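// --- Annotation (editorial sketch, not part of the original patch): the
// hunks below implement the calculus for z = pow(x, y):
//   dz/dx = y * x^(y - 1)   =>  dx = dout * y * pow(x, y - 1)
//   dz/dy = x^y * log(x)    =>  dy = dout * log(x) * pow(x, y)
// NpuElementWiseOpBroadcast has already expanded x and y to dout's shape,
// so each gradient is first materialized at dout_dims (tmp_dx / tmp_dy)
// and must then be summed back over the broadcast axes collected in
// reduce_axes; that reduction sits in code elided between the hunks shown
// here. Note too that log(x) is built below as Log1p(x - 1), i.e.
// log(1 + (x - 1)) == log(x), composed from the OnesLike and Sub runners.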
std::vector reduce_axes; if (dx) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, place); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dx->mutable_data(place); - Tensor tmp_dx; + phi::DenseTensor tmp_dx; tmp_dx.mutable_data(dout_dims, place); // dx = dout * y * pow(x, y - 1); - Tensor PowGrad_dx_temp1(dout->type()); + phi::DenseTensor PowGrad_dx_temp1(dout->type()); PowGrad_dx_temp1.mutable_data(dout->dims(), place); const auto& runner_PowGrad_dx_temp1 = NpuOpRunner("Mul", {*dout, transformed_y}, {PowGrad_dx_temp1}, {}); runner_PowGrad_dx_temp1.Run(stream); - Tensor one_dx(transformed_y.type()); + phi::DenseTensor one_dx(transformed_y.type()); one_dx.mutable_data(transformed_y.dims(), place); const auto& runner_one_dx = NpuOpRunner("OnesLike", {transformed_y}, {one_dx}, {}); runner_one_dx.Run(stream); - Tensor sub_dx(transformed_y.type()); + phi::DenseTensor sub_dx(transformed_y.type()); sub_dx.mutable_data(transformed_y.dims(), place); const auto& runner_sub_dx = NpuOpRunner("Sub", {transformed_y, one_dx}, {sub_dx}, {}); runner_sub_dx.Run(stream); - Tensor PowGrad_dx_temp2(transformed_x.type()); + phi::DenseTensor PowGrad_dx_temp2(transformed_x.type()); PowGrad_dx_temp2.mutable_data(transformed_x.dims(), place); const auto& runner_PowGrad_dx_temp2 = NpuOpRunner("Pow", {transformed_x, sub_dx}, {PowGrad_dx_temp2}, {}); @@ -153,39 +151,39 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel { } } if (dy) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, place); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dy->mutable_data(place); - Tensor tmp_dy; + phi::DenseTensor tmp_dy; tmp_dy.mutable_data(dout_dims, place); // dy = dout * log(x) * pow(x, y) - Tensor PowGrad_dy_temp1(transformed_x.type()); + phi::DenseTensor PowGrad_dy_temp1(transformed_x.type()); PowGrad_dy_temp1.mutable_data(transformed_x.dims(), place); const auto& runner_PowGrad_dy_temp1 = NpuOpRunner( "Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {}); runner_PowGrad_dy_temp1.Run(stream); - Tensor one_dy(transformed_x.type()); + phi::DenseTensor one_dy(transformed_x.type()); one_dy.mutable_data(transformed_x.dims(), place); const auto& runner_one_dy = NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {}); runner_one_dy.Run(stream); - Tensor sub_dy(transformed_x.type()); + phi::DenseTensor sub_dy(transformed_x.type()); sub_dy.mutable_data(transformed_x.dims(), place); const auto& runner_sub_dy = NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {}); runner_sub_dy.Run(stream); - Tensor log_dy(transformed_x.type()); + phi::DenseTensor log_dy(transformed_x.type()); log_dy.mutable_data(transformed_x.dims(), place); const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {}); runner_log_dy.Run(stream); - Tensor PowGrad_dy_temp2(transformed_x.type()); + phi::DenseTensor PowGrad_dy_temp2(transformed_x.type()); PowGrad_dy_temp2.mutable_data(transformed_x.dims(), place); const auto& runner_PowGrad_dy_temp2 = NpuOpRunner( "Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {}); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc index 0f56044d268e4..1233ae2d0ae0c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseSubMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 8df295a972559..9f70961c9f620 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseSubNPUKernel : public framework::OpKernel { public: @@ -76,7 +74,7 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { axes.push_back(i); } phi::DenseTensor* tmp_dout = const_cast(dout); - Tensor reduced_dout(dx->type()); + phi::DenseTensor reduced_dout(dx->type()); if (axes.size() != 0) { std::vector reduced_dout_dims; for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { @@ -124,8 +122,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { axes.push_back(i); } phi::DenseTensor* tmp_dout = const_cast(dout); - Tensor reduced_dy(dy->type()); - Tensor reduced_dout(dy->type()); + phi::DenseTensor reduced_dy(dy->type()); + phi::DenseTensor reduced_dout(dy->type()); if (axes.size() != 0) { std::vector reduced_dout_dims; diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h index 58b6b619c231a..a3462a00bcfb1 100644 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index 1205fc0447f1e..2c62dc570ff21 100644 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_as_v2_op_mlu.cc b/paddle/fluid/operators/expand_as_v2_op_mlu.cc index 8184af44916bb..71b154ff02274 100644 --- a/paddle/fluid/operators/expand_as_v2_op_mlu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ExpandAsV2MLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 35d16311a97b3..6d6739eed6702 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -85,7 +85,6 @@ inline std::vector get_expand_times( } } -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index d5748328b1d4d..95a4147c88dbd 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ExpandV2NPUKernel : public framework::OpKernel { public: @@ -121,8 +120,8 @@ class ExpandV2NPUKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); @@ -174,8 +173,8 @@ class ExpandV2NPUGradKernel : public framework::OpKernel { axes.push_back(i); } - Tensor tmp_dout(dout->dtype()); - Tensor reduced_dout(dx->dtype()); + phi::DenseTensor tmp_dout(dout->dtype()); + phi::DenseTensor reduced_dout(dx->dtype()); tmp_dout.ShareDataWith(*dout); if (axes.size() != 0) { std::vector reduced_dout_dims; diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc index 6a01992c83335..74bbc531c27e3 100644 --- a/paddle/fluid/operators/eye_op_npu.cc +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class EyeNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 433288e885d01..025c73db8c375 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; inline void FCOutputSize(const framework::DDim& in_dims, const framework::DDim& w_dims, diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 22df3e5a9d23a..a3ea1af82ee4d 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { public: @@ -80,7 +78,7 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { } else { out->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(data_type)); - Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); + phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, value); diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc index 664d70609e939..8263534f4eeeb 100644 --- a/paddle/fluid/operators/fill_constant_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -60,7 +60,8 @@ class FillConstantMLUKernel : public framework::OpKernel { value_tensor->numel(), 1, platform::errors::InvalidArgument( - "When use Tensor as value to set Tensor value in fill_cosntant, " + "When use phi::DenseTensor as value to set phi::DenseTensor " + "value in fill_cosntant, " "value input(ValueTensor) size must be 1, but get %d", value_tensor->numel())); value_data = value_tensor->data(); diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 56068684e16ce..c07a69177b832 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -43,7 +43,6 @@ namespace cg = cooperative_groups; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 04f1099168a5c..95e6611d9351f 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -29,7 +29,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 65d3f809fa11c..81af8e64f2767 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -28,8 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FlattenOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc index 177825020d0dc..6c8f986c5e5df 100644 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ b/paddle/fluid/operators/flatten_op_npu.cc @@ -55,8 +55,6 @@ class Flatten2GradNPUKernel : public framework::OpKernel { } }; -using Tensor = phi::DenseTensor; - template class FlattenContiguousRangeNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h index 0f8072520be2f..c5b903559a07b 100644 --- a/paddle/fluid/operators/fsp_op.h +++ b/paddle/fluid/operators/fsp_op.h @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FSPOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index c8ea19d463a1b..c0157c8cb04dd 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; // support gemm-nt and gemm-nn, which is used in fused_attention_op. template class AttnMatMul { diff --git a/paddle/fluid/operators/fused/attn_gemm_int8.h b/paddle/fluid/operators/fused/attn_gemm_int8.h index cdbd5b2e0b821..e26273b745260 100644 --- a/paddle/fluid/operators/fused/attn_gemm_int8.h +++ b/paddle/fluid/operators/fused/attn_gemm_int8.h @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::backends::gpu::GpuLaunchConfig; template diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 87ed8fb68fe2a..dee0c1837a452 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -27,7 +27,6 @@ namespace paddle { namespace operators { #if PADDLE_WITH_HIP || CUDNN_VERSION >= 7100 -using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; @@ -77,8 +76,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { const std::string padding_algorithm = ctx.Attr("padding_algorithm"); - Tensor transformed_input_channel(input->dtype()); - Tensor transformed_output(output->dtype()); + phi::DenseTensor transformed_input_channel(input->dtype()); + phi::DenseTensor transformed_output(output->dtype()); transformed_input_channel = *input; transformed_output = *output; T* output_data = transformed_output.data(); @@ -99,7 +98,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { int data_dim = strides.size(); // 2d or 3d bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input; + phi::DenseTensor transformed_input; std::vector padding_common(data_dim, 0); if (!is_sys_pad) { std::vector padding_diff(data_dim); @@ -144,7 +143,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } break; default: PADDLE_THROW(platform::errors::PermissionDenied( - "Operator Conv2DFusion expects Input to be a 4-D or 5-D Tensor. " + "Operator Conv2DFusion expects Input to be a 4-D or 5-D " + "phi::DenseTensor. 
" "But received the actual dimension = %d, shape = [%s].", rank, transformed_input_channel.dims())); diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 7c9af00955963..cbf098819212f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -31,7 +31,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace framework = paddle::framework; namespace platform = paddle::platform; namespace op = paddle::operators; -using Tensor = phi::DenseTensor; USE_OP_ITSELF(batch_norm); PD_DECLARE_KERNEL(batch_norm, GPU, ALL_LAYOUT); @@ -149,15 +148,15 @@ void ComputeInplaceRelu(phi::DenseTensor *cpu_x) { } void ComputeBatchNormForward(const phi::GPUContext &ctx, - const Tensor &cpu_x, - const Tensor &cpu_scale, - const Tensor &cpu_bias, - Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, - Tensor *cpu_y, - Tensor *saved_reserve_space) { + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_scale, + const phi::DenseTensor &cpu_bias, + phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *scale = scope.Var("Scale")->GetMutable(); @@ -215,16 +214,16 @@ void ComputeBatchNormForward(const phi::GPUContext &ctx, } void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, - const Tensor &cpu_x, - const Tensor &cpu_z, - const Tensor &cpu_scale, - const Tensor &cpu_bias, - Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, - Tensor *cpu_y, - Tensor *saved_reserve_space) { + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_z, + const phi::DenseTensor &cpu_scale, + const phi::DenseTensor &cpu_bias, + phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *z = scope.Var("Z")->GetMutable(); @@ -278,18 +277,18 @@ void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, } void ComputeFusedBNAddReluBackward(const phi::GPUContext &ctx, - const Tensor &cpu_dy, - const Tensor &cpu_x, - const Tensor &cpu_scale, - const Tensor &cpu_bias, - const Tensor &cpu_saved_mean, - const Tensor &cpu_saved_var, - const Tensor &cpu_y, - const Tensor &saved_reserve_space, - Tensor *cpu_dx, - Tensor *cpu_dz, - Tensor *cpu_dscale, - Tensor *cpu_dbias) { + const phi::DenseTensor &cpu_dy, + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_scale, + const phi::DenseTensor &cpu_bias, + const phi::DenseTensor &cpu_saved_mean, + const phi::DenseTensor &cpu_saved_var, + const phi::DenseTensor &cpu_y, + const phi::DenseTensor &saved_reserve_space, + phi::DenseTensor *cpu_dx, + phi::DenseTensor *cpu_dz, + phi::DenseTensor *cpu_dscale, + phi::DenseTensor *cpu_dbias) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *y = scope.Var("Y")->GetMutable(); @@ -383,7 +382,9 @@ class CudnnBNAddReluTester { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; + auto select = [&](phi::DenseTensor *in) { + return has_shortcut_ ? 
in : nullptr; + }; phi::DenseTensor cpu_mean_base_x; phi::DenseTensor cpu_var_base_x; @@ -506,10 +507,10 @@ class CudnnBNAddReluTester { InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); } - void InitMeanVar(Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var) { + void InitMeanVar(phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var) { InitConstantTensor({channels_}, static_cast(0.0f), cpu_mean); InitConstantTensor({channels_}, static_cast(1.0f), cpu_var); InitConstantTensor( @@ -519,17 +520,17 @@ class CudnnBNAddReluTester { } void BaselineForward(const phi::GPUContext &ctx, - Tensor *cpu_mean_x, - Tensor *cpu_var_x, - Tensor *cpu_saved_mean_x, - Tensor *cpu_saved_var_x, - Tensor *cpu_y, - Tensor *saved_reserve_space_x, - Tensor *cpu_mean_z = nullptr, - Tensor *cpu_var_z = nullptr, - Tensor *cpu_saved_mean_z = nullptr, - Tensor *cpu_saved_var_z = nullptr, - Tensor *saved_reserve_space_z = nullptr) { + phi::DenseTensor *cpu_mean_x, + phi::DenseTensor *cpu_var_x, + phi::DenseTensor *cpu_saved_mean_x, + phi::DenseTensor *cpu_saved_var_x, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space_x, + phi::DenseTensor *cpu_mean_z = nullptr, + phi::DenseTensor *cpu_var_z = nullptr, + phi::DenseTensor *cpu_saved_mean_z = nullptr, + phi::DenseTensor *cpu_saved_var_z = nullptr, + phi::DenseTensor *saved_reserve_space_z = nullptr) { InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); ComputeBatchNormForward(ctx, cpu_x_, @@ -566,12 +567,12 @@ class CudnnBNAddReluTester { } void BaselineForwardFusedBNAddRelu(const phi::GPUContext &ctx, - Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, - Tensor *cpu_y, - Tensor *saved_reserve_space) { + phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space) { InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); ComputeFusedBNAddReluForward(ctx, cpu_x_, @@ -587,10 +588,10 @@ class CudnnBNAddReluTester { } void BaselineBackwardFusedBNAddRelu(const phi::GPUContext &ctx, - Tensor *cpu_dx, - Tensor *cpu_dz, - Tensor *cpu_dscale, - Tensor *cpu_dbias) { + phi::DenseTensor *cpu_dx, + phi::DenseTensor *cpu_dz, + phi::DenseTensor *cpu_dscale, + phi::DenseTensor *cpu_dbias) { ComputeFusedBNAddReluBackward(ctx, cpu_dy_, cpu_x_, @@ -607,19 +608,19 @@ class CudnnBNAddReluTester { } void ComputeFusedBNStatsFinalize(const phi::GPUContext &ctx, - const Tensor &cpu_x, - const Tensor &cpu_bn_scale, - const Tensor &cpu_bn_bias, - Tensor *sum, - Tensor *sum_of_square, - Tensor *bn_scale, - Tensor *bn_bias, - Tensor *mean, - Tensor *var, - Tensor *saved_mean, - Tensor *saved_var, - Tensor *equiv_scale, - Tensor *equiv_bias) { + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_bn_scale, + const phi::DenseTensor &cpu_bn_bias, + phi::DenseTensor *sum, + phi::DenseTensor *sum_of_square, + phi::DenseTensor *bn_scale, + phi::DenseTensor *bn_bias, + phi::DenseTensor *mean, + phi::DenseTensor *var, + phi::DenseTensor *saved_mean, + phi::DenseTensor *saved_var, + phi::DenseTensor *equiv_scale, + phi::DenseTensor *equiv_bias) { phi::DenseTensor cpu_sum; phi::DenseTensor cpu_sum_of_square; ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); @@ -664,16 +665,16 @@ class CudnnBNAddReluTester { // Get forward results of CudnnBNStatsFinalize 
+ CudnnScaleBiasAddRelu void FusedForward(const phi::GPUContext &ctx, - Tensor *cpu_mean_x, - Tensor *cpu_var_x, - Tensor *cpu_saved_mean_x, - Tensor *cpu_saved_var_x, - Tensor *cpu_y, - Tensor *cpu_bitmask, - Tensor *cpu_mean_z = nullptr, - Tensor *cpu_var_z = nullptr, - Tensor *cpu_saved_mean_z = nullptr, - Tensor *cpu_saved_var_z = nullptr) { + phi::DenseTensor *cpu_mean_x, + phi::DenseTensor *cpu_var_x, + phi::DenseTensor *cpu_saved_mean_x, + phi::DenseTensor *cpu_saved_var_x, + phi::DenseTensor *cpu_y, + phi::DenseTensor *cpu_bitmask, + phi::DenseTensor *cpu_mean_z = nullptr, + phi::DenseTensor *cpu_var_z = nullptr, + phi::DenseTensor *cpu_saved_mean_z = nullptr, + phi::DenseTensor *cpu_saved_var_z = nullptr) { phi::DenseTensor x; phi::DenseTensor sum_x; phi::DenseTensor sum_of_square_x; @@ -802,10 +803,10 @@ class CudnnBNAddReluTester { // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu void FusedBackward(const phi::GPUContext &ctx, - Tensor *cpu_dx, - Tensor *cpu_dz, - Tensor *cpu_dscale, - Tensor *cpu_dbias) { + phi::DenseTensor *cpu_dx, + phi::DenseTensor *cpu_dz, + phi::DenseTensor *cpu_dscale, + phi::DenseTensor *cpu_dbias) { phi::DenseTensor dy; phi::DenseTensor x; phi::DenseTensor bn_scale; diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 0325a0e585ed3..762e86406917d 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; namespace dynload = platform::dynload; template using BatchNormParamType = @@ -70,16 +69,16 @@ class CudnnBNStatsFinalize { ~CudnnBNStatsFinalize() {} void Forward(const phi::GPUContext &ctx, - const Tensor &sum, - const Tensor &sum_of_squares, - const Tensor &scale, - const Tensor &bias, - Tensor *saved_mean, - Tensor *saved_invstd, - Tensor *running_mean, - Tensor *running_var, - Tensor *equiv_scale, - Tensor *equiv_bias, + const phi::DenseTensor &sum, + const phi::DenseTensor &sum_of_squares, + const phi::DenseTensor &scale, + const phi::DenseTensor &bias, + phi::DenseTensor *saved_mean, + phi::DenseTensor *saved_invstd, + phi::DenseTensor *running_mean, + phi::DenseTensor *running_var, + phi::DenseTensor *equiv_scale, + phi::DenseTensor *equiv_bias, double eps, float momentum, int64_t ele_count, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index bf0e06b825e4b..c82ccc959d204 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; namespace dynload = platform::dynload; template @@ -195,11 +194,11 @@ class CudnnNormConvolution { ~CudnnNormConvolution() {} void Forward(const phi::GPUContext &ctx, - const Tensor &input, - const Tensor &filter, - Tensor *output, - Tensor *sum, - Tensor *sum_of_squares) { + const phi::DenseTensor &input, + const phi::DenseTensor &filter, + phi::DenseTensor *output, + phi::DenseTensor *sum, + phi::DenseTensor *sum_of_squares) { auto cudnn_handle = ctx.cudnn_handle(); CudnnFusionOp *fwd_op = GetForwardOp(ctx); @@ -314,11 +313,11 @@ class CudnnNormConvolutionGrad { ~CudnnNormConvolutionGrad() {} void Backward(const phi::GPUContext &ctx, - const Tensor &input, - const Tensor &filter, - const Tensor &output_grad, - Tensor *input_grad, - Tensor *filter_grad, + const phi::DenseTensor &input, + const phi::DenseTensor &filter, + const phi::DenseTensor &output_grad, + phi::DenseTensor *input_grad, + phi::DenseTensor *filter_grad, bool use_addto = false) { T *input_ptr = const_cast(input.data()); T *filter_ptr = const_cast(filter.data()); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 3369a8ca4a9c5..4f7555aed8282 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -28,7 +28,6 @@ limitations under the License. */ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace op = paddle::operators; -using Tensor = phi::DenseTensor; USE_OP_ITSELF(conv2d); USE_OP_ITSELF(conv2d_grad); @@ -95,9 +94,9 @@ void CheckOutput(const phi::DenseTensor &cpu_res, // Use Paddle conv2d op results as baseline void ComputeConv2DForward(const phi::GPUContext &ctx, - const Tensor &cpu_input, - const Tensor &cpu_filter, - Tensor *cpu_output, + const phi::DenseTensor &cpu_input, + const phi::DenseTensor &cpu_filter, + phi::DenseTensor *cpu_output, int stride, int padding) { framework::Scope scope; @@ -131,9 +130,9 @@ void ComputeConv2DForward(const phi::GPUContext &ctx, // Use Paddle conv2d_grad op results as baseline void ComputeConv2DBackward(const phi::GPUContext &ctx, - const Tensor &cpu_input, - const Tensor &cpu_filter, - const Tensor &cpu_output_grad, + const phi::DenseTensor &cpu_input, + const phi::DenseTensor &cpu_filter, + const phi::DenseTensor &cpu_output_grad, phi::DenseTensor *cpu_input_grad, phi::DenseTensor *cpu_filter_grad, int stride, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index df79ed758dbc5..4ecc5795ff41a 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; namespace dynload = platform::dynload; @@ -117,14 +116,14 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} void Forward(const phi::GPUContext &ctx, - const Tensor &x, - const Tensor &x_scale, - const Tensor &x_bias, - const Tensor *z, - const Tensor *z_scale, - const Tensor *z_bias, - Tensor *out, - Tensor *bitmask) { + const phi::DenseTensor &x, + const phi::DenseTensor &x_scale, + const phi::DenseTensor &x_bias, + const phi::DenseTensor *z, + const phi::DenseTensor *z_scale, + const phi::DenseTensor *z_bias, + phi::DenseTensor *out, + phi::DenseTensor *bitmask) { ForwardInit(ctx); auto handle = ctx.cudnn_handle(); auto workspace_handle = ctx.cudnn_workspace_handle(); @@ -172,17 +171,17 @@ class CudnnScaleBiasAddRelu { } void Backward(const phi::GPUContext &ctx, - const Tensor &dy, - const Tensor &x, - const Tensor &scale, - const Tensor &bias, - const Tensor &saved_mean, - const Tensor &saved_invstd, - const Tensor *bitmask, - Tensor *dx, - Tensor *dz, - Tensor *dscale, - Tensor *dbias, + const phi::DenseTensor &dy, + const phi::DenseTensor &x, + const phi::DenseTensor &scale, + const phi::DenseTensor &bias, + const phi::DenseTensor &saved_mean, + const phi::DenseTensor &saved_invstd, + const phi::DenseTensor *bitmask, + phi::DenseTensor *dx, + phi::DenseTensor *dz, + phi::DenseTensor *dscale, + phi::DenseTensor *dbias, double eps) { BackwardInit(ctx); auto handle = ctx.cudnn_handle(); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 11939a454b9a0..47459884cc544 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -27,8 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AttnDropoutParam { public: AttnDropoutParam() { diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 03c97ec345fb8..b05a63510e385 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedAttentionOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index ef5087f0534e1..9454e589ec920 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -38,8 +38,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, @@ -528,7 +526,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { int input_size = dim_embed; bool add_residual = ctx.Attr("add_residual"); - Tensor d_residual; + phi::DenseTensor d_residual; T *d_residual_data = nullptr; if (add_residual) { d_residual.Resize(input_x_dims); @@ -728,8 +726,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { if (add_residual) { // gradient accumulation - std::vector ins = {&d_residual, d_x}; - std::vector outs = {d_x}; + std::vector ins = {&d_residual, d_x}; + std::vector outs = {d_x}; phi::funcs::ElementwiseKernel( ctx.cuda_device_context(), ins, &outs, phi::funcs::AddFunctor()); } diff --git a/paddle/fluid/operators/fused/fused_attention_op_xpu.cc b/paddle/fluid/operators/fused/fused_attention_op_xpu.cc index 6bf2e3d80335f..bbfa48f1dca78 100644 --- a/paddle/fluid/operators/fused/fused_attention_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_attention_op_xpu.cc @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -33,86 +31,88 @@ class FusedAttentionOpKernel : public framework::OpKernel { using XPUTypeT = typename XPUTypeTrait::Type; // inputs tensor - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); // shape [3, num_head, dim_head, dim_embed] - auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_weight = ctx.Input("QKVW"); // shape [3 , num_head, dim_head] - auto *qkv_bias = ctx.Input("QKVBias"); + auto *qkv_bias = ctx.Input("QKVBias"); // shape [batch_size, 1, 1, seq_len] - auto *src_mask = ctx.Input("SrcMask"); + auto *src_mask = ctx.Input("SrcMask"); // shape [dim_embed, dim_embed] - auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_weight = ctx.Input("OutLinearW"); // shape [dim_embed] - auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); - const Tensor *ln_scale = nullptr; - const Tensor *ln_bias = nullptr; + const phi::DenseTensor *ln_scale = nullptr; + const phi::DenseTensor *ln_bias = nullptr; float epsilon = 0.0f; if (pre_layer_norm) { - ln_scale = ctx.Input("LnScale"); - ln_bias = ctx.Input("LnBias"); + ln_scale = ctx.Input("LnScale"); + ln_bias = ctx.Input("LnBias"); epsilon = ctx.Attr("epsilon"); } else { - ln_scale = ctx.Input("Ln2Scale"); - ln_bias = ctx.Input("Ln2Bias"); + ln_scale = ctx.Input("Ln2Scale"); + ln_bias = ctx.Input("Ln2Bias"); epsilon = ctx.Attr("ln_epsilon"); } // outputs tensor // qkv 的值,并已经做了transpos后的值 // shape [3, batch_size, num_head, seq_len, dim_head] - auto *TransposeOut2 = ctx.Output("TransposeOut2"); + auto *TransposeOut2 = ctx.Output("TransposeOut2"); // shape [batch_size, num_head, seq_len, seq_len] - auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *softmax_out = ctx.Output("SoftmaxOut"); // shape [batch_size, num_head, seq_len, seq_len] - auto *attn_dropout_mask_out = ctx.Output("AttnDropoutMaskOut"); + auto *attn_dropout_mask_out = + ctx.Output("AttnDropoutMaskOut"); // shape [batch_size, num_head, seq_len, seq_len] - auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); + auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); // shape [[batch_size, seq_len, num_head, dim_head]] - auto *fmha_out = 
ctx.Output("FMHAOut"); + auto *fmha_out = ctx.Output("FMHAOut"); // shape [batch_size, seq_len, dim_embed] - auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); // final output // shape [batch_size, seq_len, dim_embed] - auto *out = ctx.Output("Y"); + auto *out = ctx.Output("Y"); // 下面这个tensor是不需要返回, 但是新的动态图需要 - auto *QKOut = ctx.Output("QKOut"); + auto *QKOut = ctx.Output("QKOut"); QKOut->mutable_data(ctx.GetPlace()); - auto *QKTVOut = ctx.Output("QKTVOut"); + auto *QKTVOut = ctx.Output("QKTVOut"); QKTVOut->mutable_data(ctx.GetPlace()); - auto *OutLinearOut = ctx.Output("OutLinearOut"); + auto *OutLinearOut = ctx.Output("OutLinearOut"); OutLinearOut->mutable_data(ctx.GetPlace()); - auto *QKVBiasOut = ctx.Output("QKVBiasOut"); + auto *QKVBiasOut = ctx.Output("QKVBiasOut"); QKVBiasOut->mutable_data(ctx.GetPlace()); - auto *SrcMaskOut = ctx.Output("SrcMaskOut"); + auto *SrcMaskOut = ctx.Output("SrcMaskOut"); SrcMaskOut->mutable_data(ctx.GetPlace()); - auto *qkv_out = ctx.Output("QKVOut"); + auto *qkv_out = ctx.Output("QKVOut"); qkv_out->mutable_data(ctx.GetPlace()); - Tensor *bias_dropout_residual_out = nullptr; - Tensor *ln_mean = nullptr; - Tensor *ln_var = nullptr; - Tensor *ln_out = nullptr; + phi::DenseTensor *bias_dropout_residual_out = nullptr; + phi::DenseTensor *ln_mean = nullptr; + phi::DenseTensor *ln_var = nullptr; + phi::DenseTensor *ln_out = nullptr; if (pre_layer_norm) { - ln_mean = ctx.Output("LnMean"); - ln_var = ctx.Output("LnVariance"); - ln_out = ctx.Output("LnOut"); + ln_mean = ctx.Output("LnMean"); + ln_var = ctx.Output("LnVariance"); + ln_out = ctx.Output("LnOut"); } else { - ln_mean = ctx.Output("Ln2Mean"); - ln_var = ctx.Output("Ln2Variance"); - bias_dropout_residual_out = ctx.Output("BiasDropoutResidualOut"); + ln_mean = ctx.Output("Ln2Mean"); + ln_var = ctx.Output("Ln2Variance"); + bias_dropout_residual_out = + ctx.Output("BiasDropoutResidualOut"); } // dropout info @@ -125,7 +125,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { bool is_upscale_in_train_1 = (dropout_implementation_1 == "upscale_in_train"); - auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + auto *seed_1 = + ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); @@ -468,7 +469,8 @@ class FusedAttentionGradXPUKernel : public framework::OpKernel { ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = (dropout_implementation_1 == "upscale_in_train"); - auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + auto *seed_1 = + ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); @@ -482,79 +484,81 @@ class FusedAttentionGradXPUKernel : public framework::OpKernel { XPUDropoutParam dropout_param(ctx, 0); // get inputs. 
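// --- Annotation (editorial sketch, not part of the original patch): the
// backward kernel below repeats the same mechanical rewrite as the forward
// kernel above. With the file-local alias `using Tensor = phi::DenseTensor;`
// deleted, the template argument of every ctx.Input / ctx.Output call is
// spelled out; only that argument changes in each -/+ pair. Reconstructed
// for illustration, the first pair reads:
//   auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));            // old
//   auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));  // new
// No kernel logic changes: the reinterpret_cast plumbing to XPUTypeT and
// the pre_layer_norm branching are untouched.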
- auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *d_y = ctx.Input(framework::GradVarName("Y")); const XPUTypeT *d_y_ptr = reinterpret_cast(d_y->data()); // 前向必要参数 - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); const XPUTypeT *input_x_ptr = reinterpret_cast(input_x->data()); - auto *qkv_transpose_out = ctx.Input("TransposeOut2"); + auto *qkv_transpose_out = ctx.Input("TransposeOut2"); const XPUTypeT *qkv_transpose_out_ptr = reinterpret_cast(qkv_transpose_out->data()); - auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_weight = ctx.Input("QKVW"); const XPUTypeT *qkv_weight_ptr = reinterpret_cast(qkv_weight->data()); - auto *softmax_out = ctx.Input("SoftmaxOut"); + auto *softmax_out = ctx.Input("SoftmaxOut"); const XPUTypeT *softmax_out_ptr = reinterpret_cast(softmax_out->data()); - auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); const XPUTypeT *attn_dropout_out_ptr = reinterpret_cast(attn_dropout_out->data()); - auto *attn_dropout_mask = ctx.Input("AttnDropoutMaskOut"); + auto *attn_dropout_mask = ctx.Input("AttnDropoutMaskOut"); const XPUTypeT *attn_dropout_mask_ptr = reinterpret_cast(attn_dropout_mask->data()); - auto *fmha_out = ctx.Input("FMHAOut"); + auto *fmha_out = ctx.Input("FMHAOut"); const XPUTypeT *fmha_out_ptr = reinterpret_cast(fmha_out->data()); - auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_weight = ctx.Input("OutLinearW"); const XPUTypeT *out_linear_weight_ptr = reinterpret_cast(out_linear_weight->data()); - auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); const XPUTypeT *dropout_mask_out_ptr = reinterpret_cast(dropout_mask_out->data()); // 需要计算的梯度 - auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); + auto *d_qkv_weight = + ctx.Output(framework::GradVarName("QKVW")); XPUTypeT *d_qkv_weight_ptr = reinterpret_cast( d_qkv_weight->mutable_data(ctx.GetPlace())); - auto *d_qkv_bias = ctx.Output(framework::GradVarName("QKVBias")); + auto *d_qkv_bias = + ctx.Output(framework::GradVarName("QKVBias")); XPUTypeT *d_qkv_bias_ptr = reinterpret_cast( d_qkv_bias->mutable_data(ctx.GetPlace())); auto *d_out_linear_weight = - ctx.Output(framework::GradVarName("OutLinearW")); + ctx.Output(framework::GradVarName("OutLinearW")); XPUTypeT *d_out_linear_weight_ptr = reinterpret_cast( d_out_linear_weight->mutable_data(ctx.GetPlace())); auto *d_out_linear_bias = - ctx.Output(framework::GradVarName("OutLinearBias")); + ctx.Output(framework::GradVarName("OutLinearBias")); XPUTypeT *d_out_linear_bias_ptr = reinterpret_cast( d_out_linear_bias->mutable_data(ctx.GetPlace())); // 有可能需要 auto *d_src_mask_out = - ctx.Output(framework::GradVarName("SrcMaskOut")); + ctx.Output(framework::GradVarName("SrcMaskOut")); XPUTypeT *d_src_mask_out_ptr = (d_src_mask_out == nullptr) ? 
(nullptr) : (reinterpret_cast( d_src_mask_out->mutable_data(ctx.GetPlace()))); // 输出 dx - auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_x = ctx.Output(framework::GradVarName("X")); XPUTypeT *d_x_ptr = reinterpret_cast(d_x->mutable_data(ctx.GetPlace())); - const Tensor *ln_out = nullptr; - const Tensor *bias_dropout_residual_out = nullptr; - const Tensor *ln_scale = nullptr; - const Tensor *ln_mean = nullptr; - const Tensor *ln_var = nullptr; - Tensor *d_ln_scale = nullptr; - Tensor *d_ln_bias = nullptr; + const phi::DenseTensor *ln_out = nullptr; + const phi::DenseTensor *bias_dropout_residual_out = nullptr; + const phi::DenseTensor *ln_scale = nullptr; + const phi::DenseTensor *ln_mean = nullptr; + const phi::DenseTensor *ln_var = nullptr; + phi::DenseTensor *d_ln_scale = nullptr; + phi::DenseTensor *d_ln_bias = nullptr; const XPUTypeT *ln_out_ptr = NULL; const float *ln_scale_ptr = NULL; @@ -567,23 +571,28 @@ class FusedAttentionGradXPUKernel : public framework::OpKernel { float epsilon = 0.0f; if (pre_layer_norm) { - ln_out = ctx.Input("LnOut"); + ln_out = ctx.Input("LnOut"); ln_out_ptr = reinterpret_cast(ln_out->data()); - ln_scale = ctx.Input("LnScale"); - ln_mean = ctx.Input("LnMean"); - ln_var = ctx.Input("LnVariance"); + ln_scale = ctx.Input("LnScale"); + ln_mean = ctx.Input("LnMean"); + ln_var = ctx.Input("LnVariance"); epsilon = ctx.Attr("epsilon"); - d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); - d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + d_ln_scale = + ctx.Output(framework::GradVarName("LnScale")); + d_ln_bias = + ctx.Output(framework::GradVarName("LnBias")); } else { - ln_scale = ctx.Input("Ln2Scale"); - ln_mean = ctx.Input("Ln2Mean"); - ln_var = ctx.Input("Ln2Variance"); + ln_scale = ctx.Input("Ln2Scale"); + ln_mean = ctx.Input("Ln2Mean"); + ln_var = ctx.Input("Ln2Variance"); epsilon = ctx.Attr("ln_epsilon"); - d_ln_scale = ctx.Output(framework::GradVarName("Ln2Scale")); - d_ln_bias = ctx.Output(framework::GradVarName("Ln2Bias")); - bias_dropout_residual_out = ctx.Input("BiasDropoutResidualOut"); + d_ln_scale = + ctx.Output(framework::GradVarName("Ln2Scale")); + d_ln_bias = + ctx.Output(framework::GradVarName("Ln2Bias")); + bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); bias_dropout_residual_out_ptr = reinterpret_cast( bias_dropout_residual_out->data()); } diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 94131197060b5..02494e33e1241 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 664e20b686d7e..2562c2cc22575 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -25,8 +25,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index faf4a1aae44b6..e68be43eb7ec0 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -304,9 +304,9 @@ framework::OpKernelType FusedBatchNormActGradOp::GetExpectedKernelType( PADDLE_THROW(platform::errors::NotFound( "Can not find Y@GRAD in the execution context.")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index c7fbdc88abb33..4023aaa8445f9 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -30,7 +30,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -143,7 +142,7 @@ class FusedBatchNormActKernel size_t reserve_space_size = 0; void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. @@ -340,7 +339,7 @@ class FusedBatchNormActGradKernel size_t workspace_size = 0; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; platform::ScopedActivationDescriptor scope_act_desc; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h index f8aab994cb371..b71812db9d3d3 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h @@ -26,7 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedBatchNormActOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index 2d51a3efaf699..08f7087b48d01 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -262,9 +262,9 @@ framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType( PADDLE_THROW(platform::errors::NotFound( "Can not find Y@GRAD in the execution context.")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 5a192b2df5c94..4c4756b8e1979 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -30,7 +30,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -120,7 +119,7 @@ class FusedBatchNormAddActKernel size_t reserve_space_size = 0; void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. @@ -296,7 +295,7 @@ class FusedBatchNormAddActGradKernel size_t workspace_size = 0; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; platform::ScopedActivationDescriptor scope_act_desc; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index f4913bca3df98..bdb1f2f35444c 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -26,7 +26,6 @@ limitations under the License. 
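// The GetExpectedKernelType hunks above dispatch on the gradient
// variable's concrete type before reading its dtype. A sketch of the
// "+" side, assuming the unchanged second branch still dispatches on
// framework::LoDTensor (an assumption, not confirmed by this patch):
//   const phi::DenseTensor *t = nullptr;
//   if (var->IsType<phi::DenseTensor>()) {
//     t = &var->Get<phi::DenseTensor>();
//   } else if (var->IsType<framework::LoDTensor>()) {
//     t = &var->Get<framework::LoDTensor>();
//   }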
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedBatchNormAddActOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 8360f07a5f3e7..b8f2cc5b4b335 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -33,7 +33,6 @@ template class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = phi::DenseTensor; auto &device_ctx = context.template device_context(); auto ids = context.MultiInput("Ids"); auto embs = context.MultiInput("Embs"); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 9c58c6900959e..885f3412a4e06 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -182,16 +182,17 @@ void FusedEmbeddingFCLSTMOpMaker::Make() { "contains the ids to be looked up in W. " "The last dimension size must be 1."); AddInput("Embeddings", - "(Tensor) the learnable weights of X." + "(phi::DenseTensor) the learnable weights of X." " - The shape is (M x 4D), where M is the dim size of x, D is the " "hidden size. " " - Weight = {W_cx, W_ix, W_fx, W_ox}"); - AddInput("WeightH", - "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput( + "WeightH", + "(phi::DenseTensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); AddInput("Bias", - "(Tensor) the learnable weights. Almost same as LSTMOp" + "(phi::DenseTensor) the learnable weights. Almost same as LSTMOp" "Note: we should add the fc bias into this (1x4D) in bias." "input-hidden bias weight and peephole connections weight if " "setting `use_peepholes` True. " @@ -202,13 +203,15 @@ void FusedEmbeddingFCLSTMOpMaker::Make() { " - The shape is (1 x 7D). " " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); AddInput("H0", - "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) the initial hidden " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size and D is the hidden size.") .AsDispensable(); AddInput("C0", - "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) (the initial cell " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size. 
`H0` and `C0` can be NULL but only at the same time.") @@ -318,7 +321,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { /* diagonal weight*/ \ const T* wc_data = bias->data() + D4; \ /* for peephole only*/ \ - Tensor checked_cell; \ + phi::DenseTensor checked_cell; \ T* checked_cell_data = nullptr; \ auto place = ctx.GetPlace(); \ if (use_peepholes) { \ diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h index 181fa06b02034..19039ec55946d 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 0e4134d428094..9fa62a3704547 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -28,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; @@ -175,7 +174,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { auto len = ids_t->numel(); int idx_width = len / offset.back(); - Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + phi::DenseTensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; csr_vals_t.Resize({len}); csr_colmuns_t.Resize({len}); csr_row_idx_t.Resize({(batch_size + 1) * idx_width}); @@ -300,7 +299,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto len = ids->numel(); int idx_width = len / offset.back(); - Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + phi::DenseTensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; csr_vals_t.Resize({len}); csr_colmuns_t.Resize({len}); int64_t batch_size = ids_lod[0].size() - 1; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index aaf84c7b1eadb..3bf039829ac3d 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedFeedForwardOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 669672084b52b..28a9cb167e093 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -31,8 +31,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void AllReduce(phi::DenseTensor& tensor, // NOLINT const int ring_id, diff --git a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc index b94d37a921fb6..4b9ba95143345 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc @@ -26,30 +26,28 @@ limitations under the License. 
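// The OpMaker hunks apply the same rename inside user-facing doc
// strings, so "(Tensor, ...)" prefixes become "(phi::DenseTensor, ...)".
// Illustrative pattern only; "H0" here stands in for any such input:
//   AddInput("H0",
//            "(phi::DenseTensor, optional) initial hidden state with "
//            "shape (N x D).")
//       .AsDispensable();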
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedFeedForwardXPUKernel : public framework::OpKernel { using XPUTypeT = typename XPUTypeTrait::Type; public: void FFN(const phi::XPUContext& dev_ctx, - const Tensor* x, - const Tensor* linear1_weight, - const Tensor* linear1_bias, - const Tensor* linear2_weight, - const Tensor* linear2_bias, - const Tensor* ln_scale, - const Tensor* ln_bias, - Tensor* out, - Tensor* dropout1_mask, - Tensor* dropout2_mask, - Tensor* ln_mean, - Tensor* ln_variance, - Tensor* linear1_out, - Tensor* ln1_out, - Tensor* dropout1_out, - Tensor* dropout2_out, + const phi::DenseTensor* x, + const phi::DenseTensor* linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor* linear2_weight, + const phi::DenseTensor* linear2_bias, + const phi::DenseTensor* ln_scale, + const phi::DenseTensor* ln_bias, + phi::DenseTensor* out, + phi::DenseTensor* dropout1_mask, + phi::DenseTensor* dropout2_mask, + phi::DenseTensor* ln_mean, + phi::DenseTensor* ln_variance, + phi::DenseTensor* linear1_out, + phi::DenseTensor* ln1_out, + phi::DenseTensor* dropout1_out, + phi::DenseTensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, @@ -255,41 +253,41 @@ class FusedFeedForwardXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto place = context.GetPlace(); - auto* x = context.Input("X"); + auto* x = context.Input("X"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto* linear2_weight = context.Input("Linear2Weight"); - auto* linear2_bias = context.Input("Linear2Bias"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto* linear2_weight = context.Input("Linear2Weight"); + auto* linear2_bias = context.Input("Linear2Bias"); const bool pre_layer_norm = context.Attr("pre_layer_norm"); - const Tensor* ln_scale = nullptr; - const Tensor* ln_bias = nullptr; - Tensor* ln_mean = nullptr; - Tensor* ln_variance = nullptr; - Tensor* ln1_out = nullptr; + const phi::DenseTensor* ln_scale = nullptr; + const phi::DenseTensor* ln_bias = nullptr; + phi::DenseTensor* ln_mean = nullptr; + phi::DenseTensor* ln_variance = nullptr; + phi::DenseTensor* ln1_out = nullptr; if (pre_layer_norm) { - ln_scale = context.Input("Ln1Scale"); - ln_bias = context.Input("Ln1Bias"); - ln_mean = context.Output("Ln1Mean"); - ln_variance = context.Output("Ln1Variance"); - ln1_out = context.Output("Ln1Out"); + ln_scale = context.Input("Ln1Scale"); + ln_bias = context.Input("Ln1Bias"); + ln_mean = context.Output("Ln1Mean"); + ln_variance = context.Output("Ln1Variance"); + ln1_out = context.Output("Ln1Out"); ln1_out->mutable_data(place); } else { - ln_scale = context.Input("Ln2Scale"); - ln_bias = context.Input("Ln2Bias"); - ln_mean = context.Output("Ln2Mean"); - ln_variance = context.Output("Ln2Variance"); + ln_scale = context.Input("Ln2Scale"); + ln_bias = context.Input("Ln2Bias"); + ln_mean = context.Output("Ln2Mean"); + ln_variance = context.Output("Ln2Variance"); } - auto* out = context.Output("Out"); - auto* dropout1_mask = context.Output("Dropout1Mask"); - auto* dropout2_mask = context.Output("Dropout2Mask"); - auto* linear1_out = context.Output("Linear1Out"); + auto* out = context.Output("Out"); + auto* dropout1_mask = context.Output("Dropout1Mask"); + auto* dropout2_mask = context.Output("Dropout2Mask"); + auto* linear1_out = 
context.Output("Linear1Out"); - auto* dropout1_out = context.Output("Dropout1Out"); - auto* dropout2_out = context.Output("Dropout2Out"); + auto* dropout1_out = context.Output("Dropout1Out"); + auto* dropout2_out = context.Output("Dropout2Out"); const std::string act_method = context.Attr("act_method"); @@ -356,26 +354,26 @@ class FusedFeedForwardGradXPUKernel : public framework::OpKernel { public: void FFNGrad(const phi::XPUContext& dev_ctx, - const Tensor* d_out, - const Tensor* x, - const Tensor* dropout1_mask, - const Tensor* dropout2_mask, - const Tensor* linear1_out, - const Tensor* ln1_out, - const Tensor* dropout1_out, - const Tensor* dropout2_out, - const Tensor* linear1_weight, - const Tensor* linear2_weight, - const Tensor* ln_scale, - const Tensor* ln_mean, - const Tensor* ln_variance, - Tensor* d_x, - Tensor* d_linear1_weight, - Tensor* d_linear1_bias, - Tensor* d_linear2_weight, - Tensor* d_linear2_bias, - Tensor* d_ln_scale, - Tensor* d_ln_bias, + const phi::DenseTensor* d_out, + const phi::DenseTensor* x, + const phi::DenseTensor* dropout1_mask, + const phi::DenseTensor* dropout2_mask, + const phi::DenseTensor* linear1_out, + const phi::DenseTensor* ln1_out, + const phi::DenseTensor* dropout1_out, + const phi::DenseTensor* dropout2_out, + const phi::DenseTensor* linear1_weight, + const phi::DenseTensor* linear2_weight, + const phi::DenseTensor* ln_scale, + const phi::DenseTensor* ln_mean, + const phi::DenseTensor* ln_variance, + phi::DenseTensor* d_x, + phi::DenseTensor* d_linear1_weight, + phi::DenseTensor* d_linear1_bias, + phi::DenseTensor* d_linear2_weight, + phi::DenseTensor* d_linear2_bias, + phi::DenseTensor* d_ln_scale, + phi::DenseTensor* d_ln_bias, const int bsz_seq, const int d_model, const int dim_feedforward, @@ -696,55 +694,61 @@ class FusedFeedForwardGradXPUKernel : public framework::OpKernel { auto place = context.GetPlace(); const bool pre_layer_norm = context.Attr("pre_layer_norm"); // inputs - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* x = context.Input("X"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); - auto* dropout1_mask = context.Input("Dropout1Mask"); - auto* dropout2_mask = context.Input("Dropout2Mask"); - auto* linear1_out = context.Input("Linear1Out"); - auto* ln1_out = pre_layer_norm ? context.Input("Ln1Out") : nullptr; + auto* dropout1_mask = context.Input("Dropout1Mask"); + auto* dropout2_mask = context.Input("Dropout2Mask"); + auto* linear1_out = context.Input("Linear1Out"); + auto* ln1_out = + pre_layer_norm ? 
context.Input("Ln1Out") : nullptr; - auto* dropout1_out = context.Input("Dropout1Out"); - auto* dropout2_out = context.Input("Dropout2Out"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear2_weight = context.Input("Linear2Weight"); + auto* dropout1_out = context.Input("Dropout1Out"); + auto* dropout2_out = context.Input("Dropout2Out"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear2_weight = context.Input("Linear2Weight"); - const Tensor* ln_mean = nullptr; - const Tensor* ln_variance = nullptr; - const Tensor* ln_scale = nullptr; + const phi::DenseTensor* ln_mean = nullptr; + const phi::DenseTensor* ln_variance = nullptr; + const phi::DenseTensor* ln_scale = nullptr; if (pre_layer_norm) { - ln_mean = context.Input("Ln1Mean"); - ln_variance = context.Input("Ln1Variance"); - ln_scale = context.Input("Ln1Scale"); + ln_mean = context.Input("Ln1Mean"); + ln_variance = context.Input("Ln1Variance"); + ln_scale = context.Input("Ln1Scale"); } else { - ln_mean = context.Input("Ln2Mean"); - ln_variance = context.Input("Ln2Variance"); - ln_scale = context.Input("Ln2Scale"); + ln_mean = context.Input("Ln2Mean"); + ln_variance = context.Input("Ln2Variance"); + ln_scale = context.Input("Ln2Scale"); } // output - auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_x = context.Output(framework::GradVarName("X")); - Tensor* d_ln_scale = nullptr; - Tensor* d_ln_bias = nullptr; + phi::DenseTensor* d_ln_scale = nullptr; + phi::DenseTensor* d_ln_bias = nullptr; if (pre_layer_norm) { - d_ln_scale = context.Output(framework::GradVarName("Ln1Scale")); - d_ln_bias = context.Output(framework::GradVarName("Ln1Bias")); + d_ln_scale = + context.Output(framework::GradVarName("Ln1Scale")); + d_ln_bias = + context.Output(framework::GradVarName("Ln1Bias")); } else { - d_ln_scale = context.Output(framework::GradVarName("Ln2Scale")); - d_ln_bias = context.Output(framework::GradVarName("Ln2Bias")); + d_ln_scale = + context.Output(framework::GradVarName("Ln2Scale")); + d_ln_bias = + context.Output(framework::GradVarName("Ln2Bias")); } - auto* d_linear1_weight = - context.Output(framework::GradVarName("Linear1Weight")); + auto* d_linear1_weight = context.Output( + framework::GradVarName("Linear1Weight")); auto* d_linear1_bias = - context.Output(framework::GradVarName("Linear1Bias")); - auto* d_linear2_weight = - context.Output(framework::GradVarName("Linear2Weight")); + context.Output(framework::GradVarName("Linear1Bias")); + auto* d_linear2_weight = context.Output( + framework::GradVarName("Linear2Weight")); auto* d_linear2_bias = - context.Output(framework::GradVarName("Linear2Bias")); + context.Output(framework::GradVarName("Linear2Bias")); float epsilon = 0.0f; if (pre_layer_norm) { diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index d55d047009255..b7611eff765d2 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline std::string MemoryDebugString(const phi::DenseTensor& t) { int device_id = platform::GetCurrentDeviceId(); int64_t allocated = @@ -233,17 +231,17 @@ struct GateAttentionConfig { } protected: - Tensor qkv_out; - Tensor query_out; - Tensor key_out; - Tensor value_out; + phi::DenseTensor qkv_out; + phi::DenseTensor query_out; + phi::DenseTensor key_out; + phi::DenseTensor value_out; // qk_out = BatchedGEMM(Q, K^T) // qk_out: shape=[batch_size, seq_len_m, num_heads, seq_len_r, m_size] // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) // The shape of qk_out, softmax_out is the same, thus can be called inplace. - Tensor qk_out; + phi::DenseTensor qk_out; // qktv_out may reuse gate_out. - Tensor qktv_out; + phi::DenseTensor qktv_out; }; template @@ -312,11 +310,11 @@ struct GateAttentionGradConfig : public GateAttentionConfig { } protected: - Tensor qkv_out_grad; - Tensor query_out_grad; - Tensor key_out_grad; - Tensor value_out_grad; - Tensor qk_out_grad; + phi::DenseTensor qkv_out_grad; + phi::DenseTensor query_out_grad; + phi::DenseTensor key_out_grad; + phi::DenseTensor value_out_grad; + phi::DenseTensor qk_out_grad; }; template @@ -461,10 +459,10 @@ class FMHAGateRef { T* k_grad_ptr = nullptr; T* v_grad_ptr = nullptr; - Tensor q_transpose_out_grad; - Tensor k_transpose_out_grad; - Tensor v_transpose_out_grad; - Tensor qkv_transpose_out_grad; + phi::DenseTensor q_transpose_out_grad; + phi::DenseTensor k_transpose_out_grad; + phi::DenseTensor v_transpose_out_grad; + phi::DenseTensor qkv_transpose_out_grad; if (merge_qkv_) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, @@ -513,7 +511,7 @@ class FMHAGateRef { v_transpose_out_grad.numel() * sizeof(T)); } - Tensor softmax_out_grad; + phi::DenseTensor softmax_out_grad; softmax_out_grad.Resize(config->softmax_out_dims); AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); @@ -521,7 +519,7 @@ class FMHAGateRef { config->batch_size * config->seq_len_m * config->num_heads; { // Forward: fmha_out = transpose(qktv_out) - Tensor qktv_out_grad; + phi::DenseTensor qktv_out_grad; qktv_out_grad.Resize(config->qktv_out_dims); AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index ce7929c39ffa8..c91bca47cf42f 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; class FusedGateAttentionOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 9cb3f19ab1740..8ca6cdb46ccd9 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct SigmoidMultiplyFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -64,8 +62,8 @@ struct SigmoidMultiplyGradFunctor { template void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *query, - Tensor *qkv_out) { + const phi::DenseTensor *query, + phi::DenseTensor *qkv_out) { // query: shape=[batch_size, seq_len_m, seq_len_r, qkv_dim] // qkv_weight: shape=[3, num_heads, head_dim, qkv_dim] // qkv_out: shape=[batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] @@ -83,9 +81,9 @@ void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, template void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *qkv_out_grad, - Tensor *query_grad, + const phi::DenseTensor *query, + const phi::DenseTensor *qkv_out_grad, + phi::DenseTensor *query_grad, bool use_addto) { auto *qkv_weight = ctx.Input("QKVWeight"); auto *qkv_weight_grad = @@ -111,11 +109,11 @@ void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, template void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *query, - const Tensor *key, - Tensor *query_out, - Tensor *key_out, - Tensor *value_out) { + const phi::DenseTensor *query, + const phi::DenseTensor *key, + phi::DenseTensor *query_out, + phi::DenseTensor *key_out, + phi::DenseTensor *value_out) { auto *query_weight = ctx.Input("QueryWeight"); auto *key_weight = ctx.Input("KeyWeight"); auto *value_weight = ctx.Input("ValueWeight"); @@ -149,13 +147,13 @@ void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, template void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *key, - const Tensor *query_out_grad, - const Tensor *key_out_grad, - const Tensor *value_out_grad, - Tensor *query_grad, - Tensor *key_grad, + const phi::DenseTensor *query, + const phi::DenseTensor *key, + const phi::DenseTensor *query_out_grad, + const phi::DenseTensor *key_out_grad, + const phi::DenseTensor *value_out_grad, + phi::DenseTensor *query_grad, + phi::DenseTensor *key_grad, bool use_addto) { // Gradient of GEMM(key, k_weight) const auto *key_weight = ctx.Input("KeyWeight"); @@ -209,9 +207,9 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, template void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *query, - const Tensor *fmha_out, - Tensor *gate_out) { + const phi::DenseTensor *query, + const phi::DenseTensor *fmha_out, + phi::DenseTensor *gate_out) { auto *gate_weight = ctx.Input("GateWeight"); auto *gate_bias = ctx.Input("GateBias"); @@ -228,8 +226,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, gate_weight, query, gate_bias, gate_out, gate_out); // gate_out = sigmoid(gate_out) * fmha_out - std::vector ins = {gate_out, fmha_out}; - std::vector outs = {gate_out}; + std::vector ins = {gate_out, fmha_out}; + std::vector outs = {gate_out}; phi::funcs::ElementwiseKernel( ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyFunctor()); } @@ -237,16 +235,16 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, template void 
ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *fmha_out, - const Tensor *gate_out_grad, - Tensor *query_grad, - Tensor *fmha_out_grad) { + const phi::DenseTensor *query, + const phi::DenseTensor *fmha_out, + const phi::DenseTensor *gate_out_grad, + phi::DenseTensor *query_grad, + phi::DenseTensor *fmha_out_grad) { const auto *gate_weight = ctx.Input("GateWeight"); const auto *gate_bias = ctx.Input("GateBias"); auto &dev_ctx = ctx.template device_context(); // Re-compute gate_bias_out - Tensor gate_bias_out; + phi::DenseTensor gate_bias_out; gate_bias_out.Resize(config.gate_out_dims); dev_ctx.Alloc(&gate_bias_out, gate_bias_out.numel() * sizeof(T)); @@ -260,8 +258,9 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, // Gradient of sigmoid(gate_bias_out) * fmha_out // Compute inplace and save gate_bias_out_grad to gate_bias_out. - std::vector ins = {gate_out_grad, &gate_bias_out, fmha_out}; - std::vector outs = {&gate_bias_out, fmha_out_grad}; + std::vector ins = { + gate_out_grad, &gate_bias_out, fmha_out}; + std::vector outs = {&gate_bias_out, fmha_out_grad}; phi::funcs::ElementwiseKernel, 2>( ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyGradFunctor()); @@ -284,8 +283,8 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, template void ComputeOutputLinearForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *fmha_or_gate_out, - Tensor *out) { + const phi::DenseTensor *fmha_or_gate_out, + phi::DenseTensor *out) { const auto *out_linear_weight = ctx.Input("OutLinearWeight"); const auto *out_linear_bias = ctx.Input("OutLinearBias"); @@ -303,8 +302,8 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx, template void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *input, - Tensor *input_grad) { + const phi::DenseTensor *input, + phi::DenseTensor *input_grad) { auto &dev_ctx = ctx.template device_context(); const auto *out_grad = ctx.Input(framework::GradVarName("Out")); @@ -382,15 +381,15 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { query)); // 1. Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) - Tensor *qkv_out = config.GetQKVOut(); + phi::DenseTensor *qkv_out = config.GetQKVOut(); ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out); AllocWithDebugInfo(dev_ctx, "qkv_transpose_out", qkv_transpose_out); } else { // 1. Separated QKV Matmul - Tensor *query_out = config.GetQueryOut(); - Tensor *key_out = config.GetKeyOut(); - Tensor *value_out = config.GetValueOut(); + phi::DenseTensor *query_out = config.GetQueryOut(); + phi::DenseTensor *key_out = config.GetKeyOut(); + phi::DenseTensor *value_out = config.GetValueOut(); ComputeSeparatedQKVMatmulForward( ctx, config, query, key, query_out, key_out, value_out); @@ -418,7 +417,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { } // 4. Output Linear - Tensor *fmha_or_gate_out = has_gating ? gate_out : fmha_out; + phi::DenseTensor *fmha_or_gate_out = has_gating ? 
gate_out : fmha_out; ComputeOutputLinearForward(ctx, config, fmha_or_gate_out, out); } }; @@ -461,12 +460,12 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { GateAttentionGradConfig config( dev_ctx, query, key, query_weight, qkv_weight, merge_qkv, has_gating); - Tensor fmha_out_grad; + phi::DenseTensor fmha_out_grad; fmha_out_grad.Resize(config.gate_out_dims); AllocWithDebugInfo(dev_ctx, "fmha_out_grad", &fmha_out_grad); if (has_gating) { // 1. Gradient of Output Linear: out = Linear(gate_out) - Tensor gate_out_grad; + phi::DenseTensor gate_out_grad; gate_out_grad.Resize(config.gate_out_dims); AllocWithDebugInfo(dev_ctx, "gate_out_grad", &gate_out_grad); ComputeOutputLinearBackward(ctx, config, gate_out, &gate_out_grad); @@ -505,7 +504,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool use_addto = has_gating ? true : false; if (merge_qkv) { // 4. Gradient of Merged QKV Matmul - Tensor *qkv_out_grad = config.GetQKVOutGrad(); + phi::DenseTensor *qkv_out_grad = config.GetQKVOutGrad(); ComputeMergedQKVMatmulBackward( ctx, config, query, qkv_out_grad, query_grad, use_addto); } else { @@ -515,9 +514,9 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { if (key_grad) { AllocWithDebugInfo(dev_ctx, "key_grad", key_grad); } - Tensor *query_out_grad = config.GetQueryOutGrad(); - Tensor *key_out_grad = config.GetKeyOutGrad(); - Tensor *value_out_grad = config.GetValueOutGrad(); + phi::DenseTensor *query_out_grad = config.GetQueryOutGrad(); + phi::DenseTensor *key_out_grad = config.GetKeyOutGrad(); + phi::DenseTensor *value_out_grad = config.GetValueOutGrad(); ComputeSeparatedQKVMatmulBackward(ctx, config, query, diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index 1f9cbf320fb50..013593176aa2d 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedGemmEpilogueOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index e5bab3cae4fab..05beddc52211b 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedGemmEpilogueKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index b1707ff55950d..687ce97068a35 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedGemmEpilogueXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc index 3a9bd15c101e9..e1be5afa0bd68 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { private: static constexpr const char *OpName = "FusedMultiTransformerINT8Op"; @@ -176,7 +174,7 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "TimeStep") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu index fa22ee8d57e65..a4c11b85b9eeb 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu @@ -62,7 +62,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - Tensor ln_mean, ln_var; + phi::DenseTensor ln_mean, ln_var; ln_mean.Resize({{bsz_seq}}); auto *ln_mean_data = dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); @@ -86,7 +86,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // (transA, transB, compute_bias) = (false, trans_qkvw, false) AttnMatmulINT8 qkv_compute( dev_ctx, bsz_seq, output_size, input_size, compute_bias); - Tensor qkv_out; + phi::DenseTensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); @@ -123,7 +123,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { out_seq_len += time_step_value; } - Tensor transpose_out_2, qk_out; + phi::DenseTensor transpose_out_2, qk_out; transpose_out_2.Resize({{3, bsz, num_head, seq_len, dim_head}}); auto *transpose_out_2_data = dev_ctx.Alloc(&transpose_out_2, transpose_out_2.numel() * sizeof(T)); @@ -131,9 +131,9 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); - Tensor softmax_out; - Tensor attn_dropout_mask_out, attn_dropout_out; - Tensor qktv_out, fmha_out; + phi::DenseTensor softmax_out; + phi::DenseTensor attn_dropout_mask_out, attn_dropout_out; + phi::DenseTensor qktv_out, fmha_out; softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *softmax_out_data = dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); @@ -170,7 +170,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); - Tensor bias_dropout_residual_out, dropout_mask_out; + phi::DenseTensor 
bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); @@ -190,7 +190,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { int dim_ffn = ffn1_weight_dim[0]; AttnMatmulINT8 ffn1_linear_compute( dev_ctx, bsz_seq, dim_ffn, dim_embed, false); - Tensor ffn1_out; + phi::DenseTensor ffn1_out; ffn1_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_out_data = dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); @@ -201,7 +201,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); FusedDropoutHelper fused_act_dropout_helper_for_post_layernorm( dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); - Tensor ffn1_dropout_out, ffn1_dropout_mask; + phi::DenseTensor ffn1_dropout_out, ffn1_dropout_mask; ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_dropout_out_data = dev_ctx.Alloc( &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T)); @@ -228,7 +228,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); // []. init workspace for cublasLt transform - Tensor input_workspace, output_workspace, cublaslt_workspace; + phi::DenseTensor input_workspace, output_workspace, cublaslt_workspace; // for input and output transform data is CUBLASLT_ORDER_COL32 format, int m_max = bsz_seq, k_max = std::max(dim_embed, dim_ffn), n_max = std::max({output_size, dim_embed, dim_ffn}); @@ -248,15 +248,15 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); - Tensor *from_tensor = out; - Tensor tmp_out; + phi::DenseTensor *from_tensor = out; + phi::DenseTensor tmp_out; tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); - Tensor *buf0 = nullptr; - Tensor *buf1 = nullptr; + phi::DenseTensor *buf0 = nullptr; + phi::DenseTensor *buf1 = nullptr; // step0: x --> buf1 // step1: buf1 --> buf0 @@ -293,9 +293,10 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { #endif // step2. qkv - const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + const phi::DenseTensor *qkv_bias = + qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha - const Tensor *bias = time_step ? nullptr : qkv_bias; + const phi::DenseTensor *bias = time_step ? nullptr : qkv_bias; if (!pre_layer_norm && i == 0) { qkv_compute.ComputeForward(qkv_weights[i], input_x, @@ -337,8 +338,9 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { #endif // step3. fmha - const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; - Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + const phi::DenseTensor *cache_kv = + cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + phi::DenseTensor *cache_kv_out = cache_kv ? 
cache_kv_outs[i] : nullptr; if (time_step) { // generation decoder stage // [2, batch_size, num_head, max_seq_len, head_size] diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index 94a89338a6205..92b782c44c77a 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedMultiTransformerOp : public framework::OperatorWithKernel { private: static constexpr const char *OpName = "FusedMultiTransformerOp"; @@ -143,7 +141,7 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "TimeStep") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index aeb00a7947cd6..5ca66cb132b05 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -40,7 +40,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ln_biases = ctx.MultiInput("LnBias"); auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - Tensor ln_mean, ln_var; + phi::DenseTensor ln_mean, ln_var; ln_mean.Resize({{bsz_seq}}); auto *ln_mean_data = dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); @@ -72,7 +72,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { input_size, /*compute_bias=*/false); - Tensor qkv_out; + phi::DenseTensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); @@ -116,7 +116,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { out_seq_len += cache_offset; } - Tensor q_transpose_out, kv_transpose_out, qk_out; + phi::DenseTensor q_transpose_out, kv_transpose_out, qk_out; q_transpose_out.Resize({{bsz, num_head, seq_len, dim_head}}); auto *q_transpose_out_data = dev_ctx.Alloc(&q_transpose_out, q_transpose_out.numel() * sizeof(T)); @@ -128,7 +128,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); - Tensor src_mask_out; + phi::DenseTensor src_mask_out; if (cache_offset > 0) { src_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *src_mask_out_data = @@ -136,7 +136,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { } // [2, bs, num_head, cache_seq_len + seq_len, head_dim] - Tensor pre_cache_kv_out; + phi::DenseTensor pre_cache_kv_out; if (cache_offset > 0) { pre_cache_kv_out.Resize( {{2, bsz, num_head, seq_len + cache_offset, dim_head}}); @@ -144,9 +144,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { &pre_cache_kv_out, pre_cache_kv_out.numel() * sizeof(T)); } - Tensor softmax_out; - Tensor attn_dropout_mask_out, attn_dropout_out; - Tensor qktv_out, fmha_out; + phi::DenseTensor softmax_out; + phi::DenseTensor attn_dropout_mask_out, attn_dropout_out; + phi::DenseTensor qktv_out, fmha_out; 
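// The scratch tensors declared above follow the allocate-on-demand idiom
// used throughout these kernels: Resize() fixes the shape, then the
// device context allocates typed storage. A minimal sketch (the shape is
// illustrative):
//   phi::DenseTensor scratch;
//   scratch.Resize({{bsz, num_head, seq_len, out_seq_len}});
//   auto *scratch_data =
//       dev_ctx.Alloc<T>(&scratch, scratch.numel() * sizeof(T));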
softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *softmax_out_data = dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); @@ -179,7 +179,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); - Tensor bias_dropout_residual_out, dropout_mask_out; + phi::DenseTensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); @@ -202,7 +202,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { const phi::DDim ffn1_input_shape({bsz_seq, dim_embed}); ffn1_cublas_linear.Setup(ffn1_input_shape, ffn1_weight_dim, false, false); - Tensor ffn1_out; + phi::DenseTensor ffn1_out; ffn1_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_out_data = dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); @@ -223,15 +223,15 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); - Tensor *from_tensor = out; - Tensor tmp_out; + phi::DenseTensor *from_tensor = out; + phi::DenseTensor tmp_out; tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); - Tensor *buf0 = nullptr; - Tensor *buf1 = nullptr; + phi::DenseTensor *buf0 = nullptr; + phi::DenseTensor *buf1 = nullptr; // step0: x --> buf1 // step1: buf1 --> buf0 @@ -270,9 +270,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step2. qkv - const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + const phi::DenseTensor *qkv_bias = + qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha - const Tensor *bias = time_step ? nullptr : qkv_bias; + const phi::DenseTensor *bias = time_step ? nullptr : qkv_bias; if (!pre_layer_norm && i == 0) { qkv_compute.ComputeForward( qkv_weights[i], input_x, bias, &qkv_out, &qkv_out); @@ -285,8 +286,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step3. fmha - const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; - Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + const phi::DenseTensor *cache_kv = + cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + phi::DenseTensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; if (time_step) { // generation decoder stage // [2, batch_size, num_head, max_seq_len, head_size] @@ -304,11 +306,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { time_step->data()[0], 1. / sqrt(dim_head)); } else if (cache_kv_out) { // generation context stage - const Tensor *pre_cache_kv_tensor = + const phi::DenseTensor *pre_cache_kv_tensor = pre_caches.size() > 0 ? pre_caches[i] : nullptr; - Tensor *pre_cache_kv_out_tmp = + phi::DenseTensor *pre_cache_kv_out_tmp = cache_offset > 0 ? &pre_cache_kv_out : nullptr; - Tensor *src_mask_tmp = cache_offset > 0 ? &src_mask_out : nullptr; + phi::DenseTensor *src_mask_tmp = + cache_offset > 0 ? 
&src_mask_out : nullptr; qkv_bias_add_transpose_split(dev_ctx, q_transpose_out_data, kv_transpose_out_data, @@ -554,7 +557,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ln_biases = ctx.MultiInput("LnBias"); auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - Tensor ln_mean, ln_var; + phi::DenseTensor ln_mean, ln_var; ln_mean.Resize({{bsz_seq}}); auto *ln_mean_data = dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); @@ -586,7 +589,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { input_size, /*compute_bias=*/false); - Tensor qkv_out; + phi::DenseTensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); @@ -630,7 +633,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { out_seq_len += cache_offset; } - Tensor q_transpose_out, kv_transpose_out, qk_out; + phi::DenseTensor q_transpose_out, kv_transpose_out, qk_out; q_transpose_out.Resize({{bsz, num_head, seq_len, dim_head}}); auto *q_transpose_out_data = dev_ctx.Alloc(&q_transpose_out, q_transpose_out.numel() * sizeof(T)); @@ -642,7 +645,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); - Tensor src_mask_out; + phi::DenseTensor src_mask_out; if (cache_offset > 0) { src_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *src_mask_out_data = @@ -650,7 +653,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { } // [2, bs, num_head, cache_seq_len + seq_len, head_dim] - Tensor pre_cache_kv_out; + phi::DenseTensor pre_cache_kv_out; if (cache_offset > 0) { pre_cache_kv_out.Resize( {{2, bsz, num_head, seq_len + cache_offset, dim_head}}); @@ -658,9 +661,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { &pre_cache_kv_out, pre_cache_kv_out.numel() * sizeof(T)); } - Tensor softmax_out; - Tensor attn_dropout_mask_out, attn_dropout_out; - Tensor qktv_out, fmha_out; + phi::DenseTensor softmax_out; + phi::DenseTensor attn_dropout_mask_out, attn_dropout_out; + phi::DenseTensor qktv_out, fmha_out; softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *softmax_out_data = dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); @@ -693,7 +696,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); - Tensor bias_dropout_residual_out, dropout_mask_out; + phi::DenseTensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); @@ -713,7 +716,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { int dim_ffn = ffn1_weight_dim[1]; auto ffn1_linear_compute = AttnMatMul( dev_ctx, false, false, bsz_seq, dim_ffn, dim_embed, false); - Tensor ffn1_out; + phi::DenseTensor ffn1_out; ffn1_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_out_data = dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); @@ -722,7 +725,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); FusedDropoutHelper fused_act_dropout_helper( dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); - Tensor ffn1_dropout_out, 
ffn1_dropout_mask; + phi::DenseTensor ffn1_dropout_out, ffn1_dropout_mask; ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_dropout_out_data = dev_ctx.Alloc( &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T)); @@ -744,15 +747,15 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); - Tensor *from_tensor = out; - Tensor tmp_out; + phi::DenseTensor *from_tensor = out; + phi::DenseTensor tmp_out; tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); - Tensor *buf0 = nullptr; - Tensor *buf1 = nullptr; + phi::DenseTensor *buf0 = nullptr; + phi::DenseTensor *buf1 = nullptr; // step0: x --> buf1 // step1: buf1 --> buf0 @@ -791,9 +794,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step2. qkv - const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + const phi::DenseTensor *qkv_bias = + qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha - const Tensor *bias = time_step ? nullptr : qkv_bias; + const phi::DenseTensor *bias = time_step ? nullptr : qkv_bias; if (!pre_layer_norm && i == 0) { qkv_compute.ComputeForward( qkv_weights[i], input_x, bias, &qkv_out, &qkv_out); @@ -806,8 +810,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step3. fmha - const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; - Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + const phi::DenseTensor *cache_kv = + cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + phi::DenseTensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; if (time_step) { // generation decoder stage // [2, batch_size, num_head, max_seq_len, head_size] @@ -825,11 +830,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { time_step->data()[0], 1. / sqrt(dim_head)); } else if (cache_kv_out) { // generation context stage - const Tensor *pre_cache_kv_tensor = + const phi::DenseTensor *pre_cache_kv_tensor = pre_caches.size() > 0 ? pre_caches[i] : nullptr; - Tensor *pre_cache_kv_out_tmp = + phi::DenseTensor *pre_cache_kv_out_tmp = cache_offset > 0 ? &pre_cache_kv_out : nullptr; - Tensor *src_mask_tmp = cache_offset > 0 ? &src_mask_out : nullptr; + phi::DenseTensor *src_mask_tmp = + cache_offset > 0 ? 
&src_mask_out : nullptr; qkv_bias_add_transpose_split(dev_ctx, q_transpose_out_data, kv_transpose_out_data, diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 69ac06206c62b..0500f76110f33 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -44,8 +44,6 @@ DECLARE_bool(gemm_use_half_precision_compute_type); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // for debug // #define _DEBUG_FUSED_MULTI_TRANSFORMER @@ -1119,11 +1117,11 @@ void fmha_launch_kernel(const Masked_multihead_attention_params &params, template void fmha(const phi::GPUContext &dev_ctx, - const Tensor &qkv_tensor, - const Tensor &qkv_bias_tensor, - const Tensor &src_mask_tensor, - Tensor *cache_kv_tensor, - Tensor *out_tensor, + const phi::DenseTensor &qkv_tensor, + const phi::DenseTensor &qkv_bias_tensor, + const phi::DenseTensor &src_mask_tensor, + phi::DenseTensor *cache_kv_tensor, + phi::DenseTensor *out_tensor, int batch_size, int max_seq_length, int num_head, diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 07cfb44a312bc..519ce1c6aca08 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -20,7 +20,6 @@ namespace paddle { namespace operators { #if CUDNN_VERSION >= 7100 -using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 814631bd87b47..fc7804f9c4e8c 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -160,26 +160,29 @@ void FusionGRUOpMaker::Make() { "variable-time length input sequence. The underlying tensor in " "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); - AddInput("H0", - "(Tensor, optional) The initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size, D is the hidden size.") + AddInput( + "H0", + "(phi::DenseTensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") .AsDispensable(); AddInput("WeightX", - "(Tensor) The FC weight with shape (M x 3D)," + "(phi::DenseTensor) The FC weight with shape (M x 3D)," "where M is the dim size of x, D is the hidden size. "); - AddInput("WeightH", - "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. " - "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" - "Acutally they are D x 2D and D x D two part weights." - "{W_update, W_reset; W_state}" - "{D x (D + D); D x D}"); + AddInput( + "WeightH", + "(phi::DenseTensor) (D x 3D) Same as GRUOp, where D is the hidden size. " + "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" + "Actually they are D x 2D and D x D two part weights." + "{W_update, W_reset; W_state}" + "{D x (D + D); D x D}"); AddInput("Bias", - "(Tensor, optional) (1 x 3D)." + "(phi::DenseTensor, optional) (1 x 3D)." "Almost same as GRUOp."
"Note: if have FC bias it should be added on this bias.") .AsDispensable(); - AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.") + AddOutput("ReorderedH0", + "(phi::DenseTensor) (N x D), which N is the min-batch size.") .AsIntermediate(); AddOutput("XX", "(phi::DenseTensor) the result after X * WeightX (size is T x 3D)" diff --git a/paddle/fluid/operators/fused/fusion_gru_op.h b/paddle/fluid/operators/fused/fusion_gru_op.h index 4df5042089053..94bf38068d0dd 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.h +++ b/paddle/fluid/operators/fused/fusion_gru_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionGRUOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b612d590ea1d9..c526fdc18428c 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -184,16 +184,17 @@ void FusionLSTMOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); AddInput("WeightX", - "(Tensor) the learnable weights of X." + "(phi::DenseTensor) the learnable weights of X." " - The shape is (M x 4D), where M is the dim size of x, D is the " "hidden size. " " - Weight = {W_cx, W_ix, W_fx, W_ox}"); - AddInput("WeightH", - "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput( + "WeightH", + "(phi::DenseTensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); AddInput("Bias", - "(Tensor) the learnable weights. Almost same as LSTMOp" + "(phi::DenseTensor) the learnable weights. Almost same as LSTMOp" "Note: we should add the fc bias into this (1x4D) in bias." "input-hidden bias weight and peephole connections weight if " "setting `use_peepholes` True. " @@ -204,13 +205,15 @@ void FusionLSTMOpMaker::Make() { " - The shape is (1 x 7D). " " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); AddInput("H0", - "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) the initial hidden " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size and D is the hidden size.") .AsDispensable(); AddInput("C0", - "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) (the initial cell " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size. 
`H0` and `C0` can be NULL but only at the same time.") @@ -234,7 +237,7 @@ void FusionLSTMOpMaker::Make() { AddOutput("BatchedCell", "(phi::DenseTensor) (T x D).").AsIntermediate(); AddOutput("ReorderedH0", "(phi::DenseTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(phi::DenseTensor) (N x D).").AsIntermediate(); - AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.") + AddOutput("CheckedCell", "(phi::DenseTensor) (2 x D) only for peephole.") .AsIntermediate(); AddAttr("use_peepholes", "(bool, default: True) " diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.h b/paddle/fluid/operators/fused/fusion_lstm_op.h index 590d4bd7c2914..93f8eb981bbd9 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.h +++ b/paddle/fluid/operators/fused/fusion_lstm_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 7bad7c78edc75..bab06f55be856 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -107,10 +107,12 @@ framework::OpKernelType FusionRepeatedFCReluOp::GetExpectedKernelType( void FusionRepeatedFCReluOpMaker::Make() { AddInput("X", "(phi::DenseTensor) Input tensors of this operator."); - AddInput("W", "(Tensor) The weight tensors of this operator.").AsDuplicable(); - AddInput("Bias", "(Tensor) The bias tensors of this operator.") + AddInput("W", "(phi::DenseTensor) The weight tensors of this operator.") .AsDuplicable(); - AddOutput("ReluOut", "(Tensor) The output tensor of each relu operator.") + AddInput("Bias", "(phi::DenseTensor) The bias tensors of this operator.") + .AsDuplicable(); + AddOutput("ReluOut", + "(phi::DenseTensor) The output tensor of each relu operator.") .AsDuplicable() .AsIntermediate(); AddOutput("Out", "(phi::DenseTensor) Output tensor of this operator."); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h index 2cfb404913c42..16025bf5181b6 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionRepeatedFCReluOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index cb08e4fbff258..c9166919636bf 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -102,14 +102,16 @@ void FusionSeqConvEltAddReluOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); // PaddingData only support false yet, should be ensured at pass. - AddInput("Filter", - "(Tensor) same as the input(Filter) of sequence conv op is an " - "learnable parameter." 
- "This is a tensor with shape (K, N), where K is the " - "context_length * dim size of x, N is the output feature size."); - AddInput("Bias", - "(Tensor) the learnable weights. shape (1, N), where N is the " - "output feature size"); + AddInput( + "Filter", + "(phi::DenseTensor) same as the input(Filter) of sequence conv op is an " + "learnable parameter." + "This is a tensor with shape (K, N), where K is the " + "context_length * dim size of x, N is the output feature size."); + AddInput( + "Bias", + "(phi::DenseTensor) the learnable weights. shape (1, N), where N is the " + "output feature size"); AddOutput( "Out", "(phi::DenseTensor) the output(Out) is a LodTensor, which support " @@ -117,7 +119,7 @@ void FusionSeqConvEltAddReluOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T, N), where, T is the " "total time steps in this mini-batch, N is the output feature size."); AddOutput("ColMat", - "(Tensor) (T, K), where T is where T is the " + "(phi::DenseTensor) (T, K), where T is where T is the " "total time steps in this mini-batch, K is height of Filter") .AsIntermediate(); AddAttr("contextLength", diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h index d1b7ae835821f..96f231f9a3cd5 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index bcc8ee894543f..df4cbba1dec15 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -114,12 +114,13 @@ void FusionSeqExpandConcatFCOpMaker::Make() { "ref lod " "for sequence expand, and the rest input should have same lod.") .AsDuplicable(); - AddInput("FCWeight", "(Tensor) the weights of fc."); - AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable(); + AddInput("FCWeight", "(phi::DenseTensor) the weights of fc."); + AddInput("FCBias", "(phi::DenseTensor, optional) the bias of fc.") + .AsDispensable(); AddOutput("Out", "(phi::DenseTensor) Output LodTensor."); AddOutput( "FCOut", - "(Tensor) the intermediate tensor to keep the result of fc." + "(phi::DenseTensor) the intermediate tensor to keep the result of fc." "Shape is (N x D), where N is the batch size, D is the output dim of fc") .AsIntermediate(); AddAttr("fc_activation", diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h index 9c611025351e8..495de5f233445 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h index 6dc29b23cbb89..2e2d6e07dc7e5 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqPoolConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index 41944f4bc095f..e3953f9e6abc0 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -77,7 +77,8 @@ void FusionSeqPoolCVMConcatOpMaker::Make() { AddInput("X", "(phi::DenseTensor) Input tensors of this operator.") .AsDuplicable(); AddInput("CVM", - "(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch " + "(phi::DenseTensor), a 2-D phi::DenseTensor with shape [N x 2], " + "where N is the batch " "size, 2 is show and click."); AddOutput("Out", "(phi::DenseTensor) Output tensor of concat operator."); AddAttr("pooltype", diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h index 24a02553044b0..b9d7d0dfc340e 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqPoolCVMConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index b7a01b7955887..8d7f792f3c25b 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -70,12 +70,12 @@ framework::OpKernelType FusionSquaredMatSubOp::GetExpectedKernelType( } void FusionSquaredMatSubOpMaker::Make() { - AddInput("X", "(Tensor) Input Mat A of this operator."); - AddInput("Y", "(Tensor) Input Mat B of this operator."); - AddOutput("SquaredX", "(Tensor) Squared X.").AsIntermediate(); - AddOutput("SquaredY", "(Tensor) Squared Y.").AsIntermediate(); - AddOutput("SquaredXY", "(Tensor) Squared X*Y.").AsIntermediate(); - AddOutput("Out", "(Tensor) Output tensor of concat operator."); + AddInput("X", "(phi::DenseTensor) Input Mat A of this operator."); + AddInput("Y", "(phi::DenseTensor) Input Mat B of this operator."); + AddOutput("SquaredX", "(phi::DenseTensor) Squared X.").AsIntermediate(); + AddOutput("SquaredY", "(phi::DenseTensor) Squared Y.").AsIntermediate(); + AddOutput("SquaredXY", "(phi::DenseTensor) Squared X*Y.").AsIntermediate(); + AddOutput("Out", "(phi::DenseTensor) Output tensor of concat operator."); AddAttr("scalar", "The scalar on output matrix.").SetDefault(1.f); AddComment(R"DOC( Fusion Squared Matrix and substrct operator. 
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h index 7707bb14fcefe..fc6a54fd9eb03 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar class FusionSquaredMatSubOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 2e8b6f7d0a6b8..ba2b71ff6ffd7 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -273,7 +273,6 @@ template class MultiHeadMatMulV2Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = phi::DenseTensor; auto *input = context.Input("Input"); auto *w = context.Input("W"); auto *bias = context.Input("Bias"); @@ -296,7 +295,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { int batch = input_dims[0]; int seq_len = input_dims[1]; int hidden = input_dims[2]; - Tensor temp_bias_tensor; + phi::DenseTensor temp_bias_tensor; // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted if (bias_qk && bias_qk->numel() == (batch * seq_len)) { VLOG(4) << "Do broadcasted bias_qk from [batch, 1, 1, seq_len]"; @@ -343,13 +342,13 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { device_ctx.template Alloc(out, out->numel() * sizeof(T)); // (B*S, hidden) - const Tensor input_matrix = + const phi::DenseTensor input_matrix = framework::ReshapeToMatrix(*input, 2 /*x_num_col_dims */); // (hidden, 3 * all_head_size) - const Tensor w_matrix = + const phi::DenseTensor w_matrix = framework::ReshapeToMatrix(*w, 1 /*y_num_col_dims*/); - Tensor temp_out_tensor; + phi::DenseTensor temp_out_tensor; auto temp_out_dims = phi::make_ddim({batch, seq_len, 3, head_number, head_size}); temp_out_tensor.Resize( @@ -364,7 +363,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { VLOG(2) << temp_out_tensor; // temp_out_tensor.Resize(temp_out_dims); - Tensor multihead_temp_tensor; + phi::DenseTensor multihead_temp_tensor; // B * head_number * S * S * 1 + B * S * 3 * N * H int scratch_size = batch * head_number * seq_len * seq_len * 1; multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()}); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc index 76f173c2d6d09..b449ca3bbe8da 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ResNetBasicBlockOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index 8310116849611..f6b2d30453f42 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -21,8 +21,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class ResnetBasicBlockAttr { public: explicit ResnetBasicBlockAttr(const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index b2d44057365b9..4b46dc76b260e 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // Shape of bitmask static framework::DDim GetBitmaskDims(std::vector out_shape) { int c = out_shape.back(); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index 02bde0ef04ff2..446d289a1b959 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ResNetUnitKernel : public framework::OpKernel { public: @@ -39,20 +37,23 @@ class ResNetUnitKernel : public framework::OpKernel { "ResNetUnitOp only supports float16 for now.")); // input x - const Tensor *input_x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); + const phi::DenseTensor *input_x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *bias_x = ctx.Input("BiasX"); // norm conv - Tensor *conv_out_x = ctx.Output("ConvX"); + phi::DenseTensor *conv_out_x = ctx.Output("ConvX"); // bn finalize - Tensor *saved_mean_x = ctx.Output("SavedMeanX"); - Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); - Tensor *running_mean_x = ctx.Output("RunningMeanX"); - Tensor *running_var_x = ctx.Output("RunningVarX"); + phi::DenseTensor *saved_mean_x = ctx.Output("SavedMeanX"); + phi::DenseTensor *saved_invstd_x = + ctx.Output("SavedInvstdX"); + phi::DenseTensor *running_mean_x = + ctx.Output("RunningMeanX"); + phi::DenseTensor *running_var_x = + ctx.Output("RunningVarX"); // sbar - Tensor *output = ctx.Output("Y"); - Tensor *bitmask = ctx.Output("BitMask"); + phi::DenseTensor *output = ctx.Output("Y"); + phi::DenseTensor *bitmask = ctx.Output("BitMask"); // attrs int padding = ctx.Attr("padding"); int stride = ctx.Attr("stride"); @@ -93,8 +94,8 @@ class ResNetUnitKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); // 1. Conv - Tensor sum_x; - Tensor sum_of_squares_x; + phi::DenseTensor sum_x; + phi::DenseTensor sum_of_squares_x; sum_x.Resize(param_dims); sum_of_squares_x.Resize(param_dims); CudnnNormConvolution conv_x_op(dev_ctx, @@ -109,8 +110,8 @@ class ResNetUnitKernel : public framework::OpKernel { dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x, &sum_of_squares_x); // 2. 
BN - Tensor equiv_scale_x; - Tensor equiv_bias_x; + phi::DenseTensor equiv_scale_x; + phi::DenseTensor equiv_bias_x; equiv_scale_x.Resize(param_dims); equiv_bias_x.Resize(param_dims); CudnnBNStatsFinalize bn_x_op(dev_ctx, param_shape); @@ -140,24 +141,28 @@ class ResNetUnitKernel : public framework::OpKernel { bitmask_shape); if (has_shortcut) { // input z - const Tensor *input_z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); + const phi::DenseTensor *input_z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); // norm conv - Tensor *conv_out_z = ctx.Output("ConvZ"); + phi::DenseTensor *conv_out_z = ctx.Output("ConvZ"); // bn finalize - Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); - Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); - Tensor *running_mean_z = ctx.Output("RunningMeanZ"); - Tensor *running_var_z = ctx.Output("RunningVarZ"); + phi::DenseTensor *saved_mean_z = + ctx.Output("SavedMeanZ"); + phi::DenseTensor *saved_invstd_z = + ctx.Output("SavedInvstdZ"); + phi::DenseTensor *running_mean_z = + ctx.Output("RunningMeanZ"); + phi::DenseTensor *running_var_z = + ctx.Output("RunningVarZ"); auto input_z_shape = phi::vectorize(input_z->dims()); auto filter_z_shape = phi::vectorize(filter_z->dims()); // 3.1 Conv for second input - Tensor sum_z; - Tensor sum_of_squares_z; + phi::DenseTensor sum_z; + phi::DenseTensor sum_of_squares_z; sum_z.Resize(param_dims); sum_of_squares_z.Resize(param_dims); CudnnNormConvolution conv_z_op(dev_ctx, @@ -172,8 +177,8 @@ class ResNetUnitKernel : public framework::OpKernel { dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z, &sum_of_squares_z); // 3.2 BN for second input - Tensor equiv_scale_z; - Tensor equiv_bias_z; + phi::DenseTensor equiv_scale_z; + phi::DenseTensor equiv_bias_z; equiv_scale_z.Resize(param_dims); equiv_bias_z.Resize(param_dims); CudnnBNStatsFinalize bn_z_op(dev_ctx, param_shape); @@ -203,7 +208,7 @@ class ResNetUnitKernel : public framework::OpKernel { output, bitmask); } else { - const Tensor *input_z = + const phi::DenseTensor *input_z = fuse_add ? 
ctx.Input("Z") : nullptr; sbar_op.Forward(dev_ctx, *conv_out_x, @@ -231,26 +236,29 @@ class ResNetUnitGradKernel : public framework::OpKernel { platform::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); - const Tensor *y_grad = + const phi::DenseTensor *y_grad = ctx.Input(framework::GradVarName("Y")); - const Tensor *x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); - const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); - const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); - - const Tensor *conv_out_x = ctx.Input("ConvX"); - const Tensor *output = ctx.Input("Y"); - const Tensor *bitmask = ctx.Input("BitMask"); - - Tensor *x_grad = ctx.Output(framework::GradVarName("X")); - Tensor *filter_x_grad = + const phi::DenseTensor *x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *bias_x = ctx.Input("BiasX"); + const phi::DenseTensor *saved_mean_x = + ctx.Input("SavedMeanX"); + const phi::DenseTensor *saved_invstd_x = + ctx.Input("SavedInvstdX"); + + const phi::DenseTensor *conv_out_x = ctx.Input("ConvX"); + const phi::DenseTensor *output = ctx.Input("Y"); + const phi::DenseTensor *bitmask = ctx.Input("BitMask"); + + phi::DenseTensor *x_grad = + ctx.Output(framework::GradVarName("X")); + phi::DenseTensor *filter_x_grad = ctx.Output(framework::GradVarName("FilterX")); - Tensor *scale_x_grad = + phi::DenseTensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); - Tensor *bias_x_grad = + phi::DenseTensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); int padding = ctx.Attr("padding"); @@ -276,7 +284,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad - Tensor conv_out_x_grad; + phi::DenseTensor conv_out_x_grad; conv_out_x_grad.Resize(conv_out_x->dims()); CudnnScaleBiasAddRelu sbar_x_op(dev_ctx, act_type, @@ -295,27 +303,28 @@ class ResNetUnitGradKernel : public framework::OpKernel { // ScaleBiasAddRelu // | // Y - const Tensor *z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); - const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); - const Tensor *saved_invstd_z = + const phi::DenseTensor *z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); + const phi::DenseTensor *saved_mean_z = + ctx.Input("SavedMeanZ"); + const phi::DenseTensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); - const Tensor *conv_out_z = ctx.Input("ConvZ"); + const phi::DenseTensor *conv_out_z = ctx.Input("ConvZ"); - Tensor *z_grad = + phi::DenseTensor *z_grad = ctx.Output(framework::GradVarName("Z")); - Tensor *filter_z_grad = + phi::DenseTensor *filter_z_grad = ctx.Output(framework::GradVarName("FilterZ")); - Tensor *scale_z_grad = + phi::DenseTensor *scale_z_grad = ctx.Output(framework::GradVarName("ScaleZ")); - Tensor *bias_z_grad = + phi::DenseTensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); // 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad and z_grad_temp - Tensor z_grad_temp; + phi::DenseTensor z_grad_temp; z_grad_temp.Resize(conv_out_z->dims()); sbar_x_op.Backward(dev_ctx, *y_grad, @@ -332,7 +341,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { eps); // 1.2 bn backward for z, get conv_out_z_grad, dscale_z, dbias_z - Tensor conv_out_z_grad; + phi::DenseTensor conv_out_z_grad; conv_out_z_grad.Resize(conv_out_z->dims()); CudnnScaleBiasAddRelu sbar_z_op( dev_ctx, "", false, false, output_shape, param_shape, bitmask_shape); @@ -366,7 +375,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { } else { // 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad (and z_grad) - Tensor *z_grad = + phi::DenseTensor *z_grad = fuse_add ? ctx.Output(framework::GradVarName("Z")) : nullptr; sbar_x_op.Backward(dev_ctx, diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index 80986761c7cba..1e2741cde5d9e 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ResNetUnitXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -35,19 +33,22 @@ class ResNetUnitXPUKernel : public framework::OpKernel { bool is_nchw = (ctx.Attr("data_format") == "NCHW"); // input x - const Tensor *input_x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); + const phi::DenseTensor *input_x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *bias_x = ctx.Input("BiasX"); // output x - Tensor *conv_out_x = ctx.Output("ConvX"); - Tensor *saved_mean_x = ctx.Output("SavedMeanX"); - Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); - Tensor *running_mean_x = ctx.Output("RunningMeanX"); - Tensor *running_var_x = ctx.Output("RunningVarX"); + phi::DenseTensor *conv_out_x = ctx.Output("ConvX"); + phi::DenseTensor *saved_mean_x = ctx.Output("SavedMeanX"); + phi::DenseTensor *saved_invstd_x = + ctx.Output("SavedInvstdX"); + phi::DenseTensor *running_mean_x = + ctx.Output("RunningMeanX"); + phi::DenseTensor *running_var_x = + ctx.Output("RunningVarX"); - Tensor *output = ctx.Output("Y"); + phi::DenseTensor *output = ctx.Output("Y"); // attrs int padding = ctx.Attr("padding"); @@ -101,16 +102,20 @@ class ResNetUnitXPUKernel : public framework::OpKernel { std::vector w_maxlist = {nullptr}; if (has_shortcut) { // input z - const Tensor *input_z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); - - Tensor *conv_out_z = ctx.Output("ConvZ"); - Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); - Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); - Tensor *running_mean_z = ctx.Output("RunningMeanZ"); - Tensor *running_var_z = ctx.Output("RunningVarZ"); + const phi::DenseTensor *input_z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); + + phi::DenseTensor *conv_out_z = ctx.Output("ConvZ"); + phi::DenseTensor *saved_mean_z = + ctx.Output("SavedMeanZ"); + phi::DenseTensor *saved_invstd_z = + ctx.Output("SavedInvstdZ"); + phi::DenseTensor *running_mean_z = + ctx.Output("RunningMeanZ"); + phi::DenseTensor *running_var_z = + ctx.Output("RunningVarZ"); x_list.push_back(reinterpret_cast(input_z->data())); w_list.push_back(reinterpret_cast(filter_z->data())); @@ -137,7 +142,7 @@ class ResNetUnitXPUKernel : public framework::OpKernel { w_maxlist.push_back(nullptr); } else { if (fuse_add) { - const Tensor *input_z = ctx.Input("Z"); + const phi::DenseTensor *input_z = ctx.Input("Z"); auto input_z_shape = phi::vectorize(input_z->dims()); x_list.push_back(reinterpret_cast(input_z->data())); x_shape_list.push_back(input_z_shape); @@ -189,22 +194,25 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("It must use XPUPlace.")); bool is_nchw = (ctx.Attr("data_format") == "NCHW"); - const Tensor *y_grad = + const phi::DenseTensor *y_grad = ctx.Input(framework::GradVarName("Y")); - const Tensor *x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); - const 
Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); - const Tensor *conv_out_x = ctx.Input("ConvX"); - const Tensor *output = ctx.Input("Y"); - - Tensor *x_grad = ctx.Output(framework::GradVarName("X")); - Tensor *filter_x_grad = + const phi::DenseTensor *x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *saved_mean_x = + ctx.Input("SavedMeanX"); + const phi::DenseTensor *saved_invstd_x = + ctx.Input("SavedInvstdX"); + const phi::DenseTensor *conv_out_x = ctx.Input("ConvX"); + const phi::DenseTensor *output = ctx.Input("Y"); + + phi::DenseTensor *x_grad = + ctx.Output(framework::GradVarName("X")); + phi::DenseTensor *filter_x_grad = ctx.Output(framework::GradVarName("FilterX")); - Tensor *scale_x_grad = + phi::DenseTensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); - Tensor *bias_x_grad = + phi::DenseTensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); int padding = ctx.Attr("padding"); @@ -265,21 +273,22 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { // ScaleBiasAddRelu // | // Y - const Tensor *z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); - const Tensor *saved_invstd_z = + const phi::DenseTensor *z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *saved_mean_z = + ctx.Input("SavedMeanZ"); + const phi::DenseTensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); - const Tensor *conv_out_z = ctx.Input("ConvZ"); + const phi::DenseTensor *conv_out_z = ctx.Input("ConvZ"); - Tensor *z_grad = + phi::DenseTensor *z_grad = ctx.Output(framework::GradVarName("Z")); - Tensor *filter_z_grad = + phi::DenseTensor *filter_z_grad = ctx.Output(framework::GradVarName("FilterZ")); - Tensor *scale_z_grad = + phi::DenseTensor *scale_z_grad = ctx.Output(framework::GradVarName("ScaleZ")); - Tensor *bias_z_grad = + phi::DenseTensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); x_list.push_back(reinterpret_cast(z->data())); w_list.push_back(reinterpret_cast(filter_z->data())); diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index 96646071567d5..f6fd97f918c07 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -29,7 +29,6 @@ template class SkipLayerNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = phi::DenseTensor; auto *X = context.Input("X"); auto *Y = context.Input("Y"); auto *scale = context.Input("Scale"); diff --git a/paddle/fluid/operators/fused/xpu_fused_common_function.h b/paddle/fluid/operators/fused/xpu_fused_common_function.h index 1a1ec8c47f9ba..63a22838e8c35 100644 --- a/paddle/fluid/operators/fused/xpu_fused_common_function.h +++ b/paddle/fluid/operators/fused/xpu_fused_common_function.h @@ -19,14 +19,13 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; struct XPUDropoutParam { float dropout_prob; bool is_upscale_in_train; bool is_test; bool fix_seed; - const Tensor *tensor_seed; + const phi::DenseTensor *tensor_seed; int seed_val; XPUDropoutParam() { @@ -61,8 +60,9 @@ struct XPUDropoutParam { str_seed = str_seed + "Seed"; } - tensor_seed = - 
context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + tensor_seed = context.HasInput(str_seed) + ? context.Input(str_seed) + : nullptr; if (tensor_seed) { seed_val = *(tensor_seed->data()); } else { @@ -74,7 +74,7 @@ struct XPUDropoutParam { bool is_upscale_in_train_, bool is_test_, bool fix_seed_, - const Tensor *tensor_seed, + const phi::DenseTensor *tensor_seed, int seed_val_) { dropout_prob = dropout_prob_; is_upscale_in_train = is_upscale_in_train_; @@ -108,8 +108,9 @@ struct XPUDropoutParam { } else { str_seed = str_seed + "Seed"; } - tensor_seed = - context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + tensor_seed = context.HasInput(str_seed) + ? context.Input(str_seed) + : nullptr; if (tensor_seed) { seed_val = *(tensor_seed->data()); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu index 696cab20db714..88d589f85b0ec 100644 --- a/paddle/fluid/operators/fused/yolo_box_head_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -67,7 +67,6 @@ template class YoloBoxHeadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using Tensor = phi::DenseTensor; auto* x = context.Input("X"); auto* out = context.Output("Out"); auto anchors = context.Attr>("anchors"); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 072f0374c5b82..fc01d7027f31d 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -319,7 +319,6 @@ template class YoloBoxPostKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using Tensor = phi::DenseTensor; // prepare inputs std::vector boxes_input(3); std::vector> boxes_input_dims(3); diff --git a/paddle/fluid/operators/gather_nd_op_mlu.cc b/paddle/fluid/operators/gather_nd_op_mlu.cc index b6c96e3c2edd5..93b20c86af860 100644 --- a/paddle/fluid/operators/gather_nd_op_mlu.cc +++ b/paddle/fluid/operators/gather_nd_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GatherNdMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 5cea840b4aec5..6629d369db0c6 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc index b579b3175d396..1c6b2e6c1a095 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cc +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -16,8 +16,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class TensorAssign { public: template @@ -50,7 +48,7 @@ template struct cpu_gather_scatter_functor { template - void operator()(Tensor self, + void operator()(phi::DenseTensor self, int dim, const phi::DenseTensor& index, const phi::DenseTensor& src, @@ -130,10 +128,10 @@ struct cpu_gather_scatter_functor { }; template -void cpu_gather_kernel(Tensor self, +void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_assign_kernel(Tensor self, +void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_add_kernel(Tensor self, +void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_mul_kernel(Tensor self, +void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_input_grad_kernel(Tensor self, +void cpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor output, + phi::DenseTensor output, const platform::DeviceContext& ctx) { auto* index_data = index.data(); auto* output_data = output.data(); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index 2f17b946c6149..1cb4e4a4e9d78 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class TensorAssign { public: template @@ -107,10 +105,10 @@ template struct gpu_gather_scatter_functor { template - void operator()(Tensor self, + void operator()(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const std::string& method_name, const func_t& reduce_op, const platform::DeviceContext& ctx) { @@ -160,10 +158,10 @@ struct gpu_gather_scatter_functor { }; // struct gpu_gather_scatter_functor template -void gpu_gather_kernel(Tensor self, +void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_assign_kernel(Tensor self, +void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_add_kernel(Tensor self, +void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_mul_kernel(Tensor self, +void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_input_grad_kernel(Tensor self, +void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor grad, + phi::DenseTensor grad, const platform::DeviceContext& ctx) { auto* index_data = index.data(); auto* grad_data = grad.data(); diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/fluid/operators/gather_scatter_kernel.h index b97451b488b92..9cf3c3e33009a 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.h +++ b/paddle/fluid/operators/gather_scatter_kernel.h @@ -30,87 +30,85 @@ namespace operators { Instantiate_Template_Function_index_t(func, unsigned char) #define Instantiate_Template_Function_index_t(func, tensor_t) \ - template void func(Tensor input, \ + template void func(phi::DenseTensor input, \ int dim, \ const phi::DenseTensor& index, \ - Tensor result, \ + phi::DenseTensor result, \ const platform::DeviceContext& ctx); \ - template void func(Tensor input, \ + template void func(phi::DenseTensor input, \ int dim, \ const phi::DenseTensor& index, \ - Tensor result, \ + phi::DenseTensor result, \ const platform::DeviceContext& ctx); -using Tensor = phi::DenseTensor; - template -void cpu_gather_kernel(Tensor self, +void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); template -void cpu_scatter_assign_kernel(Tensor self, +void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void cpu_scatter_add_kernel(Tensor self, +void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void cpu_scatter_mul_kernel(Tensor self, +void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template 
-void cpu_scatter_input_grad_kernel(Tensor self, +void cpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); template -void gpu_gather_kernel(Tensor self, +void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); template -void gpu_scatter_assign_kernel(Tensor self, +void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void gpu_scatter_add_kernel(Tensor self, +void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void gpu_scatter_mul_kernel(Tensor self, +void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void gpu_scatter_input_grad_kernel(Tensor self, +void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index ee095c598bc1b..0f81d7fec3184 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/gaussian_random_op_mlu.cc b/paddle/fluid/operators/gaussian_random_op_mlu.cc index a70ddc428d840..5128cc9502581 100644 --- a/paddle/fluid/operators/gaussian_random_op_mlu.cc +++ b/paddle/fluid/operators/gaussian_random_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class MLUGaussianRandomKernel : public framework::OpKernel { public: @@ -30,7 +29,7 @@ class MLUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - Tensor cpu_tensor(tensor->type()); + phi::DenseTensor cpu_tensor(tensor->type()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); std::normal_distribution dist(mean, std); diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc index 3523eb7379399..5e3fa3dbef5e6 100644 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/gaussian_random_op_npu.cc @@ -22,7 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class NPUGaussianRandomKernel : public framework::OpKernel { public: @@ -32,7 +31,7 @@ class NPUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); std::normal_distribution dist(mean, std); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index f462336b412a3..7f6d5be9d0c73 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GeluNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 2e703282bf932..39767b5e20a87 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -49,8 +49,6 @@ constexpr int WARP_SIZE = 32; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct MaxFunctor { T cap; diff --git a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h index 278bbd5efd723..f5ec87f23c88b 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.h +++ b/paddle/fluid/operators/graph_khop_sampler_op.h @@ -28,8 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void SampleUniqueNeighbors(bidiiter begin, bidiiter end, int num_samples) { int left_num = std::distance(begin, end); diff --git a/paddle/fluid/operators/grid_sampler_op_mlu.cc b/paddle/fluid/operators/grid_sampler_op_mlu.cc index f71969d8b551c..07aa025a9a26c 100644 --- a/paddle/fluid/operators/grid_sampler_op_mlu.cc +++ b/paddle/fluid/operators/grid_sampler_op_mlu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GridSamplerMLUKernel : public framework::OpKernel { public: @@ -60,13 +58,13 @@ class GridSamplerMLUKernel : public framework::OpKernel { platform::errors::Unavailable( "Only support zeros padding_mode in mlu grid_sample kernel.")); - Tensor trans_input(input->dtype()); + phi::DenseTensor trans_input(input->dtype()); // transpose input from NCHW to NHWC const std::vector perm_to_nhwc = {0, 2, 3, 1}; TransposeFromMLUTensor( ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/); - Tensor tmp_output(output->dtype()); + phi::DenseTensor tmp_output(output->dtype()); tmp_output.mutable_data({n, out_h, out_w, c}, ctx.GetPlace()); MLUCnnlGridSampleDesc grid_sample_desc(mode, padding_mode, align_corners); diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 3d6566d62b2a7..7331c792ea568 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; class GroupNormOp : public framework::OperatorWithKernel { @@ -123,16 +122,16 @@ class GroupNormGradOp : public framework::OperatorWithKernel { var, platform::errors::InvalidArgument( "Input(Y@GRAD) of GroupNormGradOp should not be null")); - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } - PADDLE_ENFORCE_NOT_NULL( - t, - platform::errors::InvalidArgument( - "Input(Y@GRAD) Tensor of GroupNormGradOp should not be null")); + PADDLE_ENFORCE_NOT_NULL(t, + platform::errors::InvalidArgument( + "Input(Y@GRAD) phi::DenseTensor of " + "GroupNormGradOp should not be null")); return framework::OpKernelType(framework::TransToProtoVarType(t->dtype()), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 6b2ba1670a3b7..9cb4e54ac0054 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -291,7 +291,7 @@ class GroupNormKernel : public framework::OpKernel { var->mutable_data(ctx.GetPlace()); phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; + phi::DenseTensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); auto* x_data = x->data(); auto* y_data = y->data(); @@ -642,7 +642,7 @@ class GroupNormGradKernel : public framework::OpKernel { phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor ds, db; + phi::DenseTensor ds, db; ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); db.mutable_data({x_dims[0], C}, ctx.GetPlace()); T* ds_data = ds.data(); @@ -728,7 +728,7 @@ class GroupNormGradKernel : public framework::OpKernel { // p1 = scale * var_inv // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); - Tensor p1, p2, p3; + phi::DenseTensor p1, p2, p3; p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); @@ -770,12 +770,12 @@ class GroupNormGradKernel : public framework::OpKernel { set_zero(dev_ctx, d_bias, static_cast(0)); } - Tensor temp_var; + phi::DenseTensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); set_zero(dev_ctx, &temp_var, static_cast(0)); T* temp_var_data = temp_var.data(); - Tensor temp_mean; + phi::DenseTensor temp_mean; temp_mean.mutable_data(var->dims(), ctx.GetPlace()); set_zero(dev_ctx, &temp_mean, static_cast(0)); T* temp_mean_data = temp_mean.data(); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 0ce89b4625a13..95cdeefc783f4 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -28,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index 5fded4cffc713..2c0dec9dd4d0b 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct GroupNormFunction { public: @@ -103,14 +101,14 @@ struct GroupNormFunction { const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - Tensor ReduceMeanToNG(const phi::DenseTensor* x, - const DataLayout& data_layout, - const int64_t N, - const int64_t C, - const int64_t H, - const int64_t W, - const int G) { - Tensor y(x->type()); + phi::DenseTensor ReduceMeanToNG(const phi::DenseTensor* x, + const DataLayout& data_layout, + const int64_t N, + const int64_t C, + const int64_t H, + const int64_t W, + const int G) { + phi::DenseTensor y(x->type()); // y.mutable_data( {N,G,1}, place ); if (data_layout == DataLayout::kNCHW) { y.mutable_data({N, G, 1}, place); @@ -119,7 +117,7 @@ struct GroupNormFunction { } else { y.mutable_data({N, 1, G}, place); // shape of x is [N, C*H*W/G, G] - Tensor x_trans(x->type()); + phi::DenseTensor x_trans(x->type()); x_trans.mutable_data({N, G, C * H * W / G}, place); this->Transpose(x, &x_trans, std::vector{0, 2, 1}); this->ReduceMean(&x_trans, &y, std::vector{2}); @@ -150,7 +148,7 @@ class GroupNormNPUKernel : public framework::OpKernel { const auto groups = ctx.Attr("groups"); auto place = ctx.GetPlace(); - Tensor xnorm(x->type()); + phi::DenseTensor xnorm(x->type()); xnorm.mutable_data(x->dims(), place); GroupNormFunction F(ctx); if (data_layout != DataLayout::kNCHW) { @@ -173,12 +171,12 @@ class GroupNormNPUKernel : public framework::OpKernel { F.ReduceMean(&xnorm, mean, axis); F.Sub(&xnorm, mean, &xnorm); - Tensor sqr(x->type()); + phi::DenseTensor sqr(x->type()); sqr.mutable_data(xnorm.dims(), place); F.Mul(&xnorm, &xnorm, &sqr); F.ReduceMean(&sqr, var, axis); - Tensor std(x->type()); + phi::DenseTensor std(x->type()); std.mutable_data(var->dims(), place); F.Adds(var, epsilon, &std); F.Sqrt(&std, &std); @@ -186,13 +184,13 @@ class GroupNormNPUKernel : public framework::OpKernel { F.Div(&xnorm, &std, y); y->Resize({N, C, H, W}); if (scale) { - Tensor scale_t(scale->type()); + phi::DenseTensor scale_t(scale->type()); scale_t.ShareDataWith(*scale); scale_t.Resize({C, 1, 1}); F.Mul(y, &scale_t, y); } if (bias) { - Tensor bias_t(bias->type()); + phi::DenseTensor bias_t(bias->type()); bias_t.ShareDataWith(*bias); bias_t.Resize({C, 1, 1}); F.Add(y, &bias_t, y); @@ -231,11 +229,11 @@ class GroupNormGradNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto _type = y->type(); - Tensor xnorm(_type); + phi::DenseTensor xnorm(_type); xnorm.mutable_data(y->dims(), place); - Tensor scale_share(_type); + phi::DenseTensor scale_share(_type); scale_share.ShareDataWith(*scale); - Tensor bias_share(_type); + phi::DenseTensor bias_share(_type); bias_share.ShareDataWith(*bias); int64_t N = y->dims()[0]; @@ -267,7 +265,7 @@ class GroupNormGradNPUKernel : public framework::OpKernel { } if (d_scale) { d_scale->mutable_data(place); - Tensor dy_xnorm(_type); + phi::DenseTensor dy_xnorm(_type); dy_xnorm.mutable_data(d_y->dims(), place); F.Mul(d_y, &xnorm, &dy_xnorm); if (data_layout == DataLayout::kNCHW) { @@ -278,12 +276,12 @@ class GroupNormGradNPUKernel : public framework::OpKernel { } // std = Sqrt(var+epsilon), init shape = [ N, G ] - Tensor std(_type); + phi::DenseTensor std(_type); std.mutable_data(var->dims(), place); F.Adds(var, epsilon, &std); F.Sqrt(&std, &std); // d_xnorm_std = dy_proc * scale / std - Tensor d_xnorm_std(_type); + phi::DenseTensor d_xnorm_std(_type); d_xnorm_std.mutable_data(y->dims(), place); 
F.Mul(d_y, &scale_share, &d_xnorm_std); if (data_layout == DataLayout::kNCHW) { @@ -303,10 +301,11 @@ class GroupNormGradNPUKernel : public framework::OpKernel { d_x->mutable_data(place); d_x->Resize(xnorm.dims()); F.Mul(&d_xnorm_std, &xnorm, d_x); - Tensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); + phi::DenseTensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); F.Mul(&dx1, &xnorm, d_x); - Tensor dx2 = F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); + phi::DenseTensor dx2 = + F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); F.Sub(&d_xnorm_std, d_x, d_x); F.Sub(d_x, &dx2, d_x); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index cceecdcad5fd2..1c10692d15fad 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -370,7 +370,7 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - Tensor ordered_h0; + phi::DenseTensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -440,10 +440,10 @@ class GRUCPUKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); @@ -505,10 +505,10 @@ class GRUCPUKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index a6b57bd88f77d..53006c55f6b98 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -73,7 +73,7 @@ class GRUKernel : public framework::OpKernel { gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - Tensor ordered_h0; + phi::DenseTensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -102,9 +102,10 @@ class GRUKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); 
gru_value.reset_output_value = reset_hidden_prev_t.data(); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 89731e2efa022..286bf9fe2732d 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -25,8 +25,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template inline void ReorderInitState(const DeviceContext& ctx, const phi::DenseTensor& src, @@ -79,7 +77,7 @@ class GRUGradKernel : public framework::OpKernel { zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); - Tensor ordered_h0, ordered_h0_grad; + phi::DenseTensor ordered_h0, ordered_h0_grad; framework::Vector order(batch_gate->lod()[2]); @@ -126,16 +124,17 @@ class GRUGradKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); gru_value.gate_value = gate_t.data(); - Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); gru_value.reset_output_value = reset_hidden_prev_t.data(); - Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); + phi::DenseTensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); gru_grad.output_grad = hidden_grad_t.data(); - Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); + phi::DenseTensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); gru_grad.gate_grad = gate_grad_t.data(); - Tensor reset_hidden_prev_grad_t = + phi::DenseTensor reset_hidden_prev_grad_t = batch_reset_hidden_prev_grad.Slice(bstart, bend); gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); if (n == 0) { @@ -144,9 +143,11 @@ class GRUGradKernel : public framework::OpKernel { h0 && h0_grad ? ordered_h0_grad.data() : nullptr; } else { int bstart_pre = static_cast(batch_starts[n - 1]); - Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); + phi::DenseTensor hidden_prev_t = + batch_hidden->Slice(bstart_pre, bstart); gru_value.prev_out_value = hidden_prev_t.data(); - Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); + phi::DenseTensor hidden_prev_grad_t = + batch_hidden_grad.Slice(bstart_pre, bstart); gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } gru_value.output_value = nullptr; diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 3ed3179a63e63..d46e6cf429f6f 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; template @@ -192,8 +190,8 @@ class GRUUnitGradKernel : public framework::OpKernel { context.Output(framework::GradVarName("Weight")); auto* bias_grad = context.Output(framework::GradVarName("Bias")); - Tensor gate_grad; - Tensor reset_hidden_prev_grad; + phi::DenseTensor gate_grad; + phi::DenseTensor reset_hidden_prev_grad; const T* hidden_prev_data = hidden_prev->data(); const T* weight_data = weight->data(); diff --git a/paddle/fluid/operators/huber_loss_op_mlu.cc b/paddle/fluid/operators/huber_loss_op_mlu.cc index 4387037ad01af..4dc542b675f54 100644 --- a/paddle/fluid/operators/huber_loss_op_mlu.cc +++ b/paddle/fluid/operators/huber_loss_op_mlu.cc @@ -18,17 +18,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class HuberLossMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = GetDevCtxFromCTX(ctx); - auto* x = ctx.Input<Tensor>("X"); - auto* y = ctx.Input<Tensor>("Y"); - auto* residual = ctx.Output<Tensor>("Residual"); - auto* out = ctx.Output<Tensor>("Out"); + auto* x = ctx.Input<phi::DenseTensor>("X"); + auto* y = ctx.Input<phi::DenseTensor>("Y"); + auto* residual = ctx.Output<phi::DenseTensor>("Residual"); + auto* out = ctx.Output<phi::DenseTensor>("Out"); auto delta = ctx.Attr("delta"); auto place = ctx.GetPlace(); @@ -65,7 +63,7 @@ class HuberLossMLUKernel : public framework::OpKernel { GetBasePtr(out)); // compute multiply by delta - Tensor scale_tensor, bias_tensor; + phi::DenseTensor scale_tensor, bias_tensor; scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); FillMLUTensorWithHostValue(ctx, static_cast(delta), &scale_tensor); @@ -93,20 +91,20 @@ class HuberLossGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = GetDevCtxFromCTX(ctx); - auto* residual = ctx.Input<Tensor>("Residual"); - auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); - auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); - auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); + auto* residual = ctx.Input<phi::DenseTensor>("Residual"); + auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out")); + auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X")); + auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y")); auto delta = ctx.Attr("delta"); auto place = ctx.GetPlace(); - Tensor t_grad_rd; + phi::DenseTensor t_grad_rd; t_grad_rd = ctx.AllocateTmpTensor(residual->dims(), dev_ctx); MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd); if (dx || dy) { - Tensor t_zero; + phi::DenseTensor t_zero; t_zero = ctx.AllocateTmpTensor(residual->dims(), dev_ctx); FillMLUTensorWithHostValue(ctx, static_cast(0.f), &t_zero); @@ -130,7 +128,7 @@ class HuberLossGradMLUKernel : public framework::OpKernel { GetBasePtr(&t_grad_rd)); } // compute multiply by delta - Tensor scale_tensor, bias_tensor; + phi::DenseTensor scale_tensor, bias_tensor; scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index a7be6feb628bf..78529df55aa94 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void HuberLossSub(const platform::Place& place, const aclrtStream& stream, @@ -117,9 +115,9 @@ class HuberLossGradNPUKernel : public framework::OpKernel { .stream(); auto place = ctx.GetPlace(); - Tensor t_grad_rd; + phi::DenseTensor t_grad_rd; if (dx || dy) { - Tensor t_zero; + phi::DenseTensor t_zero; HuberLossZerosLike(place, stream, residual, &t_zero); HuberLossSmoothL1LossGrad( place, stream, residual, &t_zero, dout, delta, &t_grad_rd); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index a9da8f8f4dbbc..523639faddcbe 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -26,8 +26,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline int Im2SeqOutputSize( int input_size, int filter_size, int padding_0, int padding_1, int stride) { const int output_size = @@ -52,7 +50,7 @@ class Im2SequenceKernel : public framework::OpKernel { if (ctx.HasInput("Y") && batch_size > 1) { const phi::DenseTensor* imgrealsize = ctx.Input("Y"); auto out_stride = ctx.Attr>("out_stride"); - Tensor cpu_shape_tensor; + phi::DenseTensor cpu_shape_tensor; paddle::framework::TensorCopySync( *imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); std::vector imgreal_h; @@ -89,15 +87,16 @@ class Im2SequenceKernel : public framework::OpKernel { const std::vector dilations({1, 1}); int offset_out = 0; for (int i = 0; i < batch_size; i++) { - const Tensor src = + const phi::DenseTensor src = in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(offset_out, - offset_out + output_height[i] * output_width[i]) - .Resize({output_height[i], - output_width[i], - img_channels, - kernels[0], - kernels[1]}); + phi::DenseTensor dst = + out->Slice(offset_out, + offset_out + output_height[i] * output_width[i]) + .Resize({output_height[i], + output_width[i], + img_channels, + kernels[0], + kernels[1]}); offset_out += output_height[i] * output_width[i]; phi::funcs::Im2ColFunctor @@ -127,13 +126,13 @@ class Im2SequenceKernel : public framework::OpKernel { auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { - const Tensor src = + const phi::DenseTensor src = in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize({output_height, - output_width, - img_channels, - kernels[0], - kernels[1]}); + phi::DenseTensor dst = out->Slice(i, i + 1).Resize({output_height, + output_width, + img_channels, + kernels[0], + kernels[1]}); phi::funcs::Im2ColFunctor f; @@ -187,9 +186,9 @@ class Im2SequenceGradKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); d_out->Resize({batch_size, d_out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { - Tensor dst = + phi::DenseTensor dst = d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - const Tensor src = d_out->Slice(i, i + 1).Resize( + const phi::DenseTensor src = d_out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, kernels[0], kernels[1]}); phi::funcs::Col2ImFunctor f; diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index 425590ebeeb52..0e7f1fea1bd81 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, @@ -38,7 +37,7 @@ void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, gather_index_vec.push_back(index_vec[i * index_length + j]); } } - Tensor gather_index; + phi::DenseTensor gather_index; framework::TensorFromVector(gather_index_vec, dev_ctx, &gather_index); gather_index.Resize({batch_size, index_length, 2}); @@ -89,7 +88,7 @@ void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, scatter_index_vec.push_back(index_vec[i * index_length + j]); } } - Tensor scatter_index; + phi::DenseTensor scatter_index; framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); scatter_index.Resize({batch_size, index_length, 2}); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index a705a95156608..6bb91f325f953 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -22,7 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index 0f18f9793d305..327471b216f0b 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class IndexSelectNPUKernel : public framework::OpKernel { public: @@ -66,7 +64,7 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { dim += out_dims.size(); } - Tensor casted_index; + phi::DenseTensor casted_index; if (framework::TransToProtoVarType(index->dtype()) != framework::proto::VarType::INT32) { casted_index.mutable_data(index->dims(), ctx.GetPlace()); @@ -90,7 +88,7 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { .AddOutput(*x_grad); runner.Run(stream); } else { - Tensor transed_out_grad; + phi::DenseTensor transed_out_grad; std::vector in_trans_perm; in_trans_perm.push_back(dim); for (int i = 0; i < out_dims.size(); ++i) { @@ -109,7 +107,7 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { .AddOutput(transed_out_grad); in_trans_runner.Run(stream); - Tensor sum_out; + phi::DenseTensor sum_out; framework::DDim sum_dims(x_dims); sum_dims[0] = x_dims[dim]; auto idx = 1; diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 53453c6cad184..a80324d5d303a 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -145,8 +145,8 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { "can't find gradient variable of Y")); } const phi::DenseTensor* t = nullptr; - if (var->IsType<Tensor>()) { - t = &var->Get<Tensor>(); + if (var->IsType<phi::DenseTensor>()) { + t = &var->Get<phi::DenseTensor>(); } else if (var->IsType()) { t = &var->Get(); } @@ -323,9 +323,9 @@ class InplaceABNGradKernel : public framework::OpKernel { auto* mean = ctx.Input("ReserveSpace"); auto* variance = ctx.Input("ReserveSpace"); - paddle::optional<Tensor> space_opt; - paddle::optional<Tensor> mean_opt; - paddle::optional<Tensor> variance_opt; + paddle::optional<phi::DenseTensor> space_opt; + paddle::optional<phi::DenseTensor> mean_opt; + paddle::optional<phi::DenseTensor> variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/inplace_abn_op.cu
b/paddle/fluid/operators/inplace_abn_op.cu index e1131822f289e..bec88e5dfd2a7 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -171,9 +171,9 @@ class InplaceABNGradKernel : public framework::OpKernel { scale_grad, bias_grad); } else { - paddle::optional<Tensor> space_opt; - paddle::optional<Tensor> mean_opt; - paddle::optional<Tensor> variance_opt; + paddle::optional<phi::DenseTensor> space_opt; + paddle::optional<phi::DenseTensor> mean_opt; + paddle::optional<phi::DenseTensor> variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 2a9568e845492..29253662d4deb 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -22,7 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index ed474193461c3..c9f33799c9e10 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -105,9 +105,9 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::NotFound("cannot find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType<Tensor>()) { - t = &var->Get<Tensor>(); + const phi::DenseTensor *t = nullptr; + if (var->IsType<phi::DenseTensor>()) { + t = &var->Get<phi::DenseTensor>(); } else if (var->IsType()) { t = &var->Get(); } @@ -126,9 +126,9 @@ framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::NotFound("cannot find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType<Tensor>()) { - t = &var->Get<Tensor>(); + const phi::DenseTensor *t = nullptr; + if (var->IsType<phi::DenseTensor>()) { + t = &var->Get<phi::DenseTensor>(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 2101f6a12bb53..05e2bde973924 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; class InstanceNormOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc index f46c3a806a2c0..f11719bea9c7c 100644 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ b/paddle/fluid/operators/instance_norm_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class InstanceNormNPUKernel : public framework::OpKernel { @@ -56,7 +55,7 @@ class InstanceNormNPUKernel : public framework::OpKernel { } } - Tensor tmp_x, tmp_y; + phi::DenseTensor tmp_x, tmp_y; tmp_x.ShareDataWith(*x); tmp_x.Resize(phi::make_ddim(tmp_x_dims)); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index a589b49500e0a..a0e1410f52d3d 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -945,7 +945,7 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_w = size_data[0]; @@ -1040,7 +1040,7 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_h = size_data[0]; @@ -1195,7 +1195,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_d = size_data[0]; @@ -1288,7 +1288,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, template static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, phi::DenseTensor* input_grad, - const Tensor output_grad) { + const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); @@ -1314,7 +1314,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_w = size_data[0]; @@ -1379,7 +1379,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, template static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, phi::DenseTensor* input_grad, - const Tensor output_grad) { + const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); @@ -1407,7 +1407,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_h = size_data[0]; @@ -1555,7 +1555,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_d = size_data[0]; diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 11c1429107654..ad67efc4b78d5 100644 --- 
a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -26,7 +26,6 @@ template using EigenTensor = phi::EigenTensor; -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; inline std::vector get_new_shape( @@ -1344,7 +1343,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, template static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, phi::DenseTensor* input_grad, - const Tensor output_grad) { + const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc index a059d5b522ee3..36ffd1ae53ed6 100644 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ b/paddle/fluid/operators/interpolate_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; inline static void CheckArgument(const framework::ExecutionContext& ctx) { @@ -136,7 +135,7 @@ class InterpolateNPUKernel : public framework::OpKernel { CalcOutSize(ctx, h, w, &out_h, &out_w); // the 'input' tensor may has no set (or wrong set) of the layout - Tensor input_x(input->type()); + phi::DenseTensor input_x(input->type()); input_x.ShareDataWith(*input); input_x.set_layout(data_layout); @@ -188,7 +187,7 @@ class InterpolateGradNPUKernel : public framework::OpKernel { // the 'output_grad' tensor may has no set (or wrong set) of the layout auto* output_grad = ctx.Input(framework::GradVarName("Out")); - Tensor output_grad_tmp(output_grad->type()); + phi::DenseTensor output_grad_tmp(output_grad->type()); output_grad_tmp.ShareDataWith(*output_grad); output_grad_tmp.set_layout(data_layout); diff --git a/paddle/fluid/operators/interpolate_v2_op_mlu.cc b/paddle/fluid/operators/interpolate_v2_op_mlu.cc index 833d650d6a131..e6f34539b1c01 100644 --- a/paddle/fluid/operators/interpolate_v2_op_mlu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_mlu.cc @@ -175,7 +175,7 @@ class InterpolateV2MLUKernel : public framework::OpKernel { // cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and // CNNL_INTERP_NEAREST, framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans; - Tensor transformed_input, transformed_output; + phi::DenseTensor transformed_input, transformed_output; bool need_transpose = input_dims.size() != 2; if (input_dims.size() == 4) { // need to do transpose if layout is kNCHW @@ -439,7 +439,7 @@ class InterpolateV2GradMLUKernel : public framework::OpKernel { framework::DDim dim_grad; framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad, dim_in_trans_grad; - Tensor transformed_output_grad, transformed_input_grad; + phi::DenseTensor transformed_output_grad, transformed_input_grad; bool need_transpose = input_dims.size() != 2 && data_layout == DataLayout::kNCHW; diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index 69d2f563e37bc..31f08badd128c 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; using DDim = framework::DDim; using fp16 = paddle::platform::float16; @@ -104,7 +103,7 @@ struct InterpolateFunction { auto yt = y_new_shape[axis]; y_new_shape[axis] = y_new_shape[0]; y_new_shape[0] = yt; - Tensor gy_t; + phi::DenseTensor gy_t; gy_t.mutable_data(y_new_shape, place); Transpose(gy, &gy_t, axis_swap); // 2 scatter @@ -112,7 +111,7 @@ struct InterpolateFunction { auto xt = x_new_shape[axis]; x_new_shape[axis] = x_new_shape[0]; x_new_shape[0] = xt; - Tensor gx_zero, gx_t; + phi::DenseTensor gx_zero, gx_t; gx_zero.mutable_data(x_new_shape, place); gx_t.mutable_data(x_new_shape, place); FillNpuTensorWithConstant(&gx_zero, static_cast(0)); @@ -161,14 +160,14 @@ struct InterpolateFunction { platform::Place place; aclrtStream stream; const framework::ExecutionContext& ctx; - Tensor t0; - Tensor t1; - Tensor tn; + phi::DenseTensor t0; + phi::DenseTensor t1; + phi::DenseTensor tn; }; template <> void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { - Tensor x_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor x_fp32(experimental::DataType::FLOAT32); x_fp32.mutable_data(x->dims(), place); FillNpuTensorWithConstant(&tn, static_cast(n)); const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); @@ -238,7 +237,7 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, phi::DenseTensor* coef_w1) { InterpolateFunction F(ctx); auto place = ctx.GetPlace(); - Tensor _h0, _w0; + phi::DenseTensor _h0, _w0; _h0.mutable_data({out_h}, place); _w0.mutable_data({out_w}, place); F.Arange(out_h, &_h0); @@ -255,8 +254,8 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, F.Muls(&_w0, ratio_w, &_w0); } - Tensor zero_t; - Tensor one_t; + phi::DenseTensor zero_t; + phi::DenseTensor one_t; zero_t.mutable_data({1}, place); one_t.mutable_data({1}, place); FillNpuTensorWithConstant(&zero_t, static_cast(0)); @@ -264,7 +263,7 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, F.Maximum(&_h0, &zero_t, &_h0); F.Maximum(&_w0, &zero_t, &_w0); - Tensor _h0_floor, _w0_floor; + phi::DenseTensor _h0_floor, _w0_floor; _h0_floor.mutable_data({out_h}, place); _w0_floor.mutable_data({out_w}, place); F.Floor(&_h0, &_h0_floor); @@ -272,12 +271,12 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, F.Cast(&_h0_floor, h0); F.Cast(&_w0_floor, w0); - Tensor one_int; + phi::DenseTensor one_int; one_int.mutable_data({1}, place); FillNpuTensorWithConstant(&one_int, static_cast(1)); F.Add(h0, &one_int, h1); F.Add(w0, &one_int, w1); - Tensor t_max_h, t_max_w; + phi::DenseTensor t_max_h, t_max_w; t_max_h.mutable_data({1}, place); t_max_w.mutable_data({1}, place); FillNpuTensorWithConstant(&t_max_h, static_cast(in_h - 1)); @@ -334,12 +333,12 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, &ratio_h, &ratio_w); - Tensor h0, h1, w0, w1; + phi::DenseTensor h0, h1, w0, w1; h0.mutable_data({out_h}, place); h1.mutable_data({out_h}, place); w0.mutable_data({out_w}, place); w1.mutable_data({out_w}, place); - Tensor coef_h0, coef_h1, coef_w0, coef_w1; + phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; coef_h0.mutable_data({out_h}, place); coef_h1.mutable_data({out_h}, place); coef_w0.mutable_data({out_w}, place); @@ -363,7 +362,7 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, &coef_w0, &coef_w1); - Tensor input_gather_h0, input_gather_h1; + phi::DenseTensor input_gather_h0, 
input_gather_h1; + phi::DenseTensor input_gather_h0,
input_gather_h1; auto dim_gather_h = indim; dim_gather_h[axis_h] = out_h; input_gather_h0.mutable_data(dim_gather_h, place); @@ -374,13 +373,13 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0); F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1); - Tensor out_x4; + phi::DenseTensor out_x4; out_x4.mutable_data({4, outdim[0], outdim[1], outdim[2], outdim[3]}, place); - Tensor input_gather_h0_w0 = out_x4.Slice(0, 1); - Tensor input_gather_h0_w1 = out_x4.Slice(1, 2); - Tensor input_gather_h1_w0 = out_x4.Slice(2, 3); - Tensor input_gather_h1_w1 = out_x4.Slice(3, 4); + phi::DenseTensor input_gather_h0_w0 = out_x4.Slice(0, 1); + phi::DenseTensor input_gather_h0_w1 = out_x4.Slice(1, 2); + phi::DenseTensor input_gather_h1_w0 = out_x4.Slice(2, 3); + phi::DenseTensor input_gather_h1_w1 = out_x4.Slice(3, 4); F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0); F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1); F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0); @@ -425,12 +424,12 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, &ratio_h, &ratio_w); - Tensor h0, h1, w0, w1; + phi::DenseTensor h0, h1, w0, w1; h0.mutable_data({out_h}, place); h1.mutable_data({out_h}, place); w0.mutable_data({out_w}, place); w1.mutable_data({out_w}, place); - Tensor coef_h0, coef_h1, coef_w0, coef_w1; + phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; coef_h0.mutable_data({out_h}, place); coef_h1.mutable_data({out_h}, place); coef_w0.mutable_data({out_w}, place); @@ -454,7 +453,7 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, &coef_w0, &coef_w1); - Tensor gy_w0, gy_w1; + phi::DenseTensor gy_w0, gy_w1; gy_w0.mutable_data(outdim, place); gy_w1.mutable_data(outdim, place); F.Mul(gout, &coef_w0, &gy_w0); @@ -462,7 +461,7 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, auto dim_gather_h = indim; dim_gather_h[axis_h] = out_h; - Tensor g_gather_w0, g_gather_w1; + phi::DenseTensor g_gather_w0, g_gather_w1; g_gather_w0.mutable_data(dim_gather_h, place); g_gather_w1.mutable_data(dim_gather_h, place); w0.Resize({out_w, 1}); @@ -474,7 +473,7 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1); F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0); - Tensor gx_0, gx_1; + phi::DenseTensor gx_0, gx_1; gx_0.mutable_data(indim, place); gx_1.mutable_data(indim, place); h0.Resize({out_h, 1}); @@ -493,10 +492,11 @@ class InterpolateV2NPUKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 4UL, - platform::errors::External( - "NPU Interpolate Kernel only support 4-D Tensor.")); + PADDLE_ENFORCE_EQ( + input_dims.size(), + 4UL, + platform::errors::External( + "NPU Interpolate Kernel only support 4-D phi::DenseTensor.")); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); @@ -552,14 +552,16 @@ class InterpolateV2NPUKernel : public framework::OpKernel { scale_w > 0, true, platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "The scale_w in input 'Scale' phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_w)); PADDLE_ENFORCE_EQ( scale_h > 0, true, platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "The scale_h in input 'Scale' 
phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_h)); } else { @@ -704,14 +706,16 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { scale_w > 0, true, platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "The scale_w in input 'Scale' phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_w)); PADDLE_ENFORCE_EQ( scale_h > 0, true, platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "The scale_h in input 'Scale' phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_h)); } else { diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 8070527a56a8c..2c2cab61521ef 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -135,12 +135,11 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { LOG(INFO) << loginfos.str(); } -using Tensor = phi::DenseTensor; template void BenchKernelXYZN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { - Tensor x, y, z; + phi::DenseTensor x, y, z; x.Resize({d}); y.Resize({d}); z.Resize({d}); @@ -161,7 +160,7 @@ void BenchKernelAXYN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const T a = static_cast(3); - Tensor x, y; + phi::DenseTensor x, y; x.Resize({d}); y.Resize({d}); T* x_data = x.mutable_data(PlaceType()); @@ -177,7 +176,7 @@ template void BenchKernelXRN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { - Tensor x; + phi::DenseTensor x; RandomVec(d, x.mutable_data({d}, PlaceType())); T res; BenchAllImpls(d, x.data(), &res, d); @@ -188,7 +187,7 @@ template void BenchKernelXYN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { - Tensor x, y; + phi::DenseTensor x, y; x.Resize({d}); y.Resize({d}); T* x_data = x.mutable_data(PlaceType()); @@ -205,7 +204,7 @@ void BenchKernelLSTM() { for (int d : TestSizes()) { const jit::lstm_attr_t attr( d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - Tensor x, ct_1, ct, ht, wp, checked; + phi::DenseTensor x, ct_1, ct, ht, wp, checked; x.Resize({4 * d}); ct_1.Resize({d}); ct.Resize({d}); @@ -242,7 +241,7 @@ void BenchKernelGRU() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); auto place = PlaceType(); - Tensor x, ht_1, ht; + phi::DenseTensor x, ht_1, ht; x.Resize({3 * d}); ht_1.Resize({d}); ht.Resize({d}); @@ -269,7 +268,7 @@ void BenchKernelSeqPool() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; - Tensor x, y; + phi::DenseTensor x, y; x.Resize({h * w}); y.Resize({w}); RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); @@ -287,7 +286,7 @@ void BenchKernelEmbSeqPool() { std::vector pool_types = {jit::SeqPoolType::kSum}; int64_t tbl_h = 1e4; for (int tbl_w : {10, 16, 256}) { - Tensor table; + phi::DenseTensor table; table.Resize({tbl_h, tbl_w}); RandomVec(tbl_h * tbl_w, table.mutable_data(PlaceType()), -2.f, 2.f); const T* table_data = table.data(); @@ -297,7 +296,7 @@ void BenchKernelEmbSeqPool() { int64_t out_w = tbl_w * idx_w; jit::emb_seq_pool_attr_t attr( tbl_h, tbl_w, idx_h, idx_w, out_w, type); - Tensor idx, out; + phi::DenseTensor idx, out; idx.Resize({idx_h, idx_w}); out.Resize({out_w}); RandomVec(idx_h * idx_w, @@ -348,12 
+347,12 @@ void BenchKernelSgd() { for (int param_h : {1, 1000}) { for (int grad_w : {1, 2, 8, 16, 30, 256}) { // only benchmark inplace - Tensor param; + phi::DenseTensor param; param.Resize({param_h, grad_w}); T* param_data = param.mutable_data(PlaceType()); RandomVec(param_h * grad_w, param_data, -2.f, 2.f); for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) { - Tensor grad; + phi::DenseTensor grad; grad.Resize({rows_size, grad_w}); std::vector rows = UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); @@ -375,7 +374,7 @@ void BenchKernelMatMul() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - Tensor a, b, c; + phi::DenseTensor a, b, c; a.Resize({m * k}); b.Resize({k * n}); c.Resize({m * n}); @@ -397,7 +396,7 @@ void BenchKernelSoftmax() { using T = typename KernelTuple::data_type; for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - Tensor x, y; + phi::DenseTensor x, y; x.Resize({bs, n}); y.Resize({bs, n}); RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); @@ -418,7 +417,7 @@ void BenchKernelLayerNorm() { for (int x_dim_1 : TestSizes()) { int right = x_dim_1; int sz = left * right; - Tensor x, mean, var, scale, bias, out; + phi::DenseTensor x, mean, var, scale, bias, out; x.Resize({n, x_dim_0, x_dim_1}); out.Resize({n, x_dim_0, x_dim_1}); mean.Resize({n, x_dim_0}); @@ -462,7 +461,7 @@ void BenchKernelCRFDecoding() { for (int tag_num : TestSizes()) { int x_sz = seq_len * tag_num; int w_sz = (tag_num + state_trans_base_idx) * tag_num; - Tensor x, w, alpha, track; + phi::DenseTensor x, w, alpha, track; x.Resize({seq_len, tag_num}); w.Resize({tag_num + state_trans_base_idx, tag_num}); alpha.Resize({seq_len, tag_num}); @@ -486,12 +485,12 @@ template void BenchKernelVBroadcast() { using T = typename KernelTuple::data_type; for (int64_t w : {1, 16, 64, 100, 256}) { - Tensor x; + phi::DenseTensor x; x.Resize({w}); RandomVec(w, x.mutable_data(PlaceType())); const T* x_data = x.data(); for (int h : TestSizes()) { - Tensor y; + phi::DenseTensor y; y.Resize({h * w}); T* y_data = y.mutable_data(PlaceType()); BenchAllImpls( diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index f21e939a7b118..760675ea74663 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -20,8 +20,6 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class KLDivLossNPUKernel : public framework::OpKernel { public: @@ -114,7 +112,7 @@ class KLDivLossGradNPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); - Tensor loss_grad_transformed; + phi::DenseTensor loss_grad_transformed; if ("none" == reduction) { loss_grad_transformed.ShareDataWith(*loss_grad); } else { diff --git a/paddle/fluid/operators/label_smooth_op_mlu.cc b/paddle/fluid/operators/label_smooth_op_mlu.cc index 211ffc7fb2cd6..96f629e14df5c 100644 --- a/paddle/fluid/operators/label_smooth_op_mlu.cc +++ b/paddle/fluid/operators/label_smooth_op_mlu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LabelSmoothMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc index 529e8564cb19b..71bb1786bd018 100644 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ b/paddle/fluid/operators/label_smooth_op_npu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void LabelSmoothMuls(const platform::Place& place, const aclrtStream& stream, @@ -70,15 +68,15 @@ class LabelSmoothNPUKernel : public framework::OpKernel { .stream(); if (dist_t) { - Tensor tmp; - Tensor dist; - Tensor tmp2; + phi::DenseTensor tmp; + phi::DenseTensor dist; + phi::DenseTensor tmp2; LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); LabelSmoothMuls(place, stream, dist_t, epsilon, &tmp2); tmp2.Resize({1, label_dim}); LabelSmoothAddBroadCast(place, stream, &tmp, &tmp2, out_t); } else { - Tensor tmp; + phi::DenseTensor tmp; LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); LabelSmoothAdds(place, stream, &tmp, (epsilon / label_dim), out_t); } diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 3d1bd7490795d..703a3b7506efc 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -33,7 +33,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 1081df4166aac..461d77f324bcf 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; class LayerNormOp : public framework::OperatorWithKernel { @@ -210,9 +209,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Y@GRAD of LayerNorm Op is not found.")); - const Tensor *t = nullptr; - if (var->IsType<Tensor>()) { - t = &var->Get<Tensor>(); + const phi::DenseTensor *t = nullptr; + if (var->IsType<phi::DenseTensor>()) { + t = &var->Get<phi::DenseTensor>(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc index 7058f9f094923..deb7bb5045eba 100644 --- a/paddle/fluid/operators/layer_norm_op_mlu.cc +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -19,7 +19,6 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template @@ -72,7 +71,7 @@ class LayerNormMLUKernel : public framework::OpKernel { GetBasePtr(mean), GetBasePtr(variance)); } else { - Tensor tmp_scale(x->dtype()); + phi::DenseTensor tmp_scale(x->dtype()); if (!scale) { tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); @@ -80,7 +79,7 @@ class LayerNormMLUKernel : public framework::OpKernel { tmp_scale = *scale; } - Tensor tmp_bias(x->dtype()); + phi::DenseTensor tmp_bias(x->dtype()); if (!bias) { tmp_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); FillMLUTensorWithHostValue(ctx, static_cast(0), &tmp_bias); @@ -95,7 +94,7 @@ class LayerNormMLUKernel : public framework::OpKernel { scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF); cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16); - Tensor final_scale(x->dtype()); + phi::DenseTensor final_scale(x->dtype()); if (final_scale.dtype() == DataType::FLOAT16 && tmp_scale.dtype() == DataType::FLOAT32) { final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); @@ -110,7 +109,7 @@ class LayerNormMLUKernel : public framework::OpKernel { final_scale = tmp_scale; } - Tensor final_bias(x->dtype()); + phi::DenseTensor final_bias(x->dtype()); if (final_bias.dtype() == DataType::FLOAT16 && tmp_bias.dtype() == DataType::FLOAT32) { final_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); @@ -181,7 +180,7 @@ class LayerNormGradMLUKernel : public framework::OpKernel { mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType()); MLUCnnlTensorDesc dx_desc(*dx); - Tensor tmp_scale(x->dtype()); + phi::DenseTensor tmp_scale(x->dtype()); if (!scale) { tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); @@ -196,7 +195,7 @@ class LayerNormGradMLUKernel : public framework::OpKernel { cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16); cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32); - Tensor final_scale(x->dtype()); + phi::DenseTensor final_scale(x->dtype()); if (final_scale.dtype() == DataType::FLOAT16 && tmp_scale.dtype() == DataType::FLOAT32) { final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); @@ -211,14 +210,14 @@ class LayerNormGradMLUKernel : public framework::OpKernel { final_scale = tmp_scale; } - Tensor tmp_dscale(x->dtype()); + phi::DenseTensor tmp_dscale(x->dtype()); if (dscale && (tmp_dscale.dtype() == dscale->dtype())) { dscale->mutable_data(place); tmp_dscale = *dscale; } else { tmp_dscale.mutable_data(phi::make_ddim(scale_bias_axes), place); } - Tensor tmp_dbias(x->dtype()); + phi::DenseTensor tmp_dbias(x->dtype()); if (dbias && (tmp_dbias.dtype() == dbias->dtype())) { dbias->mutable_data(place); tmp_dbias = *dbias; diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index f529bb651c042..5d0313a8f9404 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; using DataLayout = phi::DataLayout; @@ -75,10 +74,10 @@ class LayerNormNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor default_scale(x->type()); + phi::DenseTensor default_scale(x->type()); if (!scale) { default_scale.mutable_data(phi::make_ddim(axes), place); - Tensor value(x->type()); + phi::DenseTensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); const auto& runner = @@ -89,10 +88,10 @@ class LayerNormNPUKernel : public framework::OpKernel { const_cast(scale)->Resize(phi::make_ddim(axes)); } - Tensor default_bias(x->type()); + phi::DenseTensor default_bias(x->type()); if (!bias) { default_bias.mutable_data(phi::make_ddim(axes), place); - Tensor value(x->type()); + phi::DenseTensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(0)); const auto& runner = @@ -104,7 +103,7 @@ class LayerNormNPUKernel : public framework::OpKernel { } // cast scale from LayerNormParamType to T if needed - Tensor cast_scale(x->type()); + phi::DenseTensor cast_scale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(scale->dtype()) == @@ -124,7 +123,7 @@ class LayerNormNPUKernel : public framework::OpKernel { } // cast bias from LayerNormParamType to T if needed - Tensor cast_bias(x->type()); + phi::DenseTensor cast_bias(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(bias->dtype()) == @@ -147,7 +146,7 @@ class LayerNormNPUKernel : public framework::OpKernel { // mean should be of U type phi::DenseTensor* tmp_mean = mean; - Tensor cast_mean(x->type()); + phi::DenseTensor cast_mean(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(scale->dtype()) == @@ -164,7 +163,7 @@ class LayerNormNPUKernel : public framework::OpKernel { // same for variance phi::DenseTensor* tmp_variance = variance; - Tensor cast_variance(x->type()); + phi::DenseTensor cast_variance(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(scale->dtype()) == @@ -273,10 +272,10 @@ class LayerNormGradNPUKernel : public framework::OpKernel { const_cast(variance)->Resize( phi::make_ddim({new_shape})); - Tensor default_scale(x->type()); + phi::DenseTensor default_scale(x->type()); if (!scale) { default_scale.mutable_data(phi::make_ddim(axes), place); - Tensor value(x->type()); + phi::DenseTensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); const auto& runner = @@ -288,7 +287,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // cast scale from LayerNormParamType to T if needed - Tensor cast_scale(x->type()); + phi::DenseTensor cast_scale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(scale->dtype()) == @@ -308,7 +307,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // cast mean from LayerNormParamType to T if needed - Tensor cast_mean(x->type()); + phi::DenseTensor cast_mean(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(mean->dtype()) == @@ 
-328,7 +327,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // cast variance from LayerNormParamType to T if needed - Tensor cast_variance(x->type()); + phi::DenseTensor cast_variance(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(variance->dtype()) == @@ -347,7 +346,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_variance.ShareDataWith(*variance); } - Tensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); + phi::DenseTensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); dx = (dx == nullptr) ? &dx_ : dx; dscale = (dscale == nullptr) ? &dscale_ : dscale; dbias = (dbias == nullptr) ? &dbias_ : dbias; @@ -361,7 +360,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { // dscale should be of U type phi::DenseTensor* tmp_dscale = dscale; - Tensor cast_dscale(x->type()); + phi::DenseTensor cast_dscale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(mean->dtype()) == @@ -378,7 +377,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { // same for dbias phi::DenseTensor* tmp_dbias = dbias; - Tensor cast_dbias(x->type()); + phi::DenseTensor cast_dbias(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(mean->dtype()) == diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index d475eab967d78..2faf47538ffa5 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -26,8 +26,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template inline void ResizeToChannelFirst(const framework::ExecutionContext& context, const phi::DenseTensor* input, diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 28ae524e0a4f9..d14e4c75425c9 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -28,8 +28,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template __global__ void limit_by_capacity_impl( const T* expc, T* cap, T* out, const int n_expert, const int n_worker) { diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 47c6bef196be1..ed045fad4a95e 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void LogLossAdds(const platform::Place& place, const aclrtStream& stream, diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index 59e0c15678247..87e6d42e98ad5 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 3f9ec485ce4f8..1c8001e371764 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 1ba6d6e31ecdc..04153eecc3927 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index e9369bcb475cc..f43fccb19e0b6 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -27,14 +27,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; template -static std::vector CopyIdsToVector(const Tensor &ids) { +static std::vector CopyIdsToVector(const phi::DenseTensor &ids) { auto numel = ids.numel(); const auto *src = ids.data(); std::vector ret(numel); @@ -51,7 +50,7 @@ static std::vector CopyIdsToVector(const Tensor &ids) { template struct LookupTableV2CPUFunctor { LookupTableV2CPUFunctor(const framework::ExecutionContext &context, - const Tensor *ids_t) + const phi::DenseTensor *ids_t) : context_(context), ids_t_(ids_t) {} template @@ -143,7 +142,7 @@ struct LookupTableV2CPUFunctor { private: const framework::ExecutionContext &context_; - const Tensor *ids_t_; + const phi::DenseTensor *ids_t_; }; template @@ -160,7 +159,7 @@ class LookupTableV2Kernel : public framework::OpKernel { template struct LookupTableV2GradCPUFunctor { LookupTableV2GradCPUFunctor(const framework::ExecutionContext &context, - const Tensor *ids_t) + const phi::DenseTensor *ids_t) : context_(context), ids_t_(ids_t) {} template @@ -267,7 +266,7 @@ struct LookupTableV2GradCPUFunctor { private: const framework::ExecutionContext &context_; - const Tensor *ids_t_; + const phi::DenseTensor *ids_t_; }; template diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc index de9864aeee6a1..c407d91e6b80d 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LookupTableV2MLUKernel : public framework::OpKernel { public: @@ -84,7 +82,7 @@ class LookupTableV2GradMLUKernel : public framework::OpKernel { "Number of ids greater than int32_t::max , please check " "number of ids in LookupTableV2GradMLUKernel.")); - Tensor ids_int32(ids_t->dtype()); + phi::DenseTensor ids_int32(ids_t->dtype()); if (ids_t->dtype() != DataType::INT32) { ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); MLUCnnlTensorDesc ids_desc(*ids_t); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index d11ef440f8a3f..3dc94d49244b1 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; constexpr int64_t kNoPadding = -1; template @@ -53,16 +52,16 @@ class LookupTableV2NPUKernel : public framework::OpKernel { .AddOutput(*output_t); runner.Run(); } else { - Tensor tmp_table_t(table_t->type()); + phi::DenseTensor tmp_table_t(table_t->type()); tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - Tensor index; + phi::DenseTensor index; index.mutable_data({1, 1}, ctx.GetPlace()); FillNpuTensorWithConstant(&index, static_cast(padding_idx)); auto updata_dim = phi::make_ddim({1, table_t->dims()[1]}); - Tensor update; + phi::DenseTensor update; update.mutable_data(updata_dim, ctx.GetPlace()); FillNpuTensorWithConstant(&update, static_cast(0)); update.Resize(updata_dim); @@ -109,7 +108,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { int embedding_dim = table_grad_t->dims()[1]; if (embedding_dim % 32 == 0) { - // NOTE(pangyoki): The embedding_dim of Tensor used in + // NOTE(pangyoki): The embedding_dim of phi::DenseTensor used in // EmbeddingDenseGrad must be an integer multiple of 32. int num_weights = table_grad_t->dims()[0]; const auto &runner = @@ -137,7 +136,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { {{"use_locking", true}}); runner_scatter.Run(stream); } else { - Tensor casted_ids_t; + phi::DenseTensor casted_ids_t; if (framework::TransToProtoVarType(ids_t->dtype()) != framework::proto::VarType::INT32) { casted_ids_t.mutable_data(ids_t->dims(), ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index aa2596e6a22ba..b772aa82e9d7e 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -46,8 +46,6 @@ struct LRNFunctor { template class LRNKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; - // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) // x represents inputs // f(x) represents outputs @@ -141,7 +139,6 @@ struct LRNGradFunctor { template class LRNGradKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor& x = *ctx.Input("X"); const phi::DenseTensor& out = *ctx.Input("Out"); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index dc4f2f1548612..d5ced3edd2add 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template inline void ReorderInitState(const DeviceContext& ctx, const phi::DenseTensor& src, @@ -74,9 +72,9 @@ class LSTMKernel : public framework::OpKernel { framework::DDim dims({in_dims[0], frame_size}); if (bias) { - Tensor b = *bias; + phi::DenseTensor b = *bias; b.Resize({bias->numel(), 1}); - Tensor gate_bias = b.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); phi::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -95,7 +93,7 @@ class LSTMKernel : public framework::OpKernel { lstm_value.check_og = nullptr; } lstm_value.prev_state_value = nullptr; - Tensor ordered_c0; + phi::DenseTensor ordered_c0; framework::Vector order(batch_gate->lod()[2]); @@ -134,10 +132,10 @@ class LSTMKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor out_t = batch_hidden.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); + phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); int cur_batch_size = bend - bstart; @@ -160,7 +158,7 @@ class LSTMKernel : public framework::OpKernel { // Since the batch computing for LSTM reorders the input sequence // according to their length. The initialized hidden state also needs // to reorder. - Tensor ordered_h0; + phi::DenseTensor ordered_h0; ReorderInitState( device_ctx, *hidden_t0, order, &ordered_h0, true); blas.MatMul(ordered_h0, @@ -237,7 +235,7 @@ class LSTMGradKernel : public framework::OpKernel { // ordered_h0/c0 is the reordered hidden/cell initialization. // ordered_h0_g/c0_g is the reordered gradient of hidden/cell // initialization. 
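As the comment above notes, batched LSTM computation sorts sequences by length, so the initial hidden/cell states and their gradients must be permuted into the same order before use; that is what ReorderInitState does through its row-copy functor. A self-contained sketch of the gather it performs, assuming a row-major 2-D state (ReorderRows is a hypothetical stand-in, not a Paddle API):

    #include <cstddef>
    #include <vector>

    // Row i of *dst becomes row order[i] of src, so the initial state
    // lines up with the length-sorted batch.
    void ReorderRows(const std::vector<std::vector<float>>& src,
                     const std::vector<size_t>& order,
                     std::vector<std::vector<float>>* dst) {
      dst->resize(order.size());
      for (size_t i = 0; i < order.size(); ++i) {
        (*dst)[i] = src[order[i]];
      }
    }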
- Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; framework::Vector order(batch_gate->lod()[2]); if (c0) { @@ -328,24 +326,24 @@ class LSTMGradKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor gate = batch_gate->Slice(bstart, bend); - Tensor cell = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate = batch_gate->Slice(bstart, bend); + phi::DenseTensor cell = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); lstm_value.gate_value = gate.data(); lstm_value.state_value = cell.data(); lstm_value.state_active_value = cell_pre_act.data(); - Tensor out_g = batch_hidden_g.Slice(bstart, bend); - Tensor gate_g = batch_gate_g.Slice(bstart, bend); - Tensor cell_g = batch_cell_g.Slice(bstart, bend); + phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); + phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); + phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); lstm_grad.state_grad = cell_g.data(); lstm_grad.gate_grad = gate_g.data(); lstm_grad.output_grad = out_g.data(); if (n > 0) { int bstart_pre = static_cast(batch_starts[n - 1]); - Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); lstm_value.prev_state_value = cell_pre.data(); lstm_grad.prev_state_grad = cell_pre_g.data(); } else { @@ -424,9 +422,9 @@ class LSTMGradKernel : public framework::OpKernel { } if (bias && bias_g) { /* backward bias */ - Tensor b_g = *bias_g; + phi::DenseTensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); - Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); phi::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 8056bf0bd49f2..c26a421966e7b 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -29,7 +29,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using platform::Transform; template { framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); if (bias) { - Tensor b = *bias; + phi::DenseTensor b = *bias; b.Resize({bias->numel(), 1}); - Tensor gate_bias = b.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); phi::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -156,8 +155,8 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.check_og = nullptr; } lstmp_value.prev_state_value = nullptr; - Tensor ordered_c0; - Tensor ordered_h0; + phi::DenseTensor ordered_c0; + phi::DenseTensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -195,11 +194,11 @@ class LSTMPKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - Tensor proj_t = batch_proj.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor proj_t = batch_proj.Slice(bstart, bend); + phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); int cur_batch_size = bend - bstart; @@ -349,7 +348,7 @@ class LSTMPGradKernel : public framework::OpKernel { // ordered_h0/c0 is the reordered hidden/cell initialization. // ordered_h0_g/c0_g is the reordered gradient of hidden/cell // initialization. - Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; framework::Vector order(batch_gate->lod()[2]); @@ -445,8 +444,8 @@ class LSTMPGradKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor cur_proj = batch_proj.Slice(bstart, bend); - Tensor proj_g = batch_proj_g.Slice(bstart, bend); + phi::DenseTensor cur_proj = batch_proj.Slice(bstart, bend); + phi::DenseTensor proj_g = batch_proj_g.Slice(bstart, bend); if (proj_clip && proj_clip > 0.0) { T* dx_data = proj_g.data(); @@ -472,7 +471,7 @@ class LSTMPGradKernel : public framework::OpKernel { proj_g_dev); } /* hidden state backward */ - Tensor out_g = batch_hidden_g.Slice(bstart, bend); + phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); blas.MatMul(proj_g, false, *proj_weight, @@ -482,7 +481,7 @@ class LSTMPGradKernel : public framework::OpKernel { static_cast(0.0)); /* projection weight backward */ if (proj_weight_g) { - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); blas.MatMul(hidden_t, true, proj_g, @@ -492,23 +491,23 @@ class LSTMPGradKernel : public framework::OpKernel { static_cast(1.0)); } - Tensor gate = batch_gate->Slice(bstart, bend); - Tensor cell = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate = batch_gate->Slice(bstart, bend); + phi::DenseTensor cell = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); lstmp_value.gate_value = gate.data(); lstmp_value.state_value = cell.data(); lstmp_value.state_active_value =
cell_pre_act.data(); - Tensor gate_g = batch_gate_g.Slice(bstart, bend); - Tensor cell_g = batch_cell_g.Slice(bstart, bend); + phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); + phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); lstmp_grad.state_grad = cell_g.data(); lstmp_grad.gate_grad = gate_g.data(); lstmp_grad.output_grad = out_g.data(); if (n > 0) { int bstart_pre = static_cast(batch_starts[n - 1]); - Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); lstmp_value.prev_state_value = cell_pre.data(); lstmp_grad.prev_state_grad = cell_pre_g.data(); } else { @@ -589,9 +588,9 @@ class LSTMPGradKernel : public framework::OpKernel { } if (bias && bias_g) { /* backward bias */ - Tensor b_g = *bias_g; + phi::DenseTensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); - Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); phi::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/masked_select_op_mlu.cc b/paddle/fluid/operators/masked_select_op_mlu.cc index 50c9973721836..86e4029512b07 100644 --- a/paddle/fluid/operators/masked_select_op_mlu.cc +++ b/paddle/fluid/operators/masked_select_op_mlu.cc @@ -39,7 +39,7 @@ class MaskedSelectedMLUKernel : public framework::OpKernel { input_dim, mask_dim)); - Tensor number(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor number(framework::TransToPhiDataType(VT::INT32)); void* number_ptr = number.mutable_data({1}, ctx.GetPlace()); out->Resize(mask->dims()); @@ -72,7 +72,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - Tensor mask_int32, out_size; + phi::DenseTensor mask_int32, out_size; std::vector out_size_vec; mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); out_size.mutable_data({1}, ctx.GetPlace()); @@ -118,10 +118,10 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); dev_ctx.Wait(); - Tensor mask_int32_tmp; + phi::DenseTensor mask_int32_tmp; mask_int32_tmp.ShareDataWith(mask_int32); mask_int32_tmp.Resize({mask_int32.numel()}); - Tensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)), + phi::DenseTensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)), indices_int32(framework::TransToPhiDataType(VT::INT32)); topk_v2_out.mutable_data({mask_int32.numel()}, ctx.GetPlace()); indices_int32.mutable_data({mask_int32.numel()}, ctx.GetPlace()); @@ -145,7 +145,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); - Tensor indices_int32_out; + phi::DenseTensor indices_int32_out; indices_int32_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); memory::Copy(ctx.GetPlace(), GetBasePtr(&indices_int32_out), @@ -154,7 +154,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { out_size_vec[0] * sizeof(int32_t), stream); - Tensor y_grad_tmp_out; + phi::DenseTensor y_grad_tmp_out; y_grad_tmp_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out); memory::Copy(ctx.GetPlace(), @@ -164,7 +164,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { out_size_vec[0] * sizeof(T), stream); - Tensor 
indices_int32_tmp; + phi::DenseTensor indices_int32_tmp; indices_int32_tmp.ShareDataWith(indices_int32_out); indices_int32_tmp.Resize({out_size_vec[0], 1}); MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index facf44725e2b6..3473a051b7324 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { @@ -353,7 +352,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { auto* d_x = ctx.Output(framework::GradVarName("X")); auto* d_y = ctx.Output(framework::GradVarName("Y")); - Tensor tmp_grad; + phi::DenseTensor tmp_grad; tmp_grad.Resize(tmp->dims()); auto* d_tmp_data = tmp_grad.mutable_data(ctx.GetPlace()); auto* top_diff = d_out->data(); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h index 72e99222ddffb..6aa5b12ff6778 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.h +++ b/paddle/fluid/operators/match_matrix_tensor_op.h @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class MatchMatrixTensorOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index 0038b25fb42de..0b6dc510f477f 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -26,8 +26,6 @@ namespace operators { namespace math { -using Tensor = phi::DenseTensor; - /* * \brief Context projection concatenates features in adjacent time-steps in * a sequence. The i-th row of the output is the concatenation of @@ -117,13 +115,13 @@ class ContextProjectFunctor { : static_cast(lod_level_0[i]); input_row_end = static_cast(lod_level_0[i + 1]); - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); if (input_row_begin < input_row_end) { - Tensor in_t = in.Slice(input_row_begin, input_row_end); + phi::DenseTensor in_t = in.Slice(input_row_begin, input_row_end); std::vector output_shape( {sequence_height, @@ -151,8 +149,9 @@ class ContextProjectFunctor { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); @@ -167,9 +166,9 @@ class ContextProjectFunctor { for (int k = 0; k < padding_rows; ++k) { int padding_size = k + context_length < up_pad ? 
context_length : up_pad - k; - Tensor out_t_sub = out_t.Slice(k * context_length, - k * context_length + padding_size); - Tensor w_sub = padding_data->Slice(k, k + padding_size); + phi::DenseTensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + phi::DenseTensor w_sub = padding_data->Slice(k, k + padding_size); framework::TensorCopy( w_sub, context.GetPlace(), context, &out_t_sub); } @@ -196,10 +195,10 @@ class ContextProjectFunctor { if (padding_begin > 0 || sequence_height == context_start) padding_idx = padding_begin + t; - Tensor out_t_sub = out_t.Slice( + phi::DenseTensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - Tensor w_sub = padding_data->Slice( + phi::DenseTensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); framework::TensorCopy( w_sub, context.GetPlace(), context, &out_t_sub); @@ -250,13 +249,14 @@ class ContextProjectGradFunctor { : static_cast(lod_level_0[i]); input_row_end = static_cast(lod_level_0[i + 1]); - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); if (input_row_begin < input_row_end) { - Tensor in_t = in.Slice(input_row_begin, input_row_end); + phi::DenseTensor in_t = in.Slice(input_row_begin, input_row_end); std::vector output_shape( {sequence_height, @@ -283,8 +283,9 @@ class ContextProjectGradFunctor { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); out_t.Resize({static_cast(sequence_height) * context_length, @@ -297,9 +298,9 @@ class ContextProjectGradFunctor { for (int k = 0; k < padding_rows; ++k) { int padding_size = k + context_length < up_pad ? 
context_length : up_pad - k; - Tensor out_t_sub = out_t.Slice(k * context_length, - k * context_length + padding_size); - Tensor w_sub = padding_data->Slice(k, k + padding_size); + phi::DenseTensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + phi::DenseTensor w_sub = padding_data->Slice(k, k + padding_size); blas.AXPY(w_sub.numel(), static_cast(1), out_t_sub.data(), @@ -329,10 +330,10 @@ class ContextProjectGradFunctor { if (padding_begin > 0 || sequence_height == context_start) padding_idx = padding_begin + t; - Tensor out_t_sub = out_t.Slice( + phi::DenseTensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - Tensor w_sub = padding_data->Slice( + phi::DenseTensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); blas.AXPY(w_sub.numel(), static_cast(1), diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 1ba2d8f18ca1c..f4198acfd830c 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -55,9 +55,9 @@ static void CheckEighResult(const int batch, const int info) { template struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, - const Tensor &input, - Tensor *eigen_values, - Tensor *eigen_vectors, + const phi::DenseTensor &input, + phi::DenseTensor *eigen_values, + phi::DenseTensor *eigen_vectors, bool is_lower, bool has_vectors); }; @@ -69,9 +69,9 @@ template struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, - const Tensor &input, - Tensor *eigen_values, - Tensor *eigen_vectors, + const phi::DenseTensor &input, + phi::DenseTensor *eigen_values, + phi::DenseTensor *eigen_vectors, bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; @@ -80,7 +80,7 @@ struct MatrixEighFunctor { auto dito = math::DeviceIndependenceTensorOperations(ctx); - Tensor input_trans; + phi::DenseTensor input_trans; // lapack uses column-major storage; the transpose makes the input // have a contiguous memory layout input_trans = dito.Transpose(input); @@ -124,7 +124,7 @@ struct MatrixEighFunctor { lwork = std::max(1, static_cast(lwork_opt)); liwork = std::max(1, iwork_opt); - Tensor rwork_tensor; + phi::DenseTensor rwork_tensor; ValueType *rwork_data = nullptr; // complex type @@ -134,7 +134,7 @@ struct MatrixEighFunctor { rwork_data = rwork_tensor.mutable_data( phi::make_ddim({lrwork}), ctx.GetPlace()); } - Tensor iwork_tensor, work_tensor; + phi::DenseTensor iwork_tensor, work_tensor; auto *iwork_data = iwork_tensor.mutable_data(phi::make_ddim({liwork}), ctx.GetPlace()); auto *work_data = @@ -179,9 +179,9 @@ template struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, - const Tensor &input, - Tensor *eigen_values, - Tensor *eigen_vectors, + const phi::DenseTensor &input, + phi::DenseTensor *eigen_values, + phi::DenseTensor *eigen_vectors, bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; @@ -190,7 +190,7 @@ struct MatrixEighFunctor { auto &dev_ctx = ctx.template device_context(); auto dito = math::DeviceIndependenceTensorOperations(ctx); - Tensor input_trans; + phi::DenseTensor input_trans; input_trans = dito.Transpose(input); auto *input_vector = input_trans.data(); auto &dims = input.dims(); diff --git a/paddle/fluid/operators/math/sample_prob.cu
b/paddle/fluid/operators/math/sample_prob.cu index e3cc5a5741b02..0c6b49729546c 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -31,8 +31,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; - template __device__ T gpu_adjust_prob(const T prob, const int num_samples, @@ -146,7 +144,7 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, int s_size = num_samples; framework::DDim s_dim{s_size}; - Tensor s; + phi::DenseTensor s; int64_t* s_data = s.mutable_data(s_dim, platform::CPUPlace()); math::LogUniformSampler sampler(dict_size, seed); diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 2464ac25186f0..7c60be6841552 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; - /* UNDERSTAND: utility function to adjust probability for unique sampling, return the probability unchanged if not using unique sampling */ template diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 65d4a479a4988..53b3b632dd4be 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -24,7 +24,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; template @@ -405,7 +404,7 @@ class SequencePoolFunctor { } auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - Tensor out_t = output->Slice(i, i + 1); + phi::DenseTensor out_t = output->Slice(i, i + 1); int64_t w = input.numel() / input.dims()[0]; if (lod[i] == lod[i + 1]) { for (int j = 0; j < w; ++j) { @@ -413,7 +412,7 @@ class SequencePoolFunctor { } continue; } - Tensor in_t = + phi::DenseTensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); int64_t h = static_cast(lod[i + 1] - lod[i]); auto in_e = EigenMatrix::From(in_t, phi::make_ddim({h, w})); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index c70e1e3e7405a..b7a9b9a19c970 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -23,7 +23,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; template @@ -42,7 +41,7 @@ void SoftmaxCUDNNFunctor::operator()( if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; } - // NOTE(*) : cudnn softmax only support >= 4D Tensor, + // NOTE(*) : cudnn softmax only supports >= 4D phi::DenseTensor, // fill 1 at unused dims if (cudnn_tensor_dims.size() <= 2) { cudnn_tensor_dims.resize(4, 1); } @@ -95,7 +94,7 @@ void SoftmaxGradCUDNNFunctor::operator()( if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; } - // NOTE(*) : cudnn softmax only support >= 4D Tensor, + // NOTE(*) : cudnn softmax only supports >= 4D phi::DenseTensor, // fill 1 at unused dims if (cudnn_tensor_dims.size() <= 2) { cudnn_tensor_dims.resize(4, 1); } diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index 3b467448ac09d..22bdc48768dae 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -20,7 +20,6 @@ namespace paddle { namespace
operators { namespace math { -using Tensor = phi::DenseTensor; using Node = paddle::operators::math::TreeNode; template __global__ void tree2col(const T* eta, @@ -65,7 +64,7 @@ class Tree2ColFunctor { auto feature_dims = node_features.dims(); phi::funcs::SetConstant constant; - Tensor EdgeSet_cpu; + phi::DenseTensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); @@ -83,7 +82,7 @@ class Tree2ColFunctor { } size_t patch_size = processing_list.size(); - Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; + phi::DenseTensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; int* node = node_cpu.mutable_data({static_cast(total_size)}, cpu_place); T* eta = eta_cpu.mutable_data({static_cast(total_size * 3)}, @@ -142,7 +141,7 @@ class Col2TreeFunctor { auto output_dims = patch_grad.dims(); phi::funcs::SetConstant constant; - Tensor EdgeSet_cpu; + phi::DenseTensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); int64_t output_size = output_dims[1]; size_t patch_elem_size = 3 * static_cast(output_size); @@ -168,7 +167,7 @@ class Col2TreeFunctor { total_size += tmp.size(); } - Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; + phi::DenseTensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; int* node = node_cpu.mutable_data({static_cast(total_size)}, cpu_place); T* eta = eta_cpu.mutable_data({static_cast(total_size * 3)}, diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc index e55996903a7d1..84d2f031d4bcb 100644 --- a/paddle/fluid/operators/matmul_op_mlu.cc +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void Mul(const framework::ExecutionContext& ctx, const phi::DenseTensor& X, @@ -183,7 +181,7 @@ class MatMulMLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -281,7 +279,7 @@ class MatMulGradMLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -335,7 +333,7 @@ class MatMulGradMLUKernel : public framework::OpKernel { std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); if (dX) { - Tensor dx_temp(X->type()); + phi::DenseTensor dx_temp(X->type()); if (x_dims != x_bcast_dims) { dx_temp.Resize(phi::make_ddim(x_bcast_dims)); } else { @@ -356,7 +354,7 @@ class MatMulGradMLUKernel : public framework::OpKernel { } if (dY) { - Tensor dy_temp(Y->type()); + phi::DenseTensor dy_temp(Y->type()); if (y_dims != y_bcast_dims) { dy_temp.Resize(phi::make_ddim(y_bcast_dims)); } else { diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc index 31b352b90f6a8..8ab395e8aa3e4 100644 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ b/paddle/fluid/operators/matmul_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -35,7 +34,7 @@ static void Mul(const framework::ExecutionContext& ctx, const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); runner_dx.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); runner_dx.Run(stream); @@ -59,7 +58,7 @@ static void Dot(const framework::ExecutionContext& ctx, const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); runner.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); out_temp_runner.Run(stream); @@ -89,7 +88,7 @@ static void MatMul2D(const framework::ExecutionContext& ctx, {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); runner.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& out_temp_runner = NpuOpRunner("MatMul", @@ -123,7 +122,7 @@ static void MatMulND(const framework::ExecutionContext& ctx, {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); runner.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& out_temp_runner = NpuOpRunner("BatchMatMul", @@ -200,7 +199,7 @@ class MatMulNPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -268,7 +267,7 @@ class MatMulNPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->dtype()); + phi::DenseTensor x_temp_brd(X->dtype()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -283,7 +282,7 @@ class MatMulNPUKernel : public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->dtype()); + phi::DenseTensor y_temp_brd(Y->dtype()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -332,7 +331,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { // Case 1: [K] x [K] = [1] if (x_ndim == 1 && y_ndim == 1) { - Tensor dout_temp(dOut->dtype()); + phi::DenseTensor dout_temp(dOut->dtype()); dout_temp.Resize(X->dims()); dout_temp.mutable_data(ctx.GetPlace()); NpuOpRunner runner; @@ -352,7 +351,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -434,7 +433,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->dtype()); + phi::DenseTensor x_temp_brd(X->dtype()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -449,7 +448,7 @@ class MatMulGradNPUKernel : 
public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->dtype()); + phi::DenseTensor y_temp_brd(Y->dtype()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -480,7 +479,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { alpha); } } else { - Tensor dx_temp(X->dtype()); + phi::DenseTensor dx_temp(X->dtype()); dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); if (transpose_x) { MatMulND(ctx, @@ -520,7 +519,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { alpha); } } else { - Tensor dy_temp(Y->dtype()); + phi::DenseTensor dy_temp(Y->dtype()); dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); if (transpose_y) { MatMulND(ctx, diff --git a/paddle/fluid/operators/matmul_v2_op_mlu.cc b/paddle/fluid/operators/matmul_v2_op_mlu.cc index 134819b7920a0..db7a92409bf6c 100644 --- a/paddle/fluid/operators/matmul_v2_op_mlu.cc +++ b/paddle/fluid/operators/matmul_v2_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void Mul(const framework::ExecutionContext& ctx, const phi::DenseTensor& X, @@ -193,7 +191,7 @@ class MatMulV2MLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -290,7 +288,7 @@ class MatMulGradV2MLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -344,7 +342,7 @@ class MatMulGradV2MLUKernel : public framework::OpKernel { std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); if (dX) { - Tensor dx_temp(X->type()); + phi::DenseTensor dx_temp(X->type()); if (x_dims != x_bcast_dims) { dx_temp.Resize(phi::make_ddim(x_bcast_dims)); } else { @@ -375,7 +373,7 @@ class MatMulGradV2MLUKernel : public framework::OpKernel { ctx, x_temp, dout_temp, dY, !trans_x, false); } } else { - Tensor dy_temp(Y->type()); + phi::DenseTensor dy_temp(Y->type()); if (y_dims != y_bcast_dims) { dy_temp.Resize(phi::make_ddim(y_bcast_dims)); } else { diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index 4df3de71134ed..715171452a987 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -67,7 +66,7 @@ void MatMulND(const framework::ExecutionContext& ctx, const bool trans_x, const bool trans_y) { Out->mutable_data(ctx.GetPlace()); - Tensor x_fp32, y_fp32, out_fp32; + phi::DenseTensor x_fp32, y_fp32, out_fp32; x_fp32.Resize(X.dims()); y_fp32.Resize(Y.dims()); out_fp32.Resize(Out->dims()); @@ -173,7 +172,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -239,7 +238,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->type()); + phi::DenseTensor x_temp_brd(X->type()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -254,7 +253,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->type()); + phi::DenseTensor y_temp_brd(Y->type()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -295,7 +294,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { // Case 1: [K] x [K] = [1] if (x_ndim == 1 && y_ndim == 1) { - Tensor dout_temp(dOut->type()); + phi::DenseTensor dout_temp(dOut->type()); dout_temp.Resize(X->dims()); dout_temp.mutable_data(ctx.GetPlace()); NpuOpRunner runner; @@ -319,7 +318,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -396,7 +395,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->type()); + phi::DenseTensor x_temp_brd(X->type()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -411,7 +410,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->type()); + phi::DenseTensor y_temp_brd(Y->type()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -434,7 +433,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } } else { - Tensor dx_temp(X->type()); + phi::DenseTensor dx_temp(X->type()); dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); if (trans_x) { MatMulND( @@ -454,7 +453,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); } } else { - Tensor dy_temp(Y->type()); + phi::DenseTensor dy_temp(Y->type()); dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); if (trans_y) { MatMulND( diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 7681af011e663..9be97f5ba958e 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template { auto out_correct_t = EigenTensor::From(*out_correct); // Tmp tensor - Tensor denominator; - Tensor valid_count; - Tensor iou_sum; + phi::DenseTensor denominator; + phi::DenseTensor valid_count; + phi::DenseTensor iou_sum; // get data ptr of tmp tensor int* denominator_data = denominator.mutable_data( diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc index 8fea989941c88..e9266b30fcd01 100644 --- a/paddle/fluid/operators/mean_op_mlu.cc +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -20,8 +20,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class MeanMLUKernel : public framework::OpKernel { public: @@ -79,12 +77,13 @@ class MeanMLUGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto output_grad = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(output_grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input Tensor len should be 1. But " - "received Out@Grad's elements num is %d.", - output_grad->numel())); + PADDLE_ENFORCE_EQ( + output_grad->numel(), + 1, + platform::errors::InvalidArgument( + "Mean Gradient Input phi::DenseTensor len should be 1. But " + "received Out@Grad's elements num is %d.", + output_grad->numel())); auto input_grad = context.Output(framework::GradVarName("X")); input_grad->mutable_data(context.GetPlace()); @@ -102,7 +101,7 @@ class MeanMLUGradKernel : public framework::OpKernel { } // means - Tensor mean_var(output_grad->dtype()); + phi::DenseTensor mean_var(output_grad->dtype()); mean_var.mutable_data(input_grad->dims(), context.GetPlace()); MLUCnnlTensorDesc mean_var_desc( mean_var, CNNL_LAYOUT_ARRAY, ToCnnlDataType(mean_var.dtype())); diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index 99fd77dd7f7df..3417045690ff6 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -16,8 +16,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class MeanNPUKernel : public framework::OpKernel { public: @@ -51,31 +49,32 @@ class MeanGradNPUKernel : public framework::OpKernel { auto grad = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input Tensor len should be 1. But " - "received Out@Grad's elements num is %d.", - grad->numel())); + PADDLE_ENFORCE_EQ( + grad->numel(), + 1, + platform::errors::InvalidArgument( + "Mean Gradient Input phi::DenseTensor len should be 1. 
But " + "received Out@Grad's elements num is %d.", + grad->numel())); auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); // ones - Tensor ones(grad->dtype()); + phi::DenseTensor ones(grad->dtype()); ones.mutable_data(IG->dims(), context.GetPlace()); const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); runner_ones.Run(stream); // means - Tensor mean_tensor(grad->dtype()); + phi::DenseTensor mean_tensor(grad->dtype()); mean_tensor.Resize({1}); mean_tensor.mutable_data(context.GetPlace()); FillNpuTensorWithConstant( &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); // means mul ones - Tensor mean_ma(grad->dtype()); + phi::DenseTensor mean_ma(grad->dtype()); mean_ma.Resize(IG->dims()); mean_ma.mutable_data(context.GetPlace()); const auto& runner_mul_1 = diff --git a/paddle/fluid/operators/meshgrid_op_mlu.cc b/paddle/fluid/operators/meshgrid_op_mlu.cc index 76beb021bc654..f0103afbb0bc5 100644 --- a/paddle/fluid/operators/meshgrid_op_mlu.cc +++ b/paddle/fluid/operators/meshgrid_op_mlu.cc @@ -24,12 +24,12 @@ class MeshgridMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); - PADDLE_ENFORCE_EQ( - (ins.size() > 1) && (ins.size() < 7), - true, - platform::errors::InvalidArgument( - "Excepted Tensor numbers between 2 and 6, but only received d% .", - ins.size())); + PADDLE_ENFORCE_EQ((ins.size() > 1) && (ins.size() < 7), + true, + platform::errors::InvalidArgument( + "Excepted phi::DenseTensor numbers between 2 and 6, " + "but only received d% .", + ins.size())); int64_t size = ins.size(); std::vector shape(size); diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index ec78fb09eab30..b66966ac64b90 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -36,8 +36,8 @@ class AccuracyMLUKernel : public framework::OpKernel { } // cast `indices` or `label` if their type is not INT32 - Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); - Tensor label_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32)); auto indices_type = framework::TransToProtoVarType(indices->type()); if (indices_type != VT::INT32) { PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32), @@ -89,7 +89,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // equal MLUCnnlTensorDesc indices_int32_desc(indices_int32); MLUCnnlTensorDesc label_int32_desc(label_int32); - Tensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); + phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); equal_tensor.Resize(indices->dims()); equal_tensor.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_tensor_desc(equal_tensor); @@ -103,7 +103,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&equal_tensor)); // cast equal - Tensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); equal_fp32.Resize(indices->dims()); equal_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_fp32_desc(equal_fp32); @@ -117,7 +117,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // [correct] // reduce_max - Tensor 
correct_max(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32)); correct_max.Resize(phi::make_ddim({num_samples})); correct_max.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_max_desc(correct_max); @@ -140,7 +140,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&correct_max)); // reduce_sum - Tensor correct_sum(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32)); correct_sum.Resize(correct->dims()); correct_sum.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_sum_desc(correct_sum); @@ -183,7 +183,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(total)); // use `total` of type `float32` for calculating accuracy - Tensor total_fp32(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32)); total_fp32.Resize(total->dims()); total_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_fp32_desc(total_fp32); diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 4c83071264a42..737228902b6e7 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h index 55be510dcd237..bec8bba09ad1a 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.h +++ b/paddle/fluid/operators/metrics/precision_recall_op.h @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index d27234344ff27..146ee52fc62ff 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -26,7 +26,6 @@ namespace operators { using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using Tensor = phi::DenseTensor; using dnnl::stream; template diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index be965c4abb895..c2556b6bfc41d 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,16 +20,15 @@ limitations under the License. */ namespace { using dnnl::memory; using paddle::framework::ExecutionContext; +using paddle::framework::GradVarName; using paddle::platform::MatMulV2MKLDNNHandler; using phi::OneDNNContext; using phi::vectorize; using phi::funcs::OneDNNGetDataType; -using Tensor = phi::DenseTensor; -using paddle::framework::GradVarName; // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. -static Tensor FoldOuterDims(const Tensor &input) { +static phi::DenseTensor FoldOuterDims(const phi::DenseTensor &input) { auto output = input; auto in_dims = input.dims(); if (in_dims.size() == 3) { @@ -42,14 +41,14 @@ static Tensor FoldOuterDims(const Tensor &input) { // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. 
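A note on the two folds before the definition that follows: FoldOuterDims above is a pure reshape (a P x M x N tensor is viewed as (P*M) x N over the same memory), whereas folding the first and last dims requires physically swapping the first two axes into new storage, which is what the warning refers to. A plain-loop sketch of that transpose (a hypothetical helper; the real code below uses a oneDNN reorder primitive):

#include <cstddef>
#include <vector>

// Copy a row-major P x M x N array into M x P x N layout; the result can
// then be viewed as M x (P*N) without further copying.
std::vector<float> SwapOuterAxes(const std::vector<float>& in,
                                 int P, int M, int N) {
  std::vector<float> out(in.size());
  for (int p = 0; p < P; ++p) {
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        // out(m, p, n) <- in(p, m, n)
        out[(static_cast<std::size_t>(m) * P + p) * N + n] =
            in[(static_cast<std::size_t>(p) * M + m) * N + n];
      }
    }
  }
  return out;
}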
template -static Tensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, - const Tensor *input) { +static phi::DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, + const phi::DenseTensor *input) { auto input_dims = vectorize(input->dims()); if (input_dims.size() != 3) { return *input; } - Tensor output; + phi::DenseTensor output; output.Resize({input_dims[1], input_dims[0], input_dims[2]}); auto output_dims = vectorize(output.dims()); @@ -89,11 +88,11 @@ class MatMulMKLDNNHandler public: MatMulMKLDNNHandler(const dnnl::engine engine, paddle::platform::Place cpu_place, - Tensor *x, + phi::DenseTensor *x, bool trans_x, - Tensor *y, + phi::DenseTensor *y, bool trans_y, - Tensor *out, + phi::DenseTensor *out, float scale) : phi::funcs::OneDNNHandlerNoCachingT(engine, cpu_place) { @@ -129,7 +128,7 @@ class MatMulMKLDNNHandler this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); } - std::shared_ptr AcquireWeightsMemory(const Tensor *input) { + std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor *input) { const YT *input_data = input->data(); return this->AcquireMemoryFromPrimitive( this->fwd_pd_->weights_desc(), @@ -176,11 +175,10 @@ class MatMulMKLDNNHandler // We cannot use the base AcquireDstMemory as it makes an allocation request // based on DST memory primitive size. This is fine in general, but in MatMul // we have a primitive that covers only one batch of data and then shifts the - // pointer for every new batch. Hence Tensor size is bigger that dst memory - // primitive size. So would we request less memory that is there and it - // triggers an - // assertion. So as there is no 'any' format here we can leave default size - // of Tensor as computed in ComputeInferShape + // pointer for every new batch. Hence the phi::DenseTensor size is bigger + // than the dst memory primitive size, so we would request less memory than + // is there, which triggers an assertion. As there is no 'any' format here, + // we can leave the default size of phi::DenseTensor as computed in ComputeInferShape OT *ptr = output->mutable_data(this->place_); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -199,7 +197,7 @@ class MatMulMKLDNNHandler * If transposed, `H,W` will be swapped. */ static void ReshapeTensorToMatrixSequence( - Tensor *x, const phi::funcs::MatDescriptor &descriptor) { + phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { int64_t h, w; h = descriptor.height_; w = descriptor.width_; @@ -227,8 +225,11 @@ static void ReshapeTensorToMatrixSequence( * If either `X` or `Y` has batch size BatchSize, the output will have batch * size BatchSize.
*/ -static void ReshapeXYOutToMatrixSequence( - Tensor *x, Tensor *y, Tensor *out, bool trans_x, bool trans_y) { +static void ReshapeXYOutToMatrixSequence(phi::DenseTensor *x, + phi::DenseTensor *y, + phi::DenseTensor *out, + bool trans_x, + bool trans_y) { auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); @@ -326,13 +327,13 @@ bool IsOutputFused(const ExecutionContext &ctx) { template void ExecuteMatMulV2(const ExecutionContext &ctx, const dnnl::engine onednn_engine, - const Tensor *x, + const phi::DenseTensor *x, const std::vector &x_dims, bool trans_x, - const Tensor *y, + const phi::DenseTensor *y, const std::vector &y_dims, bool trans_y, - Tensor *out) { + phi::DenseTensor *out) { std::vector x_strides_override = GetInputStrides(ctx, "X"); std::vector y_strides_override = GetInputStrides(ctx, "Y"); MatMulV2MKLDNNHandler handler(ctx, @@ -471,7 +472,7 @@ class MatMulMKLDNNKernel : public paddle::framework::OpKernel { const std::vector &y_dims, std::vector *x_bd_dims, std::vector *y_bd_dims, - Tensor *out) const { + phi::DenseTensor *out) const { if (x_dims.size() == 1) { (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { @@ -501,7 +502,7 @@ class MatMulMKLDNNKernel : public paddle::framework::OpKernel { (*y_bd_dims)[i] == 1, true, paddle::platform::errors::InvalidArgument( - "Tensor dimensions are incorrect for broadcasting." + "phi::DenseTensor dimensions are incorrect for broadcasting. " "Dimensions in X and Y must be the same or equal to 1, but " "received x_dim[%d]=%d and y_dims[%d]= %d", i, @@ -649,7 +650,7 @@ class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel { bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && out->dims().size() == 2; - Tensor x_combined, y_combined; + phi::DenseTensor x_combined, y_combined; if (!need_combine) { x_combined = *x; y_combined = *y; diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 098623ea52466..c23f247c9d212 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -25,7 +25,6 @@ namespace operators { using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using Tensor = phi::DenseTensor; using dnnl::stream; using phi::DataLayout; diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 36498e60f4e54..a9408ad38e3a1 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -25,7 +25,6 @@ namespace operators { using dnnl::memory; using dnnl::reorder; -using Tensor = phi::DenseTensor; namespace { diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index ff2484c7ced38..0c2b439b3e510 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -40,7 +40,7 @@ static std::vector extract_shape( tensor->dims(), phi::make_ddim({1}), platform::errors::InvalidArgument( - "If the element type of 'shape' in ReshapeOp is Tensor, " + "If the element type of 'shape' in ReshapeOp is phi::DenseTensor, " "the element's shape must be [1].
But received the element's shape " "is [%s]", tensor->dims())); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 2c5b269c3923b..077107dca68f3 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -21,7 +21,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::DataLayout; using phi::OneDNNContext; @@ -37,8 +36,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& dnnl_engine = dev_ctx.GetEngine(); std::vector transpose_axis = ctx.Attr>("axis"); int ndims = transpose_axis.size(); - const phi::DenseTensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const phi::DenseTensor* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto& astream = OneDNNContext::tls().get_stream(); @@ -122,8 +121,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL TransposeGrad must use CPUPlace")); - const auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + const auto* dout = + ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); if (!dx) return; auto& dev_ctx = ctx.template device_context(); const auto& dnnl_engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index d205bc2b2554d..09b1551086fab 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -386,7 +386,7 @@ MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor, const mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype) { auto dims = phi::vectorize(tensor.dims()); @@ -407,11 +407,11 @@ MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, } } -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor) +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor) : MLUOpTensorDesc( tensor, MLUOP_LAYOUT_ARRAY, ToMluOpDataType(tensor.dtype())) {} -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor, mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype, int position) @@ -420,7 +420,7 @@ MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor, mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype, int position, @@ -562,7 +562,7 @@ const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const { return mlu_generator; } -Tensor& MLUCnnlRandomGeneratorDesc::get_state() { return mlu_state; } +phi::DenseTensor& MLUCnnlRandomGeneratorDesc::get_state() { return mlu_state; } MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() { if (mlu_generator) { @@ -953,7 +953,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = 
workspace.mutable_data(ctx.GetPlace()); @@ -981,7 +981,7 @@ MLURNNDesc::~MLURNNDesc() { PADDLE_ENFORCE_MLU_SUCCESS( cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); - Tensor workspace(paddle::experimental::DataType::INT8); + phi::DenseTensor workspace(paddle::experimental::DataType::INT8); workspace.Resize(framework::DDim({static_cast(workspace_size)})); void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); @@ -1011,7 +1011,7 @@ MLURNNDesc::~MLURNNDesc() { handle, in0_desc, in1_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1067,7 +1067,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_quant_desc, output_desc, local_size, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1104,7 +1104,7 @@ MLURNNDesc::~MLURNNDesc() { // use ctx allocate interface for profiling purpose auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1580,7 +1580,7 @@ MLURNNDesc::~MLURNNDesc() { handle, in0_desc, in1_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1634,7 +1634,7 @@ MLURNNDesc::~MLURNNDesc() { handle, a_desc, b_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1665,7 +1665,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetAxWorkspaceSize(handle, alpha_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1754,7 +1754,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1803,7 +1803,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1831,7 +1831,7 @@ MLURNNDesc::~MLURNNDesc() { PADDLE_ENFORCE_MLU_SUCCESS( cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); - Tensor workspace(paddle::experimental::DataType::INT8); + phi::DenseTensor workspace(paddle::experimental::DataType::INT8); workspace.Resize(framework::DDim({static_cast(workspace_size)})); void* 
workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); @@ -1947,7 +1947,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1979,7 +1979,7 @@ MLURNNDesc::~MLURNNDesc() { handle, condition_desc, then_desc, else_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2023,7 +2023,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2162,7 +2162,7 @@ MLURNNDesc::~MLURNNDesc() { handle, pool_mode, output_w, output_h, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2217,7 +2217,7 @@ MLURNNDesc::~MLURNNDesc() { handle, pool_mode, output_shape[2], output_shape[1], &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2270,7 +2270,7 @@ MLURNNDesc::~MLURNNDesc() { handle, data_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2359,7 +2359,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2433,7 +2433,7 @@ MLURNNDesc::~MLURNNDesc() { size_t workspace_size = 0; void* workspace_ptr = nullptr; - Tensor workspace; + phi::DenseTensor workspace; if (need_workspace) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetReduceOpWorkspaceSize( handle, input_desc, output_desc, reduction_desc, &workspace_size)); @@ -2473,7 +2473,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2502,7 +2502,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2530,7 +2530,7 @@ MLURNNDesc::~MLURNNDesc() { 
cnnlGetMaximumWorkspaceSize(handle, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2558,7 +2558,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetMinimumWorkspaceSize(handle, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2587,7 +2587,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2617,7 +2617,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2647,7 +2647,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2677,7 +2677,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2944,7 +2944,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetDynamicStitchWorkspaceSize(handle, size, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3203,7 +3203,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetNmsWorkspaceSize_v2(handle, confidence_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3252,7 +3252,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3427,7 +3427,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetLayerNormOpWorkspaceSize(handle, axis, x_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3499,7 +3499,7 @@ 
MLURNNDesc::~MLURNNDesc() { cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3565,7 +3565,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3708,7 +3708,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlMakeFusedOpsPlan(handle, fusion_plan, cparam_pack, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3773,7 +3773,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3839,7 +3839,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3899,7 +3899,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3967,7 +3967,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4021,7 +4021,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4079,7 +4079,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4135,7 +4135,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4207,7 +4207,7 @@ MLURNNDesc::~MLURNNDesc() { handle, matmul_desc, a_desc, b_desc, output_desc, algo, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4295,7 
+4295,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4399,7 +4399,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4450,7 +4450,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, perm_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4517,7 +4517,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetWhereWorkspaceSize(handle, num_true_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4593,7 +4593,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4623,7 +4623,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetQRWorkspaceSize(handle, a_desc, some, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4667,7 +4667,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4705,7 +4705,7 @@ MLURNNDesc::~MLURNNDesc() { handle, target_desc, weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4742,7 +4742,7 @@ MLURNNDesc::~MLURNNDesc() { handle, x_desc, algorithm, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4778,7 +4778,7 @@ MLURNNDesc::~MLURNNDesc() { handle, x_desc, algorithm, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4857,7 +4857,7 @@ MLURNNDesc::~MLURNNDesc() { handle, diff_desc, output_desc, scale_grad_by_freq, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = 
ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4903,7 +4903,7 @@ MLURNNDesc::~MLURNNDesc() { "MLU RNNForward failed. x_desc initializing failed.")); auto& dev_ctx = GetDevCtxFromCTX(ctx); size_t workspace_size, reservespace_size; - Tensor workspace; + phi::DenseTensor workspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNTempSizes( handle, rnn_desc, x_desc, &workspace_size, &reservespace_size)); workspace = ctx.AllocateTmpTensor( @@ -4967,7 +4967,7 @@ MLURNNDesc::~MLURNNDesc() { "MLU RNNForward failed. x_desc initializing failed.")); auto& dev_ctx = GetDevCtxFromCTX(ctx); size_t workspace_size; - Tensor workspace; + phi::DenseTensor workspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNTempSizes( handle, rnn_desc, x_desc, &workspace_size, &reservespace_size)); workspace = ctx.AllocateTmpTensor( @@ -5028,7 +5028,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlHandle_t handle = GetHandleFromCTX(ctx); auto& dev_ctx = GetDevCtxFromCTX(ctx); size_t workspace_size; - Tensor workspace; + phi::DenseTensor workspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetMaskedWorkspaceSize(handle, masked_mode, input_desc, @@ -5075,7 +5075,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, weight_desc, pos_weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -5119,7 +5119,7 @@ MLURNNDesc::~MLURNNDesc() { handle, target_desc, weight_desc, pos_weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -5227,7 +5227,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, grid_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 8fbaade9dc01b..413158f441a7b 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -29,7 +29,6 @@ limitations under the License. 
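Every mlu_baseop.cc hunk above rewrites the same idiom: query the CNNL op for its scratch-buffer size, allocate a temporary byte tensor of exactly that size, and hand the raw pointer to the kernel. A minimal self-contained sketch of that idiom follows; the types are illustrative stand-ins, not Paddle's real MLUDeviceContext/AllocateTmpTensor API.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for phi::DenseTensor, for illustration only: the real class
// carries dtype, layout and an allocator-backed buffer.
struct DenseTensor {
  std::vector<int8_t> buf;
  void Resize(std::int64_t n) { buf.resize(static_cast<std::size_t>(n)); }
  void* mutable_data() { return buf.data(); }
};

// The idiom the hunks rewrite: ask the kernel how much scratch it needs,
// then allocate a byte tensor of exactly that size for the call.
DenseTensor AllocateWorkspace(std::size_t workspace_size) {
  DenseTensor workspace;
  workspace.Resize(static_cast<std::int64_t>(workspace_size));
  return workspace;
}

int main() {
  std::size_t workspace_size = 256;  // in Paddle this comes from cnnlGet*WorkspaceSize
  DenseTensor ws = AllocateWorkspace(workspace_size);
  void* workspace_ptr = ws.mutable_data();  // handed to the MLU kernel
  return workspace_ptr == nullptr;          // 0 on success
}
```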
 */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 using DataLayout = phi::DataLayout;
 using ExecutionContext = framework::ExecutionContext;
 using DeviceContextPool = platform::DeviceContextPool;
@@ -377,18 +376,18 @@ class MLUOpTensorDesc {
                   const mluOpDataType_t tensor_dtype,
                   int position);
 
-  MLUOpTensorDesc(const Tensor& tensor,
+  MLUOpTensorDesc(const phi::DenseTensor& tensor,
                   const mluOpTensorLayout_t layout,
                   const mluOpDataType_t tensor_dtype);
 
-  explicit MLUOpTensorDesc(const Tensor& tensor);
+  explicit MLUOpTensorDesc(const phi::DenseTensor& tensor);
 
-  MLUOpTensorDesc(const Tensor& tensor,
+  MLUOpTensorDesc(const phi::DenseTensor& tensor,
                   mluOpTensorLayout_t layout,
                   const mluOpDataType_t tensor_dtype,
                   int position);
 
-  MLUOpTensorDesc(const Tensor& tensor,
+  MLUOpTensorDesc(const phi::DenseTensor& tensor,
                   mluOpTensorLayout_t layout,
                   const mluOpDataType_t tensor_dtype,
                   int position,
@@ -458,11 +457,11 @@ class MLUCnnlRandomGeneratorDesc {
  public:
   MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed);
   const cnnlRandGenerator_t get() const;
-  Tensor& get_state();
+  phi::DenseTensor& get_state();
   ~MLUCnnlRandomGeneratorDesc();
 
  private:
-  Tensor mlu_state;
+  phi::DenseTensor mlu_state;
   cnnlRandGenerator_t mlu_generator = nullptr;
 };
 
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
index 330f4ca3596bd..bd4451ebda46d 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
@@ -23,8 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
-
 struct ModifiedHuberLossBackward {
   template <typename Tuple>
   HOSTDEVICE void operator()(Tuple t) const {
diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h
index 50d5a14548e35..62600ed7c6970 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
@@ -21,7 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 
 template <typename T>
diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc
index b83bc8ea6541b..483c2bda72efa 100644
--- a/paddle/fluid/operators/multi_dot_op.cc
+++ b/paddle/fluid/operators/multi_dot_op.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
 
 class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc
index 206c7b041a9b3..74f3578c6e8d4 100644
--- a/paddle/fluid/operators/multinomial_op_npu.cc
+++ b/paddle/fluid/operators/multinomial_op_npu.cc
@@ -22,8 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = phi::DenseTensor;
-
 template <typename DeviceContext, typename T>
 class NPUMultinomialKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 749849a333f3d..ba263427caa87 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -23,8 +23,6 @@ limitations under the License. 
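The mlu_baseop.h hunk above is the heart of this cleanup: the file-scope alias `using Tensor = phi::DenseTensor;` is deleted and every declaration spells the type out. A compilable toy example of why the qualified name reads better once several Tensor-like types coexist; all types below are invented for illustration.

```cpp
#include <iostream>

namespace phi {
struct DenseTensor { int numel = 0; };
}  // namespace phi

namespace other {
struct Tensor { double value = 0.0; };  // some unrelated Tensor-like type
}  // namespace other

// With a file-scope `using Tensor = phi::DenseTensor;` in place, a reader of
// `NumElements(const Tensor&)` must chase the alias to learn which tensor
// type is meant; the fully qualified spelling is self-describing.
int NumElements(const phi::DenseTensor& t) { return t.numel; }

int main() {
  phi::DenseTensor dense{42};
  std::cout << NumElements(dense) << "\n";  // prints 42
  return 0;
}
```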
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MultiplexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index a4b418b14cc84..4b9fe86b22565 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -31,7 +31,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using Sampler = math::Sampler; using DDim = framework::DDim; @@ -44,7 +43,7 @@ using EigenMatrix = framework::EigenMatrix; template void PrepareSamples(const framework::ExecutionContext &context, Sampler *sampler, - Tensor *sample_labels) { + phi::DenseTensor *sample_labels) { auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); @@ -154,9 +153,9 @@ class NCEKernel : public framework::OpKernel { std::vector sample_out_dims; auto label = context.Input("Label"); - Tensor *sample_labels; - Tensor *sample_out; - Tensor sample_labels_tmp, sample_out_tmp; + phi::DenseTensor *sample_labels; + phi::DenseTensor *sample_out; + phi::DenseTensor sample_labels_tmp, sample_out_tmp; if (is_test) { // set dims of output(SampleOut) int num_true_classes = label->dims().size() == 2 ? label->dims()[1] : 1; @@ -339,7 +338,7 @@ class NCEGradKernel : public framework::OpKernel { } // T b = 1. / num_total_classes * num_neg_samples; - Tensor sample_grad; // tmp tensor + phi::DenseTensor sample_grad; // tmp tensor T *sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); // backward cost diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc index c5f0749227e23..619f902513459 100644 --- a/paddle/fluid/operators/norm_op_npu.cc +++ b/paddle/fluid/operators/norm_op_npu.cc @@ -16,7 +16,6 @@ namespace paddle { namespace operators { using DDim = framework::DDim; -using Tensor = phi::DenseTensor; void CheckAxis(int axis, int rank) { // check the axis is in [-rank, rank-1] diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index b6e27e6b54151..2412913995b95 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -37,7 +37,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, @@ -433,21 +432,21 @@ __global__ void DoubleGradComputeDDYWithGlobal(const T *ddx, template void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, - const Tensor *X, - const Tensor *Scale, - const Tensor *dY, - const Tensor *Saved_mean, - const Tensor *Saved_variance, - const Tensor *Mean, - const Tensor *Variance, + const phi::DenseTensor *X, + const phi::DenseTensor *Scale, + const phi::DenseTensor *dY, + const phi::DenseTensor *Saved_mean, + const phi::DenseTensor *Saved_variance, + const phi::DenseTensor *Mean, + const phi::DenseTensor *Variance, const double epsilon, const bool use_global_stats, - const Tensor *ddX, - const Tensor *ddScale, - const Tensor *ddBias, - Tensor *dX, - Tensor *dScale, - Tensor *ddY) { + const phi::DenseTensor *ddX, + const phi::DenseTensor *ddScale, + const phi::DenseTensor *ddBias, + phi::DenseTensor *dX, + phi::DenseTensor *dScale, + phi::DenseTensor *ddY) { const T 
*x_data = X->data(); const T *dy_data = dY->data(); const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); @@ -463,7 +462,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, const int N = x_dims[0]; const int num = X->numel(); const int sample_size = num / N / C; - Tensor scale_tmp; + phi::DenseTensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); set_constant(ctx, &scale_tmp, static_cast(1)); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 99623917d59ee..fdab03698711c 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -37,8 +37,6 @@ static inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } -using Tensor = phi::DenseTensor; - template __global__ void initialize_zero_kernel(T* data, const int length) { CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index d878fd5a6d44b..41ec3eb9a135f 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -76,7 +76,6 @@ struct OneHotOpFunctor { } }; -using Tensor = phi::DenseTensor; template class OneHotKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc index e2997dc079c61..35e8bcde9daad 100644 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class OneHotNPUKernel : public framework::OpKernel { @@ -54,7 +53,7 @@ class OneHotNPUKernel : public framework::OpKernel { .AddOutput(*out); runner.Run(dev_ctx.stream()); } else { - Tensor transformed_in; + phi::DenseTensor transformed_in; transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); const auto& cast_runner = NpuOpRunner( "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 66826cd4ff33a..e4f8555fceae2 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -22,8 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class OneHotXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc index f98cbabf58a87..0b2fbfe85d403 100644 --- a/paddle/fluid/operators/one_hot_v2_op_mlu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc @@ -19,7 +19,6 @@ limitations under the License. 
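NormDoubleGradFunctor above accepts several dispensable inputs (ddX, ddScale, ddBias) that may be null, and extracts each data pointer defensively before launching. A small stand-in sketch of that guard pattern; the tensor type below is a placeholder, not phi::DenseTensor itself.

```cpp
#include <cstdio>

struct DenseTensor {
  const float* ptr = nullptr;
  const float* data() const { return ptr; }
};

// Pattern from the functor above: optional inputs arrive as possibly null
// pointers, so each data pointer is extracted defensively before launch.
void Launch(const DenseTensor* X, const DenseTensor* ddX) {
  const float* x_data = X->data();
  const float* ddx_data = (ddX == nullptr) ? nullptr : ddX->data();
  std::printf("x=%p ddx=%p\n", static_cast<const void*>(x_data),
              static_cast<const void*>(ddx_data));
}

int main() {
  float buf[4] = {1, 2, 3, 4};
  DenseTensor x{buf};
  Launch(&x, /*ddX=*/nullptr);  // ddX is optional, like AsDispensable inputs
  return 0;
}
```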
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class OneHotV2MLUKernel : public framework::OpKernel { @@ -44,10 +43,12 @@ class OneHotV2MLUKernel : public framework::OpKernel { float on_value = 1.0f, off_value = 0.0f; const int in_off_dim[1] = {1}; - Tensor on_value_tensor = ctx.AllocateTmpTensor( - framework::DDim(in_off_dim, 1), dev_ctx); - Tensor off_value_tensor = ctx.AllocateTmpTensor( - framework::DDim(in_off_dim, 1), dev_ctx); + phi::DenseTensor on_value_tensor = + ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + phi::DenseTensor off_value_tensor = + ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor); FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor); @@ -64,7 +65,7 @@ class OneHotV2MLUKernel : public framework::OpKernel { ToCnnlDataType(out->dtype()), GetBasePtr(out)); } else { - Tensor transformed_in; + phi::DenseTensor transformed_in; transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); // use cnnlCast to cast int64_t to int32_t then do one_hot MLUCnnlTensorDesc in_desc(*in); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index 8cc97b417ca78..d305a04ea0782 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class OneHotV2NPUKernel : public framework::OpKernel { @@ -53,7 +52,7 @@ class OneHotV2NPUKernel : public framework::OpKernel { .AddOutput(*out); runner.Run(dev_ctx.stream()); } else { - Tensor transformed_in; + phi::DenseTensor transformed_in; transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); const auto& cast_runner = NpuOpRunner( "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 4390da3c4e479..262aa0fc350e2 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 4f800233c24fe..54643a39bcd4c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -25,7 +25,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class AdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index aa331df4cbd0c..cf447bc593103 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -19,8 +19,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc index c9c33643d1ee5..d998cff14126c 100644 --- a/paddle/fluid/operators/optimizers/adam_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AdamMLUKernel : public framework::OpKernel { public: @@ -156,9 +154,9 @@ class AdamMLUKernel : public framework::OpKernel { const phi::DenseTensor* beta2_tensor = nullptr; const phi::DenseTensor* epsilon_tensor = nullptr; - Tensor beta1_tmp(experimental::DataType::FLOAT32); - Tensor beta2_tmp(experimental::DataType::FLOAT32); - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta1_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta2_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); if (ctx.HasInput("Beta1Tensor")) { beta1_tensor = ctx.Input("Beta1Tensor"); @@ -462,9 +460,9 @@ class MergedAdamMLUKernel : public framework::OpKernel { const phi::DenseTensor* beta2_tensor = nullptr; const phi::DenseTensor* epsilon_tensor = nullptr; - Tensor beta1_tmp(experimental::DataType::FLOAT32); - Tensor beta2_tmp(experimental::DataType::FLOAT32); - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta1_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta2_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index f94b32413a04a..356bef435e45c 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AdamNPUKernel : public framework::OpKernel { public: @@ -132,9 +130,9 @@ class AdamNPUKernel : public framework::OpKernel { const phi::DenseTensor* beta2_tensor = nullptr; const phi::DenseTensor* epsilon_tensor = nullptr; - Tensor beta1_tmp(experimental::DataType::FLOAT32); - Tensor beta2_tmp(experimental::DataType::FLOAT32); - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta1_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta2_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); if (ctx.HasInput("Beta1Tensor")) { beta1_tensor = ctx.Input("Beta1Tensor"); @@ -286,9 +284,9 @@ class AdamWNPUKernel : public AdamNPUKernel { ctx.template device_context() .stream(); - Tensor one(experimental::DataType::FLOAT32); - Tensor decay(experimental::DataType::FLOAT32); - Tensor tmp(experimental::DataType::FLOAT32); + phi::DenseTensor one(experimental::DataType::FLOAT32); + phi::DenseTensor decay(experimental::DataType::FLOAT32); + phi::DenseTensor tmp(experimental::DataType::FLOAT32); tmp.mutable_data({1}, place); one.mutable_data({1}, place); diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 5298030f17a04..12429933e03d3 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 5ab3ef3b2e61c..6c73439c62551 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class DecayedAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index e866a97f1ddcc..f5710f2e7d8eb 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class DpsgdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index b81a6c5ab6bb7..22be1f5ac685a 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
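The Adam MLU/NPU kernels above keep fallback tensors such as beta1_tmp because the device ops consume tensors rather than host scalars: when no Beta1Tensor input is provided, the beta1 attribute is materialized into a one-element tensor. A stand-in sketch of that materialization; FillTensorWithConstant below is a hypothetical substitute for Paddle's fill helpers.

```cpp
#include <cassert>
#include <vector>

// Stand-ins: the real kernels use phi::DenseTensor plus helpers such as
// FillNpuTensorWithConstant / FillMLUTensorWithHostValue.
struct DenseTensor {
  std::vector<float> buf;
  void Resize(int n) { buf.resize(n); }
};

void FillTensorWithConstant(DenseTensor* t, float v) {  // hypothetical helper
  for (auto& e : t->buf) e = v;
}

int main() {
  float beta1 = 0.9f;        // the host-side attribute
  DenseTensor beta1_tmp;
  beta1_tmp.Resize(1);       // shape {1}, as in the kernels above
  FillTensorWithConstant(&beta1_tmp, beta1);
  assert(beta1_tmp.buf[0] == 0.9f);
  return 0;
}
```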
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FTRLOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 97b1a09766b68..99e210ce51e96 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc index 8e4ff40372a12..867cfe0268c51 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cc +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MergedAdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index c390a12863bc4..ea74dba1c54d1 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -134,7 +134,8 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + phi::DenseTensor mu_tensor = + ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, @@ -158,7 +159,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { auto velocity_out = velocitys_out[idx]; auto grad = grads[idx]; - Tensor regularized_grad; + phi::DenseTensor regularized_grad; MLUCnnlTensorDesc param_desc(*param_out); if (regularization_flag == phi::RegularizationType::kL2DECAY) { regularized_grad = ctx.AllocateTmpTensor( diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index 4171f0c11955a..538028139b8c4 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MomentumOpInferVarType : public framework::VarTypeInference { public: void operator()(framework::InferVarTypeContext* ctx) const override { @@ -38,24 +36,24 @@ class MomentumOpInferVarType : public framework::VarTypeInference { void MomentumOpMaker::Make() { AddInput("Param", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input parameter that has to be updated"); AddInput("Grad", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input gradient of the parameter"); AddInput("Velocity", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input velocity (corresponding to the parameter) " "that has to be updated"); AddInput("LearningRate", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input learning rate"); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", - "(Tensor) This output is updated parameter. 
" + "(phi::DenseTensor) This output is updated parameter. " "It shared memory with Input(Param)."); AddOutput("VelocityOut", - "(Tensor) This output is updated velocity. " + "(phi::DenseTensor) This output is updated velocity. " "It shared memory with Input(Velocity)."); AddOutput("MasterParamOut", "The updated FP32 master weight for AMP. " diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index 4bebb4264cc29..b37e7aa99f793 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -50,7 +50,7 @@ class MLUMomentumOpKernel : public framework::OpKernel { auto* grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { auto grad = ctx.Input("Grad"); - Tensor mu_tensor = + phi::DenseTensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); MLUCnnl::Fill(ctx, @@ -59,7 +59,7 @@ class MLUMomentumOpKernel : public framework::OpKernel { mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); - Tensor regularized_grad; + phi::DenseTensor regularized_grad; MLUCnnlTensorDesc param_desc(*param); if (regularization_flag == phi::RegularizationType::kL2DECAY) { regularized_grad = diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 2da5bed7642c1..598b84415f9ec 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ProximalAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h index 136e416307ab0..72eccd17e4489 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ProximalAdagradOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 061e495c4bacd..21b145ee49d7c 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ProximalGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h index 024062045ae43..49cf7b68bd32a 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.h +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ProximalGDOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc index 579bc76be5f47..abbe7ddcc5b61 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc @@ -15,8 +15,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class RMSPROPNPUKernel : public framework::OpKernel { public: @@ -46,18 +44,18 @@ class RMSPROPNPUKernel : public framework::OpKernel { auto *grad_tensor = ctx.Input("Grad"); if (centered) { framework::NPUAttributeMap attr_input = {{"use_locking", false}}; - const Tensor *rho_tensor = nullptr; - const Tensor *momentum_tensor = nullptr; - const Tensor *epsilon_tensor = nullptr; - Tensor rho_tmp(experimental::DataType::FLOAT32); + const phi::DenseTensor *rho_tensor = nullptr; + const phi::DenseTensor *momentum_tensor = nullptr; + const phi::DenseTensor *epsilon_tensor = nullptr; + phi::DenseTensor rho_tmp(experimental::DataType::FLOAT32); rho_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&rho_tmp, rho); rho_tensor = &rho_tmp; - Tensor momentum_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor momentum_tmp(experimental::DataType::FLOAT32); momentum_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&momentum_tmp, momentum); momentum_tensor = &momentum_tmp; - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); epsilon_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&epsilon_tmp, epsilon); epsilon_tensor = &epsilon_tmp; diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index 3e072a5e17a64..f59171e3ae7c4 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -19,8 +19,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SparseMomentumOpInferVarType : public framework::VarTypeInference { public: void operator()(framework::InferVarTypeContext* ctx) const override { @@ -36,30 +34,31 @@ class SparseMomentumOpInferVarType : public framework::VarTypeInference { void SparseMomentumOpMaker::Make() { AddInput("Param", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input parameter that has to be updated"); AddInput("Grad", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input gradient of the parameter"); AddInput("Velocity", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input velocity (corresponding to the parameter) " "that has to be updated"); AddInput("Index", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input index of Param to do update operation"); AddInput("Axis", - "The Tensor which contains the axis that we do update operation.") + "The phi::DenseTensor which contains the axis that we do update " + "operation.") .AsDispensable(); AddInput("LearningRate", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input learning rate"); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", - "(Tensor) This output is updated parameter. 
" + "(phi::DenseTensor) This output is updated parameter. " "It shared memory with Input(Param)."); AddOutput("VelocityOut", - "(Tensor) This output is updated velocity. " + "(phi::DenseTensor) This output is updated velocity. " "It shared memory with Input(Velocity)."); AddOutput("MasterParamOut", "The updated FP32 master weight for AMP. " diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc index 9d312dd572a45..5c86cf188c021 100644 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ b/paddle/fluid/operators/p_norm_op_npu.cc @@ -62,7 +62,7 @@ class PnormNPUKernel : public framework::OpKernel { {"keep_dims", keepdim}}); runner.Run(stream); } else { - Tensor tmp_x; + phi::DenseTensor tmp_x; tmp_x.mutable_data(xdim, ctx.GetPlace()); const auto& power_runner1 = @@ -93,7 +93,6 @@ template class PnormGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = phi::DenseTensor; auto* x = ctx.Input("X"); auto* y = ctx.Input("Out"); auto* dy = ctx.Input(framework::GradVarName("Out")); @@ -113,8 +112,8 @@ class PnormGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor y_share(y->type()); - Tensor dy_share(dy->type()); + phi::DenseTensor y_share(y->type()); + phi::DenseTensor dy_share(dy->type()); y_share.ShareDataWith(*y); dy_share.ShareDataWith(*dy); auto ydim = xdim; @@ -130,22 +129,22 @@ class PnormGradNPUKernel : public framework::OpKernel { FillNpuTensorWithConstant(dx, static_cast(0)); dx->Resize(xdim); } else if (porder == INFINITY || porder == -INFINITY) { - Tensor x_abs; + phi::DenseTensor x_abs; x_abs.mutable_data(xdim, place); const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); r_abs.Run(stream); - Tensor t_cond; + phi::DenseTensor t_cond; t_cond.mutable_data(xdim, place); const auto& r_equal = NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); r_equal.Run(stream); - Tensor t_zero; + phi::DenseTensor t_zero; t_zero.mutable_data({1}, place); FillNpuTensorWithConstant(&t_zero, static_cast(0)); - Tensor x_sign; + phi::DenseTensor x_sign; x_sign.mutable_data(xdim, place); const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); r_sign.Run(stream); @@ -157,17 +156,17 @@ class PnormGradNPUKernel : public framework::OpKernel { NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); r_sel.Run(stream); } else { - Tensor x_abs; + phi::DenseTensor x_abs; x_abs.mutable_data(xdim, place); const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); r_abs.Run(stream); - Tensor x_sign; + phi::DenseTensor x_sign; x_sign.mutable_data(xdim, place); const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); r_sign.Run(stream); - Tensor y_pow; + phi::DenseTensor y_pow; y_pow.mutable_data(ydim, place); if (porder >= 1) { const auto& r_pow1 = NpuOpRunner( diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 7694e0edbf9f9..497dc51e39f0d 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static inline std::vector GetPaddings( const framework::ExecutionContext& context) { std::vector paddings(6); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index 425defc9792c7..27443b8b425d7 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PadNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index e5a066fdc6539..01095b6d429b4 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class PartialConcatOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index ef52bbad525a4..f4acf68dcbc70 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -23,8 +23,6 @@ namespace operators { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) -using Tensor = phi::DenseTensor; - template __global__ void ConcatPartialCUDAKernel(T **in, T *out, @@ -72,7 +70,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_vars = ctx.MultiInput("X"); - Tensor *out = ctx.Output("Out"); + phi::DenseTensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ(in_vars[0] != nullptr, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index c0c6b2f607526..050752f23888b 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index aa2f30aaafc2c..6473f8d603789 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class PartialSumOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index c92e9618bfce0..093e0032b3cb9 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -23,8 +23,6 @@ namespace operators { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) -using Tensor = phi::DenseTensor; - template __global__ void SumArrayPartialCUDAKernel(T **in, T *out, @@ -77,7 +75,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_vars = ctx.MultiInput("X"); - Tensor *out = ctx.Output("Out"); + phi::DenseTensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( in_vars[0] != nullptr, @@ -150,7 +148,7 @@ template class PartialSumGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const Tensor *out_grad = + const phi::DenseTensor *out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index 26f5039e6363f..fa4cc19d5e2c3 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PartialSumKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 25d2ac8ce0d7a..c160dc28bfda4 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -109,12 +109,12 @@ framework::OpKernelType PoolOpGrad::GetKernelTypeForVar( void Pool2dOpMaker::Make() { AddInput( "X", - "(Tensor) The input tensor of pooling operator. " + "(phi::DenseTensor) The input tensor of pooling operator. " "The format of input tensor is NCHW, where N is batch size, C is the " "number of channels, H is the height of the feature, " "and W is the width of the feature."); AddOutput("Out", - "(Tensor) The output tensor of pooling operator. " + "(phi::DenseTensor) The output tensor of pooling operator. " "The format of output tensor is also NCHW, " "where N is batch size, C is the number of channels, " "H is the height of the feature, " @@ -301,14 +301,14 @@ class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { void Pool3dOpMaker::Make() { AddInput("X", - "(Tensor) The input tensor of pooling operator. " + "(phi::DenseTensor) The input tensor of pooling operator. " "The format of input tensor is NCDHW or NDHWC, where N is batch " "size, C is " "the number of channels, and D, H and W is the depth, height and " "width of " "the feature, respectively."); AddOutput("Out", - "(Tensor) The output tensor of pooling operator." + "(phi::DenseTensor) The output tensor of pooling operator." "The format of output tensor is also NCDHW or NDHWC, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index c08b589cbe12e..fd2c0ce15b461 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index e2af30faf36f4..6e422a645fffb 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -46,8 +46,8 @@ class MLUPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); + const phi::DenseTensor *in_x = ctx.Input("X"); + phi::DenseTensor *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); @@ -212,11 +212,11 @@ class MLUPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = + const phi::DenseTensor *in_x = ctx.Input("X"); + const phi::DenseTensor *out = ctx.Input("Out"); + const phi::DenseTensor *out_grad = ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = + phi::DenseTensor *in_x_grad = ctx.Output(framework::GradVarName("X")); in_x_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index 1cc89cda21bc7..745b793f51147 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PositiveNegativePairKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 51aa3dcd39a35..8a2199e0231bf 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index 9b3146c3b8487..ca291187b9cdd 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index b24ded79dd050..d1aa1d37d0479 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -17,8 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaximumNumBlocks = 4096; diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 5eead81365053..a24b234a05da7 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -28,7 +28,6 @@ extern "C" { namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; class PyramidHashOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index f7f111299c73d..afe45894d5c1a 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -29,8 +29,6 @@ static inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } -using Tensor = phi::DenseTensor; - template __global__ void random_routing_kernel(int64_t* data, const int64_t length, diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 4c740c5985ade..80bd022aff340 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class RankAttentionOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc index ce06d1b1089a5..41de1f6b1300a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ReduceAnyNPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d623bf23534a0..aec1640181bcc 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -33,8 +33,6 @@ limitations under the License. 
 */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-using Tensor = phi::DenseTensor;
-
 USE_OP_ITSELF(reduce_any);
 USE_OP_DEVICE_KERNEL(reduce_any, NPU);
 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
index a23931c0aa246..ca19b9e6e52da 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
@@ -96,9 +96,10 @@ template <typename T>
 class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Input<Tensor>("Out");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x = context.Input<phi::DenseTensor>("X");
+    auto* out = context.Input<phi::DenseTensor>("Out");
+    auto* out_grad =
+        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
     auto reduce_dims = context.Attr<std::vector<int>>("dim");
     bool reduce_all = context.Attr<bool>("reduce_all");
     int in_dtype = context.Attr<int>("in_dtype");
@@ -108,7 +109,8 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
         true,
         platform::errors::InvalidArgument(
             "MLU only support in_dtype == -1 in reduce_max_grad op."));
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* x_grad =
+        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     x_grad->mutable_data<T>(context.GetPlace());
 
     auto place = context.GetPlace();
@@ -122,7 +124,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
       }
     }
 
-    Tensor tmp_out, tmp_out_grad;
+    phi::DenseTensor tmp_out, tmp_out_grad;
     auto tmp_out_dims_vec = x_dims_vec;
     for (auto d : reduce_dims) {
       if (d < 0) {
@@ -136,7 +138,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
     tmp_out_grad.ShareDataWith(*out_grad);
     tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
 
-    Tensor transformed_out(x->type());
+    phi::DenseTensor transformed_out(x->type());
     transformed_out.Resize(phi::make_ddim(x_dims_vec));
     transformed_out.mutable_data<T>(place);
 
@@ -149,7 +151,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
         transformed_out_desc.get(),
         GetBasePtr(&transformed_out));
 
-    Tensor transformed_out_grad(x->type());
+    phi::DenseTensor transformed_out_grad(x->type());
     transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
     transformed_out_grad.mutable_data<T>(place);
     MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
@@ -162,7 +164,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
         GetBasePtr(&transformed_out_grad));
 
     // compare
-    Tensor equal_cond;
+    phi::DenseTensor equal_cond;
     equal_cond.mutable_data<bool>(x_grad->dims(), place);
 
     MLUCnnlTensorDesc x_desc(*x);
@@ -178,7 +180,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
         GetBasePtr(&equal_cond));
 
     // select
-    Tensor t_zero;
+    phi::DenseTensor t_zero;
     t_zero.mutable_data<T>(x_grad->dims(), place);
     FillMLUTensorWithHostValue(context, static_cast<T>(0), &t_zero);
     t_zero.Resize(x_grad->dims());
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
index 172786963e4c9..1ade0c6746918 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
@@ -18,7 +18,6 @@ limitations under the License. 
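ReduceMaxGradMLUKernel above broadcasts the reduced output back to the input shape, tests for equality, and routes the upstream gradient only to positions holding the maximum. A one-dimensional CPU sketch of that compare-and-select logic (plain loops, not the MLU descriptors):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// equal_cond ? dy : 0 — on ties, every argmax position receives gradient,
// exactly what the Equal + SelectV2-with-zero composition computes.
std::vector<float> ReduceMaxGrad(const std::vector<float>& x, float dy) {
  float m = *std::max_element(x.begin(), x.end());
  std::vector<float> dx(x.size(), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i)
    if (x[i] == m) dx[i] = dy;
  return dx;
}

int main() {
  auto dx = ReduceMaxGrad({1.0f, 5.0f, 3.0f}, /*dy=*/1.0f);
  std::printf("%g %g %g\n", dx[0], dx[1], dx[2]);  // 0 1 0
  return 0;
}
```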
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
index 172786963e4c9..1ade0c6746918 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 template <typename DeviceContext, typename T>
 class ReduceMaxNPUKernel : public framework::OpKernel<T> {
  public:
@@ -77,8 +76,8 @@ class ReduceMaxNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>();
     if (framework::TransToProtoVarType(x->dtype()) ==
         framework::proto::VarType::INT64) {
-      auto op_func = [](const std::vector<Tensor>& inputs,
-                        const std::vector<Tensor>& outputs,
+      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
+                        const std::vector<phi::DenseTensor>& outputs,
                         const NPUAttributeMap& attrs,
                         const platform::NPUDeviceContext& dev_ctx) {
         const auto& runner =
@@ -147,7 +146,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
       }
     }
 
-    Tensor tmp_out, tmp_out_grad;
+    phi::DenseTensor tmp_out, tmp_out_grad;
     auto tmp_out_dims_vec = x_dims_vec;
     for (auto d : reduce_dims) {
       if (d < 0) {
@@ -161,7 +160,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
     tmp_out_grad.ShareDataWith(*out_grad);
     tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
 
-    Tensor transformed_out(x->type());
+    phi::DenseTensor transformed_out(x->type());
     transformed_out.Resize(phi::make_ddim(x_dims_vec));
     transformed_out.mutable_data<T>(place);
     NpuOpRunner r_brd_out;
@@ -170,7 +169,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
         .AddInput(std::move(x_dims_vec))
         .AddOutput(transformed_out)
         .Run(stream);
-    Tensor transformed_out_grad(x->type());
+    phi::DenseTensor transformed_out_grad(x->type());
     transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
     transformed_out_grad.mutable_data<T>(place);
     NpuOpRunner r_brd_out_grad;
@@ -181,14 +180,14 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
         .Run(stream);
 
     // compare
-    Tensor equal_cond;
+    phi::DenseTensor equal_cond;
     equal_cond.mutable_data<bool>(x_grad->dims(), place);
     const auto& r_equal =
         NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {});
     r_equal.Run(stream);
 
     // select
-    Tensor t_zero;
+    phi::DenseTensor t_zero;
     t_zero.mutable_data<T>(x_grad->dims(), place);
     FillNpuTensorWithConstant<T>(&t_zero, static_cast<T>(0));
     t_zero.Resize(x_grad->dims());
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
index b73bde6275347..d1658c24733c9 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
@@ -54,7 +54,7 @@ class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
       reduce_numel *= input_dims[d];
     }
 
-    Tensor tmp_output_grad(output_grad->dtype());
+    phi::DenseTensor tmp_output_grad(output_grad->dtype());
     auto tmp_output_dims = input_dims;
     for (auto d : reduce_dims) {
       tmp_output_dims[d] = 1;
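For reduce_mean_grad, the two device kernels (MLU above, NPU below) encode the same identity: every input element receives `out_grad / reduce_numel`, where `reduce_numel` is the product of the reduced dimensions. The NPU version realizes this by filling a one-element tensor with `1 / reduce_numel` and multiplying it into the broadcast `out_grad`. A CPU sketch of the identity (an assumed simplification over a flat array, not PR code):

    #include <cstddef>
    #include <vector>

    // Gradient of out = mean(x) over all n elements of x.
    std::vector<float> ReduceMeanGradRef(std::size_t n, float out_grad) {
      // Each element contributed with weight 1/n, so each receives
      // out_grad * (1/n) -- the scalar the NPU kernel fills and multiplies.
      return std::vector<float>(n, out_grad / static_cast<float>(n));
    }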
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
index feca58ce19861..35273df44d1e2 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
@@ -81,7 +81,7 @@ class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
       reduce_numel *= input_dims[d];
     }
 
-    Tensor tensor_value(input_grad->dtype());
+    phi::DenseTensor tensor_value(input_grad->dtype());
     tensor_value.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(
         &tensor_value, static_cast<T>(1.0f / static_cast<float>(reduce_numel)));
@@ -96,8 +96,8 @@ class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
         .AddOutput(*input_grad)
         .Run(stream);
 
-    Tensor transformed_input_grad, transformed_out_grad;
-    Tensor tmp_output_grad;
+    phi::DenseTensor transformed_input_grad, transformed_out_grad;
+    phi::DenseTensor tmp_output_grad;
     auto tmp_output_dims = input_dims;
     for (auto d : reduce_dims) {
       tmp_output_dims[d] = 1;
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
index 19efb2e6bfb4c..e7401d7917763 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 template <typename DeviceContext, typename T>
 class ReduceMinNPUKernel : public framework::OpKernel<T> {
  public:
@@ -76,8 +75,8 @@ class ReduceMinNPUKernel : public framework::OpKernel<T> {
     const auto& dev_ctx =
         ctx.template device_context<paddle::platform::NPUDeviceContext>();
     if (x->dtype() == experimental::DataType::INT64) {
-      auto op_func = [](const std::vector<Tensor>& inputs,
-                        const std::vector<Tensor>& outputs,
+      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
+                        const std::vector<phi::DenseTensor>& outputs,
                         const NPUAttributeMap& attrs,
                         const platform::NPUDeviceContext& dev_ctx) {
         const auto& runner =
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 027a787cbf50b..0cc7bf2898f86 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -48,7 +48,6 @@ namespace operators {
                           keep_dim);                                \
   }
 
-using Tensor = phi::DenseTensor;
 using DDim = framework::DDim;
 
 inline void GetShuffledDim(const DDim& src_dims,
@@ -137,7 +136,7 @@ void HandleLargeDim(const framework::ExecutionContext& context,
                     const std::vector<int>& dims,
                     bool keep_dim) {
   // shuffle the reduced dim to the end
-  Tensor shuffled_input;
+  phi::DenseTensor shuffled_input;
   GetShuffledInput(context, input, &shuffled_input, dims);
 
   // transpose to 2D tensor whose shape is {unreduced, reduced}.
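`HandleLargeDim`, touched in the hunk above and in the grad hunk below, reduces high-rank tensors by first permuting the reduced axes to the end and then reshaping to a 2-D `{unreduced, reduced}` view. A sketch of that axis reordering (an assumed simplification of what `GetShuffledDim` computes, not the PR's exact helper):

    #include <algorithm>
    #include <vector>

    // Axis order that moves the reduced axes to the back, e.g. for rank 4
    // with reduce_dims {1, 3} it yields {0, 2, 1, 3}.
    std::vector<int> ShuffledAxisOrder(int rank, std::vector<int> reduce_dims) {
      std::sort(reduce_dims.begin(), reduce_dims.end());
      std::vector<int> order;
      for (int i = 0; i < rank; ++i) {  // kept axes first, in order
        if (!std::binary_search(reduce_dims.begin(), reduce_dims.end(), i)) {
          order.push_back(i);
        }
      }
      order.insert(order.end(), reduce_dims.begin(), reduce_dims.end());
      return order;
    }

After transposing with this order, the reduction collapses onto a single trailing dimension, which is what the `{unreduced, reduced}` resize above exploits.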
@@ -168,7 +167,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, DDim out_dim(out->dims()); DDim x_dim(x->dims()); // transpose and reshape X - Tensor shuffled_x; + phi::DenseTensor shuffled_x; GetShuffledInput(context, x, &shuffled_x, dims); DDim shuffled_dim = shuffled_x.dims(); shuffled_x.Resize({unreduced, reduced}); @@ -185,7 +184,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, // transpose dX std::vector origin_axis(x_dim.size()); GetOriginDimFromShuffled(x_dim, dims, &origin_axis); - Tensor dx_tmp; + phi::DenseTensor dx_tmp; framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); dx_tmp.Resize(shuffled_dim); dx->Resize(x_dim); @@ -453,7 +452,7 @@ class ReduceGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { int in_dtype = context.Attr("in_dtype"); if (in_dtype >= 0) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto* pre_input = context.Input(framework::GradVarName("Out")); auto in_kernel_type = framework::OpKernelType( diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 39a0dc044f272..3176e489f89b3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -21,7 +21,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template class ReduceProdNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 69c8935dafd6b..7b1b6bc831f0e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -80,7 +80,7 @@ class ReduceSumGradKernel : public framework::OpKernel { int in_dtype = context.Attr("out_dtype"); if (in_dtype >= 0) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto* pre_input = context.Input(framework::GradVarName("Out")); auto in_kernel_type = framework::OpKernelType( diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc index 4ecf6e907b4cb..130c617f873ba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc @@ -52,7 +52,7 @@ class ReduceSumGradMLUKernel : public framework::OpKernel { } } - Tensor tmp_out(out_grad->dtype()); + phi::DenseTensor tmp_out(out_grad->dtype()); auto tmp_output_dims = in_dims; for (auto d : reduce_dims) { tmp_output_dims[d] = 1; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index 6ba8a9c1373a1..9588aa54f3877 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -129,7 +129,7 @@ class ReduceSumGradNPUKernel : public framework::OpKernel { out_dims = UnsqueezeKernel::GetOutputShape( dims, out_grad->dims()); - Tensor out_grad_tmp(out_grad->type()); + phi::DenseTensor out_grad_tmp(out_grad->type()); out_grad_tmp.Resize(out_dims); out_grad_tmp.mutable_data(ctx.GetPlace()); framework::TensorCopy( diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 161f230bacbe4..42e6929508bff 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -41,8 +41,6 @@ class OpBase; namespace paddle { 
namespace operators { -using Tensor = phi::DenseTensor; - class ReshapeOp : public framework::OperatorWithKernel { public: ReshapeOp(const std::string &type, @@ -272,7 +270,7 @@ class ReshapeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor") { return expected_kernel_type; @@ -638,7 +636,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor") { return expected_kernel_type; @@ -666,7 +664,7 @@ class Reshape2DoubleGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/rnn_op_mlu.cc b/paddle/fluid/operators/rnn_op_mlu.cc index cf4e255668232..1773c526b4635 100644 --- a/paddle/fluid/operators/rnn_op_mlu.cc +++ b/paddle/fluid/operators/rnn_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; using TensorList = std::vector; template @@ -459,7 +458,7 @@ class RNNMLUGradKernel : public framework::OpKernel { input_grad->mutable_data(input->dims(), ctx.GetPlace()); FillMLUTensorWithHostValue(ctx, static_cast(0.0), input_grad); - Tensor a, b; + phi::DenseTensor a, b; phi::DenseTensor* dynamic_grad_pre_h = &a; phi::DenseTensor* dynamic_grad_pre_c = &b; if (init_h_grad) { diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 6a7999c56557f..4407fbf1a8c96 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/roi_align_op_mlu.cc b/paddle/fluid/operators/roi_align_op_mlu.cc index 5bde4dd7b6686..de0a8be93452d 100644 --- a/paddle/fluid/operators/roi_align_op_mlu.cc +++ b/paddle/fluid/operators/roi_align_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ROIAlignOpMLUKernel : public framework::OpKernel { public: @@ -76,7 +74,7 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(lod.empty(), false, platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " + "Input(ROIs) phi::DenseTensor of ROIAlignOp " "does not contain LoD information.")); auto rois_lod = lod.back(); rois_batch_size = rois_lod.size() - 1; @@ -110,7 +108,7 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { } // only support float32 for now - Tensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); rois_cpu.Resize({rois_num, 4}); rois_cpu.mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); @@ -119,8 +117,8 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { T* rois_cpu_ptr = rois_cpu.mutable_data(platform::CPUPlace()); // boxes; [batch_idx, x1, y1, x2, y2] - Tensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); - Tensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); boxes_cpu.Resize({rois_num, 5}); boxes_mlu.Resize({rois_num, 5}); T* boxes_cpu_ptr = boxes_cpu.mutable_data(platform::CPUPlace()); @@ -139,8 +137,8 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; - Tensor input_nhwc(in->type()); - Tensor output_nhwc(out->type()); + phi::DenseTensor input_nhwc(in->type()); + phi::DenseTensor output_nhwc(out->type()); TransposeFromMLUTensor( ctx, perm_to_nhwc, in, &input_nhwc, true /*need_reshape_or_alloc*/); auto output_dims = out->dims(); @@ -221,7 +219,7 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { } } - Tensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); rois_cpu.Resize({rois_num, 4}); rois_cpu.mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); @@ -230,8 +228,8 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { T* rois_cpu_ptr = rois_cpu.mutable_data(platform::CPUPlace()); // boxes; [batch_idx, x1, y1, x2, y2] - Tensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); - Tensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); boxes_cpu.Resize({rois_num, 5}); boxes_mlu.Resize({rois_num, 5}); T* boxes_cpu_ptr = boxes_cpu.mutable_data(platform::CPUPlace()); @@ -250,8 +248,8 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; - Tensor grads_nhwc(out_grad->type()); - Tensor grads_image_nhwc(in_grad->type()); + phi::DenseTensor grads_nhwc(out_grad->type()); + phi::DenseTensor grads_image_nhwc(in_grad->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, out_grad, diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index 72578ca0177c0..06be3f35b3f23 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ROIAlignNPUKernel : public framework::OpKernel { @@ -54,7 +53,7 @@ class ROIAlignNPUKernel : public framework::OpKernel { int dtype = static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}}; - Tensor ROIsNum_fp(ROIs->dtype()); + phi::DenseTensor ROIsNum_fp(ROIs->dtype()); ROIsNum_fp.Resize(phi::make_ddim({ROIs->dims()[0], 1})); ROIsNum_fp.mutable_data(ctx.GetPlace()); @@ -68,7 +67,7 @@ class ROIAlignNPUKernel : public framework::OpKernel { x_list.push_back(*ROIs); auto axis = 1; // output of concate - Tensor ROIs_N5(ROIs->dtype()); + phi::DenseTensor ROIs_N5(ROIs->dtype()); ROIs_N5.Resize(phi::make_ddim({ROIs->dims()[0], 5})); ROIs_N5.mutable_data(ctx.GetPlace()); @@ -137,9 +136,9 @@ class ROIAlignNPUGradKernel : public framework::OpKernel { // Cast RoisNum to fp32 tensor auto* RoisNum = ctx.Input("RoisNum"); - Tensor ROIs_N5; + phi::DenseTensor ROIs_N5; ROIs_N5.mutable_data({rois_num, 5}, place); - Tensor ROIsNum_fp; + phi::DenseTensor ROIsNum_fp; ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] int nputype_fp32 = static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); @@ -161,7 +160,7 @@ class ROIAlignNPUGradKernel : public framework::OpKernel { // function #if (CANN_VERSION_CODE < 504000) std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; - Tensor tsr_dlt; + phi::DenseTensor tsr_dlt; tsr_dlt.mutable_data({5}, place); framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); ctx.template device_context().Wait(); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index b2e8a6ae58883..e79975e6254eb 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 8871627b85242..7d61088dd9fd6 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -112,7 +112,6 @@ __global__ void gpu_compute_remove_accidental_hits(const int size, template class SampleLogitsCUDAKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { // get necessary inputs const phi::DenseTensor* logits = context.Input("Logits"); @@ -165,16 +164,17 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { context.Input("CustomizedSamples"); const phi::DenseTensor* customized_probabilities = context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ(customized_samples, - samples, - platform::errors::InvalidArgument( - "CustomizedSamples must be the same Tensor with " - "Samples when use_customized_samples = True")); + PADDLE_ENFORCE_EQ( + customized_samples, + samples, + platform::errors::InvalidArgument( + "CustomizedSamples must be the same phi::DenseTensor with " + "Samples when use_customized_samples = True")); PADDLE_ENFORCE_EQ( customized_probabilities, probabilities, platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same Tensor with " + "CustomizedProbabilities must be the same phi::DenseTensor with " "Probabilities when use_customized_samples = True")); } else { samples->mutable_data(context.GetPlace()); @@ -238,7 +238,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { template class SampleLogitsGradCUDAKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { auto logits_grad = context.Output(framework::GradVarName("Logits")); diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index 584d115d28ff3..fe53a12e5ed71 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -27,7 +27,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -212,7 +211,6 @@ static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, template class SampleLogitsKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_cpu_place(context.GetPlace()), @@ -264,16 +262,17 @@ class SampleLogitsKernel : public framework::OpKernel { context.Input("CustomizedSamples"); const phi::DenseTensor* customized_probabilities = context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ(customized_samples, - samples, - platform::errors::InvalidArgument( - "CustomizedSamples must be the same Tensor with " - "Samples when use_customized_samples = True")); + PADDLE_ENFORCE_EQ( + customized_samples, + samples, + platform::errors::InvalidArgument( + "CustomizedSamples must be the same phi::DenseTensor with " + "Samples when use_customized_samples = True")); PADDLE_ENFORCE_EQ( customized_probabilities, probabilities, platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same Tensor with " + "CustomizedProbabilities must be the same phi::DenseTensor with " "Probabilities when use_customized_samples = True")); } else { samples->mutable_data(context.GetPlace()); @@ -308,7 +307,6 @@ class SampleLogitsKernel : public framework::OpKernel { template class SampleLogitsGradKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { auto logits_grad = context.Output(framework::GradVarName("Logits")); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 6d2d3f4a60047..7e84077fd60ae 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SamplingIdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index 43c0bdcf4043e..e5c4f744db4a5 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SamplingIdKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 41780561144b1..71f78911456f7 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SaveCombineOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/scatter_op_mlu.cc b/paddle/fluid/operators/scatter_op_mlu.cc index a4cb5d7424936..83cbbbd7b9e69 100644 --- a/paddle/fluid/operators/scatter_op_mlu.cc +++ b/paddle/fluid/operators/scatter_op_mlu.cc @@ -42,7 +42,7 @@ class ScatterMLUKernel : public framework::OpKernel { GetBasePtr(indices), mode); } else { - Tensor tensor_zeros(updates->type()); + phi::DenseTensor tensor_zeros(updates->type()); tensor_zeros.mutable_data(updates->dims(), ctx.GetPlace()); MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros); float value = 0.0; diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 6bffd24734055..ded722c7eb794 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ScatterNPUKernel : public framework::OpKernel { public: @@ -49,16 +47,16 @@ class ScatterNPUKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); - auto op_func_update = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func_update = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs); runner.Run(dev_ctx.stream()); }; - auto op_func_add = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func_add = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index 34728c86c56b6..15f87803f5ab8 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -28,7 +28,6 @@ limitations under the License. 
 */
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 using LoD = framework::LoD;
 
 template <typename T>
diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc
index 88a1884ae53e4..93d57aedd8aff 100644
--- a/paddle/fluid/operators/seed_op.cc
+++ b/paddle/fluid/operators/seed_op.cc
@@ -17,7 +17,6 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 class SeedOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h
index a1c3484b7a728..c3cbc16fb4884 100644
--- a/paddle/fluid/operators/seed_op.h
+++ b/paddle/fluid/operators/seed_op.h
@@ -19,7 +19,6 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = phi::DenseTensor;
 
 static int get_seed(const framework::ExecutionContext& context) {
   int user_seed = context.Attr<int>("seed");
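The set_value diff that follows retypes the `tensor` parameter of `GetKernelTypeForVar` in several ops. The override itself is unchanged in spirit: index-list inputs such as `StartsTensorList` must stay on their original place and dtype, so the op returns `expected_kernel_type` untouched for them and re-derives the kernel type from the tensor only for ordinary inputs. A condensed sketch of that pattern (assembled from the hunks in this patch; the fallback line is an assumption based on the usual Paddle idiom, not a verbatim copy):

    framework::OpKernelType GetKernelTypeForVar(
        const std::string &var_name,
        const phi::DenseTensor &tensor,
        const framework::OpKernelType &expected_kernel_type) const override {
      if (var_name == "StartsTensorList" || var_name == "EndsTensorList" ||
          var_name == "StepsTensorList") {
        return expected_kernel_type;  // never transform small index lists
      }
      return framework::OpKernelType(
          expected_kernel_type.data_type_, tensor.place(), tensor.layout());
    }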
The " + "output is the " + "same phi::DenseTensor as input"); // Attr AddAttr("dtype", "data type of input.") @@ -142,7 +145,7 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("shape", "(vector) Shape of values.") .SetDefault({}); AddComment(R"DOC(SetValue operator. -Assignment to a Tensor in static mode. +Assignment to a phi::DenseTensor in static mode. )DOC"); } }; @@ -220,7 +223,7 @@ class SetValueGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensorList" || var_name == "EndsTensorList" || var_name == "StepsTensorList") { diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 7ef766020251b..d4ed1ce586e8f 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -31,7 +31,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; inline std::string GetValueName(framework::proto::VarType::Type data_type) { diff --git a/paddle/fluid/operators/set_value_op_mlu.cc b/paddle/fluid/operators/set_value_op_mlu.cc index 1b950a6da6084..06369b83bbab9 100644 --- a/paddle/fluid/operators/set_value_op_mlu.cc +++ b/paddle/fluid/operators/set_value_op_mlu.cc @@ -102,7 +102,7 @@ class SetValueMLUKernel : public framework::OpKernel { ends_indices[axis_index] = static_cast(ends[i]); strides_indices[axis_index] = static_cast(steps[i]); } - Tensor value_t(in->type()); + phi::DenseTensor value_t(in->type()); if (value_tensor != nullptr) { value_t.ShareDataWith(*value_tensor); } else { @@ -116,7 +116,7 @@ class SetValueMLUKernel : public framework::OpKernel { value_t.Resize(value_dims); } - Tensor value_temp(in->type()); + phi::DenseTensor value_temp(in->type()); if (slice_dims_for_assign == value_t.dims()) { value_temp.ShareDataWith(value_t); } else { @@ -133,7 +133,7 @@ class SetValueMLUKernel : public framework::OpKernel { int64_t input_numel = phi::product(in_dims); int64_t value_numel = phi::product(value_temp.dims()); - Tensor in_temp, out_temp, val_temp, index_out; + phi::DenseTensor in_temp, out_temp, val_temp, index_out; int64_t stride_step = phi::product(in_dims); std::vector index_indices(stride_step); std::iota(index_indices.begin(), index_indices.end(), 0); @@ -185,7 +185,7 @@ class SetValueMLUKernel : public framework::OpKernel { phi::product(slice_dims_for_assign), platform::errors::InvalidArgument( "OP(set_value) error index indices and value update not match ")); - Tensor index_final; + phi::DenseTensor index_final; index_final.ShareDataWith(index_out); int64_t indices_numel = phi::product(index_dims); auto new_index_dims = phi::make_ddim({indices_numel}); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 7526b13311b05..9dde6c6fbb3c0 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -132,7 +132,7 @@ class SetValueNPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "OP(set_value) error index indices and value update not match ")); - Tensor value_t(in->type()); + phi::DenseTensor value_t(in->type()); if (value_tensor != nullptr) { value_t.ShareDataWith(*value_tensor); } else { @@ -148,7 +148,7 @@ class SetValueNPUKernel : public framework::OpKernel { auto stream = ctx.template 
device_context().stream(); - Tensor value_temp(in->type()); + phi::DenseTensor value_temp(in->type()); if (slice_dims_for_assign == value_t.dims()) { value_temp.ShareDataWith(value_t); } else { @@ -165,7 +165,7 @@ class SetValueNPUKernel : public framework::OpKernel { int64_t input_numel = phi::product(in_dims); int64_t index_numel = index_indices.size(); - Tensor in_temp, out_temp, val_temp; + phi::DenseTensor in_temp, out_temp, val_temp; in_temp.ShareDataWith(*in); out_temp.ShareDataWith(*out); val_temp.ShareDataWith(value_temp); diff --git a/paddle/fluid/operators/shape_op_mlu.cc b/paddle/fluid/operators/shape_op_mlu.cc index bd51b49851840..f69a202819935 100644 --- a/paddle/fluid/operators/shape_op_mlu.cc +++ b/paddle/fluid/operators/shape_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template @@ -39,7 +38,7 @@ class ShapeMLUKernel : public framework::OpKernel { out_t->mutable_data(ctx.GetPlace()); // shape op cpu - Tensor shape_on_cpu( + phi::DenseTensor shape_on_cpu( framework::TransToPhiDataType(framework::proto::VarType::INT32)); shape_on_cpu.Resize({in_dims.size()}); auto cpu_data = shape_on_cpu.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 60a0162818c9d..f66ae5dc750fe 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ShapeNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index 3cc025ca9ed64..488615f66325e 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ShardIndexNPUKernel : public framework::OpKernel { public: @@ -67,17 +66,17 @@ class ShardIndexNPUKernel : public framework::OpKernel { out->set_lod(in->lod()); out->mutable_data(place); - Tensor tmp(in->type()); + phi::DenseTensor tmp(in->type()); tmp.mutable_data(framework::DDim({1}), place); FillNpuTensorWithConstant(&tmp, shard_size); - Tensor condition(experimental::DataType::BOOL); + phi::DenseTensor condition(experimental::DataType::BOOL); condition.mutable_data(in->dims(), place); - Tensor tmp2(in->type()); + phi::DenseTensor tmp2(in->type()); tmp2.mutable_data(in->dims(), place); - Tensor tmp3(in->type()); + phi::DenseTensor tmp3(in->type()); tmp3.mutable_data(in->dims(), place); auto stream = @@ -103,7 +102,7 @@ class ShardIndexNPUKernel : public framework::OpKernel { runner2.SetType("Equal"); runner2.Run(stream); - Tensor tmp4(in->type()); + phi::DenseTensor tmp4(in->type()); tmp4.mutable_data(in->dims(), place); FillNpuTensorWithConstant(&tmp4, ignore_value); tmp4.Resize(in->dims()); diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h index 2f1fbee16e3d9..4bc1289bf468c 100644 --- a/paddle/fluid/operators/shuffle_batch_op.h +++ b/paddle/fluid/operators/shuffle_batch_op.h @@ -32,7 +32,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using Vector = framework::Vector; diff --git a/paddle/fluid/operators/shuffle_channel_op.cu 
b/paddle/fluid/operators/shuffle_channel_op.cu index 4869a4c6c5e22..6aa59becb1d6f 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaximumNumBlocks = 4096; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc index d77724281327c..431a36d414c99 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index ea3f119a05a91..df4270b6f23bc 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h index 8c055c2323c84..e706da9e01419 100644 --- a/paddle/fluid/operators/similarity_focus_op.h +++ b/paddle/fluid/operators/similarity_focus_op.h @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class SimilarityFocusKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index d6f48d334759d..a418719907872 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -181,7 +179,7 @@ class SliceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor") { return expected_kernel_type; @@ -349,7 +347,7 @@ class SliceOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc index 1935e2d0c9b14..771fca6a5ef18 100644 --- a/paddle/fluid/operators/slice_op_mlu.cc +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SliceMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 13ad263575698..59d6e2c2e42c1 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, @@ -199,7 +198,7 @@ class SliceGradNPUKernel : public framework::OpKernel { paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); } - Tensor tmp_dout; + phi::DenseTensor tmp_dout; tmp_dout.ShareDataWith(*dout); auto out_dims = dout->dims(); auto decrease_axis = ctx.Attr>("decrease_axis"); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h index 3cc565ef91203..e11f629d86dda 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.h +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -79,7 +78,7 @@ class SmoothL1LossKernel : public framework::OpKernel { } auto in_counts = in0->numel(); - Tensor ptensor_errors; + phi::DenseTensor ptensor_errors; ptensor_errors.mutable_data({static_cast(in_counts)}, context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); @@ -138,7 +137,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { auto mat_dims = phi::make_ddim({static_cast(in_dims[0]), static_cast(cols)}); - Tensor ptensor_diff; + phi::DenseTensor ptensor_diff; ptensor_diff.mutable_data({static_cast(counts)}, context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); @@ -147,7 +146,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { SmoothL1LossBackward(sigma2)); // compute weights - Tensor ptensor_weights; + phi::DenseTensor ptensor_weights; ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc index 1a4fb14bbb0b6..811a016c6515c 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc @@ -42,12 +42,12 @@ class SmoothL1LossNPUKernel : public framework::OpKernel { const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); runner1.Run(stream); - Tensor no_reduce_loss(in_x->dtype()); + phi::DenseTensor no_reduce_loss(in_x->dtype()); no_reduce_loss.Resize(in_x->dims()); no_reduce_loss.mutable_data(context.GetPlace()); // multiply inside weight before get the loss if (has_weight) { - Tensor tmp_diff(out_diff->dtype()); + phi::DenseTensor tmp_diff(out_diff->dtype()); tmp_diff.Resize(out_diff->dims()); tmp_diff.mutable_data(context.GetPlace()); const auto& runner2 = @@ -59,11 +59,11 @@ class SmoothL1LossNPUKernel : public framework::OpKernel { context.template device_context(), out_diff); - Tensor tmp_x(in_x->dtype()); + phi::DenseTensor tmp_x(in_x->dtype()); tmp_x.Resize(in_x->dims()); tmp_x.mutable_data(context.GetPlace()); - Tensor tmp_y(in_y->dtype()); + phi::DenseTensor tmp_y(in_y->dtype()); tmp_y.Resize(in_y->dims()); tmp_y.mutable_data(context.GetPlace()); @@ -90,7 +90,7 @@ class SmoothL1LossNPUKernel : public 
framework::OpKernel { // multiply outside weight and loss // reduceSum because the output'shape must be [B,1] if (has_weight) { - Tensor tmp_loss(no_reduce_loss.dtype()); + phi::DenseTensor tmp_loss(no_reduce_loss.dtype()); tmp_loss.Resize(no_reduce_loss.dims()); tmp_loss.mutable_data(context.GetPlace()); const auto& runner4 = @@ -134,13 +134,13 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { .stream(); // diff == in_x - in_y == diff - 0 - Tensor tmp_zero(diff->dtype()); + phi::DenseTensor tmp_zero(diff->dtype()); tmp_zero.Resize(diff->dims()); tmp_zero.mutable_data(context.GetPlace()); const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); runner_zero.Run(stream); - Tensor grad(diff->dtype()); + phi::DenseTensor grad(diff->dtype()); grad.Resize(diff->dims()); grad.mutable_data(context.GetPlace()); // broadcast og(output_grad) to adapt to the npu interface @@ -151,7 +151,7 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { {{"shape", phi::vectorize(diff->dims())}}); runner_broad.Run(stream); - Tensor gradient(diff->dtype()); + phi::DenseTensor gradient(diff->dtype()); gradient.Resize(diff->dims()); gradient.mutable_data(context.GetPlace()); // diff == diff - 0 == in_x - in_y @@ -163,14 +163,14 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { // mul weight and gradient if (has_weight) { - Tensor weight(inside_weight->dtype()); + phi::DenseTensor weight(inside_weight->dtype()); weight.Resize(inside_weight->dims()); weight.mutable_data(context.GetPlace()); const auto& runner_weight = NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); runner_weight.Run(stream); - Tensor tmp_grad(gradient.dtype()); + phi::DenseTensor tmp_grad(gradient.dtype()); tmp_grad.Resize(gradient.dims()); tmp_grad.mutable_data(context.GetPlace()); const auto& runner_weight_grad = @@ -196,7 +196,7 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { // outy_grad = - gradient if (outy_grad) { outy_grad->mutable_data(context.GetPlace()); - Tensor coeff(experimental::DataType::FLOAT32); + phi::DenseTensor coeff(experimental::DataType::FLOAT32); coeff.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&coeff, -1); const auto& runner_y_grad = diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 91333b3393000..87d788b478367 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index d42f993f46219..6a51198e75460 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { public: @@ -61,7 +59,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { backprop->mutable_data(ctx.GetPlace()); softmax->mutable_data(ctx.GetPlace()); - Tensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; + phi::DenseTensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; logits_2d.ShareDataWith(*logits).Resize({n, d}); labels_1d.ShareDataWith(*labels).Resize({n}); loss_1d.ShareDataWith(*loss).Resize({n}); @@ -110,7 +108,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); - Tensor logits_grad_2d, loss_grad_1d, backprop_2d; + phi::DenseTensor logits_grad_2d, loss_grad_1d, backprop_2d; logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index 6cc8d0f79be4e..0d4af9c0ce94a 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SpaceToDepthOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index b03a0b6c84e71..c0ad3c9a57823 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -203,7 +203,6 @@ __global__ void BlockSparseSoftmaxBackward(T* dst, } } -using Tensor = phi::DenseTensor; /* input: sparse C in CSR format (num_rows,num_rows) output: sparse C after softmax operation @@ -641,7 +640,7 @@ void DotDsd(const phi::GPUContext& ctx, platform::dynload::cusparseDestroy(handle); } -std::vector GetSplitTensor(phi::DenseTensor* input) { +std::vector GetSplitTensor(phi::DenseTensor* input) { auto dims = input->dims(); int batch_size = dims[0]; int num_heads = dims[1]; @@ -687,14 +686,16 @@ class SparseAttentionCUDAKernel : public framework::OpKernel { int M = query_dims[2]; int N = query_dims[3]; - std::vector query_lists = GetSplitTensor(&query); - std::vector key_lists = GetSplitTensor(&key); - std::vector value_lists = GetSplitTensor(&value); - std::vector offset_lists = GetSplitTensor(&offset); - std::vector columns_lists = GetSplitTensor(&columns); - std::vector result_sdd_lists = GetSplitTensor(&result_sdd); - std::vector result_softmax_lists = GetSplitTensor(&result_softmax); - std::vector output_lists = GetSplitTensor(&output); + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector result_sdd_lists = + GetSplitTensor(&result_sdd); + std::vector result_softmax_lists = + GetSplitTensor(&result_softmax); + std::vector output_lists = GetSplitTensor(&output); const auto& dev_ctx = ctx.cuda_device_context(); const int iter_num = batch_size * num_heads; @@ -802,17 +803,18 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { int M = query_dims[2]; int 
N = query_dims[3]; - std::vector query_lists = GetSplitTensor(&query); - std::vector key_lists = GetSplitTensor(&key); - std::vector value_lists = GetSplitTensor(&value); - std::vector offset_lists = GetSplitTensor(&offset); - std::vector columns_lists = GetSplitTensor(&columns); - std::vector sparse_dot_sdd_lists = GetSplitTensor(&sparse_dot_sdd); - std::vector softmax_lists = GetSplitTensor(&softmax); - std::vector dout_lists = GetSplitTensor(&dout); - std::vector dquery_lists = GetSplitTensor(&dquery); - std::vector dkey_lists = GetSplitTensor(&dkey); - std::vector dvalue_lists = GetSplitTensor(&dvalue); + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector sparse_dot_sdd_lists = + GetSplitTensor(&sparse_dot_sdd); + std::vector softmax_lists = GetSplitTensor(&softmax); + std::vector dout_lists = GetSplitTensor(&dout); + std::vector dquery_lists = GetSplitTensor(&dquery); + std::vector dkey_lists = GetSplitTensor(&dkey); + std::vector dvalue_lists = GetSplitTensor(&dvalue); const int iter_num = batch_size * num_heads; const auto& dev_ctx = ctx.cuda_device_context(); @@ -831,7 +833,7 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { // dSoftmax = dOut * transpose(Value) int nnz_num = columns_lists[i].numel(); - Tensor dsoftmax; + phi::DenseTensor dsoftmax; dsoftmax.Resize({nnz_num}); dsoftmax.mutable_data(ctx.GetPlace()); DotSdd(dev_ctx, @@ -846,7 +848,7 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { true); // dSparseDotSdd = dSoftmax * softmax'(SparseDotSdd) - Tensor dsparse_dot_sdd; + phi::DenseTensor dsparse_dot_sdd; dsparse_dot_sdd.Resize({nnz_num}); dsparse_dot_sdd.mutable_data(ctx.GetPlace()); SparseSoftmaxBackward(dev_ctx, diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc index cda18720e7aba..77928c7efc8da 100644 --- a/paddle/fluid/operators/split_op_mlu.cc +++ b/paddle/fluid/operators/split_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SplitMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc index 2fa8fa2a805eb..966f2ea6849b9 100644 --- a/paddle/fluid/operators/split_op_npu.cc +++ b/paddle/fluid/operators/split_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SplitNPUKernel : public framework::OpKernel { public: @@ -44,7 +42,7 @@ class SplitNPUKernel : public framework::OpKernel { "The SectionsTensorList is not supported on NPU now.")); } - std::vector outputs; + std::vector outputs; for (size_t j = 0; j < outs.size(); ++j) { outs[j]->mutable_data(ctx.GetPlace()); outputs.push_back(*outs[j]); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h index 1698c65fc47ac..f0838c4fad2de 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SquaredL2DistanceKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/squared_l2_norm_op_mlu.cc b/paddle/fluid/operators/squared_l2_norm_op_mlu.cc index fcd83b40875ec..0c558502ddf65 100644 --- a/paddle/fluid/operators/squared_l2_norm_op_mlu.cc +++ b/paddle/fluid/operators/squared_l2_norm_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SquaredL2NormMLUKernel : public framework::OpKernel { public: @@ -82,7 +80,7 @@ class SquaredL2NormGradMLUKernel : public framework::OpKernel { auto place = context.GetPlace(); // broadcast out_grad - Tensor broadcasted_out_grad; + phi::DenseTensor broadcasted_out_grad; broadcasted_out_grad.mutable_data(x_grad->dims(), place); MLUCnnlTensorDesc broadcasted_out_grad_desc(broadcasted_out_grad); MLUCnnlTensorDesc out_grad_desc(*out_grad); @@ -93,7 +91,7 @@ class SquaredL2NormGradMLUKernel : public framework::OpKernel { GetBasePtr(&broadcasted_out_grad)); // mul x - Tensor tmp_x_grad; + phi::DenseTensor tmp_x_grad; tmp_x_grad.mutable_data(x_grad->dims(), place); MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc tmp_x_grad_desc(tmp_x_grad); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc index 25260ed4c1286..0cebf8e59d6a6 100644 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ b/paddle/fluid/operators/squared_l2_norm_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SquaredL2NormNPUKernel : public framework::OpKernel { public: @@ -65,7 +63,7 @@ class SquaredL2NormGradNPUKernel : public framework::OpKernel { .stream(); // broadcast out_grad - Tensor broadcasted_out_grad; + phi::DenseTensor broadcasted_out_grad; broadcasted_out_grad.mutable_data(x_grad->dims(), place); const auto &broadcast_runner = NpuOpRunner("BroadcastToD", @@ -74,13 +72,13 @@ class SquaredL2NormGradNPUKernel : public framework::OpKernel { {{"shape", phi::vectorize(x_grad->dims())}}); broadcast_runner.Run(stream); // mul x - Tensor tmp_x_grad; + phi::DenseTensor tmp_x_grad; tmp_x_grad.mutable_data(x_grad->dims(), place); const auto &mul_x_runner = NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); mul_x_runner.Run(stream); // mul coefficient:2 - Tensor coefficient; + phi::DenseTensor coefficient; coefficient.mutable_data({1}, place); FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); x_grad->mutable_data(place); diff --git a/paddle/fluid/operators/stack_op_mlu.cc b/paddle/fluid/operators/stack_op_mlu.cc index eeac200676f4a..16076a180a54e 100644 --- a/paddle/fluid/operators/stack_op_mlu.cc +++ b/paddle/fluid/operators/stack_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class StackMLUKernel : public framework::OpKernel { public: @@ -31,10 +29,10 @@ class StackMLUKernel : public framework::OpKernel { if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); - PADDLE_ENFORCE_GT( - num, - 0, - platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, + 0, + platform::errors::InvalidArgument( + "number of input phi::DenseTensor <= 0")); std::vector x_descs; std::vector x_raw_descs; diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 3b5c0b1dc0cb6..7919294f60c33 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class StackNPUKernel : public framework::OpKernel { public: @@ -30,10 +28,10 @@ class StackNPUKernel : public framework::OpKernel { if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); - PADDLE_ENFORCE_GT( - num, - 0, - platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, + 0, + platform::errors::InvalidArgument( + "number of input phi::DenseTensor <= 0")); auto stream = ctx.template device_context() @@ -61,10 +59,10 @@ class StackGradNPUKernel : public framework::OpKernel { if (axis < 0) axis += dy->dims().size(); int num = dy->dims()[axis]; - PADDLE_ENFORCE_GT( - num, - 0, - platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, + 0, + platform::errors::InvalidArgument( + "number of input phi::DenseTensor <= 0")); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index 23130f687e305..b0d6091ecd37b 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class StftKernel : public framework::OpKernel { public: @@ -59,7 +57,7 @@ class StftKernel : public framework::OpKernel { std::vector axes = {1}; // Frame - Tensor frames; + phi::DenseTensor frames; framework::DDim frames_dims(out->dims()); frames_dims.at(axes.back()) = n_fft; frames.mutable_data(frames_dims, ctx.GetPlace()); @@ -73,7 +71,7 @@ class StftKernel : public framework::OpKernel { /*is_grad*/ false); // Window - Tensor frames_w; + phi::DenseTensor frames_w; frames_w.mutable_data(frames_dims, ctx.GetPlace()); ElementwiseComputeEx, DeviceContext, T>( ctx, &frames, window, axes.back(), MulFunctor(), &frames_w); @@ -93,7 +91,7 @@ class StftKernel : public framework::OpKernel { framework::DDim onesided_dims(out->dims()); const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; onesided_dims.at(axes.back()) = onesided_axis_size; - Tensor onesided_out; + phi::DenseTensor onesided_out; onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true); phi::funcs::FFTFillConj( @@ -125,12 +123,12 @@ class StftGradKernel : public framework::OpKernel { const int seq_length = dx->dims()[dx_rank - 1]; std::vector axes = {1}; - Tensor d_frames_w; + phi::DenseTensor d_frames_w; framework::DDim d_frames_dims(dy->dims()); d_frames_dims.at(axes.back()) = n_fft; d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); - Tensor complex_d_frames_w; + 
phi::DenseTensor complex_d_frames_w; complex_d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); // dy -> d_frames_w @@ -146,7 +144,7 @@ class StftGradKernel : public framework::OpKernel { fft_c2c_func( dev_ctx, *dy, &complex_d_frames_w, axes, normalization, false); } else { - Tensor full_dy; + phi::DenseTensor full_dy; full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); auto zero_length = static_cast(full_dy.dims().at(axes.back()) - dy->dims().at(axes.back())); @@ -163,7 +161,7 @@ class StftGradKernel : public framework::OpKernel { phi::RealKernel(dev_ctx, complex_d_frames_w, &d_frames_w); // d_frames_w -> d_frames - Tensor d_frames; + phi::DenseTensor d_frames; d_frames.mutable_data(d_frames_dims, ctx.GetPlace()); ElementwiseComputeEx, DeviceContext, T>( ctx, &d_frames_w, window, axes.back(), MulFunctor(), &d_frames); diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index a91b210f2dc7b..c08f214ab58bc 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -26,8 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class StridedSliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -69,7 +67,7 @@ class StridedSliceOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor" || var_name == "StridesTensor") { @@ -174,7 +172,7 @@ class StridedSliceOpGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor" || var_name == "StridesTensor") { diff --git a/paddle/fluid/operators/strided_slice_op_mlu.cc b/paddle/fluid/operators/strided_slice_op_mlu.cc index 6caf1ad5ad15f..21eb47f187b00 100644 --- a/paddle/fluid/operators/strided_slice_op_mlu.cc +++ b/paddle/fluid/operators/strided_slice_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. 
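The stft_op.h forward kernel above proceeds in three steps: frame the signal, multiply each frame by the window, then take a real-to-complex FFT and keep the n_fft / 2 + 1 onesided bins. A rough NumPy sketch of that pipeline; the hop_length name is an assumption for illustration, and this is not the Paddle kernel itself:

    import numpy as np

    def stft_onesided(x, n_fft, hop_length, window):
        # "Frame": slice the 1-D signal into overlapping frames of length n_fft.
        n_frames = 1 + (len(x) - n_fft) // hop_length
        frames = np.stack([x[i * hop_length:i * hop_length + n_fft]
                           for i in range(n_frames)])
        # "Window": elementwise multiply, as in the MulFunctor step above.
        frames_w = frames * window
        # Real-to-complex FFT; rfft keeps n_fft // 2 + 1 bins, matching
        # onesided_axis_size in the kernel.
        return np.fft.rfft(frames_w, axis=-1)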
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using Variable = framework::Variable; using LoDTensorArray = framework::LoDTensorArray; using DDim = framework::DDim; @@ -100,7 +99,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -156,7 +155,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { auto infer_flags = ctx.Attr>("infer_flags"); auto decrease_axis = ctx.Attr>("decrease_axis"); - // vector> + // vector> auto list_new_starts_tensor = ctx.MultiInput("StartsTensorList"); auto list_new_ends_tensor = @@ -164,7 +163,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { auto list_new_strides_tensor = ctx.MultiInput("StridesTensorList"); - // Tensor + // phi::DenseTensor if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { @@ -268,7 +267,7 @@ class StridedSliceGradMLUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index f613dc1054088..23bf6ea689602 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using Variable = framework::Variable; using LoDTensorArray = framework::LoDTensorArray; using DDim = framework::DDim; @@ -34,7 +33,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -87,7 +86,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { auto infer_flags = ctx.Attr>("infer_flags"); auto decrease_axis = ctx.Attr>("decrease_axis"); - // vector> + // vector> auto list_new_ends_tensor = ctx.MultiInput("EndsTensorList"); auto list_new_starts_tensor = @@ -95,7 +94,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { auto list_new_strides_tensor = ctx.MultiInput("StridesTensorList"); - // Tensor + // phi::DenseTensor if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { @@ -157,9 +156,9 @@ class StridedSliceNPUKernel : public framework::OpKernel { strides_indices_vector[axis_index] = strides[axis]; } - Tensor starts_indices_tensor; - Tensor ends_indices_tensor; - Tensor strides_indices_tensor; + phi::DenseTensor starts_indices_tensor; + phi::DenseTensor ends_indices_tensor; + phi::DenseTensor strides_indices_tensor; starts_indices_tensor.mutable_data({D}, place); ends_indices_tensor.mutable_data({D}, place); @@ -221,7 +220,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { runner.Run(stream); if (need_reverse) { - Tensor out_tmp; + phi::DenseTensor out_tmp; out_tmp.mutable_data(out_dims, place); 
paddle::framework::TensorCopy( *out, @@ -229,7 +228,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { ctx.template device_context(), &out_tmp); - Tensor reverse_axis; + phi::DenseTensor reverse_axis; std::vector reverse_axis_vector; for (size_t axis = 0; axis < axes.size(); axis++) { if (reverse_vector[axis] == 1) { @@ -261,7 +260,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { @@ -378,9 +377,9 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { strides_indices_vector[axis_index] = strides[axis]; } - Tensor starts_indices_tensor; - Tensor ends_indices_tensor; - Tensor strides_indices_tensor; + phi::DenseTensor starts_indices_tensor; + phi::DenseTensor ends_indices_tensor; + phi::DenseTensor strides_indices_tensor; starts_indices_tensor.mutable_data({D}, place); ends_indices_tensor.mutable_data({D}, place); @@ -397,7 +396,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { for (int i = 0; i < input_dims.size(); i++) { input_dims_vector.push_back(input_dims[i]); } - Tensor input_dims_tensor; + phi::DenseTensor input_dims_tensor; paddle::framework::TensorFromVector( input_dims_vector, dev_ctx, &input_dims_tensor); @@ -417,7 +416,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { {"shrink_axis_mask", 0}}; if (need_reverse) { - Tensor reverse_axis; + phi::DenseTensor reverse_axis; std::vector reverse_axis_vector; for (size_t axis = 0; axis < axes.size(); axis++) { if (reverse_vector[axis] == 1) { @@ -429,7 +428,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { paddle::framework::TensorFromVector( reverse_axis_vector, dev_ctx, &reverse_axis); - Tensor dout_tmp; + phi::DenseTensor dout_tmp; dout_tmp.mutable_data(dout->dims(), place); const auto& runner_reverse = NpuOpRunner("ReverseV2", {*dout, reverse_axis}, {dout_tmp}); diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc index aad62e9ce2c33..a2f69a394902c 100644 --- a/paddle/fluid/operators/sum_op_mlu.cc +++ b/paddle/fluid/operators/sum_op_mlu.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template @@ -62,7 +61,7 @@ class SumMLUKernel : public framework::OpKernel { } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be Tensor or But got " + "Expected type of Output(out) must be phi::DenseTensor or But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); } diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index 20cc7ec18b8b7..afc489e2ab412 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template @@ -106,7 +105,7 @@ class SumNPUKernel : public framework::OpKernel { } } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be Tensor or " + "Expected type of Output(out) must be phi::DenseTensor or " "LoDTensorArray. 
But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index d6c306ff2a9f3..6358722e94390 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -36,7 +36,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; using InTensors = std::vector; using OutTensors = std::vector; using OpName = std::string; @@ -274,8 +273,8 @@ template struct DeviceIndependenceTensorOperations { // 1. Device indenpendence, for kernel reuse. // 2. Input and output is always tensor type. - // 3. output Tensor is alway allocated - // 4. Basic Tensor operator is supported + // 3. output phi::DenseTensor is alway allocated + // 4. Basic phi::DenseTensor operator is supported // 5. The Reused Operator Kernel should only be considered as // a wrap function using NameInTensorMap = @@ -382,8 +381,8 @@ struct DeviceIndependenceTensorOperations { } // batch_diag for CPU only - Tensor BatchDiag(const phi::DenseTensor& x, int batch) { - Tensor out; + phi::DenseTensor BatchDiag(const phi::DenseTensor& x, int batch) { + phi::DenseTensor out; auto* x_data = x.data>(); auto numel = x.numel(); auto* out_data = out.mutable_data>( @@ -411,7 +410,8 @@ struct DeviceIndependenceTensorOperations { } // a complex number x times a real number y, which is represented as (a+0j) - Tensor RealMulComplex(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor RealMulComplex(const phi::DenseTensor& x, + const phi::DenseTensor& y) { phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); @@ -650,8 +650,8 @@ struct DeviceIndependenceTensorOperations { return CreateOpRunAndReturnTensor("concat", inputs, attrs, out_shape); } - Tensor Conj(const phi::DenseTensor& x) { - Tensor out; + phi::DenseTensor Conj(const phi::DenseTensor& x) { + phi::DenseTensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); auto* x_data = x.data(); auto for_range = GetForRange(x.numel()); @@ -660,8 +660,8 @@ struct DeviceIndependenceTensorOperations { return out; } - Tensor Real(const phi::DenseTensor& x) { - Tensor out; + phi::DenseTensor Real(const phi::DenseTensor& x) { + phi::DenseTensor out; auto numel = x.numel(); auto* out_data = out.mutable_data>( x.dims(), @@ -674,13 +674,13 @@ struct DeviceIndependenceTensorOperations { return out; } - Tensor DiagFill(const int m, - const int n, - const int num_lower_diags, - const int num_upper_diags, - const phi::DenseTensor& scale, - const phi::DenseTensor& input) { - Tensor out; + phi::DenseTensor DiagFill(const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const phi::DenseTensor& scale, + const phi::DenseTensor& input) { + phi::DenseTensor out; auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, input.numel()); DiagAndFillFunctor diag_and_copy_functor( @@ -709,7 +709,7 @@ struct DeviceIndependenceTensorOperations { const std::vector& start, const std::vector& end, phi::DenseTensor* out) { - // Slice by call Eigen Tensor Function `.slice()` + // Slice by call Eigen phi::DenseTensor Function `.slice()` size_t rank = in->dims().size(); PADDLE_ENFORCE_EQ(start.size(), rank, @@ -752,7 +752,7 @@ struct DeviceIndependenceTensorOperations { op_outputs[out_name].emplace_back("tmp_" + out_name); } auto out_var = local_scope.Var("tmp_Out"); // return the Out - // create Out Tensor and 
allocat memory + // create Out phi::DenseTensor and allocat memory out_var->GetMutable()->mutable_data( phi::make_ddim(out_shape), context.GetPlace()); // phi::make_ddim(out_shape) diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc index b1e6bec8a4cad..2d037a7c3ecc1 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -26,7 +26,6 @@ namespace operators { #define NO_USE_CNCL 0 #define GET_LAYOUT_OFFSET 2 -using Tensor = phi::DenseTensor; static std::vector supported_input_layout = { CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC}; @@ -81,8 +80,8 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - Tensor trans_x; - Tensor trans_y; + phi::DenseTensor trans_x; + phi::DenseTensor trans_y; std::vector forward_perm; std::vector backward_perm; std::vector trans_shape; @@ -137,13 +136,13 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { } else { // training if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } - Tensor local_mean, local_var; + phi::DenseTensor local_mean, local_var; local_mean.mutable_data(mean->dims(), ctx.GetPlace()); local_var.mutable_data(variance->dims(), ctx.GetPlace()); MLUCnnlTensorDesc desc_mean_var(*mean_out); @@ -158,14 +157,14 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { desc_mean_var.get(), GetBasePtr(&local_var)); - Tensor input_count; + phi::DenseTensor input_count; input_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); FillMLUTensorWithHostValue( ctx, static_cast(x->numel() / C), &input_count); - Tensor count_all; - Tensor mean_all(mean->dtype()); - Tensor invstd_all(variance->dtype()); + phi::DenseTensor count_all; + phi::DenseTensor mean_all(mean->dtype()); + phi::DenseTensor invstd_all(variance->dtype()); #ifdef PADDLE_WITH_CNCL auto &dev_ctx = @@ -300,7 +299,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_inv_var = ctx.Input("SavedVariance"); - const Tensor *x; + const phi::DenseTensor *x; if (ctx.HasInput("Y")) { PADDLE_ENFORCE_EQ(true, false, @@ -342,9 +341,9 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { "OP(sync_batch_norm) be (1), but given (%d).", scale->dims().size())); - Tensor trans_x; - Tensor trans_dy; - Tensor trans_dx; + phi::DenseTensor trans_x; + phi::DenseTensor trans_dy; + phi::DenseTensor trans_dx; std::vector forward_perm; std::vector backward_perm; std::vector trans_shape; @@ -384,7 +383,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET], ToCnnlDataType()); - Tensor sum_dy, sum_dy_xmu; + phi::DenseTensor sum_dy, sum_dy_xmu; sum_dy.mutable_data(bias->dims(), ctx.GetPlace()); sum_dy_xmu.mutable_data(bias->dims(), ctx.GetPlace()); MLUCnnlTensorDesc desc_other_param(*bias); @@ -411,7 +410,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { d_scale ? true : false /*compute d_scale*/, d_bias ? 
true : false /*compute d_bias*/); - Tensor numel_count; + phi::DenseTensor numel_count; numel_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); FillMLUTensorWithHostValue( ctx, static_cast(x->numel() / C), &numel_count); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index 6cfd753c4ab6e..46b1ccc140ddb 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -20,8 +20,6 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void training_or_inference(const framework::ExecutionContext &ctx, const aclrtStream &stream, @@ -34,18 +32,18 @@ void training_or_inference(const framework::ExecutionContext &ctx, const int &W, const float epsilon, const float &momentum, - const Tensor *common_mean, - const Tensor *common_var, - const Tensor *x, - const Tensor *scale, - const Tensor *bias, - const Tensor *mean, - const Tensor *variance, - Tensor *mean_out, - Tensor *variance_out, - Tensor *saved_mean, - Tensor *saved_variance, - Tensor *y) { + const phi::DenseTensor *common_mean, + const phi::DenseTensor *common_var, + const phi::DenseTensor *x, + const phi::DenseTensor *scale, + const phi::DenseTensor *bias, + const phi::DenseTensor *mean, + const phi::DenseTensor *variance, + phi::DenseTensor *mean_out, + phi::DenseTensor *variance_out, + phi::DenseTensor *saved_mean, + phi::DenseTensor *saved_variance, + phi::DenseTensor *y) { std::vector axes; if (layout == phi::DataLayout::kNCHW) { axes = {0, 2, 3}; @@ -59,7 +57,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, else if (layout == phi::DataLayout::kNHWC) multiples = {N, H, W, 1}; - Tensor common_mean_tile_1; + phi::DenseTensor common_mean_tile_1; { common_mean_tile_1.Resize({C}); common_mean_tile_1.mutable_data(place); @@ -70,7 +68,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, common_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor common_mean_tile; + phi::DenseTensor common_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; common_mean_tile.Resize(x->dims()); @@ -80,7 +78,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor common_var_tile_1; + phi::DenseTensor common_var_tile_1; { common_var_tile_1.Resize({C}); common_var_tile_1.mutable_data(place); @@ -91,7 +89,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, common_var_tile_1.Resize({1, 1, 1, C}); } - Tensor common_var_tile; + phi::DenseTensor common_var_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; common_var_tile.Resize(x->dims()); @@ -101,7 +99,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor common_var_tile_add_epsilon; + phi::DenseTensor common_var_tile_add_epsilon; { framework::NPUAttributeMap attr_input = {{"value", epsilon}}; common_var_tile_add_epsilon.Resize(x->dims()); @@ -111,7 +109,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor common_var_tile_add_epsilon_sqrt; + phi::DenseTensor common_var_tile_add_epsilon_sqrt; { common_var_tile_add_epsilon_sqrt.Resize(x->dims()); common_var_tile_add_epsilon_sqrt.mutable_data(place); @@ -122,7 +120,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor x_sub_common_mean; + phi::DenseTensor x_sub_common_mean; { 
x_sub_common_mean.Resize(x->dims()); x_sub_common_mean.mutable_data(place); @@ -131,7 +129,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor normalized; + phi::DenseTensor normalized; { normalized.Resize(x->dims()); normalized.mutable_data(place); @@ -143,7 +141,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor scale_tile_1; + phi::DenseTensor scale_tile_1; { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); @@ -154,7 +152,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, scale_tile_1.Resize({1, 1, 1, C}); } - Tensor scale_tile; + phi::DenseTensor scale_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; scale_tile.Resize(x->dims()); @@ -164,7 +162,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor normalized_mul_scale; + phi::DenseTensor normalized_mul_scale; { normalized_mul_scale.Resize(x->dims()); normalized_mul_scale.mutable_data(place); @@ -173,7 +171,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor bias_tile_1; + phi::DenseTensor bias_tile_1; { bias_tile_1.Resize({C}); bias_tile_1.mutable_data(place); @@ -184,7 +182,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, bias_tile_1.Resize({1, 1, 1, C}); } - Tensor bias_tile; + phi::DenseTensor bias_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; bias_tile.Resize(x->dims()); @@ -203,7 +201,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, } if (!test_mode) { - Tensor ones; + phi::DenseTensor ones; { ones.Resize({C}); ones.mutable_data(place); @@ -212,7 +210,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, // cacl mean_out { - Tensor common_mean_mul_1_sub_momentum; + phi::DenseTensor common_mean_mul_1_sub_momentum; { framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; common_mean_mul_1_sub_momentum.Resize({C}); @@ -224,7 +222,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor mean_mul_momentum; + phi::DenseTensor mean_mul_momentum; { framework::NPUAttributeMap attr_input = {{"value", momentum}}; mean_mul_momentum.Resize({C}); @@ -246,7 +244,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, // cacl variance_out { - Tensor momentum_mul_var; + phi::DenseTensor momentum_mul_var; { framework::NPUAttributeMap attr_input = {{"value", momentum}}; momentum_mul_var.Resize({C}); @@ -256,7 +254,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor var_ref_mul_1_sub_momentum; + phi::DenseTensor var_ref_mul_1_sub_momentum; { framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; var_ref_mul_1_sub_momentum.Resize({C}); @@ -278,7 +276,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, // cacl saved_variance { - Tensor var_ref_add_epsilon; + phi::DenseTensor var_ref_add_epsilon; { framework::NPUAttributeMap attr_input = {{"value", epsilon}}; var_ref_add_epsilon.Resize({C}); @@ -288,7 +286,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor var_ref_add_epsilon_sqrt; + phi::DenseTensor var_ref_add_epsilon_sqrt; { var_ref_add_epsilon_sqrt.Resize({C}); var_ref_add_epsilon_sqrt.mutable_data(place); @@ -399,18 +397,18 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { } else { 
// training if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } // cacl saved_mean and var_ref - Tensor var_ref; + phi::DenseTensor var_ref; var_ref.Resize({C}); var_ref.mutable_data(place); { - Tensor x_sum; + phi::DenseTensor x_sum; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -421,7 +419,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square; + phi::DenseTensor x_square; { x_square.Resize(x->dims()); x_square.mutable_data(place); @@ -429,7 +427,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square_sum; + phi::DenseTensor x_square_sum; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -447,7 +445,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { HcclDataType dtype = platform::ToHCCLDataType( framework::TransToProtoVarType(mean_out->dtype())); - Tensor device_count_tensor; + phi::DenseTensor device_count_tensor; { device_count_tensor.Resize({1}); device_count_tensor.mutable_data(place); @@ -517,7 +515,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { // cacl var_ref { - Tensor saved_mean_square; + phi::DenseTensor saved_mean_square; { saved_mean_square.Resize({C}); saved_mean_square.mutable_data(place); @@ -526,7 +524,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tmp; + phi::DenseTensor var_ref_tmp; var_ref_tmp.Resize({C}); var_ref_tmp.mutable_data(place); { @@ -589,7 +587,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { auto *d_bias = ctx.Output(framework::GradVarName("Bias")); const auto *saved_mean = ctx.Input("SavedMean"); - const Tensor *x; + const phi::DenseTensor *x; if (ctx.HasInput("Y")) { PADDLE_ENFORCE_EQ(true, false, @@ -627,7 +625,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { float device_counts = 0.0; if (comm) { - Tensor device_count_tensor; + phi::DenseTensor device_count_tensor; { device_count_tensor.Resize({1}); device_count_tensor.mutable_data(place); @@ -660,13 +658,13 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } // cacl var_ref - Tensor var_ref; + phi::DenseTensor var_ref; var_ref.Resize({C}); var_ref.mutable_data(place); { // cacl var_ref { - Tensor x_square; + phi::DenseTensor x_square; { x_square.Resize(x->dims()); x_square.mutable_data(place); @@ -674,7 +672,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square_sum; + phi::DenseTensor x_square_sum; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -685,7 +683,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square_sum_mean; + phi::DenseTensor x_square_sum_mean; { framework::NPUAttributeMap attr_input = { {"value", 1.0f * C / x_numel}}; @@ -696,7 +694,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor mean_square; + phi::DenseTensor mean_square; { mean_square.Resize({C}); mean_square.mutable_data(place); @@ -714,7 +712,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor saved_mean_tile_1; + phi::DenseTensor saved_mean_tile_1; { saved_mean_tile_1.Resize({C}); 
saved_mean_tile_1.mutable_data(place); @@ -725,7 +723,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { saved_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor saved_mean_tile; + phi::DenseTensor saved_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; saved_mean_tile.Resize(x->dims()); @@ -735,7 +733,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_sub_saved_mean; + phi::DenseTensor x_sub_saved_mean; { x_sub_saved_mean.Resize(x->dims()); x_sub_saved_mean.mutable_data(place); @@ -744,7 +742,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tile_1; + phi::DenseTensor var_ref_tile_1; { var_ref_tile_1.Resize({C}); var_ref_tile_1.mutable_data(place); @@ -755,7 +753,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { var_ref_tile_1.Resize({1, 1, 1, C}); } - Tensor var_ref_tile; + phi::DenseTensor var_ref_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; var_ref_tile.Resize(x->dims()); @@ -765,7 +763,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tile_add_epsilon; + phi::DenseTensor var_ref_tile_add_epsilon; { framework::NPUAttributeMap attr_input = {{"value", epsilon}}; var_ref_tile_add_epsilon.Resize(x->dims()); @@ -775,7 +773,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tile_add_epsilon_sqrt; + phi::DenseTensor var_ref_tile_add_epsilon_sqrt; { var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); var_ref_tile_add_epsilon_sqrt.mutable_data(place); @@ -786,7 +784,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor dy_mul_x_sub_mean_for_scale; + phi::DenseTensor dy_mul_x_sub_mean_for_scale; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -804,7 +802,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor dy_mul_x_sub_mean; + phi::DenseTensor dy_mul_x_sub_mean; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -849,7 +847,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // cacl d_x if (d_x) { - Tensor dy_mean; + phi::DenseTensor dy_mean; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -896,7 +894,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor dy_mean_tile_1; + phi::DenseTensor dy_mean_tile_1; { dy_mean_tile_1.Resize({C}); dy_mean_tile_1.mutable_data(place); @@ -907,7 +905,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { dy_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor dy_mean_tile; + phi::DenseTensor dy_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; dy_mean_tile.Resize(x->dims()); @@ -917,7 +915,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor dy_sub_dy_mean; + phi::DenseTensor dy_sub_dy_mean; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -935,7 +933,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor dy_mul_x_sub_mean_mean; + phi::DenseTensor dy_mul_x_sub_mean_mean; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -948,7 +946,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - 
Tensor dy_mul_x_sub_mean_mean_tile_1; + phi::DenseTensor dy_mul_x_sub_mean_mean_tile_1; { dy_mul_x_sub_mean_mean_tile_1.Resize({C}); dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); @@ -960,7 +958,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor dy_mul_x_sub_mean_mean_tile; + phi::DenseTensor dy_mul_x_sub_mean_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); @@ -974,7 +972,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // (x - mean) * np.mean(dy * (x - mean), axis=axis) // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile - Tensor tmp1; + phi::DenseTensor tmp1; { tmp1.Resize(x->dims()); tmp1.mutable_data(place); @@ -986,7 +984,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) // tmp1 / (var + epsilon) // tmp1 / var_ref_tile_add_epsilon - Tensor tmp2; + phi::DenseTensor tmp2; { tmp2.Resize(x->dims()); tmp2.mutable_data(place); @@ -998,7 +996,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / // (var + epsilon) // dy_sub_dy_mean - tmp2 - Tensor tmp3; + phi::DenseTensor tmp3; { tmp3.Resize(x->dims()); tmp3.mutable_data(place); @@ -1007,7 +1005,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor scale_tile_1; + phi::DenseTensor scale_tile_1; { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); @@ -1018,7 +1016,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { scale_tile_1.Resize({1, 1, 1, C}); } - Tensor scale_tile; + phi::DenseTensor scale_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; scale_tile.Resize(x->dims()); @@ -1031,7 +1029,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // scale * (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), // axis) / (var + epsilon)) // scale * tmp3 - Tensor dx_1; + phi::DenseTensor dx_1; { dx_1.Resize(x->dims()); dx_1.mutable_data(place); @@ -1052,7 +1050,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // cacl d_scale if (d_scale) { - Tensor d_scale_2; + phi::DenseTensor d_scale_2; { d_scale_2.Resize(x->dims()); d_scale_2.mutable_data(place); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc index d4f06e6446887..3eed4989bb7ea 100644 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ b/paddle/fluid/operators/take_along_axis_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class NPUTakeAlongAxisKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 3f781ab65eeb8..b41453b849bc4 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -28,7 +28,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index d98680c574154..1ba0e2c66be8d 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -29,7 +29,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using Sampler = math::Sampler; using DDim = framework::DDim; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index f880181662e24..bad4479868053 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -201,15 +199,17 @@ class TeacherStudentSigmoidLossOpMaker public: void Make() override { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape [N x 1]," + "(phi::DenseTensor, default phi::DenseTensor), a 2-D " + "tensor with shape [N x 1]," " where N is the batch size and D is the output. " "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); AddInput("Label", - "(Tensor), the ground truth which is a 2-D tensor. " - "Label is a Tensor with shape [N x 1]. "); + "(phi::DenseTensor), the ground truth which is a 2-D tensor. " + "Label is a phi::DenseTensor with shape [N x 1]. "); AddOutput("Y", - "(Tensor, default Tensor), a 2-D tensor with shape " + "(phi::DenseTensor, default phi::DenseTensor), a 2-D " + "tensor with shape " "[N x 1]. The teacher student sigmoid loss."); AddAttr( "soft_max_up_bound", diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index 40bac8c364583..133d9656284f3 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index 5ea2ead118892..ec2533316e107 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -16,7 +16,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template diff --git a/paddle/fluid/operators/tile_op_mlu.cc b/paddle/fluid/operators/tile_op_mlu.cc index 2b2b3df4431f1..3660627b8b578 100644 --- a/paddle/fluid/operators/tile_op_mlu.cc +++ b/paddle/fluid/operators/tile_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class TileMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 2997052257d18..4ae1f6cbed330 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index ab90fa78d3d45..f1674bc5005a0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -30,8 +30,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - #define FIXED_BLOCK_DIM_BASE(dim, ...) \ case (dim): { \ constexpr auto kBlockDim = (dim); \ @@ -74,7 +72,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { auto* k_t = ctx.Input("K"); if (k_t) { - Tensor k_host; + phi::DenseTensor k_host; framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); k = k_host.data()[0]; framework::DDim output_dims = output->dims(); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index cd29137d530f4..27f246415a94c 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class TopkKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index cbe5c224ae1d3..5b9b507989952 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -65,7 +65,7 @@ class TopkNPUKernel : public framework::OpKernel { {"dim", -1}, {"largest", true}}; - Tensor tmp_indices(experimental::DataType::INT32); + phi::DenseTensor tmp_indices(experimental::DataType::INT32); tmp_indices.Resize(indices->dims()); tmp_indices.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index 25f3faa38a0c5..df1725265ebde 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class TopkXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index ee37c2e9fe09b..cab0796a71019 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -22,7 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template class TreeConvKernel : public framework::OpKernel { @@ -40,7 +39,7 @@ class TreeConvKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - Tensor W; + phi::DenseTensor W; W.ShareDataWith(*Filter); W.Resize(phi::flatten_to_2d(Filter->dims(), 2)); @@ -67,7 +66,7 @@ class TreeConvKernel : public framework::OpKernel { auto embeddings = Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim); auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim); - Tensor patch; + phi::DenseTensor patch; tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); constant(dev_ctx, &out_vec, 0); blas.MatMul(patch, W, &out_vec); @@ -93,7 +92,7 @@ class TreeConvGradKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - Tensor W; + phi::DenseTensor W; W.ShareDataWith(*Filter); W.Resize(phi::flatten_to_2d(Filter->dims(), 1)); @@ -110,7 +109,7 @@ class TreeConvGradKernel : public framework::OpKernel { out_grad_dims = phi::flatten_to_2d(out_grad_dims, 1); if (filter_g) { filter_g->mutable_data(Filter->dims(), ctx.GetPlace()); - Tensor f_g; + phi::DenseTensor f_g; f_g.ShareDataWith(*filter_g); f_g.Resize(phi::flatten_to_2d(Filter->dims(), 2)); constant(dev_ctx, filter_g, 0); @@ -121,7 +120,7 @@ class TreeConvGradKernel : public framework::OpKernel { .Resize(embedding_slicedim); auto out_grad = out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - Tensor patch; + phi::DenseTensor patch; tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0)); } @@ -138,7 +137,7 @@ class TreeConvGradKernel : public framework::OpKernel { out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); auto in_grad = in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims); - Tensor in_grad_temp; + phi::DenseTensor in_grad_temp; col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth); blas.MatMul(in_grad_temp, false, W, true, &in_grad); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index b5e67ccb24a9a..c3b2e24892e40 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -22,37 +22,35 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal std::vector shape = ctx.Attr>("shape"); - Tensor shape_tensor(experimental::DataType::INT32); + phi::DenseTensor shape_tensor(experimental::DataType::INT32); shape_tensor.mutable_data({static_cast(shape.size())}, ctx.GetPlace()); paddle::framework::TensorFromVector( shape, ctx.device_context(), &shape_tensor); float mean = ctx.Attr("mean"); - Tensor mean_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor mean_tensor(experimental::DataType::FLOAT32); mean_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&mean_tensor, mean); float std = ctx.Attr("std"); - Tensor std_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor std_tensor(experimental::DataType::FLOAT32); std_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&std_tensor, std); int32_t seed_var = ctx.Attr("seed"); - Tensor min_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor min_tensor(experimental::DataType::FLOAT32); min_tensor.mutable_data({1}, ctx.GetPlace()); float min_value = mean - std * 2.0; FillNpuTensorWithConstant(&min_tensor, min_value); - Tensor max_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor max_tensor(experimental::DataType::FLOAT32); max_tensor.mutable_data({1}, ctx.GetPlace()); float max_value = mean + std * 2.0; FillNpuTensorWithConstant(&max_tensor, max_value); @@ -83,7 +81,7 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); std::uniform_real_distribution dist(std::numeric_limits::min(), diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 0da82f73028d9..7ba22baff99b9 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -145,7 +145,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensorList" || var_name == "ShapeTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 8dd7a140ae914..4c60cb76fb9ea 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -46,7 +46,8 @@ class GPUUniformRandomKernel : public framework::OpKernel { if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be Tensor, " + "Expected type of Output(out) in uniform_random_op must be " + "phi::DenseTensor, " "SelectedRows. 
But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 3ddf6092f04bf..05a643b33b215 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -30,7 +30,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; inline std::vector GetNewDataFromShapeTensor( const phi::DenseTensor* new_data_tensor) { diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc index 1f7f3e2f2bad3..8e5f61c831088 100644 --- a/paddle/fluid/operators/uniform_random_op_mlu.cc +++ b/paddle/fluid/operators/uniform_random_op_mlu.cc @@ -50,7 +50,8 @@ class MLUUniformRandomKernel : public framework::OpKernel { if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be Tensor, " + "Expected type of Output(out) in uniform_random_op must be " + "phi::DenseTensor, " "SelectedRows. But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); @@ -59,7 +60,7 @@ class MLUUniformRandomKernel : public framework::OpKernel { tensor->mutable_data(ctx.GetPlace()); int64_t size = tensor->numel(); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc index b1499b30fede7..e82c6e1f2a91a 100644 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ b/paddle/fluid/operators/uniform_random_op_npu.cc @@ -52,7 +52,8 @@ class NPUUniformRandomKernel : public framework::OpKernel { if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be Tensor, " + "Expected type of Output(out) in uniform_random_op must be " + "phi::DenseTensor, " "SelectedRows. But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); @@ -60,7 +61,7 @@ class NPUUniformRandomKernel : public framework::OpKernel { tensor->mutable_data(ctx.GetPlace()); int64_t size = tensor->numel(); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 35118ae64876c..b470874f26083 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; void VarConv2dOpMaker::Make() { @@ -36,7 +35,7 @@ void VarConv2dOpMaker::Make() { "(phi::DenseTensor) the row variable provides lod information"); AddInput("COLUMN", "(phi::DenseTensor) the column variable provides lod information"); - AddInput("W", "W (Tensor), the filter."); + AddInput("W", "W (phi::DenseTensor), the filter."); AddAttr("InputChannel", "the input filter num").SetDefault(1); AddAttr("OutputChannel", "the output filter num").SetDefault(1); AddAttr("StrideH", "the height of Stride").SetDefault(1); @@ -130,11 +129,11 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { framework::Variable* x_var = PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); const auto& x_lod = x_var->Get().lod(); - PADDLE_ENFORCE_EQ( - !x_lod.empty(), - true, - platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP " - "does not contain LoD information.")); + PADDLE_ENFORCE_EQ(!x_lod.empty(), + true, + platform::errors::InvalidArgument( + "The Input(X) phi::DenseTensor of VarConv2dOP " + "does not contain LoD information.")); PADDLE_ENFORCE_GE(x_lod.size(), 1, @@ -151,20 +150,22 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { framework::Variable* row_var = PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]); const auto& row_lod = row_var->Get().lod(); - PADDLE_ENFORCE_EQ(!row_lod.empty(), - true, - platform::errors::InvalidArgument( - "The Input(ROW) Tensor of VarConv2dOP does not " - "contain LoD information.")); + PADDLE_ENFORCE_EQ( + !row_lod.empty(), + true, + platform::errors::InvalidArgument( + "The Input(ROW) phi::DenseTensor of VarConv2dOP does not " + "contain LoD information.")); framework::Variable* col_var = PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("COLUMN")[0]); const auto& col_lod = col_var->Get().lod(); - PADDLE_ENFORCE_EQ(!col_lod.empty(), - true, - platform::errors::InvalidArgument( - "The Input(COLUMN) Tensor of VarConv2dOP does not " - "contain LoD information.")); + PADDLE_ENFORCE_EQ( + !col_lod.empty(), + true, + platform::errors::InvalidArgument( + "The Input(COLUMN) phi::DenseTensor of VarConv2dOP does not " + "contain LoD information.")); } else { std::vector out_dims_vec{-1}; out_dims_vec.push_back(1); @@ -468,7 +469,7 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* d_w = ctx.Output(framework::GradVarName("W")); - Tensor col_grad; + phi::DenseTensor col_grad; col_grad.Resize(col->dims()); auto* col_diff = col_grad.mutable_data(ctx.GetPlace()); auto* dx_data = dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h index 1a5fa9de2c7ce..cc0c97e671e8a 100644 --- a/paddle/fluid/operators/var_conv_2d_op.h +++ b/paddle/fluid/operators/var_conv_2d_op.h @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; class VarConv2dOP : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/where_index_op_mlu.cc b/paddle/fluid/operators/where_index_op_mlu.cc index 85f463f723ef5..59ffb43f7ce5c 100644 --- a/paddle/fluid/operators/where_index_op_mlu.cc +++ b/paddle/fluid/operators/where_index_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class MLUWhereIndexKernel : public framework::OpKernel { public: @@ -31,7 +29,7 @@ class MLUWhereIndexKernel : public framework::OpKernel { auto dims = condition->dims(); const int rank = dims.size(); - Tensor num_true; + phi::DenseTensor num_true; num_true.mutable_data({1}, context.GetPlace()); MLUCnnlTensorDesc con_desc(*condition); MLUCnnlTensorDesc num_true_desc(num_true); @@ -41,7 +39,7 @@ class MLUWhereIndexKernel : public framework::OpKernel { num_true_desc.get(), GetBasePtr(&num_true)); - Tensor local_true_num; + phi::DenseTensor local_true_num; paddle::framework::TensorCopySync( num_true, platform::CPUPlace(), &local_true_num); auto true_num = *local_true_num.data(); diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc index 5b006cbdcf1b0..d888513c2ebd2 100644 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ b/paddle/fluid/operators/where_index_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class NPUWhereIndexKernel : public framework::OpKernel { public: @@ -39,7 +37,7 @@ class NPUWhereIndexKernel : public framework::OpKernel { const aclrtStream& stream = dev_ctx.stream(); // Run Cast and ReduceSum to get 0 dim of Out - Tensor booled_cond; + phi::DenseTensor booled_cond; if (framework::TransToProtoVarType(condition->dtype()) != framework::proto::VarType::BOOL) { auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL); @@ -53,7 +51,7 @@ class NPUWhereIndexKernel : public framework::OpKernel { } else { booled_cond.ShareDataWith(*condition); } - Tensor casted_cond; + phi::DenseTensor casted_cond; auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64); casted_cond.mutable_data(dims, place); const auto& cast_runner = @@ -63,9 +61,9 @@ class NPUWhereIndexKernel : public framework::OpKernel { {{"dst_type", static_cast(dst_dtype)}}); cast_runner.Run(stream); - Tensor sumed_true_num; + phi::DenseTensor sumed_true_num; sumed_true_num.mutable_data({1}, place); - Tensor cond_axes; + phi::DenseTensor cond_axes; cond_axes.mutable_data({dims.size()}, place); std::vector axes_vec; for (int i = 0; i < dims.size(); ++i) { @@ -78,7 +76,7 @@ class NPUWhereIndexKernel : public framework::OpKernel { {{"keep_dims", false}}); sum_runner.Run(stream); - Tensor local_true_num; + phi::DenseTensor local_true_num; paddle::framework::TensorCopySync( sumed_true_num, platform::CPUPlace(), &local_true_num); auto true_num = *local_true_num.data(); From 1a3d25921456f19e8dd734805a6ef2989339e918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Wed, 7 Dec 2022 15:00:59 +0800 Subject: [PATCH 26/60] [Zero-Dim] Support 0D for paddle.diagflat (#48735) * [Zero-Dim] Support 0D for paddle.diagflat --- paddle/phi/infermeta/unary.cc | 4 +- paddle/phi/kernels/cpu/diag_grad_kernel.cc | 6 +-- paddle/phi/kernels/cpu/diag_kernel.cc | 6 +-- paddle/phi/kernels/gpu/diag_grad_kernel.cu | 6 +-- paddle/phi/kernels/gpu/diag_kernel.cu | 6 +-- .../tests/unittests/test_zero_dim_tensor.py | 49 +++++++++++++++++++ .../unittests/xpu/test_zero_dim_tensor_xpu.py | 29 +++++++++++ python/paddle/tensor/creation.py | 4 +- 8 files changed, 94 insertions(+), 16 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 768c33d4f4ad6..c3b96b813b8c3 100644 --- a/paddle/phi/infermeta/unary.cc +++ 
b/paddle/phi/infermeta/unary.cc @@ -563,8 +563,8 @@ void DiagInferMeta(const MetaTensor& x, MetaTensor* out) { auto x_dims = x.dims(); - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); + if (x_dims.size() <= 1) { + int64_t size_ = (x_dims.size() == 1UL ? x_dims[0] : 1) + std::abs(offset); out->set_dims({size_, size_}); out->set_dtype(x.dtype()); } else if (x_dims.size() == 2UL) { diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc index 616ea753ef1ba..13d3d679ff006 100644 --- a/paddle/phi/kernels/cpu/diag_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc @@ -32,9 +32,9 @@ void DiagGradKernel(const Context& dev_ctx, auto dx_dims = x_grad->dims(); auto dout_dims = out_grad.dims(); - if (dx_dims.size() == 1) { - auto dx_length = dx_dims[0]; - int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + if (dx_dims.size() <= 1) { + auto dx_length = (dx_dims.size() == 1 ? dx_dims[0] : int64_t(1)); + int dx_stride = 1; auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc index 4b060f0372a5b..1576d80b15206 100644 --- a/paddle/phi/kernels/cpu/diag_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -33,12 +33,12 @@ void DiagKernel(const Context& dev_ctx, auto out_dims = out->dims(); int64_t i; - if (x_dims.size() == 1) { + if (x_dims.size() <= 1) { phi::funcs::SetConstant set_padding_value; set_padding_value(dev_ctx, out, static_cast(padding_value)); - auto x_length = x_dims[0]; - const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + auto x_length = (x_dims.size() == 1UL ? x_dims[0] : int64_t(1)); + const int& x_stride = 1; auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); auto out_stride_1 = phi::funcs::ComputeStride(1, out_dims); diff --git a/paddle/phi/kernels/gpu/diag_grad_kernel.cu b/paddle/phi/kernels/gpu/diag_grad_kernel.cu index 5a579ecc27b7f..39ac78dae0216 100644 --- a/paddle/phi/kernels/gpu/diag_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diag_grad_kernel.cu @@ -73,10 +73,10 @@ void DiagGradKernel(const Context& dev_ctx, return std::tuple{block_size, grid_size}; }; - if (dx_dims.size() == 1) { - auto dx_length = dx_dims[0]; + if (dx_dims.size() <= 1) { + auto dx_length = (dx_dims.size() == 1 ? dx_dims[0] : int64_t(1)); auto size = (offset > 0) ? dx_length + offset : dx_length - offset; - int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride = 1; if (size > 0) { auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu index 95d3d3365d91b..588bb17b79a0d 100644 --- a/paddle/phi/kernels/gpu/diag_kernel.cu +++ b/paddle/phi/kernels/gpu/diag_kernel.cu @@ -77,13 +77,13 @@ void DiagKernel(const Context& dev_ctx, return std::tuple{block_size, grid_size}; }; - if (x_dims.size() == 1) { + if (x_dims.size() <= 1) { phi::funcs::SetConstant set_padding_value; set_padding_value(dev_ctx, out, static_cast(padding_value)); - auto x_length = x_dims[0]; + auto x_length = (x_dims.size() == 1UL ? x_dims[0] : int64_t(1)); auto size = (offset > 0) ? 
x_length + offset : x_length - offset; - const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + const int& x_stride = 1; if (size > 0) { const auto& out_stride_0 = phi::funcs::ComputeStride(0, out_dims); const auto& out_stride_1 = phi::funcs::ComputeStride(1, out_dims); diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index e7381350624b9..eae5528fba244 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -653,6 +653,35 @@ def test_scatter_XD(self): self.assertEqual(out.numpy()[1][i], updates.numpy()[i]) self.assertEqual(out.grad.shape, [2, 3]) + def test_diagflat(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + + out1 = paddle.diagflat(x1, 1) + out2 = paddle.diagflat(x2, -1) + out3 = paddle.diagflat(x3, 0) + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(out1.shape, [2, 2]) + self.assertEqual(out2.shape, [2, 2]) + self.assertEqual(out3.shape, [1, 1]) + + self.assertEqual(out1.grad.shape, [2, 2]) + self.assertEqual(out2.grad.shape, [2, 2]) + self.assertEqual(out3.grad.shape, [1, 1]) + + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x3.grad.shape, []) + class TestSundryAPIStatic(unittest.TestCase): def setUp(self): @@ -796,6 +825,26 @@ def test_scatter_XD(self): for i in range(3): self.assertEqual(res[0][1][i], 4) + @prog_scope() + def test_diagflat(self): + x1 = paddle.rand([]) + out1 = paddle.diagflat(x1, 1) + paddle.static.append_backward(out1) + + x2 = paddle.rand([]) + out2 = paddle.diagflat(x2, -1) + paddle.static.append_backward(out2) + + x3 = paddle.rand([]) + out3 = paddle.diagflat(x3) + paddle.static.append_backward(out3) + + prog = paddle.static.default_main_program() + res1, res2, res3 = self.exe.run(prog, fetch_list=[out1, out2, out3]) + self.assertEqual(res1.shape, (2, 2)) + self.assertEqual(res2.shape, (2, 2)) + self.assertEqual(res3.shape, (1, 1)) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
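For reference, the contract these tests pin down: paddle.diagflat now treats a 0-D (scalar) input as a length-1 vector, so the output is a square matrix of side 1 + |offset|. A minimal dygraph sketch, with shapes matching the assertions above:

    import paddle

    x = paddle.rand([])                  # 0-D tensor (scalar)
    print(paddle.diagflat(x).shape)      # [1, 1]
    print(paddle.diagflat(x, 1).shape)   # [2, 2]: the scalar lands on the +1 diagonal
    print(paddle.diagflat(x, -1).shape)  # [2, 2]: the scalar lands on the -1 diagonal
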
class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index b07043689f7fe..a6f91e5df4c66 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -475,6 +475,35 @@ def test_scatter_XD(self): for i in range(3): self.assertEqual(out.numpy()[1][i], updates.numpy()[i]) + def test_diagflat(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + + out1 = paddle.diagflat(x1, 1) + out2 = paddle.diagflat(x2, -1) + out3 = paddle.diagflat(x3, 0) + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(out1.shape, [2, 2]) + self.assertEqual(out2.shape, [2, 2]) + self.assertEqual(out3.shape, [1, 1]) + + self.assertEqual(out1.grad.shape, [2, 2]) + self.assertEqual(out2.grad.shape, [2, 2]) + self.assertEqual(out3.grad.shape, [1, 1]) + + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x3.grad.shape, []) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 134e27eef9df6..d597ff6a1317f 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1479,7 +1479,7 @@ def diagflat(x, offset=0, name=None): """ padding_value = 0 if in_dygraph_mode(): - if len(x.shape) == 1: + if len(x.shape) <= 1: return _C_ops.diag(x, offset, padding_value) else: y = _C_ops.flatten(x, 0, -1) @@ -1509,7 +1509,7 @@ def diagflat(x, offset=0, name=None): out1_shape = helper.create_variable_for_type_inference(x.dtype) out2 = helper.create_variable_for_type_inference(dtype=x.dtype) - if len(x.shape) == 1: + if len(x.shape) <= 1: helper.append_op( type='diag_v2', inputs={'X': x}, From 9a9e0aa02cc4e623d94b98086fce6a17b9ccf8ef Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 7 Dec 2022 15:23:46 +0800 Subject: [PATCH 27/60] =?UTF-8?q?=E3=80=90fluid=20api=20clear=E3=80=91Move?= =?UTF-8?q?=20batch=20norm1=20(#47965)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify slice infershape * code style * modify slice_unittest * temp fix * batch_norm api move * code_style * codestyle * ci_static * add __init__ * reset other change * revert .cc * add import batchnorm * conflict and revert * fix bug * fix third conflict one day * fix conflict * fix conflict bug * fix conflict bug * modify api * code_style * modify doc * add lost doc stable * fix conflict bug * ci lack of gpu --- python/paddle/fluid/contrib/layers/nn.py | 5 +- .../fluid/contrib/slim/tests/test_graph.py | 2 +- .../tests/test_quantization_mkldnn_pass.py | 2 +- .../slim/tests/test_quantization_pass.py | 6 +- .../tests/test_quantization_scale_pass.py | 2 +- .../tests/test_user_defined_quantization.py | 2 +- .../tests/test_image_classification_fp16.py | 4 +- .../tests/test_multi_precision_fp16_train.py | 2 +- .../contrib/tests/test_quantize_transpiler.py | 4 +- python/paddle/fluid/layers/nn.py | 323 ------------------ python/paddle/fluid/nets.py | 2 +- .../tests/book/test_image_classification.py | 4 +- .../fluid/tests/book/test_recognize_digits.py | 2 +- 
.../fluid/tests/unittests/dist_se_resnext.py | 2 +- .../unittests/ipu/test_batch_norm_op_ipu.py | 2 +- .../ir/inference/test_trt_activation_pass.py | 3 +- .../inference/test_trt_anchor_generator_op.py | 3 +- .../ir/inference/test_trt_elementwise_op.py | 3 +- .../ir/inference/test_trt_flatten_op.py | 5 +- .../ir/inference/test_trt_gather_nd_op.py | 5 +- .../ir/inference/test_trt_inspector.py | 2 +- .../ir/inference/test_trt_instance_norm_op.py | 6 +- .../unittests/ir/inference/test_trt_matmul.py | 7 +- .../test_trt_matmul_quant_dequant.py | 4 +- .../inference/test_trt_multiclass_nms3_op.py | 3 +- .../inference/test_trt_multiclass_nms_op.py | 3 +- .../inference/test_trt_nearest_interp_op.py | 3 +- .../test_trt_nearest_interp_v2_op.py | 3 +- .../unittests/ir/inference/test_trt_pad_op.py | 3 +- .../ir/inference/test_trt_pool3d_op.py | 6 +- .../ir/inference/test_trt_pool_op.py | 3 +- .../ir/inference/test_trt_reduce_sum_op.py | 5 +- .../ir/inference/test_trt_reshape_op.py | 9 +- .../ir/inference/test_trt_scale_op.py | 5 +- .../test_trt_shuffle_channel_detect_pass.py | 4 +- .../ir/inference/test_trt_slice_plugin.py | 7 +- .../ir/inference/test_trt_subgraph_pass.py | 15 +- .../ir/inference/test_trt_tile_op.py | 8 +- ..._trt_transpose_flatten_concat_fuse_pass.py | 2 +- .../unittests/mlu/sync_batch_norm_op_mlu.py | 2 +- .../unittests/mlu/test_batch_norm_op_mlu.py | 4 +- .../unittests/npu/sync_batch_norm_op_npu.py | 2 +- .../fluid/tests/unittests/seresnext_net.py | 2 +- .../fluid/tests/unittests/simple_nets.py | 2 +- .../test_async_ssa_graph_executor_mnist.py | 2 +- .../tests/unittests/test_batch_norm_op.py | 4 +- .../tests/unittests/test_fetch_unmerged.py | 2 +- .../tests/unittests/test_fuse_bn_act_pass.py | 4 +- .../unittests/test_fuse_bn_add_act_pass.py | 6 +- .../test_fuse_relu_depthwise_conv_pass.py | 2 +- .../test_image_classification_layer.py | 4 +- .../test_imperative_load_static_param.py | 4 +- .../test_imperative_static_runner_mnist.py | 2 +- .../tests/unittests/test_inplace_abn_op.py | 2 +- .../tests/unittests/test_ir_inplace_pass.py | 2 +- .../fluid/tests/unittests/test_layers.py | 4 +- .../test_load_state_dict_from_old_format.py | 2 +- .../test_mix_precision_all_reduce_fuse.py | 2 +- .../tests/unittests/test_norm_nn_grad.py | 4 +- .../test_parallel_executor_fetch_feed.py | 4 +- .../unittests/test_parallel_executor_mnist.py | 2 +- .../tests/unittests/test_set_bool_attr.py | 4 +- .../unittests/test_sync_batch_norm_op.py | 2 +- python/paddle/static/__init__.py | 2 + python/paddle/static/nn/__init__.py | 3 +- python/paddle/static/nn/common.py | 322 +++++++++++++++++ 66 files changed, 450 insertions(+), 428 deletions(-) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 02c5a7bfe4f87..fffc3cd5a6e3f 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1963,8 +1963,11 @@ def fused_bn_add_act( Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() + # required: gpu def build_program(main_program, startup_program): with fluid.program_guard(main_program, startup_program): x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') @@ -1987,7 +1990,7 @@ def build_program(main_program, startup_program): act=None, bias_attr=False, data_format='NHWC') - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( input=conv1_1, act=None, data_layout='NHWC') diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py index 482c7237bfce8..1b692bcaafb0e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py @@ -37,7 +37,7 @@ def conv_block(): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py index 23b89512454a7..a89042c0b5959 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py @@ -37,7 +37,7 @@ def conv_net(img, label): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index 7fa95fd13f494..f49d019bc1752 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -57,7 +57,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) data = fluid.layers.data( name='image', @@ -102,7 +102,7 @@ def conv_net(img, label, quant_skip_pattern): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -712,7 +712,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) data1 = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') data2 = fluid.layers.data( diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index 46e3700246037..d19b62a376279 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -43,7 +43,7 @@ def conv_net(img, label): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py 
b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index 25656278137a7..fc5d18227b92a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -45,7 +45,7 @@ def conv_net(img, label): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index 908622d76a154..b5df94c0cb497 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -41,7 +41,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: @@ -97,7 +97,7 @@ def conv_block(input, num_filter, groups, dropouts): drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) fc1 = fluid.layers.fc(input=drop, size=4096, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') + bn = paddle.static.nn.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) return fc2 diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index 8f4bf36e5b2b5..b3d12bf9a4ba3 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -59,7 +59,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py index cdbd65fad68a6..b2f166def0798 100644 --- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -48,7 +48,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -72,7 +72,7 @@ def conv_net(img, label): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 474bccc162e2b..9d4429ef04685 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -69,7 +69,6 @@ 'crf_decoding', 'conv2d', 'pool2d', - 'batch_norm', 'dropout', 'split', 'l2_normalize', @@ -1681,328 +1680,6 @@ def is_list_or_tuple(ele): return pool_out -def batch_norm( - input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - 
in_place=False, - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, -): - r""" - :api_attr: Static Graph - - **Batch Normalization Layer** - - Can be used as a normalizer function for convolution or fully_connected operations. - The required data format for this layer is one of the following: - - 1. NHWC `[batch, in_height, in_width, in_channels]` - - 2. NCHW `[batch, in_channels, in_height, in_width]` - - Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing - Internal Covariate Shift `_ - for more details. - - :math:`input` is the input features over a mini-batch. - - .. math:: - - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift - - moving\_mean = moving\_mean * momentum + mini-batch\_mean * (1. - momentum) \\\\ - moving\_var = moving\_var * momentum + mini-batch\_var * (1. - momentum) - - - moving_mean is global mean and moving_var is global variance. - - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. - They are global (or running) statistics. (It usually got from the - pre-trained model.) - The training and testing (or inference) have the same behavior: - - .. math:: - - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta - - Note: - if build_strategy.sync_batch_norm=True, the batch_norm in network will use - sync_batch_norm automatically. - `is_test = True` can only be used in test program and inference program, `is_test` CANNOT be set to True in train program, if you want to use global status from pre_train model in train program, please set `use_global_stats = True`. - - Args: - input(Tensor): The rank of input Tensor can be 2, 3, 4, 5. The data type - is float16 or float32 or float64. - act(string, Default None): Activation type, linear|relu|prelu|... - is_test (bool, Default False): A flag indicating whether it is in - test phrase or not. - momentum(float|Tensor, Default 0.9): The value used for the moving_mean and - moving_var computation. This should be a float number or a Tensor with - shape [1] and data type as float32. The updated formula is: - :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` - :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` - Default is 0.9. - epsilon(float, Default 1e-05): A value added to the denominator for - numerical stability. Default is 1e-5. - param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` - of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized - with Xavier. Default: None. - bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. - If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
- If the Initializer of the bias_attr is not set, the bias is initialized zero. - Default: None. - data_layout (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - in_place(bool, Default False): Make the input and output of batch norm reuse memory. - name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. - Usually name is no need to set and None by default. - moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it - is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm - will save global mean with the string. - moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance. - If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm - will save global variance with the string. - do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model - average when model average is enabled. - use_global_stats(bool, Default False): Whether to use global mean and - variance. In inference or test mode, set use_global_stats to true - or is_test to true, and the behavior is equivalent. - In train mode, when setting use_global_stats True, the global mean - and variance are also used during train period. - Returns: - A Tensor which is the result after applying batch normalization on the input, - has same shape and data type with input. - - Examples: - - .. code-block:: python - - import paddle - - paddle.enable_static() - x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32') - hidden1 = paddle.static.nn.fc(x=x, size=200) - print(hidden1.shape) - # [3, 200] - hidden2 = paddle.static.nn.batch_norm(input=hidden1) - print(hidden2.shape) - # [3, 200] - """ - assert ( - bias_attr is not False - ), "bias_attr should not be False in batch_norm." 
- helper = LayerHelper('batch_norm', **locals()) - - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'batch_norm' - ) - dtype = helper.input_dtype() - - # use fp32 for bn parameter - if dtype == core.VarDesc.VarType.FP16: - dtype = core.VarDesc.VarType.FP32 - - input_shape = input.shape - if data_layout == 'NCHW': - channel_num = input_shape[1] - else: - if data_layout == 'NHWC': - channel_num = input_shape[-1] - else: - raise ValueError("unsupported data layout:" + data_layout) - - param_shape = [channel_num] - - # create parameter - scale = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0), - ) - bias = helper.create_parameter( - attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True - ) - - mean = helper.create_parameter( - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=dtype, - ) - mean.stop_gradient = True - - variance = helper.create_parameter( - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=dtype, - ) - variance.stop_gradient = True - - # create output - # mean and mean_out share the same memory - mean_out = mean - # variance and variance_out share the same memory - variance_out = variance - - if in_dygraph_mode(): - inputs_has_MomemtumTensor = False - attrs_has_momentum = False - tmp_tensor_type = core.eager.Tensor - if isinstance(momentum, tmp_tensor_type): - inputs_has_MomemtumTensor = True - else: - attrs_has_momentum = True - - attrs_ = () - if attrs_has_momentum: - attrs_ = ( - 'momentum', - momentum, - 'epsilon', - epsilon, - 'is_test', - is_test, - 'data_layout', - data_layout, - 'use_mkldnn', - False, - 'fuse_with_relu', - False, - 'use_global_stats', - use_global_stats, - ) - else: - attrs_ = ( - 'epsilon', - epsilon, - 'is_test', - is_test, - 'data_layout', - data_layout, - 'use_mkldnn', - False, - 'fuse_with_relu', - False, - 'use_global_stats', - use_global_stats, - ) - if inputs_has_MomemtumTensor: - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, - scale, - bias, - mean, - variance, - momentum, - mean_out, - variance_out, - *attrs_, - ) - else: - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, - scale, - bias, - mean, - variance, - None, - mean_out, - variance_out, - *attrs_, - ) - - return dygraph_utils._append_activation_in_dygraph( - batch_norm_out, act=act, use_mkldnn=False - ) - - saved_mean = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - reserve_space = None - if not is_test: - reserve_space = helper.create_variable_for_type_inference( - dtype=helper.input_dtype(), stop_gradient=True - ) - - batch_norm_out = ( - input if in_place else helper.create_variable_for_type_inference(dtype) - ) - - inputs = { - "X": input, - "Scale": scale, - "Bias": bias, - "Mean": mean, - "Variance": variance, - "MeanOut": mean_out, - "VarianceOut": variance_out, - } - attrs = { - "epsilon": epsilon, - "is_test": is_test, - "data_layout": data_layout, - "use_mkldnn": False, - "fuse_with_relu": False, - "use_global_stats": use_global_stats, - } - if isinstance(momentum, Variable): - inputs['MomemtumTensor'] = momentum - else: - 
attrs['momentum'] = momentum - - outputs = { - "Y": batch_norm_out, - "MeanOut": mean_out, - "VarianceOut": variance_out, - "SavedMean": saved_mean, - "SavedVariance": saved_variance, - } - if reserve_space is not None: - outputs["ReserveSpace"] = reserve_space - - helper.append_op( - type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs - ) - - return helper.append_activation(batch_norm_out) - - @templatedoc() def layer_norm( input, diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 0a781e67a82fc..2e8c83be2423f 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -253,7 +253,7 @@ def __extend_list__(obj): ) if conv_with_batchnorm[i]: - tmp = layers.batch_norm(input=tmp, act=conv_act) + tmp = paddle.static.nn.batch_norm(input=tmp, act=conv_act) drop_rate = conv_batchnorm_drop_rate[i] if abs(drop_rate) > 1e-5: tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 3a401df20370d..77a59bc037037 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -40,7 +40,7 @@ def conv_bn_layer( act=None, bias_attr=bias_attr, ) - return fluid.layers.batch_norm(input=tmp, act=act) + return paddle.static.nn.batch_norm(input=tmp, act=act) def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: @@ -96,7 +96,7 @@ def conv_block(input, num_filter, groups, dropouts): drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) fc1 = fluid.layers.fc(input=drop, size=4096, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') + bn = paddle.static.nn.batch_norm(input=fc1, act='relu') drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) return fc2 diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index a89cb1617a12a..b96ff9940985b 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -51,7 +51,7 @@ def conv_net(img, label): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 05b3f3b093a65..8753d660beb16 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -182,7 +182,7 @@ def conv_bn_layer( ), bias_attr=False, ) - return fluid.layers.batch_norm(input=conv, act=act) + return paddle.static.nn.batch_norm(input=conv, act=act) def squeeze_excitation(self, input, num_channels, reduction_ratio): pool = fluid.layers.pool2d( diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 11275d0227488..ac418c2531904 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -58,7 +58,7 @@ def build_model(self): x = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False ) - x = paddle.fluid.layers.batch_norm(x, **self.attrs) + x = 
paddle.static.nn.batch_norm(x, **self.attrs) self.fetch_list = [x.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 29393ff96ca2b..abc96d262e04e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -39,7 +40,7 @@ def setUp(self): name="data", shape=[-1, 6, 32, 32], dtype="float32" ) act_out = self.append_act(data) - out = fluid.layers.batch_norm(act_out, is_test=True) + out = nn.batch_norm(act_out, is_test=True) self.feeds = { "data": np.random.random([1, 6, 32, 32]).astype("float32"), } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py index a794298130866..88743ef399740 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -67,7 +68,7 @@ def build(self): ) if self.dynamic_shape_params is not None: anchor = paddle.transpose(anchor, [2, 3, 0, 1]) - out = fluid.layers.batch_norm(anchor, is_test=True) + out = nn.batch_norm(anchor, is_test=True) self.fetch_list = [out, var] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py index ed7aa546b345d..df31be07eb8c8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_elementwise_op.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -34,7 +35,7 @@ def setUp(self): name="data2", shape=[-1, 3, 64, 1], dtype="float32" ) eltwise_out = self.append_eltwise(data1, data2) - out = fluid.layers.batch_norm(eltwise_out, is_test=True) + out = nn.batch_norm(eltwise_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([1, 3, 64, 1]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py index 4ed648ed9c806..eec26fefec2d1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -30,7 +31,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) flatten_out = self.append_flatten(data) - out = fluid.layers.batch_norm(flatten_out, is_test=True) + out = nn.batch_norm(flatten_out, is_test=True) self.feeds = { "data": 
np.random.random([1, 6, 64, 64]).astype("float32"), } @@ -59,7 +60,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) flatten_out = self.append_flatten(data) - out = fluid.layers.batch_norm(flatten_out, is_test=True) + out = nn.batch_norm(flatten_out, is_test=True) self.feeds = { "data": np.random.random([2, 6, 64, 64]).astype("float32"), } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py index b96eddb87e779..161a3142d5210 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -29,7 +30,7 @@ def setUp(self): data = fluid.data(name="data", shape=[-1, 3, 4], dtype="float32") index = fluid.data(name="index", shape=[-1, 2, 2], dtype="int32") gather_nd = paddle.gather_nd(data, index) - out = fluid.layers.batch_norm(gather_nd, is_test=True) + out = nn.batch_norm(gather_nd, is_test=True) self.feeds = { "data": np.random.random([2, 3, 4]).astype("float32"), @@ -66,7 +67,7 @@ def setUp(self): ) index = fluid.data(name="index", shape=[-1, 1028, 2], dtype="int32") gather_nd = paddle.gather_nd(data, index) - out = fluid.layers.batch_norm(gather_nd, is_test=True) + out = nn.batch_norm(gather_nd, is_test=True) index_data = np.zeros((1, 1028, 2), dtype='int32') self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py index 9c8e1ee04cc38..379c3872242f0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_inspector.py @@ -37,7 +37,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = paddle.static.nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data": np.ones([1, 16, 16]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py index 2901238ffe4a4..4d98c8cb3f382 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_instance_norm_op.py @@ -20,9 +20,9 @@ import numpy as np from inference_pass_test import InferencePassTest -import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -44,8 +44,8 @@ def build(self): with fluid.program_guard(self.main_program, self.startup_program): shape = [-1, self.channel, self.height, self.width] data = fluid.data(name='in', shape=shape, dtype='float32') - instance_norm_out = paddle.static.nn.instance_norm(data) - out = fluid.layers.batch_norm(instance_norm_out, is_test=True) + instance_norm_out = nn.instance_norm(data) + out = nn.batch_norm(instance_norm_out, is_test=True) shape[0] = self.bs self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 
038912fbe4cb1..0d10acae95c3f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -35,7 +36,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data": np.ones([24, 24]).astype("float32"), @@ -74,7 +75,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data": np.ones([1, 6, 24, 24]).astype("float32"), @@ -136,7 +137,7 @@ def setUp(self): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = nn.batch_norm(matmul_out, is_test=True) self.feeds = { "data_x": np.ones([2, 6, 24]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py index b8566840d2131..4e2b3e0ae2420 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py @@ -135,7 +135,7 @@ def network(): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = paddle.static.nn.batch_norm(matmul_out, is_test=True) fc_out = fluid.layers.fc( input=matmul_out, size=10, @@ -231,7 +231,7 @@ def network(): transpose_y=self.transpose_y, ) matmul_out = paddle.scale(matmul_out, scale=self.alpha) - out = fluid.layers.batch_norm(matmul_out, is_test=True) + out = paddle.static.nn.batch_norm(matmul_out, is_test=True) fc_out = fluid.layers.fc( input=matmul_out, size=10, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py index 00a980415e1a7..2f2908d5f3198 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper @@ -242,7 +243,7 @@ def build(self): [self.bs, 1, self.keep_top_k, 6], name='reshape', ) - out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + out = nn.batch_norm(multiclass_nms_out, is_test=True) boxes_data = ( np.arange(self.num_boxes * 4) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py index 68ec0c22703c0..b5f84dcc9f760 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py @@ -21,6 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -69,7 +70,7 @@ def build(self): [self.bs, 1, self.keep_top_k, 6], name='reshape', ) - out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + out = nn.batch_norm(multiclass_nms_out, is_test=True) boxes_data = ( np.arange(self.num_boxes * 4) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py index 505a7ccad3bc2..f335bd8f82399 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -44,7 +45,7 @@ def setUp(self): ] data = fluid.data(name='data', shape=shape, dtype='float32') resize_out = self.append_nearest_interp(data) - out = fluid.layers.batch_norm(resize_out, is_test=True) + out = nn.batch_norm(resize_out, is_test=True) if self.data_layout == 'NCHW': shape = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py index a4c7dba0e6eba..056e5b6e29212 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core import paddle.nn.functional as F +import paddle.static.nn as nn from paddle import fluid from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -44,7 +45,7 @@ def setUp(self): ] data = fluid.data(name='data', shape=shape, dtype='float32') resize_out = self.append_nearest_interp(data) - out = fluid.layers.batch_norm(resize_out, is_test=True) + out = nn.batch_norm(resize_out, is_test=True) if self.data_layout == 'NCHW': shape = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py index f0cf6ead9d380..4b7dc7c9cb689 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig @@ -32,7 +33,7 @@ def setUp(self): pad_out = paddle.nn.functional.pad( x=data, pad=[0, 0, 0, 0, 0, 1, 1, 2], value=0.0 ) - out = fluid.layers.batch_norm(pad_out, is_test=True) + out = nn.batch_norm(pad_out, is_test=True) self.feeds = { "data": np.random.random((1, 3, 128, 128)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py index 0362d96fc2a91..f8abf50dd10ff 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py @@ -80,7 +80,7 @@ def build_network(self): ceil_mode=self.ceil_mode, exclusive=self.exclusive, ) - # out = 
fluid.layers.batch_norm(pool_out, is_test=True) + # out = paddle.static.nn.batch_norm(pool_out, is_test=True) self.fetch_list = [pool_out] def check_output(self): @@ -198,7 +198,7 @@ def build_network(self): pool_out = paddle.nn.functional.adaptive_avg_pool3d( x=data, output_size=[3, 3, 3] ) - # out = fluid.layers.batch_norm(pool_out, is_test=True) + # out = paddle.static.nn.batch_norm(pool_out, is_test=True) self.fetch_list = [pool_out] def check_output(self): @@ -298,7 +298,7 @@ def build_network(self): pool_out = paddle.nn.functional.adaptive_max_pool3d( x=data, output_size=[3, 3, 3] ) - # out = fluid.layers.batch_norm(pool_out, is_test=True) + # out = paddle.static.nn.batch_norm(pool_out, is_test=True) self.fetch_list = [pool_out] def check_output(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py index b8f3ced692134..c916109803630 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -72,7 +73,7 @@ def build_network(self): ceil_mode=self.ceil_mode, exclusive=self.exclusive, ) - out = fluid.layers.batch_norm(pool_out, is_test=True) + out = nn.batch_norm(pool_out, is_test=True) self.fetch_list = [out] def check_output(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py index 79aa8cf14fa24..cd66cb1e914b8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -30,7 +31,7 @@ def setUp(self): name="data", shape=[-1, 3, 10, 192], dtype="float32" ) reduce_sum = paddle.sum(data, axis=[2, -1], keepdim=True) - out = fluid.layers.batch_norm(reduce_sum, is_test=True) + out = nn.batch_norm(reduce_sum, is_test=True) self.feeds = { "data": np.random.random([3, 3, 10, 192]).astype("float32"), @@ -63,7 +64,7 @@ def setUp(self): name="data", shape=[-1, 3, 10, 192], dtype="float32" ) reduce_sum = paddle.sum(data, keepdim=True) - out = fluid.layers.batch_norm(reduce_sum, is_test=True) + out = nn.batch_norm(reduce_sum, is_test=True) self.feeds = { "data": np.random.random([3, 3, 10, 192]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py index ffbe80387719a..8edd7cafcbe4d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -39,7 +40,7 @@ def setUp(self): name='data', shape=self.data_shape, dtype='float32' ) reshape_out = self.append_reshape(data, self.reshape) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = nn.batch_norm(reshape_out, is_test=True) self.feeds = { 
'data': np.random.random(self.data_shape).astype('float32'), } @@ -77,7 +78,7 @@ def setUp(self): name='data', shape=self.data_shape, dtype='float32' ) reshape_out = self.append_reshape(data, self.reshape) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = nn.batch_norm(reshape_out, is_test=True) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32'), } @@ -104,7 +105,7 @@ def setUp(self): name='data', shape=self.data_shape, dtype='float32' ) reshape_out = paddle.reshape(x=data, shape=self.reshape) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = nn.batch_norm(reshape_out, is_test=True) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32') } @@ -130,7 +131,7 @@ def setUp(self): data = fluid.data( name='data', shape=self.data_shape, dtype='float32' ) - bn_out = fluid.layers.batch_norm(data, is_test=True) + bn_out = nn.batch_norm(data, is_test=True) out = self.append_reshape(bn_out, self.reshape) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32'), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py index 0ffabd0178141..3bca0dbf18482 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -28,7 +29,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[-1, 512], dtype="float32") scale_out = self.append_scale(data) - out = fluid.layers.batch_norm(scale_out, is_test=True) + out = nn.batch_norm(scale_out, is_test=True) self.feeds = { "data": np.random.random([1, 512]).astype("float32"), @@ -60,7 +61,7 @@ def setUp(self): name="data", shape=[-1, 512, 512], dtype="float32" ) scale_out = self.append_scale(data) - out = fluid.layers.batch_norm(scale_out, is_test=True) + out = nn.batch_norm(scale_out, is_test=True) self.feeds = { "data": np.random.random([1, 512, 512]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py index ad0f2a66489c8..fc3b066556d6e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -31,8 +32,7 @@ def setUp(self): reshape1 = paddle.reshape(x=data, shape=[-1, 2, 3, 64, 64]) trans = paddle.transpose(x=reshape1, perm=[0, 2, 1, 3, 4]) reshape2 = paddle.reshape(x=trans, shape=[-1, 6, 64, 64]) - - out = fluid.layers.batch_norm(reshape2, is_test=True) + out = nn.batch_norm(reshape2, is_test=True) self.feeds = { "data": np.random.random([1, 6, 64, 64]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py index b8b0e6a55033a..355c0c9a00e65 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig @@ -45,7 +46,7 @@ def setUp(self): starts = self.params_starts ends = self.params_ends slice_out = paddle.slice(data, axes=axes, starts=starts, ends=ends) - out = fluid.layers.batch_norm(slice_out, is_test=True) + out = nn.batch_norm(slice_out, is_test=True) self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("float32"), @@ -115,7 +116,7 @@ def setUp(self): ends = self.params_ends slice_out = paddle.slice(data, axes=axes, starts=starts, ends=ends) cast_out = fluid.layers.cast(slice_out, 'float32') - out = fluid.layers.batch_norm(cast_out, is_test=True) + out = nn.batch_norm(cast_out, is_test=True) self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("int32"), @@ -140,7 +141,7 @@ def setUp(self): ends = self.params_ends slice_out = paddle.slice(data, axes=axes, starts=starts, ends=ends) cast_out = fluid.layers.cast(slice_out, 'float32') - out = fluid.layers.batch_norm(cast_out, is_test=True) + out = nn.batch_norm(cast_out, is_test=True) self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("int32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index a0f034462f3ba..c864cc91c3c33 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +import paddle.static.nn as nn from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -62,7 +63,7 @@ def setUp(self): name="data2", shape=[-1, 3, 64, 64], dtype="float32" ) concat_out = fluid.layers.concat([data1, data2], axis=2) - out = fluid.layers.batch_norm(concat_out, is_test=True) + out = nn.batch_norm(concat_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([1, 3, 64, 64]).astype("float32"), @@ -89,7 +90,7 @@ def setUp(self): name="data", shape=[-1, 3, 64, 64], dtype="float32" ) split_out = fluid.layers.split(data, dim=-1, num_or_sections=2) - out = fluid.layers.batch_norm(split_out[0], is_test=True) + out = nn.batch_norm(split_out[0], is_test=True) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } @@ -115,7 +116,7 @@ def setUp(self): name="data", shape=[-1, 3, 64, 64], dtype="float32" ) split_out = fluid.layers.split(data, dim=-1, num_or_sections=2) - out = fluid.layers.batch_norm(split_out[0], is_test=True) + out = nn.batch_norm(split_out[0], is_test=True) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } @@ -143,7 +144,7 @@ def setUp(self): name="data", shape=[-1, 3, 64, 64], dtype="float32" ) split_out = fluid.layers.split(data, dim=-1, num_or_sections=2) - out = fluid.layers.batch_norm(split_out[0], is_test=True) + out = nn.batch_norm(split_out[0], is_test=True) self.feeds = { "data": np.random.random([1, 3, 64, 64]).astype("float32"), } @@ -216,7 +217,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) transpose_out = self.append_transpose(data) - out = fluid.layers.batch_norm(transpose_out, is_test=True) + out = nn.batch_norm(transpose_out, is_test=True) self.feeds = { "data": np.random.random([1, 6, 64, 
64]).astype("float32"), } @@ -366,7 +367,7 @@ def setUp(self): name="data2", shape=[-1, 3, 64, 64], dtype="float32" ) eltwise_out = self.append_eltwise(data1, data2) - out = fluid.layers.batch_norm(eltwise_out, is_test=True) + out = nn.batch_norm(eltwise_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([1, 3, 64, 64]).astype("float32"), @@ -419,7 +420,7 @@ def setUp(self): ) data2 = fluid.data(name="data2", shape=[64, 64], dtype="float32") eltwise_out = self.append_eltwise(data1, data2) - out = fluid.layers.batch_norm(eltwise_out, is_test=True) + out = nn.batch_norm(eltwise_out, is_test=True) self.feeds = { "data1": np.random.random([1, 3, 64, 64]).astype("float32"), "data2": np.random.random([64, 64]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py index 45fa629f6cd30..9557f8c71c904 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_tile_op.py @@ -30,7 +30,7 @@ def setUp(self): name="data", shape=[4, 3, 224, 256], dtype="float32" ) tile_out = paddle.tile(x=data, repeat_times=[1, 1, 1, 1]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([4, 3, 224, 256]).astype("float32"), @@ -55,7 +55,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32") tile_out = paddle.tile(x=data, repeat_times=[1, 4, 1080, 1920]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([1, 1, 1, 1]).astype("float32"), @@ -80,7 +80,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32") tile_out = paddle.tile(x=data, repeat_times=[1, 4, 1080, 1920]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([1, 1, 1, 1]).astype("float32"), @@ -105,7 +105,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[1, 1, 1, 1], dtype="float32") tile_out = paddle.tile(x=data, repeat_times=[1, 4, 1080, 1920]) - out = fluid.layers.batch_norm(tile_out, is_test=True) + out = paddle.static.nn.batch_norm(tile_out, is_test=True) self.feeds = { "data": np.random.random([1, 1, 1, 1]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py index 192274ef34106..ff464a0e1e058 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py @@ -42,7 +42,7 @@ def setUp(self): # There is no parameters for above structure. # Hence, append a batch_norm to avoid failure caused by load_combined. 
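# batch_norm adds persistable scale/bias/moving-mean/variance parameters,
# so the combined parameter file is non-empty when the model is reloaded.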
reshape_out = paddle.reshape(concat_out, [-1, 0, 1, 1]) - out = fluid.layers.batch_norm(reshape_out, is_test=True) + out = paddle.static.nn.batch_norm(reshape_out, is_test=True) self.feeds = { "data1": np.random.random([8, 32, 128]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py index a528731001711..6412c4b5f5a93 100644 --- a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py @@ -86,7 +86,7 @@ def get_model( ) if self.bn_dtype == np.float16: conv = fluid.layers.cast(conv, 'float16') - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( conv, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'), diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py index 53b78e18f8861..29be16759e9c2 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -742,12 +742,12 @@ def test_errors(self): x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() ) - self.assertRaises(TypeError, fluid.layers.batch_norm, x1) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x1) # the input dtype of batch_norm must be float16 or float32 or float64 # float16 only can be set on GPU place x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") - self.assertRaises(TypeError, fluid.layers.batch_norm, x2) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x2) class TestDygraphBatchNormAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py index 2fd353af1a2dc..49d4f92bdf983 100644 --- a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py @@ -88,7 +88,7 @@ def get_model( bias_attr=False, use_cudnn=use_cudnn, ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( conv, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'), diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index 146cd58bcf438..15b47d427395c 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -80,7 +80,7 @@ def conv_bn_layer( return ( conv if remove_bn - else fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + else paddle.static.nn.batch_norm(input=conv, act=act, momentum=0.1) ) diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py index 4f90fe3cc0966..9d124ee509200 100644 --- a/python/paddle/fluid/tests/unittests/simple_nets.py +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -53,7 +53,7 @@ def batchnorm_fc_with_inputs(img, label, class_num=10): ), ) - hidden = fluid.layers.batch_norm(input=hidden) + hidden = paddle.static.nn.batch_norm(input=hidden) prediction = fluid.layers.fc(hidden, size=class_num, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py 
b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 26eb0a628ab9a..54e74ade09aef 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -46,7 +46,7 @@ def convolutional_neural_network(use_py_reader): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 34b358130219d..079628658addb 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -759,12 +759,12 @@ def test_errors(self): x1 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() ) - self.assertRaises(TypeError, fluid.layers.batch_norm, x1) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x1) # the input dtype of batch_norm must be float16 or float32 or float64 # float16 only can be set on GPU place x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") - self.assertRaises(TypeError, fluid.layers.batch_norm, x2) + self.assertRaises(TypeError, paddle.static.nn.batch_norm, x2) class TestDygraphBatchNormAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 978298f8f859d..028954d22ffdc 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -34,7 +34,7 @@ def conv_net(self, img, label): pool_type='max', act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index a04e845db0af4..9a7a907321089 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -41,7 +41,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): name='batch_norm_b', initializer=fluid.initializer.Constant(value=0.0), ) - hidden2 = fluid.layers.batch_norm( + hidden2 = paddle.static.nn.batch_norm( input=hidden1, param_attr=param_attr, bias_attr=bias_attr, @@ -49,7 +49,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): data_layout='NHWC', ) hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu') - hidden4 = fluid.layers.batch_norm( + hidden4 = paddle.static.nn.batch_norm( input=hidden3, act='relu', data_layout='NHWC' ) prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index 68d11d0897279..1b83dfa2b010d 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -87,7 +87,7 @@ def build_fused_program( bias_attr=False, data_format='NHWC', ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( input=conv1_1, 
param_attr=self.bn_param_attr1, bias_attr=self.bn_bias_attr1, @@ -133,7 +133,7 @@ def build_origin_program( bias_attr=False, data_format='NHWC', ) - bn1 = fluid.layers.batch_norm( + bn1 = paddle.static.nn.batch_norm( input=conv1_1, param_attr=self.bn_param_attr1, bias_attr=self.bn_bias_attr1, @@ -150,7 +150,7 @@ def build_origin_program( bias_attr=False, data_format='NHWC', ) - bn2 = fluid.layers.batch_norm( + bn2 = paddle.static.nn.batch_norm( input=conv1_1, param_attr=self.bn_param_attr2, bias_attr=self.bn_bias_attr2, diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index a136a623d0331..025e12c02c611 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -23,7 +23,7 @@ def norm(*args, **kargs): - return fluid.layers.batch_norm(*args, **kargs) + return paddle.static.nn.batch_norm(*args, **kargs) def sep_conv(input, channel, stride, filter, dilation=1, act=None): diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py index 7a61eaaa04937..2d39fb4ab70c4 100644 --- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py +++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py @@ -42,9 +42,9 @@ def test_batch_norm_layer(self): images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32' ) - hidden1 = fluid.layers.batch_norm(input=images) + hidden1 = paddle.static.nn.batch_norm(input=images) hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu') - fluid.layers.batch_norm(input=hidden2) + paddle.static.nn.batch_norm(input=hidden2) print(str(main_program)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 573c1699acd9e..05c7542792cc6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -55,8 +55,8 @@ def testLoadStaticModel(self): batchnorm_in = fluid.data( name="batchnorm_in", shape=[None, 10], dtype='float32' ) - batchnorm_out_1 = fluid.layers.batch_norm(batchnorm_in) - batchnorm_out_2 = fluid.layers.batch_norm(batchnorm_in) + batchnorm_out_1 = paddle.static.nn.batch_norm(batchnorm_in) + batchnorm_out_2 = paddle.static.nn.batch_norm(batchnorm_in) emb_in = fluid.data(name='emb_in', shape=[None, 10], dtype='int64') emb_out_1 = fluid.embedding(emb_in, [1000, 100]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index d129a9270ab5d..ee2cc13d6a8c0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -33,7 +33,7 @@ def convolutional_neural_network(img): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index fd9d7a26b1abc..299d3218cfdba 100644 --- 
a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -56,7 +56,7 @@ def build_program( stop_gradient=False, ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( data, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'), diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index b7a0ab0d45042..3c5f2edc4f53b 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -38,7 +38,7 @@ def fc_with_batchnorm(use_feed): ), ) - hidden = fluid.layers.batch_norm(input=hidden) + hidden = paddle.static.nn.batch_norm(input=hidden) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = paddle.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 043321bf566cc..99fb5fac4ae6b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2863,7 +2863,7 @@ def make_batch_norm(self): data = self._get_data( name='data', shape=[32, 128, 128], dtype="float32" ) - out = layers.batch_norm(data) + out = paddle.static.nn.batch_norm(data) return out def make_batch_norm_momentum_variable(self): @@ -2879,7 +2879,7 @@ def make_batch_norm_momentum_variable(self): dtype='float32', append_batch_size=False, ) - out = layers.batch_norm(data, momentum=momentum) + out = paddle.static.nn.batch_norm(data, momentum=momentum) return out def make_range(self): diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index 98518f52f669a..db4af74fc35bb 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -33,7 +33,7 @@ def convolutional_neural_network(img): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py index 9d42b68e144ba..e024917a30682 100644 --- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py +++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py @@ -45,7 +45,7 @@ def conv_net(use_feed): pool_stride=2, act="relu", ) - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) conv_pool_1 = fluid.layers.cast(conv_pool_1, np.float32) conv_pool_2 = fluid.nets.simple_img_conv_pool( diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index bbcb5ef7b9b85..72efd20c6d116 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -171,7 +171,7 @@ def func(self, place): eps = 0.005 atol = 1e-4 x = paddle.create_parameter(dtype=dtype, shape=self.shape, name='x') - z = fluid.layers.batch_norm( + z = paddle.static.nn.batch_norm( 
input=x, data_layout=self.data_layout, use_global_stats=self.use_global_stats, @@ -251,7 +251,7 @@ def func(self, place): self.shape[1] if self.data_layout == 'NCHW' else self.shape[-1] ) x = paddle.create_parameter(dtype=dtype, shape=self.shape, name='x') - z = fluid.layers.batch_norm( + z = paddle.static.nn.batch_norm( input=x, data_layout=self.data_layout, use_global_stats=self.use_global_stats, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index f00595b1145e7..f8f65b63b8003 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -26,10 +26,10 @@ def Lenet(data, class_dim): conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None) - bn1 = fluid.layers.batch_norm(conv1, act='relu') + bn1 = paddle.static.nn.batch_norm(conv1, act='relu') pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None) - bn2 = fluid.layers.batch_norm(conv2, act='relu') + bn2 = paddle.static.nn.batch_norm(conv2, act='relu') pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) fc1 = fluid.layers.fc(pool2, size=50, act='relu') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 4dc0020b91fd4..e86a09e898ec1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -58,7 +58,7 @@ def fc_with_batchnorm(use_feed): ), ) - hidden = fluid.layers.batch_norm(input=hidden) + hidden = paddle.static.nn.batch_norm(input=hidden) with fluid.name_scope("fc_layer"): prediction = fluid.layers.fc(hidden, size=10, act='softmax') with fluid.name_scope("loss"): diff --git a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py index 0a43e57e903eb..da02e4621d0c2 100644 --- a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py +++ b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
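The `make_batch_norm_momentum_variable` hunk above exercises `momentum` in its Tensor form; the new API accepts either a Python float or a shape-`[1]` float32 Tensor. A minimal sketch of the Tensor form, with illustrative names and shapes, assuming static mode is enabled:

.. code-block:: python

    import paddle

    paddle.enable_static()
    data = paddle.static.data(name="data", shape=[-1, 32, 128, 128], dtype="float32")
    # momentum fed as a shape-[1] tensor instead of a float constant
    momentum = paddle.static.data(name="momentum", shape=[1], dtype="float32")
    out = paddle.static.nn.batch_norm(data, momentum=momentum)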
+ import unittest +import paddle import paddle.fluid as fluid @@ -28,7 +30,7 @@ def test_set_bool_attr(self): name='batch_norm_b', initializer=fluid.initializer.Constant(value=0.0), ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( input=x, param_attr=param_attr, bias_attr=bias_attr ) block = fluid.default_main_program().desc.block(0)
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index ef42ab8a52259..10755fb729b2d 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -81,7 +81,7 @@ def _build_program( bias_attr=False, use_cudnn=use_cudnn, ) - bn = fluid.layers.batch_norm( + bn = paddle.static.nn.batch_norm( conv, param_attr=fluid.ParamAttr(name='bn_scale'), bias_attr=fluid.ParamAttr(name='bn_bias'),
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 983138ce976c2..e638ca5531721 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -74,11 +74,13 @@ from ..fluid.contrib.layers import ctr_metric_bundle # noqa: F401 from ..fluid.layers import exponential_decay # noqa: F401 +from .nn.common import batch_norm # noqa: F401 from paddle.static.nn.metric import auc # noqa: F401 from paddle.static.nn.metric import accuracy # noqa: F401 __all__ = [ # noqa 'append_backward', + 'batch_norm', 'gradients', 'Executor', 'global_scope',
diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 9635811f6a818..cae4b52fe4c59 100755 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from .common import fc # noqa: F401 +from .common import batch_norm # noqa: F401 from .common import instance_norm # noqa: F401 from .common import data_norm # noqa: F401 from .common import group_norm # noqa: F401 @@ -22,9 +23,7 @@ from .common import conv3d_transpose # noqa: F401 from .common import bilinear_tensor_product # noqa: F401 from .common import py_func # noqa: F401 - from ...tensor.creation import create_parameter # noqa: F401 -from ...fluid.layers import batch_norm # noqa: F401 from ...fluid.layers import case # noqa: F401 from ...fluid.layers import cond # noqa: F401 from ...fluid.layers import conv2d # noqa: F401
diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 420a00ddbdc51..f74e6aa605a36 100755 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -2160,6 +2160,328 @@ def bilinear_tensor_product( return helper.append_activation(out) +def batch_norm( + input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, +): + r""" + + **Batch Normalization Layer** + + Can be used as a normalizer function for convolution or fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_ for more details. + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &// \text{mini-batch mean} \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m} (x_i - \mu_{\beta})^2 \qquad &// \text{mini-batch variance} \\ + \hat{x_i} &\gets \frac{x_i - \mu_{\beta}}{\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &// \text{normalize} \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &// \text{scale and shift} + + moving\_mean = moving\_mean * momentum + mini\text{-}batch\_mean * (1. - momentum) \\ + moving\_var = moving\_var * momentum + mini\text{-}batch\_var * (1. - momentum) + + + moving_mean is the global (running) mean and moving_var is the global (running) variance. + + When use_global_stats = True, :math:`\mu_{\beta}` and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch; they are global (running) statistics, usually obtained from a pre-trained model. Training and testing (or inference) then share the same behavior: + + .. math:: + + \hat{x_i} &\gets \frac{x_i - \mu_{\beta}}{\sqrt{\sigma_{\beta}^{2} + \epsilon}} \\ + y_i &\gets \gamma \hat{x_i} + \beta + + Note: + If build_strategy.sync_batch_norm=True, the batch_norm in the network will use sync_batch_norm automatically. + `is_test = True` can only be used in test and inference programs; `is_test` CANNOT be set to True in a train program. If you want to use the global statistics of a pre-trained model in a train program, set `use_global_stats = True` instead. + + Args: + input(Tensor): The rank of the input Tensor can be 2, 3, 4 or 5. The data type is float16, float32 or float64. + act(string, Default None): Activation type, linear|relu|prelu|... + is_test (bool, Default False): A flag indicating whether it is in the test phase or not. + momentum(float|Tensor, Default 0.9): The value used for the moving_mean and moving_var computation. This should be a float number or a Tensor with shape [1] and data type float32. The update formulas are: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + epsilon(float, Default 1e-05): A value added to the denominator for numerical stability. Default is 1e-5. + param_attr(ParamAttr|None): The parameter attribute for the Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as param_attr, and the name of the scale can be set in ParamAttr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr, and the name of the bias can be set in ParamAttr. If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_layout (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Usually name does not need to be set and is None by default.
+ moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it + is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm + will save global mean with the string. + moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance. + If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm + will save global variance with the string. + do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model + average when model average is enabled. + use_global_stats(bool, Default False): Whether to use global mean and + variance. In inference or test mode, set use_global_stats to true + or is_test to true, and the behavior is equivalent. + In train mode, when setting use_global_stats True, the global mean + and variance are also used during train period. + + Returns: + A Tensor which is the result after applying batch normalization on the input, + has same shape and data type with input. + + Examples: + + .. code-block:: python + + import paddle + + paddle.enable_static() + x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32') + hidden1 = paddle.static.nn.fc(x=x, size=200) + print(hidden1.shape) + # [3, 200] + hidden2 = paddle.static.nn.batch_norm(input=hidden1) + print(hidden2.shape) + # [3, 200] + """ + assert ( + bias_attr is not False + ), "bias_attr should not be False in batch_norm." + helper = LayerHelper('batch_norm', **locals()) + + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'batch_norm' + ) + dtype = helper.input_dtype() + + # use fp32 for bn parameter + if dtype == core.VarDesc.VarType.FP16: + dtype = core.VarDesc.VarType.FP32 + + input_shape = input.shape + if data_layout == 'NCHW': + channel_num = input_shape[1] + else: + if data_layout == 'NHWC': + channel_num = input_shape[-1] + else: + raise ValueError("unsupported data layout:" + data_layout) + + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=paddle.fluid.initializer.Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) + + mean = helper.create_parameter( + attr=paddle.ParamAttr( + name=moving_mean_name, + initializer=paddle.fluid.initializer.Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) + mean.stop_gradient = True + + variance = helper.create_parameter( + attr=paddle.ParamAttr( + name=moving_variance_name, + initializer=paddle.fluid.initializer.Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) + variance.stop_gradient = True + + # create output + # mean and mean_out share the same memory + mean_out = mean + # variance and variance_out share the same memory + variance_out = variance + + if _non_static_mode(): + inputs_has_MomemtumTensor = False + attrs_has_momentum = False + tmp_tensor_type = core.eager.Tensor + if isinstance(momentum, tmp_tensor_type): + inputs_has_MomemtumTensor = True + else: + attrs_has_momentum = True + + attrs_ = () + if attrs_has_momentum: + attrs_ = ( + 'momentum', + momentum, + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + 
False, + 'use_global_stats', + use_global_stats, + ) + else: + attrs_ = ( + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + ) + if inputs_has_MomemtumTensor: + batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.batch_norm( + input, + scale, + bias, + mean, + variance, + momentum, + mean_out, + variance_out, + *attrs_, + ) + else: + batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.batch_norm( + input, + scale, + bias, + mean, + variance, + None, + mean_out, + variance_out, + *attrs_, + ) + + return paddle.fluid.dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=act, use_mkldnn=False + ) + + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + reserve_space = None + if not is_test: + reserve_space = helper.create_variable_for_type_inference( + dtype=helper.input_dtype(), stop_gradient=True + ) + + batch_norm_out = ( + input if in_place else helper.create_variable_for_type_inference(dtype) + ) + + inputs = { + "X": input, + "Scale": scale, + "Bias": bias, + "Mean": mean, + "Variance": variance, + "MeanOut": mean_out, + "VarianceOut": variance_out, + } + attrs = { + "epsilon": epsilon, + "is_test": is_test, + "data_layout": data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + "use_global_stats": use_global_stats, + } + if isinstance(momentum, paddle.static.Variable): + inputs['MomemtumTensor'] = momentum + else: + attrs['momentum'] = momentum + + outputs = { + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + } + if reserve_space is not None: + outputs["ReserveSpace"] = reserve_space + + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) + + return helper.append_activation(batch_norm_out) + + @static_only def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" From c2f15f056610d9451bb516ba06e77b3aee40675b Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Wed, 7 Dec 2022 15:29:02 +0800 Subject: [PATCH 28/60] [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv (#48654) * [remove fluid] PRelu BilinearTensorProduct * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv * [remove fluid] PRelu BilinearTensorProduct Conv2DTranspose SequenceConv RowConv --- python/paddle/fluid/dygraph/nn.py | 634 ------------------ .../test_basic_api_transformation.py | 30 +- .../dygraph_to_static/test_cycle_gan.py | 9 +- .../test_bilinear_tensor_product_op.py | 4 +- .../unittests/test_conv2d_transpose_op.py | 81 --- .../unittests/test_dataloader_early_reset.py | 6 +- .../test_imperative_load_static_param.py | 5 +- ...perative_star_gan_with_gradient_penalty.py | 8 +- .../fluid/tests/unittests/test_layers.py | 327 ++------- .../test_multiprocess_dataloader_dynamic.py | 1 - 
.../unittests/xpu/test_activation_op_xpu.py | 4 +- .../tests/unittests/xpu/test_unfold_op_xpu.py | 10 +- python/paddle/static/nn/metric.py | 12 +- 13 files changed, 87 insertions(+), 1044 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 4dfb67ab4aa42..2fa3945987dd1 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -53,9 +53,6 @@ 'Linear', 'BatchNorm', 'Embedding', - 'PRelu', - 'BilinearTensorProduct', - 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', 'SpectralNorm', @@ -1128,637 +1125,6 @@ def forward(self, input): return out -class PRelu(layers.Layer): - r""" - This interface is used to construct a callable object of the ``PRelu`` class. - For more details, refer to code examples. - It implements three activation methods of the ``PRelu`` activation function. - - Equation: - - .. math:: - y = \max(0, x) + \\alpha * \min(0, x) - - Parameters: - mode (str): The mode for weight sharing. It supports all, channel - and element. all: all elements share same weight - channel:elements in a channel share same weight - element:each element has a weight - channel (int, optional): The number of channels. - This argument is required when mode is "channel". - Default: None. - input_shape (list or tuple, optional): The shape of input. - This argument is required when mode is "element". - Default: None. - param_attr(ParamAttr, optional): The parameter attribute for the learnable - weight (alpha). Default: None. - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - Returns: - None - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - import numpy as np - - inp_np = np.ones([5, 200, 100, 100]).astype('float32') - with fluid.dygraph.guard(): - inp_np = to_variable(inp_np) - prelu0 = fluid.PRelu( - mode='all', - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0))) - dy_rlt0 = prelu0(inp_np) - prelu1 = fluid.PRelu( - mode='channel', - channel=200, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0))) - dy_rlt1 = prelu1(inp_np) - prelu2 = fluid.PRelu( - mode='element', - input_shape=inp_np.shape, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0))) - dy_rlt2 = prelu2(inp_np) - - """ - - def __init__( - self, - mode, - channel=None, - input_shape=None, - param_attr=None, - dtype='float32', - ): - # need specify name_scope since snake-cased 'PRelu' is 'p_relu' - super().__init__(name_scope='prelu') - self._mode = mode - self._param_attr = param_attr - self._dtype = dtype - if mode == 'all': - self._alpha_shape = [1] - elif mode == 'channel': - assert isinstance( - channel, int - ), "channel argument is required when mode is 'channel'." - # NOTE(zhiqiu): The _alpha_shape should be [1, channel] + [1] * len(input_shape[2:]), not [1, channel, 1, 1]. - # However, the suffix 1 in the list is useless, since the tensor is viewed as one demension array during kernel calculation. - # And, input_shape is not required when mode is 'channel', so it is simplified. - # NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version. - self._alpha_shape = [1, channel, 1, 1] - elif mode == 'element': - assert isinstance( - input_shape, (list, tuple) - ), "input_shape argument is required when mode is 'element'." 
- self._alpha_shape = [1] + list(input_shape)[1:] - else: - raise ValueError('mode should be one of all, channel, element.') - self.weight = self.create_parameter( - attr=self._param_attr, - shape=self._alpha_shape, - dtype='float32', - is_bias=False, - default_initializer=Constant(1.0), - ) - - def forward(self, input): - if in_dygraph_mode(): - return _C_ops.prelu(input, self.weight, "NCHW", self._mode) - - check_variable_and_dtype(input, 'input', ['float32'], 'PRelu') - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="prelu", - inputs={"X": input, 'Alpha': self.weight}, - attrs={"mode": self._mode}, - outputs={"Out": out}, - ) - return out - - -class BilinearTensorProduct(layers.Layer): - r""" - - **Add Bilinear Tensor Product Layer** - - This layer performs bilinear tensor product on two inputs. - For example: - - .. math:: - out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 - - In this formula: - - :math:`x`: the first input contains M elements, shape is [batch_size, M]. - - :math:`y`: the second input contains N elements, shape is [batch_size, N]. - - :math:`W_{i}`: the i-th learned weight, shape is [M, N] - - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size]. - - :math:`y^\mathrm{T}`: the transpose of :math:`y`. - - Parameters: - input1_dim (int): The dimension of each first input. - input2_dim (int): The dimension of each second input. - output_dim (int): The dimension of output of this layer. - name (str, optional): The default value is None. Normally there is no need for user - to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. - act (str, optional): Activation to be applied to the output of this layer. The default value is None. - param_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of - this layer. The default value is None. - bias_attr (ParamAttr, optional): The parameter attribute for the bias - of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. The default value is None. - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - **bias** (Parameter): the learnable bias of this layer. - - Returns: - Tensor: A 2-D Tensor of shape [batch_size, size]. - - Examples: - .. 
code-block:: python - - import paddle - import numpy - - layer1 = numpy.random.random((5, 5)).astype('float32') - layer2 = numpy.random.random((5, 4)).astype('float32') - bilinearTensorProduct = paddle.nn.BilinearTensorProduct( - input1_dim=5, input2_dim=4, output_dim=1000) - ret = bilinearTensorProduct(paddle.to_tensor(layer1), - paddle.to_tensor(layer2)) - - """ - - def __init__( - self, - input1_dim, - input2_dim, - output_dim, - name=None, - act=None, - param_attr=None, - bias_attr=None, - dtype='float32', - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self._name = name - self._input1_dim = input1_dim - self._input2_dim = input2_dim - self._output_dim = output_dim - self._inputs = dict() - self._dtype = dtype - - param_shape = [self._output_dim, self._input1_dim, self._input2_dim] - self.weight = self.create_parameter( - attr=self._param_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=False, - ) - bias_size = [1, self._output_dim] - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=bias_size, - dtype=self._dtype, - is_bias=True, - ) - - @deprecated( - since="2.0.0", - update_to="paddle.nn.Bilinear", - reason="New name and new args in Bilinear, easier to use.", - ) - def forward(self, x, y): - check_variable_and_dtype( - x, 'x', ['float32', 'float64'], 'BilinearTensorProduct' - ) - check_variable_and_dtype( - y, 'y', ['float32', 'float64'], 'BilinearTensorProduct' - ) - self._inputs = {"X": x, "Y": y, "Weight": self.weight} - if self.bias is not None: - self._inputs["Bias"] = self.bias - if self._name is not None: - out = self._helper.create_variable( - name=".".join([self.full_name(), self._name]), - dtype=self._dtype, - persistable=False, - ) - else: - out = self._helper.create_variable( - dtype=self._dtype, persistable=False - ) - self._helper.append_op( - type="bilinear_tensor_product", - inputs=self._inputs, - outputs={"Out": out}, - ) - - # add activation - return self._helper.append_activation(out, act=self._act) - - -class Conv2DTranspose(layers.Layer): - r""" - This interface is used to construct a callable object of the ``Conv2DTranspose`` class. - For more details, refer to code examples. - The convolution2D transpose layer calculates the output based on the input, - filter, and dilations, strides, paddings. Input and output - are in NCHW format. Where N is batch size, C is the number of feature map, - H is the height of the feature map, and W is the width of the feature map. - Filter's shape is [MCHW] , where M is the number of input feature map, - C is the number of output feature map, H is the height of the filter, - and W is the width of the filter. If the groups is greater than 1, - C will equal the number of input feature map divided by the groups. - If bias attribution and activation type are provided, bias is added to - the output of the convolution, and the corresponding activation function - is applied to the final result. - The details of convolution transpose layer, please refer to the following explanation and references - `conv2dtranspose `_ . - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - Where: - - * :math:`X`: Input value, a ``Tensor`` with NCHW format. - * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. - * :math:`\\sigma`: Activation function. 
- * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) - - Parameters: - num_channels(int): The number of channels in the input image. - num_filters(int): The number of the filter. It is as same as the output - feature map. - filter_size(int or tuple): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - output_size(int or tuple, optional): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). None if use - filter_size, padding, and stride to calculate output_size. - if output_size and filter_size are specified at the same time, They - should follow the formula above. Default: None. - padding(int or tuple, optional): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: 0. - stride(int or tuple, optional): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: 1. - dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). Otherwise, the - dilation_H = dilation_W = dilation. Default: 1. - groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by - grouped convolution in Alex Krizhevsky's Deep CNN paper, in which - when group=2, the first half of the filters is only connected to the - first half of the input channels, while the second half of the - filters is only connected to the second half of the input channels. - Default: 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) - of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d_transpose. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d_transpose - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of filters of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - with fluid.dygraph.guard(): - data = np.random.random((3, 32, 32, 5)).astype('float32') - conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose( - num_channels=32, num_filters=2, filter_size=3) - ret = conv2DTranspose(fluid.dygraph.base.to_variable(data)) - - """ - - def __init__( - self, - num_channels, - num_filters, - filter_size, - output_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - dtype='float32', - ): - super().__init__() - assert ( - param_attr is not False - ), "param_attr should not be False in conv2d_transpose." - self._param_attr = param_attr - self._bias_attr = bias_attr - self._act = act - self._groups = groups - self._num_channels = num_channels - self._num_filters = num_filters - self._use_cudnn = use_cudnn - self._padding = padding - self._stride = stride - self._dilation = dilation - self._filter_size = filter_size - self._output_size = output_size - self._dtype = dtype - - if ( - self._num_channels == self._groups - and self._num_filters == self._num_channels - and not self._use_cudnn - ): - self._op_type = 'depthwise_conv2d_transpose' - else: - self._op_type = 'conv2d_transpose' - - self._padding = utils.convert_to_list(self._padding, 2, 'padding') - self._stride = utils.convert_to_list(self._stride, 2, 'stride') - self._dilation = utils.convert_to_list(self._dilation, 2, 'dilation') - - self._filter_size = utils.convert_to_list( - self._filter_size, 2, 'conv2d_transpose.filter_size' - ) - - if self._output_size is None: - self._output_size = [] - elif isinstance(self._output_size, list): - if utils._contain_var(self._output_size): - self._output_size = utils._convert_to_tensor_list( - self._output_size - ) - else: - self._output_size = utils.convert_to_list( - self._output_size, 2, 'output_size' - ) - elif isinstance(self._output_size, int): - self._output_size = utils.convert_to_list( - self._output_size, 2, 'output_size' - ) - elif isinstance(self._output_size, Variable): - check_dtype( - self._output_size.dtype, - 'output_size', - ['int32', 'int64'], - 'Conv2DTranspose', - ) - if len(self._output_size.shape) == 1 and ( - self._output_size.shape[0] == 1 - or self._output_size.shape[0] == 2 - ): - if self._output_size.shape[0] == 1: - self._output_size = [self._output_size, self._output_size] - else: - raise ValueError( - "output_size must contain one or two integers." 
- ) - else: - raise ValueError("output_size should be list or int or Tensor") - self._padding = utils.convert_to_list(self._padding, 2, 'padding') - self._groups = 1 if self._groups is None else self._groups - filter_shape = [ - self._num_channels, - self._num_filters // self._groups, - ] + self._filter_size - - self.weight = self.create_parameter( - dtype=self._dtype, shape=filter_shape, attr=self._param_attr - ) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True, - ) - - def forward(self, input): - if _non_static_mode(): - op = getattr(_legacy_C_ops, self._op_type) - out = op( - input, - self.weight, - 'output_size', - self._output_size, - 'strides', - self._stride, - 'paddings', - self._padding, - 'dilations', - self._dilation, - 'groups', - self._groups, - 'use_cudnn', - self._use_cudnn, - ) - pre_bias = out - pre_act = dygraph_utils._append_bias_in_dygraph( - pre_bias, self.bias, 1 - ) - return dygraph_utils._append_activation_in_dygraph( - pre_act, act=self._act - ) - - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], "Conv2DTranspose" - ) - - inputs = {'Input': [input], 'Filter': [self.weight]} - attrs = { - 'output_size': self._output_size, - 'strides': self._stride, - 'paddings': self._padding, - 'dilations': self._dilation, - 'groups': self._groups, - 'use_cudnn': self._use_cudnn, - } - - pre_bias = self._helper.create_variable_for_type_inference( - dtype=input.dtype - ) - self._helper.append_op( - type=self._op_type, - inputs=inputs, - outputs={'Output': pre_bias}, - attrs=attrs, - ) - - if self.bias is not None: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], 'Y': [self.bias]}, - outputs={'Out': [pre_act]}, - attrs={'axis': 1}, - ) - else: - pre_act = pre_bias - - out = self._helper.append_activation(pre_act, act=self._act) - return out - - -class SequenceConv(layers.Layer): - """ - This function creates the op for sequence_conv, using the inputs and - other convolutional configurations for the filters and stride as given - in the input parameters to the function. - - Parameters: - name_scope(str): The name of this class. - num_filters (int): number of filters. - filter_size (int): the filter size (H and W). Default: 3. - filter_stride (int): stride of the filter. Default: 1. - padding (bool|None): if True, add paddings. Default: None - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, sequence_conv - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of sequence_conv. If it is set to None or one attribute of ParamAttr, sequence_conv - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - act (str): Activation type, if it is set to None, activation is not appended. - Default: None. - - Attributes: - weight (Parameter): the learnable weights of filters of this layer. - bias (Parameter|None): the learnable bias of this layer. 
- - Returns: - Variable: output of sequence_conv - """ - - def __init__( - self, - name_scope, - num_filters, - filter_size=3, - filter_stride=1, - padding=None, - bias_attr=None, - param_attr=None, - act=None, - ): - assert ( - not _non_static_mode() - ), "SequenceConv is not supported by dynamic graph mode yet!" - super().__init__(name_scope) - self._num_filters = num_filters - self._filter_size = filter_size - self._filter_stride = filter_stride - self._padding = padding - self._bias_attr = bias_attr - self._param_attr = param_attr - self._act = act - - def _build_once(self, input): - self._dtype = self._helper.input_dtype(input) - filter_shape = [self._filter_size * input.shape[1], self._num_filters] - self.weight = self.create_parameter( - attr=self._param_attr, shape=filter_shape, dtype=self._dtype - ) - - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True, - ) - - def forward(self, input): - pre_bias = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type='sequence_conv', - inputs={ - 'X': [input], - 'Filter': [self.weight], - }, - outputs={"Out": pre_bias}, - attrs={ - 'contextStride': self._filter_stride, - 'contextStart': -int(self._filter_size // 2), - 'contextLength': self._filter_size, - }, - ) - - if self.bias is not None: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], 'Y': [self.bias]}, - outputs={'Out': [pre_act]}, - attrs={'axis': 1}, - ) - else: - pre_act = pre_bias - - return self._helper.append_activation(pre_act, act=self._act) - - class RowConv(layers.Layer): """ ***Row-convolution operator*** diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index 3733977c5dbcf..1f589b8d6fc8b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -113,11 +113,11 @@ def test_transformed_static_result(self): # 1. 
test Apis that inherit from layers.Layer def dyfunc_BilinearTensorProduct(layer1, layer2): - bilinearTensorProduct = fluid.dygraph.nn.BilinearTensorProduct( - input1_dim=5, - input2_dim=4, - output_dim=1000, - param_attr=fluid.ParamAttr( + bilinearTensorProduct = paddle.nn.Bilinear( + 5, + 4, + 1000, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( @@ -165,12 +165,11 @@ def dyfunc_Conv3D(input): def dyfunc_Conv2DTranspose(input): - conv2dTranspose = fluid.dygraph.nn.Conv2DTranspose( - num_channels=3, - num_filters=12, - filter_size=12, - use_cudnn=False, - param_attr=fluid.ParamAttr( + conv2dTranspose = paddle.nn.Conv2DTranspose( + 3, + 12, + 12, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( @@ -221,11 +220,12 @@ def dyfunc_Pool2D(input): def dyfunc_Prelu(input): - prelu0 = fluid.PRelu( - mode='all', - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)), + prelu0 = paddle.nn.PReLU( + weight_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0) + ), ) - res = prelu0(input=input) + res = prelu0(input) return res diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index a8d6595c5bd0b..a47a5cfe1dad4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -37,7 +37,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Conv2DTranspose +from paddle.fluid.dygraph.nn import BatchNorm from paddle.jit import ProgramTranslator from paddle.jit.api import declarative @@ -430,14 +430,13 @@ def __init__( initializer=fluid.initializer.Constant(0.0) ) - self._deconv = Conv2DTranspose( + self._deconv = paddle.nn.Conv2DTranspose( num_channels, num_filters, - filter_size=filter_size, + filter_size, stride=stride, padding=padding, - use_cudnn=use_cudnn, - param_attr=fluid.ParamAttr( + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.NormalInitializer( loc=0.0, scale=stddev ) diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py index 74910a4a87c15..77416dd9826fb 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py @@ -24,9 +24,7 @@ class TestDygraphBilinearTensorProductAPIError(unittest.TestCase): def test_errors(self): with fluid.program_guard(fluid.Program(), fluid.Program()): - layer = fluid.dygraph.nn.BilinearTensorProduct( - input1_dim=5, input2_dim=4, output_dim=1000 - ) + layer = paddle.nn.Bilinear(5, 4, 1000) # the input must be Variable. 
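The replacements above follow one consistent mapping from the removed dygraph layers to their `paddle.nn` counterparts: `param_attr` becomes `weight_attr`, the `num_channels`/`num_filters`/`filter_size` keywords become positional channel and kernel-size arguments, and `PRelu`'s `mode` is expressed through `num_parameters`. A hedged sketch of the `PRelu` correspondence, assuming the paddle 2.x `paddle.nn.PReLU` API (the constant initializer mirrors the tests above):

.. code-block:: python

    import paddle

    x = paddle.ones([5, 200, 100, 100])
    # mode='all' in the removed fluid.PRelu: a single shared alpha
    prelu_all = paddle.nn.PReLU(
        weight_attr=paddle.ParamAttr(
            initializer=paddle.nn.initializer.Constant(1.0)
        )
    )
    # mode='channel': one alpha per input channel
    prelu_channel = paddle.nn.PReLU(num_parameters=200)
    y = prelu_all(x)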
x0 = fluid.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 8d87195db497a..2d4694be2e9e3 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -1084,86 +1084,5 @@ def call_func(self, x): return out -class TestTensorOutputSize5(TestTensorOutputSize1): - def path_prefix(self): - return 'conv2d_transpose_tensor_output_size5' - - def call_func(self, x): - w_var = paddle.randn((3, 6, 3, 3), dtype='float32') - output_size = [17, paddle.assign([17])] - conv2d_trans = paddle.fluid.dygraph.Conv2DTranspose( - num_channels=3, - num_filters=6, - filter_size=3, - output_size=output_size, - stride=2, - ) - out = conv2d_trans(x) - return out - - -class TestTensorOutputSize6(TestTensorOutputSize1): - def path_prefix(self): - return 'conv2d_transpose_tensor_output_size6' - - def var_prefix(self): - return "Var[" - - def call_func(self, x): - w_var = paddle.randn((3, 6, 3, 3), dtype='float32') - output_size = paddle.assign([17, 17]) - conv2d_trans = paddle.fluid.dygraph.Conv2DTranspose( - num_channels=3, - num_filters=6, - filter_size=3, - output_size=output_size, - stride=2, - ) - out = conv2d_trans(x) - return out - - -class TestTensorOutputSize7(TestTensorOutputSize1): - def path_prefix(self): - return 'conv2d_transpose_tensor_output_size7' - - def var_prefix(self): - return "" - - def call_func(self, x): - w_var = paddle.randn((3, 6, 3, 3), dtype='float32') - output_size = 17 - conv2d_trans = paddle.fluid.dygraph.Conv2DTranspose( - num_channels=3, - num_filters=6, - filter_size=3, - output_size=output_size, - stride=2, - ) - out = conv2d_trans(x) - return out - - -class TestTensorOutputSize8(TestTensorOutputSize1): - def path_prefix(self): - return 'conv2d_transpose_tensor_output_size8' - - def var_prefix(self): - return "" - - def call_func(self, x): - w_var = paddle.randn((3, 6, 3, 3), dtype='float32') - output_size = [17, 17] - conv2d_trans = paddle.fluid.dygraph.Conv2DTranspose( - num_channels=3, - num_filters=6, - filter_size=3, - output_size=output_size, - stride=2, - ) - out = conv2d_trans(x) - return out - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py b/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py index f55cc3370473b..16939f26a2fb7 100644 --- a/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py +++ b/python/paddle/fluid/tests/unittests/test_dataloader_early_reset.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import unittest + +import numpy as np + import paddle import paddle.fluid as fluid -import numpy as np -import unittest def infinite_reader(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 05c7542792cc6..96bb7914a6c7e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GroupNorm, PRelu +from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GroupNorm from paddle.nn import Linear @@ -212,9 +212,6 @@ def __init__(self): self.layer_norm_1 = paddle.nn.LayerNorm([10]) self.layer_norm_2 = paddle.nn.LayerNorm(10) - self.prelu1 = PRelu("channel", channel=5) - self.prelu2 = PRelu("channel", channel=5) - self.group_norm1 = GroupNorm(8, 4) self.gourp_norm2 = GroupNorm(8, 4) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 1a1b22ee71c35..00ac5906b6627 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -185,10 +185,10 @@ def __init__( ): super().__init__() - self._deconv = fluid.dygraph.Conv2DTranspose( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._deconv = paddle.nn.Conv2DTranspose( + num_channels, + num_filters, + filter_size, stride=stride, padding=padding, bias_attr=None if use_bias else False, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 99fb5fac4ae6b..52b864648cb7f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -33,8 +33,6 @@ default_main_program, program_guard, ) -from paddle.fluid.initializer import Constant -from paddle.fluid.param_attr import ParamAttr from paddle.tensor import random @@ -383,54 +381,6 @@ def test_elementwise_minmax(self): np.testing.assert_allclose(n, min_eager_ret_value, rtol=1e-05) np.testing.assert_allclose(n2, max_eager_ret_value, rtol=1e-05) - def test_sequence_conv(self): - inp_np = np.arange(12).reshape([3, 4]).astype('float32') - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - with self.static_graph(): - seq = layers.data( - name='seq_in', - shape=[3, 4], - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - out = layers.sequence_conv(seq, 2, act='sigmoid') - static_rlt = self.get_static_graph_result( - feed={ - "seq_in": fluid.create_lod_tensor( - data=inp_np, recursive_seq_lens=[[1, 1, 1]], place=place - ) - }, - fetch_list=[out], - with_lod=True, - )[0] - - with self.static_graph(): - seq = layers.data( - name='seq_in', - shape=[3, 4], - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - seq_conv = nn.SequenceConv('seq_conv', num_filters=2, act='sigmoid') - out = seq_conv(seq) - static_rlt2 = self.get_static_graph_result( - feed={ - "seq_in": fluid.create_lod_tensor( - data=inp_np, recursive_seq_lens=[[1, 1, 1]], place=place - ) - }, - fetch_list=[out], - with_lod=True, - )[0] - np.testing.assert_array_equal( - 
np.array(static_rlt), np.array(static_rlt2) - ) - def test_conv2d_transpose(self): inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32') with self.static_graph(): @@ -447,37 +397,37 @@ def test_conv2d_transpose(self): )[0] with self.static_graph(): img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') - conv2d_transpose = nn.Conv2DTranspose( - num_channels=3, - num_filters=10, - filter_size=27, - act='sigmoid', + conv2d_transpose = paddle.nn.Conv2DTranspose( + 3, + 10, + 27, bias_attr=fluid.initializer.ConstantInitializer(value=1), ) out = conv2d_transpose(img) + out = paddle.nn.functional.sigmoid(out) static_rlt2 = self.get_static_graph_result( feed={'pixel': inp_np}, fetch_list=[out] )[0] with self.dynamic_graph(): with _test_eager_guard(): - conv2d_transpose = nn.Conv2DTranspose( - num_channels=3, - num_filters=10, - filter_size=27, - act='sigmoid', + conv2d_transpose = paddle.nn.Conv2DTranspose( + 3, + 10, + 27, bias_attr=fluid.initializer.ConstantInitializer(value=1), ) dy_eager_rlt = conv2d_transpose(base.to_variable(inp_np)) + dy_eager_rlt = paddle.nn.functional.sigmoid(dy_eager_rlt) dy_eager_rlt_value = dy_eager_rlt.numpy() - conv2d_transpose = nn.Conv2DTranspose( - num_channels=3, - num_filters=10, - filter_size=27, - act='sigmoid', + conv2d_transpose = paddle.nn.Conv2DTranspose( + 3, + 10, + 27, bias_attr=fluid.initializer.ConstantInitializer(value=1), ) dy_rlt = conv2d_transpose(base.to_variable(inp_np)) + dy_rlt = paddle.nn.functional.sigmoid(dy_rlt) dy_rlt_value = dy_rlt.numpy() np.testing.assert_allclose(static_rlt2, static_rlt, rtol=1e-05) np.testing.assert_allclose(dy_rlt_value, static_rlt2, rtol=1e-05) @@ -492,14 +442,12 @@ def test_conv2d_transpose(self): custom_weight ) ) - conv2d1 = nn.Conv2DTranspose( - num_channels=3, num_filters=3, filter_size=[2, 2] - ) - conv2d2 = nn.Conv2DTranspose( - num_channels=3, - num_filters=3, - filter_size=[2, 2], - param_attr=weight_attr, + conv2d1 = paddle.nn.Conv2DTranspose(3, 3, [2, 2]) + conv2d2 = paddle.nn.Conv2DTranspose( + 3, + 3, + [2, 2], + weight_attr=weight_attr, ) dy_ret1 = conv2d1(base.to_variable(images)) dy_ret2 = conv2d2(base.to_variable(images)) @@ -537,14 +485,12 @@ def test_conv2d_transpose(self): custom_weight ) ) - conv2d1 = nn.Conv2DTranspose( - num_channels=3, num_filters=3, filter_size=[2, 2] - ) - conv2d2 = nn.Conv2DTranspose( - num_channels=3, - num_filters=3, - filter_size=[2, 2], - param_attr=weight_attr, + conv2d1 = paddle.nn.Conv2DTranspose(3, 3, [2, 2]) + conv2d2 = paddle.nn.Conv2DTranspose( + 3, + 3, + [2, 2], + weight_attr=weight_attr, ) dy_ret1 = conv2d1(base.to_variable(images)) dy_ret2 = conv2d2(base.to_variable(images)) @@ -578,9 +524,7 @@ def test_conv2d_transpose(self): # the input of Conv2DTranspose must be Variable. 
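# A minimal sketch of the migration applied throughout this test, assuming the
# paddle.nn positional signature (in_channels, out_channels, kernel_size):
#
#     nn.Conv2DTranspose(num_channels=3, num_filters=3, filter_size=[2, 2],
#                        param_attr=weight_attr, act='sigmoid')
#
# becomes
#
#     conv = paddle.nn.Conv2DTranspose(3, 3, [2, 2], weight_attr=weight_attr)
#     out = paddle.nn.functional.sigmoid(conv(x))
#
# i.e. the keyword arguments turn positional, param_attr is renamed to
# weight_attr, and the fused act='sigmoid' becomes an explicit activation call.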
def test_Variable(): images = np.ones([2, 3, 5, 5], dtype='float32') - conv2d = nn.Conv2DTranspose( - num_channels=3, num_filters=3, filter_size=[2, 2] - ) + conv2d = paddle.nn.Conv2DTranspose(3, 3, [2, 2]) conv2d_ret1 = conv2d(images) self.assertRaises(TypeError, test_Variable) @@ -591,9 +535,7 @@ def test_type(): images = layers.data( name='pixel', shape=[3, 5, 5], dtype='int32' ) - conv2d = nn.Conv2DTranspose( - num_channels=3, num_filters=3, filter_size=[2, 2] - ) + conv2d = paddle.nn.Conv2DTranspose(3, 3, [2, 2]) conv2d_ret2 = conv2d(images) self.assertRaises(TypeError, test_type) @@ -628,53 +570,55 @@ def test_bilinear_tensor_product(self): data_y = layers.data( name='y', shape=[1, 3], dtype="float32", append_batch_size=False ) - btp = nn.BilinearTensorProduct( + btp = paddle.nn.Bilinear( 3, 3, 6, bias_attr=fluid.initializer.ConstantInitializer(value=1), - act='sigmoid', ) out = btp(data_x, data_y) + out = paddle.nn.functional.sigmoid(out) static_rlt2 = self.get_static_graph_result( feed={'x': inp_np_x, 'y': inp_np_y}, fetch_list=[out] )[0] with self.dynamic_graph(): with _test_eager_guard(): - btp = nn.BilinearTensorProduct( + btp = paddle.nn.Bilinear( 3, 3, 6, bias_attr=fluid.initializer.ConstantInitializer(value=1), - act='sigmoid', ) dy_eager_rlt = btp( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_eager_rlt = paddle.nn.functional.sigmoid(dy_eager_rlt) dy_eager_rlt_value = dy_eager_rlt.numpy() - btp = nn.BilinearTensorProduct( + btp = paddle.nn.Bilinear( 3, 3, 6, bias_attr=fluid.initializer.ConstantInitializer(value=1), - act='sigmoid', ) dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) + dy_rlt = paddle.nn.functional.sigmoid(dy_rlt) dy_rlt_value = dy_rlt.numpy() with self.dynamic_graph(): with _test_eager_guard(): - btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') + btp2 = paddle.nn.Bilinear(3, 3, 6) dy_eager_rlt2 = btp2( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_eager_rlt2 = paddle.nn.functional.sigmoid(dy_eager_rlt2) dy_eager_rlt2_value = dy_eager_rlt2.numpy() - btp2 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') + btp2 = paddle.nn.Bilinear(3, 3, 6) dy_rlt2 = btp2( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_rlt2 = paddle.nn.functional.sigmoid(dy_rlt2) dy_rlt2_value = dy_rlt2.numpy() with self.static_graph(): @@ -706,16 +650,16 @@ def test_bilinear_tensor_product(self): custom_weight ) ) - btp1 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') - btp2 = nn.BilinearTensorProduct( - 3, 3, 6, act='sigmoid', param_attr=weight_attr - ) + btp1 = paddle.nn.Bilinear(3, 3, 6) + btp2 = paddle.nn.Bilinear(3, 3, 6, weight_attr=weight_attr) dy_rlt1 = btp1( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_rlt1 = paddle.nn.functional.sigmoid(dy_rlt1) dy_rlt2 = btp2( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_rlt2 = paddle.nn.functional.sigmoid(dy_rlt2) self.assertFalse( np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) ) @@ -744,16 +688,16 @@ def test_bilinear_tensor_product(self): custom_weight ) ) - btp1 = nn.BilinearTensorProduct(3, 3, 6, act='sigmoid') - btp2 = nn.BilinearTensorProduct( - 3, 3, 6, act='sigmoid', param_attr=weight_attr - ) + btp1 = paddle.nn.Bilinear(3, 3, 6) + btp2 = paddle.nn.Bilinear(3, 3, 6, weight_attr=weight_attr) dy_rlt1 = btp1( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_rlt1 = paddle.nn.functional.sigmoid(dy_rlt1) dy_rlt2 = btp2( base.to_variable(inp_np_x), base.to_variable(inp_np_y) ) + dy_rlt2 = 
paddle.nn.functional.sigmoid(dy_rlt2) self.assertFalse(np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) btp2.weight.set_value(btp1.weight.numpy()) btp2.bias.set_value(btp1.bias) @@ -772,133 +716,6 @@ def test_bilinear_tensor_product(self): ) np.testing.assert_array_equal(btp1.bias.numpy(), btp2.bias.numpy()) - def prelu_test(self, mode): - inp_np = np.ones([5, 200, 100, 100]).astype('float32') - with self.static_graph(): - data_t = layers.data( - name="input", - shape=[5, 200, 100, 100], - dtype="float32", - append_batch_size=False, - ) - out = paddle.static.nn.prelu( - data_t, mode, param_attr=ParamAttr(initializer=Constant(1.0)) - ) - static_rlt = self.get_static_graph_result( - feed={"input": inp_np}, fetch_list=[out] - )[0] - - with self.static_graph(): - data_t = layers.data( - name="input", - shape=[5, 200, 100, 100], - dtype="float32", - append_batch_size=False, - ) - prelu = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=data_t.shape, - param_attr=ParamAttr(initializer=Constant(1.0)), - ) - out = prelu(data_t) - static_rlt2 = self.get_static_graph_result( - feed={"input": inp_np}, fetch_list=[out] - )[0] - - with self.dynamic_graph(): - with _test_eager_guard(): - prelu = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=inp_np.shape, - param_attr=ParamAttr(initializer=Constant(1.0)), - ) - dy_eager_rlt = prelu(base.to_variable(inp_np)) - dy_eager_rlt_value = dy_eager_rlt.numpy() - - prelu = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=inp_np.shape, - param_attr=ParamAttr(initializer=Constant(1.0)), - ) - dy_rlt = prelu(base.to_variable(inp_np)) - dy_rlt_value = dy_rlt.numpy() - - np.testing.assert_allclose(static_rlt2, static_rlt, rtol=1e-05) - np.testing.assert_allclose(dy_rlt_value, static_rlt, rtol=1e-05) - np.testing.assert_allclose(dy_eager_rlt_value, static_rlt, rtol=1e-05) - - with self.dynamic_graph(): - with _test_eager_guard(): - inp_np = np.random.randn(5, 200, 100, 100).astype("float32") - inp = base.to_variable(inp_np) - prelu1 = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=inp_np.shape, - param_attr=ParamAttr(initializer=Constant(2.0)), - ) - prelu2 = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=inp_np.shape, - param_attr=ParamAttr(initializer=Constant(1.0)), - ) - dy_rlt1 = prelu1(inp) - dy_rlt2 = prelu2(inp) - self.assertFalse( - np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy()) - ) - self.assertFalse( - np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) - ) - prelu2.weight.set_value(prelu1.weight.numpy()) - dy_rlt1 = prelu1(inp) - dy_rlt2 = prelu2(inp) - np.testing.assert_array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) - - prelu2.weight = prelu1.weight - np.testing.assert_array_equal( - prelu1.weight.numpy(), prelu2.weight.numpy() - ) - - inp_np = np.random.randn(5, 200, 100, 100).astype("float32") - inp = base.to_variable(inp_np) - prelu1 = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=inp_np.shape, - param_attr=ParamAttr(initializer=Constant(2.0)), - ) - prelu2 = nn.PRelu( - mode=mode, - channel=inp_np.shape[1], - input_shape=inp_np.shape, - param_attr=ParamAttr(initializer=Constant(1.0)), - ) - dy_rlt1 = prelu1(inp) - dy_rlt2 = prelu2(inp) - self.assertFalse( - np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy()) - ) - self.assertFalse(np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) - prelu2.weight.set_value(prelu1.weight.numpy()) - dy_rlt1 = prelu1(inp) - dy_rlt2 = prelu2(inp) - np.testing.assert_array_equal(dy_rlt1.numpy(), 
dy_rlt2.numpy()) - - prelu2.weight = prelu1.weight - np.testing.assert_array_equal( - prelu1.weight.numpy(), prelu2.weight.numpy() - ) - - def test_prelu(self): - self.prelu_test("channel") - self.prelu_test("element") - self.prelu_test("all") - def test_embeding(self): inp_word = np.array([[[1]]]).astype('int64') dict_size = 20 @@ -1207,56 +1024,6 @@ def test_conv3d(self): conv3d1.bias.numpy(), conv3d2.bias.numpy() ) - def test_row_conv(self): - input = np.arange(15).reshape([3, 5]).astype('float32') - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - - with self.static_graph(): - x = layers.data( - name='X', - shape=[3, 5], - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - ret = layers.row_conv(input=x, future_context_size=2) - static_ret = self.get_static_graph_result( - feed={ - 'X': fluid.create_lod_tensor( - data=input, recursive_seq_lens=[[1, 1, 1]], place=place - ) - }, - fetch_list=[ret], - with_lod=True, - )[0] - - with self.static_graph(): - x = layers.data( - name='X', - shape=[3, 5], - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - rowConv = nn.RowConv('RowConv', future_context_size=2) - ret = rowConv(x) - static_ret2 = self.get_static_graph_result( - feed={ - 'X': fluid.create_lod_tensor( - data=input, recursive_seq_lens=[[1, 1, 1]], place=place - ) - }, - fetch_list=[ret], - with_lod=True, - )[0] - - # TODO: dygraph can't support LODTensor - - np.testing.assert_allclose(static_ret, static_ret2, rtol=1e-05) - def func_group_norm(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py index 83ce7e5d35519..34d89ec89b47c 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py @@ -16,7 +16,6 @@ import time import unittest -import paddle import numpy as np from test_multiprocess_dataloader_static import ( BATCH_SIZE, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index bb2cddf04b13a..12302c582f307 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -19,9 +19,6 @@ sys.path.append("..") -import paddle -import paddle.nn.functional as F - from op_test import OpTest from op_test_xpu import XPUOpTest from xpu.get_test_cover_info import ( @@ -31,6 +28,7 @@ ) import paddle +import paddle.nn.functional as F paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py index cce01f1aebf3b..ce1d028e5e032 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py @@ -12,18 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import paddle.fluid as fluid -import numpy as np import sys import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid + sys.path.append("..") from op_test_xpu import XPUOpTest from xpu.get_test_cover_info import ( + XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, - XPUOpTestWrapper, ) paddle.enable_static() diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py index 948b100bce713..3ed54ddd2ccee 100644 --- a/python/paddle/static/nn/metric.py +++ b/python/paddle/static/nn/metric.py @@ -15,16 +15,12 @@ All layers just related to metric. """ -from paddle.fluid.layer_helper import LayerHelper +from paddle import _legacy_C_ops +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid.framework import Variable, _non_static_mode, _varbase_creator from paddle.fluid.initializer import Constant -from paddle.fluid.framework import ( - Variable, - _non_static_mode, - _varbase_creator, -) +from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import tensor -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle import _legacy_C_ops __all__ = ['accuracy', 'auc'] From 3a8aac354cce02d3d0f9dc650bd905fb305b4e2b Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 7 Dec 2022 17:58:45 +0800 Subject: [PATCH 29/60] fix ci (#48730) --- paddle/scripts/paddle_build.bat | 5 ++++- paddle/scripts/paddle_build.sh | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 254b268b958b2..0163946682400 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -99,6 +99,10 @@ set PYTHON_VENV_ROOT=%cache_dir%\python_venv set PYTHON_EXECUTABLE=!PYTHON_VENV_ROOT!\Scripts\python.exe %PYTHON_ROOT%\python.exe -m venv --clear !PYTHON_VENV_ROOT! call !PYTHON_VENV_ROOT!\Scripts\activate.bat +if %ERRORLEVEL% NEQ 0 ( + echo activate python virtual environment failed! + exit /b 5 +) if "%WITH_PYTHON%" == "ON" ( where python @@ -640,7 +644,6 @@ pip uninstall -y paddlepaddle pip uninstall -y paddlepaddle-gpu pip install %PADDLE_WHL_FILE_WIN% if %ERRORLEVEL% NEQ 0 ( - call paddle_winci\Scripts\deactivate.bat 2>NUL echo pip install whl package failed! 
exit /b 1
 )
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 4e563496d3529..71facd9695068 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1008,9 +1008,8 @@ function generate_upstream_develop_api_spec() {
     cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true`
 
     cd ${PADDLE_ROOT}
-    git fetch upstream $BRANCH
     git checkout -b develop_base_pr -t upstream/$BRANCH
-    echo "upstream develop git log: "
+    echo "develop git log: "
     git log --pretty=oneline -10
 
     dev_commit=`git log -1|head -1|awk '{print $2}'`

From e75c651d150f7d131614bad32765ca2cf1b5f3cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kevin=E5=90=B4=E5=98=89=E6=96=87?= <417333277@qq.com>
Date: Wed, 7 Dec 2022 19:04:41 +0800
Subject: [PATCH 30/60] Remove redundant numpy output in Example code (1/3),
 test=document_fix (#48678)

---
 .../communication/stream/all_reduce.py        |  2 +-
 python/paddle/fft.py                          | 41 +++++++++---------
 .../incubate/nn/layer/fused_transformer.py    |  4 +-
 python/paddle/nn/functional/activation.py     |  9 ++--
 python/paddle/nn/functional/conv.py           | 20 ++++-----
 python/paddle/nn/functional/distance.py       |  4 +-
 python/paddle/nn/functional/extension.py      |  9 ++--
 python/paddle/nn/functional/loss.py           | 41 ++++++++++--------
 python/paddle/nn/functional/vision.py         |  5 +--
 python/paddle/nn/initializer/constant.py      | 10 +++--
 python/paddle/nn/layer/conv.py                | 20 ++++-----
 python/paddle/nn/layer/distance.py            |  4 +-
 python/paddle/nn/layer/loss.py                | 43 +++++++++++--------
 python/paddle/nn/layer/vision.py              |  5 +--
 python/paddle/nn/quant/quant_layers.py        |  6 +--
 python/paddle/tensor/manipulation.py          |  5 ++-
 python/paddle/text/datasets/conll05.py        |  2 +-
 python/paddle/text/datasets/imdb.py           |  2 +-
 python/paddle/text/datasets/imikolov.py       |  2 +-
 python/paddle/text/datasets/movielens.py      |  2 +-
 20 files changed, 126 insertions(+), 110 deletions(-)

diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py
index dd04ab0852bf3..16f69764f4e61 100644
--- a/python/paddle/distributed/communication/stream/all_reduce.py
+++ b/python/paddle/distributed/communication/stream/all_reduce.py
@@ -106,7 +106,7 @@ def all_reduce(
             data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
             task = dist.stream.all_reduce(data, sync_op=False)
             task.wait()
-            out = data.numpy()
+            out = data
             # [[5, 7, 9], [5, 7, 9]]
     """
     if _warn_cur_rank_not_in_group(group):
diff --git a/python/paddle/fft.py b/python/paddle/fft.py
index 9d2b4e2a1995e..7718e038c77c6 100644
--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
@@ -530,26 +530,27 @@ def fftn(x, s=None, axes=None, norm="backward", name=None):
             x = paddle.meshgrid(arr, arr, arr)[1]
 
             fftn_xp = paddle.fft.fftn(x, axes=(1, 2))
-            print(fftn_xp.numpy())
-            # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+8.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]
-
-            #  [[24.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+8.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]
-
-            #  [[24.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+8.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]
-
-            #  [[24.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+8.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.+0.j 0.+0.j 0.+0.j 0.-0.j]
-            #   [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]]
+            print(fftn_xp)
+            # Tensor(shape=[4, 4, 4], dtype=complex128, place=Place(gpu:0), stop_gradient=True,
+            #        [[[(24+0j), 0j     , 0j     , -0j    ],
+            #          [(-8+8j), 0j     , 0j     , -0j    ],
+            #          
[(-8+0j), 0j , 0j , -0j ], + # [(-8-8j), 0j , 0j , -0j ]], + + # [[(24+0j), 0j , 0j , -0j ], + # [(-8+8j), 0j , 0j , -0j ], + # [(-8+0j), 0j , 0j , -0j ], + # [(-8-8j), 0j , 0j , -0j ]], + + # [[(24+0j), 0j , 0j , -0j ], + # [(-8+8j), 0j , 0j , -0j ], + # [(-8+0j), 0j , 0j , -0j ], + # [(-8-8j), 0j , 0j , -0j ]], + + # [[(24+0j), 0j , 0j , -0j ], + # [(-8+8j), 0j , 0j , -0j ], + # [(-8+0j), 0j , 0j , -0j ], + # [(-8-8j), 0j , 0j , -0j ]]]) """ if is_integer(x) or is_floating_point(x): return fftn_r2c( diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index ad96ab9669e67..2f745a3feb980 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -533,8 +533,8 @@ class FusedFeedForward(Layer): fused_feedforward_layer = FusedFeedForward(8, 8) x = paddle.rand((1, 8, 8)) out = fused_feedforward_layer(x) - print(out.numpy().shape) - # (1, 8, 8) + print(out.shape) + # [1, 8, 8] """ def __init__( diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 6bf32317a461f..89bb63643f6a1 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1677,11 +1677,12 @@ def glu(x, axis=-1, name=None): x = paddle.to_tensor( [[-0.22014759, -1.76358426, 0.80566144, 0.04241343], - [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] + [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] ) - print(F.glu(x).numpy()) - # array([[-0.15216254, -0.9004892 ], - # [-1.0577879 , -0.46985325]], dtype=float32) + print(F.glu(x)) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[-0.15216254, -0.90048921], + # [-1.05778778, -0.46985325]]) """ check_variable_and_dtype( diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index face92190c0f5..9b5f63254809b 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -657,10 +657,9 @@ def conv2d( w_var = paddle.randn((6, 3, 3, 3), dtype='float32') y_var = F.conv2d(x_var, w_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 6, 6) + print(y_var.shape) + # [2, 6, 6, 6] """ # entry checks if data_format not in ["NCHW", "NHWC"]: @@ -1234,10 +1233,9 @@ def conv2d_transpose( w_var = paddle.randn((3, 6, 3, 3), dtype='float32') y_var = F.conv2d_transpose(x_var, w_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 10, 10) + print(y_var.shape) + # [2, 6, 10, 10] """ if data_format not in ['NCHW', 'NHWC']: @@ -1523,10 +1521,9 @@ def conv3d( w_var = paddle.randn((6, 3, 3, 3, 3), dtype='float32') y_var = F.conv3d(x_var, w_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 6, 6, 6) + print(y_var.shape) + # [2, 6, 6, 6, 6] """ # entry check if data_format not in ["NCDHW", "NDHWC"]: @@ -1738,10 +1735,9 @@ def conv3d_transpose( w_var = paddle.randn((3, 6, 3, 3, 3), dtype='float32') y_var = F.conv3d_transpose(x_var, w_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 10, 10, 10) + print(y_var.shape) + # [2, 6, 10, 10, 10] """ # entry checks if data_format not in ["NCDHW", "NDHWC"]: diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index a931d3cb006ad..b9783c251be0c 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -63,7 +63,9 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None): x = paddle.to_tensor([[1., 3.], [3., 
5.]], dtype=paddle.float64) y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64) distance = paddle.nn.functional.pairwise_distance(x, y) - print(distance.numpy()) # [5. 5.] + print(distance) + # Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [4.99999860, 4.99999860]) """ check_type(p, 'porder', (float, int), 'PairwiseDistance') diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index e2327871bcfc3..f3d906be1f3ed 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -215,10 +215,11 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): lengths = paddle.to_tensor([10, 9, 8]) mask = paddle.nn.functional.sequence_mask(lengths) - print(mask.numpy()) - # [[1 1 1 1 1 1 1 1 1 1] - # [1 1 1 1 1 1 1 1 1 0] - # [1 1 1 1 1 1 1 1 0 0]] + print(mask) + # Tensor(shape=[3, 10], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + # [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + # [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) """ diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index dd5c2e128268c..83341a9dabc01 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1353,17 +1353,20 @@ def l1_loss(input, label, reduction='mean', name=None): label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) l1_loss = paddle.nn.functional.l1_loss(input, label) - print(l1_loss.numpy()) - # [0.35] + print(l1_loss) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.34999999]) l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='none') - print(l1_loss.numpy()) - # [[0.20000005 0.19999999] - # [0.2 0.79999995]] + print(l1_loss) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0.20000005, 0.19999999], + # [0.20000000, 0.79999995]]) l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') - print(l1_loss.numpy()) - # [1.4] + print(l1_loss) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.39999998]) """ if reduction not in ['sum', 'mean', 'none']: @@ -2530,9 +2533,11 @@ def cross_entropy( cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=reduction) dy_ret = cross_entropy_loss( - input, - label) - print(dy_ret.numpy()) #[5.41993642] + input, + label) + print(dy_ret) + # Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [5.34043430]) .. 
code-block:: python @@ -2550,13 +2555,15 @@ def cross_entropy( labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) labels /= paddle.sum(labels, axis=axis, keepdim=True) paddle_loss_mean = paddle.nn.functional.cross_entropy( - logits, - labels, - soft_label=True, - axis=axis, - weight=weight, - reduction=reduction) - print(paddle_loss_mean.numpy()) #[1.12908343] + logits, + labels, + soft_label=True, + axis=axis, + weight=weight, + reduction=reduction) + print(paddle_loss_mean) + # Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [1.11043464]) """ diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index c01f962d79dbc..2cb448f9fdb2c 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -368,9 +368,8 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): x = paddle.randn(shape=[2,9,4,4]) out_var = F.pixel_shuffle(x, 3) - out = out_var.numpy() - print(out.shape) - # (2, 1, 12, 12) + print(out_var.shape) + # [2, 1, 12, 12] """ if not isinstance(upscale_factor, int): raise TypeError("upscale factor must be int type") diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 6a8ce4385fc5e..637ae6299005c 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -32,11 +32,13 @@ class Constant(ConstantInitializer): data = paddle.rand([30, 10, 2], dtype='float32') linear = nn.Linear(2, - 4, - weight_attr=nn.initializer.Constant(value=2.0)) + 4, + weight_attr=nn.initializer.Constant(value=2.0)) res = linear(data) - print(linear.weight.numpy()) - #result is [[2. 2. 2. 2.],[2. 2. 2. 2.]] + print(linear.weight) + # Tensor(shape=[2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 2., 2., 2.], + # [2., 2., 2., 2.]]) """ diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 81eef1091c1d6..a3d719f67c182 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -668,9 +668,8 @@ class Conv2D(_ConvNd): conv = nn.Conv2D(4, 6, (3, 3)) y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 6, 6) + print(y_var.shape) + # [2, 6, 6, 6] """ def __init__( @@ -841,9 +840,8 @@ class Conv2DTranspose(_ConvNd): conv = nn.Conv2DTranspose(4, 6, (3, 3)) y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 10, 10) + print(y_var.shape) + # [2, 6, 10, 10] """ def __init__( @@ -999,9 +997,8 @@ class Conv3D(_ConvNd): conv = nn.Conv3D(4, 6, (3, 3, 3)) y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 6, 6, 6) + print(y_var.shape) + # [2, 6, 6, 6, 6] """ def __init__( @@ -1181,9 +1178,8 @@ class Conv3DTranspose(_ConvNd): conv = nn.Conv3DTranspose(4, 6, (3, 3, 3)) y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - # (2, 6, 10, 10, 10) + print(y_var.shape) + # [2, 6, 10, 10, 10] """ def __init__( diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 72dea12b49a71..f63ce53c4e2b2 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -56,7 +56,9 @@ class PairwiseDistance(Layer): y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64) dist = paddle.nn.PairwiseDistance() distance = dist(x, y) - print(distance.numpy()) # [5. 5.] 
+ print(distance) + # Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [4.99999860, 4.99999860]) """ diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index e88331676c525..2d5f57f2c585e 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -102,7 +102,9 @@ class BCEWithLogitsLoss(Layer): label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") bce_logit_loss = paddle.nn.BCEWithLogitsLoss() output = bce_logit_loss(logit, label) - print(output.numpy()) # [0.45618808] + print(output) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.45618814]) """ @@ -319,9 +321,11 @@ class CrossEntropyLoss(Layer): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=reduction) dy_ret = cross_entropy_loss( - input, - label) - print(dy_ret.numpy()) #[5.41993642] + input, + label) + print(dy_ret) + # Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [5.34043430]) .. code-block:: python @@ -339,13 +343,15 @@ class CrossEntropyLoss(Layer): labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) labels /= paddle.sum(labels, axis=axis, keepdim=True) paddle_loss_mean = paddle.nn.functional.cross_entropy( - logits, - labels, - soft_label=True, - axis=axis, - weight=weight, - reduction=reduction) - print(paddle_loss_mean.numpy()) #[1.12908343] + logits, + labels, + soft_label=True, + axis=axis, + weight=weight, + reduction=reduction) + print(paddle_loss_mean) + # Tensor(shape=[1], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [1.11043464]) """ @@ -635,19 +641,22 @@ class L1Loss(Layer): l1_loss = paddle.nn.L1Loss() output = l1_loss(input, label) - print(output.numpy()) - # [0.35] + print(output) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.34999999]) l1_loss = paddle.nn.L1Loss(reduction='sum') output = l1_loss(input, label) - print(output.numpy()) - # [1.4] + print(output) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.39999998]) l1_loss = paddle.nn.L1Loss(reduction='none') output = l1_loss(input, label) print(output) - # [[0.20000005 0.19999999] - # [0.2 0.79999995]] + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0.20000005, 0.19999999], + # [0.20000000, 0.79999995]]) """ diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index bcc6ea77bb55c..8542e2b62111e 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -52,10 +52,9 @@ class PixelShuffle(Layer): x = paddle.randn(shape=[2,9,4,4]) pixel_shuffle = nn.PixelShuffle(3) - out_var = pixel_shuffle(x) - out = out_var.numpy() + out = pixel_shuffle(x) print(out.shape) - # (2, 1, 12, 12) + # [2, 1, 12, 12] """ diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 8d81a61dd7921..9cb2db000d531 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -628,10 +628,8 @@ class QuantizedConv2DTranspose(Layer): conv_quantized = QuantizedConv2DTranspose(conv) y_quantized = conv_quantized(x_var) y_var = conv(x_var) - y_quantized_np = y_quantized.numpy() - y_np = y_var.numpy() - print(y_np.shape, y_quantized_np.shape) - # (2, 6, 10, 10), (2, 6, 10, 10) + print(y_var.shape, y_quantized.shape) + # [2, 6, 10, 10], [2, 6, 10, 10] """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 
8c47809d222a9..60272630b2199 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -2453,7 +2453,10 @@ def unique(
 
             x = paddle.to_tensor([2, 3, 3, 1, 5, 3])
             unique = paddle.unique(x)
-            np_unique = unique.numpy() # [1 2 3 5]
+            print(unique)
+            # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True,
+            #        [1, 2, 3, 5])
+
             _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True)
             print(indices)
             # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True,
diff --git a/python/paddle/text/datasets/conll05.py b/python/paddle/text/datasets/conll05.py
index 10ef8f4edfb4e..7456030b6711f 100644
--- a/python/paddle/text/datasets/conll05.py
+++ b/python/paddle/text/datasets/conll05.py
@@ -88,7 +88,7 @@ def forward(self, pred_idx, mark, label):
 
             model = SimpleNet()
             pred_idx, mark, label= model(pred_idx, mark, label)
-            print(pred_idx.numpy(), mark.numpy(), label.numpy())
+            print(pred_idx, mark, label)
 
     """
 
diff --git a/python/paddle/text/datasets/imdb.py b/python/paddle/text/datasets/imdb.py
index 7aad2095c4118..abf4424e3f37b 100644
--- a/python/paddle/text/datasets/imdb.py
+++ b/python/paddle/text/datasets/imdb.py
@@ -67,7 +67,7 @@ def forward(self, doc, label):
 
             model = SimpleNet()
             image, label = model(doc, label)
-            print(doc.numpy().shape, label.numpy().shape)
+            print(doc.shape, label.shape)
 
     """
 
diff --git a/python/paddle/text/datasets/imikolov.py b/python/paddle/text/datasets/imikolov.py
index c9f04712c6fe1..d936bcb667881 100644
--- a/python/paddle/text/datasets/imikolov.py
+++ b/python/paddle/text/datasets/imikolov.py
@@ -67,7 +67,7 @@ def forward(self, src, trg):
 
             model = SimpleNet()
             src, trg = model(src, trg)
-            print(src.numpy().shape, trg.numpy().shape)
+            print(src.shape, trg.shape)
 
     """
 
diff --git a/python/paddle/text/datasets/movielens.py b/python/paddle/text/datasets/movielens.py
index 505863748caa1..b3048426b4299 100644
--- a/python/paddle/text/datasets/movielens.py
+++ b/python/paddle/text/datasets/movielens.py
@@ -134,7 +134,7 @@ def forward(self, category, title, rating):
 
             model = SimpleNet()
             category, title, rating = model(category, title, rating)
-            print(category.numpy().shape, title.numpy().shape, rating.numpy().shape)
+            print(category.shape, title.shape, rating.shape)
 
     """
 
From 4aad4dc5f52c30ca75390e129d498964f962d822 Mon Sep 17 00:00:00 2001
From: Zman <35071129+Atlantisming@users.noreply.github.com>
Date: Wed, 7 Dec 2022 19:05:17 +0800
Subject: [PATCH 31/60] Update the English API docs (#48219)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update the examples of paddle.nn.dynamic_decode and paddle.nn.functional.diag_embed

* mma qk tensor_core (#48087)

* use mma for QK dot computing in fused_multi_transformer.

* Update fused_multi_transformer_op.cu.h

* remove lrn which is not used in paddle 2.0 (#47945)

* replace scatter_nd and scatter_nd_add with paddle.scatter_nd and (#47960)
paddle.scatter_nd_add

* [PHI] Migrate mul_grad kernel (#48061)

* cleanup unused code
* unify is_int8 is_bfloat16
* Simplify matmul_v2 FWD kernel
* remove RunKernel methods
* remove import namespace
* remove headers
* clean fluid/phi cross imports
* remove fluid axpy_handler
* delete fluid methods
* activations
* OneDNNMemDesc
* MKLDNNFormatForSize
* MatchShapeToLayout
* MKLDNNMemoryFormat
* MKLDNNFormat
* ReorderMKLDNNHandler
* to_void_cast
* review suggestions
* interpolate
* remove fluid dependency
* init
* ExecuteMatMulV2
* rm fluid kernel
* matmul_grad
* remove mutable_data
* mul_grad

* delete unnecessary shape and slice op (#48112)

* Update the English docs.

* Update the English docs for the segment operators and others.

* Rework the English doc format of paddle.einsum, paddle.unique_consecutive,
and paddle.disable_signal_handler.

* Rework the English doc format; test=docs_preview

* Update extension.py

* Rework the English doc format; test=docs_preview

* Rework the English doc format.
Ready for review:
- paddle.linalg.svd
- paddle.nn.functional.diag_embed
- paddle.set_grad_enabled
- paddle.disable_signal_handler
- paddle.cumprod
- paddle.device.cuda.stream_guard
Still to fix:
- paddle.nn.dynamic_decode
- paddle.einsum
- paddle.unique_consecutive
- paddle.linalg.svd
- paddle.incubate.segment_min
- paddle.incubate.segment_max
- paddle.incubate.segment_sum
- paddle.incubate.segment_mean
;test=docs_preview

* Rework the English doc format.
Ready for review:
- paddle.linalg.svd
- paddle.nn.functional.diag_embed
- paddle.set_grad_enabled
- paddle.disable_signal_handler
- paddle.cumprod
- paddle.device.cuda.stream_guard
- paddle.nn.dynamic_decode
- paddle.unique_consecutive
- paddle.linalg.svd
Still to fix:
- paddle.einsum
- paddle.incubate.segment_min
- paddle.incubate.segment_max
- paddle.incubate.segment_sum
- paddle.incubate.segment_mean
;test=docs_preview

* Rework the English doc format.
Ready for review:
- paddle.linalg.svd
- paddle.nn.functional.diag_embed
- paddle.set_grad_enabled
- paddle.disable_signal_handler
- paddle.cumprod
- paddle.device.cuda.stream_guard
- paddle.nn.dynamic_decode
- paddle.unique_consecutive
- paddle.linalg.svd
Still to fix:
- paddle.einsum
- paddle.incubate.segment_min
- paddle.incubate.segment_max
- paddle.incubate.segment_sum
- paddle.incubate.segment_mean
;test=docs_preview

* update

* test=docs_preview

* update formula; test=docs_preview

* update formula; test=docs_preview

* remove this operator; test=docs_preview

* add hyper link; test=docs_preview

* add default value; test=docs_preview

* update format; test=docs_preview

* empty commit; test=docs_preview

* fix codestyle issues; test=docs_preview

* empty commit; test=docs_preview

Co-authored-by: lzy <569782149@qq.com>
Co-authored-by: Vvsmile <450864116@qq.com>
Co-authored-by: Sławomir Siwek
Co-authored-by: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com>
Co-authored-by: Ligoml <39876205+Ligoml@users.noreply.github.com>
Co-authored-by: Nyakku Shigure
---
 python/paddle/device/cuda/__init__.py    |  4 +-
 python/paddle/fluid/framework.py         |  3 +-
 python/paddle/fluid/layers/rnn.py        | 25 ++++---
 python/paddle/framework/framework.py     |  3 +
 python/paddle/incubate/tensor/math.py    | 44 +++++++++----
 python/paddle/nn/functional/extension.py | 84 ++++++++++++------------
 python/paddle/tensor/einsum.py           | 31 ++++-----
 python/paddle/tensor/linalg.py           |  9 ++-
 python/paddle/tensor/manipulation.py     | 13 ++--
 python/paddle/tensor/math.py             | 10 ++-
 10 files changed, 131 insertions(+), 95 deletions(-)

diff --git a/python/paddle/device/cuda/__init__.py 
b/python/paddle/device/cuda/__init__.py index 316f9de612265..22ef453d08594 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -355,8 +355,8 @@ def _set_current_stream(stream): @signature_safe_contextmanager def stream_guard(stream): ''' - **Notes**: - **This API only supports dygraph mode currently.** + Notes: + This API only supports dynamic graph mode currently. A context manager that specifies the current stream context by the given stream. diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6aa1a32c55280..da4f609c401ac 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -786,7 +786,8 @@ def disable_signal_handler(): Make sure you called paddle.disable_signal_handler() before using above mentioned frameworks. - Returns: None + Returns: + None Examples: .. code-block:: python diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 6786f04292ba4..e6ad3de9f48f4 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1805,26 +1805,23 @@ def dynamic_decode( **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. Returns: - tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ - when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \ - The final outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as the :code:`outputs` \ - returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \ - is the stacked of all decoding steps' outputs, which might be revised \ - by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \ - `final_states` is the counterpart at last time step of initial states \ - returned by :code:`decoder.initialize()` , thus has the same structure \ - with it and has tensors with same shapes and data types. `sequence_lengths` \ - is an `int64` tensor with the same shape as `finished` returned \ - by :code:`decoder.initialize()` , and it stores the actual lengths of \ - all decoded sequences. + - final_outputs (Tensor, nested structure of Tensor), each Tensor in :code:`final_outputs` is the stacked of all decoding steps' outputs, which might be revised + by :code:`decoder.finalize()` if the decoder has implemented finalize. + And :code:`final_outputs` has the same structure and data types as the :code:`outputs` + returned by :code:`decoder.step()` + + - final_states (Tensor, nested structure of Tensor), :code:`final_states` is the counterpart at last time step of initial states \ + returned by :code:`decoder.initialize()` , thus has the same structure + with it and has tensors with same shapes and data types. + + - sequence_lengths (Tensor), stores the actual lengths of all decoded sequences. + sequence_lengths is provided only if :code:`return_length` is True. Examples: .. code-block:: python - import numpy as np import paddle from paddle.nn import BeamSearchDecoder, dynamic_decode from paddle.nn import GRUCell, Linear, Embedding diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index d8e1b79c259fb..e3b7519c4f846 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -93,6 +93,9 @@ def set_grad_enabled(mode): Args: mode(bool): whether to enable (`True`), or disable (`False`) grad. + Returns: + None. + Examples: .. 
code-block:: python

diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py
index 21e49135441ca..923f8a590bb52 100644
--- a/python/paddle/incubate/tensor/math.py
+++ b/python/paddle/incubate/tensor/math.py
@@ -31,9 +31,14 @@ def segment_sum(data, segment_ids, name=None):
     r"""
     Segment Sum Operator.
 
-    This operator sums the elements of input `data` which with
+    Sum the elements of input `data` which have
     the same index in `segment_ids`.
-    It computes a tensor such that $out_i = \\sum_{j} data_{j}$
+    It computes a tensor such that
+
+    .. math::
+
+        out_i = \sum_{j \in \{segment\_ids_j == i \} } data_{j}
+
     where sum is over j such that `segment_ids[j] == i`.
 
     Args:
@@ -45,7 +50,7 @@ def segment_sum(data, segment_ids, name=None):
         For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-       output (Tensor): the reduced result.
+        Tensor, the Segment Sum result.
 
     Examples:
 
@@ -93,11 +98,16 @@ def segment_sum(data, segment_ids, name=None):
 )
 def segment_mean(data, segment_ids, name=None):
     r"""
-    Segment mean Operator.
+    Segment Mean Operator.
 
     This operator calculates the mean value of the elements of input `data` which have
     the same index in `segment_ids`.
-    It computes a tensor such that $out_i = \\frac{1}{n_i} \\sum_{j} data[j]$
+    It computes a tensor such that
+
+    .. math::
+
+        out_i = \mathop{mean}_{j \in \{segment\_ids_j == i \} } data_{j}
+
     where sum is over j such that 'segment_ids[j] == i' and
     $n_i$ is the number of all index 'segment_ids[j] == i'.
 
@@ -110,7 +120,7 @@ def segment_mean(data, segment_ids, name=None):
         For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-       output (Tensor): the reduced result.
+        Tensor, the Segment Mean result.
 
     Examples:
 
@@ -161,9 +171,14 @@ def segment_min(data, segment_ids, name=None):
     r"""
     Segment min operator.
 
-    This operator calculate the minimum elements of input `data` which with
+    Calculate the minimum elements of input `data` which have
     the same index in `segment_ids`.
-    It computes a tensor such that $out_i = \\min_{j} data_{j}$
+    It computes a tensor such that
+
+    .. math::
+
+        out_i = \min_{j \in \{segment\_ids_j == i \} } data_{j}
+
     where min is over j such that `segment_ids[j] == i`.
 
     Args:
@@ -175,7 +190,7 @@ def segment_min(data, segment_ids, name=None):
         For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-       output (Tensor): the reduced result.
+        Tensor, the minimum result.
 
     Examples:
 
@@ -227,9 +242,14 @@ def segment_max(data, segment_ids, name=None):
     r"""
     Segment max operator.
 
-    This operator calculate the maximum elements of input `data` which with
+    Calculate the maximum elements of input `data` which have
    the same index in `segment_ids`.
-    It computes a tensor such that $out_i = \\max_{j} data_{j}$
+    It computes a tensor such that
+
+    .. math::
+
+        out_i = \max_{j \in \{segment\_ids_j == i \} } data_{j}
+
     where max is over j such that `segment_ids[j] == i`.
 
     Args:
@@ -241,7 +261,7 @@ def segment_max(data, segment_ids, name=None):
         For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-       output (Tensor): the reduced result.
+        Tensor, the maximum result.
 
Examples: diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index f3d906be1f3ed..1a3a719369fff 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -39,7 +39,7 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): """ - This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) + Creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) are filled by ``input``. By default, a 2D plane formed by the last two dimensions of the returned tensor will be selected. @@ -61,48 +61,48 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): Examples: .. code-block:: python + import paddle import paddle.nn.functional as F - import numpy as np - - diag_embed = np.random.randn(2, 3).astype('float32') - # [[ 0.7545889 , -0.25074545, 0.5929117 ], - # [-0.6097662 , -0.01753256, 0.619769 ]] - - data1 = F.diag_embed(diag_embed) - data1.numpy() - # [[[ 0.7545889 , 0. , 0. ], - # [ 0. , -0.25074545, 0. ], - # [ 0. , 0. , 0.5929117 ]], - - # [[-0.6097662 , 0. , 0. ], - # [ 0. , -0.01753256, 0. ], - # [ 0. , 0. , 0.619769 ]]] - - data2 = F.diag_embed(diag_embed, offset=-1, dim1=0, dim2=2) - data2.numpy() - # [[[ 0. , 0. , 0. , 0. ], - # [ 0.7545889 , 0. , 0. , 0. ], - # [ 0. , -0.25074545, 0. , 0. ], - # [ 0. , 0. , 0.5929117 , 0. ]], - # - # [[ 0. , 0. , 0. , 0. ], - # [-0.6097662 , 0. , 0. , 0. ], - # [ 0. , -0.01753256, 0. , 0. ], - # [ 0. , 0. , 0.619769 , 0. ]]] - - data3 = F.diag_embed(diag_embed, offset=1, dim1=0, dim2=2) - data3.numpy() - # [[[ 0. , 0.7545889 , 0. , 0. ], - # [ 0. , -0.6097662 , 0. , 0. ]], - # - # [[ 0. , 0. , -0.25074545, 0. ], - # [ 0. , 0. , -0.01753256, 0. ]], - # - # [[ 0. , 0. , 0. , 0.5929117 ], - # [ 0. , 0. , 0. , 0.619769 ]], - # - # [[ 0. , 0. , 0. , 0. ], - # [ 0. , 0. , 0. , 0. ]]] + + diag_embed_input = paddle.arange(6) + + diag_embed_output1 = F.diag_embed(diag_embed_input) + print(diag_embed_output1) + # Tensor(shape=[6, 6], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 0, 0, 0, 0, 0], + # [0, 1, 0, 0, 0, 0], + # [0, 0, 2, 0, 0, 0], + # [0, 0, 0, 3, 0, 0], + # [0, 0, 0, 0, 4, 0], + # [0, 0, 0, 0, 0, 5]]) + + diag_embed_output2 = F.diag_embed(diag_embed_input, offset=-1, dim1=0,dim2=1 ) + print(diag_embed_output2) + # Tensor(shape=[7, 7], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 0, 0, 0, 0, 0, 0], + # [0, 0, 0, 0, 0, 0, 0], + # [0, 1, 0, 0, 0, 0, 0], + # [0, 0, 2, 0, 0, 0, 0], + # [0, 0, 0, 3, 0, 0, 0], + # [0, 0, 0, 0, 4, 0, 0], + # [0, 0, 0, 0, 0, 5, 0]]) + + diag_embed_input_2dim = paddle.reshape(diag_embed_input,[2,3]) + print(diag_embed_input_2dim) + # Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 1, 2], + # [3, 4, 5]]) + diag_embed_output3 = F.diag_embed(diag_embed_input_2dim,offset= 0, dim1=0, dim2=2 ) + print(diag_embed_output3) + # Tensor(shape=[3, 2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[[0, 0, 0], + # [3, 0, 0]], + + # [[0, 1, 0], + # [0, 4, 0]], + + # [[0, 0, 2], + # [0, 0, 5]]]) """ if not isinstance(input, Variable): input = assign(input) diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 50718b64409e9..1aad124b2cace 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -868,7 +868,7 @@ def einsum(equation, *operands): einsum(equation, *operands) - The current version of this API should be used in dygraph only mode. 
+    The current version of this API should be used in dynamic graph only mode.
 
     Einsum offers a tensor operation API which allows using the Einstein summation
     convention or Einstain notation. It takes as input one or multiple tensors and
@@ -901,20 +901,21 @@ def einsum(equation, *operands):
         dimensions into broadcasting dimensions.
       - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled
         dimensions will be reduced and removed in the output.
-      - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
-        - Inference of output labels
-          - Broadcasting label `...`, if present, is put on the leftmost position.
-          - Free labels are reordered alphabetically and put after `...`.
-        - On explicit output labels
-          - If broadcasting is enabled, then `...` must be present.
-          - The output labels can be an empty, an indication to output as a scalar
-            the sum over the original output.
-          - Non-input labels are invalid.
-          - Duplicate labels are invalid.
-          - For any dummy label which is present for the output, it's promoted to
-            a free label.
-          - For any free label which is not present for the output, it's lowered to
-            a dummy label.
+      - Output labels can be explicitly specified on the right hand side of `->` or omitted.
+        In the latter case, the output labels will be inferred from the input labels.
+        - Inference of output labels
+          - Broadcasting label `...`, if present, is put on the leftmost position.
+          - Free labels are reordered alphabetically and put after `...`.
+        - On explicit output labels
+          - If broadcasting is enabled, then `...` must be present.
+          - The output labels can be an empty, an indication to output as a scalar
+            the sum over the original output.
+          - Non-input labels are invalid.
+          - Duplicate labels are invalid.
+          - For any dummy label which is present for the output, it's promoted to
+            a free label.
+          - For any free label which is not present for the output, it's lowered to
+            a dummy label.
 
     - Examples
       - '...ij, ...jk', where i and k are free labels, j is dummy. The output label
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 0ffae882ee5f2..2a3ae8001e743 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -2030,16 +2030,21 @@ def svd(x, full_matrices=False, name=None):
             where `...` is zero or more batch dimensions. N and M can be arbitraty positive number.
             Note that if x is sigular matrices, the grad is numerical instable.
             The data type of x should be float32 or float64.
-        full_matrices (bool): A flag to control the behavor of svd.
+        full_matrices (bool, optional): A flag to control the behavior of svd.
            If full_matrices = True, svd op will compute full U and V matrics,
            which means shape of U is `[..., N, N]`, shape of V is `[..., M, M]`. K = min(M, N).
            If full_matrices = False, svd op will use a economic method to store U and V.
            which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N).
+           Default value is False.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Tuple of 3 tensors: (U, S, VH). VH is the conjugate transpose of V. S is the singlar value vectors of matrics with shape `[..., K]`
+        - U (Tensor), the singular value decomposition result U.
+        - S (Tensor), the singular value decomposition result S.
+        - VH (Tensor), the conjugate transpose of V, which is the singular value decomposition result V.
+
+        Tuple of 3 tensors (U, S, VH): VH is the conjugate transpose of V. S is the singular value vectors of matrices with shape `[..., K]`
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 60272630b2199..06a229106347c 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -2278,12 +2278,12 @@ def unique_consecutive(
     dtype="int64",
     name=None,
 ):
-    r"""
+    """
     Eliminates all but the first element from every consecutive group of equivalent elements.
 
     Note:
-        This function is different from :func:`paddle.unique` in the sense that this function
-        only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++.
+        This function is different from :ref:`api_paddle_unique` in the sense that this function
+        only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++.
 
     Args:
         x(Tensor): the input tensor, it's data type should be float32, float64, int32, int64.
@@ -2299,7 +2299,12 @@ def unique_consecutive(
         :ref:`api_guide_Name`. Default is None.
 
     Returns:
-        tuple (out, inverse, counts). `out` is the unique consecutive tensor for `x`. `inverse` is provided only if `return_inverse` is True. `counts` is provided only if `return_counts` is True.
+        - out (Tensor), the unique consecutive tensor for x.
+        - inverse (Tensor), the index in the unique consecutive tensor to which
+          each element of the input tensor corresponds.
+          inverse is provided only if return_inverse is True.
+        - counts (Tensor), the counts of every unique consecutive element in the input tensor.
+          counts is provided only if return_counts is True.
 
     Example:
         .. code-block:: python
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index efa8d7c453b80..883d3c0e3aeb4 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -3449,9 +3449,13 @@ def cumprod(x, dim=None, dtype=None, name=None):
 
     Args:
         x (Tensor): the input tensor need to be cumproded.
-        dim (int): the dimension along which the input tensor will be accumulated. It need to be in the range of [-x.rank, x.rank), where x.rank means the dimensions of the input tensor x and -1 means the last dimension.
-        dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64, complex64, complex128. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None.
-        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+        dim (int, optional): the dimension along which the input tensor will be accumulated. It needs to be in the range of [-x.rank, x.rank),
+            where x.rank means the dimensions of the input tensor x and -1 means the last dimension.
+        dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64, complex64,
+            complex128. If specified, the input tensor is casted to dtype before the operation is performed.
+            This is useful for preventing data type overflows. The default value is None.
+        name (str, optional): Name for the operation (optional, default is None). For more information,
+            please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor, the result of cumprod operator. 
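For reference, the segment reductions documented in the patch above compute, for each output row i, a reduction over the input rows j with segment_ids[j] == i. A minimal NumPy sketch of these semantics (illustrative only, not Paddle's implementation):

    import numpy as np

    def segment_sum_ref(data, segment_ids):
        # out[i] = sum of data[j] over all j with segment_ids[j] == i
        num_segments = int(segment_ids.max()) + 1
        out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
        np.add.at(out, segment_ids, data)  # unbuffered scatter-add along axis 0
        return out

    data = np.array([[1., 2., 3.], [3., 2., 1.], [4., 5., 6.]])
    segment_ids = np.array([0, 0, 1])
    print(segment_sum_ref(data, segment_ids))
    # [[4. 4. 4.]
    #  [4. 5. 6.]]

segment_mean, segment_min, and segment_max follow the same pattern with the reduction swapped.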
From ad41fce85799e2a4b1390b0a8511e56ae6ff326d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 7 Dec 2022 12:06:20 +0100 Subject: [PATCH 32/60] [PHI] Migrate squeeze and squeeze_grad kernels (#48634) * squeeze kernel * squeze fwd * whitespace --- .../operators/mkldnn/reshape_mkldnn_op.cc | 39 +-------- .../phi/kernels/onednn/squeeze_grad_kernel.cc | 59 +++++++++++++ paddle/phi/kernels/onednn/squeeze_kernel.cc | 85 +++++++++++++++++++ 3 files changed, 147 insertions(+), 36 deletions(-) create mode 100644 paddle/phi/kernels/onednn/squeeze_grad_kernel.cc create mode 100644 paddle/phi/kernels/onednn/squeeze_kernel.cc diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 0c2b439b3e510..65a49dab27df2 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -21,7 +21,6 @@ enum class ReshapeKernelOpName { reshape, reshape2, squeeze, - squeeze2, flatten, flatten2, }; @@ -106,9 +105,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { case ReshapeKernelOpName::squeeze: InferShapeSqueezeOp(ctx, x_dims, out_dims); break; - case ReshapeKernelOpName::squeeze2: - InferShapeSqueeze2Op(ctx, x_dims, out_dims); - break; case ReshapeKernelOpName::flatten: InferShapeFlattenOp(ctx, x_dims, out_dims); break; @@ -172,16 +168,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { out_dims = GetOutputShape(axes, x_dims, true); } - void InferShapeSqueeze2Op(const framework::ExecutionContext& ctx, - framework::DDim& x_dims, // NOLINT - framework::DDim& out_dims) const { // NOLINT - auto* out = ctx.Output("Out"); - auto* xshape = ctx.Output("XShape"); - auto xshape_dims = xshape->dims(); - x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - out_dims = out->dims(); - } - void InferShapeFlattenOp(const framework::ExecutionContext& ctx, framework::DDim& x_dims, // NOLINT framework::DDim& out_dims) const { // NOLINT @@ -342,19 +328,16 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { InferShapeReshapeSqueezeGradOp(ctx, x_dims); break; case ReshapeKernelOpName::reshape2: - InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + InferShapeReshape2Flatten2GradOp(ctx, x_dims); break; case ReshapeKernelOpName::squeeze: InferShapeReshapeSqueezeGradOp(ctx, x_dims); break; - case ReshapeKernelOpName::squeeze2: - InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); - break; case ReshapeKernelOpName::flatten: InferShapeFlattenGradOp(ctx, x_dims); break; case ReshapeKernelOpName::flatten2: - InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + InferShapeReshape2Flatten2GradOp(ctx, x_dims); break; default: PADDLE_THROW(paddle::platform::errors::OutOfRange( @@ -369,7 +352,7 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { dx_dims = dx->dims(); } - void InferShapeReshape2Squeeze2Flatten2GradOp( + void InferShapeReshape2Flatten2GradOp( const framework::ExecutionContext& ctx, framework::DDim& dx_dims) const { // NOLINT auto xshape_dims = ctx.Input("XShape")->dims(); @@ -401,22 +384,6 @@ REGISTER_OP_KERNEL( ops::ReshapeGradMKLDNNKernel); -REGISTER_OP_KERNEL( - squeeze2, - MKLDNN, - paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL( - squeeze2_grad, - MKLDNN, - paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); - REGISTER_OP_KERNEL( reshape, MKLDNN, diff --git a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc 
b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc
new file mode 100644
index 0000000000000..654acfe5700c3
--- /dev/null
+++ b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_grad_kernel.h"
+
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SqueezeGradKernel(const Context& dev_ctx,
+                       const DenseTensor& xshape,
+                       const DenseTensor& dout,
+                       const IntArray& axes,
+                       DenseTensor* dx) {
+  auto dout_vec_dims = vectorize(dout.dims());
+  auto dout_type = funcs::ToOneDNNDataType(dout.dtype());
+
+  funcs::ReorderOneDNNHandler reorder_handler(
+      dout_vec_dims, dout.dtype(), dout_type, dev_ctx.GetEngine());
+
+  auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+      dout.mem_desc(), funcs::to_void_cast<T>(dout.data<T>()));
+  auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+      dx,
+      funcs::GetPlainOneDNNFormat(dout_vec_dims.size()),
+      dev_ctx.GetPlace());
+  auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p,
+                                                  reorder_src_memory_p);
+
+  auto& astream = OneDNNContext::tls().get_stream();
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+
+  auto dx_dims = slice_ddim(xshape.dims(), 1, xshape.dims().size());
+  dx->Resize(dx_dims);
+  reorder_dst_memory_p->get_desc().reshape(vectorize(dx_dims));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(squeeze_grad,
+                   OneDNN,
+                   ONEDNN,
+                   phi::SqueezeGradKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/onednn/squeeze_kernel.cc b/paddle/phi/kernels/onednn/squeeze_kernel.cc
new file mode 100644
index 0000000000000..eb7663f8e41b2
--- /dev/null
+++ b/paddle/phi/kernels/onednn/squeeze_kernel.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_kernel.h"
+
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExecuteSqueeze(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DDim& x_dims,
+                    const DDim& out_dims,
+                    DenseTensor* out) {
+  auto x_vec_dims = vectorize(x_dims);
+
+  funcs::ReorderOneDNNHandler reorder_handler(
+      x_vec_dims,
+      x.dtype(),
+      funcs::ToOneDNNDataType(x.dtype()),
+      dev_ctx.GetEngine());
+
+  auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+      x.mem_desc(), funcs::to_void_cast<T>(x.data<T>()));
+  out->Resize(x_dims);  // to match x numel, format is changed later
+  // reorder is done into a plain tag to allow usage with blocked formats
+  auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+      out, funcs::GetPlainOneDNNFormat(x_dims.size()), dev_ctx.GetPlace());
+  auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p,
+                                                  reorder_src_memory_p);
+  auto& astream = OneDNNContext::tls().get_stream();
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+
+  out->Resize(out_dims);
+  out->set_mem_desc(
+      reorder_dst_memory_p->get_desc().reshape(vectorize(out_dims)));
+}
+
+template <typename T, typename Context>
+void SqueezeKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const IntArray& axes,
+                   DenseTensor* out) {
+  auto x_dims = x.dims();
+  std::vector<int32_t> tmp(axes.GetData().begin(), axes.GetData().end());
+  auto out_dims = funcs::GetOutputSqueezeShape(tmp, x_dims, true);
+  ExecuteSqueeze<T, Context>(dev_ctx, x, x_dims, out_dims, out);
+}
+
+template <typename T, typename Context>
+void SqueezeWithXShapeKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const IntArray& axes,
+                             DenseTensor* out,
+                             DenseTensor* xshape) {
+  auto x_dims = slice_ddim(xshape->dims(), 1, xshape->dims().size());
+  auto out_dims = out->dims();
+  ExecuteSqueeze<T, Context>(dev_ctx, x, x_dims, out_dims, out);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    squeeze, OneDNN, ONEDNN, phi::SqueezeKernel, float, phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(squeeze_with_xshape,
+                   OneDNN,
+                   ONEDNN,
+                   phi::SqueezeWithXShapeKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
From e677b5e515ff42ef4818b69e1fb98ae21b42568d Mon Sep 17 00:00:00 2001
From: hjyp <53164956+Tomoko-hjf@users.noreply.github.com>
Date: Wed, 7 Dec 2022 20:02:53 +0800
Subject: =?UTF-8?q?=E4=BF=AE=E5=A4=8Dpaddle.nn.functinal?=
 =?UTF-8?q?=E5=8C=85=E5=92=8Cpaddle.nn=E5=8C=85=E4=B8=8BAPI=E6=96=87?=
 =?UTF-8?q?=E6=A1=A3=20(#48581)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 python/paddle/nn/functional/conv.py | 8 ++++----
 python/paddle/nn/layer/conv.py      | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 9b5f63254809b..e636ee3cf9c5b 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -614,10 +614,10 @@ def conv2d(
            the number of output channels, g is the number of groups, kH is the filter's
            height, kW is the filter's width.
        bias (Tensor, optional): The bias with shape [M,].
-        stride (int|list|tuple): The stride size. It means the stride in convolution.
+        stride (int|list|tuple, optional): The stride size. It means the stride in convolution.
            If stride is a list/tuple, it must contain two integers, (stride_height, stride_width).
            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
-        padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
+        padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings
            on both sides for each dimension. If `padding` is a string, either 'VALID' or
            'SAME' which is the padding algorithm. If padding size is a tuple or list,
            it could be in three forms: `[pad_height, pad_width]` or
@@ -627,11 +627,11 @@ def conv2d(
            when `data_format` is `"NHWC"`, `padding` can be in the form
            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
            Default: padding = 0.
-        dilation (int|list|tuple): The dilation size. It means the spacing between the kernel
+        dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel
            points. If dilation is a list/tuple, it must contain two integers, (dilation_height,
            dilation_width). Otherwise, dilation_height = dilation_width = dilation.
            Default: dilation = 1.
-        groups (int): The groups number of the Conv2D Layer. According to grouped
+        groups (int, optional): The groups number of the Conv2D Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
            the first half of the filters is only connected to the first half
            of the input channels, while the second half of the filters is only
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index a3d719f67c182..3131be49b5765 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -449,7 +449,7 @@ class Conv1DTranspose(_ConvNd):
        in_channels(int): The number of channels in the input image.
        out_channels(int): The number of the filter. It is the same as the output
            feature map.
-        kernel_size(int|tuple|list, optional): The filter size. If kernel_size is a tuple/list,
+        kernel_size(int|tuple|list): The filter size. If kernel_size is a tuple/list,
            it must contain one integer, (kernel_size). None if use output size to
            calculate kernel_size. Default: None. kernel_size and output_size
            should not be None at the same time.
@@ -598,7 +598,7 @@ class Conv2D(_ConvNd):
    Parameters:
        in_channels(int): The number of input channels in the input image.
        out_channels(int): The number of output channels produced by the convolution.
-        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
+        kernel_size(int|list|tuple): The size of the convolving kernel.
        stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. The default value is 1.
@@ -925,7 +925,7 @@ class Conv3D(_ConvNd):
    Parameters:
        in_channels(int): The number of input channels in the input image.
        out_channels(int): The number of output channels produced by the convolution.
-        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
+        kernel_size(int|list|tuple): The size of the convolving kernel.
        stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
            stride_D = stride_H = stride_W = stride. The default value is 1.
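A minimal sketch of the defaults these docstrings now mark as optional (illustrative only, not part of the patch; it assumes a working `paddle` install, and the shapes in the comments follow from the documented output formulas):

    .. code-block:: python

        import paddle
        import paddle.nn.functional as F

        x = paddle.randn([1, 3, 8, 8])  # NCHW input
        w = paddle.randn([4, 3, 3, 3])  # [out_channels, in_channels/groups, kH, kW]

        # stride, padding, dilation and groups are all optional (default 1, 0, 1, 1).
        y = F.conv2d(x, w)                  # -> shape [1, 4, 6, 6]
        y = F.conv2d(x, w, padding='SAME')  # string padding keeps H/W -> [1, 4, 8, 8]

        # In the layer API only in_channels, out_channels and kernel_size are required.
        conv = paddle.nn.Conv2D(in_channels=3, out_channels=4, kernel_size=3)
        y = conv(x)                         # -> shape [1, 4, 6, 6]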
From a5b3a65aef5eadc406e52f9205fc762153ca594c Mon Sep 17 00:00:00 2001
From: Vigi Zhang
Date: Wed, 7 Dec 2022 20:13:15 +0800
Subject: [PATCH 34/60] assign cve number to pdsa, test=document_fix (#48846)

---
 security/advisory/pdsa-2022-001.md    | 4 ++++
 security/advisory/pdsa-2022-001_cn.md | 4 ++++
 security/advisory/pdsa-2022-002.md    | 4 ++++
 security/advisory/pdsa-2022-002_cn.md | 4 ++++
 4 files changed, 16 insertions(+)

diff --git a/security/advisory/pdsa-2022-001.md b/security/advisory/pdsa-2022-001.md
index 7b409020abb6b..596b314e749cf 100644
--- a/security/advisory/pdsa-2022-001.md
+++ b/security/advisory/pdsa-2022-001.md
@@ -1,5 +1,9 @@
 ## PDSA-2022-001: OOB read in gather_tree
 
+### CVE Number
+
+CVE-2022-46741
+
 ### Impact
 
 The PoC is as follows:
diff --git a/security/advisory/pdsa-2022-001_cn.md b/security/advisory/pdsa-2022-001_cn.md
index ce5f4e3cf4d58..60e428e2adddf 100644
--- a/security/advisory/pdsa-2022-001_cn.md
+++ b/security/advisory/pdsa-2022-001_cn.md
@@ -1,5 +1,9 @@
 ## PDSA-2022-001: OOB read in gather_tree
 
+### CVE编号
+
+CVE-2022-46741
+
 ### 影响
 
 PoC如下:
diff --git a/security/advisory/pdsa-2022-002.md b/security/advisory/pdsa-2022-002.md
index efb8e931722bb..a3171eab74745 100644
--- a/security/advisory/pdsa-2022-002.md
+++ b/security/advisory/pdsa-2022-002.md
@@ -1,5 +1,9 @@
 ## PDSA-2022-002: Code injection in paddle.audio.functional.get_window
 
+### CVE Number
+
+CVE-2022-46742
+
 ### Impact
 
 `paddle.audio.functional.get_window` is vulnerable to a code injection as it calls `eval` on user supplied `winstr`. This may lead to arbitrary code execution.
diff --git a/security/advisory/pdsa-2022-002_cn.md b/security/advisory/pdsa-2022-002_cn.md
index 84fc365fbbcd8..f2d7ca9c86507 100644
--- a/security/advisory/pdsa-2022-002_cn.md
+++ b/security/advisory/pdsa-2022-002_cn.md
@@ -1,5 +1,9 @@
 ## PDSA-2022-002: Code injection in paddle.audio.functional.get_window
 
+### CVE编号
+
+CVE-2022-46742
+
 ### 影响
 
 `paddle.audio.functional.get_window`由于使用`eval`用户提供的参数`winstr`而存在代码注入漏洞,将导致任意代码执行。
From d3e9e73a8ad636c2743fb7edf66468c85a6e36c1 Mon Sep 17 00:00:00 2001
From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com>
Date: Wed, 7 Dec 2022 21:03:05 +0800
Subject: [PATCH 35/60] [fluid remove]: remove paddle.fluid.layers.yolo_box and
 paddle.fluid.layers.yolov3_loss (#48722)

* remove paddle.fluid.layers.nn.temporal_shift

* code check

* rm unittest

* remove fluid.yolo_box

* remove fluid.yolov3_loss

* change the comments of yolov3_loss to yolo_loss
---
 python/paddle/fluid/layers/detection.py       | 258 ------------------
 python/paddle/fluid/tests/test_detection.py   |  69 -----
 .../unittests/dygraph_to_static/yolov3.py     |   4 +-
 .../unittests/ipu/test_yolo_box_op_ipu.py     |   2 +-
 .../ir/inference/test_trt_yolo_box_op.py      |  14 +-
 .../tests/unittests/test_device_guard.py      |   4 +-
 .../unittests/xpu/test_device_guard_xpu.py    |   4 +-
 7 files changed, 14 insertions(+), 341 deletions(-)

diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3d277705aa90e..274919197827b 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -52,8 +52,6 @@
     'iou_similarity',
     'box_coder',
     'polygon_box_transform',
-    'yolov3_loss',
-    'yolo_box',
     'box_clip',
     'multiclass_nms',
     'locality_aware_nms',
@@ -435,262 +433,6 @@ def polygon_box_transform(input, name=None):
     return output
 
 
-@deprecated(since="2.0.0", update_to="paddle.vision.ops.yolo_loss")
-@templatedoc(op_type="yolov3_loss")
-def yolov3_loss(
-    x,
-    gt_box,
-    gt_label,
-    anchors,
-    anchor_mask,
-
class_num, - ignore_thresh, - downsample_ratio, - gt_score=None, - use_label_smooth=True, - name=None, - scale_x_y=1.0, -): - """ - - ${comment} - - Args: - x (Variable): ${x_comment}The data type is float32 or float64. - gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4], - in the third dimension, x, y, w, h should be stored. - x,y is the center coordinate of boxes, w, h are the - width and height, x, y, w, h should be divided by - input image height to scale to [0, 1]. - N is the batch number and B is the max box number in - an image.The data type is float32 or float64. - gt_label (Variable): class id of ground truth boxes, should be in shape - of [N, B].The data type is int32. - anchors (list|tuple): ${anchors_comment} - anchor_mask (list|tuple): ${anchor_mask_comment} - class_num (int): ${class_num_comment} - ignore_thresh (float): ${ignore_thresh_comment} - downsample_ratio (int): ${downsample_ratio_comment} - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, - please refer to :ref:`api_guide_Name` - gt_score (Variable): mixup score of ground truth boxes, should be in shape - of [N, B]. Default None. - use_label_smooth (bool): ${use_label_smooth_comment} - scale_x_y (float): ${scale_x_y_comment} - - Returns: - Variable: A 1-D tensor with shape [N], the value of yolov3 loss - - Raises: - TypeError: Input x of yolov3_loss must be Variable - TypeError: Input gtbox of yolov3_loss must be Variable - TypeError: Input gtlabel of yolov3_loss must be Variable - TypeError: Input gtscore of yolov3_loss must be None or Variable - TypeError: Attr anchors of yolov3_loss must be list or tuple - TypeError: Attr class_num of yolov3_loss must be an integer - TypeError: Attr ignore_thresh of yolov3_loss must be a float number - TypeError: Attr use_label_smooth of yolov3_loss must be a bool value - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - x = fluid.data(name='x', shape=[None, 255, 13, 13], dtype='float32') - gt_box = fluid.data(name='gt_box', shape=[None, 6, 4], dtype='float32') - gt_label = fluid.data(name='gt_label', shape=[None, 6], dtype='int32') - gt_score = fluid.data(name='gt_score', shape=[None, 6], dtype='float32') - anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] - anchor_mask = [0, 1, 2] - loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label, - gt_score=gt_score, anchors=anchors, - anchor_mask=anchor_mask, class_num=80, - ignore_thresh=0.7, downsample_ratio=32) - """ - - if not isinstance(x, Variable): - raise TypeError("Input x of yolov3_loss must be Variable") - if not isinstance(gt_box, Variable): - raise TypeError("Input gtbox of yolov3_loss must be Variable") - if not isinstance(gt_label, Variable): - raise TypeError("Input gtlabel of yolov3_loss must be Variable") - if gt_score is not None and not isinstance(gt_score, Variable): - raise TypeError("Input gtscore of yolov3_loss must be Variable") - if not isinstance(anchors, list) and not isinstance(anchors, tuple): - raise TypeError("Attr anchors of yolov3_loss must be list or tuple") - if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): - raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple") - if not isinstance(class_num, int): - raise TypeError("Attr class_num of yolov3_loss must be an integer") - if not isinstance(ignore_thresh, float): - raise TypeError( - "Attr ignore_thresh of yolov3_loss must be a float number" - ) - if not isinstance(use_label_smooth, bool): - raise TypeError( - "Attr use_label_smooth of yolov3_loss must be a bool value" - ) - - if _non_static_mode(): - attrs = ( - "anchors", - anchors, - "anchor_mask", - anchor_mask, - "class_num", - class_num, - "ignore_thresh", - ignore_thresh, - "downsample_ratio", - downsample_ratio, - "use_label_smooth", - use_label_smooth, - "scale_x_y", - scale_x_y, - ) - loss, _, _ = _legacy_C_ops.yolov3_loss( - x, gt_box, gt_label, gt_score, *attrs - ) - return loss - - helper = LayerHelper('yolov3_loss', **locals()) - loss = helper.create_variable_for_type_inference(dtype=x.dtype) - objectness_mask = helper.create_variable_for_type_inference(dtype='int32') - gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') - - inputs = { - "X": x, - "GTBox": gt_box, - "GTLabel": gt_label, - } - if gt_score is not None: - inputs["GTScore"] = gt_score - - attrs = { - "anchors": anchors, - "anchor_mask": anchor_mask, - "class_num": class_num, - "ignore_thresh": ignore_thresh, - "downsample_ratio": downsample_ratio, - "use_label_smooth": use_label_smooth, - "scale_x_y": scale_x_y, - } - - helper.append_op( - type='yolov3_loss', - inputs=inputs, - outputs={ - 'Loss': loss, - 'ObjectnessMask': objectness_mask, - 'GTMatchMask': gt_match_mask, - }, - attrs=attrs, - ) - return loss - - -@deprecated(since="2.0.0", update_to="paddle.vision.ops.yolo_box") -@templatedoc(op_type="yolo_box") -def yolo_box( - x, - img_size, - anchors, - class_num, - conf_thresh, - downsample_ratio, - clip_bbox=True, - name=None, - scale_x_y=1.0, - iou_aware=False, - iou_aware_factor=0.5, -): - """ - - ${comment} - - Args: - x (Variable): ${x_comment} The data type is float32 or float64. - img_size (Variable): ${img_size_comment} The data type is int32. 
- anchors (list|tuple): ${anchors_comment} - class_num (int): ${class_num_comment} - conf_thresh (float): ${conf_thresh_comment} - downsample_ratio (int): ${downsample_ratio_comment} - clip_bbox (bool): ${clip_bbox_comment} - scale_x_y (float): ${scale_x_y_comment} - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, - please refer to :ref:`api_guide_Name` - iou_aware (bool): ${iou_aware_comment} - iou_aware_factor (float): ${iou_aware_factor_comment} - - Returns: - Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, - and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification - scores of boxes. - - Raises: - TypeError: Input x of yolov_box must be Variable - TypeError: Attr anchors of yolo box must be list or tuple - TypeError: Attr class_num of yolo box must be an integer - TypeError: Attr conf_thresh of yolo box must be a float number - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - x = fluid.data(name='x', shape=[None, 255, 13, 13], dtype='float32') - img_size = fluid.data(name='img_size',shape=[None, 2],dtype='int64') - anchors = [10, 13, 16, 30, 33, 23] - boxes,scores = fluid.layers.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, - conf_thresh=0.01, downsample_ratio=32) - """ - helper = LayerHelper('yolo_box', **locals()) - - if not isinstance(x, Variable): - raise TypeError("Input x of yolo_box must be Variable") - if not isinstance(img_size, Variable): - raise TypeError("Input img_size of yolo_box must be Variable") - if not isinstance(anchors, list) and not isinstance(anchors, tuple): - raise TypeError("Attr anchors of yolo_box must be list or tuple") - if not isinstance(class_num, int): - raise TypeError("Attr class_num of yolo_box must be an integer") - if not isinstance(conf_thresh, float): - raise TypeError("Attr ignore_thresh of yolo_box must be a float number") - - boxes = helper.create_variable_for_type_inference(dtype=x.dtype) - scores = helper.create_variable_for_type_inference(dtype=x.dtype) - - attrs = { - "anchors": anchors, - "class_num": class_num, - "conf_thresh": conf_thresh, - "downsample_ratio": downsample_ratio, - "clip_bbox": clip_bbox, - "scale_x_y": scale_x_y, - "iou_aware": iou_aware, - "iou_aware_factor": iou_aware_factor, - } - - helper.append_op( - type='yolo_box', - inputs={ - "X": x, - "ImgSize": img_size, - }, - outputs={ - 'Boxes': boxes, - 'Scores': scores, - }, - attrs=attrs, - ) - return boxes, scores - - @templatedoc() def detection_map( detect_res, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index cf2523947f0d2..a2745bbca8e71 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -528,75 +528,6 @@ def test_generate_proposals(self): np.testing.assert_array_equal(np.array(rois_num_stat), rois_num_dy) -class TestYoloDetection(unittest.TestCase): - def test_yolov3_loss(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32') - gt_label = layers.data(name='gt_label', shape=[10], dtype='int32') - gt_score = layers.data(name='gt_score', shape=[10], dtype='float32') - loss = layers.yolov3_loss( - x, - gt_box, - gt_label, - [10, 13, 30, 13], - [0, 1], - 10, - 0.7, - 32, - gt_score=gt_score, - use_label_smooth=False, - ) - - 
self.assertIsNotNone(loss) - - def test_yolo_box(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - img_size = layers.data(name='img_size', shape=[2], dtype='int32') - boxes, scores = layers.yolo_box( - x, img_size, [10, 13, 30, 13], 10, 0.01, 32 - ) - self.assertIsNotNone(boxes) - self.assertIsNotNone(scores) - - def test_yolov3_loss_with_scale(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32') - gt_label = layers.data(name='gt_label', shape=[10], dtype='int32') - gt_score = layers.data(name='gt_score', shape=[10], dtype='float32') - loss = layers.yolov3_loss( - x, - gt_box, - gt_label, - [10, 13, 30, 13], - [0, 1], - 10, - 0.7, - 32, - gt_score=gt_score, - use_label_smooth=False, - scale_x_y=1.2, - ) - - self.assertIsNotNone(loss) - - def test_yolo_box_with_scale(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - img_size = layers.data(name='img_size', shape=[2], dtype='int32') - boxes, scores = layers.yolo_box( - x, img_size, [10, 13, 30, 13], 10, 0.01, 32, scale_x_y=1.2 - ) - self.assertIsNotNone(boxes) - self.assertIsNotNone(scores) - - class TestBoxClip(unittest.TestCase): def test_box_clip(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py index 1c1877681c4b1..2fe1f652cce48 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py @@ -314,7 +314,7 @@ def forward( for i, out in enumerate(self.outputs): anchor_mask = cfg.anchor_masks[i] if self.is_train: - loss = fluid.layers.yolov3_loss( + loss = paddle.vision.ops.yolo_loss( x=out, gt_box=self.gtbox, gt_label=self.gtlabel, @@ -333,7 +333,7 @@ def forward( for m in anchor_mask: mask_anchors.append(cfg.anchors[2 * m]) mask_anchors.append(cfg.anchors[2 * m + 1]) - boxes, scores = fluid.layers.yolo_box( + boxes, scores = paddle.vision.ops.yolo_box( x=out, img_size=self.im_shape, anchors=mask_anchors, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py index 1248eb10921cb..40c56af922833 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py @@ -65,7 +65,7 @@ def build_model(self): 'value': 6, } img_size = paddle.fluid.layers.fill_constant(**attrs) - out = paddle.fluid.layers.yolo_box(x=x, img_size=img_size, **self.attrs) + out = paddle.vision.ops.yolo_box(x=x, img_size=img_size, **self.attrs) self.fetch_list = [x.name for x in out] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py index 42a65f7f79fd1..a578c5216f3e8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py @@ -17,8 +17,8 @@ import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid -import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig, PassVersionChecker @@ -56,7 +56,7 @@ def 
set_params(self): self.downsample_ratio = 32 def append_yolobox(self, image, image_size): - return fluid.layers.yolo_box( + return paddle.vision.ops.yolo_box( x=image, img_size=image_size, class_num=self.class_num, @@ -66,7 +66,7 @@ def append_yolobox(self, image, image_size): ) def test_check_output(self): - if core.is_compiled_with_cuda(): + if paddle.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, flatten=True) self.assertTrue( @@ -106,7 +106,7 @@ def set_params(self): self.downsample_ratio = 32 def append_yolobox(self, image, image_size): - return fluid.layers.yolo_box( + return paddle.vision.ops.yolo_box( x=image, img_size=image_size, class_num=self.class_num, @@ -116,7 +116,7 @@ def append_yolobox(self, image, image_size): ) def test_check_output(self): - if core.is_compiled_with_cuda(): + if paddle.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, flatten=True, rtol=1e-1) self.assertTrue( @@ -160,7 +160,7 @@ def set_params(self): self.iou_aware_factor = 0.5 def append_yolobox(self, image, image_size): - return fluid.layers.yolo_box( + return paddle.vision.ops.yolo_box( x=image, img_size=image_size, class_num=self.class_num, @@ -172,7 +172,7 @@ def append_yolobox(self, image, image_size): ) def test_check_output(self): - if core.is_compiled_with_cuda(): + if paddle.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, flatten=True) self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py index d62893de97c7d..eff076c6a7871 100644 --- a/python/paddle/fluid/tests/unittests/test_device_guard.py +++ b/python/paddle/fluid/tests/unittests/test_device_guard.py @@ -127,8 +127,8 @@ def test_cpu_only_op(self): ] anchor_mask = [0, 1, 2] with paddle.static.device_guard("gpu"): - # yolov3_loss only has cpu kernel, so its cpu kernel will be executed - loss = fluid.layers.yolov3_loss( + # yolo_loss only has cpu kernel, so its cpu kernel will be executed + loss = paddle.vision.ops.yolo_loss( x=x, gt_box=gt_box, gt_label=gt_label, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py index 3e126318df2ae..6de4b3f07b237 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py @@ -133,8 +133,8 @@ def test_cpu_only_op(self): ] anchor_mask = [0, 1, 2] with paddle.static.device_guard("xpu"): - # yolov3_loss only has cpu kernel, so its cpu kernel will be executed - loss = fluid.layers.yolov3_loss( + # yolo_loss has cpu kernel, so its cpu kernel will be executed + loss = paddle.vision.ops.yolo_loss( x=x, gt_box=gt_box, gt_label=gt_label, From e89a50c18f81dfe4bae280614c78539d99b4251a Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 8 Dec 2022 09:42:09 +0800 Subject: [PATCH 36/60] Try add eval() to speedup the eigen performance. 
(#48855) --- paddle/phi/kernels/impl/logsumexp_kernel_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h index cc5057396265c..9c4ee034b7409 100644 --- a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h +++ b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h @@ -51,7 +51,7 @@ struct LogsumexpFunctor { auto x_mt = (*x).template cast(); auto y_dim = y->dimensions(); - auto x_max = x_mt.maximum(dim); + auto x_max = x_mt.maximum(dim).eval(); y->device(place) = (x_max + (x_mt - x_max.reshape(t_dim).broadcast(r_dim)).exp().sum(dim).log()) From b731fb8212e6bbf37afd8484ffec56a3e4561c67 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 8 Dec 2022 09:45:30 +0800 Subject: [PATCH 37/60] [Fluid Clean]move inplace_apis_indygraph_only from paddle.flud.dygraph.inplace_utils to paddle.utils (#48744) * move inplace_apis_indygraph_only from paddle.flud.dygraph.inplace_utils to paddle.utils * modify conflict * modify conflict * modify conflict * modify conflict * modify conflict * modify conflict * modify conflict * modify static-check ci error * fix conflict * modify failed tests * fix conflict * fix conflict * fix pool2d examples * modify conflict * fix failed tests * fix conflict * fix failed tests * modfiy problem of deleting pool2d --- python/paddle/fluid/dygraph/__init__.py | 2 -- python/paddle/fluid/dygraph/nn.py | 1 + python/paddle/nn/functional/activation.py | 2 +- python/paddle/tensor/manipulation.py | 2 +- python/paddle/tensor/math.py | 5 +++-- python/paddle/{fluid/dygraph => utils}/inplace_utils.py | 6 +++--- 6 files changed, 9 insertions(+), 9 deletions(-) rename python/paddle/{fluid/dygraph => utils}/inplace_utils.py (89%) diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 551561428da72..1132ef393d552 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -51,8 +51,6 @@ from .math_op_patch import monkey_patch_math_varbase -from .inplace_utils import inplace_apis_in_dygraph_only - __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 2fa3945987dd1..16148e87ee769 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -30,6 +30,7 @@ in_dygraph_mode, _in_legacy_dygraph, ) + from ..data_feeder import ( convert_dtype, check_variable_and_dtype, diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 89bb63643f6a1..92d6c25ea3ac6 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -15,9 +15,9 @@ import paddle from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.framework import core +from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only from ...fluid.data_feeder import check_dtype, check_variable_and_dtype -from ...fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only from ...fluid.framework import ( _in_legacy_dygraph, convert_np_dtype_to_dtype_, diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 06a229106347c..676a97b840870 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -20,6 +20,7 @@ import paddle from paddle import _C_ops, _legacy_C_ops +from paddle.utils.inplace_utils import 
inplace_apis_in_dygraph_only from ..common_ops_import import _varbase_creator, fill_constant from ..fluid.data_feeder import ( @@ -28,7 +29,6 @@ check_variable_and_dtype, convert_dtype, ) -from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only from ..fluid.framework import _in_legacy_dygraph, _non_static_mode from ..fluid.layers import utils from ..framework import ( diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 883d3c0e3aeb4..6f3a5c5865611 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -14,7 +14,6 @@ """ math functions """ - # TODO: define math functions import numpy as np @@ -23,13 +22,15 @@ from paddle import _C_ops, _legacy_C_ops from paddle.common_ops_import import VarDesc, dygraph_only, dygraph_utils +# TODO: define math functions +from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only + from ..fluid.data_feeder import ( check_dtype, check_type, check_variable_and_dtype, convert_dtype, ) -from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only from ..fluid.layers import elementwise_sub, utils from ..framework import ( LayerHelper, diff --git a/python/paddle/fluid/dygraph/inplace_utils.py b/python/paddle/utils/inplace_utils.py similarity index 89% rename from python/paddle/fluid/dygraph/inplace_utils.py rename to python/paddle/utils/inplace_utils.py index fb27a5674b7d8..2ee809facab4f 100644 --- a/python/paddle/fluid/dygraph/inplace_utils.py +++ b/python/paddle/utils/inplace_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ..wrapped_decorator import wrap_decorator -from ..framework import _non_static_mode import warnings import paddle from paddle import _C_ops, _legacy_C_ops +from paddle.fluid.framework import _non_static_mode +from paddle.fluid.wrapped_decorator import wrap_decorator # NOTE(pangyoki): The Inplace APIs with underline(`_`) is only valid for the method of calling `_C_ops` From 2a31c9dd12bb73bb646c1abc22a8adc23454e7eb Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Thu, 8 Dec 2022 10:03:14 +0800 Subject: [PATCH 38/60] clean fluid task: transfer gaussian random api (#48529) --- python/paddle/distribution/normal.py | 8 +- python/paddle/fluid/layers/nn.py | 147 ------------------ .../unittests/test_gaussian_random_op.py | 13 +- .../unittests/test_imperative_auto_prune.py | 3 +- .../fluid/tests/unittests/test_layers.py | 2 +- .../fluid/tests/unittests/test_manual_seed.py | 9 +- .../fluid/tests/unittests/test_random_seed.py | 13 +- .../xpu/test_gaussian_random_op_xpu.py | 13 +- python/paddle/tensor/random.py | 3 +- 9 files changed, 35 insertions(+), 176 deletions(-) diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index 7eb9fb597d3a2..06cd726fd4f36 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -21,7 +21,8 @@ from paddle.distribution import distribution from paddle.fluid.data_feeder import check_type, convert_dtype from paddle.fluid.framework import _non_static_mode -from paddle.fluid.layers import nn, tensor +from paddle.fluid.layers import tensor +from paddle.tensor import random class Normal(distribution.Distribution): @@ -180,8 +181,9 @@ def sample(self, shape=(), seed=0): self.loc + self.scale, batch_shape + shape, self.dtype, 0.0 ) zero_tmp_reshape = paddle.reshape(zero_tmp, output_shape) + zero_tmp_shape = paddle.shape(zero_tmp_reshape) - normal_random_tmp = nn.gaussian_random( + normal_random_tmp = random.gaussian( zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype ) output = normal_random_tmp * (zero_tmp_reshape + self.scale) @@ -189,7 +191,7 @@ def sample(self, shape=(), seed=0): return output else: output_shape = shape + batch_shape - output = nn.gaussian_random( + output = random.gaussian( output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) output = paddle.add(output, self.loc, name=name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9d4429ef04685..bf5853fad88d2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -84,7 +84,6 @@ 'elementwise_div', 'elementwise_sub', 'elementwise_mul', - 'gaussian_random', 'clip', 'clip_by_norm', 'mean', @@ -2720,152 +2719,6 @@ def relu(x, name=None): from paddle.fluid.framework import convert_np_dtype_to_dtype_ -@deprecated(since="2.0.0", update_to="paddle.normal") -@templatedoc() -def gaussian_random( - shape, mean=0.0, std=1.0, seed=0, dtype='float32', name=None -): - """ - This OP returns a Tensor filled with random values sampled from a Gaussian - distribution, with ``shape`` and ``dtype``. - - Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). 
- mean(float|int, optional): Mean of the output tensor, default is 0.0. - std(float|int, optional): Standard deviation of the output tensor, default - is 1.0. - seed(int, optional): ${seed_comment} - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of - the output Tensor. Supported data types: float32, float64. - Default is float32. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - Tensor: A Tensor filled with random values sampled from a Gaussian - distribution, with ``shape`` and ``dtype``. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - # example 1: - # attr shape is a list which doesn't contain Tensor. - result_1 = fluid.layers.gaussian_random(shape=[3, 4]) - # [[-0.31261674, 1.8736548, -0.6274357, 0.96988016], - # [-0.12294637, 0.9554768, 1.5690808, -1.2894802 ], - # [-0.60082096, -0.61138713, 1.5345167, -0.21834975]] - - # example 2: - # attr shape is a list which contains Tensor. - dim_1 = fluid.layers.fill_constant([1], "int64", 2) - dim_2 = fluid.layers.fill_constant([1], "int32", 3) - result_2 = fluid.layers.gaussian_random(shape=[dim_1, dim_2]) - # [[ 0.51398206, -0.3389769, 0.23597084], - # [ 1.0388143, -1.2015356, -1.0499583 ]] - - # example 3: - # attr shape is a Tensor, the data type must be int64 or int32. - var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64") - result_3 = fluid.layers.gaussian_random(var_shape) - # if var_shape's value is [2, 3] - # result_3 is: - # [[-0.12310527, 0.8187662, 1.923219 ] - # [ 0.70721835, 0.5210541, -0.03214082]] - - .. code-block:: python - - # declarative mode - # required: skiptest - import numpy as np - from paddle import fluid - - x = fluid.layers.gaussian_random((2, 3), std=2., seed=10) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - start = fluid.default_startup_program() - main = fluid.default_main_program() - - exe.run(start) - x_np, = exe.run(main, feed={}, fetch_list=[x]) - - x_np - # array([[2.3060477, 2.676496 , 3.9911983], - # [0.9990833, 2.8675377, 2.2279181]], dtype=float32) - - .. 
code-block:: python - - # imperative mode - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - - place = fluid.CPUPlace() - with dg.guard(place) as g: - x = fluid.layers.gaussian_random((2, 4), mean=2., dtype="float32", seed=10) - x_np = x.numpy() - x_np - # array([[2.3060477 , 2.676496 , 3.9911983 , 0.9990833 ], - # [2.8675377 , 2.2279181 , 0.79029655, 2.8447366 ]], dtype=float32) - """ - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - - if in_dygraph_mode(): - shape = utils.convert_shape_to_list(shape) - place = _current_expected_place() - return _C_ops.gaussian( - shape, float(mean), float(std), seed, dtype, place - ) - - if _in_legacy_dygraph(): - shape = utils.convert_shape_to_list(shape) - return _legacy_C_ops.gaussian_random( - 'shape', - shape, - 'mean', - float(mean), - 'std', - float(std), - 'seed', - seed, - 'dtype', - dtype, - ) - - check_type(shape, 'shape', (list, tuple, Variable), 'gaussian_random/randn') - check_dtype(dtype, 'dtype', ['float32', 'float64'], 'gaussian_random/randn') - - inputs = {} - attrs = { - 'mean': mean, - 'std': std, - 'seed': seed, - 'dtype': dtype, - 'use_mkldnn': False, - } - utils.get_shape_tensor_inputs( - inputs=inputs, attrs=attrs, shape=shape, op_type='gaussian_random/randn' - ) - - helper = LayerHelper('gaussian_random', **locals()) - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='gaussian_random', inputs=inputs, outputs={'Out': out}, attrs=attrs - ) - - return out - - def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index b0fb623502ed3..e16c22eb12a2b 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard from paddle.fluid.tests.unittests.op_test import OpTest, convert_uint16_to_float +from paddle.tensor import random class TestGaussianRandomOp(OpTest): @@ -228,11 +229,11 @@ def test_api(self): name="shape_tensor_int64", shape=[2], dtype="int64" ) - out_1 = fluid.layers.gaussian_random( + out_1 = random.gaussian( shape=[2000, 500], dtype="float32", mean=0.0, std=1.0, seed=10 ) - out_2 = fluid.layers.gaussian_random( + out_2 = random.gaussian( shape=[2000, positive_2_int32], dtype="float32", mean=0.0, @@ -240,7 +241,7 @@ def test_api(self): seed=10, ) - out_3 = fluid.layers.gaussian_random( + out_3 = random.gaussian( shape=[2000, positive_2_int64], dtype="float32", mean=0.0, @@ -248,7 +249,7 @@ def test_api(self): seed=10, ) - out_4 = fluid.layers.gaussian_random( + out_4 = random.gaussian( shape=shape_tensor_int32, dtype="float32", mean=0.0, @@ -256,7 +257,7 @@ def test_api(self): seed=10, ) - out_5 = fluid.layers.gaussian_random( + out_5 = random.gaussian( shape=shape_tensor_int64, dtype="float32", mean=0.0, @@ -264,7 +265,7 @@ def test_api(self): seed=10, ) - out_6 = fluid.layers.gaussian_random( + out_6 = random.gaussian( shape=shape_tensor_int64, dtype=np.float32, mean=0.0, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 522fb24f8fb7a..5dbb1ac0a2974 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.framework import _test_eager_guard +from paddle.tensor import random class AutoPruneLayer0(fluid.Layer): @@ -487,7 +488,7 @@ def test_case3_prune_no_grad_branch2(self): def func_case4_with_no_grad_op_maker(self): with fluid.dygraph.guard(): - out = fluid.layers.gaussian_random(shape=[20, 30]) + out = random.gaussian(shape=[20, 30]) loss = paddle.mean(out) loss.backward() self.assertIsNone(out._grad_ivar()) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 52b864648cb7f..f07f8bba97c97 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2557,7 +2557,7 @@ def make_gaussian_random(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() ): - out = layers.gaussian_random(shape=[20, 30]) + out = random.gaussian(shape=[20, 30]) return out def make_sum(self): diff --git a/python/paddle/fluid/tests/unittests/test_manual_seed.py b/python/paddle/fluid/tests/unittests/test_manual_seed.py index 419ba5dba888d..a4b2afe521b1e 100644 --- a/python/paddle/fluid/tests/unittests/test_manual_seed.py +++ b/python/paddle/fluid/tests/unittests/test_manual_seed.py @@ -18,6 +18,7 @@ import paddle import paddle.fluid as fluid +from paddle.tensor import random class TestManualSeed(unittest.TestCase): @@ -25,13 +26,13 @@ def test_seed(self): fluid.enable_dygraph() gen = paddle.seed(12312321111) - x = fluid.layers.gaussian_random([10], dtype="float32") + x = random.gaussian([10], dtype="float32") st1 = gen.get_state() - x1 = fluid.layers.gaussian_random([10], dtype="float32") + x1 = random.gaussian([10], dtype="float32") gen.set_state(st1) - x2 = fluid.layers.gaussian_random([10], dtype="float32") + x2 = random.gaussian([10], dtype="float32") gen.manual_seed(12312321111) - x3 = fluid.layers.gaussian_random([10], dtype="float32") + x3 = random.gaussian([10], dtype="float32") x_np = x.numpy() x1_np = x1.numpy() x2_np = x2.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 44b368889583d..1c3c280d2fcbc 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.generator as generator +from paddle.tensor import random class TestGeneratorSeed(unittest.TestCase): @@ -148,13 +149,13 @@ def test_generator_gaussian_random_dygraph(self): fluid.enable_dygraph() gen = paddle.seed(12312321111) - x = fluid.layers.gaussian_random([10], dtype="float32") + x = random.gaussian([10], dtype="float32") st1 = gen.get_state() - x1 = fluid.layers.gaussian_random([10], dtype="float32") + x1 = random.gaussian([10], dtype="float32") gen.set_state(st1) - x2 = fluid.layers.gaussian_random([10], dtype="float32") + x2 = random.gaussian([10], dtype="float32") gen.manual_seed(12312321111) - x3 = fluid.layers.gaussian_random([10], dtype="float32") + x3 = random.gaussian([10], dtype="float32") x_np = x.numpy() x1_np = x1.numpy() x2_np = x2.numpy() @@ -175,8 +176,8 @@ def test_generator_gaussian_random_static(self): with fluid.program_guard(train_program, startup_program): # example 1: # attr shape is a list which doesn't contain tensor Variable. 
- result_1 = fluid.layers.gaussian_random(shape=[3, 4]) - result_2 = fluid.layers.gaussian_random(shape=[3, 4]) + result_1 = random.gaussian(shape=[3, 4]) + result_2 = random.gaussian(shape=[3, 4]) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py index 62a0180dc876d..2a5f81f58c1e7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py @@ -29,6 +29,7 @@ import paddle.fluid as fluid paddle.enable_static() +from paddle.tensor import random class XPUTestGaussianRandomOp(XPUOpTestWrapper): @@ -192,11 +193,11 @@ def test_api(self): name="shape_tensor_int64", shape=[2], dtype="int64" ) - out_1 = fluid.layers.gaussian_random( + out_1 = random.gaussian( shape=[2000, 500], dtype="float32", mean=0.0, std=1.0, seed=10 ) - out_2 = fluid.layers.gaussian_random( + out_2 = random.gaussian( shape=[2000, positive_2_int32], dtype="float32", mean=0.0, @@ -204,7 +205,7 @@ def test_api(self): seed=10, ) - out_3 = fluid.layers.gaussian_random( + out_3 = random.gaussian( shape=[2000, positive_2_int64], dtype="float32", mean=0.0, @@ -212,7 +213,7 @@ def test_api(self): seed=10, ) - out_4 = fluid.layers.gaussian_random( + out_4 = random.gaussian( shape=shape_tensor_int32, dtype="float32", mean=0.0, @@ -220,7 +221,7 @@ def test_api(self): seed=10, ) - out_5 = fluid.layers.gaussian_random( + out_5 = random.gaussian( shape=shape_tensor_int64, dtype="float32", mean=0.0, @@ -228,7 +229,7 @@ def test_api(self): seed=10, ) - out_6 = fluid.layers.gaussian_random( + out_6 = random.gaussian( shape=shape_tensor_int64, dtype=np.float32, mean=0.0, diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 1b80c872faca5..3bbf7831e8472 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -314,7 +314,7 @@ def uniform_random_batch_size_like( return out -def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): +def gaussian(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): """ Returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. @@ -338,7 +338,6 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): distribution, with ``shape`` and ``dtype``. """ op_type_for_check = 'gaussian/standard_normal/randn/normal' - seed = 0 if dtype is None: dtype = paddle.framework.get_default_dtype() From a85dedf978e05bcb1000aa1d30ec65fd2415d6bf Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Thu, 8 Dec 2022 10:04:27 +0800 Subject: [PATCH 39/60] Delete duplicate quant nodes in QAT (#48751) --- .../slim/quantization/quantization_pass.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 705b0e5e69ee6..55e1dcacdcb62 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -2907,13 +2907,31 @@ def apply(self, graph): graph, IrGraph ), 'graph must be the instance of IrGraph.' 
fake_quant_dequant_ops = [] + remove_fake_quant_ops = [] + observer_out_node_names = [] + for op in graph.all_op_nodes(): + # collect observer node + if op.name() == "moving_average_abs_max_scale": + observer_out_node_names.append(op.output("Out")[0]) for op in graph.all_op_nodes(): if ( op.name() in _fake_quant_dequant_op_list or op.name() == "moving_average_abs_max_scale" ): - fake_quant_dequant_ops.append(op) + var_name = op.input("X")[0] + if var_name in observer_out_node_names: + remove_fake_quant_ops.append(op) + else: + fake_quant_dequant_ops.append(op) + + for _op in remove_fake_quant_ops: + x_node = graph._find_node_by_name(_op.inputs, _op.input("X")[0]) + out_node = graph._find_node_by_name( + _op.outputs, _op.output("Out")[0] + ) + for next_op_node in out_node.outputs: + graph.update_input_link(out_node, x_node, next_op_node) for _op in fake_quant_dequant_ops: self._replace_op(graph, _op) From cf7995256f6f5e5f563e2aa20c09cc086e74c15d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B0=B8=E4=B9=85?= <34344716+yjjiang11@users.noreply.github.com> Date: Wed, 7 Dec 2022 18:05:29 -0800 Subject: [PATCH 40/60] rm autograd func dynamic eager tests (#48788) --- .../test_autograd_functional_dynamic.py | 49 ++----------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index ec39947a99471..9883a88041542 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -24,7 +24,6 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid.framework import _test_eager_guard from paddle.incubate.autograd.utils import as_tensors @@ -201,14 +200,6 @@ def func_vjp_aliased_input(self): self.check_results(ref_result, aliased_result) def test_all_cases(self): - with _test_eager_guard(): - self.func_vjp_i1o1() - self.func_vjp_i2o1() - self.func_vjp_i2o2() - self.func_vjp_i2o2_omitting_v() - self.func_vjp_nested() - self.func_vjp_aliased_input() - self.func_vjp_i1o1() self.func_vjp_i2o1() self.func_vjp_i2o2() @@ -237,17 +228,12 @@ def test_input_single_tensor(self): ), ) class TestVJPException(unittest.TestCase): - def func_vjp(self): + def test_vjp(self): with self.assertRaises(self.expected_exception): paddle.incubate.autograd.vjp( self.fun, paddle.to_tensor(self.xs), paddle.to_tensor(self.v) ) - def test_all_cases(self): - with _test_eager_guard(): - self.func_vjp() - self.func_vjp() - def jac(grad_fn, f, inputs): assert grad_fn in [ @@ -324,11 +310,6 @@ def func_jvp_i2o2_omitting_v(self): self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): - with _test_eager_guard(): - self.func_jvp_i1o1() - self.func_jvp_i2o1() - self.func_jvp_i2o2() - self.func_jvp_i2o2_omitting_v() self.func_jvp_i1o1() self.func_jvp_i2o1() self.func_jvp_i2o2() @@ -372,7 +353,7 @@ def setUp(self): .get("atol") ) - def func_jacobian(self): + def test_jacobian(self): xs = ( [paddle.to_tensor(x) for x in self.xs] if isinstance(self.xs, typing.Sequence) @@ -409,11 +390,6 @@ def _get_expected(self): ) return utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NM) - def test_all_cases(self): - with _test_eager_guard(): - self.func_jacobian() - self.func_jacobian() - @utils.place(config.DEVICES) @utils.parameterize( @@ -451,7 +427,7 @@ def setUp(self): 
.get("atol") ) - def func_jacobian(self): + def test_jacobian(self): xs = ( [paddle.to_tensor(x) for x in self.xs] if isinstance(self.xs, typing.Sequence) @@ -505,11 +481,6 @@ def _get_expected(self): jac, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM ) - def test_all_cases(self): - with _test_eager_guard(): - self.func_jacobian() - self.func_jacobian() - class TestHessianNoBatch(unittest.TestCase): @classmethod @@ -607,13 +578,6 @@ def func(x): paddle.incubate.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_true() - self.func_create_graph_true() - self.func_out_not_single() self.setUpClass() self.func_single_input() self.func_multi_input() @@ -744,13 +708,6 @@ def func(x): ) def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused() - self.func_stop_gradient() - self.func_out_not_single() self.setUpClass() self.func_single_input() self.func_multi_input() From da8e15e6e842d0f61bb9006c16e7f27a42d7290c Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 8 Dec 2022 10:21:54 +0800 Subject: [PATCH 41/60] Setuptools optimization (#48770) * optimize setup.py * modify setup.py * modify setup.py * modify setup.py * modify setup.py after zhangbo reviewed --- paddle/scripts/paddle_build.sh | 15 +-------------- setup.py | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 71facd9695068..73334b651bfe0 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3467,20 +3467,7 @@ function run_setup(){ SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then echo "Using python abi: $1" - if [ "$1" == "cp36-cp36m" ] || [ "$1" == "" ]; then - if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then - export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ - export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.6/lib/ - export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH} - #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export - export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 - export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ - export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib - pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt - else - exit 1 - fi - elif [ "$1" == "cp37-cp37m" ]; then + if [ "$1" == "cp37-cp37m" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.7/lib/ diff --git a/setup.py b/setup.py index 58458c5470740..6d088750a60b0 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ from distutils.spawn import find_executable from subprocess import CalledProcessError -from setuptools import Command, Distribution, Extension, setup +from setuptools import Command, Extension, setup from setuptools.command.egg_info import egg_info from setuptools.command.install import install as InstallCommandBase from setuptools.command.install_lib 
import install_lib @@ -275,7 +275,7 @@ def _mkdir_p(dir_str): try: os.makedirs(dir_str) except OSError as e: - raise RuntimeError("Failed to create folder build/") + raise RuntimeError("Failed to create build folder") def get_major(): @@ -583,9 +583,14 @@ def build_run(args, build_path, envrion_var): def build_steps(): print('------- Building start ------') - if not os.path.exists(TOP_DIR + '/build'): - _mkdir_p(TOP_DIR + '/build') - build_path = TOP_DIR + '/build' + build_dir = os.getenv("BUILD_DIR") + if build_dir is not None: + build_dir = TOP_DIR + '/' + build_dir + else: + build_dir = TOP_DIR + '/build' + if not os.path.exists(build_dir): + _mkdir_p(build_dir) + build_path = build_dir # run cmake to generate native build files cmake_cache_file_path = os.path.join(build_path, "CMakeCache.txt") # if rerun_cmake is True,remove CMakeCache.txt and rerun camke @@ -1276,9 +1281,13 @@ def main(): # Execute the build process,cmake and make if cmake_and_build: build_steps() - - sys.path.append(TOP_DIR + "/build/python/") - from build.python.env_dict import env_dict as env_dict + build_dir = os.getenv("BUILD_DIR") + if build_dir is not None: + env_dict_path = TOP_DIR + '/' + build_dir + '/python' + else: + env_dict_path = TOP_DIR + "/build/python/" + sys.path.append(env_dict_path) + from env_dict import env_dict as env_dict global env_dict global paddle_binary_dir, paddle_source_dir From cdf31dc18258e6b80192dab5b0cdc74fc4f6456b Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 8 Dec 2022 10:25:10 +0800 Subject: [PATCH 42/60] [CodeStyle][F811] fix some test cases shadowed by the same name (#48745) * [CodeStyle][F811] fix some unittests * fix setup.py * remove ignore from flake8 config * remove repeat TestAbsDoubleGradCheck * fix rrelu test * fix fft ut * add noqa in fluid.lstm ut * add rtol and atol in test_matmul_v2_op * update rtol * empty commit * empty commit * revert changes in matmul ut and add noqa * rename test case name --- .flake8 | 6 ---- ...est_dygraph_group_sharded_api_for_eager.py | 2 +- .../fft/test_fft_with_static_graph.py | 14 ++------- .../unittests/test_activation_nn_grad.py | 30 ------------------- .../tests/unittests/test_lstm_cudnn_op.py | 2 +- .../tests/unittests/test_matmul_v2_op.py | 2 +- .../fluid/tests/unittests/test_rrelu_op.py | 19 ++++-------- setup.py | 1 - 8 files changed, 11 insertions(+), 65 deletions(-) diff --git a/.flake8 b/.flake8 index 2d284df082e8a..853e887f5e40a 100644 --- a/.flake8 +++ b/.flake8 @@ -37,9 +37,3 @@ per-file-ignores = .cmake-format.py: F821 python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py: F821 python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py: F821 - # These files will be fixed in the future - python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py: F811 - python/paddle/fluid/tests/unittests/test_activation_nn_grad.py: F811 - python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py: F811 - python/paddle/fluid/tests/unittests/test_matmul_v2_op.py: F811 - python/paddle/fluid/tests/unittests/test_rrelu_op.py: F811 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py index ecf864cf806f6..331974edfbc0d 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_group_sharded_api_for_eager.py @@ -28,7 
+28,7 @@ def test_dygraph_group_sharded(self): self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') # check stage3 for some functions. - def test_dygraph_group_sharded(self): + def test_dygraph_group_sharded_stage3(self): self.run_mnist_2gpu('dygraph_group_sharded_stage3_eager.py') diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py index 79b8fb2798252..38ccb9b6470ab 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py @@ -266,14 +266,6 @@ def test_static_fftn(self): @parameterize( (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ - ( - 'test_x_complex', - rand_x(4, complex=True), - None, - None, - 'backward', - TypeError, - ), ( 'test_n_nagative', rand_x(4), @@ -295,11 +287,11 @@ def test_static_fftn(self): ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError), ], ) -class TestRfftnException(unittest.TestCase): - def test_static_rfftn(self): +class TestFftnException(unittest.TestCase): + def test_static_fftn(self): with self.assertRaises(self.expect_exception): with stgraph( - paddle.fft.rfftn, + paddle.fft.fftn, self.place, self.x, self.n, diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 4b3311120467d..f10232cf02bce 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -407,36 +407,6 @@ def test_grad(self): self.func(p) -class TestAbsDoubleGradCheck(unittest.TestCase): - @prog_scope() - def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. - shape = [2, 3, 7, 9] - eps = 1e-6 - dtype = np.float64 - - x = layers.data('x', shape, False, dtype) - x.persistable = True - y = paddle.abs(x) - x_arr = np.random.uniform(-1, 1, shape).astype(dtype) - # Because we set delta = 0.005 in calculating numeric gradient, - # if x is too small, the numeric gradient is inaccurate. 
- # we should avoid this - x_arr[np.abs(x_arr) < 0.005] = 0.02 - - gradient_checker.double_grad_check( - [x], y, x_init=x_arr, place=place, eps=eps - ) - - def test_grad(self): - paddle.enable_static() - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - self.func(p) - - class TestLogDoubleGradCheck(unittest.TestCase): def log_wrapper(self, x): return paddle.log(x[0]) diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index cbc7450bbc6d2..536fc59f42ed8 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -584,7 +584,7 @@ def test_lstm(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestCUDNNlstmAPI(unittest.TestCase): +class TestCUDNNlstmAPI(unittest.TestCase): # noqa: F811 def test_lstm(self): seq_len = 20 batch_size = 5 diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 868cec1d592b7..c452958ead841 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -732,7 +732,7 @@ def func_dygraph_matmul(self): paddle.enable_static() - def func_dygraph_matmul(self): + def func_dygraph_matmul(self): # noqa: F811 with _test_eager_guard(): self.func_dygraph_matmul() diff --git a/python/paddle/fluid/tests/unittests/test_rrelu_op.py b/python/paddle/fluid/tests/unittests/test_rrelu_op.py index 847675ee6f58f..96bccf8120257 100644 --- a/python/paddle/fluid/tests/unittests/test_rrelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_rrelu_op.py @@ -317,9 +317,9 @@ def setUp(self): self.lower = 0.1 self.upper = 0.3 self.is_test = True - self.init_prams() + self.init_params() - def init_prams(self): + def init_params(self): self.dtype = "float64" self.x_shape = [2, 3, 4, 5] @@ -343,22 +343,13 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class RReluTrainingTest(OpTest): +class RReluTrainingTest(RReluTest): def setUp(self): self.op_type = "rrelu" self.lower = 0.3 - self.upper = 0.3000009 + self.upper = 0.300000009 self.is_test = False - self.init_prams() - - -class RReluTrainingTest(OpTest): - def setUp(self): - self.op_type = "rrelu" - self.lower = 0.3 - self.upper = 0.3000009 - self.is_test = False - self.init_prams() + self.init_params() if __name__ == "__main__": diff --git a/setup.py b/setup.py index 6d088750a60b0..6e77373acf540 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ from setuptools.command.egg_info import egg_info from setuptools.command.install import install as InstallCommandBase from setuptools.command.install_lib import install_lib -from setuptools.dist import Distribution if sys.version_info < (3, 7): raise RuntimeError( From 592ed40b58d7bc015de87368cc611e865bdcd6ea Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 8 Dec 2022 10:25:37 +0800 Subject: [PATCH 43/60] set free_when_no_cache_hit default value to true (#48815) --- .../fluid/memory/allocation/auto_growth_best_fit_allocator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 27c7563fee840..972665562c30f 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ 
b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -31,7 +31,7 @@ PADDLE_DEFINE_EXPORTED_READONLY_bool(
 
 PADDLE_DEFINE_EXPORTED_READONLY_bool(
     free_when_no_cache_hit,
-    false,
+    true,
     "Whether to free idle chunks when no cache hit. If true, idle "
     "chunk would be freed when no cache hit; if false, idle "
    "chunk would be freed when out of memory occurs. This flag "

From 379216ae05bbf17225a60062a8ab2ffb17883463 Mon Sep 17 00:00:00 2001
From: Roc <30228238+sljlp@users.noreply.github.com>
Date: Thu, 8 Dec 2022 10:38:43 +0800
Subject: [PATCH 44/60] [Clean Fluid] Rm and mv some fluid dygraph apis (#48576)

Remove fluid dygraph apis
GroupNorm
TreeConv
Move fluid dygraph apis
Flatten
SpectralNorm
---
 python/paddle/fluid/dygraph/nn.py             | 422 ------------------
 python/paddle/fluid/reader.py                 |   6 +-
 .../tests/unittests/test_group_norm_op.py     |  32 +-
 .../tests/unittests/test_group_norm_op_v2.py  | 101 -----
 .../test_imperative_load_static_param.py      |  10 +-
 .../fluid/tests/unittests/test_layers.py      | 224 +---------
 .../tests/unittests/test_spectral_norm_op.py  |   5 +-
 .../tests/unittests/test_tree_conv_op.py      |  25 --
 python/paddle/nn/layer/common.py              |  39 +-
 python/paddle/nn/layer/norm.py                | 145 +++++-
 10 files changed, 221 insertions(+), 788 deletions(-)

diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 16148e87ee769..8639f7294046f 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -55,10 +55,6 @@
     'BatchNorm',
     'Embedding',
     'Conv3DTranspose',
-    'GroupNorm',
-    'SpectralNorm',
-    'TreeConv',
-    'Flatten',
 ]
 
 
@@ -1203,421 +1199,3 @@ def forward(self, input):
             outputs={'Out': [out]},
         )
         return self._helper.append_activation(out, act=self._act)
-
-
-class GroupNorm(layers.Layer):
-    """
-    :alias_main: paddle.nn.GroupNorm
-    :alias: paddle.nn.GroupNorm,paddle.nn.layer.GroupNorm,paddle.nn.layer.norm.GroupNorm
-    :old_api: paddle.fluid.dygraph.GroupNorm
-
-    This interface is used to construct a callable object of the ``GroupNorm`` class.
-    For more details, refer to code examples.
-    It implements the function of the Group Normalization Layer.
-    Refer to `Group Normalization `_ .
-
-    Parameters:
-        channels(int): The number of channels of input.
-        groups(int): The number of groups that divided from channels.
-        epsilon(float, optional): The small value added to the variance to prevent
-            division by zero. Default: 1e-05.
-        param_attr(ParamAttr, optional): The parameter attribute for the learnable
-            scale :math:`g`. If it is set to False, no scale will be added to the output units.
-            If it is set to None, the bias is initialized one. Default: None.
-        bias_attr(ParamAttr, optional): The parameter attribute for the learnable
-            bias :math:`b`. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act(str, optional): Activation to be applied to the output of group normalization. Default: None.
-        data_layout(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.
-
-    Returns:
-        None
-
-    Examples:
-        ..
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - with fluid.dygraph.guard(): - x = np.random.random((8, 32, 32)).astype('float32') - groupNorm = fluid.dygraph.nn.GroupNorm(channels=32, groups=4) - ret = groupNorm(fluid.dygraph.base.to_variable(x)) - - """ - - def __init__( - self, - channels, - groups, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None, - data_layout='NCHW', - dtype='float32', - ): - super().__init__() - self._param_attr = param_attr - self._bias_attr = bias_attr - self._epsilon = epsilon - self._channels = channels - self._groups = groups - self._act = act - self._dtype = dtype - if data_layout != 'NCHW': - raise ValueError("unsupported data layout:" + data_layout) - - param_shape = [self._channels] - - self.weight = self.create_parameter( - attr=self._param_attr or False, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(1.0), - ) - - self.bias = self.create_parameter( - attr=self._bias_attr or False, - shape=param_shape, - dtype=self._dtype, - is_bias=True, - ) - - def forward(self, input): - mean_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - variance_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True - ) - if in_dygraph_mode(): - out = _C_ops.group_norm( - input, - self.weight, - self.bias, - self._epsilon, - self._groups, - "NCHW", - ) - - return dygraph_utils._append_activation_in_dygraph(out, self._act) - - elif _in_legacy_dygraph(): - attrs = ('epsilon', self._epsilon, 'groups', self._groups) - out, _, _ = _legacy_C_ops.group_norm( - input, self.weight, self.bias, mean_out, variance_out, *attrs - ) - - return dygraph_utils._append_activation_in_dygraph(out, self._act) - else: - inputs = {'X': input} - if self.bias is not None: - inputs['Bias'] = self.bias - if self.weight is not None: - inputs['Scale'] = self.weight - - # create output - group_norm_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - - self._helper.append_op( - type="group_norm", - inputs=inputs, - outputs={ - "Y": group_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={"epsilon": self._epsilon, "groups": self._groups}, - ) - - return self._helper.append_activation(group_norm_out, self._act) - - -class SpectralNorm(layers.Layer): - r""" - This interface is used to construct a callable object of the ``SpectralNorm`` class. - For more details, refer to code examples. It implements the function of the Spectral Normalization Layer. - This layer calculates the spectral normalization value of weight parameters of - fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D - Parameters. Calculations are showed as follows. - - Step 1: - Generate vector U in shape of [H], and V in shape of [W]. - While H is the :attr:`dim` th dimension of the input weights, - and W is the product result of remaining dimensions. - - Step 2: - :attr:`power_iters` should be a positive integer, do following - calculations with U and V for :attr:`power_iters` rounds. - - .. math:: - - \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} - - \mathbf{u} := \frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} - - Step 3: - Calculate :math:`\sigma(\mathbf{W})` and normalize weight values. - - .. math:: - - \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v} - - \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})} - - - Refer to `Spectral Normalization `_ . 
- - Parameters: - weight_shape(list or tuple): The shape of weight parameter. - dim(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: 0. - power_iters(int, optional): The number of power iterations to calculate spectral norm. Default: 1. - eps(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12. - name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Returns: - None - - Examples: - .. code-block:: python - - import paddle - x = paddle.rand((2,8,32,32)) - - spectral_norm = paddle.nn.SpectralNorm(x.shape, dim=1, power_iters=2) - spectral_norm_out = spectral_norm(x) - - print(spectral_norm_out.shape) # [2, 8, 32, 32] - - """ - - def __init__( - self, weight_shape, dim=0, power_iters=1, eps=1e-12, dtype='float32' - ): - super().__init__() - self._power_iters = power_iters - self._eps = eps - self._dim = dim - self._dtype = dtype - - self._weight_shape = list(weight_shape) - assert ( - np.prod(self._weight_shape) > 0 - ), "Any dimension of `weight_shape` cannot be equal to 0." - assert dim < len(self._weight_shape), ( - "The input `dim` should be less than the " - "length of `weight_shape`, but received dim=" - "{}".format(dim) - ) - h = self._weight_shape[self._dim] - w = np.prod(self._weight_shape) // h - - self.weight_u = self.create_parameter( - attr=ParamAttr(), - shape=[h], - dtype=self._dtype, - default_initializer=Normal(0.0, 1.0), - ) - self.weight_u.stop_gradient = True - - self.weight_v = self.create_parameter( - attr=ParamAttr(), - shape=[w], - dtype=self._dtype, - default_initializer=Normal(0.0, 1.0), - ) - self.weight_v.stop_gradient = True - - def forward(self, weight): - if in_dygraph_mode(): - return _C_ops.spectral_norm( - weight, - self.weight_u, - self.weight_v, - self._dim, - self._power_iters, - self._eps, - ) - - check_variable_and_dtype( - weight, "weight", ['float32', 'float64'], 'SpectralNorm' - ) - inputs = {'Weight': weight, 'U': self.weight_u, 'V': self.weight_v} - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="spectral_norm", - inputs=inputs, - outputs={ - "Out": out, - }, - attrs={ - "dim": self._dim, - "power_iters": self._power_iters, - "eps": self._eps, - }, - ) - - return out - - -class TreeConv(layers.Layer): - """ - This interface is used to construct a callable object of the ``TreeConv`` class. - For more details, refer to code examples. - Tree-Based Convolution is a kind of convolution based on tree structure. - Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN), - which is used to classify tree structures, such as Abstract Syntax Tree. - Tree-Based Convolution proposed a kind of data structure called continuous binary tree, - which regards multiway tree as binary tree. - The paper of Tree-Based Convolution Operator is here: `tree-based convolution `_ . - - Parameters: - feature_size(int): last dimension of nodes_vector. - output_size(int): output feature width. - num_filters(int, optional): number of filters, Default: 1. - max_depth(int, optional): max depth of filters, Default: 2. 
- act(str, optional): activation function, Default: tanh. - param_attr(ParamAttr, optional): the parameter attribute for the filters, Default: None. - bias_attr(ParamAttr, optional): the parameter attribute for the bias of this layer, Default: None. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of filters of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. - - Returns: - None - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - nodes_vector = numpy.random.random((1, 10, 5)).astype('float32') - edge_set = numpy.random.random((1, 9, 2)).astype('int32') - treeConv = fluid.dygraph.nn.TreeConv( - feature_size=5, output_size=6, num_filters=1, max_depth=2) - ret = treeConv(fluid.dygraph.base.to_variable(nodes_vector), fluid.dygraph.base.to_variable(edge_set)) - """ - - def __init__( - self, - feature_size, - output_size, - num_filters=1, - max_depth=2, - act='tanh', - param_attr=None, - bias_attr=None, - name=None, - dtype='float32', - ): - super().__init__() - self._name = name - self._feature_size = feature_size - self._output_size = output_size - self._act = act - self._max_depth = max_depth - self._num_filters = num_filters - self._bias_attr = bias_attr - self._param_attr = param_attr - self._dtype = dtype - w_shape = [self._feature_size, 3, self._output_size, self._num_filters] - if self._bias_attr: - self.bias = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], - dtype=self._dtype, - is_bias=True, - ) - self.weight = self.create_parameter( - attr=self._param_attr, - shape=w_shape, - dtype=self._dtype, - is_bias=False, - ) - - def forward(self, nodes_vector, edge_set): - check_type(nodes_vector, 'nodes_vector', (Variable), 'TreeConv') - check_type(edge_set, 'edge_set', (Variable), 'TreeConv') - if self._name: - out = self.create_variable( - name=self._name, dtype=self._dtype, persistable=False - ) - else: - out = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='tree_conv', - inputs={ - 'NodesVector': nodes_vector, - 'EdgeSet': edge_set, - 'Filter': self.weight, - }, - outputs={ - 'Out': out, - }, - attrs={'max_depth': self._max_depth}, - ) - if self._bias_attr: - pre_activation = self._helper.create_variable_for_type_inference( - dtype=self._dtype - ) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [out], 'Y': [self.bias]}, - outputs={'Out': [pre_activation]}, - attrs={'axis': 1}, - ) - else: - pre_activation = out - return self._helper.append_activation(pre_activation, act=self._act) - - -class Flatten(layers.Layer): - """ - This interface is used to construct a callable object of the ``FLatten`` class. - For more details, refer to code examples. - It implements flatten a contiguous range of dims into a tensor. - - Parameters: - start_axis(int): first dim to flatten (default = 1) - stop_axis(int): last dim to flatten (default = -1). - - Returns: - None - - Examples: - - .. 
code-block:: python
-
-            import paddle
-            import numpy as np
-
-            inp_np = np.ones([5, 2, 3, 4]).astype('float32')
-            inp_np = paddle.to_tensor(inp_np)
-            flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
-            flatten_res = flatten(inp_np)
-
-    """
-
-    def __init__(self, start_axis=1, stop_axis=-1):
-        super().__init__()
-        self.start_axis = start_axis
-        self.stop_axis = stop_axis
-
-    def forward(self, input):
-        out = paddle.tensor.manipulation.flatten(
-            input, start_axis=self.start_axis, stop_axis=self.stop_axis
-        )
-        return out
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 4883d70d97dfc..5e678fc67662d 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -1351,9 +1351,9 @@ def __init__(
         self._use_double_buffer = use_double_buffer
         self._capacity = capacity
         if not self._iterable:
-            # Because layers.io.double_buffer is not supported anymore, and only when iterable and use_double_buffer
-            # are both True layers.io.double_buffer will be in use, here if itrable is False, use_double_buffer will be
-            # forcely set False to avoid using layers.io.double_buffer.
+            # Because layers.io.double_buffer is not supported anymore, and iterable=False
+            # combined with use_double_buffer=True is not supported either, use_double_buffer
+            # is forcibly set to False here when iterable is False to avoid unexpected errors.
             # TODO: keep use_double_buffer
             self._use_double_buffer = False
             self._init_non_iterable()
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
index 24f0bc2a0b1f0..83195e0885d22 100644
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
@@ -293,21 +293,25 @@ def attr_data_format():
 class TestGroupNormEager(unittest.TestCase):
     def test_dygraph_api(self):
-        self.dtype = np.float64
+
+        # float64 is not supported;
+        # only float32 is supported
+        self.dtype = np.float32
+
         self.shape = (8, 32, 32)
         input = np.random.random(self.shape).astype(self.dtype)
 
         with fluid.dygraph.guard():
             tensor_1 = fluid.dygraph.to_variable(input)
             tensor_1.stop_gradient = False
-            groupNorm = fluid.dygraph.nn.GroupNorm(channels=32, groups=4)
+            groupNorm = paddle.nn.GroupNorm(num_channels=32, num_groups=4)
             ret1 = groupNorm(tensor_1)
             ret1.backward()
             with _test_eager_guard():
                 tensor_eager_1 = fluid.dygraph.to_variable(input)
                 tensor_eager_1.stop_gradient = False
-                groupNorm_eager = fluid.dygraph.nn.GroupNorm(
-                    channels=32, groups=4
+                groupNorm_eager = paddle.nn.GroupNorm(
+                    num_channels=32, num_groups=4
                 )
                 ret2 = groupNorm_eager(tensor_eager_1)
                 ret2.backward()
@@ -328,16 +332,14 @@ def test_dygraph_api(self):
         with fluid.dygraph.guard():
             tensor_1 = fluid.dygraph.to_variable(input)
             tensor_1.stop_gradient = False
-            groupNorm = fluid.dygraph.nn.GroupNorm(
-                channels=32, groups=4, dtype='float32'
-            )
+            groupNorm = paddle.nn.GroupNorm(num_channels=32, num_groups=4)
             ret1 = groupNorm(tensor_1)
             ret1.backward()
             with _test_eager_guard():
                 tensor_eager_1 = fluid.dygraph.to_variable(input)
                 tensor_eager_1.stop_gradient = False
-                groupNorm_eager = fluid.dygraph.nn.GroupNorm(
-                    channels=32, groups=4
+                groupNorm_eager = paddle.nn.GroupNorm(
+                    num_channels=32, num_groups=4
                 )
                 ret2 = groupNorm_eager(tensor_eager_1)
                 ret2.backward()
@@ -351,23 +353,25 @@ def test_dygraph_api(self):
 
 class TestGroupNormEager_fp16(unittest.TestCase):
     def test_dygraph_api(self):
+
+        # float16 is not supported;
+        # only float32 is supported
         self.dtype = np.float32
+
         self.shape = (8, 32, 32)
input = np.random.random(self.shape).astype(self.dtype) with fluid.dygraph.guard(): tensor_1 = fluid.dygraph.to_variable(input) tensor_1.stop_gradient = False - groupNorm = fluid.dygraph.nn.GroupNorm( - channels=32, groups=4, dtype='float16' - ) + groupNorm = paddle.nn.GroupNorm(num_channels=32, num_groups=4) ret1 = groupNorm(tensor_1) ret1.backward() with _test_eager_guard(): tensor_eager_1 = fluid.dygraph.to_variable(input) tensor_eager_1.stop_gradient = False - groupNorm_eager = fluid.dygraph.nn.GroupNorm( - channels=32, groups=4 + groupNorm_eager = paddle.nn.GroupNorm( + num_channels=32, num_groups=4 ) ret2 = groupNorm_eager(tensor_eager_1) ret2.backward() diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 1082cd4b98dc5..c710c96ba9bd5 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid import Program, program_guard from paddle.fluid.framework import _test_eager_guard @@ -39,106 +38,6 @@ def group_norm_naive_for_general_dimension(x, scale, bias, epsilon, groups): return output -class TestDygraphGroupNormv2(unittest.TestCase): - def test_dygraph(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): - places.append(fluid.CUDAPlace(0)) - shapes = [ - [2, 2, 2, 2], - [2, 2, 4], - [4, 2], - [4, 2, 6, 6, 2], - [2, 2, 2, 2, 2, 2], - ] - for p in places: - - def compute_v1(x): - with fluid.dygraph.guard(p): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) - y = gn(fluid.dygraph.to_variable(x)) - return y.numpy() - - def compute_v2(x): - with fluid.dygraph.guard(p): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) - y = gn(fluid.dygraph.to_variable(x)) - return y.numpy() - - def test_weight_bias_false(): - with fluid.dygraph.guard(p): - gn = paddle.nn.GroupNorm( - num_channels=2, - num_groups=2, - weight_attr=False, - bias_attr=False, - ) - - def test_nn_exception(): - with fluid.dygraph.guard(p): - - def attr_data_format(): - out = paddle.nn.GroupNorm( - num_groups=2, num_channels=2, data_format="CNHW" - ) - - self.assertRaises(ValueError, attr_data_format) - - for shape in shapes: - x = np.random.randn(*shape).astype("float32") - y1 = compute_v1(x) - y2 = compute_v2(x) - result = np.allclose(y1, y2, atol=1e-5) - if not result: - print("y1:", y1, "\ty2:", y2) - self.assertTrue(result) - test_weight_bias_false() - test_nn_exception() - - def test_static(self): - paddle.enable_static() - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): - places.append(fluid.CUDAPlace(0)) - shapes = [ - [2, 6, 2, 2], - [2, 6, 4], - [4, 6], - [4, 6, 6, 6, 2], - [4, 6, 2, 2, 2, 2], - ] - for p in places: - exe = fluid.Executor(p) - - def compute_v1(x_np): - with program_guard(Program(), Program()): - gn = fluid.dygraph.GroupNorm(channels=6, groups=2) - x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) - y = gn(x) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] - return r - - def compute_v2(x_np): - with program_guard(Program(), Program()): - gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) - x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) - y = gn(x) - exe.run(fluid.default_startup_program()) - r = exe.run(feed={'x': x_np}, 
fetch_list=[y])[0] - return r - - for shape in shapes: - x = np.random.randn(*shape).astype("float32") - y1 = compute_v1(x) - y2 = compute_v2(x) - np.testing.assert_allclose(y1, y2, rtol=1e-05, atol=1e-05) - - def test_eager_api(self): - with _test_eager_guard(): - self.test_dygraph() - - class TestGroupNormAPIV2_With_General_Dimensions(unittest.TestCase): def test_numerical_accuracy(self): paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 96bb7914a6c7e..f864e2829046b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -from paddle.fluid.dygraph.nn import BatchNorm, Embedding, GroupNorm +from paddle.fluid.dygraph.nn import BatchNorm, Embedding from paddle.nn import Linear @@ -122,10 +122,10 @@ def testLoadStaticModel(self): name='groupnorm_in', shape=[None, 8, 32, 32], dtype='float32' ) groupnorm_out1 = paddle.static.nn.group_norm( - input=groupnorm_in, groups=4 + input=groupnorm_in, groups=4, param_attr=True, bias_attr=True ) groupnorm_out2 = paddle.static.nn.group_norm( - input=groupnorm_in, groups=4 + input=groupnorm_in, groups=4, param_attr=True, bias_attr=True ) ''' spec_norm = fluid.data(name='spec_norm', shape=[2, 8, 32, 32], dtype='float32') @@ -212,8 +212,8 @@ def __init__(self): self.layer_norm_1 = paddle.nn.LayerNorm([10]) self.layer_norm_2 = paddle.nn.LayerNorm(10) - self.group_norm1 = GroupNorm(8, 4) - self.gourp_norm2 = GroupNorm(8, 4) + self.group_norm1 = paddle.nn.GroupNorm(4, 8) + self.gourp_norm2 = paddle.nn.GroupNorm(4, 8) self.w_1 = self.create_parameter( [100, 100], dtype='float32', attr="weight_test_1" diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f07f8bba97c97..4fb6645b9673f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -191,7 +191,7 @@ def test_Flatten(self): dtype='float32', append_batch_size=False, ) - flatten = nn.Flatten() + flatten = paddle.nn.Flatten() ret = flatten(t) static_ret = self.get_static_graph_result( feed={'data': inp}, fetch_list=[ret] @@ -199,12 +199,12 @@ def test_Flatten(self): with self.dynamic_graph(): with _test_eager_guard(): t = base.to_variable(inp) - flatten = nn.Flatten() + flatten = paddle.nn.Flatten() dy_eager_ret = flatten(t) dy_eager_ret_value = dy_eager_ret.numpy() t = base.to_variable(inp) - flatten = nn.Flatten() + flatten = paddle.nn.Flatten() dy_ret = flatten(t) dy_ret_value = dy_ret.numpy() @@ -1066,10 +1066,10 @@ def func_group_norm(self): lod_level=1, append_batch_size=False, ) - groupNorm = nn.GroupNorm( - channels=shape[1], - groups=2, - param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), + groupNorm = paddle.nn.GroupNorm( + num_channels=shape[1], + num_groups=2, + weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), bias_attr=fluid.initializer.ConstantInitializer(value=1), ) ret = groupNorm(X) @@ -1084,10 +1084,10 @@ def func_group_norm(self): )[0] with self.dynamic_graph(): - groupNorm = nn.GroupNorm( - channels=shape[1], - groups=2, - param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), + groupNorm = paddle.nn.GroupNorm( + num_channels=shape[1], + num_groups=2, + 
weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), bias_attr=fluid.initializer.ConstantInitializer(value=1), ) dy_ret = groupNorm(base.to_variable(input)) @@ -1209,7 +1209,7 @@ def test_spectral_norm(self): lod_level=1, append_batch_size=False, ) - spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) + spectralNorm = paddle.nn.SpectralNorm(shape, axis=1, power_iters=2) ret = spectralNorm(Weight) static_ret2 = self.get_static_graph_result( feed={ @@ -1223,11 +1223,13 @@ def test_spectral_norm(self): with self.dynamic_graph(): with _test_eager_guard(): - spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) + spectralNorm = paddle.nn.SpectralNorm( + shape, axis=1, power_iters=2 + ) dy_eager_ret = spectralNorm(base.to_variable(input)) dy_eager_rlt_value = dy_eager_ret.numpy() - spectralNorm = nn.SpectralNorm(shape, dim=1, power_iters=2) + spectralNorm = paddle.nn.SpectralNorm(shape, axis=1, power_iters=2) dy_ret = spectralNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() @@ -1235,200 +1237,6 @@ def test_spectral_norm(self): np.testing.assert_allclose(static_ret, dy_eager_rlt_value, rtol=1e-05) np.testing.assert_allclose(static_ret, static_ret2, rtol=1e-05) - def test_tree_conv(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - adj_array = [1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10] - adj = np.array(adj_array).reshape((1, 9, 2)).astype('int32') - adj = np.tile(adj, (1, 1, 1)) - vectors = np.random.random((1, 10, 5)).astype('float32') - with self.static_graph(): - NodesVector = fluid.layers.data( - name='NodesVector', - shape=(1, 10, 5), - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - EdgeSet = fluid.layers.data( - name='EdgeSet', - shape=(1, 9, 2), - dtype='int32', - lod_level=1, - append_batch_size=False, - ) - ret = fluid.contrib.layers.tree_conv( - nodes_vector=NodesVector, - edge_set=EdgeSet, - output_size=6, - num_filters=1, - max_depth=2, - ) - static_ret = self.get_static_graph_result( - feed={ - 'NodesVector': fluid.create_lod_tensor( - data=vectors, recursive_seq_lens=[[1]], place=place - ), - 'EdgeSet': fluid.create_lod_tensor( - data=adj, recursive_seq_lens=[[1]], place=place - ), - }, - fetch_list=[ret], - with_lod=False, - )[0] - - with self.static_graph(): - NodesVector = fluid.layers.data( - name='NodesVector', - shape=(1, 10, 5), - dtype='float32', - lod_level=1, - append_batch_size=False, - ) - EdgeSet = fluid.layers.data( - name='EdgeSet', - shape=(1, 9, 2), - dtype='int32', - lod_level=1, - append_batch_size=False, - ) - treeConv = nn.TreeConv( - feature_size=5, output_size=6, num_filters=1, max_depth=2 - ) - ret = treeConv(NodesVector, EdgeSet) - static_ret2 = self.get_static_graph_result( - feed={ - 'NodesVector': fluid.create_lod_tensor( - data=vectors, recursive_seq_lens=[[1]], place=place - ), - 'EdgeSet': fluid.create_lod_tensor( - data=adj, recursive_seq_lens=[[1]], place=place - ), - }, - fetch_list=[ret], - with_lod=False, - )[0] - - with self.dynamic_graph(): - with _test_eager_guard(): - treeConv = nn.TreeConv( - feature_size=5, output_size=6, num_filters=1, max_depth=2 - ) - dy_eager_ret = treeConv( - base.to_variable(vectors), base.to_variable(adj) - ) - dy_eager_rlt_value = dy_eager_ret.numpy() - - treeConv = nn.TreeConv( - feature_size=5, output_size=6, num_filters=1, max_depth=2 - ) - dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) - dy_rlt_value = dy_ret.numpy() - - np.testing.assert_allclose(static_ret, static_ret2, 
rtol=1e-05) - np.testing.assert_allclose(static_ret, dy_rlt_value, rtol=1e-05) - np.testing.assert_allclose(static_ret, dy_eager_rlt_value, rtol=1e-05) - - with self.dynamic_graph(): - with _test_eager_guard(): - custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - treeConv1 = nn.TreeConv( - feature_size=5, - output_size=6, - num_filters=1, - max_depth=2, - bias_attr='eager_tc1_b', - ) - treeConv2 = nn.TreeConv( - feature_size=5, - output_size=6, - num_filters=1, - max_depth=2, - param_attr=weight_attr, - bias_attr='eager_tc2_b', - ) - dy_ret1 = treeConv1( - base.to_variable(vectors), base.to_variable(adj) - ) - dy_ret2 = treeConv2( - base.to_variable(vectors), base.to_variable(adj) - ) - self.assertFalse( - np.array_equal(dy_ret1.numpy(), dy_ret2.numpy()) - ) - treeConv2.weight.set_value(treeConv1.weight.numpy()) - treeConv2.bias.set_value(treeConv1.bias) - dy_ret1 = treeConv1( - base.to_variable(vectors), base.to_variable(adj) - ) - dy_ret2 = treeConv2( - base.to_variable(vectors), base.to_variable(adj) - ) - np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) - - treeConv2.weight = treeConv1.weight - treeConv2.bias = treeConv1.bias - np.testing.assert_array_equal( - treeConv1.weight.numpy(), treeConv2.weight.numpy() - ) - np.testing.assert_array_equal( - treeConv1.bias.numpy(), treeConv2.bias.numpy() - ) - - custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") - weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) - ) - treeConv1 = nn.TreeConv( - feature_size=5, - output_size=6, - num_filters=1, - max_depth=2, - bias_attr='tc1_b', - ) - treeConv2 = nn.TreeConv( - feature_size=5, - output_size=6, - num_filters=1, - max_depth=2, - param_attr=weight_attr, - bias_attr='tc2_b', - ) - dy_ret1 = treeConv1( - base.to_variable(vectors), base.to_variable(adj) - ) - dy_ret2 = treeConv2( - base.to_variable(vectors), base.to_variable(adj) - ) - self.assertFalse(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) - treeConv2.weight.set_value(treeConv1.weight.numpy()) - treeConv2.bias.set_value(treeConv1.bias) - dy_ret1 = treeConv1( - base.to_variable(vectors), base.to_variable(adj) - ) - dy_ret2 = treeConv2( - base.to_variable(vectors), base.to_variable(adj) - ) - np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) - - treeConv2.weight = treeConv1.weight - treeConv2.bias = treeConv1.bias - np.testing.assert_array_equal( - treeConv1.weight.numpy(), treeConv2.weight.numpy() - ) - np.testing.assert_array_equal( - treeConv1.bias.numpy(), treeConv2.bias.numpy() - ) - def test_conv3d_transpose(self): input_array = ( np.arange(0, 48).reshape([2, 3, 2, 2, 2]).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py index 939406945d0d4..34e26c0b24e35 100644 --- a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py @@ -17,6 +17,7 @@ import numpy as np from op_test import OpTest, skip_check_grad_ci +import paddle import paddle.fluid as fluid from paddle.fluid.framework import Program, program_guard @@ -152,9 +153,7 @@ class TestDygraphSpectralNormOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): shape = (2, 4, 3, 3) - spectralNorm = fluid.dygraph.nn.SpectralNorm( - shape, dim=1, power_iters=2 - 
)
+            spectralNorm = paddle.nn.SpectralNorm(shape, axis=1, power_iters=2)
 
             def test_Variable():
                 weight_1 = np.random.random((2, 4)).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
index 1c85dbf5e6cc3..fbb3ceffc355a 100644
--- a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
@@ -196,30 +196,5 @@ def test_errors(self):
         )
 
 
-class TestDygraphTreeConv_OpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            TreeConv = fluid.dygraph.nn.TreeConv(
-                feature_size=5, output_size=6, num_filters=1, max_depth=2
-            )
-            nodes_vector_1 = np.random.random((10, 5)).astype("float32")
-            edge_set_1 = fluid.layers.data(
-                name='edge_set_1', shape=[10, 2], dtype='float32'
-            )
-            # the nodes_vector of TreeConv must be Variable.
-            self.assertRaises(
-                TypeError, TreeConv, nodes_vector_1, edge_set_1, 3
-            )
-
-            nodes_vector_2 = fluid.layers.data(
-                name='vectors2', shape=[10, 5], dtype='float32'
-            )
-            edge_set_2 = np.random.random((10, 2)).astype("float32")
-            # the edge_set of TreeConv must be Variable.
-            self.assertRaises(
-                TypeError, TreeConv, nodes_vector_2, edge_set_2, 3
-            )
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index ea4dfb41525c6..921643ef9d855 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -17,7 +17,6 @@
 from paddle import in_dynamic_mode
 from paddle.nn import Layer
 
-from ...fluid.dygraph import Flatten  # noqa: F401
 from .. import functional as F
 
 __all__ = []
@@ -1705,3 +1704,41 @@ def extra_repr(self):
             self.strides,
             name_str,
         )
+
+
+class Flatten(Layer):
+    """
+    This interface is used to construct a callable object of the ``Flatten`` class.
+    For more details, refer to code examples.
+    It flattens a contiguous range of dims into a tensor.
+
+    Parameters:
+        start_axis(int): first dim to flatten (default = 1)
+        stop_axis(int): last dim to flatten (default = -1).
+
+    Returns:
+        None
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            inp = paddle.ones([5, 2, 3, 4]).astype('float32')
+            flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
+            y = flatten(inp)
+            # y.shape = [5, 6, 4]
+
+    """
+
+    def __init__(self, start_axis=1, stop_axis=-1):
+        super().__init__()
+        self.start_axis = start_axis
+        self.stop_axis = stop_axis
+
+    def forward(self, x):
+        out = paddle.flatten(
+            x, start_axis=self.start_axis, stop_axis=self.stop_axis
+        )
+        return out
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index c0117560f25e2..1c7e64d794a65 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -39,12 +39,11 @@
 from ...fluid.data_feeder import check_variable_and_dtype
 from ...fluid.dygraph import BatchNorm  # noqa: F401
-from ...fluid.dygraph import SpectralNorm  # noqa: F401
 from ...framework import ParamAttr, get_default_dtype, no_grad
 from .. import Layer
 from .. import functional as F
 from ..functional import batch_norm, instance_norm, layer_norm
-from ..initializer import Constant
+from ..initializer import Constant, Normal
 
 __all__ = []
 
@@ -388,8 +387,8 @@ def __init__(
             shape=param_shape,
             default_initializer=Constant(1.0),
         )
-        self.weight.stop_gradient = (
-            self._weight_attr is not None
+        self.weight.stop_gradient = self._weight_attr is not None and (
+            hasattr(self._weight_attr, "learning_rate")
             and self._weight_attr.learning_rate == 0.0
         )
 
@@ -405,8 +404,8 @@ def __init__(
         self.bias = self.create_parameter(
             attr=self._bias_attr, shape=param_shape, is_bias=True
         )
-        self.bias.stop_gradient = (
-            self._bias_attr is not None
+        self.bias.stop_gradient = self._bias_attr is not None and (
+            hasattr(self._bias_attr, "learning_rate")
             and self._bias_attr.learning_rate == 0.0
         )
 
@@ -1431,3 +1430,137 @@ def extra_repr(self):
         if self.name is not None:
             main_str += ', name={}'.format(self.name)
         return main_str
+
+
+class SpectralNorm(Layer):
+    r"""
+    This interface is used to construct a callable object of the ``SpectralNorm`` class.
+    For more details, refer to code examples. It implements the function of the Spectral Normalization Layer.
+    This layer calculates the spectral normalization value of weight parameters of
+    fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D
+    Parameters. Calculations are shown as follows.
+
+    Step 1:
+    Generate vector U in shape of [H], and V in shape of [W].
+    While H is the :attr:`axis` th dimension of the input weights,
+    and W is the product result of remaining dimensions.
+
+    Step 2:
+    :attr:`power_iters` should be a positive integer, do following
+    calculations with U and V for :attr:`power_iters` rounds.
+
+    .. math::
+
+        \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
+
+        \mathbf{u} := \frac{\mathbf{W} \mathbf{v}}{\|\mathbf{W} \mathbf{v}\|_2}
+
+    Step 3:
+    Calculate :math:`\sigma(\mathbf{W})` and normalize weight values.
+
+    .. math::
+
+        \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}
+
+        \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})}
+
+
+    Refer to `Spectral Normalization `_ .
+
+    Parameters:
+        weight_shape(list or tuple): The shape of weight parameter.
+        axis(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: 0.
+        power_iters(int, optional): The number of power iterations to calculate spectral norm. Default: 1.
+        epsilon(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12.
+        name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
+        dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
+
+    Returns:
+        None
+
+    Examples:
+        ..
code-block:: python + + import paddle + x = paddle.rand((2,8,32,32)) + + spectral_norm = paddle.nn.SpectralNorm(x.shape, axis=1, power_iters=2) + spectral_norm_out = spectral_norm(x) + + print(spectral_norm_out.shape) # [2, 8, 32, 32] + + """ + + def __init__( + self, + weight_shape, + axis=0, + power_iters=1, + epsilon=1e-12, + dtype='float32', + ): + super().__init__() + self._power_iters = power_iters + self._epsilon = epsilon + self._dim = axis + self._dtype = dtype + + self._weight_shape = list(weight_shape) + assert ( + np.prod(self._weight_shape) > 0 + ), "Any dimension of `weight_shape` cannot be equal to 0." + assert axis < len(self._weight_shape), ( + "The input `axis` should be less than the " + "length of `weight_shape`, but received axis=" + "{}".format(axis) + ) + h = self._weight_shape[self._dim] + w = np.prod(self._weight_shape) // h + + self.weight_u = self.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=self._dtype, + default_initializer=Normal(0.0, 1.0), + ) + self.weight_u.stop_gradient = True + + self.weight_v = self.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=self._dtype, + default_initializer=Normal(0.0, 1.0), + ) + self.weight_v.stop_gradient = True + + def forward(self, x): + weight = x + if in_dygraph_mode(): + return _C_ops.spectral_norm( + weight, + self.weight_u, + self.weight_v, + self._dim, + self._power_iters, + self._epsilon, + ) + + check_variable_and_dtype( + weight, "weight", ['float32', 'float64'], 'SpectralNorm' + ) + inputs = {'Weight': weight, 'U': self.weight_u, 'V': self.weight_v} + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={ + "Out": out, + }, + attrs={ + "dim": self._dim, + "power_iters": self._power_iters, + "eps": self._epsilon, + }, + ) + + return out From 3a387df62ebe84719e9837f1bebb05701b013c3a Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 8 Dec 2022 10:39:50 +0800 Subject: [PATCH 45/60] [Inference] inference add cinn interface (#48741) --- .../framework/paddle2cinn/build_cinn_pass.cc | 28 ++++++++++++++--- paddle/fluid/inference/analysis/argument.h | 3 ++ .../inference/analysis/ir_pass_manager.cc | 2 ++ paddle/fluid/inference/api/analysis_config.cc | 31 ++++++++++++++++++- .../fluid/inference/api/analysis_predictor.cc | 30 +++++++++++------- .../inference/api/paddle_analysis_config.h | 16 ++++++++++ .../inference/api/paddle_pass_builder.cc | 7 +++++ .../fluid/inference/api/paddle_pass_builder.h | 3 ++ 8 files changed, 104 insertions(+), 16 deletions(-) mode change 100755 => 100644 paddle/fluid/inference/api/analysis_config.cc mode change 100755 => 100644 paddle/fluid/inference/api/analysis_predictor.cc diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index e97a56a743e25..f7306bfc9a28e 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -484,7 +484,8 @@ void AnalyseClusterVariables( const std::unordered_set& deny_var_set, GraphNodeSet* cluster_inputs, GraphNodeSet* cluster_outputs, - GraphNodeSet* cluster_internals) { + GraphNodeSet* cluster_internals, + bool is_inference_stage) { // collecting all input and output of op for (auto* op_node : cluster) { const auto& op_name = op_node->Name(); @@ -523,6 +524,18 @@ void AnalyseClusterVariables( for (auto* var_node : *cluster_internals) { cluster_outputs->erase(var_node); } + + if (is_inference_stage) { + // If part of 
the output of the Op is not used by other operators, change it
+    // to internal, such as transpose2 op's XShape output.
+    auto outs = *cluster_outputs;
+    for (auto* node : outs) {
+      if (node->outputs.empty()) {
+        cluster_outputs->erase(node);
+        cluster_internals->insert(node);
+      }
+    }
+  }
 }
 
 void AddLinkToCinnOp(const GraphNodeSet& cluster_inputs,
@@ -611,7 +624,7 @@ void ReplaceSubGraphWithCinnOpNode(
 // Here we using SubgraphDetector to detecte the subgraph that
 // all of op node supported by CINN. We using OpMapperRegistry
 // to check whether the op node supported by CINN.
-void SearchAllSubgraphs(Graph* graph) {
+void SearchAllSubgraphs(Graph* graph, bool is_inference_stage) {
   auto allow_ops = StringSplit(FLAGS_allow_cinn_ops, kDelim);
   auto deny_ops = StringSplit(FLAGS_deny_cinn_ops, kDelim);
   OpTransInfo trans_info;
@@ -671,7 +684,8 @@ void SearchAllSubgraphs(Graph* graph) {
         deny_var_set,
         &cluster_inputs,
         &cluster_outputs,
-        &cluster_internals);
+        &cluster_internals,
+        is_inference_stage);
 
     VLOG(4) << "Cluster Ops: " << cluster_debug_info(cluster_set);
     VLOG(4) << "Cluster input vars: " << cluster_debug_info(cluster_inputs);
@@ -698,7 +712,13 @@ void SearchAllSubgraphs(Graph* graph) {
   }
 }
 }  // namespace
 
-void BuildCinnPass::ApplyImpl(Graph* graph) const { SearchAllSubgraphs(graph); }
+void BuildCinnPass::ApplyImpl(Graph* graph) const {
+  bool is_inference_stage{false};
+  if (Has("is_inference_stage")) {
+    is_inference_stage = Get("is_inference_stage");
+  }
+  SearchAllSubgraphs(graph, is_inference_stage);
+}
 
 }  // namespace paddle2cinn
 }  // namespace framework
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 8750a9afb44e4..a72c1fe762213 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -368,6 +368,9 @@ struct Argument {
   DECL_ARGUMENT_FIELD(enable_gpu_half, EnableGPUHalf, bool);
   DECL_ARGUMENT_FIELD(mixed_precision_mode, MixedPrecisionMode, int);
 
+  // cinn compiler related
+  DECL_ARGUMENT_FIELD(use_cinn_compiler, UseCinnCompiler, bool);
+
 private:
  std::unordered_set valid_fields_;
 };
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index cbcc48a7f68e8..25b371cb2ff39 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -235,6 +235,8 @@ void IRPassManager::CreatePasses(Argument *argument,
           new framework::ProgramDesc *(&argument->main_program()));
     } else if (pass_name == "memory_optimize_pass") {
       pass->Set("root_predictor_id", new int(argument->root_predictor_id()));
+    } else if (pass_name == "build_cinn_pass") {
+      pass->Set("is_inference_stage", new bool(argument->use_cinn_compiler()));
     }
     if (pass_name == "lite_subgraph_pass") {
       bool lite_enable_int8 =
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
old mode 100755
new mode 100644
index c5e648dffc0bf..17afc4f840e7d
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -477,6 +477,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // profile related.
   CP_MEMBER(with_profile_);
 
+  // cinn compiler related.
+  CP_MEMBER(use_cinn_compiler_);
+
   // glog related.
CP_MEMBER(with_glog_info_); @@ -542,7 +545,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #undef CP_MEMBER Update(); - if (use_tensorrt_) { + if (use_tensorrt_ || use_cinn_compiler_) { // Update() will reset all the passes, when some tensorRT pass is deleted in // other.pass_builder(), it will set again, so we just remove the // deleted_pass. @@ -872,6 +875,14 @@ void AnalysisConfig::Update() { } } + // TODO(wilber): An ugly method to update pass, need to be fixed. + if (use_cinn_compiler_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kCINNCompilerPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_dlnne_) { pass_builder()->ClearPasses(); for (const auto &pass : kDlnneSubgraphPasses) { @@ -1316,6 +1327,9 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"use_lite", use_lite_ ? "true" : "false"}); } + // cinn compiler + os.InsertRow({"use_cinn_compiler", use_cinn_compiler_ ? "true" : "false"}); + // ir info os.InsertRow({"ir_optim", enable_ir_optim_ ? "true" : "false"}); os.InsertRow({"ir_debug", ir_debug_ ? "true" : "false"}); @@ -1429,4 +1443,19 @@ void AnalysisConfig::Exp_DisableMixedInferOps( mixed_black_list_ = black_list; } +void AnalysisConfig::Exp_EnableCINNCompiler() { +#ifdef PADDLE_WITH_CINN + use_cinn_compiler_ = true; + Update(); +#else + PADDLE_THROW(platform::errors::Unavailable( + "You tried to use CINN compiler, but Paddle was not compiled " + "with CINN.")); +#endif +} + +bool AnalysisConfig::cinn_compiler_enabled() const { + return use_cinn_compiler_; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc old mode 100755 new mode 100644 index bf89db83dd4ae..13dba59492b55 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1217,6 +1217,10 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } + if (config_.use_cinn_compiler_) { + argument_.SetUseCinnCompiler(config_.use_cinn_compiler_); + } + #ifdef PADDLE_WITH_MKLDNN if (config_.mkldnn_quantizer_enabled()) { LOG(INFO) << "Quantization is enabled"; @@ -1239,21 +1243,25 @@ void AnalysisPredictor::PrepareArgument() { #endif auto *pass_builder = config_.pass_builder(); + // TODO(inference): Need to reconstruct the pass_builder, pass should be + // processed in a single if (model_precision_ != phi::DataType::FLOAT32) { LOG(INFO) << "Model is mixed precision type with " << model_precision_ << ", we will use a new PassStrategy. 
Note that only the GPU " "backend is supported for now."; - pass_builder->ClearPasses(); - const auto &deleted_passes = pass_builder->GetAllDeletedPasses(); - if (config_.tensorrt_engine_enabled()) { - for (const auto &pass : kTrtLowerPrecisionPasses) { - if (deleted_passes.count(pass)) continue; - pass_builder->AppendPass(pass); - } - } else if (config_.use_gpu()) { - for (const auto &pass : kGpuLowerPrecisionPasses) { - if (deleted_passes.count(pass)) continue; - pass_builder->AppendPass(pass); + if (!config_.use_cinn_compiler_) { + pass_builder->ClearPasses(); + const auto &deleted_passes = pass_builder->GetAllDeletedPasses(); + if (config_.tensorrt_engine_enabled()) { + for (const auto &pass : kTrtLowerPrecisionPasses) { + if (deleted_passes.count(pass)) continue; + pass_builder->AppendPass(pass); + } + } else if (config_.use_gpu()) { + for (const auto &pass : kGpuLowerPrecisionPasses) { + if (deleted_passes.count(pass)) continue; + pass_builder->AppendPass(pass); + } } } } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 5521caee9f430..5bf5d3de7b0f0 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -1016,6 +1016,19 @@ struct PD_INFER_DECL AnalysisConfig { void SetSkipLoadParams(bool value) { skip_load_params_ = value; } + /// + /// \brief Enable use cinn compiler optimization. + /// + void Exp_EnableCINNCompiler(); + + /// + /// \brief A boolean state telling whether the CINN compiler optimization is + /// turned on. + /// + /// \return bool Whether the CINN compiler optimization is turned on. + /// + bool cinn_compiler_enabled() const; + protected: // Update the config. void Update(); @@ -1143,6 +1156,9 @@ struct PD_INFER_DECL AnalysisConfig { Precision lite_precision_mode_; bool lite_zero_copy_; + // CINN compiler related. + bool use_cinn_compiler_{false}; + // XPU related. bool use_xpu_{false}; int xpu_device_id_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 4ac91231121d1..4e397fbd041c7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -204,6 +204,13 @@ const std::vector kTrtLowerPrecisionPasses{ "tensorrt_subgraph_pass", }; +const std::vector kCINNCompilerPasses{ + "gpu_cpu_map_matmul_v2_to_mul_pass", + "gpu_cpu_map_matmul_v2_to_matmul_pass", + "gpu_cpu_map_matmul_to_mul_pass", + "build_cinn_pass", +}; + GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 0990a61da34e1..8dea84400e8e1 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -349,6 +349,9 @@ PD_INFER_DECL extern const std::vector kDlnneSubgraphPasses; /// \brief List of lite subgraph passes. PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; +/// \brief List of cinn compiler passes. +PD_INFER_DECL extern const std::vector kCINNCompilerPasses; + /// \brief TODO(inference): Most of the existing pass fusion operators do not /// support fp16/bf16 precision, temporarily use low precision pass to prevent /// running errors. After fusion operator supports low precision, delete this. 
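
For reference, a minimal usage sketch of the interface added in the patch above, assuming the usual paddle_infer::Config wrapper over AnalysisConfig and placeholder model paths; Exp_EnableCINNCompiler() throws unless Paddle was built with CINN support (e.g. -DWITH_CINN=ON):

    #include "paddle_inference_api.h"

    int main() {
      // Placeholder model files; substitute a real inference model.
      paddle_infer::Config config("./model.pdmodel", "./model.pdiparams");
      config.EnableUseGpu(100 /*memory_pool_init_size_mb*/, 0 /*device_id*/);
      // New in this patch: turn on the experimental CINN compiler path.
      config.Exp_EnableCINNCompiler();
      if (config.cinn_compiler_enabled()) {
        auto predictor = paddle_infer::CreatePredictor(config);
        // ... feed input tensors and call predictor->Run() as usual ...
      }
      return 0;
    }
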
From 6b0d959071142e97b728f9c67d32a29555064748 Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Thu, 8 Dec 2022 10:42:28 +0800 Subject: [PATCH 46/60] Clean and migrate fluid APIs of paddle.fluid.layers.control_flow (#48233) * Merge branch 'reduce_sum' of https://github.com/GhostScreaming/Paddle into mine_fluid_clean_common. * Fix some bugs. * Clean APIs in python/paddle/fluid/layers/control_flow.py * Polish code style. * Change API. * Fix some bugs. * Fix some bugs. --- python/paddle/fluid/layers/control_flow.py | 753 +----------- python/paddle/fluid/layers/rnn.py | 2 +- python/paddle/fluid/tests/test_if_else_op.py | 255 ---- .../auto_parallel/test_while_op_partition.py | 2 +- .../fleet/hybrid_parallel_inference_helper.py | 4 +- .../fluid/tests/unittests/dist_transformer.py | 2 +- .../dygraph_to_static/test_ifelse.py | 2 +- .../unittests/mlu/test_set_value_op_mlu.py | 2 +- .../unittests/npu/test_set_value_op_npu.py | 2 +- .../tests/unittests/npu/test_while_op_npu.py | 4 +- .../unittests/sequence/test_sequence_pool.py | 13 +- .../paddle/fluid/tests/unittests/test_case.py | 60 +- .../tests/unittests/test_device_guard.py | 4 +- .../test_dynamic_rnn_stop_gradient.py | 4 +- .../unittests/test_eager_deletion_while_op.py | 4 +- .../test_fusion_seqpool_concat_op.py | 9 +- .../test_fusion_seqpool_cvm_concat_op.py | 9 +- .../tests/unittests/test_imperative_basic.py | 1033 ----------------- .../test_imperative_static_runner_while.py | 4 +- .../test_ir_memory_optimize_ifelse_op.py | 126 -- .../fluid/tests/unittests/test_layers.py | 40 +- .../tests/unittests/test_lod_rank_table.py | 66 -- .../test_optimizer_in_control_flow.py | 6 +- .../fluid/tests/unittests/test_profiler.py | 2 +- .../unittests/test_program_prune_backward.py | 4 +- .../unittests/test_reorder_lod_tensor.py | 260 ----- .../tests/unittests/test_set_value_op.py | 2 +- .../fluid/tests/unittests/test_switch_case.py | 47 +- .../unittests/test_tensor_array_to_tensor.py | 6 +- .../tests/unittests/test_while_loop_op.py | 60 +- .../fluid/tests/unittests/test_while_op.py | 10 +- .../unittests/xpu/test_device_guard_xpu.py | 2 +- .../tests/unittests/xpu/test_while_op_xpu.py | 8 +- python/paddle/static/nn/__init__.py | 8 +- python/paddle/static/nn/control_flow.py | 797 +++++++++++++ tools/parallel_UT_rule.py | 11 - tools/static_mode_white_list.py | 5 - 37 files changed, 1008 insertions(+), 2620 deletions(-) delete mode 100644 python/paddle/fluid/tests/test_if_else_op.py delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative_basic.py delete mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py delete mode 100644 python/paddle/fluid/tests/unittests/test_lod_rank_table.py delete mode 100644 python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py create mode 100644 python/paddle/static/nn/control_flow.py diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index cd49f94e035b8..884d4275bd936 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -52,20 +52,14 @@ from paddle import _C_ops, _legacy_C_ops __all__ = [ - 'While', 'Switch', 'increment', 'array_write', 'array_read', 'cond', - 'IfElse', 'StaticRNN', - 'reorder_lod_tensor_by_rank', 'Print', 'Assert', - 'is_empty', - 'case', - 'switch_case', 'while_loop', ] @@ -527,6 +521,7 @@ def Assert(cond, data=None, summarize=20, name=None): return op +# (TODO: Mine) There exists dependency. It will be removed later. 
class BlockGuard: """ BlockGuard class. @@ -550,6 +545,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): return True +# (TODO: Mine) There exists dependency. It will be removed later. class BlockGuardWithCompletion(BlockGuard): """ BlockGuardWithCompletion class. @@ -1101,6 +1097,7 @@ def _complete_op(self): ) +# (TODO: Mine) There exists dependency. It will be removed later. class WhileGuard(BlockGuard): def __init__(self, while_op): if not isinstance(while_op, While): @@ -1120,6 +1117,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): return super().__exit__(exc_type, exc_val, exc_tb) +# (TODO: Mine) There exists dependency. It will be removed later. def get_inputs_outputs_in_block( current_block, inner_inputs, inner_outputs, helper ): @@ -1182,6 +1180,7 @@ def is_ignore_vars(op, var_name): return inner_inputs, inner_outputs +# (TODO: Mine) There exists dependency. It will be removed later. class While: """ :api_attr: Static Graph @@ -1320,6 +1319,7 @@ def _complete(self): support_ret_buildin_type = (bool, float, int) +# (TODO: Mine) There exists dependency. It will be removed later. def assign_skip_lod_tensor_array(input, output): """ Assign input to output, but skip the process of copying LoDTensorArray unless it's created in while_block. @@ -1363,6 +1363,7 @@ def has_shape_diff(x_var, y_var): assign(input, output) +# (TODO: Mine) There exists dependency (jit.dy2static.convert_operators). It will be removed later. def while_loop(cond, body, loop_vars, is_test=False, name=None): """ :api_attr: Static Graph @@ -1473,6 +1474,7 @@ def body(i, ten): return loop_vars +# (TODO: Mine) There exists dependency. It will be removed later. def _deal_with_undefined_var(output_vars, loop_vars): """Deal with undefined var cases, We create undefined variable based on the results of body(). In Dy2Static, we use undefined var to represent the var created in control flow. This function @@ -1511,102 +1513,6 @@ def create_var_like(o_var): return results -def lod_rank_table(x, level=0): - """ - LoD Rank Table Operator. Given an input variable **x** and a level number - of LoD, this layer creates a LodRankTable object. A LoDRankTable object - contains a list of bi-element tuples. Each tuple consists of an index and - a length, both of which are int type. Refering to specified level of LoD, - the index is the sequence index number and the length represents the - sequence length. Please note that the list is ranked in descending order by - the length. The following is an example: - - .. code-block:: text - - x is a LoDTensor: - x.lod = [[2, 1], - [5, 1, 1]] - x.data = [a, b, c, d, e, f, g] - - 1. set level to 0: - Create lod rank table: - lod_rank_table_obj = lod_rank_table(x, level=0) - - Get: - lod_rank_table_obj.items() = [(0, 2), (1, 1)] - - 2. set level to 1: - Create lod rank table: - lod_rank_table_obj = lod_rank_table(x, level=1) - - Get: - lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)] - - Args: - x (Variable): Input variable, a LoDTensor based which to create the lod - rank table. - level (int): Specify the LoD level, on which to create the lod rank - table. - - Returns: - Variable: The created LoDRankTable object. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) - out = layers.lod_rank_table(x=x, level=0) - """ - check_type(x, 'x', (Variable, list), 'lod_rank_table') - if isinstance(x, (list)): - for i, input_x in enumerate(x): - check_type( - input_x, 'input[' + str(i) + ']', Variable, 'lod_rank_table' - ) - - helper = LayerHelper("lod_rank_table", **locals()) - table = helper.create_variable( - type=core.VarDesc.VarType.LOD_RANK_TABLE, - name=unique_name.generate("lod_rank_table"), - ) - helper.append_op( - type='lod_rank_table', - inputs={'X': x}, - outputs={'Out': table}, - attrs={'level': level}, - ) - return table - - -@templatedoc() -def max_sequence_len(rank_table): - """ - ${comment} - - >>> import paddle.fluid as fluid - >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32', - >>> lod_level=1) - >>> rank_table = layers.lod_rank_table(x=x, level=0) - >>> max_seq_len = layers.max_sequence_len(rank_table) - - Args: - rank_table(${rank_table_type}): ${rank_table_comment}. - - Returns: - ${out_comment}. - """ - helper = LayerHelper("max_seqence_len", **locals()) - res = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op( - type="max_sequence_len", - inputs={"RankTable": rank_table}, - outputs={"Out": res}, - ) - return res - - def increment(x, value=1.0, in_place=True): """ The OP is usually used for control flow to increment the data of :attr:`x` by an amount :attr:`value`. @@ -2422,154 +2328,6 @@ def map_fn(n1, n2, name, order): return nest1_out, nest2_out -def _error_message(what, arg_name, op_name, right_value, error_value): - error_message = ( - "{what} of '{arg_name}' in {op_name} must be " - "{right_value}, but received: {error_value}.".format( - what=what, - arg_name=arg_name, - op_name=op_name, - right_value=right_value, - error_value=error_value, - ) - ) - - return error_message - - -def case(pred_fn_pairs, default=None, name=None): - ''' - :api_attr: Static Graph - - This operator works like an if-elif-elif-else chain. - - Args: - pred_fn_pairs(list|tuple): A list or tuple of (pred, fn) pairs. ``pred`` is a boolean Tensor with shape [1], ``fn`` is a callable. All callables return the same structure of Tensors. - default(callable, optional): Callable that returns a structure of Tensors. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor|list(Tensor): Tensors returned by the callable from the first pair whose pred is True, - or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None, - or Tensors returned by the last callable in ``pred_fn_pairs`` if no pred in ``pred_fn_pairs`` is True and ``default`` is None. - - Raises: - TypeError: If the type of ``pred_fn_pairs`` is not list or tuple. - TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple. - TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2. - TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not a Tensor. - TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable. - TypeError: If ``default`` is not None but it is not callable. - - Examples: - .. 
code-block:: python - - import paddle - - paddle.enable_static() - - def fn_1(): - return paddle.full(shape=[1, 2], dtype='float32', fill_value=1) - - def fn_2(): - return paddle.full(shape=[2, 2], dtype='int32', fill_value=2) - - def fn_3(): - return paddle.full(shape=[3], dtype='int32', fill_value=3) - - main_program = paddle.static.default_startup_program() - startup_program = paddle.static.default_main_program() - - with paddle.static.program_guard(main_program, startup_program): - x = paddle.full(shape=[1], dtype='float32', fill_value=0.3) - y = paddle.full(shape=[1], dtype='float32', fill_value=0.1) - z = paddle.full(shape=[1], dtype='float32', fill_value=0.2) - - pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3 - pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 - pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1 - - # Call fn_1 because pred_1 is True - out_1 = paddle.static.nn.case( - pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3) - - # Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called. - # because fn_3 is the last callable in pred_fn_pairs. - out_2 = paddle.static.nn.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)]) - - exe = paddle.static.Executor(paddle.CPUPlace()) - res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2]) - print(res_1) # [[1. 1.]] - print(res_2) # [3 3 3] - ''' - helper = LayerHelper('case', **locals()) - - def _case_check_args(pred_fn_pairs, default): - ''' - Check arguments pred_fn_pairs and default. Return canonical pre_fn_pairs and default. - ''' - check_type(pred_fn_pairs, 'pred_fn_pairs', (list, tuple), 'case') - - for pred_fn in pred_fn_pairs: - if not isinstance(pred_fn, tuple): - raise TypeError( - _error_message( - "The elements' type", - "pred_fn_pairs", - "case", - tuple, - type(pred_fn), - ) - ) - if len(pred_fn) != 2: - raise TypeError( - _error_message( - "The tuple's size", - "pred_fn_pairs", - "case", - "2", - str(len(pred_fn)) + "-tuple", - ) - ) - pred, fn = pred_fn - - if not isinstance(pred, Variable): - raise TypeError( - _error_message( - "The pred's type", - "pred_fn_pairs", - "case", - "boolean Variable", - type(pred), - ) - ) - - if not callable(fn): - raise TypeError( - "The fn for {} of pred_fn_pairs in Op(case) must" - " be callable.".format(pred.name) - ) - - if default is None: - default_index = len(pred_fn_pairs) - 1 # pick the last one - default = pred_fn_pairs[default_index][1] - pred_fn_pairs = pred_fn_pairs[:default_index] - elif not callable(default): - raise TypeError("The default in Op(case) must be callable.") - - return pred_fn_pairs, default - - pred_fn_pairs, default = _case_check_args(pred_fn_pairs, default) - - false_fn = default - for pred, true_fn in reversed(pred_fn_pairs): - false_fn = partial(cond, pred=pred, true_fn=true_fn, false_fn=false_fn) - - final_fn = false_fn - - return final_fn() - - class Switch: """ :api_attr: Static Graph @@ -2698,498 +2456,3 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False # re-raise exception return True - - -class IfElseBlockGuard: - def __init__(self, is_true, ifelse): - if not isinstance(ifelse, IfElse): - raise TypeError("ifelse must be an instance of IfElse class") - - if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS: - raise ValueError("You cannot invoke IfElse.block() inside a block") - - self.is_true = is_true - self.ie = ifelse - if is_true: - self.cond_block = ifelse.conditional_true_block - else: - self.cond_block = ifelse.conditional_false_block - - if not isinstance(self.cond_block, 
ConditionalBlock): - raise TypeError("Unexpected situation") - - self.cond_block = self.cond_block.block() - - def __enter__(self): - self.ie.status = ( - IfElse.IN_IF_ELSE_TRUE_BLOCKS - if self.is_true - else IfElse.IN_IF_ELSE_FALSE_BLOCKS - ) - self.cond_block.__enter__() - - def __exit__(self, exc_type, exc_val, exc_tb): - if not self.cond_block.__exit__(exc_type, exc_val, exc_tb): - # re-raise inside exception - return False - if len(self.ie.output_table[1 if self.is_true else 0]) == 0: - raise ValueError("Must set output inside block") - self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS - - -class IfElse: - """ - :api_attr: Static Graph - - This class is used to implement IfElse branch control function. IfElse contains two blocks, true_block and false_block. IfElse will put data satisfying True or False conditions into different blocks to run. - - Cond is a 2-D Tensor with shape [N, 1] and data type bool, representing the execution conditions of the corresponding part of the input data. - - Note: - A new OP :ref:`api_fluid_layers_cond` is highly recommended instead of ``IfElse``. if the shape of parameter ``cond`` is [1]. - OP :ref:`api_fluid_layers_cond` is easier to use and is called with less code but does the same thing as ``IfElse`` . - - IfElse OP is different from other OPs in usage, which may cause some users confusion. Here is a simple example to illustrate this OP. - - .. code-block:: python - - # The following code completes the function: subtract 10 from the data greater than 0 in x, add 10 to the data less than 0 in x, and sum all the data. - import numpy as np - import paddle.fluid as fluid - - x = fluid.layers.data(name='x', shape=[4, 1], dtype='float32', append_batch_size=False) - y = fluid.layers.data(name='y', shape=[4, 1], dtype='float32', append_batch_size=False) - - x_d = np.array([[3], [1], [-2], [-3]]).astype(np.float32) - y_d = np.zeros((4, 1)).astype(np.float32) - - # Compare the size of x, y pairs of elements, output cond, cond is shape [4, 1], data type bool 2-D tensor. - # Based on the input data x_d, y_d, it can be inferred that the data in cond are [[true], [true], [false], [false]]. - cond = fluid.layers.greater_than(x, y) - # Unlike other common OPs, ie below returned by the OP is an IfElse OP object - ie = fluid.layers.IfElse(cond) - - with ie.true_block(): - # In this block, according to cond condition, the data corresponding to true dimension in X is obtained and subtracted by 10. - out_1 = ie.input(x) - out_1 = out_1 - 10 - ie.output(out_1) - with ie.false_block(): - # In this block, according to cond condition, get the data of the corresponding condition in X as false dimension, and add 10 - out_1 = ie.input(x) - out_1 = out_1 + 10 - ie.output(out_1) - - # According to cond condition, the data processed in the two blocks are merged. The output here is output, the type is List, and the element type in List is Variable. - output = ie() # [array([[-7.], [-9.], [ 8.], [ 7.]], dtype=float32)] - - # Get the first Variable in the output List and add all elements. - out = paddle.sum(output[0]) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - res = exe.run(fluid.default_main_program(), feed={"x":x_d, "y":y_d}, fetch_list=[out]) - print(res) - # [array([-1.], dtype=float32)] - - Args: - cond (Variable): cond is a 2-D Tensor with shape [N, 1] and data type bool, representing the corresponding execution conditions of N input data. The data type is bool. - name(str, optional): The default value is None. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Unlike other common OPs, the OP call returns an IfElse OP object (e.g. ie in the example), which branches the input data by calling the internal functions of the object ``true_block ()``, ``false_block ()``, ``input ()``, ``output ()``, and integrates the data processed by different branches as the overall output by calling the internal ``call ()`` function. The output type is a list, and the type of each element in the list is Variable. - - Internal Functions: - The block is constructed by calling the ``with ie. true_block()`` function in the object, and the computational logic under condition true is put into the block. If no corresponding block is constructed, the input data in the corresponding conditional dimension is unchanged. - - The block is constructed by calling the ``with ie. false_block()`` function in the object, and the computational logic under condition false is put into the block. If no corresponding block is constructed, the input data in the corresponding conditional dimension is unchanged. - - ``Out = ie. input (x)`` will take out the data of the corresponding conditional dimension in X and put it into out, supporting the internal processing of multiple inputs in block. - - ``ie. output (out)`` writes the result to the output of the corresponding condition. - - There is a ``call ()`` function inside the object, that is, by calling ``output = ie ()``, all the outputs inside the block of False are fused as the whole output, the output type is a list, and the type of each element in the list is Variable. - - """ - - OUT_IF_ELSE_BLOCKS = 0 - IN_IF_ELSE_TRUE_BLOCKS = 1 - IN_IF_ELSE_FALSE_BLOCKS = 2 - - def __init__(self, cond, name=None): - check_type(cond, "cond", Variable, "fluid.layers.IfElse") - check_type(name, "name", (str, type(None)), "fluid.layers.IfElse") - self.helper = LayerHelper('ifelse', name=name) - self.cond = cond - self.input_table = {} - self.status = IfElse.OUT_IF_ELSE_BLOCKS - self.conditional_true_block = ConditionalBlock(inputs=[self.cond]) - self.conditional_false_block = ConditionalBlock(inputs=[self.cond]) - self.output_table = ([], []) # (true_outs, false_outs) - - def input(self, x): - if self.status == IfElse.OUT_IF_ELSE_BLOCKS: - raise ValueError("input must in true/false blocks") - if id(x) not in self.input_table: - parent_block = self._parent_block() - out_true = parent_block.create_var( - name=unique_name.generate_with_ignorable_key( - 'ifelse_input' + self.helper.name - ), - dtype=x.dtype, - ) - - out_false = parent_block.create_var( - name=unique_name.generate_with_ignorable_key( - 'ifelse_input' + self.helper.name - ), - dtype=x.dtype, - ) - parent_block.append_op( - type='split_lod_tensor', - inputs={ - 'X': x, - 'Mask': self.cond, - }, - outputs={'OutTrue': out_true, 'OutFalse': out_false}, - attrs={'level': 0}, - ) - self.input_table[id(x)] = (out_true, out_false) - else: - out_true, out_false = self.input_table[id(x)] - - if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS: - return out_true - else: - return out_false - - def _parent_block(self): - current_block = self.helper.main_program.current_block() - return self.helper.main_program.block(current_block.parent_idx) - - def true_block(self): - return IfElseBlockGuard(True, self) - - def false_block(self): - return IfElseBlockGuard(False, self) - - def output(self, *outs): - if self.status == self.OUT_IF_ELSE_BLOCKS: - raise ValueError("output can only 
be invoked in the sub-block") - - out_table = self.output_table[ - 1 if self.status == self.IN_IF_ELSE_TRUE_BLOCKS else 0 - ] - parent_block = self._parent_block() - for each_out in outs: - check_type( - each_out, "each output", Variable, "fluid.layers.IfElse.output" - ) - # create outside tensor - outside_out = parent_block.create_var( - name=unique_name.generate_with_ignorable_key( - "_".join([self.helper.name, 'output']) - ), - dtype=each_out.dtype, - ) - out_table.append(outside_out) - - # assign local var to outside - assign(input=each_out, output=outside_out) - - def __call__(self): - if self.status != self.OUT_IF_ELSE_BLOCKS: - raise ValueError("IfElse::__call__ must be out of sub-block") - false_len, true_len = list(map(len, self.output_table)) - if false_len == 0 and true_len == 0: - raise ValueError( - "Must invoke true_block/false_block before " "__call__" - ) - elif false_len != true_len and false_len != 0 and true_len != 0: - raise ValueError("The output side must be same") - elif false_len == 0 or true_len == 0: - return self.output_table[0 if false_len != 0 else 1] - - # else none of false_len/true_len is zero - # merge together - rlist = [] - for false_var, true_var in zip(*self.output_table): - rlist.append( - merge_lod_tensor( - in_true=true_var, - in_false=false_var, - mask=self.cond, - x=self.cond, - level=0, - ) - ) - return rlist - - -def switch_case(branch_index, branch_fns, default=None, name=None): - ''' - :api_attr: Static Graph - - This operator is like a C++ switch/case statement. - - Args: - branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``. - branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors. - default(callable, optional): Callable that returns a structure of Tensors. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor|list(Tensor): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``, - or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``, - or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``. - - Raises: - TypeError: If the type of ``branch_index`` is not Tensor. - TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``. - TypeError: If the type of ``branch_fns`` is not dict, list or tuple. - TypeError: If the elements of ``branch_fns`` is not 2-tuple. - TypeError: If the first element of 2-tuple in ``branch_fns`` is not integer. - ValueError: If the first element of 2-tuple in ``branch_fns`` is not unique. - TypeError: If the second element of 2-tuple in ``branch_fns`` is not callable. - TypeError: If ``default`` is not None but it is not callable. - - Examples: - .. 
code-block:: python - - import paddle - - paddle.enable_static() - - def fn_1(): - return paddle.full(shape=[1, 2], dtype='float32', fill_value=1) - - def fn_2(): - return paddle.full(shape=[2, 2], dtype='int32', fill_value=2) - - def fn_3(): - return paddle.full(shape=[3], dtype='int32', fill_value=3) - - main_program = paddle.static.default_startup_program() - startup_program = paddle.static.default_main_program() - with paddle.static.program_guard(main_program, startup_program): - index_1 = paddle.full(shape=[1], dtype='int32', fill_value=1) - index_2 = paddle.full(shape=[1], dtype='int32', fill_value=2) - - out_1 = paddle.static.nn.switch_case( - branch_index=index_1, - branch_fns={1: fn_1, 2: fn_2}, - default=fn_3) - - out_2 = paddle.static.nn.switch_case( - branch_index=index_2, - branch_fns=[(1, fn_1), (2, fn_2)], - default=fn_3) - - # Argument default is None and no index matches. fn_3 will be called because of the max index 7. - out_3 = paddle.static.nn.switch_case( - branch_index=index_2, - branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)]) - - exe = paddle.static.Executor(paddle.CPUPlace()) - res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3]) - print(res_1) # [[1. 1.]] - print(res_2) # [[2 2] [2 2]] - print(res_3) # [3 3 3] - ''' - helper = LayerHelper('switch_case', **locals()) - - def _check_args(branch_index, branch_fns, default): - - check_variable_and_dtype( - branch_index, - 'branch_index', - ['uint8', 'int32', 'int64'], - 'switch_case', - ) - - if convert_dtype(branch_index.dtype) != "int64": - branch_index = cast(branch_index, "int64") - - check_type(branch_fns, 'branch_fns', (list, tuple, dict), 'switch_case') - - branch_fns = ( - branch_fns.items() if isinstance(branch_fns, dict) else branch_fns - ) - - branch_fns = ( - list(enumerate(branch_fns)) - if all(callable(fn) for fn in branch_fns) - else branch_fns - ) - - keys_of_fns = [] - for index_fn_pair in branch_fns: - if not isinstance(index_fn_pair, tuple): - raise TypeError( - _error_message( - "The elements' type", - "branch_fns", - "switch_case", - tuple, - type(branch_fns), - ) - ) - - if len(index_fn_pair) != 2: - raise TypeError( - _error_message( - "The tuple's size", - "branch_fns", - "switch_case", - "2", - str(len(index_fn_pair)) + "-tuple", - ) - ) - - key, fn = index_fn_pair - - if not isinstance(key, int): - raise TypeError( - _error_message( - "The key's type", - "branch_fns", - "switch_case", - int, - type(key), - ) - ) - - if key in keys_of_fns: - raise ValueError( - "The key in 'branch_fns' must be unique, but '{}' appears more than once.".format( - key - ) - ) - else: - keys_of_fns.append(key) - - if not callable(fn): - raise TypeError( - _error_message( - "The type of function for key {}".format(key), - "branch_fns", - "switch_case", - "callable", - type(fn), - ) - ) - - if default is None: - default = sorted(branch_fns)[-1][1] - branch_fns = sorted(branch_fns)[:-1] - elif not callable(default): - raise TypeError("The default in Op(case) must be callable.") - - pred_fn_pairs = [] - for index, fn in branch_fns: - new_index = fill_constant(shape=[1], dtype="int64", value=index) - pred = paddle.equal(branch_index, new_index) - pred_fn_pairs.append((pred, fn)) - - return pred_fn_pairs, default - - pred_fn_pairs, default = _check_args(branch_index, branch_fns, default) - false_fn = default - for pred, true_fn in pred_fn_pairs: - false_fn = partial(cond, pred=pred, true_fn=true_fn, false_fn=false_fn) - - final_fn = false_fn - return final_fn() - - -@templatedoc() -def 
reorder_lod_tensor_by_rank(x, rank_table): - """ - ${comment} - - Args: - x(${x_type}): ${x_comment}. - rank_table(${rank_table_type}): ${rank_table_comment}. - - Returns: - out(${out_type}): ${out_comment}. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - data_desc = (['input', [9], 0], ['ref', [5], 1]) - data = fluid.layers.data(name=data_desc[0][0], shape=data_desc[0][1]) - rank_data = fluid.layers.data(name=data_desc[1][0], shape=data_desc[1][1]) - table = fluid.layers.control_flow.lod_rank_table(rank_data) - new_data = fluid.layers.reorder_lod_tensor_by_rank( - x=data, rank_table=table) - - """ - - check_type(x, 'x', (Variable), 'reorder_lod_tensor_by_rank') - check_type( - rank_table, 'rank_table', (Variable), 'reorder_lod_tensor_by_rank' - ) - if rank_table.type != core.VarDesc.VarType.LOD_RANK_TABLE: - raise TypeError("The type of rank_table should be LOD_RANK_TABLE.") - - helper = LayerHelper('reorder_lod_tensor_by_rank', **locals()) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='reorder_lod_tensor_by_rank', - inputs={'X': [x], 'RankTable': [rank_table]}, - outputs={'Out': [out]}, - ) - return out - - -def is_empty(x, name=None): - """ - - Test whether a Tensor is empty. - - Args: - x (Tensor): The Tensor to be tested. - name (str, optional): The default value is ``None`` . Normally users - don't have to set this parameter. For more information, - please refer to :ref:`api_guide_Name` . - - Returns: - Tensor: A bool scalar Tensor. True if 'x' is an empty Tensor. - - Examples: - .. code-block:: python - - import paddle - - input = paddle.rand(shape=[4, 32, 32], dtype='float32') - res = paddle.is_empty(x=input) - print("res:", res) - # ('res:', Tensor: eager_tmp_1 - # - place: CPUPlace - # - shape: [1] - # - layout: NCHW - # - dtype: bool - # - data: [0]) - - """ - if in_dygraph_mode(): - return _C_ops.is_empty(x) - if _in_legacy_dygraph(): - return _legacy_C_ops.is_empty(x) - - check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64'], 'is_empty' - ) - check_type(name, "name", (str, type(None)), "is_empty") - - helper = LayerHelper("is_empty", **locals()) - cond = helper.create_variable_for_type_inference(dtype='bool') - cond.stop_gradient = True - helper.append_op( - type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]} - ) - return cond diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index e6ad3de9f48f4..6799550e7f63a 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1594,7 +1594,7 @@ def _dynamic_decode_declarative( max_step_num = tensor.fill_constant( shape=[1], dtype="int64", value=max_step_num ) - while_op = control_flow.While(cond, is_test=is_test) + while_op = paddle.static.nn.control_flow.While(cond, is_test=is_test) sequence_lengths = tensor.cast(paddle.zeros_like(initial_finished), "int64") sequence_lengths.stop_gradient = True diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py deleted file mode 100644 index 24857164dc30b..0000000000000 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers -from paddle.fluid.executor import Executor -from paddle.fluid.framework import Program, program_guard -from paddle.fluid.layers.control_flow import ( - ConditionalBlock, - merge_lod_tensor, - split_lod_tensor, -) -from paddle.fluid.optimizer import MomentumOptimizer - -paddle.enable_static() - - -class TestMNISTIfElseOp(unittest.TestCase): - # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 - def not_test_raw_api(self): - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): - image = layers.data(name='x', shape=[784], dtype='float32') - - label = layers.data(name='y', shape=[1], dtype='int64') - - limit = layers.fill_constant(shape=[1], dtype='int64', value=5) - cond = paddle.less_than(x=label, y=limit) - true_image, false_image = split_lod_tensor(input=image, mask=cond) - - true_out = paddle.tensor.create_tensor(dtype='float32') - true_cond = ConditionalBlock([cond]) - - with true_cond.block(): - hidden = layers.fc(input=true_image, size=100, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - layers.assign(input=prob, output=true_out) - - false_out = paddle.tensor.create_tensor(dtype='float32') - false_cond = ConditionalBlock([cond]) - - with false_cond.block(): - hidden = layers.fc(input=false_image, size=200, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - layers.assign(input=prob, output=false_out) - - prob = merge_lod_tensor( - in_true=true_out, in_false=false_out, mask=cond, x=image - ) - loss = layers.cross_entropy(input=prob, label=label) - avg_loss = paddle.mean(loss) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, startup_prog) - - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192), - batch_size=10, - ) - - place = core.CPUPlace() - exe = Executor(place) - - exe.run(startup_prog) - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - x_data = np.array([x[0] for x in data]).astype("float32") - y_data = np.array([x[1] for x in data]).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - outs = exe.run( - prog, feed={'x': x_data, 'y': y_data}, fetch_list=[avg_loss] - ) - print(outs[0]) - if outs[0] < 1.0: - return - self.assertFalse(True) - - # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 - def not_test_ifelse(self): - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): - image = layers.data(name='x', shape=[784], dtype='float32') - - label = layers.data(name='y', shape=[1], dtype='int64') - - limit = layers.fill_constant(shape=[1], dtype='int64', value=5) - cond = paddle.less_than(x=label, y=limit) - ie = layers.IfElse(cond) - - with ie.true_block(): - true_image = ie.input(image) - hidden = layers.fc(input=true_image, size=100, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - 
ie.output(prob) - - with ie.false_block(): - false_image = ie.input(image) - hidden = layers.fc(input=false_image, size=200, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - ie.output(prob) - - prob = ie() - loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = paddle.mean(loss) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, startup_prog) - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192), - batch_size=200, - ) - - place = core.CPUPlace() - exe = Executor(place) - - exe.run(startup_prog) - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - x_data = np.array([x[0] for x in data]).astype("float32") - y_data = np.array([x[1] for x in data]).astype("int64") - y_data = y_data.reshape((y_data.shape[0], 1)) - - outs = exe.run( - prog, feed={'x': x_data, 'y': y_data}, fetch_list=[avg_loss] - ) - print(outs[0]) - if outs[0] < 1.0: - return - self.assertFalse(True) - - -class TestIfElse(unittest.TestCase): - def set_test_case(self): - # condiction is: self.data < self.cond_value - self.cond_value = 0.5 - self.data = np.random.rand(25, 1).astype(np.float32) - - def numpy_cal(self): - s1 = self.data[np.where(self.data < self.cond_value)] - res = np.sum(np.exp(s1)) - s2 = self.data[np.where(self.data >= self.cond_value)] - res += np.sum(np.tanh(s2)) - return res - - def compare_ifelse_op_and_numpy(self, place): - self.set_test_case() - - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): - src = layers.data(name='data', shape=[1], dtype='float32') - cond = layers.fill_constant( - [1], dtype='float32', value=self.cond_value - ) - ifcond = paddle.less_than(x=src, y=cond) - ie = layers.IfElse(ifcond) - with ie.true_block(): - true_target = ie.input(src) - true_target = paddle.exp(true_target) - ie.output(true_target) - - with ie.false_block(): - false_target = ie.input(src) - false_target = paddle.tanh(false_target) - ie.output(false_target) - if_out = ie() - out = paddle.sum(if_out[0]) - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - fetch_list = [out] - (o1,) = exe.run( - fluid.default_main_program(), - feed={'data': self.data}, - fetch_list=[out], - ) - o2 = self.numpy_cal() - - np.testing.assert_allclose( - o1, - o2, - rtol=1e-05, - atol=1e-08, - ) - - def test_cpu(self): - self.compare_ifelse_op_and_numpy(fluid.CPUPlace()) - - def test_cuda(self): - if not core.is_compiled_with_cuda(): - return - self.compare_ifelse_op_and_numpy(fluid.CUDAPlace(0)) - - -class TestIfElseTrueBranch(TestIfElse): - def set_test_case(self): - # condiction is: self.data < self.cond_value - self.cond_value = 10.0 - self.data = np.random.rand(25, 1).astype(np.float32) - - -class TestIfElseFalseBranch(TestIfElse): - def set_test_case(self): - # condiction is: self.data < self.cond_value - self.cond_value = -10.0 - self.data = np.random.rand(25, 1).astype(np.float32) - - -class TestIfElseError(unittest.TestCase): - def test_input_type_error(self): - main_program = Program() - startup_program = Program() - with program_guard(main_program, startup_program): - src = layers.data(name='data', shape=[1], dtype='float32') - const_value = layers.fill_constant( - [1], dtype='float32', value=123.0 - ) - ifcond = paddle.less_than(x=src, y=const_value) - with self.assertRaises(TypeError): - ie = layers.IfElse(set()) - with self.assertRaises(TypeError): - ie = layers.IfElse(ifcond, set()) - - with 
self.assertRaises(TypeError): - ie = layers.IfElse(ifcond) - with ie.true_block(): - true_target = ie.input(src) - true_target = paddle.exp(true_target) - ie.output([]) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index fcfd783f71f6d..83eb2ae8aad8c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -174,7 +174,7 @@ def get_program(): cond = paddle.less_than(x=i, y=loop_len) auto.shard_tensor(cond, _g_process_mesh, [None]) - while_op = fluid.layers.While(cond=cond) + while_op = paddle.static.nn.control_flow.While(cond=cond) with while_op.block(): pre_input = fluid.layers.array_read(array=input_array, i=i) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_inference_helper.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_inference_helper.py index 542b1ba637936..b4d1cbca4cff0 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_inference_helper.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_inference_helper.py @@ -84,7 +84,9 @@ def test_hybrid_parallel_inference_helper_mp1pp2(self): ) print(cond_int.shape) cond = paddle.less_than(x=step_idx, y=max_len) - while_op = layers.While(cond, is_test=True) + while_op = paddle.static.nn.control_flow.While( + cond, is_test=True + ) with while_op.block(): with paddle.fluid.device_guard(f'{device}:all'): diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index e56a632c3de4d..1e516b04849f5 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1763,7 +1763,7 @@ def beam_search(): shape=[1], dtype=start_tokens.dtype, value=0 ) cond = paddle.less_than(x=step_idx, y=max_len) - while_op = layers.While(cond) + while_op = paddle.static.nn.control_flow.While(cond) # array states will be stored for each step. 
ids = layers.array_write( paddle.reshape(start_tokens, (-1, 1)), step_idx diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 8cc543a19f94d..c9c68eeafd92f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -161,7 +161,7 @@ def body(i, ten, y): i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) ten = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) - i, ten, y = fluid.layers.while_loop(cond, body, [i, ten, y]) + i, ten, y = paddle.static.nn.while_loop(cond, body, [i, ten, y]) return y[0] diff --git a/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py index d9db940b02943..8fcffc68e8686 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_set_value_op_mlu.py @@ -145,7 +145,7 @@ def _get_answer(self): # return i, x # # i = paddle.zeros(shape=(1, ), dtype='int32') -# i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) +# i, x = paddle.static.nn.while_loop(cond, body, [i, x]) # # def _get_answer(self): # self.data[0] = self.value diff --git a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py index 96af414883319..4366c759d1617 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py @@ -147,7 +147,7 @@ def _get_answer(self): # return i, x # i = paddle.zeros(shape=(1, ), dtype='int32') -# i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) +# i, x = paddle.static.nn.while_loop(cond, body, [i, x]) # def _get_answer(self): # self.data[0] = self.value diff --git a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py index c63f11b85910c..17b0711e91a0f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py @@ -64,8 +64,8 @@ def simple_net(self): cond2 = paddle.logical_or(x=j, y=array_len2) cond2 = paddle.ones(shape=[1], dtype='int32') cond2 = layers.cast(cond2, 'bool') - while_op = layers.While(cond=cond) - while_op2 = layers.While(cond=cond2) + while_op = paddle.static.nn.control_flow.While(cond=cond) + while_op2 = paddle.static.nn.control_flow.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py index b43bacec6ea37..908199c246d6f 100644 --- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py +++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_pool.py @@ -17,9 +17,20 @@ import numpy as np +import paddle + sys.path.append("../") from op_test import OpTest, skip_check_grad_ci -from test_reorder_lod_tensor import convert_to_offset + +paddle.enable_static() + + +def convert_to_offset(lod): + offset = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset[i].append(offset[i][-1] + seq_len) + return offset def compute_seqpool_sum(x, offset, out, pad_value=0.0): diff --git 
a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py index 3ab6e983d9019..e5980abea5d1e 100644 --- a/python/paddle/fluid/tests/unittests/test_case.py +++ b/python/paddle/fluid/tests/unittests/test_case.py @@ -24,6 +24,8 @@ import paddle.fluid.optimizer as optimizer from paddle.fluid.framework import Program, program_guard +paddle.enable_static() + class TestAPICase(unittest.TestCase): def test_return_single_var(self): @@ -46,25 +48,29 @@ def fn_3(): pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3 # call fn_1 - out_0 = layers.case( + out_0 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(pred_1, fn_1), (pred_1, fn_2)], default=fn_3 ) # call fn_2 - out_1 = layers.case( + out_1 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3 ) # call default fn_3 - out_2 = layers.case( + out_2 = paddle.static.nn.control_flow.case( pred_fn_pairs=((pred_2, fn_1), (pred_2, fn_2)), default=fn_3 ) # no default, call fn_2 - out_3 = layers.case(pred_fn_pairs=[(pred_1, fn_2)]) + out_3 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, fn_2)] + ) # no default, call fn_2. but pred_2 is false - out_4 = layers.case(pred_fn_pairs=[(pred_2, fn_2)]) + out_4 = paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_2, fn_2)] + ) place = ( fluid.CUDAPlace(0) @@ -109,7 +115,9 @@ def fn_3(): pred_1 = paddle.equal(x, y) # true pred_2 = paddle.equal(x, z) # false - out = layers.case(((pred_1, fn_1), (pred_2, fn_2)), fn_3) + out = paddle.static.nn.control_flow.case( + ((pred_1, fn_1), (pred_2, fn_2)), fn_3 + ) place = ( fluid.CUDAPlace(0) @@ -132,7 +140,7 @@ def test_nested_case(self): def fn_1(x=1): var_5 = layers.fill_constant(shape=[1], dtype='int32', value=5) var_6 = layers.fill_constant(shape=[1], dtype='int32', value=6) - out = layers.case( + out = paddle.static.nn.control_flow.case( pred_fn_pairs=[ ( var_5 < var_6, @@ -159,7 +167,7 @@ def fn_1(x=1): def fn_2(x=2): var_5 = layers.fill_constant(shape=[1], dtype='int32', value=5) var_6 = layers.fill_constant(shape=[1], dtype='int32', value=6) - out = layers.case( + out = paddle.static.nn.control_flow.case( pred_fn_pairs=[ (var_5 < var_6, partial(fn_1, x=x)), ( @@ -178,7 +186,7 @@ def fn_2(x=2): def fn_3(): var_5 = layers.fill_constant(shape=[1], dtype='int32', value=5) var_6 = layers.fill_constant(shape=[1], dtype='int32', value=6) - out = layers.case( + out = paddle.static.nn.control_flow.case( pred_fn_pairs=[ (var_5 < var_6, partial(fn_2, x=3)), ( @@ -203,15 +211,15 @@ def fn_3(): pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3 - out_1 = layers.case( + out_1 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3 ) - out_2 = layers.case( + out_2 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(pred_2, fn_1), (pred_1, fn_2)], default=fn_3 ) - out_3 = layers.case( + out_3 = paddle.static.nn.control_flow.case( pred_fn_pairs=[(x == y, fn_1), (x == z, fn_2)], default=fn_3 ) @@ -243,37 +251,49 @@ def fn_1(): # The type of 'pred_fn_pairs' in case must be list or tuple def type_error_pred_fn_pairs(): - layers.case(pred_fn_pairs=1, default=fn_1) + paddle.static.nn.control_flow.case( + pred_fn_pairs=1, default=fn_1 + ) self.assertRaises(TypeError, type_error_pred_fn_pairs) # The elements' type of 'pred_fn_pairs' in Op(case) must be tuple def type_error_pred_fn_1(): - layers.case(pred_fn_pairs=[1], default=fn_1) + paddle.static.nn.control_flow.case( + 
pred_fn_pairs=[1], default=fn_1 + ) self.assertRaises(TypeError, type_error_pred_fn_1) # The tuple's size of 'pred_fn_pairs' in Op(case) must be 2 def type_error_pred_fn_2(): - layers.case(pred_fn_pairs=[(1, 2, 3)], default=fn_1) + paddle.static.nn.control_flow.case( + pred_fn_pairs=[(1, 2, 3)], default=fn_1 + ) self.assertRaises(TypeError, type_error_pred_fn_2) # The pred's type of 'pred_fn_pairs' in Op(case) must be bool Variable def type_error_pred(): - layers.case(pred_fn_pairs=[(1, fn_1)], default=fn_1) + paddle.static.nn.control_flow.case( + pred_fn_pairs=[(1, fn_1)], default=fn_1 + ) self.assertRaises(TypeError, type_error_pred) # The function of pred_fn_pairs in case must be callable def type_error_fn(): - layers.case(pred_fn_pairs=[(pred_1, 2)], default=fn_1) + paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, 2)], default=fn_1 + ) self.assertRaises(TypeError, type_error_fn) # The default in Op(case) must be callable def type_error_default(): - layers.case(pred_fn_pairs=[(pred_1, fn_1)], default=fn_1()) + paddle.static.nn.control_flow.case( + pred_fn_pairs=[(pred_1, fn_1)], default=fn_1() + ) self.assertRaises(TypeError, type_error_default) @@ -308,7 +328,9 @@ def fn_2(): loss = paddle.mean(sum, name="f_2_loss") adagrad.minimize(loss) - layers.case(pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2) + paddle.static.nn.control_flow.case( + pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2 + ) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py index eff076c6a7871..13122c2794eb1 100644 --- a/python/paddle/fluid/tests/unittests/test_device_guard.py +++ b/python/paddle/fluid/tests/unittests/test_device_guard.py @@ -19,6 +19,8 @@ import paddle.fluid as fluid import paddle.fluid.core as core +paddle.enable_static() + def execute(main_program, startup_program): if paddle.is_compiled_with_cuda(): @@ -153,7 +155,7 @@ def test_without_kernel_op(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") with paddle.static.device_guard("cpu"): - while_op = fluid.layers.While(cond=cond) + while_op = paddle.static.nn.control_flow.While(cond=cond) with while_op.block(): i = paddle.increment(x=i, value=1) paddle.assign(paddle.less_than(x=i, y=loop_len), cond) diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py index 3e3eefd5d278d..f4071ac4149c9 100644 --- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py @@ -20,6 +20,8 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers +paddle.enable_static() + def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): fluid.default_startup_program().random_seed = 1 @@ -37,7 +39,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): shape=[1], dtype="int64", value=10, force_cpu=True ) cond = paddle.less_than(x=step_idx, y=max_len) - while_op = layers.While(cond) + while_op = paddle.static.nn.control_flow.While(cond) scores = layers.array_write(x, step_idx) with while_op.block(): bs = layers.cast(paddle.shape(x)[0], "int64") diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 
943642b857cce..13704cb6105c2 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -103,8 +103,8 @@ def run_main(self, place, with_data_parallel): array_len2.stop_gradient = True cond2 = paddle.less_than(x=j, y=array_len2) - while_op = layers.While(cond=cond) - while_op2 = layers.While(cond=cond2) + while_op = paddle.static.nn.control_flow.While(cond=cond) + while_op2 = paddle.static.nn.control_flow.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py index ce0eaff4e06bb..744f67ec42399 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py @@ -21,7 +21,14 @@ compute_seqpool_sqrt, compute_seqpool_sum, ) -from test_reorder_lod_tensor import convert_to_offset + + +def convert_to_offset(lod): + offset = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset[i].append(offset[i][-1] + seq_len) + return offset class TestFusionSeqPoolConcatOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py index 02e3a48e12a85..8082e79e78288 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py @@ -22,7 +22,14 @@ compute_seqpool_sum, ) from test_cvm_op import cvm_compute -from test_reorder_lod_tensor import convert_to_offset + + +def convert_to_offset(lod): + offset = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset[i].append(offset[i][-1] + seq_len) + return offset class TestFusionSeqPoolCVMConcatOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py deleted file mode 100644 index 375536b8cb684..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ /dev/null @@ -1,1033 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from test_imperative_base import new_program_scope - -import paddle -import paddle.fluid as fluid -import paddle.fluid.dygraph_utils as dygraph_utils -from paddle.fluid import core -from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper -from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard -from paddle.fluid.layer_helper import LayerHelper - - -class MyLayer(fluid.Layer): - def __init__(self): - super().__init__() - - def forward(self, inputs): - x = fluid.layers.relu(inputs) - self._x_for_debug = x - x = paddle.multiply(x, x) - x = paddle.sum(x) - return [x] - - -class MLP(fluid.Layer): - def __init__(self, input_size): - super().__init__() - self._linear1 = paddle.nn.Linear( - input_size, - 3, - weight_attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.1) - ), - bias_attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.1) - ), - ) - self._linear2 = paddle.nn.Linear( - 3, - 4, - weight_attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.1) - ), - bias_attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.1) - ), - ) - - def forward(self, inputs): - x = self._linear1(inputs) - x = self._linear2(x) - x = paddle.sum(x) - return x - - -class SimpleRNNCell(fluid.Layer): - def __init__(self, step_input_size, hidden_size, output_size, param_attr): - super().__init__() - self.step_input_size = step_input_size - self.hidden_size = hidden_size - self.output_size = output_size - self._dtype = core.VarDesc.VarType.FP32 - self.param_attr = param_attr - - i2h_param_shape = [self.step_input_size, self.hidden_size] - h2h_param_shape = [self.hidden_size, self.hidden_size] - h2o_param_shape = [self.output_size, self.hidden_size] - self._i2h_w = None - self._i2h_w = self.create_parameter( - attr=self.param_attr, - shape=i2h_param_shape, - dtype=self._dtype, - is_bias=False, - ) - self._h2h_w = self.create_parameter( - attr=self.param_attr, - shape=h2h_param_shape, - dtype=self._dtype, - is_bias=False, - ) - self._h2o_w = self.create_parameter( - attr=self.param_attr, - shape=h2o_param_shape, - dtype=self._dtype, - is_bias=False, - ) - - def forward(self, input, pre_hidden): - tmp_i2h = paddle.fluid.layers.nn.mul(input, self._i2h_w) - tmp_h2h = paddle.fluid.layers.nn.mul(pre_hidden, self._h2h_w) - hidden = paddle.add(tmp_h2h, tmp_i2h) - hidden = self._helper.append_activation(hidden, act='tanh') - out = paddle.fluid.layers.nn.mul(hidden, self._h2o_w) - softmax_out = paddle.nn.functional.softmax(out) - reduce_out = paddle.sum(softmax_out) - return reduce_out, hidden - - -class SimpleRNN(fluid.Layer): - def __init__(self): - super().__init__() - self.seq_len = 4 - self._cell = SimpleRNNCell( - 3, - 3, - 3, - fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.1)), - ) - - def forward(self, inputs): - outs = list() - pre_hiddens = list() - - init_hidden = self.create_parameter( - attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1) - ), - shape=[1, 3], - dtype='float32', - is_bias=False, - ) - pre_hidden = init_hidden - for i in range(self.seq_len): - input = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1]) - input = paddle.reshape(input, shape=[1, 3]) - out_softmax, pre_hidden = self._cell(input, pre_hidden) - outs.append(out_softmax) - - return outs, pre_hiddens - - -class TestImperative(unittest.TestCase): - def functional_dygraph_context(self): - self.assertFalse(fluid.dygraph.enabled()) - 
fluid.enable_dygraph() - self.assertTrue(fluid.dygraph.enabled()) - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - var_inp = paddle.to_tensor(np_inp) - mlp = MLP(input_size=2) - out = mlp(var_inp) - dy_out1 = out.numpy() - out.backward() - dy_grad1 = mlp._linear1.weight.gradient() - fluid.disable_dygraph() - self.assertFalse(fluid.dygraph.enabled()) - with fluid.dygraph.guard(): - self.assertTrue(fluid.dygraph.enabled()) - var_inp = paddle.to_tensor(np_inp) - mlp = MLP(input_size=2) - out = mlp(var_inp) - dy_out2 = out.numpy() - out.backward() - dy_grad2 = mlp._linear1.weight.gradient() - self.assertFalse(fluid.dygraph.enabled()) - np.testing.assert_array_equal(dy_out1, dy_out2) - np.testing.assert_array_equal(dy_grad1, dy_grad2) - - def test_functional_dygraph_context(self): - with _test_eager_guard(): - self.functional_dygraph_context() - self.functional_dygraph_context() - - def functional_paddle_imperative_dygraph_context(self): - self.assertFalse(paddle.in_dynamic_mode()) - paddle.disable_static() - self.assertTrue(paddle.in_dynamic_mode()) - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - var_inp = paddle.to_tensor(np_inp) - mlp = MLP(input_size=2) - out = mlp(var_inp) - dy_out1 = out.numpy() - out.backward() - dy_grad1 = mlp._linear1.weight.gradient() - paddle.enable_static() - self.assertFalse(paddle.in_dynamic_mode()) - paddle.disable_static() - self.assertTrue(paddle.in_dynamic_mode()) - var_inp = paddle.to_tensor(np_inp) - mlp = MLP(input_size=2) - out = mlp(var_inp) - dy_out2 = out.numpy() - out.backward() - dy_grad2 = mlp._linear1.weight.gradient() - paddle.enable_static() - self.assertFalse(paddle.in_dynamic_mode()) - np.testing.assert_array_equal(dy_out1, dy_out2) - np.testing.assert_array_equal(dy_grad1, dy_grad2) - - def test_functional_paddle_imperative_dygraph_context(self): - with _test_eager_guard(): - self.functional_paddle_imperative_dygraph_context() - self.functional_paddle_imperative_dygraph_context() - - def func_isinstance(self): - var = fluid.layers.data(shape=[1], name='x', dtype='float32') - self.assertTrue(isinstance(var, fluid.Variable)) - with fluid.dygraph.guard(): - if not _in_legacy_dygraph(): - var_base = paddle.to_tensor(np.array([3, 4, 5])) - self.assertTrue(isinstance(var_base, core.eager.Tensor)) - else: - var_base = paddle.to_tensor(np.array([3, 4, 5])) - self.assertTrue(isinstance(var_base, core.VarBase)) - self.assertTrue(isinstance(var_base, fluid.Variable)) - - def test_isinstance(self): - with _test_eager_guard(): - self.func_isinstance() - self.func_isinstance() - - def func_create_varbase(self): - x = np.ones([2, 2], np.float32) - y = np.zeros([3, 3], np.float32) - t = fluid.Tensor() - t.set(x, fluid.CPUPlace()) - if not _in_legacy_dygraph(): - egr_tmp = fluid.core.eager.Tensor( - value=x, place=fluid.core.CPUPlace() - ) - egr_tmp2 = fluid.core.eager.Tensor(y, fluid.core.CPUPlace()) - egr_tmp3 = paddle.to_tensor(x) - egr_tmp4 = fluid.core.eager.Tensor(y) - egr_tmp5 = fluid.core.eager.Tensor(value=x) - egr_tmp6 = fluid.core.eager.Tensor(t) - - np.testing.assert_array_equal(x, egr_tmp.numpy()) - np.testing.assert_array_equal(y, egr_tmp2.numpy()) - np.testing.assert_array_equal(x, egr_tmp3.numpy()) - np.testing.assert_array_equal(y, egr_tmp4.numpy()) - np.testing.assert_array_equal(x, egr_tmp5.numpy()) - np.testing.assert_array_equal(x, egr_tmp6.numpy()) - else: - tmp = fluid.core.VarBase(value=x, place=fluid.core.CPUPlace()) - tmp2 = fluid.core.VarBase(y, fluid.core.CPUPlace()) - tmp3 = paddle.to_tensor(x) - 
tmp4 = fluid.core.VarBase(y) - tmp5 = fluid.core.VarBase(value=x) - tmp6 = fluid.core.VarBase(t) - - np.testing.assert_array_equal(x, tmp.numpy()) - np.testing.assert_array_equal(y, tmp2.numpy()) - np.testing.assert_array_equal(x, tmp3.numpy()) - np.testing.assert_array_equal(y, tmp4.numpy()) - np.testing.assert_array_equal(x, tmp5.numpy()) - np.testing.assert_array_equal(x, tmp6.numpy()) - - def test_create_varbase(self): - with fluid.dygraph.guard(): - with _test_eager_guard(): - self.func_create_varbase() - self.func_create_varbase() - - def test_no_grad_guard(self): - data = np.array([[2, 3], [4, 5]]).astype('float32') - with fluid.dygraph.guard(): - l0 = paddle.nn.Linear(2, 2) - self.assertIsNone(l0.weight._grad_ivar()) - l1 = paddle.nn.Linear(2, 2) - with fluid.dygraph.no_grad(): - self.assertTrue(l1.weight.stop_gradient is False) - tmp = l1.weight * 2 - self.assertTrue(tmp.stop_gradient) - x = paddle.to_tensor(data) - y = paddle.add(l0(x), tmp) - o = l1(y) - o.backward() - - self.assertIsNone(tmp._grad_ivar()) - self.assertIsNotNone(l0.weight._grad_ivar()) - - def test_paddle_imperative_no_grad_guard(self): - data = np.array([[2, 3], [4, 5]]).astype('float32') - with fluid.dygraph.guard(): - l0 = paddle.nn.Linear(2, 2) - self.assertIsNone(l0.weight._grad_ivar()) - l1 = paddle.nn.Linear(2, 2) - with paddle.no_grad(): - self.assertTrue(l1.weight.stop_gradient is False) - tmp = l1.weight * 2 - self.assertTrue(tmp.stop_gradient) - x = paddle.to_tensor(data) - y = paddle.add(l0(x), tmp) - o = l1(y) - o.backward() - - self.assertIsNone(tmp._grad_ivar()) - self.assertIsNotNone(l0.weight._grad_ivar()) - - def test_paddle_imperative_set_grad_enabled(self): - data = np.array([[2, 3], [4, 5]]).astype('float32') - with fluid.dygraph.guard(): - l0 = paddle.nn.Linear(2, 2) - self.assertIsNone(l0.weight._grad_ivar()) - l1 = paddle.nn.Linear(2, 2) - with paddle.set_grad_enabled(False): - self.assertTrue(l1.weight.stop_gradient is False) - tmp = l1.weight * 2 - with paddle.set_grad_enabled(True): - tmp2 = l1.weight * 2 - self.assertTrue(tmp.stop_gradient) - self.assertTrue(tmp2.stop_gradient is False) - x = paddle.to_tensor(data) - y = paddle.add(l0(x), tmp2) - o = l1(y) - o.backward() - - self.assertIsNone(tmp._grad_ivar()) - self.assertIsNotNone(tmp2._grad_ivar()) - self.assertIsNotNone(l0.weight._grad_ivar()) - - def test_paddle_imperative_is_grad_enabled(self): - with fluid.dygraph.guard(): - with paddle.set_grad_enabled(False): - self.assertTrue(paddle.is_grad_enabled() is False) - with paddle.set_grad_enabled(True): - self.assertTrue(paddle.is_grad_enabled()) - - def func_sum_op(self): - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs = [] - for _ in range(10): - tmp = paddle.to_tensor(x) - tmp.stop_gradient = False - inputs.append(tmp) - ret = paddle.add_n(inputs) - loss = paddle.sum(ret) - loss.backward() - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - tmp = paddle.to_tensor(x) - tmp.stop_gradient = False - inputs2.append(tmp) - ret2 = paddle.add_n(inputs2) - loss2 = paddle.sum(ret2) - fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - loss2.backward() - - np.testing.assert_allclose(ret.numpy(), x * 10, rtol=1e-05) - np.testing.assert_allclose(inputs[0].gradient(), x, rtol=1e-05) - np.testing.assert_allclose(ret2.numpy(), x * 10, rtol=1e-05) - a = inputs2[0].gradient() - np.testing.assert_allclose(inputs2[0].gradient(), x, rtol=1e-05) - - def test_sum_op(self): - with _test_eager_guard(): - self.func_sum_op() - self.func_sum_op() - - def 
func_empty_var(self): - with fluid.dygraph.guard(): - cur_program = fluid.Program() - cur_block = cur_program.current_block() - # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good. - if _in_legacy_dygraph(): - new_variable = cur_block.create_var( - name="X", shape=[-1, 23, 48], dtype='float32' - ) - else: - new_variable = cur_block.create_var( - name="X", shape=[1, 23, 48], dtype='float32' - ) - try: - new_variable.numpy() - except Exception as e: - assert type(e) == ValueError - - try: - new_variable.backward() - except Exception as e: - assert type(e) == core.EnforceNotMet - try: - new_variable.clear_gradient() - except Exception as e: - assert type(e) == core.EnforceNotMet - - def test_empty_var(self): - with _test_eager_guard(): - self.func_empty_var() - self.func_empty_var() - - def func_empty_grad(self): - with fluid.dygraph.guard(): - x = np.ones([2, 2], np.float32) - new_var = paddle.to_tensor(x) - self.assertIsNone(new_var.gradient()) - try: - new_var.clear_gradient() - except Exception as e: - assert type(e) == core.EnforceNotMet - - with fluid.dygraph.guard(): - cur_program = fluid.Program() - cur_block = cur_program.current_block() - # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good. - if _in_legacy_dygraph(): - new_variable = cur_block.create_var( - name="X", shape=[-1, 23, 48], dtype='float32' - ) - else: - new_variable = cur_block.create_var( - name="X", shape=[1, 23, 48], dtype='float32' - ) - try: - new_variable.gradient() - except Exception as e: - assert type(e) == ValueError - - def test_empty_grad(self): - with _test_eager_guard(): - self.func_empty_grad() - self.func_empty_grad() - - def func_set_persistable(self): - with fluid.dygraph.guard(): - x = np.ones([2, 2], np.float32) - new_var = paddle.to_tensor(x) - self.assertFalse(new_var.persistable) - new_var.persistable = True - self.assertTrue(new_var.persistable) - - def test_set_persistable(self): - with _test_eager_guard(): - self.func_set_persistable() - self.func_set_persistable() - - def func_layer(self): - with fluid.dygraph.guard(): - l = fluid.Layer("l") - self.assertRaises(NotImplementedError, l.forward, []) - - def test_layer(self): - with _test_eager_guard(): - self.func_layer() - self.func_layer() - - def func_layer_in_out(self): - np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) - with fluid.dygraph.guard(): - var_inp = paddle.to_tensor(np_inp) - var_inp.stop_gradient = False - l = MyLayer() - x = l(var_inp)[0] - self.assertIsNotNone(x) - dy_out = x.numpy() - x.backward() - dy_grad = l._x_for_debug.gradient() - - with fluid.dygraph.guard(): - var_inp2 = paddle.to_tensor(np_inp) - var_inp2.stop_gradient = False - l2 = MyLayer() - x2 = l2(var_inp2)[0] - self.assertIsNotNone(x2) - dy_out2 = x2.numpy() - fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - x2.backward() - dy_grad2 = l2._x_for_debug.gradient() - - with new_program_scope(): - inp = fluid.layers.data( - name="inp", shape=[3], append_batch_size=False - ) - l = MyLayer() - x = l(inp)[0] - param_grads = fluid.backward.append_backward( - x, parameter_list=[l._x_for_debug.name] - )[0] - exe = fluid.Executor( - fluid.CPUPlace() - if not core.is_compiled_with_cuda() - else fluid.CUDAPlace(0) - ) - - static_out, static_grad = exe.run( - feed={inp.name: np_inp}, - fetch_list=[x.name, param_grads[1].name], - ) - - np.testing.assert_array_equal(dy_out, static_out) - np.testing.assert_array_equal(dy_grad, static_grad) - 
np.testing.assert_array_equal(dy_out2, static_out) - np.testing.assert_array_equal(dy_grad2, static_grad) - - def test_layer_in_out(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - with _test_eager_guard(): - self.func_layer_in_out() - self.func_layer_in_out() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def func_mlp(self): - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - with fluid.dygraph.guard(): - var_inp = paddle.to_tensor(np_inp) - mlp = MLP(input_size=2) - out = mlp(var_inp) - dy_out = out.numpy() - out.backward() - dy_grad = mlp._linear1.weight.gradient() - - with fluid.dygraph.guard(): - var_inp2 = paddle.to_tensor(np_inp) - mlp2 = MLP(input_size=2) - out2 = mlp2(var_inp2) - dy_out2 = out2.numpy() - fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - out2.backward() - dy_grad2 = mlp2._linear1.weight.gradient() - - with new_program_scope(): - inp = fluid.layers.data( - name="inp", shape=[2, 2], append_batch_size=False - ) - mlp = MLP(input_size=2) - out = mlp(inp) - param_grads = fluid.backward.append_backward( - out, parameter_list=[mlp._linear1.weight.name] - )[0] - exe = fluid.Executor( - fluid.CPUPlace() - if not core.is_compiled_with_cuda() - else fluid.CUDAPlace(0) - ) - exe.run(fluid.default_startup_program()) - - static_out, static_grad = exe.run( - feed={inp.name: np_inp}, - fetch_list=[out.name, param_grads[1].name], - ) - - np.testing.assert_allclose(dy_out, static_out, rtol=1e-05) - np.testing.assert_allclose(dy_grad, static_grad, rtol=1e-05) - np.testing.assert_allclose(dy_out2, static_out, rtol=1e-05) - np.testing.assert_allclose(dy_grad2, static_grad, rtol=1e-05) - - params = mlp.parameters(True) - self.assertEqual("linear_0.w_0", params[0].name) - self.assertEqual("linear_0.b_0", params[1].name) - self.assertEqual("linear_1.w_0", params[2].name) - self.assertEqual("linear_1.b_0", params[3].name) - self.assertEqual(len(params), 4) - - sublayers = mlp.sublayers() - self.assertEqual(mlp._linear1, sublayers[0]) - self.assertEqual(mlp._linear2, sublayers[1]) - self.assertEqual(len(sublayers), 2) - - def test_mlp(self): - with _test_eager_guard(): - self.func_mlp() - self.func_mlp() - - def test_gradient_accumulation(self): - def test_single_api(sort_sum_gradient): - fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) - x = paddle.to_tensor(5.0, stop_gradient=False) - for i in range(10): - y = paddle.pow(x, 4.0) - y.backward() - self.assertEqual(x.grad.numpy(), (i + 1) * 500) - x.clear_gradient() - self.assertEqual(x.grad.numpy(), 0.0) - for i in range(10): - y = paddle.pow(x, 4.0) - y.backward() - self.assertEqual(x.grad.numpy(), (i + 1) * 500) - x.clear_grad() - self.assertEqual(x.grad.numpy(), 0.0) - - def test_simple_net(sort_sum_gradient): - fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) - x = paddle.to_tensor(5.0, stop_gradient=False) - y = paddle.to_tensor(2.0, stop_gradient=False) - z = paddle.to_tensor(3.0, stop_gradient=False) - - def fun(x, y, z): - loss1 = x * x * y - loss2 = x * z - loss1.backward(retain_graph=True) - loss2.backward(retain_graph=True) - np.testing.assert_array_equal(x.grad.numpy(), [23.0]) - np.testing.assert_array_equal(y.grad.numpy(), [25.0]) - np.testing.assert_array_equal(z.grad.numpy(), [5.0]) - x.clear_grad() - y.clear_grad() - z.clear_grad() - - dx = paddle.grad([loss1], x, create_graph=True)[0] - loss = loss1 + loss2 + dx - # loss = x*x*y + x*z + 2*x*y - return loss - - loss = fun(x, y, z) - loss.backward(retain_graph=True) - # x.grad = 2*x*y + 
z + 2*y = 27 - np.testing.assert_array_equal(x.grad.numpy(), [27]) - - loss.backward(retain_graph=True) - np.testing.assert_array_equal(x.grad.numpy(), [54]) - - loss.backward() - np.testing.assert_array_equal(x.grad.numpy(), [81]) - - with self.assertRaises(RuntimeError): - loss.backward() - - loss1 = x * x * y - loss2 = x * z - dx = paddle.grad([loss1], x, create_graph=True)[0] - loss = loss1 + loss2 + dx - loss.backward() - np.testing.assert_array_equal(dx.grad.numpy(), [1]) - np.testing.assert_array_equal(x.grad.numpy(), [108]) - - def test_mlp(sort_sum_gradient): - fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) - input_size = 5 - paddle.seed(1) - mlp1 = MLP(input_size=input_size) - # generate the gradient of each step - mlp2 = MLP(input_size=input_size) - - expected_weight1_grad = 0.0 - expected_bias1_grad = 0.0 - expected_weight2_grad = 0.0 - expected_bias2_grad = 0.0 - - for batch_id in range(100): - x = paddle.uniform([10, input_size]) - detach_x = x.detach() - clear_loss = mlp2(detach_x) - clear_loss.backward() - expected_weight1_grad = ( - expected_weight1_grad + mlp2._linear1.weight.grad.numpy() - ) - expected_bias1_grad = ( - expected_bias1_grad + mlp2._linear1.bias.grad.numpy() - ) - expected_weight2_grad = ( - expected_weight2_grad + mlp2._linear2.weight.grad.numpy() - ) - expected_bias2_grad = ( - expected_bias2_grad + mlp2._linear2.bias.grad.numpy() - ) - - loss = mlp1(x) - loss.backward() - - np.testing.assert_array_equal(loss.grad.numpy(), [1]) - np.testing.assert_allclose( - mlp1._linear1.weight.grad.numpy(), - expected_weight1_grad, - rtol=1e-05, - ) - np.testing.assert_allclose( - mlp1._linear1.bias.grad.numpy(), - expected_bias1_grad, - rtol=1e-05, - ) - np.testing.assert_allclose( - mlp1._linear2.weight.grad.numpy(), - expected_weight2_grad, - rtol=1e-05, - ) - np.testing.assert_allclose( - mlp1._linear2.bias.grad.numpy(), - expected_bias2_grad, - rtol=1e-05, - ) - - mlp2.clear_gradients() - np.testing.assert_array_equal(clear_loss.grad.numpy(), [1]) - if ((batch_id + 1) % 10) % 2 == 0: - mlp1.clear_gradients() - expected_weight1_grad = 0.0 - expected_bias1_grad = 0.0 - expected_weight2_grad = 0.0 - expected_bias2_grad = 0.0 - elif ((batch_id + 1) % 10) % 2 == 1: - mlp1.clear_gradients() - mlp1._linear1.weight._set_grad_ivar( - paddle.ones([input_size, 3]) - ) - mlp1._linear2.weight._set_grad_ivar(paddle.ones([3, 4])) - expected_weight1_grad = 1.0 - expected_bias1_grad = 0.0 - expected_weight2_grad = 1.0 - expected_bias2_grad = 0.0 - - with fluid.dygraph.guard(): - test_single_api(False) - test_single_api(True) - test_simple_net(False) - test_simple_net(True) - test_mlp(False) - test_mlp(True) - - def func_dygraph_vs_static(self): - np_inp1 = np.random.rand(4, 3, 3) - np_inp2 = np.random.rand(4, 3, 3) - - # dynamic graph - with fluid.dygraph.guard(): - inp1 = paddle.to_tensor(np_inp1) - inp2 = paddle.to_tensor(np_inp2) - if np.sum(np_inp1) < np.sum(np_inp2): - x = paddle.add(inp1, inp2) - else: - x = paddle.subtract(inp1, inp2) - dygraph_result = x.numpy() - - # static graph - with new_program_scope(): - inp_data1 = fluid.layers.data( - name='inp1', shape=[3, 3], dtype=np.float32 - ) - inp_data2 = fluid.layers.data( - name='inp2', shape=[3, 3], dtype=np.float32 - ) - - a = paddle.expand( - paddle.reshape(paddle.sum(inp_data1), [1, 1]), - [4, -1], - ) - b = paddle.expand( - paddle.reshape(paddle.sum(inp_data2), [1, 1]), - [4, -1], - ) - cond = paddle.less_than(x=a, y=b) - - ie = fluid.layers.IfElse(cond) - with ie.true_block(): - d1 = 
ie.input(inp_data1) - d2 = ie.input(inp_data2) - d3 = paddle.add(d1, d2) - ie.output(d3) - - with ie.false_block(): - d1 = ie.input(inp_data1) - d2 = ie.input(inp_data2) - d3 = paddle.subtract(d1, d2) - ie.output(d3) - out = ie() - - exe = fluid.Executor( - fluid.CPUPlace() - if not core.is_compiled_with_cuda() - else fluid.CUDAPlace(0) - ) - static_result = exe.run( - fluid.default_main_program(), - feed={'inp1': np_inp1, 'inp2': np_inp2}, - fetch_list=out, - )[0] - np.testing.assert_allclose(dygraph_result, static_result, rtol=1e-05) - - def test_dygraph_vs_static(self): - with _test_eager_guard(): - self.func_dygraph_vs_static() - self.func_dygraph_vs_static() - - def func_rnn(self): - np_inp = np.array( - [ - [1.0, 2.0, 3.0], - [4.0, 5.0, 6.0], - [7.0, 8.0, 9.0], - [10.0, 11.0, 12.0], - ] - ) - np_inp = np_inp.reshape((1, 4, 3)) - np_inp = np_inp.astype(np.float32) - with fluid.dygraph.guard(): - var_inp = paddle.to_tensor(np_inp) - var_inp = paddle.reshape(var_inp, shape=[1, 4, 3]) - simple_rnn = SimpleRNN() - outs, pre_hiddens = simple_rnn.forward(var_inp) - dy_out = outs[3].numpy() - outs[3].backward() - dy_grad_h2o = simple_rnn._cell._h2o_w.gradient() - dy_grad_h2h = simple_rnn._cell._h2h_w.gradient() - dy_grad_i2h = simple_rnn._cell._i2h_w.gradient() - - with fluid.dygraph.guard(): - var_inp2 = paddle.to_tensor(np_inp) - var_inp2 = paddle.reshape(var_inp2, shape=[1, 4, 3]) - simple_rnn2 = SimpleRNN() - outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2) - dy_out2 = outs2[3].numpy() - fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - outs2[3].backward() - dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient() - dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient() - dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient() - - with new_program_scope(): - inp = fluid.layers.data( - name="inp", shape=[1, 4, 3], append_batch_size=False - ) - simple_rnn = SimpleRNN() - outs, pre_hiddens = simple_rnn(inp) - param_grads = fluid.backward.append_backward(outs[3]) - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - ( - static_out, - static_grad_h2o, - static_grad_h2h, - static_grad_i2h, - ) = exe.run( - feed={inp.name: np_inp}, - fetch_list=[ - outs[3].name, - param_grads[0][1].name, - param_grads[1][1].name, - param_grads[2][1].name, - ], - ) - - np.testing.assert_array_equal(dy_out, static_out) - np.testing.assert_array_equal(dy_grad_h2o, static_grad_h2o) - np.testing.assert_array_equal(dy_grad_h2h, static_grad_h2h) - np.testing.assert_array_equal(dy_grad_i2h, static_grad_i2h) - np.testing.assert_array_equal(dy_out2, static_out) - np.testing.assert_array_equal(dy_grad_h2o2, static_grad_h2o) - np.testing.assert_array_equal(dy_grad_h2h2, static_grad_h2h) - np.testing.assert_array_equal(dy_grad_i2h2, static_grad_i2h) - - def test_rnn(self): - with _test_eager_guard(): - self.func_rnn() - self.func_rnn() - - def func_layer_attrs(self): - layer = fluid.dygraph.Layer("test") - layer.test_attr = 1 - self.assertFalse(hasattr(layer, "whatever")) - self.assertTrue(hasattr(layer, "test_attr")) - self.assertEqual(layer.test_attr, 1) - - my_layer = MyLayer() - my_layer.w1 = my_layer.create_parameter([3, 3]) - my_layer.add_parameter('w2', None) - self.assertEqual(len(my_layer.parameters()), 1) - self.assertRaises(TypeError, my_layer.__setattr__, 'w1', 'str') - my_layer.w1 = None - self.assertEqual(len(my_layer.parameters()), 0) - my_layer.l1 = paddle.nn.Linear(3, 3) - self.assertEqual(len(my_layer.sublayers()), 1) - self.assertRaises(TypeError, my_layer.__setattr__, 'l1', 'str') 
- my_layer.l1 = None - self.assertEqual(len(my_layer.sublayers()), 0) - - def test_layer_attrs(self): - with _test_eager_guard(): - self.func_layer_attrs() - self.func_layer_attrs() - - -class TestDygraphUtils(unittest.TestCase): - def func_append_activation_in_dygraph_exception(self): - with new_program_scope(): - np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32) - a = fluid.layers.data("a", [10, 20]) - func = dygraph_utils._append_activation_in_dygraph - self.assertRaises(AssertionError, func, a, act="sigmoid") - - def test_append_activation_in_dygraph_exception(self): - with _test_eager_guard(): - self.func_append_activation_in_dygraph_exception() - self.func_append_activation_in_dygraph_exception() - - def func_append_activation_in_dygraph1(self): - a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) - func = dygraph_utils._append_activation_in_dygraph - with fluid.dygraph.guard(): - a = paddle.to_tensor(a_np) - res1 = func(a, act="hard_sigmoid") - res2 = paddle.nn.functional.hardsigmoid(a, slope=0.2) - np.testing.assert_array_equal(res1.numpy(), res2.numpy()) - - def test_append_activation_in_dygraph1(self): - with _test_eager_guard(): - self.func_append_activation_in_dygraph1() - self.func_append_activation_in_dygraph1() - - def func_append_activation_in_dygraph2(self): - a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) - func = dygraph_utils._append_activation_in_dygraph - with fluid.dygraph.guard(): - a = paddle.to_tensor(a_np) - res1 = func(a, act="sigmoid", use_mkldnn=True, use_cudnn=True) - res2 = paddle.nn.functional.sigmoid(a) - np.testing.assert_allclose(res1.numpy(), res2.numpy(), rtol=1e-05) - - def test_append_activation_in_dygraph2(self): - with _test_eager_guard(): - self.func_append_activation_in_dygraph2() - self.func_append_activation_in_dygraph2() - - def func_append_activation_in_dygraph3(self): - a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) - helper = LayerObjectHelper(fluid.unique_name.generate("test")) - func = helper.append_activation - with fluid.dygraph.guard(): - a = paddle.to_tensor(a_np) - res1 = func(a, act="sigmoid", use_cudnn=True) - res2 = paddle.nn.functional.sigmoid(a) - np.testing.assert_array_equal(res1.numpy(), res2.numpy()) - - def test_append_activation_in_dygraph3(self): - with _test_eager_guard(): - self.func_append_activation_in_dygraph3() - self.func_append_activation_in_dygraph3() - - def func_append_activation_in_dygraph_use_mkldnn(self): - a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) - helper = LayerHelper( - fluid.unique_name.generate("test"), act="relu", use_mkldnn=True - ) - func = helper.append_activation - with fluid.dygraph.guard(): - a = paddle.to_tensor(a_np) - res1 = func(a) - res2 = fluid.layers.relu(a) - np.testing.assert_array_equal(res1.numpy(), res2.numpy()) - - def test_append_activation_in_dygraph_use_mkldnn(self): - with _test_eager_guard(): - self.func_append_activation_in_dygraph_use_mkldnn() - self.func_append_activation_in_dygraph_use_mkldnn() - - def func_append_activation_in_dygraph_global_use_mkldnn(self): - a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) - helper = LayerHelper(fluid.unique_name.generate("test"), act="relu") - func = helper.append_activation - with fluid.dygraph.guard(fluid.core.CPUPlace()): - a = paddle.to_tensor(a_np) - fluid.set_flags({'FLAGS_use_mkldnn': True}) - try: - res1 = func(a) - finally: - fluid.set_flags({'FLAGS_use_mkldnn': False}) - res2 = fluid.layers.relu(a) - 
np.testing.assert_array_equal(res1.numpy(), res2.numpy()) - - def test_append_activation_in_dygraph_global_use_mkldnn(self): - with _test_eager_guard(): - self.func_append_activation_in_dygraph_global_use_mkldnn() - self.func_append_activation_in_dygraph_global_use_mkldnn() - - def func_append_bias_in_dygraph_exception(self): - with new_program_scope(): - np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32) - a = fluid.layers.data("a", [10, 20]) - func = dygraph_utils._append_bias_in_dygraph - self.assertRaises(AssertionError, func, a) - - def test_append_bias_in_dygraph_exception(self): - with _test_eager_guard(): - self.func_append_bias_in_dygraph_exception() - self.func_append_bias_in_dygraph_exception() - - def func_append_bias_in_dygraph(self): - a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) - func = dygraph_utils._append_bias_in_dygraph - with fluid.dygraph.guard(): - a = paddle.to_tensor(a_np) - res1 = func(a, bias=a) - res2 = paddle.add(a, a) - np.testing.assert_array_equal(res1.numpy(), res2.numpy()) - - def test_append_bias_in_dygraph(self): - with _test_eager_guard(): - self.func_append_bias_in_dygraph() - self.func_append_bias_in_dygraph() - - -class TestDygraphGuardWithError(unittest.TestCase): - def func_without_guard(self): - with fluid.dygraph.guard(): - x = paddle.to_tensor(np.zeros([10, 10])) - with self.assertRaisesRegexp( - TypeError, "Please use `with fluid.dygraph.guard()" - ): - y = paddle.matmul(x, x) - - def test_without_guard(self): - with _test_eager_guard(): - self.func_without_guard() - self.func_without_guard() - - -class TestMetaclass(unittest.TestCase): - def func_metaclass(self): - self.assertEqual(type(MyLayer).__name__, 'type') - self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type') - if not _in_legacy_dygraph(): - self.assertEqual( - type(paddle.fluid.core.eager.Tensor).__name__, 'type' - ) - else: - self.assertEqual( - type(paddle.fluid.core.VarBase).__name__, 'pybind11_type' - ) - - def test_metaclass(self): - with _test_eager_guard(): - self.func_metaclass() - self.func_metaclass() - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py index b0f6418e24c2c..aac9152195be5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py @@ -24,6 +24,8 @@ LOADED_VAR_SUFFIX = ".load_0" +paddle.enable_static() + def while_softmax_regression(img): def cond(i, times, pred): @@ -37,7 +39,7 @@ def body(i, times, pred): i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) times = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5) pred = fluid.layers.fc(input=img, size=10, act='softmax') - i, times, pred = fluid.layers.while_loop( + i, times, pred = paddle.static.nn.while_loop( cond=cond, body=body, loop_vars=[i, times, pred] ) return pred diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py deleted file mode 100644 index 852f4e550326e..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# nlp model stack of op operate on lod. It's a classical test case in optimize pass. - -import unittest - -import numpy as np - -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers -from paddle.fluid import Program, compiler, program_guard -from paddle.fluid.executor import Executor -from paddle.fluid.optimizer import MomentumOptimizer - - -class TestIrMemoryOptimizeIfElseOp(unittest.TestCase): - def check_network_convergence( - self, use_cuda=True, use_mem_opt=False, iter_num=5 - ): - paddle.seed(100) - paddle.framework.random._manual_program_seed(100) - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): - image = layers.data(name='x', shape=[784], dtype='float32') - - label = layers.data(name='y', shape=[1], dtype='int64') - - limit = layers.fill_constant(shape=[1], dtype='int64', value=5) - cond = paddle.less_than(x=label, y=limit) - ie = layers.IfElse(cond) - - with ie.true_block(): - true_image = ie.input(image) - hidden = layers.fc(input=true_image, size=100, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - ie.output(prob) - - with ie.false_block(): - false_image = ie.input(image) - hidden = layers.fc(input=false_image, size=200, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - ie.output(prob) - - prob = ie() - loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = paddle.mean(loss) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, startup_prog) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=200 - ) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = Executor(place) - - exec_strategy = fluid.ExecutionStrategy() - exec_strategy._use_device = ( - core.DeviceType.CUDA if use_cuda else core.DeviceType.CPU - ) - - build_strategy = fluid.BuildStrategy() - build_strategy.memory_optimize = use_mem_opt - - train_cp = compiler.CompiledProgram(fluid.default_main_program()) - train_cp = train_cp.with_data_parallel( - loss_name=avg_loss.name, - exec_strategy=exec_strategy, - build_strategy=build_strategy, - ) - fetch_list = [avg_loss.name] - - exe.run(startup_prog) - PASS_NUM = 100 - loop = 0 - ret = [] - for pass_id in range(PASS_NUM): - for data in train_reader(): - x_data = np.array([x[0] for x in data]).astype("float32") - y_data = np.array([x[1] for x in data]).astype("int64") - y_data = y_data.reshape((y_data.shape[0], 1)) - - outs = exe.run( - train_cp, - feed={'x': x_data, 'y': y_data}, - fetch_list=[avg_loss], - ) - - loop += 1 - ret.append(outs[0]) - if iter_num == loop: - return ret - return ret - - def test_ifelse(self): - ret1 = self.check_network_convergence(False, True) - print(ret1) - ret2 = self.check_network_convergence(False, False) - print(ret2) - np.testing.assert_allclose(ret1, ret2, rtol=1e-05) - - if fluid.core.is_compiled_with_cuda(): - ret1 = 
self.check_network_convergence(True, True) - print(ret1) - ret2 = self.check_network_convergence(True, False) - print(ret2) - np.testing.assert_allclose(ret1, ret2, rtol=1e-05) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 4fb6645b9673f..e166b0adb0914 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1387,7 +1387,7 @@ def cond(i): def body(i): return i + 1 - out = layers.while_loop(cond, body, [i]) + out = paddle.static.nn.while_loop(cond, body, [i]) static_ret = self.get_static_graph_result(feed={}, fetch_list=out) with self.dynamic_graph(): @@ -1400,14 +1400,14 @@ def cond1(i): def body1(i): return i + 1 - dy_ret = layers.while_loop(cond1, body1, [i]) + dy_ret = paddle.static.nn.while_loop(cond1, body1, [i]) with self.assertRaises(ValueError): j = layers.fill_constant(shape=[1], dtype='int64', value=0) def body2(i): return i + 1, i + 2 - layers.while_loop(cond1, body2, [j]) + paddle.static.nn.while_loop(cond1, body2, [j]) np.testing.assert_array_equal(static_ret[0], dy_ret[0].numpy()) @@ -1659,10 +1659,12 @@ def fn_3(): pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1 - out_1 = layers.case( + out_1 = paddle.static.nn.case( pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3 ) - out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)]) + out_2 = paddle.static.nn.case( + pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)] + ) place = ( fluid.CUDAPlace(0) @@ -1682,10 +1684,10 @@ def fn_3(): pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1 - out_1 = layers.case( + out_1 = paddle.static.nn.case( pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3 ) - out_2 = layers.case( + out_2 = paddle.static.nn.case( pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)] ) eager_dynamic_res1 = out_1.numpy() @@ -1699,10 +1701,12 @@ def fn_3(): pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1 pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1 - out_1 = layers.case( + out_1 = paddle.static.nn.case( pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3 ) - out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)]) + out_2 = paddle.static.nn.case( + pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)] + ) dynamic_res1 = out_1.numpy() dynamic_res2 = out_2.numpy() @@ -1725,17 +1729,17 @@ def fn_3(): index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1) index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2) - out_1 = layers.switch_case( + out_1 = paddle.static.nn.switch_case( branch_index=index_1, branch_fns={1: fn_1, 2: fn_2}, default=fn_3, ) - out_2 = layers.switch_case( + out_2 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)], default=fn_3, ) - out_3 = layers.switch_case( + out_3 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)], ) @@ -1759,17 +1763,17 @@ def fn_3(): shape=[1], dtype='int32', value=2 ) - out_1 = layers.switch_case( + out_1 = paddle.static.nn.switch_case( branch_index=index_1, branch_fns={1: fn_1, 2: fn_2}, default=fn_3, ) - out_2 = layers.switch_case( + out_2 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)], default=fn_3, ) - out_3 = layers.switch_case( + out_3 = paddle.static.nn.switch_case( branch_index=index_2, 
branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)], ) @@ -1781,17 +1785,17 @@ def fn_3(): index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1) index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2) - out_1 = layers.switch_case( + out_1 = paddle.static.nn.switch_case( branch_index=index_1, branch_fns={1: fn_1, 2: fn_2}, default=fn_3, ) - out_2 = layers.switch_case( + out_2 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)], default=fn_3, ) - out_3 = layers.switch_case( + out_3 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)], ) diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py deleted file mode 100644 index c523c31a24b44..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy - -from paddle.fluid import Program, core, program_guard -from paddle.fluid.executor import Executor -from paddle.fluid.layers import data -from paddle.fluid.layers.control_flow import lod_rank_table - - -class TestLoDRankTable(unittest.TestCase): - def test_lod_rank_table(self): - x = data(name='x', shape=[100]) - cpu = core.CPUPlace() - rank_table = lod_rank_table(x=x, level=1) - rank_table.persistable = True - exe = Executor(cpu) - scope = core.Scope() - - tensor = core.LoDTensor() - tensor.set(numpy.random.random(size=(17, 100)), cpu) - tensor.set_recursive_sequence_lengths( - [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]] - ) - exe.run(scope=scope, feed={'x': tensor}) - var = scope.find_var(rank_table.name) - table = var.get_lod_rank_table() - self.assertEqual([(0, 5), (1, 1), (2, 1)], list(table.items())) - - -class TestLoDRankTableError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - x = numpy.random.random((2, 4)).astype("float32") - - def test_Variable(): - rank_table = lod_rank_table(x=x, level=1) - - self.assertRaises(TypeError, test_Variable) - - def test_list_Variable(): - rank_table = lod_rank_table(x=[x], level=1) - - self.assertRaises(TypeError, test_list_Variable) - - x = data(name='x', shape=[10], dtype='float32', lod_level=1) - out = lod_rank_table(x=x, level=0) - out = lod_rank_table(x=[x], level=0) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index df07543fa7acf..b592e21656f73 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -101,7 +101,7 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): mod_two = paddle.remainder(id, two) == 0 if loss_in_switch: - avg_loss = layers.case( + avg_loss = paddle.static.nn.case( 
[(mod_two, lambda: fn_1(adam, None, prediction, label))], lambda: fn_2(sgd, None, prediction, label), ) @@ -112,7 +112,7 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): logits=prediction, label=label ) avg_loss_2 = paddle.mean(loss_2) - avg_loss = layers.case( + avg_loss = paddle.static.nn.case( [(mod_two, lambda: fn_1(adam, avg_loss_1))], lambda: fn_2(sgd, avg_loss_2), ) @@ -264,7 +264,7 @@ def fn_2(opt, avg_loss): cond = layers.fill_constant([1], 'bool', True) - layers.case( + paddle.static.nn.case( [(cond, lambda: fn_1(adam, avg_loss))], lambda: fn_2(sgd, avg_loss), ) diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 62d46d4cadc48..3e846bdba7685 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -46,7 +46,7 @@ def build_program(self, compile_program=True): until = layers.fill_constant([1], dtype='int64', value=10) data_arr = layers.array_write(hidden1, i) cond = paddle.less_than(x=counter, y=until) - while_op = fluid.layers.While(cond=cond) + while_op = paddle.static.nn.control_flow.While(cond=cond) with while_op.block(): hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu') layers.array_write(hidden_n, i, data_arr) diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index 056afc5ead833..e244195fed0f9 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -100,7 +100,7 @@ def loss2(pred, label): two = fluid.layers.fill_constant([1], 'int32', 2) pred = two == 0 - avg_loss = fluid.layers.case( + avg_loss = paddle.static.nn.case( [(pred, lambda: loss1(prediction, label))], lambda: loss2(prediction, label), ) @@ -132,7 +132,7 @@ def loss2(opt, pred, label, with_optimize): sgd = fluid.optimizer.SGD(learning_rate=0.1) two = fluid.layers.fill_constant([1], 'int32', 2) pred = two == 0 - avg_loss = fluid.layers.case( + avg_loss = paddle.static.nn.case( [(pred, lambda: loss1(sgd, prediction, label, with_optimize))], lambda: loss2(sgd, prediction, label, with_optimize), ) diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py deleted file mode 100644 index 2e480f7ca15fd..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
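# The hunks above replace the removed `fluid.layers.case` alias with the
# public static-graph API. A minimal sketch of the new call form, assuming
# paddle 2.x with static mode enabled (illustrative only, not part of the
# patch):
#
#     import paddle
#     paddle.enable_static()
#     x = paddle.full(shape=[1], fill_value=0.3, dtype='float32')
#     y = paddle.full(shape=[1], fill_value=0.1, dtype='float32')
#     out = paddle.static.nn.case(
#         pred_fn_pairs=[(x < y, lambda: x + y)],  # first true predicate wins
#         default=lambda: x - y,                   # taken when no predicate holds
#     )
#
# Only the import path changes; the op it builds is the same, which is why
# the migration in these hunks is mechanical.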
- -import functools -import unittest - -import numpy as np - -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid import Program, program_guard -from paddle.fluid.layers.control_flow import lod_rank_table - - -def convert_to_offset(lod): - offset = [[0] for i in lod] - for i, level in enumerate(lod): - for seq_len in level: - offset[i].append(offset[i][-1] + seq_len) - return offset - - -class TestReorderLoDTensor(unittest.TestCase): - num_seq = 5 - # [name, shape, lod_level] pair indicating data info of source and target - data_desc = (['input', [9], 0], ['ref', [5], 1]) - - @classmethod - def setUpClass(cls): - cls.set_program() - - @classmethod - def set_program(cls): - dat = fluid.layers.data( - name=cls.data_desc[0][0], shape=cls.data_desc[0][1] - ) - dat.stop_gradient = False - rank_dat = fluid.layers.data( - name=cls.data_desc[1][0], shape=cls.data_desc[1][1] - ) - table = lod_rank_table(rank_dat) - new_dat = fluid.layers.reorder_lod_tensor_by_rank( - x=dat, rank_table=table - ) - loss = paddle.sum(new_dat) - fluid.backward.append_backward(loss=loss) - cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD'] - - def run_program(self): - outputs = [] - input_grads = [] - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - for place in places: - self.set_inputs(place) - exe = fluid.Executor(place) - output, input_grad = exe.run( - fluid.default_main_program(), - feed=self.inputs, - fetch_list=self.fetch_list, - return_numpy=False, - ) - outputs.append(output) - input_grads.append(input_grad) - self.actual_outputs = outputs - self.actual_grads = input_grads - - def set_data(self): - self.data = {} - for desc in self.data_desc: - data_name = desc[0] - data_shape = desc[1] - data_lod_level = desc[2] - data_lod = [] - for i in range(data_lod_level): - lod_level_i = np.random.randint( - low=1, - high=5, - size=self.num_seq - if i == 0 - else sum(lod_level_i), # noqa: F821 - ).tolist() - data_lod.append(lod_level_i) - data_value = np.random.random( - size=[sum(data_lod[-1]) if data_lod else self.num_seq] - + data_shape - ).astype('float32') - self.data[data_name] = (data_value, data_lod) - - def set_inputs(self, place): - self.inputs = {} - for desc in self.data_desc: - tensor = fluid.Tensor() - tensor.set(self.data[desc[0]][0], place) - if self.data[desc[0]][1]: - tensor.set_recursive_sequence_lengths(self.data[desc[0]][1]) - self.inputs[desc[0]] = tensor - - def reorder(self): - level = 0 - # compute the rank_table according to ref_lod - ref_lod = self.data[self.data_desc[1][0]][1][level] - rank_table = [] # list of (index, length) - for i in range(len(ref_lod)): - rank_table.append((i, ref_lod[i])) - rank_table = sorted( - rank_table, key=functools.cmp_to_key(lambda x, y: y[1] - x[1]) - ) - - # compute the input sequence info according to input_lod - input_value, input_lod = self.data[self.data_desc[0][0]] - offset_lod = convert_to_offset(input_lod) - - input_table = [] # list of (offset, length, sub_lod) - if offset_lod: - for i in range(len(offset_lod[level]) - 1): - start_idx = i - end_idx = i + 1 - sub_lod = [] - for lod_level_i in offset_lod[level:]: - sub_lod_i = [] - for idx in range(start_idx, end_idx): - sub_lod_i.append( - lod_level_i[idx + 1] - lod_level_i[idx] - ) - sub_lod.append(sub_lod_i) - start_idx = lod_level_i[start_idx] - end_idx = lod_level_i[end_idx] - input_table.append((start_idx, end_idx - start_idx, sub_lod)) - else: - input_table = [(i, 1, []) for i in 
range(len(rank_table))] - - # reorder by rank_table - output_value = np.zeros_like(input_value) - output_lod = [] - offset = 0 - for index, length in rank_table: - input_seq_start = input_table[index][0] - input_seq_len = input_table[index][1] - input_seq_end = input_seq_start + input_seq_len - output_value[offset : offset + input_seq_len] = input_value[ - input_seq_start:input_seq_end - ] - offset += input_seq_len - - input_seq_sub_lod = input_table[index][2] - if len(output_lod) == 0: - output_lod = [[] for i in input_seq_sub_lod] - for i, level in enumerate(input_seq_sub_lod): - output_lod[i].extend(level) - return output_value, output_lod - - def test_reorder_lod_tensor(self): - self.data_desc[0][-1] = 2 # input is lod_tensor - self.set_data() - self.run_program() - # check output - expect_output, expect_output_lod = self.reorder() - for actual_output in self.actual_outputs: - np.testing.assert_allclose( - np.array(actual_output), expect_output, rtol=1e-05, atol=0.001 - ) - self.assertEqual( - expect_output_lod, actual_output.recursive_sequence_lengths() - ) - # check gradient - expect_grad = np.ones_like(self.data[self.data_desc[0][0]][0]) - expect_grad_lod = self.data[self.data_desc[0][0]][1] - for actual_grad in self.actual_grads: - np.testing.assert_allclose( - np.array(actual_grad), expect_grad, rtol=1e-05, atol=0.001 - ) - self.assertEqual( - expect_grad_lod, actual_grad.recursive_sequence_lengths() - ) - - def test_reorder_tensor(self): - self.data_desc[0][-1] = 0 # input is tensor - self.set_data() - self.run_program() - # check output - expect_output, expect_output_lod = self.reorder() - for actual_output in self.actual_outputs: - np.testing.assert_allclose( - np.array(actual_output), expect_output, rtol=1e-05, atol=0.001 - ) - self.assertEqual( - expect_output_lod, actual_output.recursive_sequence_lengths() - ) - # check gradient - expect_grad = np.ones_like(self.data[self.data_desc[0][0]][0]) - expect_grad_lod = self.data[self.data_desc[0][0]][1] - for actual_grad in self.actual_grads: - np.testing.assert_allclose( - np.array(actual_grad), expect_grad, rtol=1e-05, atol=0.001 - ) - self.assertEqual( - expect_grad_lod, actual_grad.recursive_sequence_lengths() - ) - - # compare outputs between LodTensors with explicit and implicit lod - # use the same data but set the input lod explicitly - input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])] - self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths( - input_lod - ) - # preserve the output of LodTensor with implicit lod to compare - expect_outputs = [ - np.array(actual_output) for actual_output in self.actual_outputs - ] - self.run_program() - for actual_output, expect_output in zip( - self.actual_outputs, expect_outputs - ): - np.testing.assert_allclose( - np.array(actual_output), expect_output, rtol=1e-05, atol=0.001 - ) - - -class TestReorderLoDTensorError(unittest.TestCase): - def test_errors(self): - with program_guard(Program()): - - def test_Variable(): - # The input must be Variable. 
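# The remaining hunks migrate the other control-flow aliases
# (`switch_case`, `while_loop`, `While`) to their `paddle.static.nn`
# homes in the same mechanical way. A minimal sketch of the target
# `while_loop` API, assuming paddle 2.x in static mode (illustrative
# only, not part of the patch):
#
#     import paddle
#     paddle.enable_static()
#     i = paddle.zeros(shape=[1], dtype='int64')
#     ten = paddle.full(shape=[1], fill_value=10, dtype='int64')
#     (i_out,) = paddle.static.nn.while_loop(
#         cond=lambda i: i < ten,   # must yield a bool Tensor of shape [1]
#         body=lambda i: [i + 1],   # must return values matching loop_vars
#         loop_vars=[i],
#     )
#
# Both `cond` and `body` receive the current loop_vars, and the op checks
# that `body` returns the same structure — the error-path tests updated
# below exercise exactly those checks through the new import path.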
- x1 = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float64") - table1 = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float64") - new_dat = fluid.layers.reorder_lod_tensor_by_rank( - x=x1, rank_table=table1 - ) - - self.assertRaises(TypeError, test_Variable) - - def test_type(): - x2 = fluid.layers.data(name='x1', shape=[4], dtype='float32') - table2 = fluid.layers.data( - name='table2', shape=[4], dtype='int32' - ) - new_dat2 = fluid.layers.reorder_lod_tensor_by_rank( - x=x2, rank_table=table2 - ) - - self.assertRaises(TypeError, test_type) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index ff2df8c1a8e99..664a7c11d50bf 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -156,7 +156,7 @@ def body(i, x): return i, x i = paddle.zeros(shape=(1,), dtype='int32') - i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) + i, x = paddle.static.nn.while_loop(cond, body, [i, x]) def _get_answer(self): self.data[0] = self.value diff --git a/python/paddle/fluid/tests/unittests/test_switch_case.py b/python/paddle/fluid/tests/unittests/test_switch_case.py index 1b6b460397c91..119b5ac285f73 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_case.py +++ b/python/paddle/fluid/tests/unittests/test_switch_case.py @@ -17,11 +17,14 @@ import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard +paddle.enable_static() + class TestAPISwitchCase(unittest.TestCase): def test_return_single_var(self): @@ -42,29 +45,29 @@ def fn_3(): index_5 = layers.fill_constant(shape=[1], dtype='int32', value=5) # call fn_1 - out_0 = layers.switch_case( + out_0 = paddle.static.nn.switch_case( branch_index=index_1, branch_fns={1: fn_1, 2: fn_2, 3: fn_3} ) # call fn_2 : branch_fns={0: fn_1, 1:fn_2, 2:fn_3} - out_1 = layers.switch_case( + out_1 = paddle.static.nn.switch_case( branch_index=index_1, branch_fns=(fn_1, fn_2, fn_3) ) # call default fn_3 - out_2 = layers.switch_case( + out_2 = paddle.static.nn.switch_case( branch_index=index_5, branch_fns=((1, fn_1), (2, fn_2)), default=fn_3, ) # no default, call fn_2 - out_3 = layers.switch_case( + out_3 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns=[(1, fn_1), (2, fn_2)] ) # no default, call fn_2 but branch_index is 5 - out_4 = layers.switch_case( + out_4 = paddle.static.nn.switch_case( branch_index=index_5, branch_fns=[(1, fn_1), (3, fn_2), (2, fn_3)], ) @@ -132,7 +135,9 @@ def fn_3(): with program_guard(main_program, startup_program): index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1) - out = layers.switch_case(index_1, ((1, fn_1), (2, fn_2)), fn_3) + out = paddle.static.nn.switch_case( + index_1, ((1, fn_1), (2, fn_2)), fn_3 + ) place = ( fluid.CUDAPlace(0) @@ -153,7 +158,7 @@ def fn_3(): class TestAPISwitchCase_Nested(unittest.TestCase): def test_nested_switch_case(self): def fn_1(x=1): - out = layers.switch_case( + out = paddle.static.nn.switch_case( branch_index=layers.fill_constant( shape=[1], dtype='int32', value=x ), @@ -169,7 +174,7 @@ def fn_1(x=1): return out def fn_2(x=2): - out = layers.switch_case( + out = paddle.static.nn.switch_case( branch_index=layers.fill_constant( shape=[1], dtype='int32', value=2 ), @@ -186,7 +191,7 @@ def fn_2(x=2): return out def fn_3(): - out = 
layers.switch_case( + out = paddle.static.nn.switch_case( branch_index=layers.fill_constant( shape=[1], dtype='int32', value=3 ), @@ -209,14 +214,14 @@ def fn_3(): index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2) index_3 = layers.fill_constant(shape=[1], dtype='int64', value=3) - out_1 = layers.switch_case( + out_1 = paddle.static.nn.switch_case( branch_index=index_1, branch_fns={1: fn_1, 2: fn_2, 3: fn_3} ) - out_2 = layers.switch_case( + out_2 = paddle.static.nn.switch_case( branch_index=index_2, branch_fns={1: fn_1, 2: fn_2, 3: fn_3} ) - out_3 = layers.switch_case( + out_3 = paddle.static.nn.switch_case( branch_index=index_3, branch_fns={1: fn_1, 2: fn_2, 3: fn_3} ) @@ -277,7 +282,7 @@ def fn_3(): # The type of 'branch_index' in Op(switch_case) must be Variable def type_error_branch_index(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=1, branch_fns=[(1, fn_1)], default=fn_3 ) @@ -285,7 +290,7 @@ def type_error_branch_index(): # The data type of 'branch_index' in Op(switch_case) must be int32, int64 or uint8 def dtype_error_branch_index(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_float32, branch_fns=[(1, fn_1)], default=fn_3, @@ -295,7 +300,7 @@ def dtype_error_branch_index(): # The type of 'branch_fns' in Op(switch_case) must be list, tuple or dict def type_error_branch_fns(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=1, default=fn_3 ) @@ -303,7 +308,7 @@ def type_error_branch_fns(): # The elements' type of 'branch_fns' in Op(switch_case) must be tuple def type_error_index_fn_pair_1(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=[1], default=fn_3 ) @@ -311,7 +316,7 @@ def type_error_index_fn_pair_1(): # The tuple's size of 'branch_fns' in Op(switch_case) must be 2 def type_error_index_fn_pair_2(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=[(1, 2, 3)], default=fn_3 ) @@ -319,7 +324,7 @@ def type_error_index_fn_pair_2(): # The key's type of 'branch_fns' in Op(switch_case) must be int def type_error_key(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=[(2.3, 2)], default=fn_3 ) @@ -327,7 +332,7 @@ def type_error_key(): # The key in 'branch_fns' must be unique def value_error_key(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=[(2, fn_1), (2, fn_2)], default=fn_3, @@ -337,7 +342,7 @@ def value_error_key(): # The type of function in 'branch_fns' must be callable def type_error_fn(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=[(1, 1), (2, fn_2)], default=fn_3, @@ -347,7 +352,7 @@ def type_error_fn(): # The default in Op(case) must be callable def type_error_default(): - layers.switch_case( + paddle.static.nn.switch_case( branch_index=key_int32, branch_fns=[(1, fn_1), (2, fn_2)], default=1, diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py index e662e1488c50e..21b36342feb05 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py @@ -21,6 +21,8 @@ import paddle.fluid.core as core from paddle.fluid import Program, program_guard +paddle.enable_static() + class TestTensorArrayToTensorError(unittest.TestCase): """Tensor_array_to_tensor error message enhance""" @@ -288,7 
+290,9 @@ def body(i, end, array): fluid.layers.array_write(prev, i, array) return i + 1, end, array - _, _, array = fluid.layers.while_loop(cond, body, [i, ten, array]) + _, _, array = paddle.static.nn.while_loop( + cond, body, [i, ten, array] + ) self.assertTrue(paddle.tensor.array_length(array), 10) last = fluid.layers.fill_constant(shape=[1], dtype='int64', value=9) diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index 8c94834c9a28b..3bee6eef63950 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -40,7 +40,7 @@ def body(i): i = layers.fill_constant(shape=[1], dtype='int64', value=0) one = layers.fill_constant(shape=[1], dtype='int64', value=1) ten = layers.fill_constant(shape=[1], dtype='int64', value=10) - out = layers.while_loop(cond, body, (i,)) + out = paddle.static.nn.while_loop(cond, body, (i,)) place = ( fluid.CUDAPlace(0) @@ -69,7 +69,7 @@ def body(i, mem): ten = layers.fill_constant(shape=[1], dtype='int64', value=10) mem = fluid.data(name='mem', shape=[10], dtype='float32') one = layers.fill_constant(shape=[10], dtype='float32', value=1) - out = layers.while_loop(cond, body, [i, mem]) + out = paddle.static.nn.while_loop(cond, body, [i, mem]) data = np.random.rand(10).astype('float32') data_one = np.ones(10).astype('float32') @@ -122,7 +122,13 @@ def body(i, ten, test_dict, test_list, test_list_dict): } ] - i, ten, test_dict, test_list, test_list_dict = layers.while_loop( + ( + i, + ten, + test_dict, + test_list, + test_list_dict, + ) = paddle.static.nn.while_loop( cond, body, [i, ten, test_dict, test_list, test_list_dict] ) place = ( @@ -171,7 +177,7 @@ def internal_body(j, init, sums): j = layers.increment(j) return [j, init, sums] - result = layers.while_loop( + result = paddle.static.nn.while_loop( internal_cond, internal_body, [j, init, sums] ) j = result[0] @@ -192,7 +198,7 @@ def internal_body(j, init, sums): loop_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) ones = layers.fill_constant(shape=[3, 3], dtype='float32', value=1) - out = layers.while_loop( + out = paddle.static.nn.while_loop( external_cond, external_body, [i, j, init, sums] ) @@ -236,7 +242,7 @@ def body(i, x): x = fluid.data(name='x', shape=[1], dtype='float32') x.stop_gradient = False - out = layers.while_loop(cond, body, [i, x]) + out = paddle.static.nn.while_loop(cond, body, [i, x]) mean = paddle.mean(out[1]) append_backward(mean) @@ -277,7 +283,7 @@ def body(i, x): x = fluid.data(name='x', shape=[1], dtype='float32') x.stop_gradient = False - out = layers.while_loop(cond, body, [i, x]) + out = paddle.static.nn.while_loop(cond, body, [i, x]) mean = paddle.mean(out[1]) append_backward(mean) @@ -328,7 +334,7 @@ def internal_body(j, x, mem_array): outer_sum_1 = paddle.add(x=x, y=outer_sum_0) i = layers.increment(x=i, in_place=True) layers.array_write(outer_sum_1, i=i, array=mem_array) - j, x, mem_array = layers.while_loop( + j, x, mem_array = paddle.static.nn.while_loop( internal_cond, internal_body, [j, x, mem_array] ) return [i, j, x, mem_array] @@ -357,7 +363,7 @@ def internal_body(j, x, mem_array): j.stop_gradient = True array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) - out = layers.while_loop( + out = paddle.static.nn.while_loop( external_cond, external_body, [i, j, x, mem_array] ) @@ -405,7 +411,7 @@ def fn_add_one(): data_add_one = paddle.add(x=i, y=one) return data_add_one - return 
layers.switch_case( + return paddle.static.nn.switch_case( branch_index=i, branch_fns={2: fn_add_three, 5: fn_square}, default=fn_add_one, @@ -418,7 +424,7 @@ def fn_add_one(): ten = layers.fill_constant(shape=[1], dtype='int64', value=10) three = layers.fill_constant(shape=[1], dtype='int64', value=3) one = layers.fill_constant(shape=[1], dtype='int64', value=1) - out = layers.while_loop(cond, body, [i]) + out = paddle.static.nn.while_loop(cond, body, [i]) place = ( fluid.CUDAPlace(0) @@ -488,13 +494,13 @@ def body_returns_with_mutable_list(i, test_list): # The type of `cond` in Op(while_loop) must be callable def type_error_cond(): - out = layers.while_loop(data, body, [data_1d]) + out = paddle.static.nn.while_loop(data, body, [data_1d]) self.assertRaises(TypeError, type_error_cond) # The type of `body` in Op(while_loop) must be callable def type_error_body(): - out = layers.while_loop( + out = paddle.static.nn.while_loop( cond_returns_bool_tensor, data, [data_1d] ) @@ -502,25 +508,31 @@ def type_error_body(): # The type of `loop_vars` in Op(while_loop) must be list or tuple def type_error_loop_vars(): - out = layers.while_loop(cond_returns_bool_tensor, body, data_1d) + out = paddle.static.nn.while_loop( + cond_returns_bool_tensor, body, data_1d + ) self.assertRaises(TypeError, type_error_loop_vars) # The value of `loop_vars` is empty def value_error_loop_vars(): - out = layers.while_loop(cond_returns_bool_tensor, body, []) + out = paddle.static.nn.while_loop( + cond_returns_bool_tensor, body, [] + ) self.assertRaises(ValueError, value_error_loop_vars) # The type of `cond` returns in Op(while_loop) must be Variable def type_error_cond_returns_not_variable(): - out = layers.while_loop(cond_returns_constant, body, [data_1d]) + out = paddle.static.nn.while_loop( + cond_returns_constant, body, [data_1d] + ) self.assertRaises(TypeError, type_error_cond_returns_not_variable) # The type of `cond` returns in Op(while_loop) must be a bollean variable def type_error_cond_returns_not_boolean(): - out = layers.while_loop( + out = paddle.static.nn.while_loop( cond_returns_not_bool_tensor, body, [data_1d] ) @@ -528,13 +540,15 @@ def type_error_cond_returns_not_boolean(): # The shape of `cond` returns in Op(while_loop) must be 1 def type_error_shape_cond_returns_2d(): - out = layers.while_loop(cond_returns_2d_tensor, body, [data_2d]) + out = paddle.static.nn.while_loop( + cond_returns_2d_tensor, body, [data_2d] + ) self.assertRaises(TypeError, type_error_shape_cond_returns_2d) # The length of `body` returns in Op(while_loop) must be same as `loop_vars` def value_error_body_returns_error_length(): - out = layers.while_loop( + out = paddle.static.nn.while_loop( cond_returns_bool_tensor, body_returns_error_length, [data] ) @@ -542,7 +556,7 @@ def value_error_body_returns_error_length(): # The type of `body` returns in Op(while_loop) must be same as `loop_vars` def value_error_body_returns_error_type(): - out = layers.while_loop( + out = paddle.static.nn.while_loop( cond_receives_two_args, body_returns_error_type, [data, ten] ) @@ -555,7 +569,7 @@ def value_error_body_returns_with_mutable_dict(): shape=[2, 2], dtype='int64', value=1 ) } - out = layers.while_loop( + out = paddle.static.nn.while_loop( cond_returns_with_mutable_dict, body_returns_with_mutable_dict, [data, test_dict], @@ -569,7 +583,7 @@ def value_error_body_returns_with_mutable_list(): test_list = [ layers.fill_constant(shape=[2, 2], dtype='int64', value=1) ] - out = layers.while_loop( + out = paddle.static.nn.while_loop( 
cond_returns_with_mutable_list, body_returns_with_mutable_list, [data, test_list], @@ -597,7 +611,7 @@ def body(z, i): z = fluid.layers.fill_constant([1], 'int32', 0) x_shape = paddle.shape(x) i = fluid.layers.fill_constant([1], 'int32', 0) - z, _ = fluid.layers.while_loop(cond, body, [z, i]) + z, _ = paddle.static.nn.while_loop(cond, body, [z, i]) place = ( fluid.CUDAPlace(0) diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index f77d9767f3c8b..06b3d2d8f2554 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -56,8 +56,8 @@ def simple_net(self): array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) array_len2.stop_gradient = True cond2 = paddle.less_than(x=j, y=array_len2) - while_op = layers.While(cond=cond) - while_op2 = layers.While(cond=cond2) + while_op = paddle.static.nn.control_flow.While(cond=cond) + while_op2 = paddle.static.nn.control_flow.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) @@ -122,10 +122,10 @@ def test_exceptions(self): array_len = layers.fill_constant(shape=[2], dtype='int64', value=1) cond = paddle.less_than(x=i, y=array_len) with self.assertRaises(TypeError): - layers.While(cond=cond) + paddle.static.nn.control_flow.While(cond=cond) cond = layers.cast(cond, dtype='float64') with self.assertRaises(TypeError): - layers.While(cond=cond) + paddle.static.nn.control_flow.While(cond=cond) class BadInputTest(unittest.TestCase): @@ -157,7 +157,7 @@ def body_func(i, ten, batch_info, origin_seq): i = layers.fill_constant(shape=[1], value=0, dtype='int32') num = layers.fill_constant(shape=[1], value=5, dtype='int32') - i, ten, shuffle_temp, y = layers.while_loop( + i, ten, shuffle_temp, y = paddle.static.nn.while_loop( cond, body_func, [i, num, temp, y] ) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py index 6de4b3f07b237..081074dcab8a2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py @@ -159,7 +159,7 @@ def test_without_kernel_op(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") with paddle.static.device_guard("cpu"): - while_op = fluid.layers.While(cond=cond) + while_op = paddle.static.nn.control_flow.While(cond=cond) with while_op.block(): i = paddle.increment(x=i, value=1) paddle.assign(paddle.less_than(x=i, y=loop_len), cond) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py index e52e8fdceb7ed..6b2d658067bc7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py @@ -55,8 +55,8 @@ def simple_net(self): array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3) array_len2.stop_gradient = True cond2 = paddle.less_than(x=j, y=array_len2) - while_op = layers.While(cond=cond) - while_op2 = layers.While(cond=cond2) + while_op = paddle.static.nn.control_flow.While(cond=cond) + while_op2 = paddle.static.nn.control_flow.While(cond=cond2) with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) @@ -121,10 +121,10 @@ def test_exceptions(self): array_len = 
layers.fill_constant(shape=[2], dtype='int64', value=1) cond = paddle.less_than(x=i, y=array_len) with self.assertRaises(TypeError): - layers.While(cond=cond) + paddle.static.nn.control_flow.While(cond=cond) cond = layers.cast(cond, dtype='float64') with self.assertRaises(TypeError): - layers.While(cond=cond) + paddle.static.nn.control_flow.While(cond=cond) if __name__ == '__main__': diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index cae4b52fe4c59..7d449c16be557 100755 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -21,10 +21,14 @@ from .common import conv3d # noqa: F401 from .common import conv2d_transpose # noqa: F401 from .common import conv3d_transpose # noqa: F401 +from .control_flow import ( + case, + while_loop, + switch_case, +) from .common import bilinear_tensor_product # noqa: F401 from .common import py_func # noqa: F401 from ...tensor.creation import create_parameter # noqa: F401 -from ...fluid.layers import case # noqa: F401 from ...fluid.layers import cond # noqa: F401 from ...fluid.layers import conv2d # noqa: F401 from ...fluid.layers import crf_decoding # noqa: F401 @@ -34,8 +38,6 @@ from .common import prelu # noqa: F401 from ...fluid.layers import row_conv # noqa: F401 from ...fluid.layers import spectral_norm # noqa: F401 -from ...fluid.layers import switch_case # noqa: F401 -from ...fluid.layers import while_loop # noqa: F401 from ...fluid.input import embedding # noqa: F401 from ...fluid.contrib.layers import sparse_embedding # noqa: F401 diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py new file mode 100644 index 0000000000000..9374f654b4c06 --- /dev/null +++ b/python/paddle/static/nn/control_flow.py @@ -0,0 +1,797 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from functools import partial, reduce + +import paddle +import paddle.fluid.core as core +from paddle.common_ops_import import ( + LayerHelper, + _non_static_mode, + check_type, + check_variable_and_dtype, + convert_dtype, +) +from paddle.fluid.framework import Operator, Program, Variable + +# Temporary solution, it will be deleted later +from paddle.fluid.layers.control_flow import cond +from paddle.fluid.layers.utils import ( + assert_same_structure, + copy_mutable_vars, + hold_mutable_vars, + is_sequence, + map_structure, +) + + +class BlockGuard: + """ + BlockGuard class. + + BlockGuard class is used to create a sub-block in a program by + using the Python `with` keyword. 
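+
+    A minimal usage sketch (illustrative only; assumes ``program`` is an
+    existing ``Program`` instance)::
+
+        with BlockGuard(program):
+            pass  # ops built here are appended to the new sub-block
+        # on exit, the program rolls back to the parent block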
+ """ + + def __init__(self, main_program): + if not isinstance(main_program, Program): + raise TypeError("BlockGuard takes a program") + self.main_program = main_program + + def __enter__(self): + self.main_program._create_block() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program._rollback() + if exc_type is not None: + return False # re-raise exception + return True + + +class WhileGuard(BlockGuard): + def __init__(self, while_op): + if not isinstance(while_op, While): + raise TypeError("WhileGuard takes a while op") + super().__init__(while_op.helper.main_program) + self.while_op = while_op + + def __enter__(self): + self.while_op.status = While.IN_WHILE_BLOCK + return super().__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + return False + self.while_op.status = While.AFTER_WHILE_BLOCK + self.while_op._complete() + return super().__exit__(exc_type, exc_val, exc_tb) + + +def get_inputs_outputs_in_block( + current_block, inner_inputs, inner_outputs, helper +): + """ + Find inputs and outputs in current control flow block. + :param current_block: Current control flow block. + :param inner_inputs: Input var name of ops in current block. + :param inner_outputs: Output var name of ops in current block. + :return: inner_inputs, inner_outputs + """ + + def is_ignore_vars(op, var_name): + # NOTE(dev): There are some persistable var created in some non-standard API + # such as "contrib.layers.shuffle_batch". It create a "Seed" used both in + # Input and Output. This var shall not be considered as a loop_var in + # control_flow. + IGNORE_VAR_NAMES = {"shuffle_batch": ["shuffle_batch_seed"]} + if op.type in IGNORE_VAR_NAMES: + var_names = IGNORE_VAR_NAMES[op.type] + for name in var_names: + if name in var_name: + return True + return False + + # Step1: update inner_inputs and inner_outputs + # NOTE: Here assumes that all variables are input or output of Ops, + # but some variables are created without appendding a real op. + # For example, in `arr = create_array(dtype)`, `arr` is not a output of a op. + for op in current_block.ops: + assert isinstance(op, Operator) + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in inner_outputs and not is_ignore_vars( + op, in_var_name + ): + inner_inputs.add(in_var_name) + + for oname in op.output_names: + for out_var_name in op.output(oname): + inner_outputs.add(out_var_name) + + # Step2: Remove LOD_TENSOR_ARRAY created in current control flow block. + remove_inner_inputs = set() + parent_block = helper.main_program.block(current_block.parent_idx) + + for in_var_name in inner_inputs: + parent_block_var = parent_block._find_var_recursive(in_var_name) + current_block_var = None + if current_block.has_var(in_var_name): + current_block_var = current_block.var(in_var_name) + if ( + not parent_block_var + and current_block_var + and current_block_var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): + remove_inner_inputs.add(in_var_name) + + inner_inputs = inner_inputs - remove_inner_inputs + + return inner_inputs, inner_outputs + + +class While: + """ + :api_attr: Static Graph + + while loop control flow. Repeat while body until cond is False. + + Note: + A new OP :ref:`api_fluid_layers_while_loop` is highly recommended instead of ``While`` if the shape of parameter ``cond`` is [1]. + OP :ref:`api_fluid_layers_while_loop` is easier to use and is called with less code but does the same thing as ``While`` . 
+
+    Notice:
+        Local variables created in ``While`` are similar to those created in a C++ while loop, and cannot be referenced externally.
+        As a result, they cannot be obtained through ``fetch_list`` of ``Executor``. If you would like to access the variable
+        out of ``while`` , PaddlePaddle provides the ``assign`` API to assign local variables to external variables. Please refer to example
+        code 2 or refer to `issue#22724 `_.
+
+    Args:
+        cond(Variable): A Tensor whose data type is bool controlling whether to continue looping.
+        is_test(bool, optional): A flag indicating whether execution is in test phase. Default value is False.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
+
+    Examples 1:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.enable_static()
+
+            i = paddle.full(shape=[1], dtype='int64', fill_value=0)           # loop counter
+
+            loop_len = paddle.full(shape=[1], dtype='int64', fill_value=10)  # loop length
+
+            cond = paddle.less_than(x=i, y=loop_len)
+            while_op = paddle.static.nn.control_flow.While(cond=cond)
+            with while_op.block():
+                i = paddle.increment(x=i, value=1)
+                paddle.assign(paddle.less_than(x=i, y=loop_len), output=cond)
+
+            exe = paddle.static.Executor(paddle.CPUPlace())
+            exe.run(paddle.static.default_startup_program())
+
+            res = exe.run(paddle.static.default_main_program(), feed={}, fetch_list=[i])
+            print(res)  # [array([10])]
+
+
+    Examples 2:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.enable_static()
+
+            i = paddle.full(shape=[1], dtype='int64', fill_value=0)
+            loop_len = paddle.full(shape=[1], dtype='int64', fill_value=10)
+            one = paddle.full(shape=[1], dtype='float32', fill_value=1)
+            data = paddle.static.data(name='data', shape=[1], dtype='float32')
+            sums = paddle.full(shape=[1], dtype='float32', fill_value=0)  # Define the variable to be obtained outside of While, whose name should be different from the variable inside the While to be obtained
+
+            cond = paddle.less_than(x=i, y=loop_len)
+            while_op = paddle.static.nn.control_flow.While(cond=cond)
+            with while_op.block():
+                sums_tensor = paddle.add(x=data, y=data)
+                paddle.assign(sums_tensor, sums)  # Update the value of sums_tensor defined in While to sums, which is defined outside of While, through paddle.assign
+                i = paddle.increment(x=i, value=1)
+                data = paddle.add(x=data, y=one)
+                paddle.assign(paddle.less_than(x=i, y=loop_len), output=cond)
+
+            feed_data = np.ones(1).astype('float32')
+            exe = paddle.static.Executor(paddle.CPUPlace())
+            exe.run(paddle.static.default_startup_program())
+            res = exe.run(paddle.static.default_main_program(), feed={'data': feed_data}, fetch_list=sums)
+            print(res[0])  # [2.]    # Because the data in While does not update the value outside the While, the value of sums is [2.] after the loop
+    """
+
+    BEFORE_WHILE_BLOCK = 0
+    IN_WHILE_BLOCK = 1
+    AFTER_WHILE_BLOCK = 2
+
+    def __init__(self, cond, is_test=False, name=None):
+        self.helper = LayerHelper("while", name=name)
+        self.status = While.BEFORE_WHILE_BLOCK
+        check_variable_and_dtype(cond, 'cond', ['bool'], 'static.nn.While')
+        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
+            raise TypeError(
+                "condition expected shape as [1], but given shape as {0}.".format(
+                    list(cond.shape)
+                )
+            )
+        self.cond_var = cond
+        self.is_test = is_test
+
+    def block(self):
+        return WhileGuard(self)
+
+    def _complete(self):
+        main_program = self.helper.main_program
+        while_block = main_program.current_block()
+        parent_block = main_program.block(
+            main_program.current_block().parent_idx
+        )
+
+        inner_outputs = {self.cond_var.name}
+        x_name_list = set()
+        x_name_list, inner_outputs = get_inputs_outputs_in_block(
+            while_block, x_name_list, inner_outputs, self.helper
+        )
+
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_vars.append(inner_var)
+
+        x_name_list |= set(map(lambda x: x.name, out_vars))
+        # NOTE(dev): cond_var has been contained in Input('Condition'), so
+        # we remove it from Input('X')
+        x_name_list -= {self.cond_var.name}
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES
+        )
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [
+                    parent_block._var_recursive(x_name)
+                    for x_name in x_name_list
+                ],
+                'Condition': [self.cond_var],
+            },
+            outputs={'Out': out_vars, 'StepScopes': [step_scope]},
+            attrs={'sub_block': while_block, "is_test": self.is_test},
+        )
+
+
+support_ret_buildin_type = (bool, float, int)
+
+
+def assign_skip_lod_tensor_array(input, output):
+    """
+    Assign input to output, but skip the process of copying LoDTensorArray unless it's created in while_block.
+    """
+
+    def has_shape_diff(x_var, y_var):
+        if len(x_var.shape) != len(y_var.shape):
+            return True
+        for x_dim, y_dim in zip(x_var.shape, y_var.shape):
+            if x_dim != y_dim and -1 not in [x_dim, y_dim]:
+                return True
+        return False
+
+    if not isinstance(input, (Variable, core.VarBase)):
+        if isinstance(output, Variable) and isinstance(
+            input, support_ret_buildin_type
+        ):
+            paddle.assign(input, output)
+        else:
+            output = input
+        return
+
+    if input.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        main_program = input.block.program
+        parent_block = main_program.block(
+            main_program.current_block().parent_idx
+        )
+        if parent_block and not parent_block._find_var_recursive(input.name):
+            paddle.assign(input, output)
+    else:
+        if (
+            isinstance(output, Variable)
+            and isinstance(input, Variable)
+            and has_shape_diff(input, output)
+        ):
+            warnings.warn(
+                "In dy2static mode, we attempt to assign a variable with shape {} into a variable with shape {}, which is not always right.".format(
+                    input.shape, output.shape
+                )
+            )
+        paddle.assign(input, output)
+
+
+def while_loop(cond, body, loop_vars, is_test=False, name=None):
+    """
+    :api_attr: Static Graph
+
+    while_loop is one of the control flow ops. It repeats ``body`` until ``cond`` returns False.
+
+    Notice:
+        Local variables defined in ``body`` cannot be obtained through ``fetch_list`` of ``Executor`` ; variables should
+        be defined outside ``body`` and placed in ``loop_vars`` for looping, so that these variables can be fetched by ``fetch_list`` .
+
+    Args:
+        cond(Callable): A callable returning a boolean tensor controlling whether to continue looping. And ``cond`` takes
+            as many arguments as ``loop_vars`` .
+        body(Callable): A callable returning a tuple or list of tensors or LoDTensorArrays of the same arity
+            (length and structure) and types as ``loop_vars`` . And ``body`` takes as many arguments as ``loop_vars`` .
+        loop_vars(list|tuple): A list or tuple of tensors or LoDTensorArrays that is passed to both ``cond`` and ``body`` .
+        is_test(bool, optional): A flag indicating whether execution is in test phase. Default value is False.
+        name(str, optional): Normally there is no need for users to set this property. For more information, please
+            refer to :ref:`api_guide_Name`. Default is None.
+
+    Returns:
+        A list or tuple of Tensors or LoDTensorArrays which are returned by ``body`` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            paddle.enable_static()
+
+            def cond(i, ten):
+                return i < ten
+
+            def body(i, ten):
+                i = i + 1
+                return [i, ten]
+
+            main_program = paddle.static.default_main_program()
+            startup_program = paddle.static.default_startup_program()
+            with paddle.static.program_guard(main_program, startup_program):
+                i = paddle.full(shape=[1], fill_value=0, dtype='int64')     # loop counter
+                ten = paddle.full(shape=[1], fill_value=10, dtype='int64')  # loop length
+                i, ten = paddle.static.nn.while_loop(cond, body, [i, ten])
+
+                exe = paddle.static.Executor(paddle.CPUPlace())
+                res = exe.run(main_program, feed={}, fetch_list=[i])
+                print(res)  # [array([10])]
+    """
+    helper = LayerHelper('while_loop', **locals())
+
+    if not callable(cond):
+        raise TypeError("cond in while_loop should be callable")
+    if not callable(body):
+        raise TypeError("body in while_loop should be callable")
+    check_type(loop_vars, 'loop_vars', (list, tuple), 'static.nn.while_loop')
+    if len(loop_vars) == 0:
+        raise ValueError("loop_vars in while_loop should not be empty")
+
+    pre_cond = cond(*loop_vars)
+    check_variable_and_dtype(
+        pre_cond, 'var of cond returned', ['bool'], 'static.nn.while_loop'
+    )
+    if reduce(lambda a, b: a * b, pre_cond.shape, 1) != 1:
+        raise TypeError(
+            "the shape of the variable returned by cond should be [1], "
+            "but given shape as {0}.".format(list(pre_cond.shape))
+        )
+
+    if _non_static_mode():
+        now_cond = pre_cond.numpy()[0]
+        while now_cond:
+            output_vars = body(*loop_vars)
+            if not isinstance(output_vars, (list, tuple)):
+                output_vars = [output_vars]
+            if len(output_vars) != len(loop_vars):
+                raise ValueError(
+                    "body in while_loop should return the same arity "
+                    "(length and structure) and types as loop_vars"
+                )
+            now_cond = cond(*output_vars).numpy()[0]
+            map_structure(assign_skip_lod_tensor_array, output_vars, loop_vars)
+        return loop_vars
+
+    while_loop_block = While(pre_cond, is_test, name)
+    has_mutable_vars_in_loop = hold_mutable_vars(loop_vars)
+    with while_loop_block.block():
+        # If a variable with mutable type is included in loop_vars, like `dict/list`,
+        # modifying it in the body function will cause the origin variable to be modified
+        # synchronously. This will raise an assignment error outside the while block.
+        # Here we make a copy of the mutable vars to avoid this problem.
+        if has_mutable_vars_in_loop:
+            new_loop_vars = copy_mutable_vars(loop_vars)
+            output_vars = body(*new_loop_vars)
+        else:
+            output_vars = body(*loop_vars)
+        if not isinstance(output_vars, (list, tuple)):
+            output_vars = [output_vars]
+        try:
+            loop_vars = _deal_with_undefined_var(output_vars, loop_vars)
+            assert_same_structure(output_vars, loop_vars, check_types=False)
+        except ValueError as e:
+            raise ValueError(
+                "body in while_loop should return the same arity "
+                "(length and structure) as loop_vars: {0}".format(e)
+            )
+        now_cond = cond(*output_vars)
+        map_structure(assign_skip_lod_tensor_array, output_vars, loop_vars)
+        paddle.assign(now_cond, pre_cond)
+    return loop_vars
+
+
+def _deal_with_undefined_var(output_vars, loop_vars):
+    """Deal with undefined var cases: we create undefined variables based on the results of body().
+    In Dy2Static, we use an undefined var to represent a var created inside control flow. This function
+    expands the loop_vars and replaces the original loop_vars.
+    1. UndefinedVar = Variable  # create a variable
+    2. UndefinedVar = None  # create an undefined var with RETURN_NO_VALUE_MAGIC_NUM
+    3. UndefinedVar = List(int)  # create a list of variables
+    4. UndefinedVar = value  # create a variable
+    """
+    from paddle.jit.dy2static.utils import (
+        UndefinedVar,
+        create_undefined_variable,
+    )
+
+    def create_var_like(o_var):
+        if (
+            isinstance(o_var, (Variable,) + support_ret_buildin_type)
+            or o_var is None
+        ):
+            return create_undefined_variable()
+        if is_sequence(o_var):
+            """
+            Create a complex container class inside the body of while, including Python lists and Python dicts.
+            """
+            return map_structure(lambda x: create_undefined_variable(), o_var)
+
+    if len(output_vars) != len(loop_vars):
+        raise ValueError(
+            "The length of output_vars should be the same as loop_vars."
+        )
+
+    results = []
+    for o_var, l_var in zip(output_vars, loop_vars):
+        if isinstance(l_var, UndefinedVar) or l_var is None:
+            results.append(create_var_like(o_var))
+        else:
+            results.append(l_var)
+    return results
+
+
+def _error_message(what, arg_name, op_name, right_value, error_value):
+    error_message = (
+        "{what} of '{arg_name}' in {op_name} must be "
+        "{right_value}, but received: {error_value}.".format(
+            what=what,
+            arg_name=arg_name,
+            op_name=op_name,
+            right_value=right_value,
+            error_value=error_value,
+        )
+    )
+
+    return error_message
+
+
+def case(pred_fn_pairs, default=None, name=None):
+    '''
+    :api_attr: Static Graph
+
+    This operator works like an if-elif-else chain.
+
+    Args:
+        pred_fn_pairs(list|tuple): A list or tuple of (pred, fn) pairs. ``pred`` is a boolean Tensor with shape [1], ``fn`` is a callable. All callables return the same structure of Tensors.
+        default(callable, optional): Callable that returns a structure of Tensors.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor|list(Tensor): Tensors returned by the callable from the first pair whose pred is True,
+        or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None,
+        or Tensors returned by the last callable in ``pred_fn_pairs`` if no pred in ``pred_fn_pairs`` is True and ``default`` is None.
+
+    Raises:
+        TypeError: If the type of ``pred_fn_pairs`` is not list or tuple.
+        TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple.
+        TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2.
+        TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not a Tensor.
+        TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable.
+        TypeError: If ``default`` is not None but it is not callable.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.enable_static()
+
+            def fn_1():
+                return paddle.full(shape=[1, 2], dtype='float32', fill_value=1)
+
+            def fn_2():
+                return paddle.full(shape=[2, 2], dtype='int32', fill_value=2)
+
+            def fn_3():
+                return paddle.full(shape=[3], dtype='int32', fill_value=3)
+
+            main_program = paddle.static.default_main_program()
+            startup_program = paddle.static.default_startup_program()
+
+            with paddle.static.program_guard(main_program, startup_program):
+                x = paddle.full(shape=[1], dtype='float32', fill_value=0.3)
+                y = paddle.full(shape=[1], dtype='float32', fill_value=0.1)
+                z = paddle.full(shape=[1], dtype='float32', fill_value=0.2)
+
+                pred_1 = paddle.less_than(z, x)  # true: 0.2 < 0.3
+                pred_2 = paddle.less_than(x, y)  # false: 0.3 < 0.1
+                pred_3 = paddle.equal(x, y)      # false: 0.3 == 0.1
+
+                # Call fn_1 because pred_1 is True
+                out_1 = paddle.static.nn.case(
+                    pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
+
+                # Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called
+                # because fn_3 is the last callable in pred_fn_pairs.
+                out_2 = paddle.static.nn.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
+
+                exe = paddle.static.Executor(paddle.CPUPlace())
+                res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2])
+                print(res_1)  # [[1. 1.]]
+                print(res_2)  # [3 3 3]
+    '''
+    helper = LayerHelper('case', **locals())
+
+    def _case_check_args(pred_fn_pairs, default):
+        '''
+        Check arguments pred_fn_pairs and default. Return canonical pred_fn_pairs and default.
+        '''
+        check_type(pred_fn_pairs, 'pred_fn_pairs', (list, tuple), 'case')
+
+        for pred_fn in pred_fn_pairs:
+            if not isinstance(pred_fn, tuple):
+                raise TypeError(
+                    _error_message(
+                        "The elements' type",
+                        "pred_fn_pairs",
+                        "case",
+                        tuple,
+                        type(pred_fn),
+                    )
+                )
+            if len(pred_fn) != 2:
+                raise TypeError(
+                    _error_message(
+                        "The tuple's size",
+                        "pred_fn_pairs",
+                        "case",
+                        "2",
+                        str(len(pred_fn)) + "-tuple",
+                    )
+                )
+            pred, fn = pred_fn
+
+            if not isinstance(pred, Variable):
+                raise TypeError(
+                    _error_message(
+                        "The pred's type",
+                        "pred_fn_pairs",
+                        "case",
+                        "boolean Variable",
+                        type(pred),
+                    )
+                )
+
+            if not callable(fn):
+                raise TypeError(
+                    "The fn for {} of pred_fn_pairs in Op(case) must"
+                    " be callable.".format(pred.name)
+                )
+
+        if default is None:
+            default_index = len(pred_fn_pairs) - 1  # pick the last one
+            default = pred_fn_pairs[default_index][1]
+            pred_fn_pairs = pred_fn_pairs[:default_index]
+        elif not callable(default):
+            raise TypeError("The default in Op(case) must be callable.")
+
+        return pred_fn_pairs, default
+
+    pred_fn_pairs, default = _case_check_args(pred_fn_pairs, default)
+
+    false_fn = default
+    for pred, true_fn in reversed(pred_fn_pairs):
+        false_fn = partial(cond, pred=pred, true_fn=true_fn, false_fn=false_fn)
+
+    final_fn = false_fn
+
+    return final_fn()
+
+
+def switch_case(branch_index, branch_fns, default=None, name=None):
+    '''
+    :api_attr: Static Graph
+
+    This operator is like a C++ switch/case statement.
+
+    Args:
+        branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
+        branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of the callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors.
+        default(callable, optional): Callable that returns a structure of Tensors.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor|list(Tensor): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
+        or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``,
+        or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``.
+
+    Raises:
+        TypeError: If the type of ``branch_index`` is not Tensor.
+        TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``.
+        TypeError: If the type of ``branch_fns`` is not dict, list or tuple.
+        TypeError: If the elements of ``branch_fns`` are not 2-tuples.
+        TypeError: If the first element of a 2-tuple in ``branch_fns`` is not an integer.
+        ValueError: If the first element of a 2-tuple in ``branch_fns`` is not unique.
+        TypeError: If the second element of a 2-tuple in ``branch_fns`` is not callable.
+        TypeError: If ``default`` is not None but it is not callable.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.enable_static()
+
+            def fn_1():
+                return paddle.full(shape=[1, 2], dtype='float32', fill_value=1)
+
+            def fn_2():
+                return paddle.full(shape=[2, 2], dtype='int32', fill_value=2)
+
+            def fn_3():
+                return paddle.full(shape=[3], dtype='int32', fill_value=3)
+
+            main_program = paddle.static.default_main_program()
+            startup_program = paddle.static.default_startup_program()
+            with paddle.static.program_guard(main_program, startup_program):
+                index_1 = paddle.full(shape=[1], dtype='int32', fill_value=1)
+                index_2 = paddle.full(shape=[1], dtype='int32', fill_value=2)
+
+                out_1 = paddle.static.nn.switch_case(
+                    branch_index=index_1,
+                    branch_fns={1: fn_1, 2: fn_2},
+                    default=fn_3)
+
+                out_2 = paddle.static.nn.switch_case(
+                    branch_index=index_2,
+                    branch_fns=[(1, fn_1), (2, fn_2)],
+                    default=fn_3)
+
+                # Argument default is None and no index matches. fn_3 will be called because of the max index 7.
+                out_3 = paddle.static.nn.switch_case(
+                    branch_index=index_2,
+                    branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
+
+                exe = paddle.static.Executor(paddle.CPUPlace())
+                res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3])
+                print(res_1)  # [[1.
1.]] + print(res_2) # [[2 2] [2 2]] + print(res_3) # [3 3 3] + ''' + helper = LayerHelper('switch_case', **locals()) + + def _check_args(branch_index, branch_fns, default): + + check_variable_and_dtype( + branch_index, + 'branch_index', + ['uint8', 'int32', 'int64'], + 'static.nn.switch_case', + ) + + if convert_dtype(branch_index.dtype) != "int64": + branch_index = paddle.cast(branch_index, "int64") + + check_type(branch_fns, 'branch_fns', (list, tuple, dict), 'switch_case') + + branch_fns = ( + branch_fns.items() if isinstance(branch_fns, dict) else branch_fns + ) + + branch_fns = ( + list(enumerate(branch_fns)) + if all(callable(fn) for fn in branch_fns) + else branch_fns + ) + + keys_of_fns = [] + for index_fn_pair in branch_fns: + if not isinstance(index_fn_pair, tuple): + raise TypeError( + _error_message( + "The elements' type", + "branch_fns", + "switch_case", + tuple, + type(branch_fns), + ) + ) + + if len(index_fn_pair) != 2: + raise TypeError( + _error_message( + "The tuple's size", + "branch_fns", + "switch_case", + "2", + str(len(index_fn_pair)) + "-tuple", + ) + ) + + key, fn = index_fn_pair + + if not isinstance(key, int): + raise TypeError( + _error_message( + "The key's type", + "branch_fns", + "switch_case", + int, + type(key), + ) + ) + + if key in keys_of_fns: + raise ValueError( + "The key in 'branch_fns' must be unique, but '{}' appears more than once.".format( + key + ) + ) + else: + keys_of_fns.append(key) + + if not callable(fn): + raise TypeError( + _error_message( + "The type of function for key {}".format(key), + "branch_fns", + "switch_case", + "callable", + type(fn), + ) + ) + + if default is None: + default = sorted(branch_fns)[-1][1] + branch_fns = sorted(branch_fns)[:-1] + elif not callable(default): + raise TypeError("The default in Op(case) must be callable.") + + pred_fn_pairs = [] + for index, fn in branch_fns: + new_index = paddle.full(shape=[1], dtype="int64", fill_value=index) + pred = paddle.equal(branch_index, new_index) + pred_fn_pairs.append((pred, fn)) + + return pred_fn_pairs, default + + pred_fn_pairs, default = _check_args(branch_index, branch_fns, default) + false_fn = default + for pred, true_fn in pred_fn_pairs: + false_fn = partial(cond, pred=pred, true_fn=true_fn, false_fn=false_fn) + + final_fn = false_fn + return final_fn() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index b2cda25fc9341..bd372cbabd552 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -212,7 +212,6 @@ 'check_reduce_rank_test', 'test_progressbar', 'test_seed_op', - 'test_shrink_rnn_memory', 'test_fc_bf16_mkldnn_op', 'test_sequence_first_step', 'test_fusion_lstm_mkldnn_op', @@ -273,7 +272,6 @@ 'test_fleet_graph_executor', 'decorator_test', 'test_collective_base', - 'test_lod_rank_table', 'test_multi_gru_mkldnn_op', 'test_eager_deletion_conditional_block', 'op_proto_maker_test', @@ -868,7 +866,6 @@ 'test_imperative_load_static_param', 'test_imperative_qat_user_defined', 'test_anchor_generator_op', - 'test_if_else_op', 'test_prepare_op', 'test_conj_op', 'test_imperative_hook_for_layer', @@ -1099,7 +1096,6 @@ 'test_sequence_mask', 'test_fill_op', 'test_imperative_deepcf', - 'test_reorder_lod_tensor', 'test_multiply', 'test_partial_program', 'test_fetch_feed', @@ -1264,7 +1260,6 @@ 'test_imperative_static_runner_mnist', 'test_nearest_interp_op', 'test_diag_embed', - 'test_imperative_basic', 'test_merge_selectedrows_op', 'test_feed_data_check_shape_type', 'test_complex_trace_layer', @@ -1740,7 +1735,6 @@ 
'test_simplify_with_basic_ops_pass', 'test_similarity_focus_op', 'test_shuffle_batch_op', - 'test_shrink_rnn_memory', 'test_set_bool_attr', 'test_sequence_topk_avg_pooling', 'test_sequence_scatter_op', @@ -1846,7 +1840,6 @@ 'test_logger', 'test_lod_tensor_array_ops', 'test_lod_tensor_array', - 'test_lod_rank_table', 'test_locality_aware_nms_op', 'test_load_vars_shape_check', 'test_load_op_xpu', @@ -2373,7 +2366,6 @@ 'test_trt_conv3d_op', 'test_parallel_executor_drop_scope', 'test_tensorrt_engine', - 'test_ir_memory_optimize_ifelse_op', 'test_parallel_executor_mnist', 'test_load_state_dict_from_old_format', 'test_fuse_elewise_add_act_pass', @@ -2594,7 +2586,6 @@ 'test_imperative_hook_for_layer', 'test_complex_sum_layer', 'test_complex_cast', - 'test_reorder_lod_tensor', 'test_complex_kron', 'test_complex_trace_layer', 'test_merge_selectedrows_op', @@ -2851,7 +2842,6 @@ 'test_imperative_data_parallel', 'test_norm_nn_grad', 'test_im2sequence_op', - 'test_if_else_op', 'test_one_hot_v2_op', 'test_grid_sampler_op', 'test_pad_op', @@ -3068,7 +3058,6 @@ 'test_broadcast_tensors_op', 'test_pad3d_op', 'test_cumprod_op', - 'test_imperative_basic', 'trt_fc_prelu_test', 'test_sigmoid_focal_loss', 'test_pixel_shuffle', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index ed5fb31009a95..84ed3a253ca92 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -263,7 +263,6 @@ 'test_huber_loss_op', 'test_im2sequence_op', 'test_image_classification_layer', - 'test_imperative_basic', 'test_imperative_deepcf', 'test_imperative_framework', 'test_imperative_gan', @@ -293,7 +292,6 @@ 'test_inverse_op', 'test_io_save_load', 'test_iou_similarity_op', - 'test_ir_memory_optimize_ifelse_op', 'test_ir_memory_optimize_pass', 'test_is_empty_op', 'test_isfinite_op', @@ -315,7 +313,6 @@ 'test_load_vars_shape_check', 'test_locality_aware_nms_op', 'test_lod_array_length_op', - 'test_lod_rank_table', 'test_lod_tensor_array_ops', 'test_log_loss_op', 'test_log_softmax', @@ -440,7 +437,6 @@ 'test_registry', 'test_regularizer', 'test_regularizer_api', - 'test_reorder_lod_tensor', 'test_reshape_op', 'test_reshape_bf16_op', 'test_retinanet_detection_output', @@ -472,7 +468,6 @@ 'test_sgd_op', 'test_shape_op', 'test_shard_index_op', - 'test_shrink_rnn_memory', 'test_shuffle_batch_op', 'test_shuffle_channel_op', 'test_sigmoid_cross_entropy_with_logits_op', From 736888945a47ace0b304b0a0bc7df96adf83acab Mon Sep 17 00:00:00 2001 From: Netpunk <69072522+Patrick-Star125@users.noreply.github.com> Date: Thu, 8 Dec 2022 10:52:26 +0800 Subject: [PATCH 47/60] remove gpu_info.h from phi dependencies (#48811) --- paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 1 - paddle/phi/kernels/funcs/blas/blas_impl.hip.h | 1 - paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu | 1 - paddle/phi/kernels/gpu/affine_grid_kernel.cu | 1 - paddle/phi/kernels/gpu/one_hot_kernel.cu | 1 - paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 1 - paddle/phi/kernels/gpu/pad3d_kernel.cu | 1 - paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu | 1 - paddle/phi/kernels/gpudnn/affine_grid_kernel.cu | 1 - paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 1 - 10 files changed, 10 deletions(-) diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 0486e7730a96c..a27c7f013feef 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include 
"paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index cbde4fdbc819b..37343111d53dc 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/backends/dynload/rocblas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index 886aaa76e41ec..334df8cc873c4 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -16,7 +16,6 @@ #include "paddle/phi/kernels/affine_grid_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu index 8274e687512ac..90b22ed9cc942 100644 --- a/paddle/phi/kernels/gpu/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_kernel.cu @@ -16,7 +16,6 @@ #include "paddle/phi/kernels/affine_grid_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu index d055e5ad73ee9..af9872d9c7076 100644 --- a/paddle/phi/kernels/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -14,7 +14,6 @@ #include "paddle/phi/kernels/one_hot_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index ca26d9be4f908..a87bc5f95b550 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -14,7 +14,6 @@ #include "paddle/phi/kernels/pad3d_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index 241ffefe5d18d..d0cd5a23833ca 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -16,7 +16,6 @@ #include -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/complex.h" diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 6bcfd328aacac..45c72ee31d526 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -15,7 +15,6 @@ #ifndef PADDLE_WITH_HIP #include 
"paddle/phi/kernels/affine_grid_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" diff --git a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu index 2f1c4de3716e2..2a4eea79e4d2d 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu @@ -15,7 +15,6 @@ #ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/affine_grid_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index cb03efc42bccb..ac4a60384af19 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" From 1adf54304447e35b30d742f4295bfab896b8d45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=AD=E4=B8=AA=E9=AA=A8=E5=A4=B4?= <46243324+zrr1999@users.noreply.github.com> Date: Thu, 8 Dec 2022 11:05:47 +0800 Subject: [PATCH 48/60] [Paddle Inference] Add add onehot trt converter (#48655) * add onehot trt converter * add unitest * fix bug * opt code * fix bug * fix depth_tensor * fix unitest * fix bug * fix unitest * fix bug * fix bug * fix bug * fix bug --- .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/one_hot_op.cc | 92 ++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 43 +++++ .../ir/inference/test_trt_convert_one_hot.py | 168 ++++++++++++++++++ 5 files changed, 306 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/one_hot_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 13dba59492b55..45661dc3897fe 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2299,6 +2299,8 @@ USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); USE_TRT_CONVERTER(shuffle_channel); USE_TRT_CONVERTER(where); +USE_TRT_CONVERTER(one_hot); +USE_TRT_CONVERTER(one_hot_v2); USE_TRT_CONVERTER(swish); USE_TRT_CONVERTER(silu); USE_TRT_CONVERTER(group_norm); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 2598b4c2ae0f0..f0319b720cd75 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -27,6 +27,7 @@ list( shuffle_channel_op.cc fill_any_like_op.cc where_op.cc + one_hot_op.cc swish_op.cc silu_op.cc instance_norm_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc b/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc new file mode 100644 index 0000000000000..f1ea2fcc482a1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc 
@@ -0,0 +1,92 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * OneHot Op + */ +class OneHotOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { +#if IS_TRT_VERSION_GE(8510) + VLOG(3) << "convert a fluid one_hot op to tensorrt one_hot layer"; + framework::OpDesc op_desc(op, nullptr); + + const auto indices_tensor = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::ITensor* values_tensor; + nvinfer1::ITensor* depth_tensor; + const int dtype = PADDLE_GET_CONST(int, op_desc.GetAttr("dtype")); + if (dtype == 2 || dtype == 3) { // int, int64 + const std::vector values_data = {0, 1}; + values_tensor = Add1DConstantLayer(values_data, "values_tensor"); + if (dtype == 3) { // int64 + VLOG(3) << "trt not support int64, so it is converted to int32."; + } + } else if (dtype == 5 || dtype == 6) { // float + const std::vector values_data = {0.0f, 1.0f}; + values_tensor = Add1DConstantLayer(values_data, "values_tensor"); + if (dtype == 6) { // int64 + VLOG(3) << "trt not support float64, so it is converted to float32."; + } + } + + auto depth_name = op_desc.Input("depth_tensor"); + if (depth_name.size() == 0) { + const int depth = PADDLE_GET_CONST(int, op_desc.GetAttr("depth")); + depth_tensor = Add1DConstantLayer(depth, "depth_tensor", true); + } else { + nvinfer1::Dims depth_dims; + depth_dims.nbDims = 0; + nvinfer1::ITensor* depth_tensor_paddle = + engine_->GetITensor(depth_name.front()); + auto shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *depth_tensor_paddle); + shuffle_layer->setReshapeDimensions(depth_dims); + shuffle_layer->getOutput(0)->setName(depth_tensor_paddle->getName()); + depth_tensor = shuffle_layer->getOutput(0); + } + auto layer = TRT_ENGINE_ADD_LAYER( + engine_, OneHot, *indices_tensor, *values_tensor, *depth_tensor, -1); + + auto output_name = op_desc.Output("Out").front(); + RreplenishLayerAndOutput(layer, "one_hot", {output_name}, test_mode); +#else + VLOG(3) << "one_hot is not supported when TensorRT < 8.5.1"; +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(one_hot, OneHotOpConverter); +REGISTER_TRT_OP_CONVERTER(one_hot_v2, OneHotOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index fbe3615f4d9d6..4367927bb1734 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1783,6 +1783,45 @@ struct SimpleOpTypeSetTeller : public Teller { } } + if (op_type == "one_hot" || op_type == "one_hot_v2") { +#if 
IS_TRT_VERSION_LT(8510) + VLOG(3) << "one_hot/one_hot_v2 is not supported when TensorRT < 8.5.1"; + return false; +#endif + if (!with_dynamic_shape) { + VLOG(3) + << "the one_hot/one_hot_v2 op does not support static shape yet"; + return false; + } + if (desc.HasAttr("allow_out_of_range")) { + VLOG(3) + << "allow_out_of_range one_hot/one_hot_v2 op is not supported now."; + if (PADDLE_GET_CONST(bool, desc.GetAttr("allow_out_of_range"))) + return false; + } + if (desc.HasAttr("dtype")) { + const int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype")); + if (dtype != 2 && dtype != 3 && dtype != 5) { + VLOG(3) << "one_hot/one_hot_v2 op only support int32, int64, float."; + return false; + } + } + auto one_hot_inputs = desc.Inputs(); + if (one_hot_inputs.find("depth_tensor") != one_hot_inputs.end()) { + if (desc.Input("depth_tensor").size() != 0) { + return true; + } + } + + if (desc.HasAttr("depth")) { + const int depth = PADDLE_GET_CONST(int, desc.GetAttr("depth")); + if (depth <= 0) { + VLOG(3) << "depth only support positive in one_hot/one_hot_v2 op."; + return false; + } + } + } + if (op_type == "skip_layernorm") { if (!with_dynamic_shape) { VLOG(3) << "the skip_layernorm does not support static shape yet"; @@ -2447,6 +2486,8 @@ struct SimpleOpTypeSetTeller : public Teller { "fc", "shuffle_channel", "where", + "one_hot", + "one_hot_v2", "swish", "silu", "celu", @@ -2588,6 +2629,8 @@ struct SimpleOpTypeSetTeller : public Teller { "fc", "shuffle_channel", "where", + "one_hot", + "one_hot_v2", "swish", "silu", "celu", diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py new file mode 100644 index 0000000000000..60e654bb95e5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
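+
+# NOTE: this suite sweeps 2D/3D/4D int32 index tensors through the one_hot
+# converter; per generate_trt_nodes_num below, static shape is expected to
+# fall back to paddle ops (0 TRT nodes) while dynamic shape should yield a
+# single TRT one_hot layer.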
+ +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertOneHotTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8510: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_indices(dims, batch): + if dims == 2: + return np.random.randint(0, 10, (batch, 4), dtype=np.int32) + elif dims == 3: + return np.random.randint(0, 10, (batch, 4, 6), dtype=np.int32) + else: + return np.random.randint( + 0, 10, (batch, 4, 6, 8), dtype=np.int32 + ) + + def generate_depth(dims, batch): + return np.ones((1,), dtype=np.int32) * 10 + + for dims in [2, 3, 4]: + for batch in [1, 2]: + self.dims = dims + dics = [{"dtype": 5, "depth": 10}, {}] + ops_config = [ + { + "op_type": "one_hot", + "op_inputs": { + "X": ["input_x_data"], + "depth_tensor": ["input_depth_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": {"output_data": np.int}, + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "depth_tensor": TensorConfig( + data_gen=partial(generate_depth, dims, batch) + ), + }, + inputs={ + "indices_tensor": TensorConfig( + data_gen=partial(generate_indices, dims, batch) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = { + "input_x_data": [1], + } + self.dynamic_shape.max_input_shape = { + "input_x_data": [2], + } + self.dynamic_shape.opt_input_shape = { + "input_x_data": [1], + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = { + "input_x_data": [1, 4], + } + self.dynamic_shape.max_input_shape = { + "input_x_data": [2, 4], + } + self.dynamic_shape.opt_input_shape = { + "input_x_data": [1, 4], + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "input_x_data": [1, 4, 6], + } + self.dynamic_shape.max_input_shape = { + "input_x_data": [2, 4, 6], + } + self.dynamic_shape.opt_input_shape = { + "input_x_data": [1, 4, 6], + } + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_x_data": [1, 4, 6, 8], + } + self.dynamic_shape.max_input_shape = { + "input_x_data": [2, 4, 6, 8], + } + self.dynamic_shape.opt_input_shape = { + "input_x_data": [1, 4, 6, 8], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape: + return 0, 3 + return 1, 2 + + attrs = [op.attrs for op in program_config.ops] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = 
paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From de2c5fd60cc029d5d9c3d3340d21d4d2e296332e Mon Sep 17 00:00:00 2001 From: Netpunk <69072522+Patrick-Star125@users.noreply.github.com> Date: Thu, 8 Dec 2022 11:07:47 +0800 Subject: [PATCH 49/60] [PHI decoupling] remove bbox_util.h from phi dependencies (#48761) * remove bbox_util.h from phi * add file bbox_util.h * reframe bbox_util.h --- .../phi/kernels/funcs/detection/bbox_util.h | 28 +++++++++++++++++++ .../gpu/distribute_fpn_proposals_kernel.cu | 17 +++++++++-- .../kernels/gpu/generate_proposals_kernel.cu | 10 ++----- 3 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 paddle/phi/kernels/funcs/detection/bbox_util.h diff --git a/paddle/phi/kernels/funcs/detection/bbox_util.h b/paddle/phi/kernels/funcs/detection/bbox_util.h new file mode 100644 index 0000000000000..4acaa4406bce3 --- /dev/null +++ b/paddle/phi/kernels/funcs/detection/bbox_util.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+namespace phi {
+namespace funcs {
+
+struct RangeInitFunctor {
+  int start_;
+  int delta_;
+  int *out_;
+  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
index bcce09649a8fc..3d50a75ae22fd 100644
--- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
+++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
@@ -24,6 +24,7 @@ namespace cub = hipcub;
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
 #include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
@@ -31,7 +32,6 @@ namespace cub = hipcub;
 
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 
 namespace phi {
@@ -62,7 +62,18 @@ __global__ void GPUDistFpnProposalsHelper(const int nthreads,
     const T* offset_roi = rois + i * BBoxSize;
     int roi_batch_ind = roi_batch_id_data[i];
     // get the target level of current rois
-    T roi_area = paddle::operators::RoIArea(offset_roi, pixel_offset);
+    T roi_area;
+    if (offset_roi[2] < offset_roi[0] || offset_roi[3] < offset_roi[1]) {
+      roi_area = static_cast<T>(0.);
+    } else {
+      const T w = offset_roi[2] - offset_roi[0];
+      const T h = offset_roi[3] - offset_roi[1];
+      if (pixel_offset) {
+        roi_area = (w + 1) * (h + 1);
+      } else {
+        roi_area = w * h;
+      }
+    }
     T roi_scale = sqrt(roi_area);
     int tgt_lvl = floor(
         log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-8) + refer_level);
@@ -155,7 +166,7 @@ void DistributeFpnProposalsKernel(
   index_in_t.Resize({roi_num});
   int* idx_in = dev_ctx.template Alloc<int>(&index_in_t);
   funcs::ForRange<phi::GPUContext> for_range(dev_ctx, roi_num);
-  for_range(paddle::operators::RangeInitFunctor{0, 1, idx_in});
+  for_range(funcs::RangeInitFunctor{0, 1, idx_in});
 
   DenseTensor keys_out_t;
   keys_out_t.Resize({roi_num});
diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
index f750bd5fe7eb9..622ef8100af30 100644
--- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
+++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
@@ -26,6 +26,7 @@ namespace cub = hipcub;
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -38,13 +39,6 @@ int const kThreadsPerBlock = sizeof(uint64_t) * 8;
 
 static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
 
-struct RangeInitFunctor {
-  int start_;
-  int delta_;
-  int *out_;
-  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
-};
-
 template <typename T>
 static void SortDescending(const phi::GPUContext &ctx,
                            const DenseTensor &value,
@@ -55,7 +49,7 @@ static void SortDescending(const phi::GPUContext &ctx,
   index_in_t.Resize(phi::make_ddim({num}));
   int *idx_in = ctx.template Alloc<int>(&index_in_t);
   phi::funcs::ForRange<phi::GPUContext> for_range(ctx, num);
-  for_range(RangeInitFunctor{0, 1, idx_in});
+  for_range(funcs::RangeInitFunctor{0, 1, idx_in});
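+  // idx_in now holds the sequence 0, 1, ..., num - 1; the descending sort
+  // below reorders it into the index permutation of the sorted scores.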
 index_out->Resize(phi::make_ddim({num}));
 int *idx_out = ctx.template Alloc<int>(index_out);

From b91bbd320fccb49576937d26d0639dc8e0f94583 Mon Sep 17 00:00:00 2001
From: 201716010711 <87008376+201716010711@users.noreply.github.com>
Date: Thu, 8 Dec 2022 11:13:19 +0800
Subject: [PATCH 50/60] Optimize Paddle diagonal (#47904)

---
 paddle/phi/kernels/cpu/diagonal_kernel.cc     | 67 ++++++++------
 paddle/phi/kernels/funcs/diagonal.h           | 88 +++++++++----------
 .../phi/kernels/gpu/diagonal_grad_kernel.cu   | 12 +++
 paddle/phi/kernels/gpu/diagonal_kernel.cu     | 11 ++-
 .../fluid/tests/unittests/test_diagonal_op.py | 29 ++++++
 5 files changed, 133 insertions(+), 74 deletions(-)

diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc
index f125802c19e24..d2361bee30a5f 100644
--- a/paddle/phi/kernels/cpu/diagonal_kernel.cc
+++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc
@@ -35,6 +35,7 @@ void DiagonalKernel(const Context& dev_ctx,
   auto* output = out;
   T* output_data = dev_ctx.template Alloc<T>(output);
   auto output_dim = vectorize(output->dims());
+  auto output_dim_size = output_dim.size();
 
   const int64_t offset_ = offset;
   int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
@@ -43,40 +44,48 @@ void DiagonalKernel(const Context& dev_ctx,
   std::vector<int64_t> input_stride = funcs::ComputeDimStride(input_dim);
   std::vector<int64_t> output_stride = funcs::ComputeDimStride(output_dim);
 
-  int64_t numel = input->numel();
-
-  for (int64_t idx = 0; idx < numel; idx++) {
-    std::vector<int64_t> idx_dim(input_dim_size);
+  int64_t out_numel = out->numel();
+  for (int64_t idx = 0; idx < out_numel; idx++) {
+    std::vector<int64_t> idx_dim(output_dim_size);
     int64_t temp = 0;
-    for (size_t i = 0; i < input_dim_size; i++) {
-      idx_dim[i] = (idx - temp) / input_stride[i];
-      temp = temp + idx_dim[i] * input_stride[i];
+    for (size_t i = 0; i < output_dim_size; i++) {
+      idx_dim[i] = (idx - temp) / output_stride[i];
+      temp = temp + idx_dim[i] * output_stride[i];
     }
-
-    int64_t axis1_dim = idx_dim[axis1_];
-    int64_t axis2_dim = idx_dim[axis2_];
-
-    idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));
-    idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));
-
-    bool flag = false;
-    if (offset_ == 0 && axis1_dim == axis2_dim) {
-      idx_dim.push_back(axis1_dim);
-      flag = true;
-    } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
-      idx_dim.push_back(axis1_dim);
-      flag = true;
-    } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
-      idx_dim.push_back(axis2_dim);
-      flag = true;
+    int64_t tmp = idx_dim[output_dim_size - 1];
+    std::vector<int64_t> list;
+    list.clear();
+    int64_t l = std::min(axis1_, axis2_);
+    int64_t r = std::max(axis1_, axis2_);
+    for (size_t j = 0; j < output_dim_size - 1; j++) {
+      list.push_back(idx_dim[j]);
     }
-    if (flag) {
-      int64_t idx_output = 0;
-      for (size_t i = 0; i < idx_dim.size(); i++) {
-        idx_output = idx_output + idx_dim[i] * output_stride[i];
+    if (offset_ == 0) {
+      list.insert(list.begin() + l, tmp);
+      list.insert(list.begin() + r, tmp);
+    } else if (offset_ > 0) {
+      if (axis1_ < axis2_) {
+        list.insert(list.begin() + l, tmp);
+        list.insert(list.begin() + r, tmp + offset_);
+      } else {
+        list.insert(list.begin() + l, tmp + offset_);
+        list.insert(list.begin() + r, tmp);
       }
+    } else if (offset_ < 0) {
+      if (axis1_ < axis2_) {
+        list.insert(list.begin() + l, tmp - offset_);
+        list.insert(list.begin() + r, tmp);
+      } else {
+        list.insert(list.begin() + l, tmp);
+        list.insert(list.begin() + r, tmp - offset_);
+      }
+    }
+
+    int64_t input_offset = 0;
+
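+    // "list" now holds the complete input coordinate of this output element,
+    // so folding it with the input strides below yields the flattened index
+    // of the source element to copy.
+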
for (size_t i = 0; i < input_dim_size; i++) { + input_offset = input_offset + list[i] * input_stride[i]; } + output_data[idx] = input_data[input_offset]; } } } // namespace phi diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index 92f970aed3279..a30fb79f8c8b0 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -156,59 +156,59 @@ __global__ void DiagonalCuda(const T* data1, int64_t* x_stride, int64_t* out_stride, int64_t numel, + int64_t out_numel, bool is_grad) { - CUDA_KERNEL_LOOP(idx, numel) { - int64_t idx_dim[X_DIM_SIZE] = {0}; + CUDA_KERNEL_LOOP(idx, out_numel) { + int64_t idx_dim[OUT_DIM_SIZE] = {0}; int64_t temp = 0; - for (size_t i = 0; i < X_DIM_SIZE - 1; i++) { - idx_dim[i] = (idx - temp) / x_stride[i]; - temp = temp + idx_dim[i] * x_stride[i]; + for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) { + idx_dim[i] = (idx - temp) / out_stride[i]; + temp = temp + idx_dim[i] * out_stride[i]; } - idx_dim[X_DIM_SIZE - 1] = idx - temp; - - int64_t axis1_dim = idx_dim[axis1_]; - int64_t axis2_dim = idx_dim[axis2_]; - - int64_t out_dim[OUT_DIM_SIZE] = {0}; - int temp_pos = 0; - for (int i = 0; i < X_DIM_SIZE; i++) { - if (i != axis1_ && i != axis2_) { - out_dim[temp_pos] = idx_dim[i]; - temp_pos++; + idx_dim[OUT_DIM_SIZE - 1] = idx - temp; + int64_t tmp = idx - temp; + int64_t list[9]; + int64_t p = 0; + for (size_t j = 0; j < X_DIM_SIZE; j++) { + if (j == axis1_ || j == axis2_) { + list[j] = 0; + } else { + list[j] = idx_dim[p]; + p += 1; } } - bool flag = false; - if (offset_ == 0 && axis1_dim == axis2_dim) { - out_dim[temp_pos] = axis1_dim; - flag = true; - } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) { - out_dim[temp_pos] = axis1_dim; - flag = true; - } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) { - out_dim[temp_pos] = axis2_dim; - flag = true; - } - if (!is_grad) { - if (flag) { - int64_t idx_output = 0; - for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) { - idx_output = idx_output + out_dim[i] * out_stride[i]; - } - idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1]; - data2[idx_output] = data1[idx]; + int64_t l = min(axis1_, axis2_); + int64_t r = max(axis1_, axis2_); + if (offset_ == 0) { + list[l] = tmp; + list[r] = tmp; + } else if (offset_ > 0) { + if (axis1_ < axis2_) { + list[l] = tmp; + list[r] = tmp + offset_; + } else { + list[l] = tmp + offset_; + list[r] = tmp; } - } else { - if (flag) { - int64_t idx_output = 0; - for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) { - idx_output = idx_output + out_dim[i] * out_stride[i]; - } - idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1]; - data2[idx] = data1[idx_output]; + } else if (offset_ < 0) { + if (axis1_ < axis2_) { + list[l] = tmp - offset_; + list[r] = tmp; } else { - data2[idx] = static_cast(0); + list[l] = tmp; + list[r] = tmp - offset_; } } + int64_t input_offset = 0; + + for (size_t i = 0; i < X_DIM_SIZE; i++) { + input_offset = input_offset + list[i] * x_stride[i]; + } + if (!is_grad) { + data2[idx] = data1[input_offset]; + } else { + data2[input_offset] = data1[idx]; + } } } #endif diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 05a57426fcb21..a65d9af75f6a3 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -62,6 +62,10 @@ void DiagonalGradKernel(const Context& dev_ctx, int threads = PADDLE_CUDA_NUM_THREADS; int blocks = (numel + threads - 1) / threads; + int64_t dout_numel = 
out_grad.numel(); + phi::backends::gpu::GpuMemsetAsync( + dx_data, 0, numel * sizeof(T), dev_ctx.stream()); + switch (dx_dim_size) { case 2: funcs::DiagonalCuda<<>>(dout_data, @@ -72,6 +76,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 3: @@ -83,6 +88,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 4: @@ -94,6 +100,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 5: @@ -105,6 +112,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 6: @@ -116,6 +124,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 7: @@ -127,6 +136,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 8: @@ -138,6 +148,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; case 9: @@ -149,6 +160,7 @@ void DiagonalGradKernel(const Context& dev_ctx, dx_stride, dout_stride, numel, + dout_numel, true); break; default: diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 74bad0ecd9a35..74e7db258c7d1 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -54,9 +54,10 @@ void DiagonalKernel(const Context& dev_ctx, int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1; int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2; int64_t numel = input->numel(); + int64_t out_numel = out->numel(); int threads = PADDLE_CUDA_NUM_THREADS; - int blocks = (numel + threads - 1) / threads; + int blocks = (out_numel + threads - 1) / threads; switch (input_dim_size) { case 2: @@ -68,6 +69,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 3: @@ -79,6 +81,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 4: @@ -90,6 +93,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 5: @@ -101,6 +105,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 6: @@ -112,6 +117,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 7: @@ -123,6 +129,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 8: @@ -134,6 +141,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; case 9: @@ -145,6 +153,7 @@ void DiagonalKernel(const Context& dev_ctx, input_stride, output_stride, numel, + out_numel, false); break; default: diff --git a/python/paddle/fluid/tests/unittests/test_diagonal_op.py b/python/paddle/fluid/tests/unittests/test_diagonal_op.py index 5b3c3830c57ca..cb35a3fce5d03 100644 --- a/python/paddle/fluid/tests/unittests/test_diagonal_op.py +++ b/python/paddle/fluid/tests/unittests/test_diagonal_op.py @@ -101,6 +101,35 @@ def test_check_grad(self): pass +class TestDiagonalOpCase4(TestDiagonalOp): + def init_config(self): + self.case = np.random.randn(100, 100).astype('int64') + self.inputs = {'Input': self.case} + self.attrs = {'offset': 1, 'axis1': 1, 
'axis2': 0}
+        self.target = np.diagonal(
+            self.inputs['Input'],
+            offset=self.attrs['offset'],
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'],
+        )
+
+    def test_check_grad(self):
+        pass
+
+
+class TestDiagonalOpCase5(TestDiagonalOp):
+    def init_config(self):
+        self.case = np.random.randn(4, 2, 4, 4).astype('float32')
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': -2, 'axis1': 0, 'axis2': 3}
+        self.target = np.diagonal(
+            self.inputs['Input'],
+            offset=self.attrs['offset'],
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'],
+        )
+
+
 class TestDiagonalAPI(unittest.TestCase):
     def setUp(self):
         self.shape = [10, 3, 4]

From ea9e4085b14d7c6eedb0870625219aba8c566db8 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Thu, 8 Dec 2022 11:14:52 +0800
Subject: [PATCH 51/60] [API Clean] Clean __all__ to avoid exposing useless
 APIs (#48713)

* [API Clean] Clean __all__ to avoid exposing useless APIs

* fix import

* fix typo

* remove TracedLayer unittest
---
 .../unittests/test_directory_migration.py     |  1 -
 .../tests/unittests/test_imperative_mnist.py  | 16 ++----------
 .../unittests/test_imperative_ptb_rnn.py      | 24 +++---------------
 .../tests/unittests/test_imperative_resnet.py | 20 +++-------------
 ..._imperative_transformer_sorted_gradient.py | 25 ++-----------------
 .../unittests/test_op_function_generator.py   | 16 ------------
 python/paddle/jit/__init__.py                 |  2 --
 python/paddle/jit/api.py                      |  2 --
 python/paddle/jit/dy2static/__init__.py       |  2 +-
 .../jit/dy2static/assert_transformer.py       |  2 +-
 .../paddle/jit/dy2static/ast_transformer.py   |  2 +-
 .../paddle/jit/dy2static/base_transformer.py  |  2 ++
 .../jit/dy2static/basic_api_transformer.py    |  2 ++
 .../dy2static/break_continue_transformer.py   |  2 +-
 .../paddle/jit/dy2static/call_transformer.py  |  2 ++
 .../paddle/jit/dy2static/cast_transformer.py  |  2 ++
 .../paddle/jit/dy2static/convert_call_func.py |  2 +-
 .../paddle/jit/dy2static/convert_operators.py |  7 ++++--
 .../dy2static/create_variable_transformer.py  |  9 ++++---
 .../jit/dy2static/decorator_transformer.py    |  6 +++--
 .../jit/dy2static/early_return_transformer.py |  4 +--
 python/paddle/jit/dy2static/function_spec.py  |  2 ++
 python/paddle/jit/dy2static/logging_utils.py  |  2 +-
 .../jit/dy2static/logical_transformer.py      |  4 ++-
 .../paddle/jit/dy2static/loop_transformer.py  | 24 ++++++------------
 python/paddle/jit/dy2static/origin_info.py    |  2 ++
 .../paddle/jit/dy2static/partial_program.py   |  2 ++
 .../jit/dy2static/program_translator.py       |  2 +-
 .../jit/dy2static/return_transformer.py       | 20 ++++++---------
 .../paddle/jit/dy2static/static_analysis.py   |  2 +-
 .../jit/dy2static/tensor_shape_transformer.py |  6 +++--
 .../jit/dy2static/typehint_transformer.py     |  2 ++
 .../jit/dy2static/variable_trans_func.py      |  9 ++-----
 33 files changed, 75 insertions(+), 152 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py
index 4212b73f214e8..6968c6041baaf 100644
--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -48,7 +48,6 @@ def test_new_directory(self):
             'paddle.distributed.ParallelEnv',
             'paddle.DataParallel',
             'paddle.jit',
-            'paddle.jit.TracedLayer',
             'paddle.jit.to_static',
             'paddle.jit.ProgramTranslator',
             'paddle.jit.TranslatedLayer',
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index d4a26eb4cef72..7c8977ee1815e 100644
--- 
a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard +from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer from paddle.nn import Linear @@ -153,19 +153,7 @@ def func_test_mnist_float32(self): dy_x_data = img.numpy() label = data[1] label.stop_gradient = True - - if batch_id % 10 == 0 and _in_legacy_dygraph(): - cost, traced_layer = paddle.jit.TracedLayer.trace( - mnist, inputs=img - ) - if program is not None: - self.assertTrue(program, traced_layer.program) - program = traced_layer.program - traced_layer.save_inference_model( - './infer_imperative_mnist' - ) - else: - cost = mnist(img) + cost = mnist(img) if traced_layer is not None: cost_static = traced_layer([img]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 55f7f1ec31f18..2a59dd396f000 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -16,7 +16,7 @@ import numpy as np from test_imperative_base import new_program_scope -from utils import DyGraphProgramDescTracerTestHelper, is_equal_program +from utils import DyGraphProgramDescTracerTestHelper import paddle import paddle.fluid as fluid @@ -24,9 +24,8 @@ import paddle.fluid.framework as framework from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.nn import Embedding -from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard +from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer -from paddle.jit import TracedLayer class SimpleLSTMRNN(fluid.Layer): @@ -298,25 +297,8 @@ def ptb_rnn_cpu_float32(self, is_sparse): y = to_variable(y_data) init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) - if i % 5 == 0 and _in_legacy_dygraph(): - outs, traced_layer = TracedLayer.trace( - ptb_model, [x, y, init_hidden, init_cell] - ) - outs_static = traced_layer([x, y, init_hidden, init_cell]) - helper.assertEachVar(outs, outs_static) - - if program is not None: - self.assertTrue( - is_equal_program(traced_layer.program, program) - ) - program = traced_layer.program - - traced_layer.save_inference_model( - './infe_imperative_ptb_rnn', feed=list(range(4)) - ) - else: - outs = ptb_model(x, y, init_hidden, init_cell) + outs = ptb_model(x, y, init_hidden, init_cell) dy_loss, last_hidden, last_cell = outs diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index a8cf1fc8ce86a..559ea6ff71e86 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -16,15 +16,14 @@ import numpy as np from test_imperative_base import new_program_scope -from utils import DyGraphProgramDescTracerTestHelper, is_equal_program +from utils import DyGraphProgramDescTracerTestHelper import paddle import paddle.fluid as fluid from paddle.fluid import BatchNorm, core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard +from paddle.fluid.framework import _test_eager_guard from paddle.fluid.layer_helper 
import LayerHelper -from paddle.jit import TracedLayer # NOTE(zhiqiu): run with FLAGS_cudnn_deterministic=1 @@ -301,20 +300,7 @@ def func_test_resnet_float32(self): label.stop_gradient = True out = None - if batch_id % 5 == 0 and _in_legacy_dygraph(): - out, traced_layer = TracedLayer.trace(resnet, img) - if program is not None: - self.assertTrue( - is_equal_program(program, traced_layer.program) - ) - - traced_layer.save_inference_model( - './infer_imperative_resnet' - ) - - program = traced_layer.program - else: - out = resnet(img) + out = resnet(img) if traced_layer is not None: resnet.eval() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 0bd69f0359104..3cc07ee6a3378 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -23,12 +23,11 @@ from paddle.fluid import Embedding, Layer, core from paddle.fluid.dygraph import guard, to_variable from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard -from paddle.jit import TracedLayer from paddle.nn import Linear np.set_printoptions(suppress=True) -from utils import DyGraphProgramDescTracerTestHelper, is_equal_program +from utils import DyGraphProgramDescTracerTestHelper # Copy from models @@ -1171,27 +1170,7 @@ def run_dygraph(): for i in range(batch_num): enc_inputs, dec_inputs, label, weights = create_data() - if False: - outs, traced_layer = TracedLayer.trace( - transformer, [enc_inputs, dec_inputs, label, weights] - ) - - ins_static = enc_inputs + dec_inputs + [label, weights] - outs_static = traced_layer(ins_static) - helper.assertEachVar(outs, outs_static) - if program is not None: - self.assertTrue( - is_equal_program(program, traced_layer.program) - ) - - program = traced_layer.program - traced_layer.save_inference_model( - './infer_imperative_transformer', - feed=list(range(len(ins_static))), - fetch=list(range(len(outs_static))), - ) - else: - outs = transformer(enc_inputs, dec_inputs, label, weights) + outs = transformer(enc_inputs, dec_inputs, label, weights) dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = outs diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py b/python/paddle/fluid/tests/unittests/test_op_function_generator.py index 53edabb18236c..f3991150193c3 100644 --- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py +++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py @@ -20,8 +20,6 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle import _legacy_C_ops -from paddle.fluid.framework import in_dygraph_mode -from paddle.jit.api import TracedLayer class TestTracedLayer(fluid.dygraph.Layer): @@ -93,20 +91,6 @@ def test_trace_backward(self): np.testing.assert_array_equal(y_grad, loss.gradient() * a) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - def test_traced_layer(self): - if in_dygraph_mode(): - return - with fluid.dygraph.guard(): - layer = TestTracedLayer("test_traced_layer") - a = np.random.uniform(-1, 1, self.shape).astype(self.dtype) - x = fluid.dygraph.to_variable(a) - res_dygraph, static_layer = TracedLayer.trace( - layer, inputs=x - ) # dygraph out - res_static_graph = static_layer([x])[0] - - np.testing.assert_array_equal(res_dygraph.numpy(), res_static_graph) - if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 4f32b7e29c60d..af4aad939acbe 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -15,7 +15,6 @@ from .api import save from .api import load -from .api import TracedLayer from .api import set_code_level from .api import set_verbosity from .api import declarative as to_static @@ -34,5 +33,4 @@ 'set_code_level', 'set_verbosity', 'not_to_static', - 'TracedLayer', ] diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 95b07a989a7a7..ea30bb658b89f 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -74,9 +74,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator __all__ = [ - 'TracedLayer', 'declarative', - 'dygraph_to_static_func', 'set_code_level', 'set_verbosity', 'save', diff --git a/python/paddle/jit/dy2static/__init__.py b/python/paddle/jit/dy2static/__init__.py index 89204e62cee14..b55d5d672c2b1 100644 --- a/python/paddle/jit/dy2static/__init__.py +++ b/python/paddle/jit/dy2static/__init__.py @@ -36,6 +36,6 @@ from .assert_transformer import AssertTransformer from .ast_transformer import DygraphToStaticAst from .program_translator import convert_to_static -from .static_analysis import * # noqa: F403 +from .static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor __all__ = [] diff --git a/python/paddle/jit/dy2static/assert_transformer.py b/python/paddle/jit/dy2static/assert_transformer.py index 81bad1111b192..5fa5df0722886 100644 --- a/python/paddle/jit/dy2static/assert_transformer.py +++ b/python/paddle/jit/dy2static/assert_transformer.py @@ -22,7 +22,7 @@ BaseTransformer, ) -__all__ = ['AssertTransformer'] +__all__ = [] class AssertTransformer(BaseTransformer): diff --git a/python/paddle/jit/dy2static/ast_transformer.py b/python/paddle/jit/dy2static/ast_transformer.py index 2acbda4c8b2aa..9d61f6475db0c 100644 --- a/python/paddle/jit/dy2static/ast_transformer.py +++ b/python/paddle/jit/dy2static/ast_transformer.py @@ -71,7 +71,7 @@ from . 
import logging_utils from .utils import ast_to_source_code -__all__ = ['DygraphToStaticAst'] +__all__ = [] def apply_optimization(transformers): diff --git a/python/paddle/jit/dy2static/base_transformer.py b/python/paddle/jit/dy2static/base_transformer.py index 166753d05f57a..2956d91deeee4 100644 --- a/python/paddle/jit/dy2static/base_transformer.py +++ b/python/paddle/jit/dy2static/base_transformer.py @@ -27,6 +27,8 @@ get_attribute_full_name, ) +__all__ = [] + class BaseTransformer(gast.NodeTransformer): def visit(self, node): diff --git a/python/paddle/jit/dy2static/basic_api_transformer.py b/python/paddle/jit/dy2static/basic_api_transformer.py index 89fa0738b9a19..f35bfb8625674 100644 --- a/python/paddle/jit/dy2static/basic_api_transformer.py +++ b/python/paddle/jit/dy2static/basic_api_transformer.py @@ -23,6 +23,8 @@ BaseTransformer, ) +__all__ = [] + class BasicApiTransformer(BaseTransformer): """ diff --git a/python/paddle/jit/dy2static/break_continue_transformer.py b/python/paddle/jit/dy2static/break_continue_transformer.py index 670cc842f349b..e3de6e03ea704 100644 --- a/python/paddle/jit/dy2static/break_continue_transformer.py +++ b/python/paddle/jit/dy2static/break_continue_transformer.py @@ -27,7 +27,7 @@ ForNodeVisitor, ) -__all__ = ['BreakContinueTransformer'] +__all__ = [] BREAK_NAME_PREFIX = '__break' CONTINUE_NAME_PREFIX = '__continue' diff --git a/python/paddle/jit/dy2static/call_transformer.py b/python/paddle/jit/dy2static/call_transformer.py index 012c73b99f654..fa1d71cbb4f53 100644 --- a/python/paddle/jit/dy2static/call_transformer.py +++ b/python/paddle/jit/dy2static/call_transformer.py @@ -25,6 +25,8 @@ PDB_SET = "pdb.set_trace" +__all__ = [] + class CallTransformer(BaseTransformer): """ diff --git a/python/paddle/jit/dy2static/cast_transformer.py b/python/paddle/jit/dy2static/cast_transformer.py index 96a504d70cdb5..c0d22d93a9070 100644 --- a/python/paddle/jit/dy2static/cast_transformer.py +++ b/python/paddle/jit/dy2static/cast_transformer.py @@ -22,6 +22,8 @@ BaseTransformer, ) +__all__ = [] + class CastTransformer(BaseTransformer): """ diff --git a/python/paddle/jit/dy2static/convert_call_func.py b/python/paddle/jit/dy2static/convert_call_func.py index da0560cb34692..3a2ffa70fc946 100644 --- a/python/paddle/jit/dy2static/convert_call_func.py +++ b/python/paddle/jit/dy2static/convert_call_func.py @@ -40,7 +40,7 @@ from paddle.jit.dy2static.utils import is_paddle_func, unwrap from paddle.fluid.dygraph.layers import Layer -__all__ = ["convert_call"] +__all__ = [] # The api(s) should be considered as plain function and convert diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 1d3e23a4b96b7..f67e1dd1585b3 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -15,7 +15,7 @@ import re import paddle from paddle.fluid.data_feeder import convert_dtype -from paddle.jit.dy2static.variable_trans_func import ( +from .variable_trans_func import ( to_static_variable, ) from paddle.fluid.framework import core, Variable @@ -43,10 +43,13 @@ from paddle.jit.dy2static.utils import ( UndefinedVar, Dygraph2StaticException, + GetterSetterHelper, ) -from paddle.jit.dy2static.utils import GetterSetterHelper + from paddle.fluid.layers.utils import copy_mutable_vars +__all__ = [] + def convert_attr(x, attr): if isinstance(x, Variable) and attr == "size": diff --git a/python/paddle/jit/dy2static/create_variable_transformer.py 
b/python/paddle/jit/dy2static/create_variable_transformer.py index 808a047c3a283..ae34266af8d23 100644 --- a/python/paddle/jit/dy2static/create_variable_transformer.py +++ b/python/paddle/jit/dy2static/create_variable_transformer.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.jit.dy2static.static_analysis import ( +from .static_analysis import ( AstNodeWrapper, ) -from paddle.jit.dy2static.utils import ( +from .utils import ( FunctionNameLivenessAnalysis, ) -from paddle.jit.dy2static.variable_trans_func import ( +from .variable_trans_func import ( create_undefined_var, ) from .base_transformer import ( @@ -26,6 +26,9 @@ ) +__all__ = [] + + class CreateVariableTransformer(BaseTransformer): """ """ diff --git a/python/paddle/jit/dy2static/decorator_transformer.py b/python/paddle/jit/dy2static/decorator_transformer.py index cd33f8625a3b0..01387fbc672b6 100644 --- a/python/paddle/jit/dy2static/decorator_transformer.py +++ b/python/paddle/jit/dy2static/decorator_transformer.py @@ -14,13 +14,13 @@ # limitations under the License. from paddle.utils import gast -from paddle.jit.dy2static.static_analysis import ( +from .static_analysis import ( AstNodeWrapper, ) from .base_transformer import ( BaseTransformer, ) -from paddle.jit.dy2static.utils import ( +from .utils import ( RE_PYNAME, RE_PYMODULE, ast_to_source_code, @@ -29,6 +29,8 @@ import re +__all__ = [] + IGNORE_NAMES = [ 'declarative', 'to_static', diff --git a/python/paddle/jit/dy2static/early_return_transformer.py b/python/paddle/jit/dy2static/early_return_transformer.py index 72076fb3cd883..61cef0f61b565 100644 --- a/python/paddle/jit/dy2static/early_return_transformer.py +++ b/python/paddle/jit/dy2static/early_return_transformer.py @@ -13,14 +13,14 @@ # limitations under the License. from paddle.utils import gast -from paddle.jit.dy2static.static_analysis import ( +from .static_analysis import ( AstNodeWrapper, ) from .base_transformer import ( BaseTransformer, ) -__all__ = ['EarlyReturnTransformer'] +__all__ = [] class EarlyReturnTransformer(BaseTransformer): diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py index 370fb36bcfb91..431f639a8197d 100644 --- a/python/paddle/jit/dy2static/function_spec.py +++ b/python/paddle/jit/dy2static/function_spec.py @@ -32,6 +32,8 @@ func_to_source_code, ) +__all__ = [] + class FunctionSpec: """ diff --git a/python/paddle/jit/dy2static/logging_utils.py b/python/paddle/jit/dy2static/logging_utils.py index ca36cb0b35aa5..d3b806a5d7cd0 100644 --- a/python/paddle/jit/dy2static/logging_utils.py +++ b/python/paddle/jit/dy2static/logging_utils.py @@ -18,7 +18,7 @@ from paddle.fluid import log_helper from .utils import ast_to_source_code -__all__ = ["TranslatorLogger", "set_verbosity", "set_code_level"] +__all__ = [] VERBOSITY_ENV_NAME = 'TRANSLATOR_VERBOSITY' CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL' diff --git a/python/paddle/jit/dy2static/logical_transformer.py b/python/paddle/jit/dy2static/logical_transformer.py index cd4d27ac73e57..59860d23bd17b 100644 --- a/python/paddle/jit/dy2static/logical_transformer.py +++ b/python/paddle/jit/dy2static/logical_transformer.py @@ -13,11 +13,13 @@ # limitations under the License. 
from paddle.utils import gast -from paddle.jit.dy2static.utils import ast_to_source_code +from .utils import ast_to_source_code from .base_transformer import ( BaseTransformer, ) +__all__ = [] + cmpop_type_to_str = { gast.Eq: "==", gast.NotEq: "!=", diff --git a/python/paddle/jit/dy2static/loop_transformer.py b/python/paddle/jit/dy2static/loop_transformer.py index 4bb99e830b350..fff46e56ffc90 100644 --- a/python/paddle/jit/dy2static/loop_transformer.py +++ b/python/paddle/jit/dy2static/loop_transformer.py @@ -17,22 +17,16 @@ from collections import defaultdict from paddle.fluid import unique_name -from paddle.jit.dy2static.static_analysis import ( - AstNodeWrapper, -) -from paddle.jit.dy2static.static_analysis import NodeVarType -from paddle.jit.dy2static.static_analysis import ( - StaticAnalysisVisitor, -) -from paddle.jit.dy2static.utils import ast_to_source_code -from paddle.jit.dy2static.utils import get_attribute_full_name -from paddle.jit.dy2static.utils import ( +from .static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor +from .utils import ( + ast_to_source_code, + get_attribute_full_name, create_nonlocal_stmt_nodes, create_get_args_node, create_set_args_node, -) -from paddle.jit.dy2static.utils import ( FunctionNameLivenessAnalysis, + GetterSetterHelper, + create_name_str, ) from .ifelse_transformer import ARGS_NAME from .base_transformer import ( @@ -41,12 +35,8 @@ ForNodeVisitor, ) -from paddle.jit.dy2static.utils import ( - GetterSetterHelper, - create_name_str, -) -__all__ = ['LoopTransformer', 'NameVisitor'] +__all__ = [] WHILE_CONDITION_PREFIX = 'while_condition' WHILE_BODY_PREFIX = 'while_body' diff --git a/python/paddle/jit/dy2static/origin_info.py b/python/paddle/jit/dy2static/origin_info.py index 7f3c7f719a539..3ed9726772f95 100644 --- a/python/paddle/jit/dy2static/origin_info.py +++ b/python/paddle/jit/dy2static/origin_info.py @@ -24,6 +24,8 @@ from collections.abc import Sequence +__all__ = [] + class Location: """ diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index d1ebdbe5ccc57..a4afeee0d721a 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -44,6 +44,8 @@ ) from paddle import _legacy_C_ops +__all__ = [] + class NestSequence: """ diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 37a85fc078bc9..b183537e2d10b 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -57,7 +57,7 @@ from .ast_transformer import DygraphToStaticAst -__all__ = ['ProgramTranslator', 'convert_to_static'] +__all__ = [] # For each traced function, we set `max_traced_program_count` = 10 to consider caching performance. # Once exceeding the threshold, we will raise warning to users to make sure the conversion is as expected. 
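
Emptying `__all__` only hides names from wildcard imports; explicit imports
keep working, which is why the call sites rewritten in this patch stay valid.
A minimal sketch of that behavior (the `helpers` module below is illustrative,
not part of this patch):

    # helpers.py
    __all__ = []  # "from helpers import *" re-exports nothing

    def convert(x):
        return x * 2

    # client.py
    from helpers import *        # binds no names from helpers
    from helpers import convert  # an explicit import still works
    print(convert(21))           # prints 42
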
diff --git a/python/paddle/jit/dy2static/return_transformer.py b/python/paddle/jit/dy2static/return_transformer.py index 8aa96b6157846..0782bde83d36a 100644 --- a/python/paddle/jit/dy2static/return_transformer.py +++ b/python/paddle/jit/dy2static/return_transformer.py @@ -15,22 +15,18 @@ from paddle.utils import gast from paddle.fluid import unique_name -from paddle.jit.dy2static.utils import index_in_list -from .break_continue_transformer import ( - ForToWhileTransformer, +from .utils import ( + index_in_list, + ast_to_source_code, + Dygraph2StaticException, + ORIGI_INFO, ) -from paddle.jit.dy2static.utils import ast_to_source_code +from .break_continue_transformer import ForToWhileTransformer from .base_transformer import ( BaseTransformer, ) -from paddle.jit.dy2static.utils import Dygraph2StaticException -from paddle.jit.dy2static.utils import ORIGI_INFO - -__all__ = [ - 'RETURN_NO_VALUE_MAGIC_NUM', - 'RETURN_NO_VALUE_VAR_NAME', - 'ReturnTransformer', -] + +__all__ = [] # Constant for the name of the variable which stores the boolean state that we # should return diff --git a/python/paddle/jit/dy2static/static_analysis.py b/python/paddle/jit/dy2static/static_analysis.py index 5b6c3d1261c84..da3964e620963 100644 --- a/python/paddle/jit/dy2static/static_analysis.py +++ b/python/paddle/jit/dy2static/static_analysis.py @@ -22,7 +22,7 @@ ast_to_source_code, ) -__all__ = ['AstNodeWrapper', 'NodeVarType', 'StaticAnalysisVisitor'] +__all__ = [] class NodeVarType: diff --git a/python/paddle/jit/dy2static/tensor_shape_transformer.py b/python/paddle/jit/dy2static/tensor_shape_transformer.py index ffdba7e790a77..94912d7e91690 100644 --- a/python/paddle/jit/dy2static/tensor_shape_transformer.py +++ b/python/paddle/jit/dy2static/tensor_shape_transformer.py @@ -14,14 +14,16 @@ from paddle.utils import gast -from paddle.jit.dy2static.utils import ast_to_source_code -from paddle.jit.dy2static.static_analysis import ( +from .utils import ast_to_source_code +from .static_analysis import ( AstNodeWrapper, ) from .base_transformer import ( BaseTransformer, ) +__all__ = [] + class TensorShapeTransformer(BaseTransformer): """ diff --git a/python/paddle/jit/dy2static/typehint_transformer.py b/python/paddle/jit/dy2static/typehint_transformer.py index d5c23d1d7d73c..129023ba3ed97 100644 --- a/python/paddle/jit/dy2static/typehint_transformer.py +++ b/python/paddle/jit/dy2static/typehint_transformer.py @@ -20,6 +20,8 @@ BaseTransformer, ) +__all__ = [] + class TypeHintTransformer(BaseTransformer): """ diff --git a/python/paddle/jit/dy2static/variable_trans_func.py b/python/paddle/jit/dy2static/variable_trans_func.py index c98823242b40a..1501aa38fc129 100644 --- a/python/paddle/jit/dy2static/variable_trans_func.py +++ b/python/paddle/jit/dy2static/variable_trans_func.py @@ -15,18 +15,13 @@ import paddle from paddle.utils import gast from paddle.fluid.framework import Variable -from paddle.jit.dy2static.utils import ( +from .utils import ( UndefinedVar, create_undefined_variable, ) from paddle.fluid.layers.utils import map_structure, is_sequence -__all__ = [ - 'create_bool_as_type', - 'create_fill_constant_node', - 'to_static_variable', - 'create_undefined_var', -] +__all__ = [] def create_undefined_var(name): From 911d6bb1fa8fe5f0b5e3f21229bdbe20dbfc033e Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Thu, 8 Dec 2022 11:21:21 +0800 Subject: [PATCH 52/60] Clean fluid APIs in distributed and fleet files (#48851) * Fix bug of reduce_sum op. When input.numel() > INT32_MAX, its result is wrong. * Remove climits. 
* Clean fluid APIs in the paddle/distributed and paddle/fleet folders. It
covers the following files:
python/paddle/distributed/__init__.py
python/paddle/distributed/collective.py
python/paddle/distributed/fleet/utils/fs.py
python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
python/paddle/distributed/fleet/utils/internal_storage.py
python/paddle/distributed/launch/context/device.py
python/paddle/distributed/parallel.py
python/paddle/distributed/parallel_with_gloo.py
python/paddle/distributed/spawn.py
python/paddle/framework/__init__.py
Note that 'paddle.fluid.dygraph.parallel.ParallelEnv' and
'fluid.framework.core' remain unchanged in those files. ParallelEnv is used
by paddle.fluid.dygraph.parallel.DataParallel. However, APIs in
paddle.fluid.dygraph.parallel can't be migrated to paddle.distributed, as
there are cyclic import dependencies in modules like paddle.static and
paddle.tensor. 'fluid.framework.core' will be changed to import
framework.core after fluid.core is migrated.

* Change TODO authors.
---
 python/paddle/distributed/__init__.py         |  3 +++
 python/paddle/distributed/collective.py       |  4 +++-
 python/paddle/distributed/fleet/utils/fs.py   |  1 +
 .../fleet/utils/hybrid_parallel_inference.py  |  4 +++-
 .../fleet/utils/hybrid_parallel_util.py       |  7 ++++--
 .../fleet/utils/internal_storage.py           | 22 +++++++++--------
 .../distributed/launch/context/device.py      | 24 ++++++++++---------
 python/paddle/distributed/parallel.py         | 12 ++++++++--
 .../paddle/distributed/parallel_with_gloo.py  |  1 +
 python/paddle/distributed/spawn.py            |  3 ++-
 python/paddle/framework/__init__.py           |  8 +++++++
 11 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index e7832758a8013..4e81ce52ef908 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -64,6 +64,9 @@ from .entry_attr import CountFilterEntry  # noqa: F401
 from .entry_attr import ShowClickEntry  # noqa: F401
 
+# (TODO: GhostScreaming) It needs migration of ParallelEnv. However,
+# it's hard to migrate APIs in paddle.fluid.dygraph.parallel completely.
+# It will be replaced later.
 from paddle.fluid.dygraph.parallel import ParallelEnv  # noqa: F401
 
 from . import cloud_utils  # noqa: F401
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 6d8cd60c6b48e..6b9075de20a0b 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -15,9 +15,11 @@ import datetime
 
 import paddle
+
+# (TODO: GhostScreaming) It will be removed later.
 import paddle.fluid.core as core
+from paddle.framework import _non_static_mode, in_dygraph_mode
 
-from ..fluid.framework import _non_static_mode, in_dygraph_mode
 from .communication.group import Group, _add_new_group, is_initialized
 from .fleet.layers.mpu.mp_ops import _c_concat  # noqa: F401
 from .fleet.layers.mpu.mp_ops import _c_identity  # noqa: F401
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index ea1fbc5c940fe..b61abbbaa5686 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -20,6 +20,7 @@ import shutil
 import time
 
+# (TODO: GhostScreaming) It will be removed later.
from paddle.fluid import core from .log_util import logger diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index 49aed0862f697..c2c9a31769db3 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -17,8 +17,10 @@ import numpy as np import paddle.distributed.fleet as fleet + +# (TODO: GhostScreaming) It will be removed later. import paddle.fluid.core as core -from paddle.fluid.framework import Block, Program, _non_static_mode +from paddle.framework import Block, Program, _non_static_mode class HybridParallelInferenceHelper: diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 5f7709f0fe121..2b0653ea35dc6 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -14,13 +14,16 @@ import paddle from paddle import framework + +# (TODO: GhostScreaming) It will be removed later. from paddle.fluid import core -from paddle.fluid.dygraph.parallel import ( +from paddle.framework import ( + _in_legacy_dygraph, _split_tensors, build_groups, + in_dygraph_mode, sync_params_buffers, ) -from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode from .log_util import logger diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py index ce3a4040988a5..2b27d6a0dcd77 100644 --- a/python/paddle/distributed/fleet/utils/internal_storage.py +++ b/python/paddle/distributed/fleet/utils/internal_storage.py @@ -25,7 +25,9 @@ import numpy as np import paddle -import paddle.fluid as fluid +from paddle import framework + +# (TODO: GhostScreaming) It will be removed later. from paddle.fluid import core from ..meta_parallel.sharding.sharding_utils import Type, device_guard @@ -111,7 +113,7 @@ def to(self, device, dtype=None, keep_alignment=True): if keep_alignment: self._array_params() - @fluid.dygraph.no_grad + @framework.no_grad() def add_rank_params(self, trainable_params, param2align, convert_gpu=True): """ Add new parameters to the InternalStorage. Params becomes a view of this InternalStorage buffer. @@ -145,7 +147,7 @@ def add_rank_params(self, trainable_params, param2align, convert_gpu=True): self._params.append(param) self._param_ids.append(id(param)) - @fluid.dygraph.no_grad + @framework.no_grad() def _add_param_as_view(self, param, align, convert_gpu=True): assert ( @@ -185,7 +187,7 @@ def _add_param_as_view(self, param, align, convert_gpu=True): self._fill = offset return p_shape - @fluid.dygraph.no_grad + @framework.no_grad() def _convert_buffer(self, param, p_shape, align): var_end = self._fill + np.prod(p_shape) @@ -199,7 +201,7 @@ def _convert_buffer(self, param, p_shape, align): self._fill = offset - @fluid.dygraph.no_grad + @framework.no_grad() def _array_params(self): """ Given the parameters which have been registered previously, rebuild the whole InternalStorage. @@ -261,7 +263,7 @@ def to(self, device, dtype=None, keep_alignment=True): if keep_alignment: self._array_grads() - @fluid.dygraph.no_grad + @framework.no_grad() def add_grad(self, param, align): """ Add a new parameter gradient to the InternalStorage. Param.grad becomes a view of this InternalStorage buffer. 
@@ -275,7 +277,7 @@ def add_grad(self, param, align): self._params.append(param) self._param_ids.append(id(param)) - @fluid.dygraph.no_grad + @framework.no_grad() def manumal_relase(self): """ Release the buffer from InternalStorage. The InternalStorage will need to be rebuilt before use. @@ -291,7 +293,7 @@ def manumal_relase(self): self.params_checked_in = 0 self._release = True - @fluid.dygraph.no_grad + @framework.no_grad() def rebuild(self): """ Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. @@ -305,7 +307,7 @@ def rebuild(self): self._release = False - @fluid.dygraph.no_grad + @framework.no_grad() def _array_grads(self): """ Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. @@ -315,7 +317,7 @@ def _array_grads(self): for p in self._params: self._add_grad_as_view(p, self._parm2align[p.name]) - @fluid.dygraph.no_grad + @framework.no_grad() def _add_grad_as_view(self, param, align): assert ( np.prod(self.buffer.shape) > 0 diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index f5aaf83d135a1..48dba9af56411 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -14,9 +14,11 @@ import os -import paddle.fluid as fluid from paddle.device import get_available_custom_device +# (TODO: GhostScreaming) It will be removed later. +from paddle.fluid import core + class DeviceType: CPU = 'cpu' @@ -148,25 +150,25 @@ def get_custom_devices_count(device_type): ) if visible_devices_str in os.environ: visible_devices = os.getenv(visible_devices_str) - elif fluid.core.is_compiled_with_cuda(): + elif core.is_compiled_with_cuda(): dev._dtype = DeviceType.GPU - num = fluid.core.get_cuda_device_count() + num = core.get_cuda_device_count() visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") - elif fluid.core.is_compiled_with_xpu(): + elif core.is_compiled_with_xpu(): dev._dtype = DeviceType.XPU - num = fluid.core.get_xpu_device_count() + num = core.get_xpu_device_count() visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif fluid.core.is_compiled_with_npu(): + elif core.is_compiled_with_npu(): dev._dtype = DeviceType.NPU - num = fluid.core.get_npu_device_count() + num = core.get_npu_device_count() visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") - elif fluid.core.is_compiled_with_mlu(): + elif core.is_compiled_with_mlu(): dev._dtype = DeviceType.MLU - num = fluid.core.get_mlu_device_count() + num = core.get_mlu_device_count() visible_devices = os.getenv("MLU_VISIBLE_DEVICES") - elif fluid.core.is_compiled_with_ipu(): + elif core.is_compiled_with_ipu(): dev._dtype = DeviceType.IPU - num = fluid.core.get_ipu_device_count() + num = core.get_ipu_device_count() # For IPUs, 'labels' is a list which contains the available numbers of IPU devices. dev._labels = [str(x) for x in range(0, num + 1)] return dev diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index bd449acabf4db..99a71146104ef 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -38,10 +38,18 @@ from paddle.distributed.fleet.launch_utils import check_backend # deprecated module import +# (TODO: GhostScreaming) It will be removed later. from paddle.fluid import core -from paddle.fluid.dygraph import parallel_helper + +# (TODO: GhostScreaming) It will be removed later. 
from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.fluid.framework import _set_expected_place, in_dygraph_mode + +# (TODO: GhostScreaming) It will be removed later. +from paddle.framework import ( + _set_expected_place, + in_dygraph_mode, + parallel_helper, +) __all__ = [] diff --git a/python/paddle/distributed/parallel_with_gloo.py b/python/paddle/distributed/parallel_with_gloo.py index d0c1b3eac90ae..3204b6460d368 100755 --- a/python/paddle/distributed/parallel_with_gloo.py +++ b/python/paddle/distributed/parallel_with_gloo.py @@ -20,6 +20,7 @@ ) # deprecated module import +# (TODO: GhostScreaming) It will be removed later. from paddle.fluid import core __all__ = [] diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 21ea1d4bdc682..cb9804f452076 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -37,8 +37,9 @@ ) # deprecated module import +# (TODO: GhostScreaming) It will be removed later. from paddle.fluid import core -from paddle.fluid.framework import set_flags +from paddle.framework import set_flags __all__ = [] diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 30d637936804e..99d9cffed1fa3 100755 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -68,5 +68,13 @@ from ..fluid.framework import _global_flags # noqa: F401 from ..fluid.framework import _apply_pass # noqa: F401 from ..fluid.framework import switch_main_program +from ..fluid.framework import _set_expected_place # noqa: F401 +from ..fluid.framework import Block, Program # noqa: F401 +from ..fluid.dygraph import parallel_helper # noqa: F401 +from ..fluid.dygraph.parallel import ( + _split_tensors, + build_groups, + sync_params_buffers, +) __all__ = [] From 83c4145932afe30795bc1804d388284f3125bad7 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Thu, 8 Dec 2022 11:22:17 +0800 Subject: [PATCH 53/60] rm kunlun xpu2_op_list (#48826) *test=kunlun --- .../fluid/platform/device/xpu/xpu2_op_list.h | 773 ------------------ .../unittests/xpu/test_sampling_id_op_xpu.py | 48 -- 2 files changed, 821 deletions(-) delete mode 100644 paddle/fluid/platform/device/xpu/xpu2_op_list.h delete mode 100644 python/paddle/fluid/tests/unittests/xpu/test_sampling_id_op_xpu.py diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h deleted file mode 100644 index 59fd27dced779..0000000000000 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ /dev/null @@ -1,773 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#ifdef PADDLE_WITH_XPU -#include -#include -#include - -#include "paddle/fluid/framework/op_kernel_type.h" - -namespace paddle { -namespace platform { - -using vartype = paddle::framework::proto::VarType; -using pOpKernelType = paddle::framework::OpKernelType; -using XPUKernelSet = - std::unordered_set; -using XPUOpMap = std::unordered_map; - -XPUOpMap& get_kl2_ops() { - // KL1支持的op,通过op_name, data_type, place来索引 - static XPUOpMap s_xpu2_kernels{ - {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"abs_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"adadelta", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"adam", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"argsort_grad", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"argsort", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace())})}, - {"assign_value", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bmm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bmm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bce_loss_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bce_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"beam_search", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"beam_search_decode", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"bilinear_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_allgather", - XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"c_allreduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"c_embedding", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_identity", - XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, 
XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"c_sync_calc_stream", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_sync_comm_stream", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"cast", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"check_finite_and_unscale", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"clip_by_norm", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"coalesce_tensor", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"concat_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"concat", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"conv3d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"conv3d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"conv2d_transpose_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d_transpose", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"cumsum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"deformable_conv_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"deformable_conv", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"dropout", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_add_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_add", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_div_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_div", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_floordiv", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_max_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_max", - 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_min_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_min", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_pow", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"elementwise_mod", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"empty", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace())})}, - {"embedding_sparse_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"equal", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"exp_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"fill_any_like", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"fill_constant", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::COMPLEX64, XPUPlace()), - pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, - {"flatten2_grad", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten2", - 
XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range_grad", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_grad", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unfold", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"unfold_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"floor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gather_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_nd", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gather", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gaussian_random", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gelu", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"generate_proposals_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"grad_add", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"greater_equal", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_than", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"grid_sampler", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_sigmoid_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_sigmoid", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_swish_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"huber_loss_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"huber_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"kldiv_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"kldiv_loss_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"iou_similarity", - 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"index_select", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"instance_norm", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"instance_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"label_smooth", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"lars_momentum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"layer_norm", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"leaky_relu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_equal", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_than", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"load_combine", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"masked_select", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"masked_select_grad", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"matmul_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"matmul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"merged_momentum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mish_grad", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"momentum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mul_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"nearest_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"not_equal", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"one_hot_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"p_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"p_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pad3d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pad3d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pool2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"pool2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pow_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pow2_decay_with_linear_warmup", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reciprocal_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"reduce_max_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu6_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"relu", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, 
XPUPlace())})}, - {"reshape2", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"resnet_unit", - XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"resnet_unit_grad", - XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roll", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roll_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"scatter", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sampling_id", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace())})}, - {"sgd", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"sgd_dense_param_sparse_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"silu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"silu", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"sigmoid_cross_entropy_with_logits_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid_cross_entropy_with_logits", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"softmax", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softmax_with_cross_entropy_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softmax_with_cross_entropy", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softplus", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - 
{"softplus_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"split", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"split_with_num", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"squeeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"stack", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"stack_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"strided_slice", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"strided_slice_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"tanh", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"temporal_shift", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"temporal_shift_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tril_triu", - 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"tril_triu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"tile", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tile_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"truncated_gaussian_random", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"top_k", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"update_loss_scaling", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"uniform_random", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"unsqueeze2", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"unsqueeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", - XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"warpctc_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"warpctc", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where_index", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - - // AddMore - {"sequence_conv", - XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, - {"sequence_conv_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sequence_unpad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - // Fused op - {"resnet_basic_block_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"resnet_basic_block", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"fused_gemm_epilogue", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fused_gemm_epilogue_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fused_attention", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fused_attention_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fused_feedforward", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fused_feedforward_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - }; - - return s_xpu2_kernels; -} - -} // namespace platform -} // namespace paddle -#endif diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sampling_id_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sampling_id_op_xpu.py deleted file mode 100644 index f2425c8f86088..0000000000000 --- a/python/paddle/fluid/tests/unittests/xpu/test_sampling_id_op_xpu.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import unittest - -import numpy as np - -sys.path.append("..") - -import paddle -import paddle.fluid as fluid - - -class TestSamplingIdShape(unittest.TestCase): - def test_shape(self): - paddle.enable_static() - x = fluid.layers.data(name='x', shape=[3], dtype='float32') - output = fluid.layers.sampling_id(x) - - place = fluid.XPUPlace(0) - exe = fluid.Executor(place=place) - exe.run(fluid.default_startup_program()) - - feed = { - 'x': np.array([[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32') - } - output_np = exe.run(feed=feed, fetch_list=[output])[0] - - self.assertEqual(output.shape[0], -1) - self.assertEqual(len(output.shape), 1) - self.assertEqual(output_np.shape[0], 2) - self.assertEqual(len(output_np.shape), 1) - - -if __name__ == "__main__": - unittest.main() From c8497414cb5a84ed72f4dd7cf66297a17d33d458 Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Thu, 8 Dec 2022 11:40:42 +0800 Subject: [PATCH 54/60] remove detection_output, iou_similarity and bipartite_match (#48773) --- python/paddle/fluid/layers/detection.py | 293 ------------------ python/paddle/fluid/tests/test_detection.py | 43 --- .../fluid/tests/unittests/test_layers.py | 9 - 3 files changed, 345 deletions(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 274919197827b..9a0af76269a7b 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -42,14 +42,11 @@ 'prior_box', 'density_prior_box', 'multi_box_head', - 'bipartite_match', - 'detection_output', 'anchor_generator', 'roi_perspective_transform', 'generate_proposal_labels', 'generate_proposals', 'generate_mask_labels', - 'iou_similarity', 'box_coder', 'polygon_box_transform', 'box_clip', @@ -63,205 +60,6 @@ ] -def detection_output( - loc, - scores, - prior_box, - prior_box_var, - background_label=0, - nms_threshold=0.3, - nms_top_k=400, - keep_top_k=200, - score_threshold=0.01, - nms_eta=1.0, - return_index=False, -): - """ - - Given the regression locations, classification confidences and prior boxes, - calculate the detection outputs by performing following steps: - - 1. Decode input bounding box predictions according to the prior boxes and - regression locations. - 2. Get the final detection results by applying multi-class non maximum - suppression (NMS). - - Please note, this operation doesn't clip the final output bounding boxes - to the image window. - - Args: - loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the - predicted locations of M bounding bboxes. Data type should be - float32 or float64. N is the batch size, - and each bounding box has four coordinate values and the layout - is [xmin, ymin, xmax, ymax]. - scores(Variable): A 3-D Tensor with shape [N, M, C] represents the - predicted confidence predictions. Data type should be float32 - or float64. N is the batch size, C is the - class number, M is number of bounding boxes. - prior_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, - each box is represented as [xmin, ymin, xmax, ymax]. Data type - should be float32 or float64. - prior_box_var(Variable): A 2-D Tensor with shape [M, 4] holds M group - of variance. Data type should be float32 or float64. - background_label(int): The index of background label, - the background label will be ignored. If set to -1, then all - categories will be considered. Default: 0. - nms_threshold(float): The threshold to be used in NMS. Default: 0.3. 
- nms_top_k(int): Maximum number of detections to be kept according - to the confidences after filtering detections based on - score_threshold and before NMS. Default: 400. - keep_top_k(int): Number of total bboxes to be kept per image after - NMS step. -1 means keeping all bboxes after NMS step. Default: 200. - score_threshold(float): Threshold to filter out bounding boxes with - low confidence score. If not provided, consider all boxes. - Default: 0.01. - nms_eta(float): The parameter for adaptive NMS. It works only when the - value is less than 1.0. Default: 1.0. - return_index(bool): Whether return selected index. Default: False - - Returns: - - A tuple with two Variables: (Out, Index) if return_index is True, - otherwise, a tuple with one Variable(Out) is returned. - - Out (Variable): The detection outputs is a LoDTensor with shape [No, 6]. - Data type is the same as input (loc). Each row has six values: - [label, confidence, xmin, ymin, xmax, ymax]. `No` is - the total number of detections in this mini-batch. For each instance, - the offsets in first dimension are called LoD, the offset number is - N + 1, N is the batch size. The i-th image has `LoD[i + 1] - LoD[i]` - detected results, if it is 0, the i-th image has no detected results. - - Index (Variable): Only return when return_index is True. A 2-D LoDTensor - with shape [No, 1] represents the selected index which type is Integer. - The index is the absolute value cross batches. No is the same number - as Out. If the index is used to gather other attribute such as age, - one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where - N is the batch size and M is the number of boxes. - - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - - paddle.enable_static() - - pb = fluid.data(name='prior_box', shape=[10, 4], dtype='float32') - pbv = fluid.data(name='prior_box_var', shape=[10, 4], dtype='float32') - loc = fluid.data(name='target_box', shape=[2, 21, 4], dtype='float32') - scores = fluid.data(name='scores', shape=[2, 21, 10], dtype='float32') - nmsed_outs, index = fluid.layers.detection_output(scores=scores, - loc=loc, - prior_box=pb, - prior_box_var=pbv, - return_index=True) - """ - helper = LayerHelper("detection_output", **locals()) - decoded_box = box_coder( - prior_box=prior_box, - prior_box_var=prior_box_var, - target_box=loc, - code_type='decode_center_size', - ) - scores = paddle.nn.functional.softmax(scores) - scores = paddle.transpose(scores, perm=[0, 2, 1]) - scores.stop_gradient = True - nmsed_outs = helper.create_variable_for_type_inference( - dtype=decoded_box.dtype - ) - if return_index: - index = helper.create_variable_for_type_inference(dtype='int') - helper.append_op( - type="multiclass_nms2", - inputs={'Scores': scores, 'BBoxes': decoded_box}, - outputs={'Out': nmsed_outs, 'Index': index}, - attrs={ - 'background_label': 0, - 'nms_threshold': nms_threshold, - 'nms_top_k': nms_top_k, - 'keep_top_k': keep_top_k, - 'score_threshold': score_threshold, - 'nms_eta': 1.0, - }, - ) - index.stop_gradient = True - else: - helper.append_op( - type="multiclass_nms", - inputs={'Scores': scores, 'BBoxes': decoded_box}, - outputs={'Out': nmsed_outs}, - attrs={ - 'background_label': 0, - 'nms_threshold': nms_threshold, - 'nms_top_k': nms_top_k, - 'keep_top_k': keep_top_k, - 'score_threshold': score_threshold, - 'nms_eta': 1.0, - }, - ) - nmsed_outs.stop_gradient = True - if return_index: - return nmsed_outs, index - return nmsed_outs - - -@templatedoc() -def iou_similarity(x, y, 
box_normalized=True, name=None): - """ - :alias_main: paddle.nn.functional.iou_similarity - :alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity - :old_api: paddle.fluid.layers.iou_similarity - - ${comment} - - Args: - x (Variable): ${x_comment}.The data type is float32 or float64. - y (Variable): ${y_comment}.The data type is float32 or float64. - box_normalized(bool): Whether treat the priorbox as a normalized box. - Set true by default. - Returns: - Variable: ${out_comment}.The data type is same with x. - - Examples: - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - - use_gpu = False - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - - x = fluid.data(name='x', shape=[None, 4], dtype='float32') - y = fluid.data(name='y', shape=[None, 4], dtype='float32') - iou = fluid.layers.iou_similarity(x=x, y=y) - - exe.run(fluid.default_startup_program()) - test_program = fluid.default_main_program().clone(for_test=True) - - [out_iou] = exe.run(test_program, - fetch_list=iou, - feed={'x': np.array([[0.5, 0.5, 2.0, 2.0], - [0., 0., 1.0, 1.0]]).astype('float32'), - 'y': np.array([[1.0, 1.0, 2.5, 2.5]]).astype('float32')}) - # out_iou is [[0.2857143], - # [0. ]] with shape: [2, 1] - """ - helper = LayerHelper("iou_similarity", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type="iou_similarity", - inputs={"X": x, "Y": y}, - attrs={"box_normalized": box_normalized}, - outputs={"Out": out}, - ) - return out - - @templatedoc() def box_coder( prior_box, @@ -533,97 +331,6 @@ def __create_var(type): return map_out -def bipartite_match( - dist_matrix, match_type=None, dist_threshold=None, name=None -): - """ - - This operator implements a greedy bipartite matching algorithm, which is - used to obtain the matching with the maximum distance based on the input - distance matrix. For input 2D matrix, the bipartite matching algorithm can - find the matched column for each row (matched means the largest distance), - also can find the matched row for each column. And this operator only - calculate matched indices from column to row. For each instance, - the number of matched indices is the column number of the input distance - matrix. **The OP only supports CPU**. - - There are two outputs, matched indices and distance. - A simple description, this algorithm matched the best (maximum distance) - row entity to the column entity and the matched indices are not duplicated - in each row of ColToRowMatchIndices. If the column entity is not matched - any row entity, set -1 in ColToRowMatchIndices. - - NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor. - If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. - If Tensor, the height of ColToRowMatchIndices is 1. - - NOTE: This API is a very low level API. It is used by :code:`ssd_loss` - layer. Please consider to use :code:`ssd_loss` instead. - - Args: - dist_matrix(Variable): This input is a 2-D LoDTensor with shape - [K, M]. The data type is float32 or float64. It is pair-wise - distance matrix between the entities represented by each row and - each column. For example, assumed one entity is A with shape [K], - another entity is B with shape [M]. The dist_matrix[i][j] is the - distance between A[i] and B[j]. The bigger the distance is, the - better matching the pairs are. NOTE: This tensor can contain LoD - information to represent a batch of inputs. 
One instance of this - batch can contain different numbers of entities. - match_type(str, optional): The type of matching method, should be - 'bipartite' or 'per_prediction'. None ('bipartite') by default. - dist_threshold(float32, optional): If `match_type` is 'per_prediction', - this threshold is to determine the extra matching bboxes based - on the maximum distance, 0.5 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tuple: - - matched_indices(Variable): A 2-D Tensor with shape [N, M]. The data - type is int32. N is the batch size. If match_indices[i][j] is -1, it - means B[j] does not match any entity in i-th instance. - Otherwise, it means B[j] is matched to row - match_indices[i][j] in i-th instance. The row number of - i-th instance is saved in match_indices[i][j]. - - matched_distance(Variable): A 2-D Tensor with shape [N, M]. The data - type is float32. N is batch size. If match_indices[i][j] is -1, - match_distance[i][j] is also -1.0. Otherwise, assumed - match_distance[i][j] = d, and the row offsets of each instance - are called LoD. Then match_distance[i][j] = - dist_matrix[d+LoD[i]][j]. - - Examples: - - >>> import paddle.fluid as fluid - >>> x = fluid.data(name='x', shape=[None, 4], dtype='float32') - >>> y = fluid.data(name='y', shape=[None, 4], dtype='float32') - >>> iou = fluid.layers.iou_similarity(x=x, y=y) - >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou) - """ - helper = LayerHelper('bipartite_match', **locals()) - match_indices = helper.create_variable_for_type_inference(dtype='int32') - match_distance = helper.create_variable_for_type_inference( - dtype=dist_matrix.dtype - ) - helper.append_op( - type='bipartite_match', - inputs={'DistMat': dist_matrix}, - attrs={ - 'match_type': match_type, - 'dist_threshold': dist_threshold, - }, - outputs={ - 'ColToRowMatchIndices': match_indices, - 'ColToRowMatchDist': match_distance, - }, - ) - return match_indices, match_distance - - def prior_box( input, image, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index a2745bbca8e71..23bcf526c7e33 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -77,49 +77,6 @@ def dynamic_graph(self, force_to_use_cpu=False): class TestDetection(unittest.TestCase): - def test_detection_output(self): - program = Program() - with program_guard(program): - pb = layers.data( - name='prior_box', - shape=[10, 4], - append_batch_size=False, - dtype='float32', - ) - pbv = layers.data( - name='prior_box_var', - shape=[10, 4], - append_batch_size=False, - dtype='float32', - ) - loc = layers.data( - name='target_box', - shape=[2, 10, 4], - append_batch_size=False, - dtype='float32', - ) - scores = layers.data( - name='scores', - shape=[2, 10, 20], - append_batch_size=False, - dtype='float32', - ) - out = layers.detection_output( - scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv - ) - out2, index = layers.detection_output( - scores=scores, - loc=loc, - prior_box=pb, - prior_box_var=pbv, - return_index=True, - ) - self.assertIsNotNone(out) - self.assertIsNotNone(out2) - self.assertIsNotNone(index) - self.assertEqual(out.shape[-1], 6) - print(str(program)) - def test_box_coder_api(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py 
b/python/paddle/fluid/tests/unittests/test_layers.py index e166b0adb0914..e39ed15e28c16 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2414,15 +2414,6 @@ def make_scale_variable(self): out = paddle.scale(input, scale=scale_var) return out - def make_iou_similarity(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data(name="x", shape=[4], dtype="float32") - y = self._get_data(name="y", shape=[4], dtype="float32") - out = layers.iou_similarity(x, y, name='iou_similarity') - return out - def make_bilinear_tensor_product_layer(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() From a5999d834514779090d49f0ea465f08fc1537075 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Thu, 8 Dec 2022 12:02:56 +0800 Subject: [PATCH 55/60] Set WaiterType of kGpuSync to kCPU (#48758) --- .../framework/new_executor/interpreter/stream_analyzer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 920fec72bd43a..88fac23338f54 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -356,7 +356,8 @@ void StreamAnalyzer::ShrinkEventInfo( platform::DeviceType StreamAnalyzer::GetWaiterType( const Instruction& instr) const { - if (instr.KernelType() == OpFuncType::kCpuSync) { + if (instr.KernelType() == OpFuncType::kCpuSync || + instr.KernelType() == OpFuncType::kGpuSync) { return platform::kCPU; } else { if (platform::is_xpu_place(place_)) { From fe86771afff3cdf524c372e3a8a0b40864a965fa Mon Sep 17 00:00:00 2001 From: liu zhengxi <380185688@qq.com> Date: Thu, 8 Dec 2022 12:23:39 +0800 Subject: [PATCH 56/60] [Migrate Fluid] Migrate Decoder, BeamSearchDecoder (#48754) --- python/paddle/fluid/layers/rnn.py | 767 ------------------ .../tests/unittests/test_rnn_decode_api.py | 155 +--- python/paddle/nn/decode.py | 624 +++++++++++++- 3 files changed, 628 insertions(+), 918 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 6799550e7f63a..52c0d133f0038 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -39,8 +39,6 @@ 'RNNCell', 'GRUCell', 'LSTMCell', - 'Decoder', - 'BeamSearchDecoder', 'rnn', 'birnn', 'dynamic_decode', @@ -48,7 +46,6 @@ 'TrainingHelper', 'GreedyEmbeddingHelper', 'SampleEmbeddingHelper', - 'BasicDecoder', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', @@ -821,632 +818,6 @@ def birnn( return outputs, final_states -class Decoder: - """ - :api_attr: Static Graph - - Decoder is the base class for any decoder instance used in `dynamic_decode`. - It provides interface for output generation for one time step, which can be - used to generate sequences. - - The key abstraction provided by Decoder is: - - 1. :code:`(initial_input, initial_state, finished) = initialize(inits)` , - which generates the input and state for the first decoding step, and gives the - initial status telling whether each sequence in the batch is finished. - It would be called once before the decoding iterations. - - 2. 
:code:`(output, next_state, next_input, finished) = step(time, input, state)` ,
-       which transforms the input and state to the output and new state, generates
-       input for the next decoding step, and emits the flag indicating finished status.
-       It is the main part for each decoding iteration.
-
-    3. :code:`(final_outputs, final_state) = finalize(outputs, final_state, sequence_lengths)` ,
-       which revises the outputs (stack of all time steps' output) and final state
-       (state from the last decoding step) to get the counterpart for special usage.
-       It does not need to be implemented if there is no need to revise the stacked
-       outputs and the state from the last decoding step. If implemented, it would
-       be called after the decoding iterations.
-
-    Decoder is more general than RNNCell, since the returned `next_input`
-    and `finished` let it determine the next input and when to finish by itself
-    when used in dynamic decoding. In practice a Decoder usually wraps an RNNCell
-    instance, though this is not required.
-    """
-
-    def initialize(self, inits):
-        r"""
-        Called once before the decoding iterations.
-
-        Parameters:
-            inits: Argument provided by the caller.
-
-        Returns:
-            tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \
-                `initial_inputs` and `initial_states` both are a (possibly nested \
-                structure of) tensor variable[s], and `finished` is a tensor with \
-                bool data type.
-        """
-        raise NotImplementedError
-
-    def step(self, time, inputs, states, **kwargs):
-        r"""
-        Called per step of decoding.
-
-        Parameters:
-            time(Variable): A Tensor with shape :math:`[1]` provided by the caller.
-                The data type is int64.
-            inputs(Variable): A (possibly nested structure of) tensor variable[s].
-            states(Variable): A (possibly nested structure of) tensor variable[s].
-            **kwargs: Additional keyword arguments, provided by the caller.
-
-        Returns:
-            tuple: A tuple( :code:`(outputs, next_states, next_inputs, finished)` ). \
-                `next_inputs` and `next_states` both are a (possibly nested \
-                structure of) tensor variable[s], and the structure, shape and \
-                data type must be the same as the counterparts from the input \
-                arguments. `outputs` is a (possibly nested structure of) tensor \
-                variable[s]. `finished` is a Tensor with bool data type.
-        """
-        raise NotImplementedError
-
-    def finalize(self, outputs, final_states, sequence_lengths):
-        r"""
-        Called once after the decoding iterations if implemented.
-
-        Parameters:
-            outputs(Variable): A (possibly nested structure of) tensor variable[s].
-                The structure and data type are the same as `output_dtype`.
-                The tensor stacks all time steps' output thus has shape
-                :math:`[time\_step, batch\_size, ...]` , which is done by the caller.
-            final_states(Variable): A (possibly nested structure of) tensor variable[s].
-                It is the `next_states` returned by `decoder.step` at the last decoding
-                step, thus has the same structure, shape and data type as the states
-                at any time step.
-
-        Returns:
-            tuple: A tuple( :code:`(final_outputs, final_states)` ). \
-                `final_outputs` and `final_states` both are a (possibly nested \
-                structure of) tensor variable[s].
-        """
-        raise NotImplementedError
-
-    @property
-    def tracks_own_finished(self):
-        """
-        Describes whether the Decoder keeps track of finished states by itself.
-
-        `decoder.step()` would emit a bool `finished` value at each decoding
-        step. The emitted `finished` can be used directly to determine whether
-        all batch entries are finished, or it can be combined with the finished
-        tracker kept in `dynamic_decode` by performing a logical OR to take the
-        already-finished entries into account.
-
-        If `False`, the latter would be taken when performing `dynamic_decode`,
-        which is the default. Otherwise, the former would be taken, which directly
-        uses the finished value emitted by the decoder as the finished status of
-        all batch entries; this is the case when batch entries might be
-        reordered, such as beams in BeamSearchDecoder.
-
-        Returns:
-            bool: A python bool `False`.
-        """
-        return False
-
-
-class BeamSearchDecoder(Decoder):
-    """
-    Decoder with beam search decoding strategy. It wraps a cell to get probabilities,
-    and follows a beam search step to calculate scores and select candidate
-    token ids for each decoding step.
-
-    Please refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
-    for more details.
-
-    **NOTE** When decoding with beam search, the `inputs` and `states` of the cell
-    would be tiled to `beam_size` (unsqueeze and tile), resulting in shapes like
-    `[batch_size * beam_size, ...]` , which is built into `BeamSearchDecoder` and
-    done automatically. Thus any other tensor with shape `[batch_size, ...]` used
-    in `cell.call` needs to be tiled manually first, which can be completed by using
-    :code:`BeamSearchDecoder.tile_beam_merge_with_batch` . The most common case
-    for this is the encoder output in the attention mechanism.
-
-    Returns:
-        BeamSearchDecoder: An instance of decoder which can be used in \
-            `paddle.nn.dynamic_decode` to implement decoding.
-
-    Examples:
-
-        .. code-block:: python
-
-            import numpy as np
-            import paddle
-            from paddle.nn import BeamSearchDecoder, dynamic_decode
-            from paddle.nn import GRUCell, Linear, Embedding
-            trg_embeder = Embedding(100, 32)
-            output_layer = Linear(32, 32)
-            decoder_cell = GRUCell(input_size=32, hidden_size=32)
-            decoder = BeamSearchDecoder(decoder_cell,
-                                        start_token=0,
-                                        end_token=1,
-                                        beam_size=4,
-                                        embedding_fn=trg_embeder,
-                                        output_fn=output_layer)
-
-    """
-
-    def __init__(
-        self,
-        cell,
-        start_token,
-        end_token,
-        beam_size,
-        embedding_fn=None,
-        output_fn=None,
-    ):
-        """
-        Constructor of BeamSearchDecoder.
-
-        Parameters:
-            cell(RNNCellBase): An instance of `RNNCellBase` or object with the same interface.
-            start_token(int): The start token id.
-            end_token(int): The end token id.
-            beam_size(int): The beam width used in beam search.
-            embedding_fn(optional): A callable to apply to selected candidate ids.
-                Mostly it is an embedding layer to transform ids to embeddings,
-                and the returned value acts as the `input` argument for `cell.call`.
-                If not provided, the id to embedding transformation must be built into
-                `cell.call`. Default None.
-            output_fn(optional): A callable to apply to the cell's output prior to
-                calculating scores and selecting candidate token ids. Default None.
-        """
-        self.cell = cell
-        self.embedding_fn = embedding_fn
-        self.output_fn = output_fn
-        self.start_token = start_token
-        self.end_token = end_token
-        self.beam_size = beam_size
-
-    @staticmethod
-    def tile_beam_merge_with_batch(x, beam_size):
-        r"""
-        Tile the batch dimension of a tensor. Specifically, this function takes
-        a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch
-        entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
-        `[batch_size * beam_size, s0, s1, ...]` composed of minibatch entries
-        `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
-        `beam_size` times.
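(The shape contract above can be sanity-checked with a minimal sketch. This assumes the migrated `paddle.nn.BeamSearchDecoder` exposes the same static method as the fluid version being removed here; the tensor values are illustrative only.)

    .. code-block:: python

        import paddle
        from paddle.nn import BeamSearchDecoder

        x = paddle.to_tensor([[1, 2], [3, 4]])  # shape [batch_size=2, 2]
        y = BeamSearchDecoder.tile_beam_merge_with_batch(x, beam_size=3)
        # y repeats each minibatch entry beam_size times, giving shape
        # [batch_size * beam_size, 2] = [6, 2]:
        # [[1, 2], [1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
        print(y.shape)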
- - Parameters: - x(Variable): A tensor with shape `[batch_size, ...]`. The data type - should be float32, float64, int32, int64 or bool. - beam_size(int): The beam width used in beam search. - - Returns: - Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. - """ - check_type( - x, 'x', (Variable), 'BeamSearchDecoder.tile_beam_merge_with_batch' - ) - x = nn.unsqueeze(x, [1]) # [batch_size, 1, ...] - expand_times = [1] * len(x.shape) - expand_times[1] = beam_size - x = paddle.tile(x, expand_times) # [batch_size, beam_size, ...] - x = paddle.transpose( - x, list(range(2, len(x.shape))) + [0, 1] - ) # [..., batch_size, beam_size] - # use 0 to copy to avoid wrong shape - x = paddle.reshape( - x, shape=[0] * (len(x.shape) - 2) + [-1] - ) # [..., batch_size * beam_size] - x = paddle.transpose( - x, [len(x.shape) - 1] + list(range(0, len(x.shape) - 1)) - ) # [batch_size * beam_size, ...] - return x - - def _split_batch_beams(self, x): - r""" - Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new - tensor with shape `[batch_size, beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. - """ - check_type(x, 'x', (Variable), 'BeamSearchDecoder._split_batch_beams') - # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch - return paddle.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:])) - - def _merge_batch_beams(self, x): - r""" - Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new - tensor with shape `[batch_size * beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. - """ - check_type(x, 'x', (Variable), 'BeamSearchDecoder._merge_batch_beams') - # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch - return paddle.reshape(x, shape=[-1] + list(x.shape[2:])) - - def _expand_to_beam_size(self, x): - r""" - This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed - of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a - shape `[batch_size, beam_size, s0, s1, ...]` composed of minibatch entries - `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated - `beam_size` times. - - Parameters: - x(Variable): A tensor with shape `[batch_size, ...]`, The data type - should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. - """ - check_type(x, 'x', (Variable), 'BeamSearchDecoder._expand_to_beam_size') - x = nn.unsqueeze(x, [1]) - expand_times = [1] * len(x.shape) - expand_times[1] = self.beam_size - x = paddle.tile(x, expand_times) - return x - - def _mask_probs(self, probs, finished): - r""" - Mask log probabilities. It forces finished beams to allocate all probability - mass to eos and unfinished beams to remain unchanged. - - Parameters: - probs(Variable): A tensor with shape `[batch_size, beam_size, vocab_size]`, - representing the log probabilities. Its data type should be float32 or float64. 
- finished(Variable): A tensor with shape `[batch_size, beam_size]`, - representing the finished status for all beams. Its data type - should be bool. - - Returns: - Variable: A tensor with the same shape and data type as `x`, \ - where unfinished beams stay unchanged and finished beams are \ - replaced with a tensor with all probability on the EOS token. - """ - check_type(probs, 'probs', (Variable), 'BeamSearchDecoder._mask_probs') - check_type( - finished, 'finished', (Variable), 'BeamSearchDecoder._mask_probs' - ) - # TODO: use where_op - finished = tensor.cast(finished, dtype=probs.dtype) - probs = paddle.multiply( - paddle.tile(nn.unsqueeze(finished, [2]), [1, 1, self.vocab_size]), - self.noend_mask_tensor, - ) - nn.elementwise_mul(probs, (finished - 1), axis=0) - return probs - - def _gather(self, x, indices, batch_size): - r""" - Gather from the tensor `x` using `indices`. - - Parameters: - x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. - indices(Variable): A `int64` tensor with shape `[batch_size, beam_size]`, - representing the indices that we use to gather. - batch_size(Variable): A tensor with shape `[1]`. Its data type should - be int32 or int64. - - Returns: - Variable: A tensor with the same shape and data type as `x`, \ - representing the gathered tensor. - """ - check_type(x, 'x', (Variable), 'BeamSearchDecoder._gather') - check_type(indices, 'indices', (Variable), 'BeamSearchDecoder._gather') - check_type( - batch_size, 'batch_size', (Variable), 'BeamSearchDecoder._gather' - ) - # TODO: compatibility of int32 and int64 - batch_size = ( - tensor.cast(batch_size, indices.dtype) - if batch_size.dtype != indices.dtype - else batch_size - ) - batch_size.stop_gradient = True # TODO: remove this - batch_pos = paddle.tile( - nn.unsqueeze( - paddle.arange(0, batch_size, 1, dtype=indices.dtype), [1] - ), - [1, self.beam_size], - ) - topk_coordinates = paddle.stack([batch_pos, indices], axis=2) - topk_coordinates.stop_gradient = True - return paddle.gather_nd(x, topk_coordinates) - - class OutputWrapper( - collections.namedtuple( - "OutputWrapper", ("scores", "predicted_ids", "parent_ids") - ) - ): - """ - The structure for the returned value `outputs` of `decoder.step`. - A namedtuple includes scores, predicted_ids, parent_ids as fields. - """ - - pass - - class StateWrapper( - collections.namedtuple( - "StateWrapper", ("cell_states", "log_probs", "finished", "lengths") - ) - ): - """ - The structure for the argument `states` of `decoder.step`. - A namedtuple includes cell_states, log_probs, finished, lengths as fields. - """ - - pass - - def initialize(self, initial_cell_states): - r""" - Initialize the BeamSearchDecoder. - - Parameters: - initial_cell_states(Variable): A (possibly nested structure of) - tensor variable[s]. An argument provided by the caller. - - Returns: - tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \ - `initial_inputs` is a tensor t filled by `start_token` with shape \ - `[batch_size, beam_size]` when `embedding_fn` is None, or the \ - returned value of `embedding_fn(t)` when `embedding_fn` is provided. \ - `initial_states` is a nested structure(namedtuple including cell_states, \ - log_probs, finished, lengths as fields) of tensor variables, where \ - `log_probs, finished, lengths` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, bool, int64`. 
\ - cell_states has a value with the same structure as the input \ - argument `initial_cell_states` but with tiled shape `[batch_size, beam_size, ...]`. \ - `finished` is a `bool` tensor filled by False with shape `[batch_size, beam_size]`. - """ - self.kinf = 1e9 - state = flatten(initial_cell_states)[0] - self.batch_size = paddle.shape(state)[0] - - self.start_token_tensor = tensor.fill_constant( - shape=[1], dtype="int64", value=self.start_token - ) - self.end_token_tensor = tensor.fill_constant( - shape=[1], dtype="int64", value=self.end_token - ) - - init_cell_states = map_structure( - self._expand_to_beam_size, initial_cell_states - ) - init_inputs = paddle.full( - shape=[self.batch_size, self.beam_size], - fill_value=self.start_token_tensor, - dtype=self.start_token_tensor.dtype, - ) - log_probs = paddle.tile( - tensor.assign( - np.array( - [[0.0] + [-self.kinf] * (self.beam_size - 1)], - dtype="float32", - ) - ), - [self.batch_size, 1], - ) - if paddle.get_default_dtype() == "float64": - log_probs = tensor.cast(log_probs, "float64") - # TODO: remove the restriction of force_cpu - init_finished = tensor.fill_constant_batch_size_like( - input=state, - shape=[-1, self.beam_size], - dtype="bool", - value=False, - force_cpu=True, - ) - init_lengths = paddle.zeros_like(init_inputs) - init_inputs = ( - self.embedding_fn(init_inputs) if self.embedding_fn else init_inputs - ) - return ( - init_inputs, - self.StateWrapper( - init_cell_states, log_probs, init_finished, init_lengths - ), - init_finished, - ) - - def _beam_search_step(self, time, logits, next_cell_states, beam_state): - r""" - Calculate scores and select candidate token ids. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - logits(Variable): A tensor with shape `[batch_size, beam_size, vocab_size]`, - representing the logits at the current time step. Its data type is float32. - next_cell_states(Variable): A (possibly nested structure of) tensor variable[s]. - It has the same structure, shape and data type as the `cell_states` of - `initial_states` returned by `initialize()`. It represents the next state - from the cell. - beam_state(Variable): A structure of tensor variables. - It is same as the `initial_states` returned by `initialize()` for - the first decoding step and `beam_search_state` returned by - `step()` for the others. - - Returns: - tuple: A tuple( :code:`(beam_search_output, beam_search_state)` ). \ - `beam_search_output` is a namedtuple(including scores, predicted_ids, \ - parent_ids as fields) of tensor variables, where \ - `scores, predicted_ids, parent_ids` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, int64, int64`. - `beam_search_state` has the same structure, shape and data type \ - as the input argument `beam_state`. 
- - """ - self.vocab_size = logits.shape[-1] - self.vocab_size_tensor = tensor.fill_constant( - shape=[1], dtype="int64", value=self.vocab_size - ) - noend_array = [-self.kinf] * self.vocab_size - noend_array[self.end_token] = 0 - - self.noend_mask_tensor = tensor.assign(np.array(noend_array, "float32")) - if paddle.get_default_dtype() == "float64": - self.noend_mask_tensor = tensor.cast( - self.noend_mask_tensor, "float64" - ) - - step_log_probs = paddle.log(paddle.nn.functional.softmax(logits)) - step_log_probs = self._mask_probs(step_log_probs, beam_state.finished) - log_probs = nn.elementwise_add( - x=step_log_probs, y=beam_state.log_probs, axis=0 - ) - # TODO: length penalty - scores = log_probs - scores = paddle.reshape(scores, [-1, self.beam_size * self.vocab_size]) - # TODO: add grad for topk then this beam search can be used to train - topk_scores, topk_indices = paddle.topk(x=scores, k=self.beam_size) - beam_indices = paddle.floor_divide(topk_indices, self.vocab_size_tensor) - token_indices = paddle.remainder(topk_indices, self.vocab_size_tensor) - next_log_probs = self._gather( - paddle.reshape(log_probs, [-1, self.beam_size * self.vocab_size]), - topk_indices, - self.batch_size, - ) - next_cell_states = map_structure( - lambda x: self._gather(x, beam_indices, self.batch_size), - next_cell_states, - ) - next_finished = self._gather( - beam_state.finished, beam_indices, self.batch_size - ) - next_lengths = self._gather( - beam_state.lengths, beam_indices, self.batch_size - ) - next_lengths = next_lengths + tensor.cast( - paddle.logical_not(next_finished), beam_state.lengths.dtype - ) - next_finished = paddle.logical_or( - next_finished, - paddle.equal(token_indices, self.end_token_tensor), - ) - - beam_search_output = self.OutputWrapper( - topk_scores, token_indices, beam_indices - ) - beam_search_state = self.StateWrapper( - next_cell_states, next_log_probs, next_finished, next_lengths - ) - return beam_search_output, beam_search_state - - def step(self, time, inputs, states, **kwargs): - r""" - Perform a beam search decoding step, which uses `cell` to get probabilities, - and follows a beam search step to calculate scores and select candidate - token ids. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - inputs(Variable): A tensor variable. It is same as `initial_inputs` - returned by `initialize()` for the first decoding step and - `next_inputs` returned by `step()` for the others. - states(Variable): A structure of tensor variables. - It is same as the `initial_states` returned by `initialize()` for - the first decoding step and `beam_search_state` returned by - `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller. - - Returns: - tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ - `beam_search_state` and `next_inputs` have the same structure, \ - shape and data type as the input arguments `states` and `inputs` separately. \ - `beam_search_output` is a namedtuple(including scores, predicted_ids, \ - parent_ids as fields) of tensor variables, where \ - `scores, predicted_ids, parent_ids` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, int64, int64`. \ - `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. 
- """ - inputs = map_structure(self._merge_batch_beams, inputs) - cell_states = map_structure(self._merge_batch_beams, states.cell_states) - cell_outputs, next_cell_states = self.cell( - inputs, cell_states, **kwargs - ) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure( - self._split_batch_beams, next_cell_states - ) - - if self.output_fn is not None: - cell_outputs = self.output_fn(cell_outputs) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states, - ) - finished = beam_search_state.finished - sample_ids = beam_search_output.predicted_ids - sample_ids.stop_gradient = True - next_inputs = ( - self.embedding_fn(sample_ids) if self.embedding_fn else sample_ids - ) - - return (beam_search_output, beam_search_state, next_inputs, finished) - - def finalize(self, outputs, final_states, sequence_lengths): - r""" - Use `gather_tree` to backtrace along the beam search tree and construct - the full predicted sequences. - - Parameters: - outputs(Variable): A structure(namedtuple) of tensor variables, - The structure and data type is same as `output_dtype`. - The tensor stacks all time steps' output thus has shape - `[time_step, batch_size, ...]`, which is done by the caller. - final_states(Variable): A structure(namedtuple) of tensor variables. - It is the `next_states` returned by `decoder.step` at last - decoding step, thus has the same structure, shape and data type - with states at any time step. - sequence_lengths(Variable): An `int64` tensor shaped `[batch_size, beam_size]`. - It contains sequence lengths for each beam determined during - decoding. - - Returns: - tuple: A tuple( :code:`(predicted_ids, final_states)` ). \ - `predicted_ids` is an `int64` tensor shaped \ - `[time_step, batch_size, beam_size]`. `final_states` is the same \ - as the input argument `final_states`. - """ - predicted_ids = paddle.nn.functional.gather_tree( - outputs.predicted_ids, outputs.parent_ids - ) - # TODO: use FinalBeamSearchDecoderOutput as output - return predicted_ids, final_states - - @property - def tracks_own_finished(self): - """ - BeamSearchDecoder reorders its beams and their finished state. Thus it - conflicts with `dynamic_decode` function's tracking of finished states. - Setting this property to true to avoid early stopping of decoding due - to mismanagement of the finished state. - - Returns: - bool: A python bool `True`. - """ - return True - - def _dynamic_decode_imperative( decoder, inits=None, @@ -2304,144 +1675,6 @@ def __init__( self.seed = seed -class BasicDecoder(Decoder): - """ - BasicDecoder is a subclass of Decoder and assembles a RNNCell and DecodeHelper - instance as members, where the DecodeHelper helps to implement customed - decoding strategies.. It performs one decoding step as following steps: - - 1. Perform `cell_outputs, cell_states = cell.call(inputs, states)` - to get outputs and new states from cell. - - 2. Perform `sample_ids = helper.sample(time, cell_outputs, cell_states)` - to sample ids as decoded results of the current time step. - - 3. Perform `finished, next_inputs, next_states = helper.next_inputs(time, - cell_outputs, cell_states, sample_ids)` to generate inputs, states and - finished status for the next decoding step. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - trg_emb = fluid.data(name="trg_emb", - shape=[None, None, 128], - dtype="float32") - - trg_embeder = lambda x: fluid.embedding( - x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) - output_layer = lambda x: layers.fc(x, - size=10000, - num_flatten_dims=len(x.shape) - 1, - param_attr=fluid.ParamAttr(name= - "output_w"), - bias_attr=False) - helper = layers.SampleEmbeddingHelper(trg_embeder, start_tokens=0, end_token=1) - decoder_cell = layers.GRUCell(hidden_size=128) - decoder = layers.BasicDecoder(decoder_cell, helper, output_fn=output_layer) - outputs = layers.dynamic_decode( - decoder=decoder, inits=decoder_cell.get_initial_states(encoder_output)) - """ - - def __init__(self, cell, helper, output_fn=None): - """ - Constructor of BasicDecoder. - - Parameters: - cell(RNNCell): An instance of `RNNCell` or object with the same interface. - helper(DecodeHelper): An instance of `DecodeHelper`. - output_fn(optional): A callable to apply to the cell's output prior to - sampling. Default None. - """ - self.cell = cell - self.helper = helper - self.output_fn = output_fn - - def initialize(self, initial_cell_states): - r""" - BasicDecoder initialization includes helper initialization and cell - initialization, and cell initialization uses `initial_cell_states` as - the result directly. - - Parameters: - initial_cell_states(Variable): A (possibly nested structure of) - tensor variable[s]. An argument provided by the caller `dynamic_decode`. - - Returns: - tuple: A tuple( :code:(initial_inputs, initial_cell_states, finished)` ). \ - `initial_inputs` and `initial_states` both are a (possibly nested \ - structure of) tensor variable[s], and `finished` is a tensor with \ - bool data type. `initial_inputs` and `finished` are the results \ - of `helper.initialize()`, and `initial_cell_states` is same as \ - the input argument counterpart. - """ - (initial_inputs, initial_finished) = self.helper.initialize() - return initial_inputs, initial_cell_states, initial_finished - - class OutputWrapper( - collections.namedtuple("OutputWrapper", ("cell_outputs", "sample_ids")) - ): - """ - The structure for the returned value `outputs` of `decoder.step`. - A namedtuple includes cell_outputs, sample_ids as fields. - """ - - pass - - def step(self, time, inputs, states, **kwargs): - r""" - Perform one decoding step as following steps: - - 1. Perform `cell_outputs, cell_states = cell.call(inputs, states)` - to get outputs and new states from cell. - - 2. Perform `sample_ids = helper.sample(time, cell_outputs, cell_states)` - to sample ids as decoded results of the current time step. - - 3. Perform `finished, next_inputs, next_states = helper.next_inputs(time, - cell_outputs, cell_states, sample_ids)` to generate inputs, states and - finished status for the next decoding step. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - inputs(Variable): A tensor variable. It is same as `initial_inputs` - returned by `initialize()` for the first decoding step and - `next_inputs` returned by `step()` for the others. - states(Variable): A structure of tensor variables. - It is same as the `initial_cell_states` returned by `initialize()` - for the first decoding step and `next_states` returned by - `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller - `dynamic_decode`. 
- - Returns: - tuple: A tuple( :code:`(outputs, next_states, next_inputs, finished)` ). \ - `outputs` is a namedtuple(including cell_outputs, sample_ids, \ - as fields) of tensor variables, where `cell_outputs` is the result \ - fof `cell.call()` and `sample_ids` is the result of `helper.sample()`. \ - `next_states` and `next_inputs` have the same structure, shape \ - and data type as the input arguments `states` and `inputs` separately. \ - `finished` is a `bool` tensor with shape `[batch_size]`. - """ - cell_outputs, cell_states = self.cell(inputs, states, **kwargs) - if self.output_fn is not None: - cell_outputs = self.output_fn(cell_outputs) - sample_ids = self.helper.sample( - time=time, outputs=cell_outputs, states=cell_states - ) - sample_ids.stop_gradient = True - (finished, next_inputs, next_states) = self.helper.next_inputs( - time=time, - outputs=cell_outputs, - states=cell_states, - sample_ids=sample_ids, - ) - outputs = self.OutputWrapper(cell_outputs, sample_ids) - return (outputs, next_states, next_inputs, finished) - - def dynamic_lstm( input, size, diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 5a1aaa78338a6..cddc44bbf79e1 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -151,23 +151,15 @@ def __call__( if self.decoding_strategy == "beam_search": beam_size = kwargs.get("beam_size", 4) - encoder_output = ( - layers.BeamSearchDecoder.tile_beam_merge_with_batch( - encoder_output, beam_size - ) + encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch( + encoder_output, beam_size ) - encoder_padding_mask = ( - layers.BeamSearchDecoder.tile_beam_merge_with_batch( - encoder_padding_mask, beam_size - ) + encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch( + encoder_padding_mask, beam_size ) - decoder = layers.BeamSearchDecoder( + decoder = BeamSearchDecoder( cell=self.decoder_cell, output_fn=output_layer, **kwargs ) - else: - decoder = layers.BasicDecoder( - self.decoder_cell, helper, output_fn=output_layer - ) ( decoder_output, @@ -535,130 +527,6 @@ def setUp(self): ) self.exe = Executor(place) - def test_mle_train(self): - paddle.enable_static() - self.model_hparams["decoding_strategy"] = "train_greedy" - agent = SeqPGAgent( - model_cls=Seq2SeqModel, - alg_cls=MLE, - model_hparams=self.model_hparams, - alg_hparams={"lr": 0.001}, - executor=self.exe, - main_program=fluid.Program(), - startup_program=fluid.Program(), - seed=123, - ) - self.exe.run(agent.startup_program) - for iter_idx in range(self.iter_num): - reward, cost = agent.learn( - { - "src": self.data["src"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size, - :, - ], - "src_sequence_length": self.data["src_sequence_length"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - "trg": self.data["trg"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size, - :, - ], - "trg_sequence_length": self.data["trg_sequence_length"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - "label": self.data["label"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - }, - fetch_list=[agent.cost, agent.cost], - ) - print( - "iter_idx: %d, reward: %f, cost: %f" - % (iter_idx, reward.mean(), cost) - ) - - def test_greedy_train(self): - paddle.enable_static() - self.model_hparams["decoding_strategy"] = "infer_greedy" - agent = 
SeqPGAgent( - model_cls=Seq2SeqModel, - alg_cls=PolicyGradient, - model_hparams=self.model_hparams, - alg_hparams={"lr": 0.001}, - executor=self.exe, - main_program=fluid.Program(), - startup_program=fluid.Program(), - seed=123, - ) - self.exe.run(agent.startup_program) - for iter_idx in range(self.iter_num): - reward, cost = agent.learn( - { - "src": self.data["src"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size, - :, - ], - "src_sequence_length": self.data["src_sequence_length"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - }, - fetch_list=[agent.reward, agent.cost], - ) - print( - "iter_idx: %d, reward: %f, cost: %f" - % (iter_idx, reward.mean(), cost) - ) - - def test_sample_train(self): - paddle.enable_static() - self.model_hparams["decoding_strategy"] = "infer_sample" - agent = SeqPGAgent( - model_cls=Seq2SeqModel, - alg_cls=PolicyGradient, - model_hparams=self.model_hparams, - alg_hparams={"lr": 0.001}, - executor=self.exe, - main_program=fluid.Program(), - startup_program=fluid.Program(), - seed=123, - ) - self.exe.run(agent.startup_program) - for iter_idx in range(self.iter_num): - reward, cost = agent.learn( - { - "src": self.data["src"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size, - :, - ], - "src_sequence_length": self.data["src_sequence_length"][ - iter_idx - * self.batch_size : (iter_idx + 1) - * self.batch_size - ], - }, - fetch_list=[agent.reward, agent.cost], - ) - print( - "iter_idx: %d, reward: %f, cost: %f" - % (iter_idx, reward.mean(), cost) - ) - def test_beam_search_infer(self): paddle.set_default_dtype("float32") paddle.enable_static() @@ -693,19 +561,6 @@ def test_beam_search_infer(self): fetch_list=[output], )[0] - def func_dynamic_basic_decoder(self): - paddle.disable_static() - src = paddle.to_tensor(np.random.randint(8, size=(8, 4))) - src_length = paddle.to_tensor(np.random.randint(8, size=(8))) - model = Seq2SeqModel(**self.model_hparams) - probs, samples, sample_length = model(src, src_length) - paddle.enable_static() - - def test_dynamic_basic_decoder(self): - with _test_eager_guard(): - self.func_dynamic_basic_decoder() - self.func_dynamic_basic_decoder() - class ModuleApiTest(unittest.TestCase): @classmethod diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py index ff4a6e4f482af..1e5f633b61f2c 100644 --- a/python/paddle/nn/decode.py +++ b/python/paddle/nn/decode.py @@ -12,7 +12,629 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..fluid.layers import BeamSearchDecoder # noqa: F401 +import collections + +import numpy as np + +import paddle + from ..fluid.layers import dynamic_decode # noqa: F401 +from ..fluid.layers.utils import flatten, map_structure __all__ = [] + + +class Decoder: + """ + :api_attr: Static Graph + + Decoder is the base class for any decoder instance used in `dynamic_decode`. + It provides interface for output generation for one time step, which can be + used to generate sequences. + + The key abstraction provided by Decoder is: + + 1. :code:`(initial_input, initial_state, finished) = initialize(inits)` , + which generates the input and state for the first decoding step, and gives the + initial status telling whether each sequence in the batch is finished. + It would be called once before the decoding iterations. + + 2. 
:code:`(output, next_state, next_input, finished) = step(time, input, state)` ,
+    which transforms the input and state to the output and new state, generates
+    input for the next decoding step, and emits the flag indicating finished status.
+    It is the main part of each decoding iteration.
+
+    3. :code:`(final_outputs, final_state) = finalize(outputs, final_state, sequence_lengths)` ,
+    which revises the outputs (stack of all time steps' output) and final state (state from the
+    last decoding step) to get the counterpart for special usage.
+    It does not need to be implemented if there is no need to revise the stacked
+    outputs and state from the last decoding step. If implemented, it would be
+    called after the decoding iterations.
+
+    Decoder is more general than RNNCell, since the returned `next_input`
+    and `finished` enable it to determine the input and when to finish by itself
+    when used in dynamic decoding. A Decoder usually wraps an RNNCell instance,
+    though this is not required.
+    """
+
+    def initialize(self, inits):
+        r"""
+        Called once before the decoding iterations.
+
+        Parameters:
+            inits: Argument provided by the caller.
+
+        Returns:
+            tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \
+                `initial_inputs` and `initial_states` both are a (possibly nested \
+                structure of) tensor variable[s], and `finished` is a tensor with \
+                bool data type.
+        """
+        raise NotImplementedError
+
+    def step(self, time, inputs, states, **kwargs):
+        r"""
+        Called per step of decoding.
+
+        Parameters:
+            time(Tensor): A Tensor with shape :math:`[1]` provided by the caller.
+                The data type is int64.
+            inputs(Tensor): A (possibly nested structure of) tensor variable[s].
+            states(Tensor): A (possibly nested structure of) tensor variable[s].
+            **kwargs: Additional keyword arguments, provided by the caller.
+
+        Returns:
+            tuple: A tuple( :code:`(outputs, next_states, next_inputs, finished)` ). \
+                `next_inputs` and `next_states` both are a (possibly nested \
+                structure of) tensor variable[s], and the structure, shape and \
+                data type must be the same as the counterparts in the input arguments. \
+                `outputs` is a (possibly nested structure of) tensor variable[s]. \
+                `finished` is a Tensor with bool data type.
+        """
+        raise NotImplementedError
+
+    def finalize(self, outputs, final_states, sequence_lengths):
+        r"""
+        Called once after the decoding iterations if implemented.
+
+        Parameters:
+            outputs(Tensor): A (possibly nested structure of) tensor variable[s].
+                The structure and data type are the same as `output_dtype`.
+                The tensor stacks all time steps' output thus has shape
+                :math:`[time\_step, batch\_size, ...]` , which is done by the caller.
+            final_states(Tensor): A (possibly nested structure of) tensor variable[s].
+                It is the `next_states` returned by `decoder.step` at the last decoding
+                step, and thus has the same structure, shape and data type as states at
+                any time step.
+
+        Returns:
+            tuple: A tuple( :code:`(final_outputs, final_states)` ). \
+                `final_outputs` and `final_states` both are a (possibly nested \
+                structure of) tensor variable[s].
+        """
+        raise NotImplementedError
+
+    @property
+    def tracks_own_finished(self):
+        """
+        Describes whether the Decoder keeps track of finished states by itself.
+
+        `decoder.step()` would emit a bool `finished` value at each decoding
+        step. The emitted `finished` can be used to determine whether all
+        batch entries are finished directly, or it can be combined with the
+        finished tracker kept in `dynamic_decode` by performing a logical OR
+        to take the already finished entries into account.
+
+        If `False`, the latter is used when performing `dynamic_decode`,
+        which is the default. Otherwise, the former is used, which takes
+        the finished value emitted by the decoder as the finished status of
+        all batch entries directly; this is the case when batch entries might
+        be reordered, such as beams in BeamSearchDecoder.
+
+        Returns:
+            bool: A python bool `False`.
+        """
+        return False
+
+
+class BeamSearchDecoder(Decoder):
+    """
+    Decoder with beam search decoding strategy. It wraps a cell to get probabilities,
+    and follows a beam search step to calculate scores and select candidate
+    token ids for each decoding step.
+
+    Please refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
+    for more details.
+
+    **NOTE** When decoding with beam search, the `inputs` and `states` of the cell
+    would be tiled to `beam_size` (unsqueeze and tile), resulting in shapes like
+    `[batch_size * beam_size, ...]` , which is built into `BeamSearchDecoder` and
+    done automatically. Thus any other tensor with shape `[batch_size, ...]` used
+    in `cell.call` needs to be tiled manually first, which can be done by using
+    :code:`BeamSearchDecoder.tile_beam_merge_with_batch` . The most common case
+    for this is the encoder output in the attention mechanism.
+
+    Returns:
+        BeamSearchDecoder: An instance of a decoder that can be used in \
+            `paddle.nn.dynamic_decode` to implement decoding.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            from paddle.nn import BeamSearchDecoder, dynamic_decode
+            from paddle.nn import GRUCell, Linear, Embedding
+            trg_embeder = Embedding(100, 32)
+            output_layer = Linear(32, 32)
+            decoder_cell = GRUCell(input_size=32, hidden_size=32)
+            decoder = BeamSearchDecoder(decoder_cell,
+                                        start_token=0,
+                                        end_token=1,
+                                        beam_size=4,
+                                        embedding_fn=trg_embeder,
+                                        output_fn=output_layer)
+
+    """
+
+    def __init__(
+        self,
+        cell,
+        start_token,
+        end_token,
+        beam_size,
+        embedding_fn=None,
+        output_fn=None,
+    ):
+        """
+        Constructor of BeamSearchDecoder.
+
+        Parameters:
+            cell(RNNCellBase): An instance of `RNNCellBase` or an object with the same interface.
+            start_token(int): The start token id.
+            end_token(int): The end token id.
+            beam_size(int): The beam width used in beam search.
+            embedding_fn(optional): A callable to apply to selected candidate ids.
+                Mostly it is an embedding layer to transform ids to embeddings,
+                and the returned value acts as the `input` argument for `cell.call`.
+                If not provided, the id to embedding transformation must be built into
+                `cell.call`. Default None.
+            output_fn(optional): A callable to apply to the cell's output prior to
+                calculating scores and selecting candidate token ids. Default None.
+        """
+        self.cell = cell
+        self.embedding_fn = embedding_fn
+        self.output_fn = output_fn
+        self.start_token = start_token
+        self.end_token = end_token
+        self.beam_size = beam_size
+
+    @staticmethod
+    def tile_beam_merge_with_batch(x, beam_size):
+        r"""
+        Tile the batch dimension of a tensor. Specifically, this function takes
+        a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch
+        entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
+        `[batch_size * beam_size, s0, s1, ...]` composed of minibatch entries
+        `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
+        `beam_size` times.
+ + Parameters: + x(Tensor): A tensor with shape `[batch_size, ...]`. The data type + should be float32, float64, int32, int64 or bool. + beam_size(int): The beam width used in beam search. + + Returns: + Tensor: A tensor with shape `[batch_size * beam_size, ...]`, whose \ + data type is same as `x`. + """ + x = paddle.unsqueeze(x, [1]) # [batch_size, 1, ...] + expand_times = [1] * len(x.shape) + expand_times[1] = beam_size + x = paddle.tile(x, expand_times) # [batch_size, beam_size, ...] + x = paddle.transpose( + x, list(range(2, len(x.shape))) + [0, 1] + ) # [..., batch_size, beam_size] + # use 0 to copy to avoid wrong shape + x = paddle.reshape( + x, shape=[0] * (len(x.shape) - 2) + [-1] + ) # [..., batch_size * beam_size] + x = paddle.transpose( + x, [len(x.shape) - 1] + list(range(0, len(x.shape) - 1)) + ) # [batch_size * beam_size, ...] + return x + + def _split_batch_beams(self, x): + r""" + Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new + tensor with shape `[batch_size, beam_size, ...]`. + + Parameters: + x(Tensor): A tensor with shape `[batch_size * beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Tensor: A tensor with shape `[batch_size, beam_size, ...]`, whose \ + data type is same as `x`. + """ + # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch + return paddle.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:])) + + def _merge_batch_beams(self, x): + r""" + Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new + tensor with shape `[batch_size * beam_size, ...]`. + + Parameters: + x(Tensor): A tensor with shape `[batch_size, beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Tensor: A tensor with shape `[batch_size * beam_size, ...]`, whose \ + data type is same as `x`. + """ + # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch + return paddle.reshape(x, shape=[-1] + list(x.shape[2:])) + + def _expand_to_beam_size(self, x): + r""" + This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed + of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a + shape `[batch_size, beam_size, s0, s1, ...]` composed of minibatch entries + `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated + `beam_size` times. + + Parameters: + x(Tensor): A tensor with shape `[batch_size, ...]`, The data type + should be float32, float64, int32, int64 or bool. + + Returns: + Tensor: A tensor with shape `[batch_size, beam_size, ...]`, whose \ + data type is same as `x`. + """ + x = paddle.unsqueeze(x, [1]) + expand_times = [1] * len(x.shape) + expand_times[1] = self.beam_size + x = paddle.tile(x, expand_times) + return x + + def _mask_probs(self, probs, finished): + r""" + Mask log probabilities. It forces finished beams to allocate all probability + mass to eos and unfinished beams to remain unchanged. + + Parameters: + probs(Tensor): A tensor with shape `[batch_size, beam_size, vocab_size]`, + representing the log probabilities. Its data type should be float32 or float64. + finished(Tensor): A tensor with shape `[batch_size, beam_size]`, + representing the finished status for all beams. Its data type + should be bool. + + Returns: + Tensor: A tensor with the same shape and data type as `x`, \ + where unfinished beams stay unchanged and finished beams are \ + replaced with a tensor with all probability on the EOS token. 
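+
+        For intuition, an equivalent elementwise form of this masking rule
+        (a minimal sketch with illustrative shapes, not part of the API):
+
+        .. code-block:: python
+
+            import paddle
+
+            vocab_size, eos = 4, 3
+            noend_mask = paddle.to_tensor([-1e9] * vocab_size, dtype='float32')
+            noend_mask[eos] = 0.0
+            # two batch entries, three beams; 1.0 marks a finished beam
+            finished = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]])
+            probs = paddle.log(
+                paddle.nn.functional.softmax(paddle.randn([2, 3, vocab_size]))
+            )
+            masked = finished.unsqueeze(-1) * noend_mask + (
+                1.0 - finished
+            ).unsqueeze(-1) * probs
+            # finished beams now place all probability mass on the EOS token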
+ """ + # TODO: use where_op + finished = paddle.cast(finished, dtype=probs.dtype) + + probs = paddle.multiply( + paddle.tile( + paddle.unsqueeze(finished, [2]), [1, 1, self.vocab_size] + ), + self.noend_mask_tensor, + ) - paddle.multiply(probs, (finished - 1).unsqueeze([2])) + + return probs + + def _gather(self, x, indices, batch_size): + r""" + Gather from the tensor `x` using `indices`. + + Parameters: + x(Tensor): A tensor with shape `[batch_size, beam_size, ...]`. + indices(Tensor): A `int64` tensor with shape `[batch_size, beam_size]`, + representing the indices that we use to gather. + batch_size(Tensor): A tensor with shape `[1]`. Its data type should + be int32 or int64. + + Returns: + Tensor: A tensor with the same shape and data type as `x`, \ + representing the gathered tensor. + """ + # TODO: compatibility of int32 and int64 + batch_size = ( + paddle.cast(batch_size, indices.dtype) + if batch_size.dtype != indices.dtype + else batch_size + ) + batch_size.stop_gradient = True # TODO: remove this + batch_pos = paddle.tile( + paddle.unsqueeze( + paddle.arange(0, batch_size, 1, dtype=indices.dtype), [1] + ), + [1, self.beam_size], + ) + topk_coordinates = paddle.stack([batch_pos, indices], axis=2) + topk_coordinates.stop_gradient = True + return paddle.gather_nd(x, topk_coordinates) + + class OutputWrapper( + collections.namedtuple( + "OutputWrapper", ("scores", "predicted_ids", "parent_ids") + ) + ): + """ + The structure for the returned value `outputs` of `decoder.step`. + A namedtuple includes scores, predicted_ids, parent_ids as fields. + """ + + pass + + class StateWrapper( + collections.namedtuple( + "StateWrapper", ("cell_states", "log_probs", "finished", "lengths") + ) + ): + """ + The structure for the argument `states` of `decoder.step`. + A namedtuple includes cell_states, log_probs, finished, lengths as fields. + """ + + pass + + def initialize(self, initial_cell_states): + r""" + Initialize the BeamSearchDecoder. + + Parameters: + initial_cell_states(Tensor): A (possibly nested structure of) + tensor variable[s]. An argument provided by the caller. + + Returns: + tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \ + `initial_inputs` is a tensor t filled by `start_token` with shape \ + `[batch_size, beam_size]` when `embedding_fn` is None, or the \ + returned value of `embedding_fn(t)` when `embedding_fn` is provided. \ + `initial_states` is a nested structure(namedtuple including cell_states, \ + log_probs, finished, lengths as fields) of tensor variables, where \ + `log_probs, finished, lengths` all has a tensor value shaped \ + `[batch_size, beam_size]` with data type `float32, bool, int64`. \ + cell_states has a value with the same structure as the input \ + argument `initial_cell_states` but with tiled shape `[batch_size, beam_size, ...]`. \ + `finished` is a `bool` tensor filled by False with shape `[batch_size, beam_size]`. 
+ """ + self.kinf = 1e9 + state = flatten(initial_cell_states)[0] + self.batch_size = paddle.shape(state)[0] + + self.start_token_tensor = paddle.full( + shape=[1], dtype="int64", fill_value=self.start_token + ) + self.end_token_tensor = paddle.full( + shape=[1], dtype="int64", fill_value=self.end_token + ) + + init_cell_states = map_structure( + self._expand_to_beam_size, initial_cell_states + ) + init_inputs = paddle.full( + shape=[self.batch_size, self.beam_size], + fill_value=self.start_token_tensor, + dtype=self.start_token_tensor.dtype, + ) + log_probs = paddle.tile( + paddle.assign( + np.array( + [[0.0] + [-self.kinf] * (self.beam_size - 1)], + dtype="float32", + ) + ), + [self.batch_size, 1], + ) + if paddle.get_default_dtype() == "float64": + log_probs = paddle.cast(log_probs, "float64") + + init_finished = paddle.full( + shape=[paddle.shape(state)[0], self.beam_size], + fill_value=False, + dtype="bool", + ) + + init_lengths = paddle.zeros_like(init_inputs) + init_inputs = ( + self.embedding_fn(init_inputs) if self.embedding_fn else init_inputs + ) + return ( + init_inputs, + self.StateWrapper( + init_cell_states, log_probs, init_finished, init_lengths + ), + init_finished, + ) + + def _beam_search_step(self, time, logits, next_cell_states, beam_state): + r""" + Calculate scores and select candidate token ids. + + Parameters: + time(Tensor): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + logits(Tensor): A tensor with shape `[batch_size, beam_size, vocab_size]`, + representing the logits at the current time step. Its data type is float32. + next_cell_states(Tensor): A (possibly nested structure of) tensor variable[s]. + It has the same structure, shape and data type as the `cell_states` of + `initial_states` returned by `initialize()`. It represents the next state + from the cell. + beam_state(Tensor): A structure of tensor variables. + It is same as the `initial_states` returned by `initialize()` for + the first decoding step and `beam_search_state` returned by + `step()` for the others. + + Returns: + tuple: A tuple( :code:`(beam_search_output, beam_search_state)` ). \ + `beam_search_output` is a namedtuple(including scores, predicted_ids, \ + parent_ids as fields) of tensor variables, where \ + `scores, predicted_ids, parent_ids` all has a tensor value shaped \ + `[batch_size, beam_size]` with data type `float32, int64, int64`. + `beam_search_state` has the same structure, shape and data type \ + as the input argument `beam_state`. 
+ + """ + self.vocab_size = logits.shape[-1] + self.vocab_size_tensor = paddle.full( + shape=[1], dtype="int64", fill_value=self.vocab_size + ) + noend_array = [-self.kinf] * self.vocab_size + noend_array[self.end_token] = 0 + + self.noend_mask_tensor = paddle.assign(np.array(noend_array, "float32")) + if paddle.get_default_dtype() == "float64": + self.noend_mask_tensor = paddle.cast( + self.noend_mask_tensor, "float64" + ) + + step_log_probs = paddle.log(paddle.nn.functional.softmax(logits)) + step_log_probs = self._mask_probs(step_log_probs, beam_state.finished) + + log_probs = paddle.add( + step_log_probs, beam_state.log_probs.unsqueeze([2]) + ) + + # TODO: length penalty + scores = log_probs + scores = paddle.reshape(scores, [-1, self.beam_size * self.vocab_size]) + # TODO: add grad for topk then this beam search can be used to train + topk_scores, topk_indices = paddle.topk(x=scores, k=self.beam_size) + beam_indices = paddle.floor_divide(topk_indices, self.vocab_size_tensor) + token_indices = paddle.remainder(topk_indices, self.vocab_size_tensor) + next_log_probs = self._gather( + paddle.reshape(log_probs, [-1, self.beam_size * self.vocab_size]), + topk_indices, + self.batch_size, + ) + next_cell_states = map_structure( + lambda x: self._gather(x, beam_indices, self.batch_size), + next_cell_states, + ) + next_finished = self._gather( + beam_state.finished, beam_indices, self.batch_size + ) + next_lengths = self._gather( + beam_state.lengths, beam_indices, self.batch_size + ) + next_lengths = next_lengths + paddle.cast( + paddle.logical_not(next_finished), beam_state.lengths.dtype + ) + next_finished = paddle.logical_or( + next_finished, + paddle.equal(token_indices, self.end_token_tensor), + ) + + beam_search_output = self.OutputWrapper( + topk_scores, token_indices, beam_indices + ) + beam_search_state = self.StateWrapper( + next_cell_states, next_log_probs, next_finished, next_lengths + ) + return beam_search_output, beam_search_state + + def step(self, time, inputs, states, **kwargs): + r""" + Perform a beam search decoding step, which uses `cell` to get probabilities, + and follows a beam search step to calculate scores and select candidate + token ids. + + Parameters: + time(Tensor): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + inputs(Tensor): A tensor variable. It is same as `initial_inputs` + returned by `initialize()` for the first decoding step and + `next_inputs` returned by `step()` for the others. + states(Tensor): A structure of tensor variables. + It is same as the `initial_states` returned by `initialize()` for + the first decoding step and `beam_search_state` returned by + `step()` for the others. + **kwargs: Additional keyword arguments, provided by the caller. + + Returns: + tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ + `beam_search_state` and `next_inputs` have the same structure, \ + shape and data type as the input arguments `states` and `inputs` separately. \ + `beam_search_output` is a namedtuple(including scores, predicted_ids, \ + parent_ids as fields) of tensor variables, where \ + `scores, predicted_ids, parent_ids` all has a tensor value shaped \ + `[batch_size, beam_size]` with data type `float32, int64, int64`. \ + `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. 
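+
+        A shape-level sketch of the merge/split bookkeeping performed by this
+        method (a minimal illustration with made-up sizes):
+
+        .. code-block:: python
+
+            import paddle
+
+            batch_size, beam_size, hidden = 2, 4, 8
+            x = paddle.randn([batch_size, beam_size, hidden])
+            # merged to [batch_size * beam_size, hidden] before cell.call
+            merged = paddle.reshape(x, [-1, hidden])
+            # split back to [batch_size, beam_size, hidden] afterwards
+            split = paddle.reshape(merged, [-1, beam_size, hidden])
+            assert split.shape == x.shape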
+        """
+        inputs = map_structure(self._merge_batch_beams, inputs)
+        cell_states = map_structure(self._merge_batch_beams, states.cell_states)
+        cell_outputs, next_cell_states = self.cell(
+            inputs, cell_states, **kwargs
+        )
+        cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
+        next_cell_states = map_structure(
+            self._split_batch_beams, next_cell_states
+        )
+
+        if self.output_fn is not None:
+            cell_outputs = self.output_fn(cell_outputs)
+
+        beam_search_output, beam_search_state = self._beam_search_step(
+            time=time,
+            logits=cell_outputs,
+            next_cell_states=next_cell_states,
+            beam_state=states,
+        )
+        finished = beam_search_state.finished
+        sample_ids = beam_search_output.predicted_ids
+        sample_ids.stop_gradient = True
+        next_inputs = (
+            self.embedding_fn(sample_ids) if self.embedding_fn else sample_ids
+        )
+
+        return (beam_search_output, beam_search_state, next_inputs, finished)
+
+    def finalize(self, outputs, final_states, sequence_lengths):
+        r"""
+        Use `gather_tree` to backtrack along the beam search tree and construct
+        the full predicted sequences.
+
+        Parameters:
+            outputs(Tensor): A structure (namedtuple) of tensor variables.
+                The structure and data type are the same as `output_dtype`.
+                The tensor stacks all time steps' output thus has shape
+                `[time_step, batch_size, ...]`, which is done by the caller.
+            final_states(Tensor): A structure (namedtuple) of tensor variables.
+                It is the `next_states` returned by `decoder.step` at the last
+                decoding step, and thus has the same structure, shape and data
+                type as states at any time step.
+            sequence_lengths(Tensor): An `int64` tensor shaped `[batch_size, beam_size]`.
+                It contains sequence lengths for each beam determined during
+                decoding.
+
+        Returns:
+            tuple: A tuple( :code:`(predicted_ids, final_states)` ). \
+                `predicted_ids` is an `int64` tensor shaped \
+                `[time_step, batch_size, beam_size]`. `final_states` is the same \
+                as the input argument `final_states`.
+        """
+        predicted_ids = paddle.nn.functional.gather_tree(
+            outputs.predicted_ids, outputs.parent_ids
+        )
+        # TODO: use FinalBeamSearchDecoderOutput as output
+        return predicted_ids, final_states
+
+    @property
+    def tracks_own_finished(self):
+        """
+        BeamSearchDecoder reorders its beams and their finished state. Thus it
+        conflicts with the tracking of finished states in `dynamic_decode`.
+        This property is set to true to avoid early stopping of decoding due
+        to mismanagement of the finished state.
+
+        Returns:
+            bool: A python bool `True`.
+        """
+        return True

From f88713e1707d0f2b2806d21c13973035ea19a796 Mon Sep 17 00:00:00 2001
From: Wilber 
Date: Thu, 8 Dec 2022 12:56:34 +0800
Subject: [PATCH 57/60] [Inference] Enable infer shape cache. (#48312)

---
 .../ir/runtime_context_cache_pass.cc          | 21 ++++-
 paddle/fluid/framework/operator.cc            | 80 +++++++++++++++++--
 paddle/fluid/framework/operator.h             |  8 +-
 .../passes/ir_graph_to_program_pass.cc        |  6 +-
 .../inference/api/paddle_pass_builder.cc      | 12 +--
 5 files changed, 104 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
index 451e41e767dc4..4f5e5edb893fe 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -14,6 +14,7 @@ limitations under the License. 
*/

 #include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/operator.h"

 namespace paddle {
@@ -21,10 +22,28 @@ namespace framework {
 namespace ir {

 void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
+  static constexpr char kNotAllowInferShapeCahce[] =
+      "@NOT_ALLOW_INFERSHAPE_CACHE@";
   VLOG(3) << "Applies Runtime Context Cache strategy.";
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp() && n->Op()) {
-      n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
+      n->Op()->SetAttr(framework::kEnableCacheRuntimeContext, true);
+    }
+  }
+
+  // If op1 -> var0 and op2 -> var0, then op1 and op2 do not support
+  // InferShapeCache.
+  std::unordered_map<std::string, std::vector<Node*>> var2ops;
+  for (auto* op_node : TopologySortOperations(*graph)) {
+    for (auto* var_node : op_node->outputs) {
+      var2ops[var_node->Name()].push_back(op_node);
+    }
+  }
+  for (auto& it : var2ops) {
+    if (it.second.size() > 1) {
+      for (auto op_node : it.second) {
+        op_node->Op()->SetAttr(kNotAllowInferShapeCahce, true);
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 538a76e738904..19d0c6ea0d2a8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -15,6 +15,7 @@ limitations under the License. */

 #include 
 #include 
+#include 

 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/convert_utils.h"
@@ -36,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler/supplement_tracing.h"
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_context.h"
 #include "paddle/phi/core/kernel_factory.h"
 #include "paddle/phi/ops/compat/signatures.h"
@@ -562,6 +564,14 @@ phi::DenseTensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
   }
 }

+OperatorWithKernel::OperatorWithKernel(const std::string& type,
+                                       const VariableNameMap& inputs,
+                                       const VariableNameMap& outputs,
+                                       const AttributeMap& attrs)
+    : OperatorBase(type, inputs, outputs, attrs) {}
+
+OperatorWithKernel::~OperatorWithKernel() = default;
+
 bool ExecutionContext::HasInput(const std::string& name) const {
   auto* var = InputVar(name);
   return var != nullptr;
@@ -1204,19 +1214,54 @@ class RuntimeInferShapeContext : public InferShapeContext {
 };

 struct OperatorWithKernel::CacheImpl {
+  static const char kNotAllowInferShapeCahce[];
   explicit CacheImpl(phi::KernelContext* kernel_ctx,
-                     RuntimeInferShapeContext* infer_shape_ctx)
-      : kernel_ctx_(kernel_ctx), infer_shape_ctx_(infer_shape_ctx) {}
+                     RuntimeInferShapeContext* infer_shape_ctx,
+                     const std::vector<phi::DenseTensor*>& tensors,
+                     bool not_allow_infer_shape_cache)
+      : kernel_ctx_(kernel_ctx),
+        infer_shape_ctx_(infer_shape_ctx),
+        tensors_(tensors),
+        not_allow_infer_shape_cache_(not_allow_infer_shape_cache) {}

   phi::KernelContext* getKernelContext() { return kernel_ctx_.get(); }
   RuntimeInferShapeContext* getRuntimeInferShapeContext() {
     return infer_shape_ctx_.get();
   }

+  bool NeedInferShape() {
+    if (not_allow_infer_shape_cache_) return true;
+
+    bool ret{false};
+    if (last_ddims_.empty() || tensors_.empty()) ret = true;
+    if (!ret) {
+      CHECK_EQ(last_ddims_.size(), tensors_.size());
+      for (size_t i = 0; i < last_ddims_.size(); ++i) {
+        if (tensors_[i]->dims() != last_ddims_[i]) {
+          ret = true;
+          break;
+        }
+      }
+    }
+    if (ret) {
+      last_ddims_.resize(tensors_.size());
+      for (size_t i = 0; i < last_ddims_.size(); ++i) {
+        last_ddims_[i] = tensors_[i]->dims();
+      }
+    }
+    VLOG(3) << "need infer shape is " << ret;
+    return ret;
+  }
+
  private:
  std::unique_ptr<phi::KernelContext> kernel_ctx_;
  std::unique_ptr<RuntimeInferShapeContext> infer_shape_ctx_;
+  std::vector<phi::DenseTensor*> tensors_;
+  bool not_allow_infer_shape_cache_;
+  std::vector<phi::DDim> last_ddims_;
 };
+const char OperatorWithKernel::CacheImpl::kNotAllowInferShapeCahce[] =
+    "@NOT_ALLOW_INFERSHAPE_CACHE@";

 static void CheckTensorNANOrInf(const std::string& op_type,
                                 const std::string& name,
@@ -1524,8 +1569,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     pre_scope_ = cur_scope;
   } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
              !need_prepare_phi_data_) {
-    if (!all_kernels_must_compute_runtime_shape_)
+    if (!all_kernels_must_compute_runtime_shape_ && impl_->NeedInferShape()) {
       this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+    }
     (*phi_kernel_)(impl_->getKernelContext());
   } else {
     if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
@@ -1828,9 +1874,31 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   phi::KernelContext phi_kernel_context;
   if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
       !need_prepare_data_) {
-    impl_ =
+    // TODO(inference): Now we only support dense_tensor cache; we may
+    // support ScalarTensor and SparseTensor in the future.
+    bool all_dense_tensor_input_{true};
+    for (auto& iter : Inputs()) {
+      for (auto& name : iter.second) {
+        all_dense_tensor_input_ &=
+            scope.FindVar(name)->IsType<phi::DenseTensor>();
+      }
+    }
+
+    std::vector<phi::DenseTensor*> tensors;
+    if (all_dense_tensor_input_) {
+      for (auto& iter : Inputs()) {
+        for (auto& name : iter.second) {
+          auto* t = scope.FindVar(name)->GetMutable<phi::DenseTensor>();
+          tensors.push_back(t);
+        }
+      }
+    }
+
+    impl_.reset(
        new CacheImpl(new phi::KernelContext(),
-                      new RuntimeInferShapeContext(*this, *runtime_ctx));
+                      new RuntimeInferShapeContext(*this, *runtime_ctx),
+                      tensors,
+                      HasAttr(CacheImpl::kNotAllowInferShapeCahce)));
     BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
     (*phi_kernel_)(impl_->getKernelContext());
   } else {
@@ -3246,6 +3314,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
   if (phi::OneDNNContext::classof(dev_ctx)) {
     phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
     one_dnn_ctx->ClearDnnAttr();
+    if (!RuntimeAttrs().empty()) need_prepare_phi_data_ = true;
   }
 #endif

@@ -3267,7 +3336,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
 #if defined(PADDLE_WITH_MKLDNN) || defined(PADDLE_WITH_CUDA)
   auto& runtime_attrs = RuntimeAttrs();
   for (const auto& attr_iter : runtime_attrs) {
-    need_prepare_phi_data_ = true;
     auto& attr_name = attr_iter.first;
     auto& attr = attr_iter.second;
     auto attr_propertys = paddle::operators::GetExtraAttrProperties(attr_name);
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 236ff7af8d230..07e1a26c7c0ab 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -612,8 +612,9 @@ class OperatorWithKernel : public OperatorBase {
   OperatorWithKernel(const std::string& type,
                      const VariableNameMap& inputs,
                      const VariableNameMap& outputs,
-                     const AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+                     const AttributeMap& attrs);
+
+  virtual ~OperatorWithKernel();

   static paddle::flat_hash_map<std::string, OpKernelMap>&
   AllOpKernels() {
@@ -785,8 +786,9 @@ class OperatorWithKernel : public OperatorBase {
   mutable std::unique_ptr<phi::Kernel> phi_kernel_;
   mutable std::unique_ptr<phi::ArgumentMappingFn> arg_map_fn_;

+ private:
   struct CacheImpl;
-  mutable CacheImpl* impl_{nullptr};
+  mutable std::unique_ptr<CacheImpl> impl_;
 };

 extern bool OpSupportGPU(const std::string& op_type);
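
The caching above re-runs an operator's infer_shape only when some input's
dims differ from those recorded at the previous run, and is disabled entirely
for ops whose output variable has more than one writer. A minimal Python
sketch of that policy (all names here are illustrative, not part of the
Paddle API):

.. code-block:: python

    class InferShapeCache:
        """Re-run infer_shape only when any input dims changed."""

        def __init__(self, allow_cache=True):
            # ops whose output var has several writer ops must not cache
            self.allow_cache = allow_cache
            self.last_dims = None  # input dims seen at the previous run

        def need_infer_shape(self, input_dims):
            if not self.allow_cache:
                return True
            if self.last_dims != input_dims:
                self.last_dims = list(input_dims)
                return True
            return False

    cache = InferShapeCache()
    assert cache.need_infer_shape([(8, 16)])      # first run: must infer
    assert not cache.need_infer_shape([(8, 16)])  # same dims: skip
    assert cache.need_infer_shape([(4, 16)])      # dims changed: re-infer
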
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index 3d86f7bf399a9..2f7f61406b384 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -23,6 +23,8 @@ namespace inference { namespace analysis { void IrGraphToProgramPass::RunImpl(Argument *argument) { + auto cache_pass = + framework::ir::PassRegistry::Instance().Get("runtime_context_cache_pass"); auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); @@ -31,14 +33,12 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { new int(argument->memory_optim_sort_kind())); } - std::unique_ptr graph(argument->main_graph_ptr()); - // Direct using ProgramDesc desc(argument->main_program()) may cause // incomplete copies of information. framework::ProgramDesc desc; desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); - pass->Apply(graph.release()); // the argument still own the graph. + pass->Apply(cache_pass->Apply(argument->main_graph_ptr())); argument->SetIrAnalyzedProgram( new framework::proto::ProgramDesc(*desc.Proto())); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 4e397fbd041c7..2fa96205426b1 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -188,7 +188,6 @@ const std::vector kGpuLowerPrecisionPasses{ "fc_fuse_pass", "fc_elementwise_layernorm_fuse_pass", "embedding_eltwise_layernorm_fuse_pass", - "runtime_context_cache_pass", }; const std::vector kTrtLowerPrecisionPasses{ @@ -254,10 +253,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif // "transpose_flatten_concat_fuse_pass", // "constant_folding_pass", // - // following pass should be located in the last, since it will - // work on all fused ops. - "float_to_half_pass", // - "runtime_context_cache_pass" + "float_to_half_pass", // }); use_gpu_ = true; @@ -322,10 +318,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "conv_transpose_bn_fuse_pass", // "conv_transpose_eltwiseadd_bn_fuse_pass", // "is_test_pass", // - "constant_folding_pass", - // following pass should be located in the last, since - // it will work on all fused ops. 
- "runtime_context_cache_pass"}); + "constant_folding_pass"}); use_gpu_ = false; } @@ -475,7 +468,6 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("int8_scale_calculation_mkldnn_pass"); passes_.push_back("params_quantization_mkldnn_pass"); passes_.push_back("mkldnn_inplace_pass"); - passes_.push_back("runtime_context_cache_pass"); } use_mkldnn_int8_ = true; #else From 9c395d386b9e32731a4047ba7f7e46b517a11982 Mon Sep 17 00:00:00 2001 From: heyanru <81976792+heyanru01@users.noreply.github.com> Date: Thu, 8 Dec 2022 13:05:22 +0800 Subject: [PATCH 58/60] [Fluid Clean] remove unfold, deformable_roi_pooling, shard_index, hard_swish, mish, uniform_random, unbind (#48451) --- python/paddle/distribution/uniform.py | 4 +- python/paddle/fluid/layers/nn.py | 671 ------------------ .../fluid/tests/unittests/CMakeLists.txt | 1 - .../test_mkldnn_elt_act_fuse_pass.py | 6 +- .../ir/inference/test_trt_activation_pass.py | 10 +- .../tests/unittests/test_activation_op.py | 8 +- .../tests/unittests/test_cuda_random_seed.py | 26 +- .../test_deformable_psroi_pooling.py | 596 ---------------- .../tests/unittests/test_gradient_clip.py | 14 +- .../fluid/tests/unittests/test_layers.py | 40 +- .../fluid/tests/unittests/test_random_seed.py | 34 +- .../fluid/tests/unittests/test_regularizer.py | 2 +- .../tests/unittests/test_regularizer_api.py | 2 +- ...tatic_shape_inferrence_for_shape_tensor.py | 4 +- .../fluid/tests/unittests/test_unbind_op.py | 2 +- .../unittests/test_uniform_random_bf16_op.py | 25 +- .../tests/unittests/test_uniform_random_op.py | 36 +- 17 files changed, 59 insertions(+), 1422 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index 9b41dd026709f..706ff73ee83ff 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -23,7 +23,7 @@ _non_static_mode, in_dygraph_mode, ) -from paddle.fluid.layers import nn, tensor +from paddle.fluid.layers import tensor from paddle.tensor import random @@ -187,7 +187,7 @@ def sample(self, shape, seed=0): return output else: output_shape = shape + batch_shape - output = nn.uniform_random( + output = paddle.uniform( output_shape, dtype=self.dtype, min=0.0, max=1.0, seed=seed ) * ( tensor.zeros(output_shape, dtype=self.dtype) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bf5853fad88d2..e470d2f13f177 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -90,13 +90,6 @@ 'mul', 'merge_selected_rows', 'get_tensor_from_selected_rows', - 'unfold', - 'deformable_roi_pooling', - 'shard_index', - 'hard_swish', - 'mish', - 'uniform_random', - 'unbind', ] OP_NAMEMAPPING = { @@ -3564,667 +3557,3 @@ def get_tensor_from_selected_rows(x, name=None): attrs={}, ) return out - - -def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): - r""" - - This op returns a col buffer of sliding local blocks of input x, also known - as im2col for batched 2D image tensors. For each block under the convolution filter, - all element will be rearranged as a column. While the convolution filter sliding over - the input feature map, a series of such columns will be formed. - - For each input :math:`x` with shape [N, C, H, W], the output shape [N, Cout, Lout] - can be calculated as following. - - .. 
math:: - - dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 - - dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 - - hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 - - wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 - - Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] - - Lout &= hout \times wout - - - Parameters: - x(Tensor): 4-D Tensor, input tensor of format [N, C, H, W], - data type can be float32 or float64 - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] - or an integer k treated as [k, k]. - strides(int|list): The strides, should be [stride_h, stride_w] - or an integer stride treated as [sride, stride]. - For default, strides will be [1, 1]. - paddings(int|list): The paddings of each dimension, should be - [padding_top, padding_left, padding_bottom, padding_right] - or [padding_h, padding_w] or an integer padding. - If [padding_h, padding_w] was given, it will expanded to - [padding_h, padding_w, padding_h, padding_w]. If an integer - padding was given, [padding, padding, padding, padding] will - be used. For default, paddings will be [0, 0, 0, 0] - dilations(int|list): the dilations of convolution kernel, should be - [dilation_h, dilation_w], or an integer dilation treated as - [dilation, dilation]. For default, it will be [1, 1]. - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - - - Returns: - The tensor corresponding to the sliding local blocks. - The output shape is [N, Cout, Lout] as decriabled above. - Cout is the total number of values within each block, - and Lout is the total number of such blocks. - The data type of output is the same as the input :math:`x` - - Return Type: - Tensor - - Examples: - - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.randn((100,3,224,224)) - y = F.unfold(x, [3, 3], 1, 1, 1) - """ - - return paddle.nn.functional.unfold( - x, kernel_sizes, strides, paddings, dilations, name - ) - - -def deformable_roi_pooling( - input, - rois, - trans, - no_trans=False, - spatial_scale=1.0, - group_size=[1, 1], - pooled_height=1, - pooled_width=1, - part_size=None, - sample_per_part=1, - trans_std=0.1, - position_sensitive=False, - name=None, -): - r""" - - Deformable ROI Pooling Layer - - Performs deformable region-of-interest pooling on inputs. As described - in `Deformable Convolutional Networks `_, it will get offset for each bin after - roi pooling so that pooling at correct region. Batch_size will change to the number of region bounding boxes after deformable_roi_pooling. - - The operation has three steps: - - 1. Dividing each region proposal into equal-sized sections with the pooled_width and pooled_height. - - 2. Add offset to pixel in ROI to get new location and the new value which are computed directly through - bilinear interpolation with four nearest pixel. - - 3. Sample several points in each bin to get average values as output. - - - Args: - input (Variable):The input of deformable roi pooling and it is tensor which value type is float32. The shape of input is - [N, C, H, W]. Where N is batch size, C is number of input channels, - H is height of the feature, and W is the width of the feature. - rois (Variable): ROIs (Regions of Interest) with type float32 to pool over. It should be - a 2-D LoDTensor of shape (num_rois, 4), and the lod level - is 1. 
Given as [[x1, y1, x2, y2], ...], (x1, y1) is - the top left coordinates, and (x2, y2) is the bottom - right coordinates, which value type is float32. - trans (Variable): Offset of features on ROIs while pooling which value type is float32. The format is [N, C, H, W], where - N is number of ROIs, C is number of channels, which indicate the offset distance - in the x and y directions, H is pooled height, and W is pooled width. - no_trans (bool): Whether to add offset to get new value or not while roi pooling, which value with type bool is True or False. - If value is True, no offset will be added in operation. Default: False. - spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width), which value type is float32. - Equals the reciprocal of total stride in convolutional layers, Default: 1.0. - group_size (list|tuple): The number of groups which input channels are divided and the input is list or tuple, which value type is int32. (eg.number of input channels - is k1 * k2 * (C + 1), which k1 and k2 are group width and height and C+1 is number of output - channels.) eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. - pooled_height (int): The pooled output height which value type is int32. Default: 1. - pooled_width (int): The pooled output width which value type is int32. Default: 1. - part_size (list|tuple): The height and width of offset which values in list or tuple is int32, eg.(4, 6), which height is 4 and width is 6, and values always equal to pooled_height \ - and pooled_width. Default: if None, default value is [pooled_height, pooled_width]. - sample_per_part (int): The number of samples in each bin which value type is int32. If value is bigger, it will consume more performance. Default: 1. - trans_std (float): Coefficient of offset which value type is float32. It controls weight of offset. Default: 0.1. - position_sensitive (bool): Whether to choose deformable psroi pooling mode or not, and value type is bool(True or False). If value is False, input dimension equals to output dimension. \ - If value is True, input dimension should be output dimension * pooled_height * pooled_width. Default: False. - name (str|None): Name of layer. Default: None. - Returns: - Variable: Output of deformable roi pooling is that, if position sensitive is False, input dimension equals to output dimension. If position sensitive is True,\ - input dimension should be the result of output dimension divided by pooled height and pooled width. - - Examples: - .. 
code-block:: python - - # position_sensitive=True - import paddle.fluid as fluid - input = fluid.data(name="input", - shape=[2, 192, 64, 64], - dtype='float32') - rois = fluid.data(name="rois", - shape=[-1, 4], - dtype='float32', - lod_level=1) - trans = fluid.data(name="trans", - shape=[2, 384, 64, 64], - dtype='float32') - x = fluid.layers.deformable_roi_pooling(input=input, - rois=rois, - trans=trans, - no_trans=False, - spatial_scale=1.0, - group_size=(1, 1), - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - trans_std=0.1, - position_sensitive=True) - - # position_sensitive=False - import paddle.fluid as fluid - input = fluid.data(name="input", - shape=[2, 192, 64, 64], - dtype='float32') - rois = fluid.data(name="rois", - shape=[-1, 4], - dtype='float32', - lod_level=1) - trans = fluid.data(name="trans", - shape=[2, 384, 64, 64], - dtype='float32') - x = fluid.layers.deformable_roi_pooling(input=input, - rois=rois, - trans=trans, - no_trans=False, - spatial_scale=1.0, - group_size=(1, 1), - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - trans_std=0.1, - position_sensitive=False) - """ - - check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'deformable_roi_pooling' - ) - check_variable_and_dtype( - rois, 'rois', ['float32', 'float64'], 'deformable_roi_pooling' - ) - check_variable_and_dtype( - trans, 'trans', ['float32', 'float64'], 'deformable_roi_pooling' - ) - check_type( - group_size, 'group_size', (list, tuple), 'deformable_roi_pooling' - ) - if part_size is not None: - check_type( - part_size, 'part_size', (list, tuple), 'deformable_roi_pooling' - ) - - input_channels = input.shape[1] - if position_sensitive is False: - output_channels = input_channels - else: - output_channels = input_channels / pooled_height / pooled_width - - if part_size is None: - part_height = pooled_height - part_width = pooled_width - part_size = [part_height, part_width] - part_size = utils.convert_to_list(part_size, 2, 'part_size') - group_size = utils.convert_to_list(group_size, 2, 'group_size') - helper = LayerHelper('deformable_psroi_pooling', **locals()) - dtype = helper.input_dtype() - output = helper.create_variable_for_type_inference(dtype) - top_count = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="deformable_psroi_pooling", - inputs={"Input": input, "ROIs": rois, "Trans": trans}, - outputs={"Output": output, "TopCount": top_count}, - attrs={ - "no_trans": no_trans, - "spatial_scale": spatial_scale, - "output_dim": output_channels, - "group_size": group_size, - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "part_size": part_size, - "sample_per_part": sample_per_part, - "trans_std": trans_std, - }, - ) - return output - - -@deprecated(since="2.0.0", update_to="paddle.shard_index") -def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): - """ - Reset the values of `input` according to the shard it beloning to. - Every value in `input` must be a non-negative integer, and - the parameter `index_num` represents the integer above the maximum - value of `input`. Thus, all values in `input` must be in the range - [0, index_num) and each value can be regarded as the offset to the beginning - of the range. The range is further split into multiple shards. Specifically, - we first compute the `shard_size` according to the following formula, - which represents the number of integers each shard can hold. 
So for the - i'th shard, it can hold values in the range [i*shard_size, (i+1)*shard_size). - :: - - shard_size = (index_num + nshards - 1) // nshards - - For each value `v` in `input`, we reset it to a new value according to the - following formula: - :: - - v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value - - That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` - if it in the range. Otherwise, we reset it to be `ignore_value`. - - Args: - input (Tensor): Input tensor with data type int64 or int32. It's last dimension must be 1. - index_num (int): An integer represents the integer above the maximum value of `input`. - nshards (int): The number of shards. - shard_id (int): The index of the current shard. - ignore_value (int): An integer value out of sharded index range. - - Returns: - Tensor. - - Examples: - .. code-block:: python - - import paddle - label = paddle.to_tensor([[16], [1]], "int64") - shard_label = paddle.shard_index(input=label, - index_num=20, - nshards=2, - shard_id=0) - print(shard_label) - # [[-1], [1]] - """ - if in_dygraph_mode(): - return _C_ops.shard_index( - input, index_num, nshards, shard_id, ignore_value - ) - - check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') - op_type = 'shard_index' - helper = LayerHelper(op_type, **locals()) - if shard_id < 0 or shard_id >= nshards: - raise ValueError( - 'The shard_id(%d) should be in [0, %d)' % (shard_id, nshards) - ) - - out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type=op_type, - inputs={'X': [input]}, - outputs={'Out': out}, - attrs={ - 'index_num': index_num, - 'nshards': nshards, - 'shard_id': shard_id, - 'ignore_value': ignore_value, - }, - stop_gradient=True, - ) - return out - - -@templatedoc() -def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): - r""" - This operator implements the hard_swish activation function. - Hard_swish is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function. - For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf - - The formula is as follows: - - .. math:: - - out = \\frac{x * (min(max(0, x+offset), threshold))}{scale} - - In the above equation: - - ``threshold`` and ``scale`` should be positive, ``offset`` can be positive or negative. It is recommended to use default parameters. - - Args: - x (Variable): Input feature, multi-dimensional Tensor. The data type should be float32 or float64. - threshold (float, optional): The threshold in Relu function. Default: 6.0 - scale (float, optional): The scale factor. Default: 6.0 - offset (float, optional): The offset factor. Default: 3.0 - name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - - Returns: - Variable: The output tensor with the same shape and data type as input. - - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy as np - paddle.enable_static() - - DATATYPE='float32' - - x_data = np.array([i for i in range(1,5)]).reshape([1,1,4]).astype(DATATYPE) - - x = fluid.data(name="x", shape=[None,1,4], dtype=DATATYPE) - y = fluid.layers.hard_swish(x) - - place = fluid.CPUPlace() - #place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - out, = exe.run(feed={'x':x_data}, fetch_list=[y.name]) - print(out) # [[0.66666667, 1.66666667,3., 4.]] - """ - if _non_static_mode(): - return _legacy_C_ops.hard_swish( - x, 'threshold', threshold, 'scale', scale, 'offset', offset - ) - - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'hard_swish' - ) - - helper = LayerHelper('hard_swish', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='hard_swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold, 'scale': scale, 'offset': offset}, - ) - return out - - -@templatedoc() -def mish(x, threshold=20, name=None): - r""" - This operator implements the mish activation function. - Refer to `Mish: A Self Regularized Non-Monotonic Neural - Activation Function `_ - - - The formula is as follows if :attr:`threshold` is :code:`None` or negative: - - .. math:: - - out = x * \\tanh(\\ln(1 + e^{x})) - - The formula is as follows if :attr:`threshold` is set as positive value: - - .. math:: - - out = \\begin{cases} - x \\ast \\tanh(x), \\text{if } x > \\text{threshold} \\\\ - x \\ast \\tanh(e^{x}), \\text{if } x < -\\text{threshold} \\\\ - x \\ast \\tanh(\\ln(1 + e^{x})), \\text{otherwise} - \\end{cases} - - Args: - x (Variable): Input feature, multi-dimensional Tensor. The data type - should be float16, float32 or float64. - threshold (float|None): threshold for softplus in Mish operator. - Approximate value of softplus will be used if absolute value - of input is greater than :attr:threshold and :attr:threshold - is set as positive value. For none or negative threshold, - approximate value is not used. Default 20. - name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` - - Returns: - Variable: The output tensor with the same shape and data type as input. - - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - DATATYPE='float32' - - x_data = np.array([i for i in range(1,5)]).reshape([1,1,4]).astype(DATATYPE) - - x = fluid.data(name="x", shape=[None,1,4], dtype=DATATYPE) - y = fluid.layers.mish(x) - - place = fluid.CPUPlace() - # place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - out, = exe.run(feed={'x':x_data}, fetch_list=[y.name]) - print(out) # [[0.66666667, 1.66666667, 3., 4.]] - """ - if in_dygraph_mode(): - return _C_ops.mish(x, threshold) - if _in_legacy_dygraph(): - return _legacy_C_ops.mish(x, 'threshold', threshold) - - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish') - check_type(threshold, 'threshold', (float, int), 'mish') - assert ( - threshold > 0 - ), "threshold of mish should be greater than 0, " "but got {}".format( - threshold - ) - - helper = LayerHelper('mish', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='mish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}, - ) - return out - - -@deprecated(since="2.0.0", update_to="paddle.uniform") -@templatedoc() -def uniform_random( - shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None -): - """ - This OP returns a Tensor filled with random values sampled from a uniform - distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. - - Examples: - :: - - Input: - shape = [1, 2] - - Output: - result=[[0.8505902, 0.8397286]] - - Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of - the output Tensor. Supported data types: float32, float64. - Default is float32. - min(float|int, optional): The lower bound on the range of random values - to generate, ``min`` is included in the range. Default is -1.0. - max(float|int, optional): The upper bound on the range of random values - to generate, ``max`` is excluded in the range. Default is 1.0. - seed(int, optional): Random seed used for generating samples. 0 means - use a seed generated by the system. Note that if seed is not 0, - this operator will always generate the same random numbers every - time. Default is 0. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - Tensor: A Tensor filled with random values sampled from a uniform - distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. - - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - # example 1: - # attr shape is a list which doesn't contain Tensor. - result_1 = fluid.layers.uniform_random(shape=[3, 4]) - # [[ 0.84524226, 0.6921872, 0.56528175, 0.71690357], - # [-0.34646994, -0.45116323, -0.09902662, -0.11397249], - # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] - - # example 2: - # attr shape is a list which contains Tensor. 
- dim_1 = fluid.layers.fill_constant([1], "int64", 2) - dim_2 = fluid.layers.fill_constant([1], "int32", 3) - result_2 = fluid.layers.uniform_random(shape=[dim_1, dim_2]) - # [[-0.9951253, 0.30757582, 0.9899647 ], - # [ 0.5864527, 0.6607096, -0.8886161 ]] - - # example 3: - # attr shape is a Tensor, the data type must be int64 or int32. - var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64") - result_3 = fluid.layers.uniform_random(var_shape) - # if var_shape's value is [2, 3] - # result_3 is: - # [[-0.8517412, -0.4006908, 0.2551912 ], - # [ 0.3364414, 0.36278176, -0.16085452]] - - """ - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - - if in_dygraph_mode(): - shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform( - shape, - dtype, - float(min), - float(max), - seed, - _current_expected_place(), - ) - elif _in_legacy_dygraph(): - shape = utils.convert_shape_to_list(shape) - return _legacy_C_ops.uniform_random( - 'shape', - shape, - 'min', - float(min), - 'max', - float(max), - 'seed', - seed, - 'dtype', - dtype, - ) - - check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand') - check_dtype( - dtype, 'dtype', ('float32', 'float64', 'uint16'), 'uniform_random/rand' - ) - check_type(min, 'min', (float, int, Variable), 'uniform_random/rand') - check_type(max, 'max', (float, int, Variable), 'uniform_random/rand') - - inputs = dict() - attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils.get_shape_tensor_inputs( - inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand' - ) - - helper = LayerHelper("uniform_random", **locals()) - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="uniform_random", inputs=inputs, attrs=attrs, outputs={"Out": out} - ) - utils.try_set_static_shape_tensor(out, shape) - return out - - -def unbind(input, axis=0): - """ - Removes a tensor dimension, then split the input tensor into multiple sub-Tensors. - Args: - input (Variable): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64. - - axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the - dimension to unbind along is :math:`rank(input) + axis`. Default is 0. - Returns: - list(Variable): The list of segmented Tensor variables. - - Example: - .. code-block:: python - import paddle - # input is a variable which shape is [3, 4, 5] - input = paddle.fluid.data( - name="input", shape=[3, 4, 5], dtype="float32") - [x0, x1, x2] = paddle.tensor.unbind(input, axis=0) - # x0.shape [4, 5] - # x1.shape [4, 5] - # x2.shape [4, 5] - [x0, x1, x2, x3] = paddle.tensor.unbind(input, axis=1) - # x0.shape [3, 5] - # x1.shape [3, 5] - # x2.shape [3, 5] - # x3.shape [3, 5] - - """ - helper = LayerHelper("unbind", **locals()) - check_type(input, 'input', (Variable), 'unbind') - dtype = helper.input_dtype() - check_dtype( - dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], 'unbind' - ) - if not isinstance(axis, (int)): - raise TypeError( - "The type of 'axis' must be int, but received %s." 
% (type(axis)) - ) - if isinstance(axis, np.generic): - axis = np.asscalar(axis) - input_shape = input.shape - axis_ = axis if axis >= 0 else len(input_shape) + axis - num = input_shape[axis_] - outs = [ - helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - for i in range(num) - ] - - helper.append_op( - type="unbind", - inputs={"X": input}, - outputs={"Out": outs}, - attrs={"axis": axis}, - ) - return outs diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 9fff40e1685c1..613f696f12ade 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1097,7 +1097,6 @@ set_tests_properties(test_bilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) set_tests_properties(test_einsum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_qr_op PROPERTIES TIMEOUT 60) -set_tests_properties(test_deformable_psroi_pooling PROPERTIES TIMEOUT 120) set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_masked_select_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index 2026a54116c23..e8329c48c2a2b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -102,7 +102,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish( ): def set_params(self): self.operand = paddle.add - self.act = fluid.layers.hard_swish + self.act = paddle.nn.functional.hardswish class ElementwiseActivationMkldnnFusePassTest_Add_SQRT( @@ -202,7 +202,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish( ): def set_params(self): self.operand = paddle.subtract - self.act = fluid.layers.hard_swish + self.act = paddle.nn.functional.hardswish class ElementwiseActivationMkldnnFusePassTest_Sub_ABS( @@ -294,7 +294,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish( ): def set_params(self): self.operand = paddle.multiply - self.act = fluid.layers.hard_swish + self.act = paddle.nn.functional.hardswish class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index abc96d262e04e..4134f421e9e4b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -88,7 +88,7 @@ def append_act(self, x): class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.hard_swish(x) + return paddle.nn.functional.hardswish(x) class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest): @@ -100,7 +100,7 @@ class TensorRTSubgraphPassHardSwishPluginTest( TensorRTSubgraphPassActivationTest ): def append_act(self, x): - return fluid.layers.hard_swish(x, threshold=4.0, scale=8.0) + return paddle.nn.functional.hardswish(x) class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest): @@ -166,7 +166,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return 
fluid.layers.mish(x) + return paddle.nn.functional.mish(x) class TensorRTSubgraphPassMishFp16SerializeTest( @@ -179,7 +179,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.mish(x) + return paddle.nn.functional.mish(x) class TensorRTSubgraphPassDynamicMishFp16SerializeTest( @@ -200,7 +200,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.mish(x) + return paddle.nn.functional.mish(x) class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 2479312a51ef5..db3bb976b865e 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2073,7 +2073,7 @@ def test_dygraph_api(self): def test_fluid_api(self): with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.hard_swish(x) + out = paddle.nn.functional.hardswish(x) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_hardswish(self.x_np) @@ -2081,7 +2081,7 @@ def test_fluid_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) - out = paddle.fluid.layers.hard_swish(x) + out = paddle.nn.functional.hardswish(x) np.testing.assert_allclose(out_ref, out.numpy(), rtol=1e-05) paddle.enable_static() @@ -3414,7 +3414,7 @@ def ref_mish(x, threshold=20.0): class TestMish(TestActivation): def setUp(self): self.op_type = "mish" - self.python_api = paddle.fluid.layers.nn.mish + self.python_api = paddle.nn.functional.mish self.init_dtype() self.init_shape() @@ -3480,7 +3480,7 @@ def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.mish(x) + out = paddle.nn.functional.mish(x) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_mish(self.x_np) diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 7c3ba6add0312..07263731e1ff2 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -41,15 +41,9 @@ def test_gen_dropout_dygraph(self): gen.manual_seed(111111111) st = paddle.get_cuda_rng_state() - x = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) - x_again = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) - x_third = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) + x = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) + x_again = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) + x_third = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) print("x: {}".format(x.numpy())) print("x_again: {}".format(x_again.numpy())) x = x + x_again + x_third @@ -57,15 +51,9 @@ def test_gen_dropout_dygraph(self): paddle.set_cuda_rng_state(st) - x1 = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) - x1_again = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) - x1_third = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) + x1 = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) + x1_again = paddle.uniform([2, 10], 
dtype="float32", min=0.0, max=1.0) + x1_third = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) x1 = x1 + x1_again + x1_third y1 = fluid.layers.dropout(x1, 0.5) y_np = y.numpy() @@ -128,7 +116,7 @@ def test_gen_TruncatedNormal_initializer(self): with fluid.program_guard(train_program, startup_program): # example 1: # attr shape is a list which doesn't contain tensor Variable. - x = fluid.layers.uniform_random(shape=[2, 10]) + x = paddle.uniform(shape=[2, 10]) result_1 = fluid.layers.fc( input=x, size=10, diff --git a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py deleted file mode 100644 index d73ef732da814..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py +++ /dev/null @@ -1,596 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard - - -def set_input(input, rois, trans): - inputs = {'Input': input, "ROIs": rois, "Trans": trans} - return inputs - - -def set_attrs( - no_trans, - spatial_scale, - output_channels, - group_size, - pooled_height, - pooled_width, - part_size, - sample_per_part, - trans_std, -): - attrs = { - 'no_trans': no_trans, - 'spatial_scale': spatial_scale, - 'output_dim': output_channels, - 'group_size': group_size, - 'pooled_height': pooled_height, - 'pooled_width': pooled_width, - 'part_size': part_size, - 'sample_per_part': sample_per_part, - 'trans_std': trans_std, - } - return attrs - - -def set_outputs(output, top_count): - outputs = { - 'Output': output.astype('float32'), - 'TopCount': top_count.astype('float32'), - } - return outputs - - -class TestDeformablePSROIPoolOp(OpTest): - def set_data(self): - self.start_test1() - self.start_test2() - self.start_test3() - self.start_test4() - - def start_test1(self): - self.init_test_case1() - self.make_rois() - self.calc_deformable_psroi_pooling() - - inputs = self.input - rois = (self.rois[:, 1:5], self.rois_lod) - trans = self.trans - self.inputs = set_input(inputs, rois, trans) - - no_trans = self.no_trans - spatial_scale = self.spatial_scale - output_channels = self.output_channels - group_size = self.group_size - pooled_height = self.pooled_height - pooled_width = self.pooled_width - part_size = self.part_size - sample_per_part = self.sample_per_part - trans_std = self.trans_std - - self.attrs = set_attrs( - no_trans, - spatial_scale, - output_channels, - group_size, - pooled_height, - pooled_width, - part_size, - sample_per_part, - trans_std, - ) - - output = self.out.astype('float32') - top_count = self.top_count.astype('float32') - self.outputs = set_outputs(output, top_count) - - def start_test2(self): - self.init_test_case2() - self.make_rois() - self.calc_deformable_psroi_pooling() - - inputs = self.input - rois = (self.rois[:, 1:5], self.rois_lod) - trans = 
self.trans - self.inputs = set_input(inputs, rois, trans) - - no_trans = self.no_trans - spatial_scale = self.spatial_scale - output_channels = self.output_channels - group_size = self.group_size - pooled_height = self.pooled_height - pooled_width = self.pooled_width - part_size = self.part_size - sample_per_part = self.sample_per_part - trans_std = self.trans_std - - self.attrs = set_attrs( - no_trans, - spatial_scale, - output_channels, - group_size, - pooled_height, - pooled_width, - part_size, - sample_per_part, - trans_std, - ) - - output = self.out.astype('float32') - top_count = self.top_count.astype('float32') - self.outputs = set_outputs(output, top_count) - - def start_test3(self): - self.init_test_case3() - self.make_rois() - self.calc_deformable_psroi_pooling() - - inputs = self.input - rois = (self.rois[:, 1:5], self.rois_lod) - trans = self.trans - self.inputs = set_input(inputs, rois, trans) - - no_trans = self.no_trans - spatial_scale = self.spatial_scale - output_channels = self.output_channels - group_size = self.group_size - pooled_height = self.pooled_height - pooled_width = self.pooled_width - part_size = self.part_size - sample_per_part = self.sample_per_part - trans_std = self.trans_std - - self.attrs = set_attrs( - no_trans, - spatial_scale, - output_channels, - group_size, - pooled_height, - pooled_width, - part_size, - sample_per_part, - trans_std, - ) - - output = self.out.astype('float32') - top_count = self.top_count.astype('float32') - self.outputs = set_outputs(output, top_count) - - def start_test4(self): - self.init_test_case4() - self.make_rois() - self.calc_deformable_psroi_pooling() - - inputs = self.input - rois = (self.rois[:, 1:5], self.rois_lod) - trans = self.trans - self.inputs = set_input(inputs, rois, trans) - - no_trans = self.no_trans - spatial_scale = self.spatial_scale - output_channels = self.output_channels - group_size = self.group_size - pooled_height = self.pooled_height - pooled_width = self.pooled_width - part_size = self.part_size - sample_per_part = self.sample_per_part - trans_std = self.trans_std - - self.attrs = set_attrs( - no_trans, - spatial_scale, - output_channels, - group_size, - pooled_height, - pooled_width, - part_size, - sample_per_part, - trans_std, - ) - - output = self.out.astype('float32') - top_count = self.top_count.astype('float32') - self.outputs = set_outputs(output, top_count) - - def init_test_case1(self): - self.batch_size = 3 - self.channels = 3 * 2 * 2 - self.height = 12 - self.width = 12 - self.input_dim = [ - self.batch_size, - self.channels, - self.height, - self.width, - ] - self.no_trans = False - self.spatial_scale = 1.0 / 4.0 - self.output_channels = 12 - self.group_size = [1, 1] - self.pooled_height = 4 - self.pooled_width = 4 - self.part_size = [4, 4] - self.sample_per_part = 2 - self.trans_std = 0.1 - self.input = np.random.random(self.input_dim).astype('float32') - - def init_test_case2(self): - self.batch_size = 2 - self.channels = 3 * 2 * 2 - self.height = 12 - self.width = 12 - self.input_dim = [ - self.batch_size, - self.channels, - self.height, - self.width, - ] - self.no_trans = True - self.spatial_scale = 1.0 / 2.0 - self.output_channels = 12 - self.group_size = [1, 1] - self.pooled_height = 7 - self.pooled_width = 7 - self.part_size = [7, 7] - self.sample_per_part = 4 - self.trans_std = 0.1 - self.input = np.random.random(self.input_dim).astype('float32') - - def init_test_case3(self): - self.batch_size = 2 - self.channels = 3 * 2 * 2 - self.height = 12 - self.width = 12 - self.input_dim 
= [ - self.batch_size, - self.channels, - self.height, - self.width, - ] - self.no_trans = False - self.spatial_scale = 1.0 / 4.0 - self.output_channels = 12 - self.group_size = [1, 1] - self.pooled_height = 3 - self.pooled_width = 3 - self.part_size = [3, 3] - self.sample_per_part = 3 - self.trans_std = 0.2 - self.input = np.random.random(self.input_dim).astype('float32') - - def init_test_case4(self): - self.batch_size = 2 - self.channels = 3 * 2 * 2 - self.height = 12 - self.width = 12 - self.input_dim = [ - self.batch_size, - self.channels, - self.height, - self.width, - ] - self.no_trans = True - self.spatial_scale = 1.0 / 2.0 - self.output_channels = 12 - self.group_size = [1, 1] - self.pooled_height = 6 - self.pooled_width = 2 - self.part_size = [6, 6] - self.sample_per_part = 6 - self.trans_std = 0.4 - self.input = np.random.random(self.input_dim).astype('float32') - - def make_rois(self): - rois = [] - self.rois_lod = [[]] - for bno in range(self.batch_size): - self.rois_lod[0].append(bno + 1) - for i in range(bno + 1): - x_1 = np.random.randint( - 0, self.width // self.spatial_scale - self.pooled_width - ) - y_1 = np.random.randint( - 0, self.height // self.spatial_scale - self.pooled_height - ) - x_2 = np.random.randint( - x_1 + self.pooled_width, self.width // self.spatial_scale - ) - y_2 = np.random.randint( - y_1 + self.pooled_height, self.height // self.spatial_scale - ) - roi = [bno, x_1, y_1, x_2, y_2] - rois.append(roi) - self.rois_num = len(rois) - self.rois = np.array(rois).astype("float32") - - def dmc_bilinear(self, data_im, p_h, p_w): - h_low = int(np.floor(p_h)) - w_low = int(np.floor(p_w)) - h_high = h_low + 1 - w_high = w_low + 1 - l_h = p_h - h_low - l_w = p_w - w_low - h_h = 1 - l_h - h_w = 1 - l_w - v_1 = 0 - if h_low >= 0 and w_low >= 0: - v_1 = data_im[h_low, w_low] - v_2 = 0 - if h_low >= 0 and w_high <= self.width - 1: - v_2 = data_im[h_low, w_high] - v_3 = 0 - if h_high <= self.height - 1 and w_low >= 0: - v_3 = data_im[h_high, w_low] - v_4 = 0 - if h_high <= self.height - 1 and w_high <= self.width - 1: - v_4 = data_im[h_high, w_high] - w_1, w_2, w_3, w_4 = h_h * h_w, h_h * l_w, l_h * h_w, l_h * l_w - val = w_1 * v_1 + w_2 * v_2 + w_3 * v_3 + w_4 * v_4 - return val - - def calc_deformable_psroi_pooling(self): - output_shape = ( - self.rois_num, - self.output_channels, - self.pooled_height, - self.pooled_width, - ) - self.out = np.zeros(output_shape) - self.trans = np.random.rand( - self.rois_num, 2, self.part_size[0], self.part_size[1] - ).astype('float32') - self.top_count = np.random.random((output_shape)).astype('float32') - count = ( - self.rois_num - * self.output_channels - * self.pooled_height - * self.pooled_width - ) - for index in range(count): - p_w = int(index % self.pooled_width) - p_h = int(index / self.pooled_width % self.pooled_height) - ctop = int( - index - / self.pooled_width - / self.pooled_height - % self.output_channels - ) - n_out = int( - index - / self.pooled_width - / self.pooled_height - / self.output_channels - ) - roi = self.rois[n_out] - roi_batch_id = int(roi[0]) - roi_start_w = int(np.round(roi[1])) * self.spatial_scale - 0.5 - roi_start_h = int(np.round(roi[2])) * self.spatial_scale - 0.5 - roi_end_w = int(np.round(roi[3] + 1)) * self.spatial_scale - 0.5 - roi_end_h = int(np.round(roi[4] + 1)) * self.spatial_scale - 0.5 - roi_width = max(roi_end_w - roi_start_w, 0.1) - roi_height = max(roi_end_h - roi_start_h, 0.1) - bin_size_h = float(roi_height) / float(self.pooled_height) - bin_size_w = float(roi_width) / 
float(self.pooled_width) - sub_bin_size_h = bin_size_h / self.sample_per_part - sub_bin_size_w = bin_size_w / self.sample_per_part - part_h = int(np.floor(p_h) / self.pooled_height * self.part_size[0]) - part_w = int(np.floor(p_w) / self.pooled_width * self.part_size[1]) - if self.no_trans: - trans_x = 0 - trans_y = 0 - else: - trans_x = self.trans[n_out][0][part_h][part_w] * self.trans_std - trans_y = self.trans[n_out][1][part_h][part_w] * self.trans_std - wstart = p_w * bin_size_w + roi_start_w - wstart = wstart + trans_x * roi_width - hstart = p_h * bin_size_h + roi_start_h - hstart = hstart + trans_y * roi_height - sum = 0 - num_sample = 0 - g_w = np.floor(p_w * self.group_size[0] / self.pooled_height) - g_h = np.floor(p_h * self.group_size[1] / self.pooled_width) - g_w = min(max(g_w, 0), self.group_size[0] - 1) - g_h = min(max(g_h, 0), self.group_size[1] - 1) - input_i = self.input[roi_batch_id] - for i_w in range(self.sample_per_part): - for i_h in range(self.sample_per_part): - w_sample = wstart + i_w * sub_bin_size_w - h_sample = hstart + i_h * sub_bin_size_h - if ( - w_sample < -0.5 - or w_sample > self.width - 0.5 - or h_sample < -0.5 - or h_sample > self.height - 0.5 - ): - continue - w_sample = min(max(w_sample, 0.0), self.width - 1.0) - h_sample = min(max(h_sample, 0.0), self.height - 1.0) - c_sample = int( - (ctop * self.group_size[0] + g_h) * self.group_size[1] - + g_w - ) - val = self.dmc_bilinear( - input_i[c_sample], h_sample, w_sample - ) - sum = sum + val - num_sample = num_sample + 1 - if num_sample == 0: - self.out[n_out][ctop][p_h][p_w] = 0 - else: - self.out[n_out][ctop][p_h][p_w] = sum / num_sample - self.top_count[n_out][ctop][p_h][p_w] = num_sample - - def setUp(self): - self.op_type = "deformable_psroi_pooling" - self.set_data() - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Input'], 'Output') - - -class TestDeformablePSROIPoolOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - input1 = fluid.data( - name="input1", shape=[2, 192, 64, 64], dtype='float32' - ) - rois1 = fluid.data( - name="rois1", shape=[-1, 4], dtype='float32', lod_level=1 - ) - trans1 = fluid.data( - name="trans1", shape=[2, 384, 64, 64], dtype='float32' - ) - - # The `input` must be Variable and the data type of `input` Tensor must be one of float32 and float64. - def test_input_type(): - fluid.layers.deformable_roi_pooling( - input=[3, 4], - rois=rois1, - trans=trans1, - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_input_type) - - def test_input_tensor_dtype(): - input2 = fluid.data( - name="input2", shape=[2, 192, 64, 64], dtype='int32' - ) - fluid.layers.deformable_roi_pooling( - input=input2, - rois=rois1, - trans=trans1, - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_input_tensor_dtype) - - # The `rois` must be Variable and the data type of `rois` Tensor must be one of float32 and float64. 
- def test_rois_type(): - fluid.layers.deformable_roi_pooling( - input=input1, - rois=2, - trans=trans1, - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_rois_type) - - def test_rois_tensor_dtype(): - rois2 = fluid.data( - name="rois2", shape=[-1, 4], dtype='int32', lod_level=1 - ) - fluid.layers.deformable_roi_pooling( - input=input1, - rois=rois2, - trans=trans1, - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_rois_tensor_dtype) - - # The `trans` must be Variable and the data type of `trans` Tensor must be one of float32 and float64. - def test_trans_type(): - fluid.layers.deformable_roi_pooling( - input=input1, - rois=rois1, - trans=[2], - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_trans_type) - - def test_trans_tensor_dtype(): - trans2 = fluid.data( - name="trans2", shape=[2, 384, 64, 64], dtype='int32' - ) - fluid.layers.deformable_roi_pooling( - input=input1, - rois=rois1, - trans=trans2, - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_trans_tensor_dtype) - - # The `group_size` must be one of list and tuple. - # Each element must be int. - def test_group_size_type(): - fluid.layers.deformable_roi_pooling( - input=input1, - rois=rois1, - trans=trans1, - group_size=1, - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_group_size_type) - - # The `part_size` must be one of list, tuple and None. - # Each element must be int. 
- def test_part_size_type(): - fluid.layers.deformable_roi_pooling( - input=input1, - rois=rois1, - trans=trans1, - pooled_height=8, - pooled_width=8, - part_size=8, - sample_per_part=4, - position_sensitive=True, - ) - - self.assertRaises(TypeError, test_part_size_type) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 4aa064921fe5c..0c89e000538d6 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -408,9 +408,7 @@ class TestDygraphGradientClip(unittest.TestCase): def test_gradient_clip(self): with fluid.dygraph.guard(): linear = paddle.nn.Linear(5, 5) - inputs = fluid.layers.uniform_random( - [16, 5], min=-10, max=10 - ).astype('float32') + inputs = paddle.uniform([16, 5], min=-10, max=10).astype('float32') out = linear(fluid.dygraph.to_variable(inputs)) loss = paddle.mean(out) loss.backward() @@ -552,9 +550,9 @@ def test_gradient_clip(self): models=model, optimizers=sgd_optimizer, level='O2' ) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - inputs = fluid.layers.uniform_random( - [1, 5], min=-10, max=10 - ).astype('float32') + inputs = paddle.uniform([1, 5], min=-10, max=10).astype( + 'float32' + ) with paddle.amp.auto_cast(level='O2'): out = model(fluid.dygraph.to_variable(inputs)) loss = paddle.mean(out) @@ -600,9 +598,7 @@ def test_gradient_clip(self): class TestDygraphGradientClipFP64(unittest.TestCase): def test_gradient_clip(self): with fluid.dygraph.guard(): - inputs = fluid.layers.uniform_random( - [16, 5], min=-10, max=10 - ).astype('float32') + inputs = paddle.uniform([16, 5], min=-10, max=10).astype('float32') linear = paddle.nn.Linear(5, 5) out = linear(fluid.dygraph.to_variable(inputs)) loss = paddle.mean(out) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e39ed15e28c16..409fdbbbdc5ad 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1843,7 +1843,7 @@ def test_crop_tensor(self): def test_shard_index(self): with self.static_graph(): x = fluid.layers.data(name="label", shape=[4, 1], dtype='int64') - shard_label = fluid.layers.shard_index( + shard_label = paddle.shard_index( input=x, index_num=20, nshards=2, shard_id=0 ) @@ -2342,7 +2342,7 @@ def make_mish(self): fluid.default_main_program(), fluid.default_startup_program() ): input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.mish(input, name='mish') + out = paddle.nn.functional.mish(input, name='mish') return out def make_cross_entropy(self): @@ -2794,7 +2794,7 @@ def test_linspace(self): def test_unfold(self): with self.static_graph(): x = layers.data(name='x', shape=[3, 20, 20], dtype='float32') - out = layers.unfold(x, [3, 3], 1, 1, 1) + out = paddle.nn.functional.unfold(x, [3, 3], 1, 1, 1) return out def test_partial_concat(self): @@ -2809,40 +2809,6 @@ def test_partial_concat(self): ) return concat1, concat2 - def test_deform_roi_pooling(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = layers.data( - name='input', - shape=[2, 3, 32, 32], - dtype='float32', - append_batch_size=False, - ) - rois = layers.data( - name="rois", shape=[4], dtype='float32', lod_level=1 - ) - trans = layers.data( - name="trans", - shape=[2, 3, 32, 32], - dtype='float32', - 
append_batch_size=False, - ) - out = layers.deformable_roi_pooling( - input=input, - rois=rois, - trans=trans, - no_trans=False, - spatial_scale=1.0, - group_size=(1, 1), - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - trans_std=0.1, - ) - return out - def test_addmm(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 1c3c280d2fcbc..420109b3a3880 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -35,23 +35,17 @@ def test_generator_uniform_random_dygraph(self): fluid.enable_dygraph() gen = paddle.seed(12312321111) - x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0) + x = paddle.uniform([10], dtype="float32", min=0.0, max=1.0) st1 = gen.get_state() - x1 = fluid.layers.uniform_random( - [10], dtype="float32", min=0.0, max=1.0 - ) + x1 = paddle.uniform([10], dtype="float32", min=0.0, max=1.0) gen.set_state(st1) print(gen.get_state()) - x2 = fluid.layers.uniform_random( - [10], dtype="float32", min=0.0, max=1.0 - ) + x2 = paddle.uniform([10], dtype="float32", min=0.0, max=1.0) paddle.seed(12312321111) - x3 = fluid.layers.uniform_random( - [10], dtype="float32", min=0.0, max=1.0 - ) + x3 = paddle.uniform([10], dtype="float32", min=0.0, max=1.0) x_np = x.numpy() x1_np = x1.numpy() @@ -72,8 +66,8 @@ def test_generator_uniform_random_static(self): with fluid.program_guard(train_program, startup_program): # example 1: # attr shape is a list which doesn't contain tensor Variable. - result_1 = fluid.layers.uniform_random(shape=[3, 4]) - result_2 = fluid.layers.uniform_random(shape=[3, 4]) + result_1 = paddle.uniform(shape=[3, 4]) + result_2 = paddle.uniform(shape=[3, 4]) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -102,15 +96,11 @@ def test_gen_dropout_dygraph(self): gen = paddle.seed(111111111) st = gen.get_state() # x = np.arange(1,101).reshape(2,50).astype("float32") - x = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) + x = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) y = fluid.layers.dropout(x, 0.5) gen.manual_seed(111111111) # gen.set_state(st) - x1 = fluid.layers.uniform_random( - [2, 10], dtype="float32", min=0.0, max=1.0 - ) + x1 = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0) y1 = fluid.layers.dropout(x1, 0.5) y_np = y.numpy() y1_np = y1.numpy() @@ -129,7 +119,7 @@ def test_gen_dropout_static(self): with fluid.program_guard(train_program, startup_program): # example 1: # attr shape is a list which doesn't contain tensor Variable. - x_1 = fluid.layers.uniform_random(shape=[2, 10]) + x_1 = paddle.uniform(shape=[2, 10]) y_1 = fluid.layers.dropout(x_1, 0.5) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -235,8 +225,8 @@ def test_generator_uniform_random_static_1(self): with fluid.program_guard(train_program, startup_program): # example 1: # attr shape is a list which doesn't contain tensor Variable. 
- result_1 = fluid.layers.uniform_random(shape=[3, 4]) - result_2 = fluid.layers.uniform_random(shape=[3, 4]) + result_1 = paddle.uniform(shape=[3, 4]) + result_2 = paddle.uniform(shape=[3, 4]) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -384,7 +374,7 @@ def test_gen_TruncatedNormal_initializer(self): with fluid.program_guard(train_program, startup_program): # example 1: # attr shape is a list which doesn't contain tensor Variable. - x = fluid.layers.uniform_random(shape=[2, 10]) + x = paddle.uniform(shape=[2, 10]) result_1 = fluid.layers.fc( input=x, size=10, diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index f162a8e829fe8..27031d97090b5 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -263,7 +263,7 @@ def test_repeated_regularization(self): regularizer=paddle.regularizer.L1Decay() ) with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.layers.uniform_random([2, 2, 3]) + x = paddle.uniform([2, 2, 3]) out = fluid.layers.fc(x, 5, param_attr=fc_param_attr) loss = paddle.sum(out) sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py index c3adc0cf0b359..d125d61feb774 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer_api.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -173,7 +173,7 @@ def test_repeated_regularization(self): regularizer=paddle.regularizer.L1Decay() ) with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.layers.uniform_random([2, 2, 3]) + x = paddle.uniform([2, 2, 3]) out = fluid.layers.fc(x, 5, param_attr=fc_param_attr) loss = paddle.sum(out) sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2) diff --git a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py index 6f70e553cc2bc..5ba3bcbbc1108 100644 --- a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py @@ -15,6 +15,7 @@ import unittest import paddle +from paddle.fluid.layers.utils import try_set_static_shape_tensor class StaticShapeInferrenceTest(unittest.TestCase): @@ -24,7 +25,8 @@ def test_static_graph(self): name="x", shape=[-1, 2], dtype='float32' ) shape = paddle.shape(data) # shape should be [-1, 2] - x = paddle.fluid.layers.uniform_random(shape) + x = paddle.uniform(shape) + try_set_static_shape_tensor(x, shape) self.assertEqual(x.shape, data.shape) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py b/python/paddle/fluid/tests/unittests/test_unbind_op.py index 82cec33d59e68..6f49d66b1aca6 100644 --- a/python/paddle/fluid/tests/unittests/test_unbind_op.py +++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py @@ -66,7 +66,7 @@ class TestLayersUnbind(unittest.TestCase): def test_layers_unbind(self): x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') - [out_0, out_1] = fluid.layers.unbind(input=x_1, axis=0) + [out_0, out_1] = paddle.unbind(input=x_1, axis=0) input_1 = np.random.random([2, 3]).astype("float32") axis = fluid.data(shape=[1], dtype='int32', name='axis') exe = fluid.Executor(place=fluid.CPUPlace()) 
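Every call-site change in these test files follows the same one-for-one substitution: `fluid.layers.uniform_random(...)` becomes `paddle.uniform(...)` and `fluid.layers.unbind(...)` becomes `paddle.unbind(...)`, with the shape, dtype, min/max and seed arguments carried over unchanged. A minimal sketch of the replacement pattern, assuming a Paddle 2.x install running in dygraph mode (the shape and seed below are illustrative only, not taken from any one test):

    import paddle

    # before this patch: fluid.layers.uniform_random([2, 10], dtype="float32",
    #                                                min=0.0, max=1.0, seed=10)
    x = paddle.uniform([2, 10], dtype="float32", min=0.0, max=1.0, seed=10)

    # before this patch: fluid.layers.unbind(input=x, axis=0)
    rows = paddle.unbind(x, axis=0)  # a list of two tensors, each with shape [10]
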
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py index 81529828020ef..0dcdf0cc2502d 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py @@ -162,23 +162,6 @@ def check_with_place(self, place): np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01) -class TestUniformRandomOpBF16AttrTensorAPI(unittest.TestCase): - def test_attr_tensor_API(self): - startup_program = fluid.Program() - train_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - dim_tensor = fluid.layers.fill_constant([1], "int64", 3) - ret = fluid.layers.nn.uniform_random( - [1, dim_tensor, 2], dtype=np.uint16 - ) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - exe.run(startup_program) - outs = exe.run(train_program, fetch_list=[ret]) - - class TestUniformRandomOpAPISeed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 @@ -189,12 +172,8 @@ def test_attr_tensor_API(self): _min = 5 _max = 10 - ret = fluid.layers.nn.uniform_random( - [2, 3, 2], min=_min, max=_max, seed=_seed - ) - ret_2 = fluid.layers.nn.uniform_random( - [2, 3, 2], min=_min, max=_max, seed=_seed - ) + ret = paddle.uniform([2, 3, 2], min=_min, max=_max, seed=_seed) + ret_2 = paddle.uniform([2, 3, 2], min=_min, max=_max, seed=_seed) res = paddle.equal(ret, ret_2) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 58078cbd71b8a..86e932d25bf03 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -199,26 +199,18 @@ def test_Variable(): x1 = fluid.create_lod_tensor( np.zeros((4, 784)), [[1, 1, 1, 1]], fluid.CPUPlace() ) - fluid.layers.uniform_random(x1) + paddle.uniform(x1) self.assertRaises(TypeError, test_Variable) def test_Variable2(): x1 = np.zeros((4, 784)) - fluid.layers.uniform_random(x1) + paddle.uniform(x1) self.assertRaises(TypeError, test_Variable2) - def test_dtype(): - x2 = fluid.layers.data( - name='x2', shape=[4, 784], dtype='float32' - ) - fluid.layers.uniform_random(x2, 'int32') - - self.assertRaises(TypeError, test_dtype) - def test_out_dtype(): - out = fluid.layers.uniform_random(shape=[3, 4], dtype='float64') + out = paddle.uniform(shape=[3, 4], dtype='float64') self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) test_out_dtype() @@ -323,7 +315,7 @@ def test_attr_tensor_API(self): train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): dim_tensor = fluid.layers.fill_constant([1], "int64", 3) - ret = fluid.layers.nn.uniform_random([1, dim_tensor, 2]) + ret = paddle.uniform([1, dim_tensor, 2]) place = fluid.CPUPlace() if fluid.core.is_compiled_with_cuda(): @@ -339,7 +331,7 @@ def test_attr_tensorlist_int32_API(self): with fluid.program_guard(train_program, startup_program): dim_1 = fluid.layers.fill_constant([1], "int64", 3) dim_2 = fluid.layers.fill_constant([1], "int32", 2) - ret = fluid.layers.nn.uniform_random([1, dim_1, dim_2]) + ret = paddle.uniform([1, dim_1, dim_2]) place = fluid.CPUPlace() if fluid.core.is_compiled_with_cuda(): @@ -354,7 +346,7 @@ def test_attr_tensor_int32_API(self): train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): shape = 
fluid.data(name='shape_tensor', shape=[2], dtype="int32") - ret = fluid.layers.nn.uniform_random(shape) + ret = paddle.uniform(shape) place = fluid.CPUPlace() if fluid.core.is_compiled_with_cuda(): @@ -377,12 +369,8 @@ def test_attr_tensor_API(self): _min = 5 _max = 10 - ret = fluid.layers.nn.uniform_random( - [2, 3, 2], min=_min, max=_max, seed=_seed - ) - ret_2 = fluid.layers.nn.uniform_random( - [2, 3, 2], min=_min, max=_max, seed=_seed - ) + ret = paddle.uniform([2, 3, 2], min=_min, max=_max, seed=_seed) + ret_2 = paddle.uniform([2, 3, 2], min=_min, max=_max, seed=_seed) res = paddle.equal(ret, ret_2) place = fluid.CPUPlace() if fluid.core.is_compiled_with_cuda(): @@ -464,9 +452,7 @@ def check_with_place(self, place): class TestUniformRandomDygraphMode(unittest.TestCase): def test_check_output(self): with fluid.dygraph.guard(): - x = fluid.layers.uniform_random( - [10], dtype="float32", min=0.0, max=1.0 - ) + x = paddle.uniform([10], dtype="float32", min=0.0, max=1.0) x_np = x.numpy() for i in range(10): self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0)) @@ -698,9 +684,7 @@ def test_static(self): min_v = paddle.to_tensor([0.1]) max_v = paddle.to_tensor([0.9]) y = paddle.uniform([2, 3, 10], min=min_v, max=max_v) - z = paddle.fluid.layers.uniform_random( - [2, 3, 10], min=min_v, max=max_v - ) + z = paddle.uniform([2, 3, 10], min=min_v, max=max_v) out = feat + y + z From 91ff2071d3460a3ac7c94b5fb9d195bf1ee9237d Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 8 Dec 2022 13:48:31 +0800 Subject: [PATCH 59/60] fix-gpups setup.py (#48888) * fix-gpups * test=document_fix --- paddle/scripts/paddle_build.sh | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 73334b651bfe0..47e64afb16c57 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -24,6 +24,7 @@ if [ -z ${BRANCH} ]; then BRANCH="develop" fi + function print_usage() { echo -e "\n${RED}Usage${NONE}: ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" diff --git a/setup.py b/setup.py index 6e77373acf540..6d088750a60b0 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ from setuptools.command.egg_info import egg_info from setuptools.command.install import install as InstallCommandBase from setuptools.command.install_lib import install_lib +from setuptools.dist import Distribution if sys.version_info < (3, 7): raise RuntimeError( From a4d9851b89d8c4f3c33d556dd93ded40c254eb3a Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 8 Dec 2022 14:18:52 +0800 Subject: [PATCH 60/60] [PHI decoupling] move cuda_graph from fluid to phi (#48686) * move cuda_graph from fluid to phi * move device_memory_aligment from fluid to phi * Revert "move device_memory_aligment from fluid to phi" This reverts commit b92fcd39a0a50fdac13278f49be0237a85f3a13f. 
* update xpu cmake --- paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- paddle/fluid/platform/CMakeLists.txt | 2 +- .../platform/device/gpu/cuda/CMakeLists.txt | 4 - .../platform/device/gpu/cuda/cuda_graph.h | 224 +--------------- .../fluid/platform/device/xpu/CMakeLists.txt | 2 +- paddle/phi/backends/CMakeLists.txt | 5 +- .../backends}/gpu/cuda/cuda_graph.cc | 57 +++-- paddle/phi/backends/gpu/cuda/cuda_graph.h | 241 ++++++++++++++++++ paddle/phi/backends/xpu/CMakeLists.txt | 4 - paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/device_context.cc | 7 +- paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 2 +- 12 files changed, 291 insertions(+), 261 deletions(-) rename paddle/{fluid/platform/device => phi/backends}/gpu/cuda/cuda_graph.cc (90%) create mode 100644 paddle/phi/backends/gpu/cuda/cuda_graph.h delete mode 100644 paddle/phi/backends/xpu/CMakeLists.txt diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 20a922b406745..f7c57fa2b02d6 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM) endif() if(WITH_GPU) - list(APPEND ALLOCATOR_DEPS cuda_graph) + list(APPEND ALLOCATOR_DEPS phi_backends) endif() if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 8c93eaf2469d1..2db144f423fc7 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -85,7 +85,7 @@ if(WITH_GPU) nv_library( cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc - DEPS device_context allocator cuda_graph) + DEPS device_context allocator phi_backends) else() cc_library( cuda_graph_with_memory_pool diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 64a2f891c21cd..07901054b3b33 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,7 +1,3 @@ -nv_library( - cuda_graph - SRCS cuda_graph.cc - DEPS enforce) nv_library( cuda_profiler SRCS cuda_profiler.cc diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index 5b5151ea822e8..1c0843a0eb645 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -14,45 +14,23 @@ #pragma once -#include -#include -#include -#include -#include -#include - -#include "cuda.h" // NOLINT -#include "cuda_runtime.h" // NOLINT -#include "paddle/fluid/platform/device/gpu/gpu_types.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/utils/optional.h" +#include "paddle/phi/backends/gpu/cuda/cuda_graph.h" namespace paddle { namespace platform { +using CUDAKernelParams = phi::backends::gpu::CUDAKernelParams; +#if CUDA_VERSION < 10010 +using cudaStreamCaptureMode = phi::backends::gpu::cudaStreamCaptureMode; +#endif +using CUDAGraph = phi::backends::gpu::CUDAGraph; +using CUDAGraphCaptureModeGuard = phi::backends::gpu::CUDAGraphCaptureModeGuard; + template static bool IsBitwiseEqual(const T &x, const T &y) { return std::memcmp(&x, &y, sizeof(T)) == 0; } -class CUDAKernelParams { - public: - explicit CUDAKernelParams(const cudaKernelNodeParams *params) - : params_(params) {} - - const void *func() const { return params_->func; } - - template - T 
&As(size_t idx) const { - return *reinterpret_cast(params_->kernelParams[idx]); - } - - private: - const cudaKernelNodeParams *params_; -}; - template struct IsSameKernelHelper; @@ -96,191 +74,5 @@ struct IsSameKernelHelper { } }; -#if CUDA_VERSION >= 10010 -static void ThrowErrorIfNotSupportCUDAGraph() {} -#else -enum cudaStreamCaptureMode { - cudaStreamCaptureModeGlobal = 0, - cudaStreamCaptureModeThreadLocal = 1, - cudaStreamCaptureModeRelaxed = 2 -}; -static void ThrowErrorIfNotSupportCUDAGraph() { - PADDLE_THROW(platform::errors::Unimplemented( - "CUDA Graph is only supported when CUDA version >= 10.1")); -} -#endif - -// NOTE: Currently, we do not support to capture CUDA graph in parallel -// NOTE: Do not use this class directly because it should be used with -// the memory pool. -class CUDAGraph { - DISABLE_COPY_AND_ASSIGN(CUDAGraph); - - // Since the constructor would throw error is CUDA_VERSION < 10010. - // The non-static method of CUDAGraph need not check CUDA_VERSION - // again. - CUDAGraph() { - ThrowErrorIfNotSupportCUDAGraph(); - id_ = UniqueID(); - } - - public: - static constexpr int64_t kDefaultPoolID = 0; - static constexpr int64_t kInvalidPoolID = -1; - - ~CUDAGraph() { Reset(); } - - CUDAGraphID ID() const { return id_; } - - static int64_t SetMemoryPoolID(int64_t pool_id) { - auto &pool_id_ = capturing_graph_->pool_id_; - PADDLE_ENFORCE_EQ( - pool_id_, - kInvalidPoolID, - phi::errors::InvalidArgument("Cannot reset memory pool id twice, the " - "former memory pool id is %d.", - pool_id_)); - if (pool_id <= kInvalidPoolID) { - pool_id_ = UniqueMemoryPoolID(); - } else { - PADDLE_ENFORCE_GE( - pool_id, - kDefaultPoolID, - phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id)); - pool_id_ = pool_id; - } - return pool_id_; - } - - int64_t PoolID() const { return pool_id_; } - - static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; } - - void Replay(); - - void Reset(); - - void AddResetCallback(std::function callback) { - std::lock_guard guard(mtx_); - callbacks_.push_back(std::move(callback)); - } - - void PrintToDotFiles(const std::string &dirname, unsigned int flags); - - static void BeginCapture(platform::CUDAPlace place, - cudaStream_t stream, - cudaStreamCaptureMode mode); - static std::unique_ptr EndCapture(); - - static void BeginSegmentCapture(); - static void EndSegmentCapture(); - - static void AddResetCallbackDuringCapturing(std::function callback) { - capturing_graph_->AddResetCallback(std::move(callback)); - } - - // No need to add CUDA_VERSION macro because capturing_graph_ would - // always be nullptr (constructor throws error) - static bool IsCapturing() { return capturing_graph_ != nullptr; } - - static CUDAGraphID CapturingID() { return capturing_graph_->id_; } - - static platform::CUDAPlace CapturingPlace() { - return capturing_graph_->place_; - } - - // This API can be used to debug which GPU operation is not - // supported during capturing CUDA Graph. - static bool IsValidCapturing(); - - static bool IsThreadLocalCapturing() { -#if CUDA_VERSION >= 10010 - return IsCapturing() && - capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal; -#else - return false; -#endif - } - - static bool IsThisThreadCapturing() { - if (UNLIKELY(IsCapturing())) { - return IsThreadLocalCapturing() - ? 
capturing_thread_id_.get() == std::this_thread::get_id() - : true; - } else { - return false; - } - } - - using SetSeedFunc = std::function; - static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { - std::lock_guard guard(capturing_graph_->func_mtx_); - capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); - } - - static int64_t UniqueMemoryPoolID(); - - private: - static CUDAGraphID UniqueID(); - - private: -#if CUDA_VERSION >= 10010 - std::vector graphs_; - std::vector exec_graphs_; - cudaStreamCaptureMode capture_mode_; -#endif - cudaStream_t stream_{nullptr}; - platform::CUDAPlace place_; - CUDAGraphID id_; - int64_t pool_id_{kInvalidPoolID}; - std::vector> callbacks_; - bool is_reset_{false}; - std::mutex mtx_; - - std::vector set_seed_funcs_; - std::vector>> pre_hooks_; - std::mutex func_mtx_; - - bool is_first_run_{true}; - - static paddle::optional capturing_thread_id_; - static std::unique_ptr capturing_graph_; -}; - -#if CUDA_VERSION >= 10010 -class CUDAGraphCaptureModeGuard { - DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); - - public: - explicit CUDAGraphCaptureModeGuard( - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { - if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); - // After cudaThreadExchangeStreamCaptureMode is called, - // the variable "mode" would be set to the old capturing mode. - old_mode_ = mode; - } - } - - ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { - if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_GPU_SUCCESS( - cudaThreadExchangeStreamCaptureMode(&old_mode_)); - } - } - - private: - cudaStreamCaptureMode old_mode_; -}; -#else -class CUDAGraphCaptureModeGuard { - DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); - - public: - explicit CUDAGraphCaptureModeGuard( - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} -}; -#endif - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 31ac51050b87b..242f2a8e26002 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -30,7 +30,7 @@ cc_library( xpulib device_context op_kernel_type - phi_xpu_op_list) + phi_backends) cc_library( xpu_resource_pool SRCS xpu_resource_pool.cc diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index a4c76ab0e68a0..f8a6b2174a830 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,7 @@ if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) - list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc) + list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc) endif() if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) @@ -16,8 +16,9 @@ if(WITH_GPU OR WITH_ROCM) endif() if(WITH_XPU) - add_subdirectory(xpu) list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc) + list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc + xpu/xpu2_op_list.cc) endif() if(WITH_MKLDNN) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/phi/backends/gpu/cuda/cuda_graph.cc similarity index 90% rename from paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc rename to paddle/phi/backends/gpu/cuda/cuda_graph.cc index 61c8fe4f4c5fd..5fc39a5319945 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc +++ 
b/paddle/phi/backends/gpu/cuda/cuda_graph.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,14 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 
 #include
 #include
 #include
 
-namespace paddle {
-namespace platform {
+namespace phi {
+namespace backends {
+namespace gpu {
 
 std::unique_ptr CUDAGraph::capturing_graph_{nullptr};
 paddle::optional CUDAGraph::capturing_thread_id_{paddle::none};
@@ -113,7 +114,7 @@ void CUDAGraph::Replay() {
 #if CUDA_VERSION >= 10010
   PADDLE_ENFORCE_EQ(is_reset_,
                     false,
-                    errors::PermissionDenied(
+                    phi::errors::PermissionDenied(
                         "Cannot replay the CUDA Graph after reset is called."));
   size_t n = exec_graphs_.size();
   for (size_t i = 0; i < n; ++i) {
@@ -131,43 +132,43 @@ void CUDAGraph::Replay() {
 void CUDAGraph::BeginSegmentCapture() {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
-  PADDLE_ENFORCE_EQ(
-      IsCapturing(),
-      true,
-      errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
-                               "Graph is capturing."));
+  PADDLE_ENFORCE_EQ(IsCapturing(),
+                    true,
+                    phi::errors::PermissionDenied(
+                        "BeginSegmentCapture should be called when CUDA "
+                        "Graph is capturing."));
   if (IsThreadLocalCapturing()) {
     PADDLE_ENFORCE_EQ(IsThisThreadCapturing(),
                       true,
-                      platform::errors::PermissionDenied(
+                      phi::errors::PermissionDenied(
                           "When capturing CUDA Graph in the thread local mode, "
                           "you cannot begin segmented capturing in the thread "
                           "which is not the one that starts the capturing."));
   }
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
       capturing_graph_->stream_, capturing_graph_->capture_mode_));
-  PADDLE_ENFORCE_EQ(IsValidCapturing(),
-                    true,
-                    platform::errors::PermissionDenied(
-                        "CUDA Graph should not be invalidated."));
+  PADDLE_ENFORCE_EQ(
+      IsValidCapturing(),
+      true,
+      phi::errors::PermissionDenied("CUDA Graph should not be invalidated."));
   VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_
            << ", segment id " << capturing_graph_->graphs_.size()
            << ", memory pool id " << capturing_graph_->pool_id_;
 #endif
 }
 
-void CUDAGraph::BeginCapture(platform::CUDAPlace place,
+void CUDAGraph::BeginCapture(phi::GPUPlace place,
                              cudaStream_t stream,
                              cudaStreamCaptureMode mode) {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
-  PADDLE_ENFORCE_EQ(
-      IsCapturing(),
-      false,
-      errors::PermissionDenied("CUDA Graph can only captured one by one."));
+  PADDLE_ENFORCE_EQ(IsCapturing(),
+                    false,
+                    phi::errors::PermissionDenied(
+                        "CUDA Graph can only be captured one by one."));
   PADDLE_ENFORCE_NOT_NULL(
       stream,
-      errors::PermissionDenied(
+      phi::errors::PermissionDenied(
          "CUDA Graph cannot be captured in default CUDA stream 0."));
   capturing_graph_.reset(new CUDAGraph());
   capturing_graph_->place_ = place;
@@ -185,9 +186,10 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place,
 void CUDAGraph::EndSegmentCapture() {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
-  PADDLE_ENFORCE_EQ(IsCapturing(),
-                    true,
-                    errors::PermissionDenied("No CUDA Graph is capturing."));
+  PADDLE_ENFORCE_EQ(
+      IsCapturing(),
+      true,
+      phi::errors::PermissionDenied("No CUDA Graph is capturing."));
   cudaGraph_t graph;
  PADDLE_ENFORCE_GPU_SUCCESS(
      cudaStreamEndCapture(capturing_graph_->stream_, &graph));
@@ -299,11 +301,12 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname,
         cudaGraphDebugDotPrint(graphs_[i], filename.c_str(), flags));
   }
 #else
-  PADDLE_THROW(platform::errors::Unimplemented(
+  PADDLE_THROW(phi::errors::Unimplemented(
       "The print_to_dot_files() method is only supported when CUDA version >= "
      "11.3."));
 #endif
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace gpu
+}  // namespace backends
+}  // namespace phi
diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h
new file mode 100644
index 0000000000000..f2004eb6c7da0
--- /dev/null
+++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h
@@ -0,0 +1,241 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cuda.h"          // NOLINT
+#include "cuda_runtime.h"  // NOLINT
+
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#include "paddle/phi/core/macros.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+namespace backends {
+namespace gpu {
+
+class CUDAKernelParams {
+ public:
+  explicit CUDAKernelParams(const cudaKernelNodeParams *params)
+      : params_(params) {}
+
+  const void *func() const { return params_->func; }
+
+  template
+  T &As(size_t idx) const {
+    return *reinterpret_cast(params_->kernelParams[idx]);
+  }
+
+ private:
+  const cudaKernelNodeParams *params_;
+};
+
+#if CUDA_VERSION >= 10010
+static void ThrowErrorIfNotSupportCUDAGraph() {}
+#else
+enum cudaStreamCaptureMode {
+  cudaStreamCaptureModeGlobal = 0,
+  cudaStreamCaptureModeThreadLocal = 1,
+  cudaStreamCaptureModeRelaxed = 2
+};
+static void ThrowErrorIfNotSupportCUDAGraph() {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "CUDA Graph is only supported when CUDA version >= 10.1"));
+}
+#endif
+
+using CUDAGraphID = unsigned long long;  // NOLINT
+
+// NOTE: Currently, we do not support capturing CUDA graphs in parallel.
+// NOTE: Do not use this class directly because it should be used with
+// the memory pool.
+class CUDAGraph {
+  DISABLE_COPY_AND_ASSIGN(CUDAGraph);
+
+  // Since the constructor would throw an error if CUDA_VERSION < 10010,
+  // the non-static methods of CUDAGraph need not check CUDA_VERSION
+  // again.
+ CUDAGraph() { + ThrowErrorIfNotSupportCUDAGraph(); + id_ = UniqueID(); + } + + public: + static constexpr int64_t kDefaultPoolID = 0; + static constexpr int64_t kInvalidPoolID = -1; + + ~CUDAGraph() { Reset(); } + + CUDAGraphID ID() const { return id_; } + + static int64_t SetMemoryPoolID(int64_t pool_id) { + auto &pool_id_ = capturing_graph_->pool_id_; + PADDLE_ENFORCE_EQ( + pool_id_, + kInvalidPoolID, + phi::errors::InvalidArgument("Cannot reset memory pool id twice, the " + "former memory pool id is %d.", + pool_id_)); + if (pool_id <= kInvalidPoolID) { + pool_id_ = UniqueMemoryPoolID(); + } else { + PADDLE_ENFORCE_GE( + pool_id, + kDefaultPoolID, + phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id)); + pool_id_ = pool_id; + } + return pool_id_; + } + + int64_t PoolID() const { return pool_id_; } + + static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; } + + void Replay(); + + void Reset(); + + void AddResetCallback(std::function callback) { + std::lock_guard guard(mtx_); + callbacks_.push_back(std::move(callback)); + } + + void PrintToDotFiles(const std::string &dirname, unsigned int flags); + + static void BeginCapture(phi::GPUPlace place, + cudaStream_t stream, + cudaStreamCaptureMode mode); + static std::unique_ptr EndCapture(); + + static void BeginSegmentCapture(); + static void EndSegmentCapture(); + + static void AddResetCallbackDuringCapturing(std::function callback) { + capturing_graph_->AddResetCallback(std::move(callback)); + } + + // No need to add CUDA_VERSION macro because capturing_graph_ would + // always be nullptr (constructor throws error) + static bool IsCapturing() { return capturing_graph_ != nullptr; } + + static CUDAGraphID CapturingID() { return capturing_graph_->id_; } + + static phi::GPUPlace CapturingPlace() { return capturing_graph_->place_; } + + // This API can be used to debug which GPU operation is not + // supported during capturing CUDA Graph. + static bool IsValidCapturing(); + + static bool IsThreadLocalCapturing() { +#if CUDA_VERSION >= 10010 + return IsCapturing() && + capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal; +#else + return false; +#endif + } + + static bool IsThisThreadCapturing() { + if (UNLIKELY(IsCapturing())) { + return IsThreadLocalCapturing() + ? 
capturing_thread_id_.get() == std::this_thread::get_id() + : true; + } else { + return false; + } + } + + using SetSeedFunc = std::function; + static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) { + std::lock_guard guard(capturing_graph_->func_mtx_); + capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func)); + } + + static int64_t UniqueMemoryPoolID(); + + private: + static CUDAGraphID UniqueID(); + + private: +#if CUDA_VERSION >= 10010 + std::vector graphs_; + std::vector exec_graphs_; + cudaStreamCaptureMode capture_mode_; +#endif + cudaStream_t stream_{nullptr}; + phi::GPUPlace place_; + CUDAGraphID id_; + int64_t pool_id_{kInvalidPoolID}; + std::vector> callbacks_; + bool is_reset_{false}; + std::mutex mtx_; + + std::vector set_seed_funcs_; + std::vector>> pre_hooks_; + std::mutex func_mtx_; + + bool is_first_run_{true}; + + static paddle::optional capturing_thread_id_; + static std::unique_ptr capturing_graph_; +}; + +#if CUDA_VERSION >= 10010 +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + // After cudaThreadExchangeStreamCaptureMode is called, + // the variable "mode" would be set to the old capturing mode. + old_mode_ = mode; + } + } + + ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaThreadExchangeStreamCaptureMode(&old_mode_)); + } + } + + private: + cudaStreamCaptureMode old_mode_; +}; +#else +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} +}; +#endif + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt deleted file mode 100644 index d84e6a63e058a..0000000000000 --- a/paddle/phi/backends/xpu/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -cc_library( - phi_xpu_op_list - SRCS xpu_op_list.cc xpu1_op_list.cc xpu2_op_list.cc - DEPS glog) diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 8911da82b5480..6dc43ff633f19 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -19,7 +19,7 @@ if(WITH_XPU) cc_library( kernel_factory SRCS kernel_factory.cc - DEPS phi_enforce convert_utils phi_xpu_op_list) + DEPS phi_enforce convert_utils phi_backends) else() cc_library( kernel_factory diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index a18e695cce4d8..60747e36185a5 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -15,7 +15,7 @@ #include "paddle/phi/core/device_context.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#include "paddle/phi/backends/gpu/cuda/cuda_graph.h" #endif #include "paddle/phi/core/dense_tensor.h" @@ -153,8 +153,9 @@ struct DeviceContext::Impl { : (pinned ? 
pinned_allocator_ : device_allocator_); #ifdef PADDLE_WITH_CUDA bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned; - if (must_cuda_graph_allocator && paddle::platform::is_gpu_place(place) && - paddle::platform::CUDAGraph::IsThisThreadCapturing()) { + if (must_cuda_graph_allocator && + place.GetType() == phi::AllocationType::GPU && + phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { PADDLE_ENFORCE_NOT_NULL(cuda_graph_allocator_, phi::errors::InvalidArgument( "Required cuda_graph_allocator_ shall not be " diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index ac4a60384af19..cc32759b5f044 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -465,7 +465,7 @@ struct SearchAlgorithmBase { static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdFilterAlgo_t algo) { - paddle::platform::CUDAGraphCaptureModeGuard guard; + phi::backends::gpu::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(