Layer norm backward #58760

Merged: 34 commits, Nov 22, 2023

Commits
add1f01
polish
liuzhenhai93 Oct 24, 2023
80582e7
polish
liuzhenhai93 Nov 4, 2023
ce8b472
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 6, 2023
775df69
polish
liuzhenhai93 Nov 6, 2023
ad22ccc
polish
liuzhenhai93 Nov 6, 2023
5bcaa2f
polish
liuzhenhai93 Nov 6, 2023
8b82133
layer_norm_backward
liuzhenhai93 Nov 7, 2023
b24f85c
layer_norm_backward
liuzhenhai93 Nov 7, 2023
758149b
polish
liuzhenhai93 Nov 7, 2023
02ed2c5
add test
liuzhenhai93 Nov 7, 2023
85a2ecf
polish
liuzhenhai93 Nov 7, 2023
8e47a65
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 7, 2023
f0a47ae
polish
liuzhenhai93 Nov 7, 2023
56bc5e9
polish
liuzhenhai93 Nov 7, 2023
d0fdeec
add test
liuzhenhai93 Nov 7, 2023
c634811
polish
liuzhenhai93 Nov 7, 2023
dc0fd8f
polish
liuzhenhai93 Nov 7, 2023
62b87f4
format
liuzhenhai93 Nov 7, 2023
29b86b7
code gen not supported yet
liuzhenhai93 Nov 7, 2023
4d3ba17
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 13, 2023
66ec1be
polish
liuzhenhai93 Nov 13, 2023
986d725
polish
liuzhenhai93 Nov 13, 2023
c364e85
polish
liuzhenhai93 Nov 13, 2023
d76e7e0
add test
liuzhenhai93 Nov 13, 2023
90546d8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 14, 2023
de22227
polish
liuzhenhai93 Nov 14, 2023
f02351e
polish
liuzhenhai93 Nov 14, 2023
cf2ebd6
polish
liuzhenhai93 Nov 14, 2023
762331c
polish
liuzhenhai93 Nov 15, 2023
2bf59a8
polish
liuzhenhai93 Nov 15, 2023
6806fb5
polish
liuzhenhai93 Nov 15, 2023
3fa601b
polish
liuzhenhai93 Nov 16, 2023
32438d9
polish
liuzhenhai93 Nov 21, 2023
216f246
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 21, 2023
1 change: 1 addition & 0 deletions paddle/phi/api/yaml/backward.yaml
@@ -1251,6 +1251,7 @@
output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
infer_meta :
func : LayerNormGradInferMeta
spmd_rule : LayerNormGradInferSpmd
param : [x, scale, bias]
kernel :
func : layer_norm_grad
8 changes: 8 additions & 0 deletions paddle/phi/api/yaml/generator/dist_api_gen.py
@@ -713,6 +713,14 @@ def generate_specialized_infer_spmd_code(self) -> str:
name=param
)
input_args_code += "meta_dist_input_" + param + ", "
elif (
self.inputs['input_info'][param]
== "const paddle::optional<Tensor>&"
):
input_decl_code += (
OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE.format(name=param)
)
input_args_code += "meta_dist_input_" + param + ", "

else:
raise ValueError(
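The new `elif` branch handles inputs typed `const paddle::optional<Tensor>&` (layer_norm's `scale` and `bias` are optional inputs, which is presumably why it is needed for this PR), so the specialized infer_spmd codegen can build a dist meta tensor for them instead of raising. A rough Python sketch of that dispatch is below; the two template strings are illustrative placeholders, not the repository's actual `SINGLE_DIST_META_IN_TEMPLATE` / `OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE`.

```python
# Hedged sketch of the generator's per-input dispatch; the template bodies
# below are placeholders, not the real codegen templates.
SINGLE_DIST_META_IN_TEMPLATE = (
    "  auto meta_dist_input_{name} = MakeDistMetaTensor(*{name}.impl());\n"
)
OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE = (
    "  auto meta_dist_input_{name} = {name} ? "
    "MakeDistMetaTensor(*(*{name}).impl()) : phi::distributed::DistMetaTensor();\n"
)


def build_infer_spmd_inputs(params, input_info):
    """Emit one dist-meta declaration per input and collect the call args."""
    decl_code, args_code = "", ""
    for param in params:
        kind = input_info[param]
        if kind == "const Tensor&":
            decl_code += SINGLE_DIST_META_IN_TEMPLATE.format(name=param)
        elif kind == "const paddle::optional<Tensor>&":
            decl_code += OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE.format(name=param)
        else:
            raise ValueError(f"unsupported infer_spmd input type: {kind}")
        args_code += "meta_dist_input_" + param + ", "
    return decl_code, args_code


decl, args = build_infer_spmd_inputs(
    ["x", "scale", "bias", "mean", "variance", "out_grad"],
    {"x": "const Tensor&", "scale": "const paddle::optional<Tensor>&",
     "bias": "const paddle::optional<Tensor>&", "mean": "const Tensor&",
     "variance": "const Tensor&", "out_grad": "const Tensor&"},
)
print(decl)
print(args)
```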
1 change: 1 addition & 0 deletions paddle/phi/api/yaml/ops.yaml
@@ -1410,6 +1410,7 @@
output : Tensor(out), Tensor(mean), Tensor(variance)
infer_meta :
func : LayerNormInferMeta
spmd_rule : LayerNormInferSpmd
kernel :
func : layer_norm
data_type : x
116 changes: 116 additions & 0 deletions paddle/phi/infermeta/spmd_rules/layer_norm.cc
@@ -283,5 +283,121 @@ SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x,
return {ToArgDistAttr(input_dist_attrs), ToArgDistAttr(output_dist_attrs)};
}

std::tuple<std::vector<std::string>, std::string> BuildLayerNormGradEinsum(
int64_t input_rank, int64_t begin_norm_axis) {
std::string alphabet = "ijklmnopqrstuvwxyz";
std::string x_notation = alphabet.substr(0, input_rank);
std::string mean_variance_notation = x_notation.substr(0, begin_norm_axis);
std::string align_notation = x_notation.substr(0, begin_norm_axis);
return {
{x_notation, mean_variance_notation, mean_variance_notation, x_notation},
align_notation};
}

SpmdInfo LayerNormGradInferSpmd(const DistMetaTensor& x,
const DistMetaTensor& scale,
const DistMetaTensor& bias,
const DistMetaTensor& mean,
const DistMetaTensor& variance,
const DistMetaTensor out_grad,
float epsilon,
int begin_norm_axis) {
auto get_shape = [](const auto& meta) {
return phi::vectorize<int64_t>(meta.dims());
};
  // 1. check tensor shapes
auto x_shape = get_shape(x);
auto scale_shape = get_shape(scale);
auto bias_shape = get_shape(bias);
auto mean_shape = get_shape(mean);
auto variance_shape = get_shape(variance);
auto out_grad_shape = get_shape(out_grad);
PADDLE_ENFORCE_GE(
x_shape.size(),
begin_norm_axis,
phi::errors::InvalidArgument(
"The Tensor x's rank [%d] and begin_norm_axis [%d] are not matched.",
x_shape.size(),
begin_norm_axis));
PADDLE_ENFORCE_EQ(
x_shape.size(),
out_grad_shape.size(),
phi::errors::InvalidArgument("The Tensor x's rank [%d] and Tensor "
"out_grad's rank [%d] are not matched.",
x_shape.size(),
out_grad_shape.size()));

PADDLE_ENFORCE_EQ(
scale_shape.size(),
bias_shape.size(),
phi::errors::InvalidArgument("The Tensor scale's rank [%d] and Tensor "
"bias's rank [%d] are not matched.",
scale_shape.size(),
bias_shape.size()));

PADDLE_ENFORCE_EQ(
mean_shape.size(),
variance_shape.size(),
phi::errors::InvalidArgument("The Tensor mean's rank [%d] and Tensor "
"variance's rank [%d] are not matched.",
mean_shape.size(),
variance_shape.size()));

  // 2. align sharding
TensorDistAttr x_dist_attr;
TensorDistAttr mean_dist_attr;
TensorDistAttr variance_dist_attr;
TensorDistAttr grad_dist_attr;
std::vector<TensorDistAttr> dist_attrs;
dist_attrs.push_back(x.dist_attr());
dist_attrs.push_back(mean.dist_attr());
dist_attrs.push_back(variance.dist_attr());
dist_attrs.push_back(out_grad.dist_attr());
if (begin_norm_axis > 0) {
std::vector<std::vector<int64_t>> shapes = {
x_shape, mean_shape, variance_shape, x_shape};
std::vector<std::string> anotations;
std::string align_anotation;
std::tie(anotations, align_anotation) =
BuildLayerNormGradEinsum(x_shape.size(), begin_norm_axis);
AlignDimsSharding(
&dist_attrs, shapes, anotations, {}, align_anotation, false);
x_dist_attr = std::move(dist_attrs[0]);
mean_dist_attr = std::move(dist_attrs[1]);
variance_dist_attr = std::move(dist_attrs[2]);
grad_dist_attr = std::move(dist_attrs[3]);
} else {
x_dist_attr = GetReplicatedDistAttr(dist_attrs[0]);
mean_dist_attr = GetReplicatedDistAttr(dist_attrs[1]);
variance_dist_attr = GetReplicatedDistAttr(dist_attrs[2]);
grad_dist_attr = GetReplicatedDistAttr(dist_attrs[3]);
}
// TODO(liuzhenhai): support sharded scale and bias
TensorDistAttr scale_dist_attr = GetReplicatedDistAttr(scale.dist_attr());
TensorDistAttr bias_dist_attr = GetReplicatedDistAttr(bias.dist_attr());
TensorDistAttr scale_grad_dist_attr =
GetReplicatedDistAttr(scale.dist_attr());
TensorDistAttr bias_grad_dist_attr = GetReplicatedDistAttr(bias.dist_attr());
// partial grad dim
std::vector<int64_t> partial_on_dims;
const auto& dim_mapping = x_dist_attr.dims_mapping();
for (int i = 0; i < begin_norm_axis; ++i) {
auto mapping = dim_mapping[i];
if (mapping != -1) {
partial_on_dims.push_back(i);
}
}
scale_grad_dist_attr.set_partial_status(partial_on_dims);
bias_grad_dist_attr.set_partial_status(partial_on_dims);

return SpmdInfo({x_dist_attr,
scale_dist_attr,
bias_dist_attr,
mean_dist_attr,
variance_dist_attr,
grad_dist_attr},
{grad_dist_attr, scale_grad_dist_attr, bias_grad_dist_attr});
}

} // namespace distributed
} // namespace phi
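To make the einsum construction above concrete, here is a minimal Python sketch (independent of the Paddle sources) of `BuildLayerNormGradEinsum`: `x` and `out_grad` share the full notation, `mean` and `variance` use only the leading batch axes, and sharding is aligned on those batch axes. Any batch axis that remains sharded afterwards is what marks `scale_grad` and `bias_grad` as partial on the corresponding mesh dimension.

```python
# Minimal sketch of BuildLayerNormGradEinsum from layer_norm.cc above,
# re-expressed in Python to show the notations it produces.
def build_layer_norm_grad_einsum(input_rank: int, begin_norm_axis: int):
    alphabet = "ijklmnopqrstuvwxyz"
    x_notation = alphabet[:input_rank]                     # e.g. "ijk" for rank 3
    mean_variance_notation = x_notation[:begin_norm_axis]  # batch dims only
    align_notation = x_notation[:begin_norm_axis]
    # order matches dist_attrs: x, mean, variance, out_grad
    return (
        [x_notation, mean_variance_notation, mean_variance_notation, x_notation],
        align_notation,
    )


# rank-3 input, normalization starting at axis 2 (test 1 in spmd_rule_test.cc)
print(build_layer_norm_grad_einsum(3, 2))
# (['ijk', 'ij', 'ij', 'ijk'], 'ij')

# rank-3 input, normalization starting at axis 1 (test 2)
print(build_layer_norm_grad_einsum(3, 1))
# (['ijk', 'i', 'i', 'ijk'], 'i')
```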
9 changes: 9 additions & 0 deletions paddle/phi/infermeta/spmd_rules/layer_norm.h
@@ -26,6 +26,15 @@ SpmdInfo LayerNormInferSpmd(const DistMetaTensor& x,
float epsilon,
int begin_norm_axis);

SpmdInfo LayerNormGradInferSpmd(const DistMetaTensor& x,
const DistMetaTensor& scale,
const DistMetaTensor& bias,
const DistMetaTensor& mean,
const DistMetaTensor& variance,
const DistMetaTensor out_grad,
float epsilon = 1e-5,
int begin_norm_axis = 1);

SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x,
const DistMetaTensor& scale,
const DistMetaTensor& bias,
11 changes: 11 additions & 0 deletions paddle/phi/infermeta/spmd_rules/rules.h
@@ -447,6 +447,11 @@ PD_REGISTER_SPMD_RULE(
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd),
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
not_equal,
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd),
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse));

// TODO(pkuzyc): add multiary elementwise rule

// reduction rule
@@ -474,6 +479,12 @@
max,
PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
reduce_max,
PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
min,
PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
81 changes: 81 additions & 0 deletions test/auto_parallel/semi_auto_parallel_for_layernorm.py
@@ -0,0 +1,81 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from semi_auto_parallel_util import SemiAutoParallelTestBase

import paddle
import paddle.distributed as dist


def layer_norm(input, weights, bias, normalized_shape):
return paddle.nn.functional.layer_norm(
input, normalized_shape, weight=weights, bias=bias
)


class TestLayerNormSemiAutoParallel(SemiAutoParallelTestBase):
def __init__(self):
super().__init__()

def check_tensor_eq(self, a, b):
np1 = a.numpy()
np2 = b.numpy()
np.testing.assert_allclose(np1, np2, rtol=1e-04, verbose=True)

def check_dim_mapping(self, output, expected_dim_mapping):
assert (
output.dist_attr.dims_mapping == expected_dim_mapping
), f"{output.dist_attr.dims_mapping} vs {expected_dim_mapping}"

def test_layernorm_forward(self):
shapes = ([16, 4, 4], [16], [16])
specs = (['x', None, None], [None], [None])
inputs, outputs = self.runfunc_and_check(
inputs_shape=shapes,
inputs_specs=specs,
op_func=layer_norm,
with_backward=True,
normalized_shape=[4, 4],
)
self.check_dim_mapping(outputs, [0, -1, -1])

def test_layernorm_reshard(self):
shapes = ([16, 4, 4], [16], [16])
specs = ([None, None, 'x'], [None], [None])
inputs, outputs = self.runfunc_and_check(
inputs_shape=shapes,
inputs_specs=specs,
op_func=layer_norm,
with_backward=True,
normalized_shape=[4, 4],
)
self.check_dim_mapping(outputs, [-1, -1, -1])

def run_test_case(self):
if self._backend == "cpu":
paddle.set_device("cpu")
elif self._backend == "gpu":
paddle.set_device("gpu:" + str(dist.get_rank()))
else:
raise ValueError("Only support cpu or gpu backend.")

self.test_layernorm_forward()
# all to all is not supported yet for cpu
if self._backend == "gpu":
self.test_layernorm_reshard()


if __name__ == '__main__':
TestLayerNormSemiAutoParallel().run_test_case()
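For context, the following is a hedged, standalone sketch of what the test above exercises end to end, assuming two devices and the placements-based semi-auto API (`dist.ProcessMesh`, `dist.shard_tensor`, `dist.Shard`, `dist.Replicate`); the sharding-spec strings used by `SemiAutoParallelTestBase` (e.g. `['x', None, None]`) correspond to these placements. Launched under `python -m paddle.distributed.launch --devices=0,1`, the backward pass is where `LayerNormGradInferSpmd` decides the gradient shardings.

```python
# Hedged sketch, not part of the PR's test suite: shard the batch dim of x,
# run layer_norm forward + backward, and inspect the resulting dist attrs.
# Assumes 2 ranks, e.g. `python -m paddle.distributed.launch --devices=0,1 demo.py`.
import paddle
import paddle.distributed as dist
import paddle.nn.functional as F

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

x = paddle.randn([16, 4, 4])
w = paddle.ones([16])
b = paddle.zeros([16])

# batch dim of x sharded over mesh axis "x"; weight and bias replicated
dx = dist.shard_tensor(x, mesh, [dist.Shard(0)])
dw = dist.shard_tensor(w, mesh, [dist.Replicate()])
db = dist.shard_tensor(b, mesh, [dist.Replicate()])
for t in (dx, dw, db):
    t.stop_gradient = False

out = F.layer_norm(dx, normalized_shape=[4, 4], weight=dw, bias=db)
out.sum().backward()  # LayerNormGradInferSpmd runs for the grad op here

print(out.dist_attr)      # expected: batch dim still sharded, like dx
print(dw.grad.dist_attr)  # expected: partial over the mesh dim sharding the batch
```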
10 changes: 10 additions & 0 deletions test/auto_parallel/test_semi_auto_parallel_basic.py
@@ -56,6 +56,16 @@ def test_concat_api(self):
user_defined_envs=envs,
)

def test_layernorm_api(self):
envs_list = test_base.gen_product_envs_list(
self._default_envs, self._changeable_envs
)
for envs in envs_list:
self.run_test_case(
"semi_auto_parallel_for_layernorm.py",
user_defined_envs=envs,
)

def test_reduction_api(self):
envs_list = test_base.gen_product_envs_list(
self._default_envs, self._changeable_envs
66 changes: 66 additions & 0 deletions test/cpp/auto_parallel/spmd_rule_test.cc
@@ -997,6 +997,72 @@ TEST(Numel, Ctor) {
check_partial_dims(infered_dist_attrs.second[0], {0});
}

TEST(LayerNorm, Ctor) {
using phi::distributed::PartialStatus;
std::vector<int64_t> mesh_shape = {2, 2};
std::vector<int64_t> process_ids = {0, 1, 2, 3};
std::vector<std::string> dim_names = {"x", "y"};
ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);

std::vector<int64_t> x_shapes = {16, 32, 32};

auto build_input = [&](const std::vector<int64_t>& shape,
const std::vector<int64_t>& dim_mapping) {
auto t_dist_attr = TensorDistAttr();
t_dist_attr.set_process_mesh(process_mesh);
t_dist_attr.set_dims_mapping(dim_mapping);
t_dist_attr.set_dynamic_dims({false, false, false});
auto input =
phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr);
return input;
};
// test 1
auto x = build_input(x_shapes, {0, 1, -1});
auto out_grad = build_input(x_shapes, {0, 1, -1});
auto mean = build_input({16, 32}, {0, 1});
auto variance = build_input({16, 32}, {0, 1});
auto scale = build_input({32}, {0});
auto bias = build_input({32}, {0});

auto spmd1 =
LayerNormGradInferSpmd(x, scale, bias, mean, variance, out_grad, 1.0, 2);

EXPECT_EQ(spmd1.first.size(), static_cast<size_t>(6));
EXPECT_EQ(spmd1.second.size(), static_cast<size_t>(3));

check_dim_mapping(spmd1.first[0], {0, 1, -1});
check_dim_mapping(spmd1.first[1], {-1});
check_dim_mapping(spmd1.first[2], {-1});
check_dim_mapping(spmd1.first[3], {0, 1});
check_dim_mapping(spmd1.first[4], {0, 1});
check_dim_mapping(spmd1.first[5], {0, 1, -1});
check_dim_mapping(spmd1.second[0], {0, 1, -1});
check_dim_mapping(spmd1.second[1], {-1});
check_dim_mapping(spmd1.second[2], {-1});
check_partial_dims(spmd1.second[1], {0, 1});
check_partial_dims(spmd1.second[2], {0, 1});
// test 2
mean = build_input({16}, {0});
variance = build_input({16}, {0});
scale = build_input({32, 32}, {0, 1});
bias = build_input({32, 32}, {0, 1});
auto spmd2 =
LayerNormGradInferSpmd(x, scale, bias, mean, variance, out_grad, 1.0, 1);
EXPECT_EQ(spmd2.first.size(), static_cast<size_t>(6));
EXPECT_EQ(spmd2.second.size(), static_cast<size_t>(3));
check_dim_mapping(spmd2.first[0], {0, -1, -1});
check_dim_mapping(spmd2.first[1], {-1, -1});
check_dim_mapping(spmd2.first[2], {-1, -1});
check_dim_mapping(spmd2.first[3], {0});
check_dim_mapping(spmd2.first[4], {0});
check_dim_mapping(spmd2.first[5], {0, -1, -1});
check_dim_mapping(spmd2.second[0], {0, -1, -1});
check_dim_mapping(spmd2.second[1], {-1, -1});
check_dim_mapping(spmd2.second[2], {-1, -1});
check_partial_dims(spmd2.second[1], {0});
check_partial_dims(spmd2.second[2], {0});
}

TEST(Util, Ctor) {
// test equal test not equal
using phi::distributed::PartialStatus;