Layer norm backward #58760

Merged: 34 commits, Nov 22, 2023

Commits
add1f01
polish
liuzhenhai93 Oct 24, 2023
80582e7
polish
liuzhenhai93 Nov 4, 2023
ce8b472
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 6, 2023
775df69
polish
liuzhenhai93 Nov 6, 2023
ad22ccc
polish
liuzhenhai93 Nov 6, 2023
5bcaa2f
polish
liuzhenhai93 Nov 6, 2023
8b82133
layer_norm_backward
liuzhenhai93 Nov 7, 2023
b24f85c
layer_norm_backward
liuzhenhai93 Nov 7, 2023
758149b
polish
liuzhenhai93 Nov 7, 2023
02ed2c5
add test
liuzhenhai93 Nov 7, 2023
85a2ecf
polish
liuzhenhai93 Nov 7, 2023
8e47a65
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 7, 2023
f0a47ae
polish
liuzhenhai93 Nov 7, 2023
56bc5e9
polish
liuzhenhai93 Nov 7, 2023
d0fdeec
add test
liuzhenhai93 Nov 7, 2023
c634811
polish
liuzhenhai93 Nov 7, 2023
dc0fd8f
polish
liuzhenhai93 Nov 7, 2023
62b87f4
format
liuzhenhai93 Nov 7, 2023
29b86b7
code gen not supported yet
liuzhenhai93 Nov 7, 2023
4d3ba17
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 13, 2023
66ec1be
polish
liuzhenhai93 Nov 13, 2023
986d725
polish
liuzhenhai93 Nov 13, 2023
c364e85
polish
liuzhenhai93 Nov 13, 2023
d76e7e0
add test
liuzhenhai93 Nov 13, 2023
90546d8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 14, 2023
de22227
polish
liuzhenhai93 Nov 14, 2023
f02351e
polish
liuzhenhai93 Nov 14, 2023
cf2ebd6
polish
liuzhenhai93 Nov 14, 2023
762331c
polish
liuzhenhai93 Nov 15, 2023
2bf59a8
polish
liuzhenhai93 Nov 15, 2023
6806fb5
polish
liuzhenhai93 Nov 15, 2023
3fa601b
polish
liuzhenhai93 Nov 16, 2023
32438d9
polish
liuzhenhai93 Nov 21, 2023
216f246
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
liuzhenhai93 Nov 21, 2023
1 change: 1 addition & 0 deletions paddle/phi/api/yaml/backward.yaml
@@ -1251,6 +1251,7 @@
output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad)
infer_meta :
func : LayerNormGradInferMeta
spmd_rule : LayerNormGradInferSpmd
param : [x, scale, bias]
kernel :
func : layer_norm_grad
8 changes: 8 additions & 0 deletions paddle/phi/api/yaml/generator/dist_api_gen.py
@@ -713,6 +713,14 @@ def generate_specialized_infer_spmd_code(self) -> str:
name=param
)
input_args_code += "meta_dist_input_" + param + ", "
elif (
self.inputs['input_info'][param]
== "const paddle::optional<Tensor>&"
):
input_decl_code += (
OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE.format(name=param)
)
input_args_code += "meta_dist_input_" + param + ", "

else:
raise ValueError(
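The new `elif` branch handles inputs typed `const paddle::optional<Tensor>&` (layer_norm's `scale` and `bias` are optional inputs, which is presumably why it is needed for this PR), so the specialized infer_spmd codegen can build a dist meta tensor for them instead of raising. A rough Python sketch of that dispatch is below; the two template strings are illustrative placeholders, not the repository's actual `SINGLE_DIST_META_IN_TEMPLATE` / `OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE`.

```python
# Hedged sketch of the generator's per-input dispatch; the template bodies
# below are placeholders, not the real codegen templates.
SINGLE_DIST_META_IN_TEMPLATE = (
    "  auto meta_dist_input_{name} = MakeDistMetaTensor(*{name}.impl());\n"
)
OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE = (
    "  auto meta_dist_input_{name} = {name} ? "
    "MakeDistMetaTensor(*(*{name}).impl()) : phi::distributed::DistMetaTensor();\n"
)


def build_infer_spmd_inputs(params, input_info):
    """Emit one dist-meta declaration per input and collect the call args."""
    decl_code, args_code = "", ""
    for param in params:
        kind = input_info[param]
        if kind == "const Tensor&":
            decl_code += SINGLE_DIST_META_IN_TEMPLATE.format(name=param)
        elif kind == "const paddle::optional<Tensor>&":
            decl_code += OPTIONAL_SINGLE_DIST_META_IN_TEMPLATE.format(name=param)
        else:
            raise ValueError(f"unsupported infer_spmd input type: {kind}")
        args_code += "meta_dist_input_" + param + ", "
    return decl_code, args_code


decl, args = build_infer_spmd_inputs(
    ["x", "scale", "bias", "mean", "variance", "out_grad"],
    {"x": "const Tensor&", "scale": "const paddle::optional<Tensor>&",
     "bias": "const paddle::optional<Tensor>&", "mean": "const Tensor&",
     "variance": "const Tensor&", "out_grad": "const Tensor&"},
)
print(decl)
print(args)
```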
1 change: 1 addition & 0 deletions paddle/phi/api/yaml/ops.yaml
@@ -1410,6 +1410,7 @@
output : Tensor(out), Tensor(mean), Tensor(variance)
infer_meta :
func : LayerNormInferMeta
spmd_rule : LayerNormInferSpmd
kernel :
func : layer_norm
data_type : x
116 changes: 116 additions & 0 deletions paddle/phi/infermeta/spmd_rules/layer_norm.cc
@@ -283,5 +283,121 @@ SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x,
return {ToArgDistAttr(input_dist_attrs), ToArgDistAttr(output_dist_attrs)};
}

std::tuple<std::vector<std::string>, std::string> BuildLayerNormGradEinsum(
int64_t input_rank, int64_t begin_norm_axis) {
std::string alphabet = "ijklmnopqrstuvwxyz";
std::string x_notation = alphabet.substr(0, input_rank);
std::string mean_variance_notation = x_notation.substr(0, begin_norm_axis);
std::string align_notation = x_notation.substr(0, begin_norm_axis);
return {
{x_notation, mean_variance_notation, mean_variance_notation, x_notation},
align_notation};
}

SpmdInfo LayerNormGradInferSpmd(const DistMetaTensor& x,
const DistMetaTensor& scale,
const DistMetaTensor& bias,
const DistMetaTensor& mean,
const DistMetaTensor& variance,
const DistMetaTensor out_grad,
float epsilon,
int begin_norm_axis) {
auto get_shape = [](const auto& meta) {
return phi::vectorize<int64_t>(meta.dims());
};
  // 1. check tensor shapes
auto x_shape = get_shape(x);
auto scale_shape = get_shape(scale);
auto bias_shape = get_shape(bias);
auto mean_shape = get_shape(mean);
auto variance_shape = get_shape(variance);
auto out_grad_shape = get_shape(out_grad);
PADDLE_ENFORCE_GE(
x_shape.size(),
begin_norm_axis,
phi::errors::InvalidArgument(
"The Tensor x's rank [%d] and begin_norm_axis [%d] are not matched.",
x_shape.size(),
begin_norm_axis));
PADDLE_ENFORCE_EQ(
x_shape.size(),
out_grad_shape.size(),
phi::errors::InvalidArgument("The Tensor x's rank [%d] and Tensor "
"out_grad's rank [%d] are not matched.",
x_shape.size(),
out_grad_shape.size()));

PADDLE_ENFORCE_EQ(
scale_shape.size(),
bias_shape.size(),
phi::errors::InvalidArgument("The Tensor scale's rank [%d] and Tensor "
"bias's rank [%d] are not matched.",
scale_shape.size(),
bias_shape.size()));

PADDLE_ENFORCE_EQ(
mean_shape.size(),
variance_shape.size(),
phi::errors::InvalidArgument("The Tensor mean's rank [%d] and Tensor "
"variance's rank [%d] are not matched.",
mean_shape.size(),
variance_shape.size()));

  // 2. align sharding
TensorDistAttr x_dist_attr;
TensorDistAttr mean_dist_attr;
TensorDistAttr variance_dist_attr;
TensorDistAttr grad_dist_attr;
std::vector<TensorDistAttr> dist_attrs;
dist_attrs.push_back(x.dist_attr());
dist_attrs.push_back(mean.dist_attr());
dist_attrs.push_back(variance.dist_attr());
dist_attrs.push_back(out_grad.dist_attr());
if (begin_norm_axis > 0) {
std::vector<std::vector<int64_t>> shapes = {
x_shape, mean_shape, variance_shape, x_shape};
std::vector<std::string> anotations;
std::string align_anotation;
std::tie(anotations, align_anotation) =
BuildLayerNormGradEinsum(x_shape.size(), begin_norm_axis);
AlignDimsSharding(
&dist_attrs, shapes, anotations, {}, align_anotation, false);
x_dist_attr = std::move(dist_attrs[0]);
mean_dist_attr = std::move(dist_attrs[1]);
variance_dist_attr = std::move(dist_attrs[2]);
grad_dist_attr = std::move(dist_attrs[3]);
} else {
x_dist_attr = GetReplicatedDistAttr(dist_attrs[0]);
mean_dist_attr = GetReplicatedDistAttr(dist_attrs[1]);
variance_dist_attr = GetReplicatedDistAttr(dist_attrs[2]);
grad_dist_attr = GetReplicatedDistAttr(dist_attrs[3]);
}
// TODO(liuzhenhai): support sharded scale and bias
TensorDistAttr scale_dist_attr = GetReplicatedDistAttr(scale.dist_attr());
TensorDistAttr bias_dist_attr = GetReplicatedDistAttr(bias.dist_attr());
TensorDistAttr scale_grad_dist_attr =
GetReplicatedDistAttr(scale.dist_attr());
TensorDistAttr bias_grad_dist_attr = GetReplicatedDistAttr(bias.dist_attr());
// partial grad dim
std::vector<int64_t> partial_on_dims;
const auto& dim_mapping = x_dist_attr.dims_mapping();
for (int i = 0; i < begin_norm_axis; ++i) {
auto mapping = dim_mapping[i];
if (mapping != -1) {
partial_on_dims.push_back(i);
}
}
scale_grad_dist_attr.set_partial_status(partial_on_dims);
bias_grad_dist_attr.set_partial_status(partial_on_dims);

return SpmdInfo({x_dist_attr,
scale_dist_attr,
bias_dist_attr,
mean_dist_attr,
variance_dist_attr,
grad_dist_attr},
{grad_dist_attr, scale_grad_dist_attr, bias_grad_dist_attr});
}

} // namespace distributed
} // namespace phi
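To make the einsum construction above concrete, here is a minimal Python sketch (independent of the Paddle sources) of `BuildLayerNormGradEinsum`: `x` and `out_grad` share the full notation, `mean` and `variance` use only the leading batch axes, and sharding is aligned on those batch axes. Any batch axis that remains sharded afterwards is what marks `scale_grad` and `bias_grad` as partial on the corresponding mesh dimension.

```python
# Minimal sketch of BuildLayerNormGradEinsum from layer_norm.cc above,
# re-expressed in Python to show the notations it produces.
def build_layer_norm_grad_einsum(input_rank: int, begin_norm_axis: int):
    alphabet = "ijklmnopqrstuvwxyz"
    x_notation = alphabet[:input_rank]                     # e.g. "ijk" for rank 3
    mean_variance_notation = x_notation[:begin_norm_axis]  # batch dims only
    align_notation = x_notation[:begin_norm_axis]
    # order matches dist_attrs: x, mean, variance, out_grad
    return (
        [x_notation, mean_variance_notation, mean_variance_notation, x_notation],
        align_notation,
    )


# rank-3 input, normalization starting at axis 2 (test 1 in spmd_rule_test.cc)
print(build_layer_norm_grad_einsum(3, 2))
# (['ijk', 'ij', 'ij', 'ijk'], 'ij')

# rank-3 input, normalization starting at axis 1 (test 2)
print(build_layer_norm_grad_einsum(3, 1))
# (['ijk', 'i', 'i', 'ijk'], 'i')
```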
9 changes: 9 additions & 0 deletions paddle/phi/infermeta/spmd_rules/layer_norm.h
@@ -26,6 +26,15 @@ SpmdInfo LayerNormInferSpmd(const DistMetaTensor& x,
float epsilon,
int begin_norm_axis);

SpmdInfo LayerNormGradInferSpmd(const DistMetaTensor& x,
const DistMetaTensor& scale,
const DistMetaTensor& bias,
const DistMetaTensor& mean,
const DistMetaTensor& variance,
const DistMetaTensor out_grad,
float epsilon = 1e-5,
int begin_norm_axis = 1);

SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x,
const DistMetaTensor& scale,
const DistMetaTensor& bias,
11 changes: 11 additions & 0 deletions paddle/phi/infermeta/spmd_rules/rules.h
@@ -447,6 +447,11 @@ PD_REGISTER_SPMD_RULE(
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd),
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
not_equal,
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd),
PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse));

// TODO(pkuzyc): add multiary elementwise rule

// reduction rule
@@ -474,6 +479,12 @@
max,
PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
reduce_max,
PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));

PD_REGISTER_SPMD_RULE(
min,
PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
81 changes: 81 additions & 0 deletions test/auto_parallel/semi_auto_parallel_for_layernorm.py
@@ -0,0 +1,81 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from semi_auto_parallel_util import SemiAutoParallelTestBase

import paddle
import paddle.distributed as dist


def layer_norm(input, weights, bias, normalized_shape):
return paddle.nn.functional.layer_norm(
input, normalized_shape, weight=weights, bias=bias
)


class TestLayerNormSemiAutoParallel(SemiAutoParallelTestBase):
def __init__(self):
super().__init__()

def check_tensor_eq(self, a, b):
np1 = a.numpy()
np2 = b.numpy()
np.testing.assert_allclose(np1, np2, rtol=1e-04, verbose=True)

def check_dim_mapping(self, output, expected_dim_mapping):
assert (
output.dist_attr.dims_mapping == expected_dim_mapping
), f"{output.dist_attr.dims_mapping} vs {expected_dim_mapping}"

def test_layernorm_forward(self):
shapes = ([16, 4, 4], [16], [16])
specs = (['x', None, None], [None], [None])
inputs, outputs = self.runfunc_and_check(
inputs_shape=shapes,
inputs_specs=specs,
op_func=layer_norm,
with_backward=True,
normalized_shape=[4, 4],
)
self.check_dim_mapping(outputs, [0, -1, -1])

def test_layernorm_reshard(self):
shapes = ([16, 4, 4], [16], [16])
specs = ([None, None, 'x'], [None], [None])
inputs, outputs = self.runfunc_and_check(
inputs_shape=shapes,
inputs_specs=specs,
op_func=layer_norm,
with_backward=True,
normalized_shape=[4, 4],
)
self.check_dim_mapping(outputs, [-1, -1, -1])

def run_test_case(self):
if self._backend == "cpu":
paddle.set_device("cpu")
elif self._backend == "gpu":
paddle.set_device("gpu:" + str(dist.get_rank()))
else:
raise ValueError("Only support cpu or gpu backend.")

self.test_layernorm_forward()
# all to all is not supported yet for cpu
if self._backend == "gpu":
self.test_layernorm_reshard()


if __name__ == '__main__':
TestLayerNormSemiAutoParallel().run_test_case()
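For context, the following is a hedged, standalone sketch of what the test above exercises end to end, assuming two devices and the placements-based semi-auto API (`dist.ProcessMesh`, `dist.shard_tensor`, `dist.Shard`, `dist.Replicate`); the sharding-spec strings used by `SemiAutoParallelTestBase` (e.g. `['x', None, None]`) correspond to these placements. Launched under `python -m paddle.distributed.launch --devices=0,1`, the backward pass is where `LayerNormGradInferSpmd` decides the gradient shardings.

```python
# Hedged sketch, not part of the PR's test suite: shard the batch dim of x,
# run layer_norm forward + backward, and inspect the resulting dist attrs.
# Assumes 2 ranks, e.g. `python -m paddle.distributed.launch --devices=0,1 demo.py`.
import paddle
import paddle.distributed as dist
import paddle.nn.functional as F

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

x = paddle.randn([16, 4, 4])
w = paddle.ones([16])
b = paddle.zeros([16])

# batch dim of x sharded over mesh axis "x"; weight and bias replicated
dx = dist.shard_tensor(x, mesh, [dist.Shard(0)])
dw = dist.shard_tensor(w, mesh, [dist.Replicate()])
db = dist.shard_tensor(b, mesh, [dist.Replicate()])
for t in (dx, dw, db):
    t.stop_gradient = False

out = F.layer_norm(dx, normalized_shape=[4, 4], weight=dw, bias=db)
out.sum().backward()  # LayerNormGradInferSpmd runs for the grad op here

print(out.dist_attr)      # expected: batch dim still sharded, like dx
print(dw.grad.dist_attr)  # expected: partial over the mesh dim sharding the batch
```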
10 changes: 10 additions & 0 deletions test/auto_parallel/test_semi_auto_parallel_basic.py
@@ -56,6 +56,16 @@ def test_concat_api(self):
user_defined_envs=envs,
)

def test_layernorm_api(self):
envs_list = test_base.gen_product_envs_list(
self._default_envs, self._changeable_envs
)
for envs in envs_list:
self.run_test_case(
"semi_auto_parallel_for_layernorm.py",
user_defined_envs=envs,
)

def test_reduction_api(self):
envs_list = test_base.gen_product_envs_list(
self._default_envs, self._changeable_envs
66 changes: 66 additions & 0 deletions test/cpp/auto_parallel/spmd_rule_test.cc
@@ -997,6 +997,72 @@ TEST(Numel, Ctor) {
check_partial_dims(infered_dist_attrs.second[0], {0});
}

TEST(LayerNorm, Ctor) {
using phi::distributed::PartialStatus;
std::vector<int64_t> mesh_shape = {2, 2};
std::vector<int64_t> process_ids = {0, 1, 2, 3};
std::vector<std::string> dim_names = {"x", "y"};
ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);

std::vector<int64_t> x_shapes = {16, 32, 32};

auto build_input = [&](const std::vector<int64_t>& shape,
const std::vector<int64_t>& dim_mapping) {
auto t_dist_attr = TensorDistAttr();
t_dist_attr.set_process_mesh(process_mesh);
t_dist_attr.set_dims_mapping(dim_mapping);
t_dist_attr.set_dynamic_dims({false, false, false});
auto input =
phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr);
return input;
};
// test 1
auto x = build_input(x_shapes, {0, 1, -1});
auto out_grad = build_input(x_shapes, {0, 1, -1});
auto mean = build_input({16, 32}, {0, 1});
auto variance = build_input({16, 32}, {0, 1});
auto scale = build_input({32}, {0});
auto bias = build_input({32}, {0});

auto spmd1 =
LayerNormGradInferSpmd(x, scale, bias, mean, variance, out_grad, 1.0, 2);

EXPECT_EQ(spmd1.first.size(), static_cast<size_t>(6));
EXPECT_EQ(spmd1.second.size(), static_cast<size_t>(3));

check_dim_mapping(spmd1.first[0], {0, 1, -1});
check_dim_mapping(spmd1.first[1], {-1});
check_dim_mapping(spmd1.first[2], {-1});
check_dim_mapping(spmd1.first[3], {0, 1});
check_dim_mapping(spmd1.first[4], {0, 1});
check_dim_mapping(spmd1.first[5], {0, 1, -1});
check_dim_mapping(spmd1.second[0], {0, 1, -1});
check_dim_mapping(spmd1.second[1], {-1});
check_dim_mapping(spmd1.second[2], {-1});
check_partial_dims(spmd1.second[1], {0, 1});
check_partial_dims(spmd1.second[2], {0, 1});
// test 2
mean = build_input({16}, {0});
variance = build_input({16}, {0});
scale = build_input({32, 32}, {0, 1});
bias = build_input({32, 32}, {0, 1});
auto spmd2 =
LayerNormGradInferSpmd(x, scale, bias, mean, variance, out_grad, 1.0, 1);
EXPECT_EQ(spmd2.first.size(), static_cast<size_t>(6));
EXPECT_EQ(spmd2.second.size(), static_cast<size_t>(3));
check_dim_mapping(spmd2.first[0], {0, -1, -1});
check_dim_mapping(spmd2.first[1], {-1, -1});
check_dim_mapping(spmd2.first[2], {-1, -1});
check_dim_mapping(spmd2.first[3], {0});
check_dim_mapping(spmd2.first[4], {0});
check_dim_mapping(spmd2.first[5], {0, -1, -1});
check_dim_mapping(spmd2.second[0], {0, -1, -1});
check_dim_mapping(spmd2.second[1], {-1, -1});
check_dim_mapping(spmd2.second[2], {-1, -1});
check_partial_dims(spmd2.second[1], {0});
check_partial_dims(spmd2.second[2], {0});
}

TEST(Util, Ctor) {
// test equal test not equal
using phi::distributed::PartialStatus;