Skip to content

Commit

Permalink
Make the normalization operator more general and fix bug in l2_normal…
Browse files Browse the repository at this point in the history
…ize. (#11348)

* Add normalization operator.
1. Refine the raw norm_op and let it more general to support to normalize Tensor along any axis.
2. There is a bug in l2_normalize API, which lacks sqrt after `reduce_sum`.
3. Use norm_op to refine the l2_normalize API.
4. Fix bug in test_normalization_wrapper.py.
  • Loading branch information
qingqing01 committed Jun 12, 2018
1 parent f15504e commit 19fd071
Show file tree
Hide file tree
Showing 7 changed files with 193 additions and 271 deletions.
80 changes: 37 additions & 43 deletions paddle/fluid/operators/norm_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,40 +16,34 @@ limitations under the License. */
namespace paddle {
namespace operators {

template <typename AttrType>
class NormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"X",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddInput("Scale",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is C * 1.");
AddAttr<AttrType>("epsilon",
"(float, default 1e-10) Constant "
"for numerical stability.")
AddInput("X", "(Tensor) A tensor of rank >= axis.");
AddAttr<int>("axis",
"The axis on which to apply normalization. If axis < 0, "
"the dimension to normalization is rank(X) + axis. -1 is "
"the last dimension.");
AddAttr<float>("epsilon",
"(float, default 1e-10) The epsilon value is used "
"to avoid division by zero.")
.SetDefault(1.0e-10f);
AddOutput("Out",
"(Tensor) The output tensor of norm operator."
"N * M."
"M = C * H * W");
AddOutput("Norm",
"(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will "
"be used in backward kernel.")
.AsIntermediate();
AddOutput("Out", "(Tensor) A tensor of the same shape as X.");
AddComment(R"DOC(
"Input shape: $(N, C, H, W)$
Scale shape: $(C, 1)$
Output shape: $(N, C, H, W)$
Where
forward
$$
[\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
$$
backward
$$
\frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
$$
)DOC");
Given a tensor, apply 2-normalization along the provided axis.
$$
y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
$$
where, $\sum {x^2}$ is calculated along the `axis` dimension.
)DOC");
}
};

Expand All @@ -58,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of NormOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of NormOp"
"should not be null.");
"Input(X) of NormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of NormOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", in_x_dims);
auto xdim = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", xdim);
int axis = ctx->Attrs().Get<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
xdim[axis] = 1;
ctx->SetOutputDim("Norm", xdim);
}
};

Expand All @@ -84,12 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel {
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker<float>,
using CPU = paddle::platform::CPUDeviceContext;

REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
REGISTER_OP_CPU_KERNEL(
norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
ops::NormKernel<CPU, double>);
REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
ops::NormGradKernel<CPU, double>);
12 changes: 6 additions & 6 deletions paddle/fluid/operators/norm_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/norm_op.h"

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
REGISTER_OP_CUDA_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
using CUDA = paddle::platform::CUDADeviceContext;

REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel<CUDA, float>,
ops::NormKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel<CUDA, float>,
ops::NormGradKernel<CUDA, double>);
240 changes: 97 additions & 143 deletions paddle/fluid/operators/norm_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,156 +19,110 @@ limitations under the License. */
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T, typename AttrType = T>
inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
int* post) {
*pre = 1;
*post = 1;
*n = dim[axis];
for (int i = 0; i < axis; ++i) {
(*pre) *= dim[i];
}
for (int i = axis + 1; i < dim.size(); ++i) {
(*post) *= dim[i];
}
}

template <typename DeviceContext, typename T>
class NormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Out");
auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
out->mutable_data<T>(context.GetPlace());
int batch_size = in_x->dims()[0];
int channels = in_x->dims()[1];
int height = in_x->dims()[2];
int width = in_x->dims()[3];
int fea_len = height * width;
auto* place =
context.template device_context<DeviceContext>().eigen_device();
auto x =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in_x, framework::make_ddim({batch_size, fea_len * channels}));
// get square
framework::Tensor x_square;
x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
auto x_square_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square, framework::make_ddim({batch_size, fea_len * channels}));
x_square_eigen.device(*place) = x.square();
auto scale_eigen =
framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
*scale);
for (int n = 0; n < batch_size; ++n) {
framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
auto in_x_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_x_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
auto x_square_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor out_batch = out->Slice(n, n + 1);
auto out_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
out_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor tmp_tensor;
tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto tmp = framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(tmp_tensor);
// get colsum and sqrt , inverse
auto dim = Eigen::array<int, 1>({{0}});
tmp.device(*place) = x_square_batch_eigen.sum(dim);
tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
Eigen::array<int, 2> broadcast_dim_col;
broadcast_dim_col[1] = 1;
broadcast_dim_col[0] = channels;
out_batch_eigen.device(*place) =
in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
Eigen::array<int, 2> broadcast_dim_row;
broadcast_dim_row[1] = fea_len;
broadcast_dim_row[0] = 1;
out_batch_eigen.device(*place) =
out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_x = ctx.Input<framework::Tensor>("X");
auto* out_y = ctx.Output<framework::Tensor>("Out");
auto* out_norm = ctx.Output<framework::Tensor>("Norm");
out_y->mutable_data<T>(ctx.GetPlace());
out_norm->mutable_data<T>(ctx.GetPlace());

auto xdim = in_x->dims();
auto ndim = out_norm->dims();
T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
int pre, n, post;
GetDims(xdim, axis, &pre, &n, &post);

auto* place = ctx.template device_context<DeviceContext>().eigen_device();

Eigen::DSizes<int, 3> shape(pre, n, post);
Eigen::DSizes<int, 2> norm_shape(pre, post);

auto x_e = framework::EigenVector<T>::Flatten(*in_x);
auto y_e = framework::EigenVector<T>::Flatten(*out_y);
auto norm_e = framework::EigenVector<T>::Flatten(*out_norm);
auto x = x_e.reshape(shape);
auto y = y_e.reshape(shape);
auto norm = norm_e.reshape(norm_shape);

Eigen::DSizes<int, 1> rdim(1);
// y = x / sqrt((sum(x * x) + epsilon))
// norm = sqrt(sum(x * x) + epsilon)
auto sum = x.pow(2).sum(rdim) + eps;
norm.device(*place) = sum.sqrt();
// y = x / norm
Eigen::DSizes<int, 3> rshape(pre, 1, post);
Eigen::DSizes<int, 3> bcast(1, n, 1);
y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
}
};
template <typename DeviceContext, typename T, typename AttrType = T>
class NormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
const framework::Tensor* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
framework::Tensor* in_x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
in_x_grad->mutable_data<T>(context.GetPlace());
int batch_size = in_x->dims()[0];
int channels = in_x->dims()[1];
int height = in_x->dims()[2];
int width = in_x->dims()[3];
int fea_len = height * width;
auto* place =
context.template device_context<DeviceContext>().eigen_device();

auto scale_eigen =
framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
*scale);
auto x =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in_x, framework::make_ddim({batch_size, fea_len * channels}));
// get square
framework::Tensor x_square;
x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
auto x_square_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square, framework::make_ddim({batch_size, fea_len * channels}));
x_square_eigen.device(*place) = x.square();

for (int n = 0; n < batch_size; ++n) {
framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
auto in_x_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_x_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
auto in_g_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_g_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
auto x_square_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
auto outg_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
outg_batch, framework::make_ddim({channels, fea_len}));

framework::Tensor tmp_tensor;
tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto tmp_eigen =
framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(tmp_tensor);
auto dim = Eigen::array<int, 1>({{0}});
tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
framework::Tensor norm_tmp_tensor;
norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto norm_tmp_eigen =
framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
norm_tmp_eigen.device(*place) =
(x_square_batch_eigen.sum(dim) + epsilon).sqrt();
Eigen::array<int, 2> broadcast_dim_col;
broadcast_dim_col[1] = 1;
broadcast_dim_col[0] = channels;
in_g_batch_eigen.device(*place) =
in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
in_g_batch_eigen.device(*place) =
in_g_batch_eigen /
(norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
// outg_batch_eigen + (in_g_batch_eigen * -1);
in_g_batch_eigen.device(*place) =
in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
Eigen::array<int, 2> broadcast_dim_row;
broadcast_dim_row[1] = fea_len;
broadcast_dim_row[0] = 1;
in_g_batch_eigen.device(*place) =
in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_x = ctx.Input<framework::Tensor>("X");
auto* in_norm = ctx.Input<framework::Tensor>("Norm");
auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
out_dx->mutable_data<T>(ctx.GetPlace());

auto xdim = in_x->dims();
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis = xdim.size() + axis;
int pre, n, post;
GetDims(xdim, axis, &pre, &n, &post);

auto* place = ctx.template device_context<DeviceContext>().eigen_device();

auto x_e = framework::EigenVector<T>::Flatten(*in_x);
auto dy_e = framework::EigenVector<T>::Flatten(*in_dy);
auto norm_e = framework::EigenVector<T>::Flatten(*in_norm);
auto dx_e = framework::EigenVector<T>::Flatten(*out_dx);

Eigen::DSizes<int, 3> shape(pre, n, post);
Eigen::DSizes<int, 2> norm_shape(pre, post);
auto x = x_e.reshape(shape);
auto dy = dy_e.reshape(shape);
auto norm = norm_e.reshape(norm_shape);
auto dx = dx_e.reshape(shape);

framework::Tensor rsum;
rsum.mutable_data<T>({pre, post}, ctx.GetPlace());
auto sum = framework::EigenTensor<T, 2>::From(rsum);

Eigen::DSizes<int, 1> rdim(1);
Eigen::DSizes<int, 3> bcast(1, n, 1);
Eigen::DSizes<int, 3> rshape(pre, 1, post);

// dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)]
// = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x))
// = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x))
// 1. sum = sum(x*dy)
sum.device(*place) = (x * dy).sum(rdim);
// 2. dx = x * sum
dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x;
// 3. dx / (sum(x*x) + e)
// where, norm.pow(2) = sum(x*x) + e, which is calculated in forward.
dx.device(*place) = dx / norm.pow(2).broadcast(bcast);
// 4. [dy - dx] / sqrt(sum(x*x))
dx.device(*place) = (dy - dx) / norm.broadcast(bcast);
}
};
} // namespace operators
Expand Down
Loading

0 comments on commit 19fd071

Please sign in to comment.