-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add model average optimizer for fluid #9082
Merged
wanghaoshuang
merged 13 commits into
PaddlePaddle:develop
from
wanghaoshuang:average_model
Mar 22, 2018
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
8a64568
Add sum accumulator with window for model average
wanghaoshuang d7e5e1f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghaoshuang aee6867
Add clone_variable function for Block class.
wanghaoshuang 016d0eb
Add python API for sum op.
wanghaoshuang 87fe52c
Add ModelAverage class to optimizer.py
wanghaoshuang e0b136c
Refine average accumulates op
wanghaoshuang 92a01d4
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghaoshuang cad4d7f
Refine initial and API of ModelAverage API
wanghaoshuang d22f4de
Refine sum_accumulates_op.
wanghaoshuang e01c770
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghaoshuang 68c9f6e
Fix error while params_grads[1]==None
wanghaoshuang ad63722
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghaoshuang edb4e29
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghaoshuang File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#include "paddle/fluid/operators/average_accumulates_op.h" | ||
|
||
namespace paddle { | ||
namespace operators { | ||
|
||
template <> | ||
void GetAccumulators<paddle::platform::CPUDeviceContext>( | ||
const framework::ExecutionContext& ctx, int64_t& num_updates_, | ||
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { | ||
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); | ||
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); | ||
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); | ||
|
||
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0]; | ||
num_accumulates_ = in_num_accumulates->data<int64_t>()[0]; | ||
num_updates_ = in_num_updates->data<int64_t>()[0]; | ||
} | ||
|
||
template <> | ||
void SetAccumulators<paddle::platform::CPUDeviceContext>( | ||
const framework::ExecutionContext& ctx, int64_t num_updates_, | ||
int64_t num_accumulates_, int64_t old_num_accumulates_) { | ||
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates"); | ||
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates"); | ||
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates"); | ||
|
||
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_; | ||
out_num_accumulates->data<int64_t>()[0] = num_accumulates_; | ||
out_num_updates->data<int64_t>()[0] = num_updates_; | ||
} | ||
|
||
class AverageAccumulatesOp : public framework::OperatorWithKernel { | ||
public: | ||
using framework::OperatorWithKernel::OperatorWithKernel; | ||
|
||
void InferShape(framework::InferShapeContext* ctx) const override { | ||
PADDLE_ENFORCE( | ||
ctx->HasInput("param"), | ||
"Input (param) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasInput("in_sum_1"), | ||
"Input (sum_1) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasInput("in_sum_2"), | ||
"Input (sum_2) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasInput("in_sum_3"), | ||
"Input (sum_3) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasInput("in_num_accumulates"), | ||
"Input (in_num_accumulates) of average_accumulates op should " | ||
"not be null."); | ||
PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"), | ||
"Input (old_num_accumulates) of average_accumulates op " | ||
"should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasInput("in_num_updates"), | ||
"Input (num_updates) of average_accumulates op should not be null."); | ||
|
||
PADDLE_ENFORCE( | ||
ctx->HasOutput("out_sum_1"), | ||
"Output (sum_1) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasOutput("out_sum_2"), | ||
"Output (sum_2) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasOutput("out_sum_3"), | ||
"Output (sum_3) of average_accumulates op should not be null."); | ||
PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"), | ||
"Output (num_accumulates) of average_accumulates op should " | ||
"not be null."); | ||
PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"), | ||
"Output (old_num_accumulates) of average_accumulates op " | ||
"should not be null."); | ||
PADDLE_ENFORCE( | ||
ctx->HasOutput("out_num_updates"), | ||
"Output (num_updates) of average_accumulates op should not be null."); | ||
|
||
auto in_dim = ctx->GetInputDim("param"); | ||
|
||
ctx->SetOutputDim("out_sum_1", in_dim); | ||
ctx->SetOutputDim("out_sum_2", in_dim); | ||
ctx->SetOutputDim("out_sum_3", in_dim); | ||
ctx->SetOutputDim("out_num_accumulates", {1}); | ||
ctx->SetOutputDim("out_old_num_accumulates", {1}); | ||
ctx->SetOutputDim("out_num_updates", {1}); | ||
} | ||
|
||
protected: | ||
framework::OpKernelType GetExpectedKernelType( | ||
const framework::ExecutionContext& ctx) const override { | ||
return framework::OpKernelType( | ||
framework::ToDataType(ctx.Input<Tensor>("param")->type()), | ||
ctx.GetPlace()); | ||
} | ||
}; | ||
|
||
// Declares the op's proto: inputs, outputs, attributes and the user-facing
// documentation string. Typos in the documentation strings are fixed here
// ("A auxiliary" -> "An auxiliary", "trainning" -> "training",
// "Minimu" -> "Minimum", "whtin" -> "within").
class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "(Tensor), The parameter to be accumulated.");
    AddInput("in_sum_1",
             "(Tensor), A tensor used to store the parameter "
             "sums with the same shape as input(param).");
    AddInput("in_sum_2",
             "(Tensor), An auxiliary tensor to help "
             "accumulating sums of parameter values with the same shape as "
             "input(param). It is used to avoid loss of precision due to too "
             "many sums.");
    AddInput("in_sum_3",
             "(Tensor), An auxiliary tensor to help "
             "accumulating sums of parameter values with the same shape as "
             "input(param).");
    AddInput("in_num_accumulates",
             "(Tensor<int64_t>), The accumulating times of current window with "
             "shape [1].");
    AddInput(
        "in_old_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of previous window with "
        "shape [1].");
    AddInput("in_num_updates",
             "(Tensor<int64_t>), The total number of batches used by training "
             "before this batch with shape [1].");

    AddOutput("out_sum_1",
              "(Tensor), A tensor used to store the "
              "parameter sums with the same shape as input(param).");
    AddOutput("out_sum_2",
              "(Tensor), An auxiliary tensor to help "
              "accumulating sums of parameter values with the same shape as "
              "input(param). It is used to avoid loss of precision due to too "
              "many sums.");
    AddOutput("out_sum_3",
              "(Tensor), An auxiliary tensor to help "
              "accumulating sums of parameter values with the same shape as "
              "input(param).");
    AddOutput(
        "out_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of current window with "
        "shape [1].");
    AddOutput(
        "out_old_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of previous window with "
        "shape [1].");
    AddOutput(
        "out_num_updates",
        "(Tensor<int64_t>), The total number of batches used by training "
        "before this batch with shape [1].");

    AddAttr<float>("average_window",
                   "(float, default 0) "
                   "The rate of average window size relative to num_updates.")
        .SetDefault(0);
    AddAttr<int64_t>("max_average_window",
                     "(int64_t) "
                     "Maximum size of average window. It suggests that the "
                     "number of mini-batches "
                     "in one pass is appropriate value to set.");
    AddAttr<int64_t>("min_average_window",
                     "(int64_t, default 10000L) "
                     "Minimum size of average window.")
        .SetDefault(10000L);

    AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of parameter within sliding window. The size of sliding window is
determined by 'average_window', 'max_average_window' and 'min_average_window'.
Memory is shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' are handled the same as 'sum_1'.

All the accumulators are initialized to zero before training.

And for a mini-batch in training, accumulators are computed as the following steps:
    num_updates += 1
    num_accumulates += 1
    sum_1 += param
    if num_updates % kMaxNumAccumulates == 0:
        sum_2 += sum_1
        sum_1 = 0
    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
        sum_3 = sum_1 + sum_2
        sum_1 = 0
        sum_2 = 0
        old_num_accumulates = num_accumulates
        num_accumulates = 0

)DOC");
  }
};
|
||
} // namespace operators | ||
} // namespace paddle | ||
|
||
namespace ops = paddle::operators;
// average_accumulates has no gradient: it only maintains running sums that
// are consumed at inference/evaluation time, so an EmptyGradOpMaker is
// registered.
REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
                  ops::AverageAccumulatesOpMaker,
                  paddle::framework::EmptyGradOpMaker);
// CPU kernels for float and double parameter tensors.
REGISTER_OP_CPU_KERNEL(
    average_accumulates,
    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#include "paddle/fluid/operators/average_accumulates_op.h" | ||
#include "paddle/fluid/platform/gpu_info.h" | ||
|
||
namespace paddle { | ||
namespace operators { | ||
template <> | ||
void GetAccumulators<paddle::platform::CUDADeviceContext>( | ||
const framework::ExecutionContext& ctx, int64_t& num_updates_, | ||
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { | ||
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); | ||
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); | ||
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); | ||
auto stream = ctx.cuda_device_context().stream(); | ||
memory::Copy(platform::CPUPlace(), &old_num_accumulates_, | ||
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), | ||
sizeof(int64_t), stream); | ||
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), | ||
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream); | ||
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), | ||
in_num_updates->data<int64_t>(), sizeof(int64_t), stream); | ||
} | ||
|
||
template <> | ||
void SetAccumulators<paddle::platform::CUDADeviceContext>( | ||
const framework::ExecutionContext& ctx, int64_t num_updates_, | ||
int64_t num_accumulates_, int64_t old_num_accumulates_) { | ||
auto stream = ctx.cuda_device_context().stream(); | ||
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates"); | ||
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates"); | ||
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates"); | ||
|
||
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(), | ||
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), | ||
stream); | ||
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(), | ||
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), | ||
stream); | ||
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(), | ||
platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream); | ||
} | ||
|
||
} // namespace operators | ||
} // namespace paddle | ||
|
||
namespace ops = paddle::operators;
// CUDA kernels for float and double parameter tensors; the shared kernel
// body lives in average_accumulates_op.h.
REGISTER_OP_CUDA_KERNEL(
    average_accumulates,
    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#pragma once | ||
#include <algorithm> | ||
#include "paddle/fluid/framework/eigen.h" | ||
#include "paddle/fluid/framework/op_registry.h" | ||
#include "paddle/fluid/operators/math/math_function.h" | ||
|
||
namespace paddle { | ||
namespace operators { | ||
|
||
using Tensor = framework::Tensor;

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

// Reads the three scalar window counters (num_updates, num_accumulates,
// old_num_accumulates) from the op's input tensors into the given int64
// references. Device-specific specializations are provided in the .cc
// (CPU) and .cu (CUDA) translation units.
template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t& num_updates, int64_t& num_accumulates,
                     int64_t& old_num_accumulates);

// Writes the three scalar window counters back to the op's output tensors.
// Device-specific specializations are provided in the .cc (CPU) and .cu
// (CUDA) translation units.
template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t num_updates, int64_t num_accumulates,
                     int64_t old_num_accumulates);
|
||
// Kernel that folds the current parameter value into the sliding-window sums
// and updates the window bookkeeping counters. Per the op comment in the
// .cc file, each out_* accumulator is expected to share memory with its
// in_* counterpart, so the assignments below behave as in-place updates.
template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // It is used to avoid loss of precision
    static const int64_t kMaxNumAccumulates = 16384;
    // Get accumulators from input
    int64_t num_updates = 0;
    int64_t num_accumulates = 0;
    int64_t old_num_accumulates = 0;
    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
                                   old_num_accumulates);

    // Get attrs
    float average_window = ctx.Attr<float>("average_window");
    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
    // The effective lower bound of the window can never exceed the upper
    // bound.
    min_average_window =
        std::min<int64_t>(min_average_window, max_average_window);

    // Get inputs
    auto* param = ctx.Input<Tensor>("param");
    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
    auto param_tensor = EigenVector<T>::Flatten(*param);
    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);

    // Get outputs
    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);

    // Compute
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::SetConstant<DeviceContext, T> constant_functor;
    // One more mini-batch has been seen, both overall and in this window.
    ++num_updates;
    ++num_accumulates;
    // sum_1 absorbs the new parameter value; sum_2 and sum_3 are carried
    // forward unchanged by default and may be overwritten below.
    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
    out_sum_2_tensor.device(place) = in_sum_2_tensor;
    out_sum_3_tensor.device(place) = in_sum_3_tensor;
    if (num_updates % kMaxNumAccumulates == 0) {
      // Move the sum to a different buffer to avoid loss of precision due to
      // too many sums.
      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                       0.0);
    }
    // Note: num_updates * average_window is a float product; std::min<int64_t>
    // truncates it back to an integer window size.
    if (num_accumulates >= min_average_window &&
        num_accumulates >= std::min<int64_t>(max_average_window,
                                             num_updates * average_window)) {
      // Now the average window is too long, discard the old sum.
      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                       0.0);
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
                       0.0);
      // Start a new window, remembering how many batches the finished one
      // accumulated.
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }

    // Set accumulators to output
    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
                                   old_num_accumulates);
  }
};
|
||
} // namespace operators | ||
} // namespace paddle |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any reference paper for kMaxNumAccumulates = 16384?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that 16384 is an experimental value. There are no reference papers.