Update the epsilon in batch_norm_layer to a variable in v2. #5692

Merged 6 commits on Nov 22, 2017
1 change: 1 addition & 0 deletions paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
useGlobalStats_ = config_.use_global_stats();
}
movingAvgFraction_ = config_.moving_average_fraction();
epsilon_ = config_.epsilon();

weight_.reset(new Weight(1, channels_, parameters_[0]));
movingMean_.reset(new Weight(1, channels_, parameters_[1]));
2 changes: 2 additions & 0 deletions paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,6 +94,8 @@ class BatchNormBaseLayer : public Layer {
bool useGlobalStats_;
// use to compute moving mean and variance.
real movingAvgFraction_;
// Epsilon is a small constant added to the variance in batch normalization for numerical stability.
real epsilon_;
};

} // namespace paddle
6 changes: 2 additions & 4 deletions paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -22,8 +22,6 @@ namespace paddle {

REGISTER_LAYER(batch_norm, BatchNormalizationLayer);

const real BatchNormalizationLayer::EPS = 1E-5;

bool BatchNormalizationLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {

calMovingMeanAndVar();

savedInvVar_->subScalar(-EPS);
savedInvVar_->subScalar(-epsilon_);
savedInvVar_->sqrt2(*savedInvVar_);
}

@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
savedInvVar_->copyFrom(*(movingVar_->getW()));
savedInvVar_->downClip(real(0.0));

savedInvVar_->subScalar(-EPS);
savedInvVar_->subScalar(-epsilon_);
savedInvVar_->sqrt2(*savedInvVar_);
}

3 changes: 0 additions & 3 deletions paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ class BatchNormalizationLayer : public BatchNormBaseLayer {
void backward(const UpdateCallback& callback = nullptr) override;

protected:
/// Epsilon value used in the batch normalization formula.
static const real EPS;

/// Load pre-calculated mean and std.
void setMeanAndStd();

16 changes: 10 additions & 6 deletions paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {

REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);

const double CudnnBatchNormLayer::EPS = 1E-5;

bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
@@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
real* movingMean = movingMean_->getW()->getData();
real* movingVar = movingVar_->getW()->getData();

// cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));

if (!useGlobalStats_) {
REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
real* savedMean = savedMean_->getData();
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
1.0 - movingAvgFraction_,
movingMean,
movingVar,
EPS,
eps_,
savedMean,
savedInvVar);
} else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
beta,
movingMean,
movingVar,
EPS);
eps_);
} else {
// There is a limitation in cudnn library.
// When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
beta,
movingMean,
movingVar,
EPS,
eps_,
batchSize,
channels_,
imageH_ * imageD_,
@@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
real* savedMean = savedMean_->getData();
real* savedInvVar = savedInvVar_->getData();

// cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));

auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
Matrix::resizeOrCreate(m, h, w, false, true);
m->zeroMem();
@@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
gamma,
gammaGrad,
betaGrad,
EPS,
eps_,
savedMean,
savedInvVar);

10 changes: 4 additions & 6 deletions paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,6 +14,7 @@ limitations under the License. */

#pragma once

#include <cudnn.h>
#include "BatchNormBaseLayer.h"
#include "Layer.h"
#include "paddle/utils/Stat.h"
@@ -46,12 +47,9 @@ class CudnnBatchNormLayer : public BatchNormBaseLayer {
void backward(const UpdateCallback& callback = nullptr) override;

protected:
/**
* Epsilon value used in the batch normalization formula.
* Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
* Same epsilon value should be used in forward and backward functions.
*/
static const double EPS;
/// Epsilon value used in the batch normalization formula.
/// Same epsilon value should be used in forward and backward functions.
double eps_;

/// Input/output tensor descriptor desc
hl_tensor_descriptor ioDesc_;
8 changes: 4 additions & 4 deletions paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {

REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);

const real MKLDNNBatchNormLayer::EPS = 1E-5;

bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
useGlobalStats_ = config_.use_global_stats();
}
movingAvgFraction_ = config_.moving_average_fraction();
epsilon_ = config_.epsilon();

VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
<< " --- global stats";
VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
@@ -210,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
if (wgt) {
flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
}
auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
if (wgt) {
@@ -277,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
}
CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
auto md = in->getMemoryDesc();
auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
3 changes: 2 additions & 1 deletion paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -32,7 +32,8 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer {
std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;

// Epsilon value used in the batch normalization formula.
static const real EPS;
real epsilon_;

// weight and bias in paddle
std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> biases_;
4 changes: 4 additions & 0 deletions proto/ModelConfig.proto
@@ -540,6 +540,10 @@ message LayerConfig {

// for switch order layer
optional ReshapeConfig reshape_conf = 59;

// for batch normalization layer
// The small constant added to the variance to improve numeric stability.
optional double epsilon = 60 [ default = 0.00001 ];
}

message EvaluatorConfig {
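For reference (not part of the diff), the new field is the constant added to the mini-batch variance before the square root in the batch normalization transform:

\hat{x} = \frac{x - \mu_{\mathcal{B}}}{\sqrt{\sigma_{\mathcal{B}}^{2} + \epsilon}}, \qquad y = \gamma \hat{x} + \beta

With the proto default of 0.00001 this matches the previously hard-coded EPS = 1E-5 in the C++ layers.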
4 changes: 4 additions & 0 deletions python/paddle/trainer/config_parser.py
@@ -2412,6 +2412,7 @@ def __init__(self,
bias=True,
img3D=False,
use_global_stats=True,
epsilon=1e-5,
moving_average_fraction=0.9,
batch_norm_type=None,
mean_var_names=None,
@@ -2460,6 +2461,9 @@ def __init__(self,
self.config.use_global_stats = use_global_stats
if moving_average_fraction is not None:
self.config.moving_average_fraction = moving_average_fraction
if epsilon is not None:

@lcy-seso (Contributor) commented on Nov 17, 2017:

Isn't this logic problematic? The default value of epsilon is already set to 1e-5. Would a user set epsilon to None only to end up with the default value again? That seems odd.

The author (Contributor) replied:

This was originally assigned following the same pattern as moving_average_fraction; it has been fixed.

assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
self.config.epsilon = epsilon

input_layer = self.get_input_layer(0)
image_conf = self.config.inputs[0].image_conf
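A minimal sketch of the behaviour this parser change enforces (the Cfg class and set_epsilon helper below are hypothetical stand-ins, not part of the PR): passing epsilon=None keeps the protobuf default of 1e-5, and anything smaller than 1e-5 trips the assert.

# Hypothetical stand-in for the new check in config_parser.py.
class Cfg(object):
    epsilon = 1e-5  # mirrors the LayerConfig proto default

def set_epsilon(cfg, epsilon):
    if epsilon is not None:
        assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
        cfg.epsilon = epsilon  # None keeps the proto default untouched

cfg = Cfg()
set_epsilon(cfg, 1e-4)    # accepted; cfg.epsilon is now 1e-4
# set_epsilon(cfg, 1e-6)  # would raise AssertionError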
5 changes: 5 additions & 0 deletions python/paddle/trainer_config_helpers/layers.py
@@ -3118,6 +3118,7 @@ def batch_norm_layer(input,
param_attr=None,
layer_attr=None,
batch_norm_type=None,
epsilon=1e-5,
moving_average_fraction=0.9,
use_global_stats=None,
mean_var_names=None):
@@ -3188,6 +3189,8 @@ def batch_norm_layer(input,
will use the mean and variance of the current batch
of test data.
:type use_global_stats: bool | None.
:param epsilon: The small constant added to the variance to improve numeric stability.
:type epsilon: float.
:param moving_average_fraction: Factor used in the moving average computation.
:math:`runningMean = newMean*(1-factor) + runningMean*factor`
:type moving_average_fraction: float.
@@ -3205,6 +3208,7 @@
assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
(batch_norm_type == "mkldnn_batch_norm") or \
(batch_norm_type == "cudnn_batch_norm")

l = Layer(
name=name,
img3D=img3D,
@@ -3214,6 +3218,7 @@
type=LayerType.BATCH_NORM_LAYER,
batch_norm_type=batch_norm_type,
bias=ParamAttr.to_bias(bias_attr),
epsilon=epsilon,
moving_average_fraction=moving_average_fraction,
use_global_stats=use_global_stats,
mean_var_names=mean_var_names,
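As a usage sketch of the new keyword in a v2 trainer config (the layer sizes and names here are illustrative, not taken from this PR):

from paddle.trainer_config_helpers import *

img = data_layer(name='image', size=8 * 16 * 16, height=16, width=16)
conv = img_conv_layer(
    input=img,
    filter_size=1,
    num_channels=8,
    num_filters=16,
    stride=1,
    bias_attr=False,
    act=LinearActivation())
# epsilon is forwarded into LayerConfig.epsilon; values below 1e-5 are
# rejected by the config parser.
bn = batch_norm_layer(input=conv, act=ReluActivation(), epsilon=1e-4)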
Changes to a generated test protostr file (filename not shown):
@@ -65,6 +65,7 @@ layers {
height: 227
width: 227
depth: 1
epsilon: 1e-05
}
layers {
name: "__crmnorm_0__"
Changes to a generated test protostr file (filename not shown):
@@ -65,6 +65,7 @@ layers {
height: 256
width: 256
depth: 1
epsilon: 1e-05
}
layers {
name: "__crmnorm_0__"
Changes to a generated test protostr file (filename not shown):
@@ -36,6 +36,7 @@ layers {
height: 6
width: 20
depth: 3
epsilon: 1e-05
}
parameters {
name: "___batch_norm_0__.w0"