# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

import paddle
from paddle import _C_ops, in_dynamic_mode
from paddle.fluid.layer_helper import LayerHelper
from paddle.framework import no_grad
from paddle.nn.layer.norm import _BatchNormBase


class BatchNorm(paddle.nn.BatchNorm1D):
    r"""
    Applies Batch Normalization over a SparseCooTensor as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_ .

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. Calculated as follows:

    ..  math::

        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter
    - :math:`\beta` : trainable shift parameter

    Parameters:
        num_features(int): Indicates the number of channels of the input ``Tensor``.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for the Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
            If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
            If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
        data_format(str, optional): Specify the input data format. Only "NDHWC" is supported. Default: "NDHWC".
        use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False,
            use the statistics of one mini-batch; if set to True, use the global statistics; if set to None,
            use global statistics in the test phase and the statistics of one mini-batch in the training
            phase. Default: None.
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - x: A SparseCooTensor with layout = 'NDHWC'.
        - output: SparseCooTensor with same shape as input x.

    Returns:
        None.

    Examples:
        .. code-block:: python

            import paddle

            paddle.seed(123)
            channels = 3
            x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32')
            dense_x = paddle.to_tensor(x_data)
            sparse_x = dense_x.to_sparse_coo(4)
            batch_norm = paddle.sparse.nn.BatchNorm(channels)
            batch_norm_out = batch_norm(sparse_x)
            print(batch_norm_out.shape)
            # [1, 6, 6, 6, 3]
    """

    def __init__(
        self,
        num_features,
        momentum=0.9,
        epsilon=1e-05,
        weight_attr=None,
        bias_attr=None,
        data_format='NDHWC',
        use_global_stats=None,
        name=None,
    ):
        super(BatchNorm, self).__init__(
            num_features,
            momentum=momentum,
            epsilon=epsilon,
            weight_attr=weight_attr,
            bias_attr=bias_attr,
            data_format=data_format,
            use_global_stats=use_global_stats,
            name=name,
        )

    def _check_data_format(self, input):
        if input != "NDHWC":
            raise ValueError('sparse BatchNorm only supports layout of "NDHWC"')

    def forward(self, input):
        self._check_data_format(self._data_format)

        if self.training:
            warnings.warn(
                "When training, we now always track global mean and variance."
            )

        # If use_global_stats is not specified, fall back to the running
        # statistics in eval mode and to the mini-batch statistics in training.
        if self._use_global_stats is None:
            self._use_global_stats = not self.training
            trainable_statistics = False
        else:
            trainable_statistics = not self._use_global_stats

        # Map the layout string to the channel-first/channel-last convention
        # expected by the kernel ('NDHWC' maps to 'NHWC').
        data_format = 'NCHW' if self._data_format[1] == 'C' else 'NHWC'

        if in_dynamic_mode():
            batch_norm_out, _, _, _, _, _ = _C_ops.sparse_batch_norm_(
                input,
                self.weight,
                self.bias,
                self._mean,
                self._variance,
                self._momentum,
                self._epsilon,
                data_format,
                not self.training,
                self._use_global_stats,
                trainable_statistics,
                False,
            )
            return batch_norm_out
        else:
            # Static graph mode: append the sparse_batch_norm op via LayerHelper.
            inputs = {
                'x': input,
                'scale': self.weight,
                'bias': self.bias,
                'mean': self._mean,
                'variance': self._variance,
            }
            attrs = {
                'momentum': self._momentum,
                'epsilon': self._epsilon,
                'data_layout': data_format,
                'is_test': not self.training,
                'use_global_stats': self._use_global_stats,
                'trainable_statistics': trainable_statistics,
                'fuse_with_relu': False,
            }
            op_type = 'sparse_batch_norm'
            helper = LayerHelper(op_type)
            dtype = input.dtype
            mean_out = helper.create_variable_for_type_inference(
                dtype=dtype, stop_gradient=True
            )
            variance_out = helper.create_variable_for_type_inference(
                dtype=dtype, stop_gradient=True
            )
            saved_mean = helper.create_variable_for_type_inference(
                dtype=dtype, stop_gradient=True
            )
            saved_variance = helper.create_variable_for_type_inference(
                dtype=dtype, stop_gradient=True
            )
            reserve_space = helper.create_variable_for_type_inference(
                dtype=dtype, stop_gradient=True
            )
            out = helper.create_sparse_variable_for_type_inference(dtype)
            outputs = {
                "out": out,
                "mean_out": mean_out,
                "variance_out": variance_out,
                "saved_mean": saved_mean,
                "saved_variance": saved_variance,
                "reserve_space": reserve_space,
            }
            helper.append_op(
                type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
            )
            return out
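
# Illustrative sketch only, not part of the public API: it assumes (as the
# docstring above suggests) that sparse BatchNorm normalizes the stored values
# of the COO tensor per channel, i.e. it behaves roughly like BatchNorm1D
# applied to `x.values()` of shape (nnz, channels).
#
#     import paddle
#     paddle.seed(123)
#     channels = 3
#     dense_x = paddle.randn((1, 6, 6, 6, channels))
#     sparse_x = dense_x.to_sparse_coo(4)
#     sparse_out = paddle.sparse.nn.BatchNorm(channels)(sparse_x)
#     dense_out = paddle.nn.BatchNorm1D(channels)(sparse_x.values())
#     # sparse_out.values() and dense_out should be close (up to initialization
#     # and numerical differences); this is only an illustrative comparison.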


class SyncBatchNorm(paddle.nn.SyncBatchNorm):
    r"""
    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully connected
    operations.

    The data is normalized by the mean and variance of the channel based on the whole mini-batch,
    which includes the data on all GPUs.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When the model is in training mode, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of the whole mini-batch data on all GPUs.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    - :math:`x` : whole mini-batch data on all GPUs
    - :math:`m` : the size of the whole mini-batch data

    When the model is in evaluation mode, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance,
    usually obtained from a pre-trained model). Global statistics are calculated as follows:

    ..  math::

        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

    The formula of normalization is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter vector
    - :math:`\beta` : trainable shift parameter vector

    Note:
        If you want to use a container to pack your model and have ``SyncBatchNorm`` in the
        evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of
        ``list`` to pack the model.

    Parameters:
        num_features(int): Indicates the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for the Parameter `scale`
            of this layer. If it is set to None or one attribute of ParamAttr, this layer
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. If it is set to False,
            this layer will not have a trainable scale parameter. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
            If it is set to None or one attribute of ParamAttr, this layer
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. If it is set to False, this layer will not
            have a trainable bias parameter. Default: None.
        data_format(str, optional): Specify the input data format, may be "NCHW". Default: "NCHW".
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.

    Shapes:
        input: Tensor whose dimension is from 2 to 5.
        output: Tensor with the same shape as input.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle
            import paddle.sparse.nn as nn

            x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]], dtype='float32')
            x = x.to_sparse_coo(len(x.shape)-1)

            if paddle.is_compiled_with_cuda():
                sync_batch_norm = nn.SyncBatchNorm(2)
                hidden1 = sync_batch_norm(x)
                print(hidden1)
                # Tensor(shape=[1, 2, 2, 2], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
                #        indices=[[0, 0, 0, 0],
                #                 [0, 0, 1, 1],
                #                 [0, 1, 0, 1]],
                #        values=[[-0.40730840, -0.13725480],
                #                [-0.40730840, -1.20299828],
                #                [ 1.69877410, -0.23414057],
                #                [-0.88415730,  1.57439375]])
    """

    def __init__(
        self,
        num_features,
        momentum=0.9,
        epsilon=1e-05,
        weight_attr=None,
        bias_attr=None,
        data_format='NCHW',
        name=None,
    ):
        super(SyncBatchNorm, self).__init__(
            num_features,
            momentum,
            epsilon,
            weight_attr,
            bias_attr,
            data_format,
            name,
        )

    def forward(self, x):
        self._check_data_format()
        sync_batch_norm_out, _, _, _, _, _ = _C_ops.sparse_sync_batch_norm_(
            x,
            self.weight,
            self.bias,
            self._mean,
            self._variance,
            self._momentum,
            self._epsilon,
            self._data_format,
            not self.training,
            False,
            False,
            False,
        )
        return sync_batch_norm_out
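
    # Illustrative only: SyncBatchNorm synchronizes statistics across GPUs, so it
    # is normally used under a data-parallel setup. A minimal sketch, assuming a
    # multi-GPU environment and a user-defined `SparseModel` that contains sparse
    # SyncBatchNorm layers (both are assumptions, not part of this module):
    #
    #     import paddle
    #     import paddle.distributed as dist
    #
    #     dist.init_parallel_env()
    #     model = paddle.DataParallel(SparseModel())
    #     out = model(sparse_input)  # statistics are reduced across all ranks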

    @classmethod
    def convert_sync_batchnorm(cls, layer):
        r"""
        Helper function to convert :class:`paddle.sparse.nn.BatchNorm` layers in the model
        to :class:`paddle.sparse.nn.SyncBatchNorm` layers.

        Parameters:
            layer(paddle.nn.Layer): model containing one or more `BatchNorm` layers.

        Returns:
            The original model with its `BatchNorm` layers converted to `SyncBatchNorm` layers.

        Examples:
            .. code-block:: python

                import paddle
                import paddle.sparse.nn as nn

                model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5))
                sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        """
        layer_output = layer
        if isinstance(layer, _BatchNormBase):
            if (
                layer._weight_attr is not None
                and not isinstance(layer._weight_attr, bool)
                and layer._weight_attr.name is not None
            ):
                layer._weight_attr.name = layer._weight_attr.name + '_sync'
            if (
                layer._bias_attr is not None
                and not isinstance(layer._bias_attr, bool)
                and layer._bias_attr.name is not None
            ):
                layer._bias_attr.name = layer._bias_attr.name + '_sync'

            # convert sparse BatchNorm
            if isinstance(layer, BatchNorm):
                layer_output = SyncBatchNorm(
                    layer._num_features,
                    layer._momentum,
                    layer._epsilon,
                    layer._weight_attr,
                    layer._bias_attr,
                    layer._data_format,
                    layer._name,
                )
            # convert dense BatchNorm
            else:
                layer_output = paddle.nn.SyncBatchNorm(
                    layer._num_features,
                    layer._momentum,
                    layer._epsilon,
                    layer._weight_attr,
                    layer._bias_attr,
                    layer._data_format,
                    layer._name,
                )

            if layer._weight_attr != False and layer._bias_attr != False:
                with no_grad():
                    layer_output.weight = layer.weight
                    layer_output.bias = layer.bias
            layer_output._mean = layer._mean
            layer_output._variance = layer._variance

        for name, sublayer in layer.named_children():
            layer_output.add_sublayer(
                name, cls.convert_sync_batchnorm(sublayer)
            )
        del layer
        return layer_output
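

# Illustrative sketch only, not part of the module: because convert_sync_batchnorm
# assigns `weight`, `bias`, `_mean` and `_variance` from the source layer (see above),
# the converted layer shares the original parameters and running statistics rather
# than copying them. Assuming the `model` from the docstring example, built with the
# default weight_attr/bias_attr (an assumption for this sketch):
#
#     bn = model[1]                          # sparse BatchNorm before conversion
#     sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
#     sync_bn = sync_model[1]                # converted sparse SyncBatchNorm
#     assert sync_bn.weight is bn.weight     # parameters are shared, not copied
#     assert sync_bn._mean is bn._mean       # running statistics are shared too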