diff --git a/python/paddle/distributed/model/moe/grad_clip.py b/python/paddle/distributed/model/moe/grad_clip.py
index e6b3d7d9e38cf..ed0935eefd762 100644
--- a/python/paddle/distributed/model/moe/grad_clip.py
+++ b/python/paddle/distributed/model/moe/grad_clip.py
@@ -18,7 +18,6 @@
 from paddle.fluid.dygraph import base as imperative_base
 from paddle.fluid import core, layers, framework
 from paddle.distributed import collective
-import six
 import warnings
 import copy
 
@@ -113,7 +112,6 @@ def get_l2_norm_pow(params_grads, sum_dtype=None):
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                 merge_grad = layers.merge_selected_rows(g)
                 merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-
             sum_square = _squared_l2_norm(merge_grad)
             if sum_square.dtype == core.VarDesc.VarType.FP16:
                 sum_square_list_fp16.append(sum_square)
@@ -168,8 +166,6 @@ def _dygraph_clip(self, params_grads):
 
         # why to return sum_dtype?
         # we will call `get_l2_norm_pow` twice and the precisions may be different.
-        # For example, the first dtype is float64 while the second is float32
-        # So we shuold give the first retuned dtype to the second calling to keep a higher precision.
         # For convenience and simplification, we use sum_dtype directly instead of global_norm_var_normal.dtype
         global_norm_var_normal, sum_dtype \
             = self.get_l2_norm_pow(normal_params_grads)
@@ -216,53 +212,8 @@ def _dygraph_clip(self, params_grads):
                           if g.dtype == core.VarDesc.VarType.FP16 else clip_var)
             new_grad = layers.elementwise_mul(x=g, y=clip_input)
             params_and_grads.append((p, new_grad))
-
         return params_and_grads
 
-    def _process_context(self, context, param, grad):
-        if self.group_name not in context:
-            context[self.group_name] = []
-            context[self.group_name + "_clip_value"] = self.clip_norm
-            context[self.group_name + "_clip"] = layers.fill_constant(
-                shape=[1], dtype=grad.dtype, value=self.clip_norm)
-        else:
-            if not self.clip_norm == context[self.group_name + "_clip_value"]:
-                raise ValueError(
-                    "All parameters' 'clip_norm' of a same group should be the same"
-                )
-
-        merge_grad = grad
-        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
-            merge_grad = layers.merge_selected_rows(grad)
-            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-
-        local_norm_var = _squared_l2_norm(merge_grad)
-        context[self.group_name].append(local_norm_var)
-
-        self.context = context
-
-    def _create_operators(self, param, grad):
-        group_scale_name = self.group_name + "_scale"
-        if group_scale_name not in self.context:
-            group_norm_var = layers.sums(input=self.context[self.group_name])
-            group_norm_var = layers.sqrt(x=group_norm_var)
-            clip_var = self.context[self.group_name + "_clip"]
-            group_scale_var = layers.elementwise_div(
-                x=clip_var,
-                y=layers.elementwise_max(
-                    x=clip_var, y=group_norm_var))
-            assert group_scale_var.shape == (1, )
-            self.context[group_scale_name] = group_scale_var
-
-        # inplace
-        param.block.append_op(
-            type='elementwise_mul',
-            inputs={'X': grad,
-                    'Y': self.context[group_scale_name]},
-            outputs={'Out': grad})
-
-        return param, grad
-
 
 
 ClipGradByGlobalNorm = ClipGradForMOEByGlobalNorm
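
Note for reviewers: below is a minimal NumPy sketch, with illustrative names only (clip_by_global_norm here is not a Paddle API), of the clipping rule that both the retained _dygraph_clip path and the deleted _process_context/_create_operators pair implement: every gradient is scaled by clip_norm / max(clip_norm, global_norm), where global_norm is the L2 norm taken over all gradients in the group.

# Hypothetical helper for illustration; it mirrors the removed
# layers.elementwise_div(clip_var, layers.elementwise_max(clip_var, group_norm_var))
# scaling, not any function exported by Paddle.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    # global_norm = sqrt(sum of the squared L2 norms of every gradient)
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    # scale is 1.0 when global_norm <= clip_norm, so small gradients pass through unchanged
    scale = clip_norm / max(clip_norm, global_norm)
    return [g * scale for g in grads]

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]  # global norm = 13.0
clipped = clip_by_global_norm(grads, clip_norm=1.0)    # clipped global norm ~= 1.0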