[Dy2Sta]Fix Segment Fault while training multi-card if params have no grad #44485

Merged: 2 commits, Jul 21, 2022
21 changes: 21 additions & 0 deletions paddle/fluid/eager/to_static/run_program_op_func.h
@@ -21,6 +21,23 @@
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/fluid/eager/utils.h"

// Filter out params that have no grad variable in the global block. For
// each such param we clear its grad edge (i.e. tag its AutogradMeta with
// stop_gradient = true) to avoid faults from the reducer during
// multi-card training.
static void clear_no_grad_edges(
const std::vector<paddle::experimental::Tensor>& params,
const paddle::framework::BlockDesc* block_desc,
egr::GradNodeBase* grad_node,
size_t slot_id) {
for (size_t i = 0; i < params.size(); ++i) {
auto p_grad_name = paddle::framework::GradVarName(params[i].name());
if (!block_desc->HasVar(p_grad_name)) {
VLOG(1) << "clear edge of " << p_grad_name;
grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear();
}
}
}
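
For reference, a minimal sketch (illustrative only, not part of this diff) of the naming convention the helper above relies on: paddle::framework::GradVarName derives a gradient variable's name from the forward variable's name by appending the "@GRAD" suffix, so a parameter has no grad exactly when the global block holds no variable under that derived name.

# Python sketch of the GradVarName convention that clear_no_grad_edges checks.
# "@GRAD" is Paddle's gradient-variable suffix; the parameter name below is
# illustrative.
def grad_var_name(name):
    return name + "@GRAD"

print(grad_var_name("embedding_0.w_0"))  # embedding_0.w_0@GRAD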

inline void run_program_dygraph_function(
const std::vector<paddle::experimental::Tensor>& x,
const std::vector<paddle::experimental::Tensor>& params,
@@ -61,12 +78,16 @@ inline void run_program_dygraph_function(
grad_node->SetAttrMap(attrs);
// Set TensorWrappers
grad_node->SetFwdX(x);

grad_node->SetFwdParams(params);
grad_node->SetStepScope(step_scope);

// Set grad out rank the same as the fwd inputs and pass stop_gradient to bwd
grad_node->SetGradOutMeta(x, /*slot id*/ 0);
grad_node->SetGradOutMeta(params, /*slot id*/ 1);
auto* global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc*,
attrs.at("global_block"));
clear_no_grad_edges(params, global_block, grad_node.get(), /*slot id*/ 1);

grad_node->SetGradInMeta(deref_out, 0);

@@ -0,0 +1,65 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.distributed as dist

import unittest


class Net(nn.Layer):

def __init__(self):
super(Net, self).__init__()
self.emb1 = nn.Embedding(100, 16)
self.emb2 = nn.Embedding(100, 16)

def forward(self, ids):
feat1 = self.emb1(ids)
feat1.stop_gradient = True  # blocks backward into emb1, so its params get no grad

feat2 = self.emb2(ids)

out = feat1 + feat2
out = paddle.mean(out)
return out
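
A quick single-process check of the no-grad condition (a sketch, not part of the test file): after one backward pass, emb1's weight has no gradient while emb2's does, which is exactly the case the reducer used to fault on under DataParallel.

net = Net()
loss = net(paddle.randint(low=0, high=100, shape=[4, 10]))
loss.backward()
print(net.emb1.weight.grad)  # None: feat1.stop_gradient blocks backward into emb1
print(net.emb2.weight.grad)  # a gradient Tensor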


def train():
paddle.distributed.init_parallel_env()
net = Net()
net = paddle.jit.to_static(net)

sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=net.parameters())
dp_net = paddle.DataParallel(net)
for i in range(4):
x = paddle.randint(low=0, high=100, shape=[4, 10])
loss = dp_net(x)
loss.backward()
sgd.step()
sgd.clear_grad()  # clear parameter gradients each step
print(loss)
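
For contrast, a single-card variant of the loop above (a sketch under the same assumptions) exercises the identical no-grad path; without DataParallel no reducer is built, so this case never triggered the segfault.

def train_single_card():
    # Same no-grad scenario on one card: emb1's parameters receive no
    # gradient, but there is no reducer to fault without DataParallel.
    net = paddle.jit.to_static(Net())
    sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=net.parameters())
    for _ in range(4):
        x = paddle.randint(low=0, high=100, shape=[4, 10])
        loss = net(x)
        loss.backward()
        sgd.step()
        sgd.clear_grad()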


class TestParamsNoGrad(unittest.TestCase):

def test_two_card(self):
if paddle.is_compiled_with_cuda() and len(
paddle.static.cuda_places()) > 1:
dist.spawn(train, nprocs=2, gpus='0,1')


if __name__ == '__main__':
unittest.main()