From 3bb805ef32c1d525d21fec09b9f1e0e6068b282c Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Fri, 26 Sep 2025 22:01:11 +0800
Subject: [PATCH 1/8] =?UTF-8?q?feat:=20=E6=96=87=E6=A1=A3=E4=BF=AE?=
 =?UTF-8?q?=E5=A4=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../distributed/utils/global_gather_cn.rst    | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index 0ac995c99b4..c9d51509ac1 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -12,8 +12,8 @@ global_gather 根据 global_count 将 x 的数据收集到 n_expert * world_size
 如下图所示，world_size 是 2，n_expert 是 2，x 的 batch_size 是 4，local_count 是[2, 0, 2, 0]，0 卡的 global_count 是[2, 0, , ],
 1 卡的 global_count 是[2, 0, ,](因为篇幅问题，这里只展示在 0 卡运算的数据)，在 global_gather 算子里，
 global_count 和 local_count 的意义与其在 global_scatter 里正好相反，
-global_count[i]代表向第 (i // n_expert)张卡的第 (i % n_expert)个 expert 发送 local_expert[i]个数据，
-local_count[i]代表从第 (i // n_expert)张卡接收 global_count[i]个数据给本卡的 第(i % n_expert)个 expert。
+global_count[i] 代表向第 (i // n_expert)张卡的第 (i % n_expert)个 expert 发送 local_expert[i]个数据，
+local_count[i] 代表从第 (i // n_expert)张卡接收 global_count[i] 个数据给本卡的 第(i % n_expert)个 expert。
 发送的数据会按照每张卡的每个 expert 排列。图中的 rank0 代表第 0 张卡，rank1 代表第 1 张卡。
 
 global_gather 发送数据的流程如下：
@@ -43,8 +43,40 @@ global_gather 发送数据的流程如下：
 
 返回
 :::::::::
-Tensor，从所有 expert 接收的数据。
+Tensor，从所有 expert 接收的数据喵~
 
 代码示例
 :::::::::
-COPY-FROM: paddle.distributed.utils.global_gather
+
+.. code-block:: python
+
+    # required: distributed
+    import numpy as np
+    import paddle
+    from paddle.distributed import init_parallel_env
+    init_parallel_env()
+    n_expert = 2
+    world_size = 2
+    d_model = 2
+    in_feat = d_model
+    local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], dtype=np.float32)
+    if paddle.distributed.ParallelEnv().local_rank == 0:
+        local_count = np.array([2, 1, 1, 1])
+        global_count = np.array([2, 1, 1, 1])
+    else:
+        local_count = np.array([1, 1, 2, 1])
+        global_count = np.array([1, 1, 2, 1])
+    local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False)
+    local_count = paddle.to_tensor(local_count, dtype="int64")
+    global_count = paddle.to_tensor(global_count, dtype="int64")
+    a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count)
+    print(a)
+    # out for rank 0: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
+    # out for rank 1: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]]
+    a.stop_gradient = False
+    c = a * a
+    c.backward()
+    print("local_input_buf.grad", local_input_buf.grad)
+    # rank 0 的输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
+    # rank 1 的输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
+    # 这是彩蛋喵，希望不要被百度reviewer发现QAQ~

From e2e5a0bcf7ad738c2bf2cd6ec09b6db5f02610cb Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Fri, 26 Sep 2025 22:04:05 +0800
Subject: [PATCH 2/8] =?UTF-8?q?feat:=20=E6=A0=BC=E5=BC=8F=E5=8C=96Python?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/api/paddle/distributed/utils/global_gather_cn.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index c9d51509ac1..d612adaa3ee 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -54,6 +54,7 @@ Tensor，从所有 expert 接收的数据喵~
     import numpy as np
     import paddle
     from paddle.distributed import init_parallel_env
+    # 初始化并行环境~
     init_parallel_env()
     n_expert = 2
     world_size = 2
@@ -71,8 +72,8 @@ Tensor，从所有 expert 接收的数据喵~
     global_count = paddle.to_tensor(global_count, dtype="int64")
     a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count)
     print(a)
-    # out for rank 0: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
-    # out for rank 1: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]]
+    # rank 0 的输出: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
+    # rank 1 的输出: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]]
     a.stop_gradient = False
     c = a * a
     c.backward()

From 6bc70c7635ef84b9d43f2dc9b4061c2466d7d7c0 Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Sat, 27 Sep 2025 20:40:25 +0800
Subject: [PATCH 3/8] remove: code block

---
 .../distributed/utils/global_gather_cn.rst    | 34 +------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index d612adaa3ee..607a455dcca 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -48,36 +48,4 @@ Tensor，从所有 expert 接收的数据喵~
 代码示例
 :::::::::
 
-.. code-block:: python
-
-    # required: distributed
-    import numpy as np
-    import paddle
-    from paddle.distributed import init_parallel_env
-    # 初始化并行环境~
-    init_parallel_env()
-    n_expert = 2
-    world_size = 2
-    d_model = 2
-    in_feat = d_model
-    local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], dtype=np.float32)
-    if paddle.distributed.ParallelEnv().local_rank == 0:
-        local_count = np.array([2, 1, 1, 1])
-        global_count = np.array([2, 1, 1, 1])
-    else:
-        local_count = np.array([1, 1, 2, 1])
-        global_count = np.array([1, 1, 2, 1])
-    local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False)
-    local_count = paddle.to_tensor(local_count, dtype="int64")
-    global_count = paddle.to_tensor(global_count, dtype="int64")
-    a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count)
-    print(a)
-    # rank 0 的输出: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
-    # rank 1 的输出: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]]
-    a.stop_gradient = False
-    c = a * a
-    c.backward()
-    print("local_input_buf.grad", local_input_buf.grad)
-    # rank 0 的输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
-    # rank 1 的输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
-    # 这是彩蛋喵，希望不要被百度reviewer发现QAQ~
+COPY-FROM: paddle.distributed.utils.global_gather

From f51cb2d24bc7c77c94f2c91353f0d073bbce2484 Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Sat, 27 Sep 2025 21:53:04 +0800
Subject: [PATCH 4/8] =?UTF-8?q?remove:=20=E5=BC=82=E5=B8=B8=20COPY-FROM?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/api/paddle/distributed/utils/global_gather_cn.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index 607a455dcca..6b7a509a75b 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -45,7 +45,3 @@ global_gather 发送数据的流程如下：
 :::::::::
 Tensor，从所有 expert 接收的数据喵~
 
-代码示例
-:::::::::
-
-COPY-FROM: paddle.distributed.utils.global_gather

From cc3ec939fccfd99b6299b971fd670892255a0f13 Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Sun, 28 Sep 2025 10:26:31 +0800
Subject: [PATCH 5/8] =?UTF-8?q?feat:=20=E6=9B=B4=E6=96=B0=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../distributed/utils/global_gather_cn.rst    | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index 6b7a509a75b..eed779f95db 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -45,3 +45,42 @@ global_gather 发送数据的流程如下：
 :::::::::
 Tensor，从所有 expert 接收的数据喵~
 
+代码示例
+:::::::::
+
+.. code-block:: python
+
+  >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+  >>> import paddle
+  >>> from paddle.distributed import init_parallel_env
+  >>> from paddle.distributed.utils import moe_utils
+  >>> init_parallel_env()
+  >>> n_expert = 2
+  >>> world_size = 2
+  >>> d_model = 2
+  >>> in_feat = d_model
+  >>> local_input_buf = paddle._to_tensor(
+  ...     [[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],
+  ...     dtype='float32',
+  ...     stop_gradient=False
+  ... )
+  >>> if paddle.distributed.ParallelEnv().local_rank == 0:
+  ...     local_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
+  ...     global_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
+  >>> else:
+  ...     local_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
+  ...     global_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
+  >>> a = moe_utils.global_gather(
+  ...     local_input_buf,
+  ...     local_count,
+  ...     global_count
+  ... )
+  >>> print(a)
+  >>> # rank 0 的 输出: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
+  >>> # rank 1 的 输出: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]]
+  >>> a.stop_gradient = False
+  >>> c = a * a
+  >>> c.backward()
+  >>> print("local_input_buf.grad", local_input_buf.grad)
+  >>> # rank 0 的 输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
+  >>> # rank 1 的 输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]

From 16a71eb7e3fdc6daa8dee9ae3f4be3cecda5dd7b Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Sun, 28 Sep 2025 10:47:25 +0800
Subject: [PATCH 6/8] remove: useless word(s)/symbol(s)

---
 docs/api/paddle/distributed/utils/global_gather_cn.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index eed779f95db..96f0e60bec8 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -43,7 +43,7 @@ global_gather 发送数据的流程如下：
 
 返回
 :::::::::
-Tensor，从所有 expert 接收的数据喵~
+Tensor，从所有 expert 接收的数据
 
 代码示例
 :::::::::

From 55ab602e3041733ab6fd458cd15fbb035b9cc987 Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Tue, 30 Sep 2025 18:37:36 +0800
Subject: [PATCH 7/8] remove: code block

---
 .../distributed/utils/global_gather_cn.rst    | 40 -------------------
 1 file changed, 40 deletions(-)

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
index 96f0e60bec8..0dfa1fc7777 100644
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ b/docs/api/paddle/distributed/utils/global_gather_cn.rst
@@ -44,43 +44,3 @@ global_gather 发送数据的流程如下：
 返回
 :::::::::
 Tensor，从所有 expert 接收的数据
-
-代码示例
-:::::::::
-
-.. code-block:: python
-
-  >>> # doctest: +REQUIRES(env:DISTRIBUTED)
-  >>> import paddle
-  >>> from paddle.distributed import init_parallel_env
-  >>> from paddle.distributed.utils import moe_utils
-  >>> init_parallel_env()
-  >>> n_expert = 2
-  >>> world_size = 2
-  >>> d_model = 2
-  >>> in_feat = d_model
-  >>> local_input_buf = paddle._to_tensor(
-  ...     [[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],
-  ...     dtype='float32',
-  ...     stop_gradient=False
-  ... )
-  >>> if paddle.distributed.ParallelEnv().local_rank == 0:
-  ...     local_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
-  ...     global_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64")
-  >>> else:
-  ...     local_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
-  ...     global_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64")
-  >>> a = moe_utils.global_gather(
-  ...     local_input_buf,
-  ...     local_count,
-  ...     global_count
-  ... )
-  >>> print(a)
-  >>> # rank 0 的 输出: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]]
-  >>> # rank 1 的 输出: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]]
-  >>> a.stop_gradient = False
-  >>> c = a * a
-  >>> c.backward()
-  >>> print("local_input_buf.grad", local_input_buf.grad)
-  >>> # rank 0 的 输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]
-  >>> # rank 1 的 输出: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]]

From 64bfbf448b656e0ef9e605cb366f2d208195a7b9 Mon Sep 17 00:00:00 2001
From: ice <offical@byterain.co>
Date: Tue, 30 Sep 2025 22:11:49 +0800
Subject: [PATCH 8/8] Delete
 docs/api/paddle/distributed/utils/global_gather_cn.rst

---
 .../distributed/utils/global_gather_cn.rst    | 46 -------------------
 1 file changed, 46 deletions(-)
 delete mode 100644 docs/api/paddle/distributed/utils/global_gather_cn.rst

diff --git a/docs/api/paddle/distributed/utils/global_gather_cn.rst b/docs/api/paddle/distributed/utils/global_gather_cn.rst
deleted file mode 100644
index 0dfa1fc7777..00000000000
--- a/docs/api/paddle/distributed/utils/global_gather_cn.rst
+++ /dev/null
@@ -1,46 +0,0 @@
-.. _cn_api_paddle_distributed_utils_global_gather:
-
-global_gather
--------------------------------
-
-
-.. py:function:: paddle.distributed.utils.global_gather(x, local_count, global_count, group=None, use_calc_stream=True)
-
-global_gather 根据 global_count 将 x 的数据收集到 n_expert * world_size 个 expert，然后根据 local_count 接收数据。
-其中 expert 是用户定义的专家网络，n_expert 是指每张卡拥有的专家网络数目，world_size 是指运行网络的显卡数目。
-
-如下图所示，world_size 是 2，n_expert 是 2，x 的 batch_size 是 4，local_count 是[2, 0, 2, 0]，0 卡的 global_count 是[2, 0, , ],
-1 卡的 global_count 是[2, 0, ,](因为篇幅问题，这里只展示在 0 卡运算的数据)，在 global_gather 算子里，
-global_count 和 local_count 的意义与其在 global_scatter 里正好相反，
-global_count[i] 代表向第 (i // n_expert)张卡的第 (i % n_expert)个 expert 发送 local_expert[i]个数据，
-local_count[i] 代表从第 (i // n_expert)张卡接收 global_count[i] 个数据给本卡的 第(i % n_expert)个 expert。
-发送的数据会按照每张卡的每个 expert 排列。图中的 rank0 代表第 0 张卡，rank1 代表第 1 张卡。
-
-global_gather 发送数据的流程如下：
-
-第 0 张卡的 global_count[0]代表向第 0 张卡的第 0 个 expert 发送 2 个数据；
-
-第 0 张卡的 global_count[1]代表向第 0 张卡的第 1 个 expert 发送 0 个数据；
-
-第 1 张卡的 global_count[0]代表向第 0 张卡的第 0 个 expert 发送 2 个数据；
-
-第 1 张卡的 global_count[1]代表向第 0 张卡的第 1 个 expert 发送 0 个数据。
-
-
-.. image:: ../img/global_scatter_gather.png
-  :width: 800
-  :alt: global_scatter_gather
-  :align: center
-
-
-参数
-:::::::::
-    - **x** (Tensor) - 输入 Tensor。Tensor 的数据类型必须是 float16、float32、 float64、int32、int64。
-    - **local_count** (Tensor) - 拥有 n_expert * world_size 个数据的 Tensor，用于表示有多少数据接收。Tensor 的数据类型必须是 int64。
-    - **global_count** (Tensor) - 拥有 n_expert * world_size 个数据的 Tensor，用于表示有多少数据发送。Tensor 的数据类型必须是 int64。
-    - **group** (Group，可选) - new_group 返回的 Group 实例，或者设置为 None 表示默认地全局组。默认值：None。
-    - **use_calc_stream** (bool，可选) - 标识使用计算流还是通信流。默认值：True，表示用计算流。
-
-返回
-:::::::::
-Tensor，从所有 expert 接收的数据