7 changes: 4 additions & 3 deletions fastdeploy/eplb/async_expert_loader.py

@@ -152,6 +152,9 @@ def load_tensor_from_shm_mem(tensor_infos, shm_ptr, logger=None):
         # NumPy does not support bfloat16, so read the raw data as uint16 first, then cast to bfloat16 with Paddle
         tmp = np_array.view(np.uint16)
         tensor = paddle.Tensor(tmp, dtype=paddle.bfloat16, place=paddle.CPUPlace(), zero_copy=True)
+    elif dtype == paddle.float8_e4m3fn:
+        tmp = np_array.view(np.uint8)
+        tensor = paddle.Tensor(tmp, dtype=paddle.float8_e4m3fn, place=paddle.CPUPlace(), zero_copy=True)
     else:
         raise TypeError(f"Unsupported dtype: {dtype}")
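The new float8 branch mirrors the bfloat16 workaround above: NumPy has no float8_e4m3fn dtype, so the shared-memory bytes stay as uint8 and Paddle reinterprets them in place. A minimal standalone sketch of the same trick, with a hypothetical buffer standing in for the shared-memory region and assuming a Paddle build that exposes paddle.float8_e4m3fn:

import numpy as np
import paddle

# Hypothetical raw buffer standing in for the shared-memory region.
np_array = np.zeros((2, 4), dtype=np.uint8)

# NumPy cannot represent float8, so view the bytes as uint8 and let
# Paddle reinterpret them; zero_copy=True shares the buffer rather
# than copying it.
tmp = np_array.view(np.uint8)
tensor = paddle.Tensor(tmp, dtype=paddle.float8_e4m3fn, place=paddle.CPUPlace(), zero_copy=True)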
@@ -294,8 +297,6 @@ def load_safetensor_fp8_from_disk(self, need_to_reload: List[Tuple[int, int]]):
         """
         up_gate_down = ["up_gate_proj", "down_proj"]
         quant_weight_scale = ["quant_weight", "weight_scale"]
-        if self.moe_quant_type == "w4a8":
-            quant_weight_scale = ["quant_weight"]
         ckpt_name = [
             (f"ernie.layers.{layer_id}.mlp.experts.{expert_id}.{proj_name}.{quant_name}")
             for layer_id, expert_id in need_to_reload

@@ -312,7 +313,7 @@ def load_safetensor_fp8_from_disk(self, need_to_reload: List[Tuple[int, int]]):
         from safetensors import safe_open

         for st_file in hf_weights_files:
-            with safe_open(st_file, framework="np", device="cpu") as f:
+            with safe_open(st_file, framework="paddle", device="cpu") as f:
                 for name in f.keys():
                     if name in ckpt_name:
                         weight = f.get_tensor(name)
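With framework="paddle", safe_open hands back paddle.Tensor objects from get_tensor directly, which sidesteps the NumPy round trip (and NumPy's missing fp8 dtype) for the weights loaded here. A short usage sketch with a hypothetical checkpoint path:

from safetensors import safe_open

# Hypothetical file name; framework="paddle" makes get_tensor return
# paddle.Tensor objects instead of NumPy arrays.
with safe_open("model-00001-of-00004.safetensors", framework="paddle", device="cpu") as f:
    for name in f.keys():
        weight = f.get_tensor(name)  # paddle.Tensor on CPU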
12 changes: 7 additions & 5 deletions fastdeploy/eplb/experts_manager.py

@@ -21,7 +21,7 @@ class RedundantExpertManager:
     RedundantExpertManger
     """

-    def __init__(self, rank=0, ep_size=32, fd_config=None):
+    def __init__(self, rank=0, ep_size=64, fd_config=None):
         self.logger = get_logger("eplb_expert_manager", "eplb_{0}.log".format(rank))

         self.rank = rank

@@ -101,7 +101,7 @@ def __init__(self, rank=0, ep_size=32, fd_config=None):
         self.http_timeout = 1
         # Reset the rearrange state: 'done' -> 'free'
         self.rearrange_end_ts = 0
-        self.rearrange_reset_interval = 300
+        self.rearrange_reset_interval = 30

         self.tensor_infos = None

@@ -250,8 +250,8 @@ def caculate_expert_rank_table(self, is_init=False):
         eplb_strategy = self.eplb_config.redundant_expert_eplb_strategy
         if is_init:
             num_groups = 1
-            num_nodes = 2
-            num_gpus = 2 * 8
+            num_nodes = 8
+            num_gpus = 8 * 8
             eplb_strategy = ""
         # eplb
         rank_expert_list, logical_to_physical_map, expert_count = rebalance_experts(

@@ -420,7 +420,9 @@ def allreduce_load_weight_result(self):
         if not exist_fail and all_success:
             # prefill needs to wait for the scheduling cordon
             if (
-                self.fd_config.splitwise_role == "decode"
+                self.fd_config.scheduler_config.splitwise_role == "mixed"
+                or self.fd_config.scheduler_config.splitwise_role == "decode"
+                or self.fd_config.scheduler_config.splitwise_role == "prefill"
                 or not self.eplb_config.redundant_expert_enable_schedule_cordon
             ):
                 self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")
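The widened condition now passes for all three splitwise roles; for readability the three equality checks could be collapsed into one membership test. A sketch of the equivalent condition (same semantics, not part of the PR):

role = self.fd_config.scheduler_config.splitwise_role
if role in ("mixed", "decode", "prefill") or not self.eplb_config.redundant_expert_enable_schedule_cordon:
    self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")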
@@ -1153,6 +1153,10 @@ def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange:
         # self.check(layer, up_gate_proj_weights, down_proj_weights)
         up_gate_proj_weight_scale = []
         down_proj_weight_scale = []
+
+        if isinstance(state_dict, list):
+            state_dict = dict(state_dict)
+
         for expert_idx in logical_expert_ids:
             up_gate_proj_weight_scale.append(
                 get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx)))
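The new guard (here and in the identical hunk below) accepts a state_dict delivered as a list of (name, tensor) pairs: dict() consumes an iterable of key/value pairs, after which state_dict.pop(...) works as before. A toy illustration with hypothetical keys and scalar stand-ins for tensors:

# state_dict may arrive as [(name, tensor), ...] instead of a mapping.
t0, t1 = 0.5, 0.25  # stand-ins for weight-scale tensors
state_dict = [("experts.0.up_gate_proj.weight_scale", t0),
              ("experts.0.down_proj.weight_scale", t1)]
if isinstance(state_dict, list):
    state_dict = dict(state_dict)  # dict() accepts key/value pairs
scale = state_dict.pop("experts.0.up_gate_proj.weight_scale")  # 0.5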
@@ -86,6 +86,10 @@ def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange:
         # self.check(layer, up_gate_proj_weights, down_proj_weights)
         up_gate_proj_weight_scale = []
         down_proj_weight_scale = []
+
+        if isinstance(state_dict, list):
+            state_dict = dict(state_dict)
+
         for expert_idx in logical_expert_ids:
             up_gate_proj_expert_weight_scale_key_name = up_gate_proj_expert_weight_scale_key.format(expert_idx)
             down_proj_expert_weight_scale_key_name = down_proj_expert_weight_scale_key.format(expert_idx)
8 changes: 8 additions & 0 deletions fastdeploy/worker/worker_process.py

@@ -260,13 +260,21 @@ def update_weights_from_tensor(self, mmap_infos):
         """
         update_weights_from_tensor
         """
+        import time
+
+        while True:
+            if self.experts_manager.tensor_infos is None:
+                time.sleep(0.1)
+            else:
+                break
         state_dicts = load_tensor_from_shm_mem(self.experts_manager.tensor_infos, mmap_infos[MODEL_MAIN_NAME], logger)
         rank_expert_list, logical_to_physical_map, expert_count = self.experts_manager.get_ep_rank_to_expert_id_list()
         self.worker.get_model().redundant_table_manger.update_expert_rank_table(
             rank_expert_list, logical_to_physical_map, expert_count
         )
+        # TO BE FIXED
         self.worker.get_model().update_state_dict(state_dicts)
         self.experts_manager.tensor_infos = None

     def _broadcast_model_weights_signal(self, src: int, group) -> int:
         model_weights_signal_tensor = paddle.full(shape=[1], fill_value=self.model_weights_signal[0], dtype="int32")
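The added loop busy-polls experts_manager.tensor_infos every 100 ms until the EPLB thread publishes it. If the producer side can be touched, a threading.Event expresses the same handshake without polling; a sketch under that assumption (the Event and helper are hypothetical, not part of the PR):

import threading

tensor_infos_ready = threading.Event()  # hypothetical; set() by the producer

def wait_for_tensor_infos(experts_manager, timeout=None):
    # Block until the producer publishes tensor_infos and calls
    # tensor_infos_ready.set(), instead of sleeping in 0.1 s slices.
    if not tensor_infos_ready.wait(timeout):
        raise TimeoutError("tensor_infos was not published in time")
    return experts_manager.tensor_infos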