From bcca44e312d36edcf691fb1fe23ad54ec46d29b4 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 Date: Tue, 9 Sep 2025 12:03:10 +0800 Subject: [PATCH 1/2] mm_post_fix --- .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 14 ----------- fastdeploy/worker/gpu_model_runner.py | 23 +++++++++---------- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 5c6123627d8..4e894a314f5 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -48,7 +48,6 @@ if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import ( - extract_text_token_output, text_image_gather_scatter, text_image_index_out, ) @@ -516,19 +515,6 @@ def forward( ) hidden_states = hidden_states + residual - - # ----------------------- - max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1) - hidden_states = extract_text_token_output( - max_seq_len, - max_seq_len_index.cast("int32"), - image_token_num.cast("int32"), - forward_meta.seq_lens_this_time, - forward_meta.cu_seqlens_q, - hidden_states.cast("float32"), - ).cast(self._dtype) - # ----------------------- - out = self.norm(hidden_states) return out diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 53591ca5971..680b1e1b34b 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1281,24 +1281,23 @@ def _dummy_run( self.share_inputs["image_features"], self.forward_meta, ) - hidden_states = model_output else: model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) - hidden_states = rebuild_padding( - model_output, - self.share_inputs["cu_seqlens_q"], - self.share_inputs["seq_lens_this_time"], - self.share_inputs["seq_lens_decoder"], - self.share_inputs["seq_lens_encoder"], - ( - self.share_inputs["output_padding_offset"] if self.speculative_decoding else None - ), # speculative decoding requires - self.parallel_config.max_model_len, - ) + hidden_states = rebuild_padding( + model_output, + self.share_inputs["cu_seqlens_q"], + self.share_inputs["seq_lens_this_time"], + self.share_inputs["seq_lens_decoder"], + self.share_inputs["seq_lens_encoder"], + ( + self.share_inputs["output_padding_offset"] if self.speculative_decoding else None + ), # speculative decoding requires + self.parallel_config.max_model_len, + ) # 4. Execute spec decode logits = self.model.compute_logits(hidden_states) From ceb78bcf36c07890cb8f9971f60054c3468f9caf Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 Date: Tue, 9 Sep 2025 15:58:57 +0800 Subject: [PATCH 2/2] mm_post_fix_1 --- fastdeploy/worker/gpu_model_runner.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 680b1e1b34b..498026cd7c1 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1590,21 +1590,20 @@ class at the server level, which is too granular for ModelRunner. self.share_inputs["image_features"], self.forward_meta, ) - hidden_states = model_output else: model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], forward_meta=self.forward_meta, ) - hidden_states = rebuild_padding( - model_output, - self.share_inputs["cu_seqlens_q"], - self.share_inputs["seq_lens_this_time"], - self.share_inputs["seq_lens_decoder"], - self.share_inputs["seq_lens_encoder"], - (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), - self.parallel_config.max_model_len, - ) + hidden_states = rebuild_padding( + model_output, + self.share_inputs["cu_seqlens_q"], + self.share_inputs["seq_lens_this_time"], + self.share_inputs["seq_lens_decoder"], + self.share_inputs["seq_lens_encoder"], + (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), + self.parallel_config.max_model_len, + ) # 4. Compute logits, Sample logits = self.model.compute_logits(hidden_states)