From d6f8afdf833e800616ead13b489acfe232a4422c Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 25 Nov 2025 18:09:33 +0800 Subject: [PATCH 1/2] disable use_sequence_parallel_moe default --- fastdeploy/config.py | 10 ++++++++++ requirements.txt | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 1a7204044c1..7c8ee326e34 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1689,10 +1689,20 @@ def postprocess(self): logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") if self.scheduler_config.splitwise_role == "mixed": + if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: + self.parallel_config.use_sequence_parallel_moe = False + logger.info( + "Warning: sequence parallel moe do not support Mixed mode with cudagraph. We set use_sequence_parallel_moe to False." + ) self.model_config.moe_phase = MoEPhase(phase="prefill") elif self.scheduler_config.splitwise_role == "prefill": self.model_config.moe_phase = MoEPhase(phase="prefill") elif self.scheduler_config.splitwise_role == "decode": + if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: + self.parallel_config.use_sequence_parallel_moe = False + logger.info( + "Warning: sequence parallel moe do not support PD's decode node with cudagraph. We set use_sequence_parallel_moe to False." + ) self.model_config.moe_phase = MoEPhase(phase="decode") else: raise NotImplementedError diff --git a/requirements.txt b/requirements.txt index c0b762958f6..2e016837dd4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,7 +37,7 @@ opentelemetry-api>=1.24.0 opentelemetry-sdk>=1.24.0 opentelemetry-instrumentation-redis opentelemetry-instrumentation-mysql -opentelemetry-distro  +opentelemetry-distro opentelemetry-exporter-otlp opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-logging From a920d6c1440dfa1b422b7d01e3cc420c0aa26b06 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 25 Nov 2025 18:29:14 +0800 Subject: [PATCH 2/2] update --- fastdeploy/config.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 7c8ee326e34..5f23598d57f 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1601,6 +1601,14 @@ def __init__( self.check() self.print() + def _disable_sequence_parallel_moe_if_needed(self, mode_name): + if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: + self.parallel_config.use_sequence_parallel_moe = False + logger.warning( + f"Sequence parallel MoE does not support {mode_name} mode with cudagraph. " + "Setting use_sequence_parallel_moe to False." + ) + def postprocess(self): """ calculate some parameters @@ -1689,20 +1697,12 @@ def postprocess(self): logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") if self.scheduler_config.splitwise_role == "mixed": - if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: - self.parallel_config.use_sequence_parallel_moe = False - logger.info( - "Warning: sequence parallel moe do not support Mixed mode with cudagraph. We set use_sequence_parallel_moe to False." - ) + self._disable_sequence_parallel_moe_if_needed("Mixed") self.model_config.moe_phase = MoEPhase(phase="prefill") elif self.scheduler_config.splitwise_role == "prefill": self.model_config.moe_phase = MoEPhase(phase="prefill") elif self.scheduler_config.splitwise_role == "decode": - if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: - self.parallel_config.use_sequence_parallel_moe = False - logger.info( - "Warning: sequence parallel moe do not support PD's decode node with cudagraph. We set use_sequence_parallel_moe to False." - ) + self._disable_sequence_parallel_moe_if_needed("PD's decode node") self.model_config.moe_phase = MoEPhase(phase="decode") else: raise NotImplementedError