diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 8d9aa9ab08f..4cf05ea989f 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1601,6 +1601,14 @@ def __init__(
         self.check()
         self.print()
 
+    def _disable_sequence_parallel_moe_if_needed(self, mode_name):
+        if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph:
+            self.parallel_config.use_sequence_parallel_moe = False
+            logger.warning(
+                f"Sequence parallel MoE does not support {mode_name} mode with cudagraph. "
+                "Setting use_sequence_parallel_moe to False."
+            )
+
     def postprocess(self):
         """
         calculate some parameters
@@ -1685,10 +1693,12 @@ def postprocess(self):
             logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!")
 
         if self.scheduler_config.splitwise_role == "mixed":
+            self._disable_sequence_parallel_moe_if_needed("Mixed")
             self.model_config.moe_phase = MoEPhase(phase="prefill")
         elif self.scheduler_config.splitwise_role == "prefill":
             self.model_config.moe_phase = MoEPhase(phase="prefill")
         elif self.scheduler_config.splitwise_role == "decode":
+            self._disable_sequence_parallel_moe_if_needed("PD's decode node")
             self.model_config.moe_phase = MoEPhase(phase="decode")
         else:
             raise NotImplementedError
diff --git a/requirements.txt b/requirements.txt
index c0b762958f6..2e016837dd4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,7 +37,7 @@ opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
 opentelemetry-instrumentation-mysql
-opentelemetry-distro  
+opentelemetry-distro
 opentelemetry-exporter-otlp
 opentelemetry-instrumentation-fastapi
 opentelemetry-instrumentation-logging
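
The sketch below is a minimal, self-contained illustration of the guard this patch adds, assuming simplified stand-in config classes (`ParallelConfig`, `GraphOptConfig`, `ConfigSketch` here are hypothetical and not FastDeploy's real `FDConfig` hierarchy): when sequence-parallel MoE and CUDA graph capture are both enabled in a "mixed" or PD-decode deployment, the flag is turned off and a warning is emitted instead of raising.

```python
# Minimal sketch of the new guard; the classes below are simplified stand-ins
# for illustration, not FastDeploy's actual configuration objects.
import logging
from dataclasses import dataclass

logger = logging.getLogger("sketch")
logging.basicConfig(level=logging.WARNING)


@dataclass
class ParallelConfig:
    use_sequence_parallel_moe: bool = True


@dataclass
class GraphOptConfig:
    use_cudagraph: bool = True


class ConfigSketch:
    def __init__(self, parallel_config, graph_opt_config):
        self.parallel_config = parallel_config
        self.graph_opt_config = graph_opt_config

    def _disable_sequence_parallel_moe_if_needed(self, mode_name):
        # The two features are incompatible in this mode, so fall back and warn
        # rather than failing the whole config postprocess.
        if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph:
            self.parallel_config.use_sequence_parallel_moe = False
            logger.warning(
                f"Sequence parallel MoE does not support {mode_name} mode with cudagraph. "
                "Setting use_sequence_parallel_moe to False."
            )


cfg = ConfigSketch(ParallelConfig(), GraphOptConfig())
cfg._disable_sequence_parallel_moe_if_needed("Mixed")
assert cfg.parallel_config.use_sequence_parallel_moe is False
```

As in the patch, the helper warns and falls back instead of raising, so an otherwise valid deployment keeps running with sequence-parallel MoE disabled; the "prefill" role is untouched because the guard is only invoked for the "mixed" and "decode" branches.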