From 9ac25e82313bba49d6b2a89f55538ed721fae3fb Mon Sep 17 00:00:00 2001 From: ckl117 Date: Fri, 29 May 2026 11:40:32 +0800 Subject: [PATCH 1/3] add warmup for _sample_from_probs --- .../model_executor/layers/sample/meta_data.py | 2 ++ .../model_executor/layers/sample/sampler.py | 25 ++++++++++--------- fastdeploy/worker/gpu_model_runner.py | 1 + 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py index d871108e737..bfb0776be45 100644 --- a/fastdeploy/model_executor/layers/sample/meta_data.py +++ b/fastdeploy/model_executor/layers/sample/meta_data.py @@ -67,3 +67,5 @@ class SamplingMetadata: # Add for HPU post-processing seq_lens_encoder: Optional[paddle.Tensor] = None seq_lens_decoder: Optional[paddle.Tensor] = None + # Add for sampler to distinguish dummy run and profile run + is_dummy_or_profile_run: bool = False diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index 0c6c11265ef..4e406dfe634 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -217,18 +217,19 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se if top_p_list is not None: top_p_list = top_p_list[:token_num] need_top_p_sampling = any(p != 1.0 for p in top_p_list) - if not need_top_p_sampling and current_platform.is_cuda(): - if need_top_k_sampling: - probs = dispatch_top_k_renorm_probs(probs, top_k) - next_tokens = _random_sample(probs, topp_seed=topp_seed) - else: - _, next_tokens = top_k_top_p_sampling( - probs, - top_p, - top_k, - top_k_list, - topp_seed=topp_seed, - ) + for is_dummy in range(sampling_metadata.is_dummy_or_profile_run + 1): + if not is_dummy and not need_top_p_sampling and current_platform.is_cuda(): + if need_top_k_sampling: + probs = dispatch_top_k_renorm_probs(probs, top_k) + next_tokens = _random_sample(probs, topp_seed=topp_seed) + else: + _, next_tokens = top_k_top_p_sampling( + probs, + top_p, + top_k, + top_k_list, + topp_seed=topp_seed, + ) return next_tokens diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 31c7d491035..0ada1d193eb 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1383,6 +1383,7 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"], logits_processors=self.share_inputs["logits_processors"], share_inputs=self.share_inputs, + is_dummy_or_profile_run=is_dummy_or_profile_run, ) return token_num, token_num_event From fb6123007d488e7af40e4f53f81101264fce4e7b Mon Sep 17 00:00:00 2001 From: ckl117 Date: Fri, 29 May 2026 14:07:47 +0800 Subject: [PATCH 2/3] simplify code --- .../model_executor/layers/sample/sampler.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index 4e406dfe634..b13291fad7a 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -217,19 +217,20 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se if top_p_list is not None: top_p_list = top_p_list[:token_num] need_top_p_sampling = any(p != 1.0 for p in top_p_list) - for is_dummy in range(sampling_metadata.is_dummy_or_profile_run + 1): - if not is_dummy and not need_top_p_sampling and current_platform.is_cuda(): - if need_top_k_sampling: - probs = dispatch_top_k_renorm_probs(probs, top_k) - next_tokens = _random_sample(probs, topp_seed=topp_seed) - else: - _, next_tokens = top_k_top_p_sampling( - probs, - top_p, - top_k, - top_k_list, - topp_seed=topp_seed, - ) + if not need_top_p_sampling and current_platform.is_cuda(): + if need_top_k_sampling: + probs = dispatch_top_k_renorm_probs(probs, top_k) + next_tokens = _random_sample(probs, topp_seed=topp_seed) + if sampling_metadata.is_dummy_or_profile_run: # warmup top_p != 1.0 path + _, _ = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed) + else: + _, next_tokens = top_k_top_p_sampling( + probs, + top_p, + top_k, + top_k_list, + topp_seed=topp_seed, + ) return next_tokens From 614775f26e59ad585c9fef51f9198bc630e56006 Mon Sep 17 00:00:00 2001 From: ckl117 Date: Sun, 31 May 2026 12:56:10 +0800 Subject: [PATCH 3/3] warmup must return next_tokens --- fastdeploy/model_executor/layers/sample/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index b13291fad7a..70d6cc029bc 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -222,7 +222,7 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se probs = dispatch_top_k_renorm_probs(probs, top_k) next_tokens = _random_sample(probs, topp_seed=topp_seed) if sampling_metadata.is_dummy_or_profile_run: # warmup top_p != 1.0 path - _, _ = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed) + _, next_tokens = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed) else: _, next_tokens = top_k_top_p_sampling( probs,