From 9ac25e82313bba49d6b2a89f55538ed721fae3fb Mon Sep 17 00:00:00 2001
From: ckl117 <ckl117@163.com>
Date: Fri, 29 May 2026 11:40:32 +0800
Subject: [PATCH 1/3] add warmup for _sample_from_probs

---
 .../model_executor/layers/sample/meta_data.py |  2 ++
 .../model_executor/layers/sample/sampler.py   | 25 ++++++++++---------
 fastdeploy/worker/gpu_model_runner.py         |  1 +
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py
index d871108e737..bfb0776be45 100644
--- a/fastdeploy/model_executor/layers/sample/meta_data.py
+++ b/fastdeploy/model_executor/layers/sample/meta_data.py
@@ -67,3 +67,5 @@ class SamplingMetadata:
     # Add for HPU post-processing
     seq_lens_encoder: Optional[paddle.Tensor] = None
     seq_lens_decoder: Optional[paddle.Tensor] = None
+    # True during a dummy or profile run; the sampler uses it to also warm up the top_p != 1.0 path
+    is_dummy_or_profile_run: bool = False
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index 0c6c11265ef..4e406dfe634 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -217,18 +217,19 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se
     if top_p_list is not None:
         top_p_list = top_p_list[:token_num]
         need_top_p_sampling = any(p != 1.0 for p in top_p_list)
-    if not need_top_p_sampling and current_platform.is_cuda():
-        if need_top_k_sampling:
-            probs = dispatch_top_k_renorm_probs(probs, top_k)
-        next_tokens = _random_sample(probs, topp_seed=topp_seed)
-    else:
-        _, next_tokens = top_k_top_p_sampling(
-            probs,
-            top_p,
-            top_k,
-            top_k_list,
-            topp_seed=topp_seed,
-        )
+    for is_dummy in range(sampling_metadata.is_dummy_or_profile_run + 1):
+        if not is_dummy and not need_top_p_sampling and current_platform.is_cuda():
+            if need_top_k_sampling:
+                probs = dispatch_top_k_renorm_probs(probs, top_k)
+            next_tokens = _random_sample(probs, topp_seed=topp_seed)
+        else:
+            _, next_tokens = top_k_top_p_sampling(
+                probs,
+                top_p,
+                top_k,
+                top_k_list,
+                topp_seed=topp_seed,
+            )
     return next_tokens
 
 
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 31c7d491035..0ada1d193eb 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1383,6 +1383,7 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p
             top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"],
             logits_processors=self.share_inputs["logits_processors"],
             share_inputs=self.share_inputs,
+            is_dummy_or_profile_run=is_dummy_or_profile_run,
         )
         return token_num, token_num_event
 

From fb6123007d488e7af40e4f53f81101264fce4e7b Mon Sep 17 00:00:00 2001
From: ckl117 <ckl117@163.com>
Date: Fri, 29 May 2026 14:07:47 +0800
Subject: [PATCH 2/3] simplify code

---
 .../model_executor/layers/sample/sampler.py   | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index 4e406dfe634..b13291fad7a 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -217,19 +217,20 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se
     if top_p_list is not None:
         top_p_list = top_p_list[:token_num]
         need_top_p_sampling = any(p != 1.0 for p in top_p_list)
-    for is_dummy in range(sampling_metadata.is_dummy_or_profile_run + 1):
-        if not is_dummy and not need_top_p_sampling and current_platform.is_cuda():
-            if need_top_k_sampling:
-                probs = dispatch_top_k_renorm_probs(probs, top_k)
-            next_tokens = _random_sample(probs, topp_seed=topp_seed)
-        else:
-            _, next_tokens = top_k_top_p_sampling(
-                probs,
-                top_p,
-                top_k,
-                top_k_list,
-                topp_seed=topp_seed,
-            )
+    if not need_top_p_sampling and current_platform.is_cuda():
+        if need_top_k_sampling:
+            probs = dispatch_top_k_renorm_probs(probs, top_k)
+        next_tokens = _random_sample(probs, topp_seed=topp_seed)
+        if sampling_metadata.is_dummy_or_profile_run:  # warmup top_p != 1.0 path
+            _, _ = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed)
+    else:
+        _, next_tokens = top_k_top_p_sampling(
+            probs,
+            top_p,
+            top_k,
+            top_k_list,
+            topp_seed=topp_seed,
+        )
     return next_tokens
 
 

From 614775f26e59ad585c9fef51f9198bc630e56006 Mon Sep 17 00:00:00 2001
From: ckl117 <ckl117@163.com>
Date: Sun, 31 May 2026 12:56:10 +0800
Subject: [PATCH 3/3] warmup must return next_tokens

---
 fastdeploy/model_executor/layers/sample/sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index b13291fad7a..70d6cc029bc 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -222,7 +222,7 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se
             probs = dispatch_top_k_renorm_probs(probs, top_k)
         next_tokens = _random_sample(probs, topp_seed=topp_seed)
         if sampling_metadata.is_dummy_or_profile_run:  # warmup top_p != 1.0 path
-            _, _ = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed)
+            _, next_tokens = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed)
     else:
         _, next_tokens = top_k_top_p_sampling(
             probs,