Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions fastdeploy/model_executor/layers/sample/meta_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,5 @@ class SamplingMetadata:
# Add for HPU post-processing
seq_lens_encoder: Optional[paddle.Tensor] = None
seq_lens_decoder: Optional[paddle.Tensor] = None
# Lets the sampler tell dummy/profile (warm-up) runs apart from normal inference runs
is_dummy_or_profile_run: bool = False
2 changes: 2 additions & 0 deletions fastdeploy/model_executor/layers/sample/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se
if need_top_k_sampling:
probs = dispatch_top_k_renorm_probs(probs, top_k)
next_tokens = _random_sample(probs, topp_seed=topp_seed)
if sampling_metadata.is_dummy_or_profile_run: # warmup top_p != 1.0 path
_, next_tokens = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed)
else:
_, next_tokens = top_k_top_p_sampling(
probs,
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1383,6 +1383,7 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p
top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"],
logits_processors=self.share_inputs["logits_processors"],
share_inputs=self.share_inputs,
is_dummy_or_profile_run=is_dummy_or_profile_run,

This comment was marked as outdated.

)
return token_num, token_num_event

Expand Down
Loading