@@ -182,24 +182,28 @@ def apply_speculative_penalty_multi_scores(
from fastdeploy.model_executor.ops.gpu import (
speculate_get_token_penalty_multi_scores,
)

speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
seq_lens_this_time,
output_padding_offset,
output_cum_offsets,
max_len,
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import (
speculate_get_token_penalty_multi_scores,
)

else:
raise NotImplementedError
speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
seq_lens_this_time,
output_padding_offset,
output_cum_offsets,
max_len,
)
# inplace
return logits
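A condensed sketch of the dispatch this hunk arrives at (illustrative only: the long argument list is collapsed into `*op_args`, and `current_platform` is assumed to be imported by the surrounding module): the platform branch only selects which compiled op to import, and a single call site after the branch serves both CUDA and XPU.

```python
def apply_speculative_penalty_multi_scores(pre_token_ids, logits, *op_args):
    # Select the fused penalty op for the current platform.
    if current_platform.is_cuda():
        from fastdeploy.model_executor.ops.gpu import (
            speculate_get_token_penalty_multi_scores,
        )
    elif current_platform.is_xpu():
        from fastdeploy.model_executor.ops.xpu import (
            speculate_get_token_penalty_multi_scores,
        )
    else:
        raise NotImplementedError

    # One shared call site; the op rewrites `logits` in place.
    speculate_get_token_penalty_multi_scores(pre_token_ids, logits, *op_args)
    return logits
```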
fastdeploy/model_executor/layers/sample/sampler.py (119 additions, 0 deletions)
@@ -572,6 +572,8 @@ def __init__(self, fd_config: FDConfig):
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
elif current_platform.is_xpu():
self.forward = self.forward_xpu
else:
raise NotImplementedError
self.logprobs_mode = fd_config.model_config.logprobs_mode
@@ -814,6 +816,80 @@ def forward_cuda(

return sampler_output

def forward_xpu(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
max_model_len: int,
share_inputs: List[paddle.Tensor],
accept_all_drafts: bool = False,
reject_all_drafts: bool = False,
) -> SamplerOutput:
from fastdeploy.model_executor.ops.xpu import speculate_verify, top_p_candidates

logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
sampling_metadata.presence_penalties,
sampling_metadata.temperature,
sampling_metadata.bad_words_token_ids,
sampling_metadata.step_idx,
sampling_metadata.min_dec_lens,
sampling_metadata.eos_token_ids,
share_inputs["seq_lens_this_time"],
share_inputs["output_padding_offset"],
share_inputs["output_cum_offsets"],
max_model_len,
)

probs = F.softmax(logits)

verify_scores, verify_tokens, actual_candidate_len = top_p_candidates(
probs,
sampling_metadata.top_p,
share_inputs["output_padding_offset"],
self.speculative_max_candidate_len,
max_model_len,
)

speculate_verify(
share_inputs["accept_tokens"],
share_inputs["accept_num"],
share_inputs["step_idx"],
share_inputs["stop_flags"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs[
"draft_tokens"
], # Both input and output; the last accepted token must be written back to position 0.
share_inputs["seq_lens_this_time"],
verify_tokens,
verify_scores,
share_inputs["max_dec_len"],
sampling_metadata.eos_token_ids,
share_inputs["is_block_step"],
share_inputs["output_cum_offsets"],
actual_candidate_len,
share_inputs["actual_draft_token_num"],
sampling_metadata.top_p,
max_model_len,
self.speculative_verify_window,
True, # enable_topp
(self.speculative_benchmark_mode or reject_all_drafts),
accept_all_drafts,
)
# TODO(chenhuan09): support returning logprobs
token_ids = share_inputs["accept_tokens"]
sampler_output = SamplerOutput(
sampled_token_ids=token_ids,
logprobs_tensors=None,
token_num_per_batch=share_inputs["accept_num"],
cu_batch_token_offset=None,
)
return sampler_output
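A caller-side sketch (hypothetical runner code, not part of this diff): on XPU the accepted tokens and per-request accept counts come back through the `share_inputs` buffers that `speculate_verify` fills in place, and `SamplerOutput` simply aliases them.

```python
# Hypothetical usage; fd_config, logits, sampling_metadata, max_model_len and
# share_inputs are assumed to come from the surrounding model runner.
sampler = SpeculativeSampler(fd_config)  # __init__ binds forward to forward_xpu on XPU

sampler_output = sampler(logits, sampling_metadata, max_model_len, share_inputs)

accepted_tokens = sampler_output.sampled_token_ids    # aliases share_inputs["accept_tokens"]
accepted_counts = sampler_output.token_num_per_batch  # aliases share_inputs["accept_num"]
assert sampler_output.logprobs_tensors is None         # logprobs not yet returned on XPU
```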


class MTPSampler(nn.Layer):
""" """
@@ -823,6 +899,8 @@ def __init__(self, fd_config: FDConfig):
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
elif current_platform.is_xpu():
self.forward = self.forward_xpu
else:
raise NotImplementedError
self.logprobs_mode = fd_config.model_config.logprobs_mode
@@ -1013,3 +1091,44 @@ def forward_cuda(
cu_batch_token_offset=share_inputs["cu_batch_token_offset"],
)
return next_tokens, sampler_output

def forward_xpu(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:

logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
sampling_metadata.presence_penalties,
sampling_metadata.temperature,
sampling_metadata.bad_words_token_ids,
sampling_metadata.step_idx,
sampling_metadata.min_dec_lens,
sampling_metadata.eos_token_ids,
share_inputs["seq_lens_this_time"],
share_inputs["output_padding_offset"],
share_inputs["output_cum_offsets"],
max_model_len,
)
probs = F.softmax(logits)

_, next_tokens = top_k_top_p_sampling(
probs, sampling_metadata.top_p, sampling_metadata.top_k, sampling_metadata.top_k_list
)
# TODO(chenhuan09): add support for logprobs
token_ids = None
logprobs_tensors = None

sampler_output = SamplerOutput(
sampled_token_ids=token_ids,
logprobs_tensors=logprobs_tensors,
token_num_per_batch=None,
cu_batch_token_offset=None,
)
return next_tokens, sampler_output
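For illustration, a rough stand-alone stand-in for the fused `top_k_top_p_sampling` op used above (the real op is a FastDeploy custom kernel; this simplified Paddle version only shows the filter-then-sample idea):

```python
import paddle
import paddle.nn.functional as F

def sample_top_k_top_p(logits: paddle.Tensor, top_k: int, top_p: float) -> paddle.Tensor:
    """Simplified top-k + top-p (nucleus) sampling; illustrative only."""
    probs = F.softmax(logits, axis=-1)
    topk_probs, topk_ids = paddle.topk(probs, k=top_k, axis=-1)
    cum = paddle.cumsum(topk_probs, axis=-1)
    # Keep at least one token per row, then drop everything past the nucleus.
    keep = paddle.concat(
        [paddle.ones_like(cum[..., :1], dtype="bool"), cum[..., :-1] < top_p], axis=-1
    )
    topk_probs = paddle.where(keep, topk_probs, paddle.zeros_like(topk_probs))
    topk_probs = topk_probs / topk_probs.sum(axis=-1, keepdim=True)
    choice = paddle.multinomial(topk_probs, num_samples=1)
    return paddle.take_along_axis(topk_ids, choice, axis=-1)

next_tokens = sample_top_k_top_p(paddle.randn([4, 32000]), top_k=20, top_p=0.8)
```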