diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 97aac5cf6f2..a1baf8e4658 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -185,6 +185,9 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob from paddleformers.trl.llm_utils import get_eos_token_id self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config) + data_processor_logger.info( + f"The eos_token_ids obtained by merging tokenizer and generation_config is {self.eos_token_ids}" + ) self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() self.reasoning_parser = None @@ -396,7 +399,7 @@ def process_response_dict_normal(self, response_dict, **kwargs): is_end = response_dict["finished"] req_id = response_dict["request_id"] if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): - if token_ids[-1] == self.tokenizer.eos_token_id: + if token_ids[-1] in self.eos_token_ids: token_ids = token_ids[:-1] delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: @@ -434,7 +437,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): token_ids = response_dict["outputs"]["token_ids"] if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): - if token_ids[-1] == self.tokenizer.eos_token_id: + if token_ids[-1] in self.eos_token_ids: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) response_dict["outputs"]["raw_prediction"] = delta_text diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index ea97bb5d739..d1be7af8036 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -199,6 +199,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): layer.up_gate_proj_weight, { "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)), + "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch", "model_format": extra_weight_attrs.get("model_format", ""), }, ) @@ -206,6 +207,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): layer.down_proj_weight, { "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)), + "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch", "model_format": extra_weight_attrs.get("model_format", ""), }, ) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 6be17828290..f8716369852 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -85,6 +85,8 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): else: if not quantization_config.get("is_quantized"): quantization_config["is_quantized"] = model_config.is_quantized + if args.dynamic_load_weight and quantization_config is not None: + quantization_config["is_quantized"] = True quant_cls = get_quantization_config(quant_config_name) quant_config = quant_cls.from_config(quantization_config) return quant_config