From 449587b25467d8b66add7c9f980468e034eabb52 Mon Sep 17 00:00:00 2001
From: yuanxiaolan
Date: Tue, 16 Sep 2025 13:40:48 +0800
Subject: [PATCH] fix Cfp8 for RL load

---
 fastdeploy/model_executor/layers/attention/attention.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py
index d3730c9f362..889b38820dc 100644
--- a/fastdeploy/model_executor/layers/attention/attention.py
+++ b/fastdeploy/model_executor/layers/attention/attention.py
@@ -24,6 +24,9 @@
 from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.layers.quantization.kv_cache import (
+    KvCacheQuantzationTypes,
+)
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 
 if TYPE_CHECKING:
@@ -104,6 +107,12 @@ def __init__(
 
         if fd_config.quant_config and hasattr(fd_config.quant_config, "kv_cache_quant_type"):
             self.quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(self)
+
+            # Set explicitly for the RL model path, since RL does not load a state dict
+            if fd_config.quant_config.kv_cache_quant_type == KvCacheQuantzationTypes.BLOCK_WISE_FP8:
+                self.cache_quant_type_str = "block_wise_fp8"
+                self.quant_max_bound = 448.0
+                self.quant_min_bound = -448.0
         else:
             self.quant_method = None
 