From d91d392390178284a00e4f664c8cc9bad2f9c7be Mon Sep 17 00:00:00 2001 From: zhangjunjun04 Date: Wed, 3 Sep 2025 14:12:35 +0800 Subject: [PATCH 1/4] add http get retry --- fastdeploy/entrypoints/chat_utils.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/chat_utils.py b/fastdeploy/entrypoints/chat_utils.py index eb25122e1b6..295c3657b34 100644 --- a/fastdeploy/entrypoints/chat_utils.py +++ b/fastdeploy/entrypoints/chat_utils.py @@ -15,6 +15,7 @@ """ import os +import time import uuid from copy import deepcopy from pathlib import Path @@ -32,6 +33,7 @@ from fastdeploy.multimodal.image import ImageMediaIO from fastdeploy.multimodal.video import VideoMediaIO +from fastdeploy.utils import api_server_logger class VideoURL(TypedDict, total=False): @@ -90,12 +92,32 @@ def parse_video(self, video_url): """Parse Video""" return self.load_from_url(video_url, self.video_io) + def http_get_with_retry(self, url, max_retries=3, retry_delay=1, backoff_factor=2): + """HTTP retry""" + + retry_cnt = 0 + delay = retry_delay + + while retry_cnt < max_retries: + try: + response = requests.get(url) + response.raise_for_status() + return response.content + except Exception as e: + retry_cnt += 1 + if retry_cnt >= max_retries: + api_server_logger.error(f"HTTP GET failed: {e}. Max retries reached") + raise + api_server_logger.info(f"HTTP GET failed: {e}. Start retry {retry_cnt}") + time.sleep(delay) + delay *= backoff_factor + def load_from_url(self, url, media_io): """Load media from URL""" parsed = urlparse(url) if parsed.scheme.startswith("http"): - media_bytes = requests.get(url).content + media_bytes = self.http_get_with_retry(url) return media_io.load_bytes(media_bytes) if parsed.scheme.startswith("data"): From 851930e3a73c479090fb0003e39c9aa272d06e8d Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Wed, 3 Sep 2025 14:24:30 +0800 Subject: [PATCH 2/4] fix coments --- fastdeploy/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/chat_utils.py b/fastdeploy/entrypoints/chat_utils.py index 295c3657b34..9850430a557 100644 --- a/fastdeploy/entrypoints/chat_utils.py +++ b/fastdeploy/entrypoints/chat_utils.py @@ -93,7 +93,7 @@ def parse_video(self, video_url): return self.load_from_url(video_url, self.video_io) def http_get_with_retry(self, url, max_retries=3, retry_delay=1, backoff_factor=2): - """HTTP retry""" + """HTTP GET retry""" retry_cnt = 0 delay = retry_delay From 99588a42fa7e592a4107991538a6f96bae1262dd Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Thu, 18 Sep 2025 15:09:15 +0800 Subject: [PATCH 3/4] disable prefix caching in mm model --- fastdeploy/config.py | 2 ++ fastdeploy/engine/args_utils.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index f17791f147e..a7a2cb57b0f 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1259,6 +1259,8 @@ def postprocess(self): self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs) self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) + if self.model_config.enable_mm: + self.cache_config.enable_prefix_caching = False if self.guided_decoding_backend == "auto": if current_platform.is_xpu() or self.speculative_config.method is not None: diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 2dcbde6470c..a43d1d119f2 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -392,8 +392,6 @@ def __post_init__(self): self.enable_prefix_caching = False if self.speculative_config is not None: self.enable_prefix_caching = False - if self.enable_mm: - self.enable_prefix_caching = False if not current_platform.is_cuda(): self.enable_prefix_caching = False if self.dynamic_load_weight: From cdb7cfa2e2f22f7f2c03a9377a29625d6d4bf9cb Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Thu, 18 Sep 2025 19:36:57 +0800 Subject: [PATCH 4/4] fix unit test --- fastdeploy/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index a7a2cb57b0f..7c8e8dcdfc1 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1259,7 +1259,7 @@ def postprocess(self): self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs) self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) - if self.model_config.enable_mm: + if self.model_config is not None and self.model_config.enable_mm: self.cache_config.enable_prefix_caching = False if self.guided_decoding_backend == "auto":