From e7b769376cd6836a3407f07d3ca987a55a130888 Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Tue, 4 Nov 2025 20:03:45 +0800 Subject: [PATCH 1/5] process transparent image --- .../input/ernie4_5_vl_processor/process.py | 2 + .../ernie4_5_vl_processor/utils/io_utils.py | 45 +++++++++++++++++++ .../input/paddleocr_vl_processor/process.py | 6 ++- fastdeploy/input/qwen_vl_processor/process.py | 6 ++- 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index 4ccdf287f20..c83abf7c0b0 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -36,6 +36,7 @@ from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor from .process_video import read_frames_decord, read_video_decord +from .utils.io_utils import process_transparent_image from .utils.render_timestamp import render_frame_timestamp @@ -349,6 +350,7 @@ def _add_text(self, tokens, outputs: Dict) -> None: outputs["cur_position"] += len(tokens) def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: + img = process_transparent_image(img) patches_h, patches_w = self.image_preprocessor.get_smarted_resize( img.height, img.width, diff --git a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py index 1535b64d4f0..66ec398c874 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py +++ b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py @@ -107,3 +107,48 @@ def get_downloadable( retry_interval=retry_interval, ) return downloaded_path + + +def has_transparent_background(img): + """判断图片是否有背景""" + if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info): + # Check for any pixel with alpha channel less than 255 (fully opaque) + alpha = img.convert("RGBA").split()[-1] + if alpha.getextrema()[0] < 255: + return True + return False + + +def add_white_background(img): + """ + 给透明背景的图,加个白色背景 + """ + if img.mode != "RGBA": + img = img.convert("RGBA") + # 创建一个白色背景的图像,尺寸与原图一致 + img_white_background = Image.new("RGBA", img.size, (255, 255, 255)) + + # 将原图粘贴到白色背景上 + img_white_background.paste(img, (0, 0), img) + + return img_white_background + + +def change_I16_to_L(img): + """ + 将图片从I;16模式转换为L模式 + """ + # 由于I模式的point函数只支持加减乘,所以下面的* (1 / 256)不能改成除法 + return img.point(lambda i: i * (1 / 256)).convert("L") + + +def process_transparent_image(img): + try: + if img.mode == "I;16": + img = change_I16_to_L(img) + if has_transparent_background(img): + img = add_white_background(img) + except Exception: + pass + + return img.convert("RGB") diff --git a/fastdeploy/input/paddleocr_vl_processor/process.py b/fastdeploy/input/paddleocr_vl_processor/process.py index 97cc6ebc82a..0d0e17a6dd6 100644 --- a/fastdeploy/input/paddleocr_vl_processor/process.py +++ b/fastdeploy/input/paddleocr_vl_processor/process.py @@ -26,6 +26,9 @@ from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages from fastdeploy.input.ernie4_5_vl_processor import read_video_decord +from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import ( + process_transparent_image, +) from fastdeploy.input.utils import IDS_TYPE_FLAG from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger @@ -349,7 +352,8 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - Adds image token IDs and type markers - Generates appropriate position embeddings """ - ret = self.image_processor.preprocess(images=[img.convert("RGB")]) + img = process_transparent_image(img) + ret = self.image_processor.preprocess(images=[img]) num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 grid_thw = ret["grid_thw"].tolist() diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py index 050489ad036..477899a6687 100644 --- a/fastdeploy/input/qwen_vl_processor/process.py +++ b/fastdeploy/input/qwen_vl_processor/process.py @@ -26,6 +26,9 @@ from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages from fastdeploy.input.ernie4_5_vl_processor import read_video_decord +from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import ( + process_transparent_image, +) from fastdeploy.input.utils import IDS_TYPE_FLAG from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger @@ -346,7 +349,8 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - Adds image token IDs and type markers - Generates appropriate position embeddings """ - ret = self.image_processor.preprocess(images=[img.convert("RGB")]) + img = process_transparent_image(img) + ret = self.image_processor.preprocess(images=[img]) num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 grid_thw = ret["grid_thw"].tolist() From 6108b17e1d817cfb32c6fe153d049dc149dd6a8a Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Tue, 4 Nov 2025 22:11:17 +0800 Subject: [PATCH 2/5] english comments --- fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py index 66ec398c874..c9c832291f3 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py +++ b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py @@ -125,10 +125,10 @@ def add_white_background(img): """ if img.mode != "RGBA": img = img.convert("RGBA") - # 创建一个白色背景的图像,尺寸与原图一致 + # Create an image with white background, which is the same size as the original image img_white_background = Image.new("RGBA", img.size, (255, 255, 255)) - # 将原图粘贴到白色背景上 + # Paste the original image to white background img_white_background.paste(img, (0, 0), img) return img_white_background @@ -138,7 +138,7 @@ def change_I16_to_L(img): """ 将图片从I;16模式转换为L模式 """ - # 由于I模式的point函数只支持加减乘,所以下面的* (1 / 256)不能改成除法 + # Only supports addition/subtraction/multiplication, * (1 / 256) cannot be replaced with division return img.point(lambda i: i * (1 / 256)).convert("L") From cf8841b5503789a6a931a5f7313b4308a1a5c1e9 Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Wed, 5 Nov 2025 11:37:23 +0800 Subject: [PATCH 3/5] process transparency at downloading --- .../input/ernie4_5_vl_processor/process.py | 2 - .../ernie4_5_vl_processor/utils/io_utils.py | 45 ------------------- .../input/paddleocr_vl_processor/process.py | 6 +-- fastdeploy/input/qwen_vl_processor/process.py | 6 +-- fastdeploy/multimodal/image.py | 3 ++ 5 files changed, 5 insertions(+), 57 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py index c83abf7c0b0..4ccdf287f20 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/process.py +++ b/fastdeploy/input/ernie4_5_vl_processor/process.py @@ -36,7 +36,6 @@ from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor from .process_video import read_frames_decord, read_video_decord -from .utils.io_utils import process_transparent_image from .utils.render_timestamp import render_frame_timestamp @@ -350,7 +349,6 @@ def _add_text(self, tokens, outputs: Dict) -> None: outputs["cur_position"] += len(tokens) def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - img = process_transparent_image(img) patches_h, patches_w = self.image_preprocessor.get_smarted_resize( img.height, img.width, diff --git a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py index c9c832291f3..1535b64d4f0 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py +++ b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py @@ -107,48 +107,3 @@ def get_downloadable( retry_interval=retry_interval, ) return downloaded_path - - -def has_transparent_background(img): - """判断图片是否有背景""" - if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info): - # Check for any pixel with alpha channel less than 255 (fully opaque) - alpha = img.convert("RGBA").split()[-1] - if alpha.getextrema()[0] < 255: - return True - return False - - -def add_white_background(img): - """ - 给透明背景的图,加个白色背景 - """ - if img.mode != "RGBA": - img = img.convert("RGBA") - # Create an image with white background, which is the same size as the original image - img_white_background = Image.new("RGBA", img.size, (255, 255, 255)) - - # Paste the original image to white background - img_white_background.paste(img, (0, 0), img) - - return img_white_background - - -def change_I16_to_L(img): - """ - 将图片从I;16模式转换为L模式 - """ - # Only supports addition/subtraction/multiplication, * (1 / 256) cannot be replaced with division - return img.point(lambda i: i * (1 / 256)).convert("L") - - -def process_transparent_image(img): - try: - if img.mode == "I;16": - img = change_I16_to_L(img) - if has_transparent_background(img): - img = add_white_background(img) - except Exception: - pass - - return img.convert("RGB") diff --git a/fastdeploy/input/paddleocr_vl_processor/process.py b/fastdeploy/input/paddleocr_vl_processor/process.py index 0d0e17a6dd6..97cc6ebc82a 100644 --- a/fastdeploy/input/paddleocr_vl_processor/process.py +++ b/fastdeploy/input/paddleocr_vl_processor/process.py @@ -26,9 +26,6 @@ from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages from fastdeploy.input.ernie4_5_vl_processor import read_video_decord -from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import ( - process_transparent_image, -) from fastdeploy.input.utils import IDS_TYPE_FLAG from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger @@ -352,8 +349,7 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - Adds image token IDs and type markers - Generates appropriate position embeddings """ - img = process_transparent_image(img) - ret = self.image_processor.preprocess(images=[img]) + ret = self.image_processor.preprocess(images=[img.convert("RGB")]) num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 grid_thw = ret["grid_thw"].tolist() diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py index 477899a6687..050489ad036 100644 --- a/fastdeploy/input/qwen_vl_processor/process.py +++ b/fastdeploy/input/qwen_vl_processor/process.py @@ -26,9 +26,6 @@ from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages from fastdeploy.input.ernie4_5_vl_processor import read_video_decord -from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import ( - process_transparent_image, -) from fastdeploy.input.utils import IDS_TYPE_FLAG from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger @@ -349,8 +346,7 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - Adds image token IDs and type markers - Generates appropriate position embeddings """ - img = process_transparent_image(img) - ret = self.image_processor.preprocess(images=[img]) + ret = self.image_processor.preprocess(images=[img.convert("RGB")]) num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 grid_thw = ret["grid_thw"].tolist() diff --git a/fastdeploy/multimodal/image.py b/fastdeploy/multimodal/image.py index cfbc40de088..97e73fc37d1 100644 --- a/fastdeploy/multimodal/image.py +++ b/fastdeploy/multimodal/image.py @@ -22,6 +22,7 @@ from PIL import Image from .base import MediaIO +from .utils import process_transparency class ImageMediaIO(MediaIO[Image.Image]): @@ -59,6 +60,7 @@ def load_bytes(self, data: bytes) -> Image.Image: """ image = Image.open(BytesIO(data)) image.load() + image = process_transparency(image) return image.convert(self.image_mode) def load_base64(self, media_type: str, data: str) -> Image.Image: @@ -93,6 +95,7 @@ def load_file(self, filepath: str) -> Image.Image: """ image = Image.open(filepath) image.load() + image = process_transparency(image) return image.convert(self.image_mode) def load_file_request(self, request: Any) -> Image.Image: From 47a7ca08f8b9598040ebdbb485944d8e5d0303cd Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Wed, 5 Nov 2025 13:05:42 +0800 Subject: [PATCH 4/5] fix --- fastdeploy/multimodal/image.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/multimodal/image.py b/fastdeploy/multimodal/image.py index 97e73fc37d1..ce54ba4d965 100644 --- a/fastdeploy/multimodal/image.py +++ b/fastdeploy/multimodal/image.py @@ -115,6 +115,7 @@ def load_file_request(self, request: Any) -> Image.Image: """ image = Image.open(requests.get(request, stream=True).raw) image.load() + image = process_transparency(image) return image.convert(self.image_mode) def encode_base64( From 2ad700130be7ee438d8689cb0fdd29dc1556745a Mon Sep 17 00:00:00 2001 From: ApplEOFDiscord Date: Wed, 5 Nov 2025 14:24:30 +0800 Subject: [PATCH 5/5] remove useless codes --- fastdeploy/multimodal/utils.py | 122 ++------------------------------- 1 file changed, 4 insertions(+), 118 deletions(-) diff --git a/fastdeploy/multimodal/utils.py b/fastdeploy/multimodal/utils.py index fa67be2a383..fa3ad4cbe22 100644 --- a/fastdeploy/multimodal/utils.py +++ b/fastdeploy/multimodal/utils.py @@ -14,141 +14,27 @@ # limitations under the License. """ -import base64 -import io -import ipaddress -import mimetypes -import os -import socket -import subprocess -import tempfile -from urllib.parse import urlparse - -import cairosvg -import pyheif -import requests -from pdf2image import convert_from_path from PIL import Image, ImageOps from fastdeploy.utils import data_processor_logger -def process_image_data(image_data, mime_type, url): - """处理不同类型的图像数据并返回 PIL 图像对象""" - - if mime_type in ["image/heif", "image/heic"] or url.lower().endswith(".heif") or url.lower().endswith(".heic"): - heif_file = pyheif.read(image_data) - pil_image = Image.frombytes( - heif_file.mode, - heif_file.size, - heif_file.data, - "raw", - heif_file.mode, - heif_file.stride, - ) - elif mime_type == "application/pdf" or url.lower().endswith(".pdf"): - with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf: - temp_pdf.write(image_data.getvalue()) - temp_pdf_path = temp_pdf.name - images = convert_from_path(temp_pdf_path) - pil_image = images[0] - os.remove(temp_pdf_path) - elif mime_type == "image/svg+xml" or url.lower().endswith(".svg"): - png_data = cairosvg.svg2png(bytestring=image_data.getvalue()) - pil_image = Image.open(io.BytesIO(png_data)) - elif mime_type in [ - "application/postscript", - "application/illustrator", - ] or url.lower().endswith(".ai"): - with ( - tempfile.NamedTemporaryFile(delete=False, suffix=".ai") as ai_temp, - tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as pdf_temp, - ): - ai_temp_path = ai_temp.name - pdf_temp_path = pdf_temp.name - ai_temp.write(image_data.getvalue()) - ai_temp.close() - subprocess.run( - ["inkscape", ai_temp_path, "--export-pdf=" + pdf_temp_path], - check=True, - ) - images = convert_from_path(pdf_temp_path) - pil_image = images[0] - os.remove(ai_temp_path) - os.remove(pdf_temp_path) - - elif mime_type == "image/gif" or url.lower().endswith(".gif"): - pil_image = Image.open(image_data) - else: - pil_image = Image.open(image_data) - - return pil_image - - -def http_to_pil_image(url): - """http_to_pil_image""" - - response = requests.get(url) - if response.status_code != 200: - raise Exception("Failed to download the image from URL.") - image_data = io.BytesIO(response.content) - - mime_type = response.headers.get("Content-Type") - if mime_type is None: - mime_type, _ = mimetypes.guess_type(url) - - data_processor_logger.info(f"Detected MIME type: {mime_type}") # 调试信息 - pil_image = process_image_data(image_data, mime_type, url) - - return pil_image - - -def base64_to_pil_image(base64_string): - """base64_to_pil_image""" - image_bytes = base64.b64decode(base64_string) - buffer = io.BytesIO(image_bytes) - pil_image = Image.open(buffer) - return pil_image - - -def is_public_url(url): - """判断是否公网url""" - try: - # 解析URL - parsed_url = urlparse(url) - hostname = parsed_url.hostname - if hostname is None: - return False - # 尝试将域名解析为IP地址 - ip_address = socket.gethostbyname(hostname) - # 转换为IP地址对象 - ip_obj = ipaddress.ip_address(ip_address) - # 判断是否为私有IP或保留IP地址 - if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local or ip_obj.is_reserved: - return False - else: - return True - except Exception as e: - print(f"Error checking URL: {e}") - return False - - def process_transparency(image): """process transparency.""" def _is_transparent(image): - # 检查图片是否有alpha通道 + # Check if image has alpha channel if image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info): - # 获取alpha通道 + # Get alpha channel alpha = image.convert("RGBA").split()[-1] - # 如果alpha通道中存在0,说明图片有透明部分 + # If alpha channel contains 0, image has transparent part if alpha.getextrema()[0] < 255: return True return False def _convert_transparent_paste(image): width, height = image.size - new_image = Image.new("RGB", (width, height), (255, 255, 255)) # 生成一张白色底图 + new_image = Image.new("RGB", (width, height), (255, 255, 255)) # Generate an image with white background new_image.paste(image, (0, 0), image) return new_image