From e7b769376cd6836a3407f07d3ca987a55a130888 Mon Sep 17 00:00:00 2001
From: ApplEOFDiscord <wwy640130@163.com>
Date: Tue, 4 Nov 2025 20:03:45 +0800
Subject: [PATCH 1/5] process transparent image

---
 .../input/ernie4_5_vl_processor/process.py    |  2 +
 .../ernie4_5_vl_processor/utils/io_utils.py   | 45 +++++++++++++++++++
 .../input/paddleocr_vl_processor/process.py   |  6 ++-
 fastdeploy/input/qwen_vl_processor/process.py |  6 ++-
 4 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index 4ccdf287f20..c83abf7c0b0 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -36,6 +36,7 @@
 
 from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
 from .process_video import read_frames_decord, read_video_decord
+from .utils.io_utils import process_transparent_image
 from .utils.render_timestamp import render_frame_timestamp
 
 
@@ -349,6 +350,7 @@ def _add_text(self, tokens, outputs: Dict) -> None:
         outputs["cur_position"] += len(tokens)
 
     def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
+        img = process_transparent_image(img)
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
             img.width,
diff --git a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
index 1535b64d4f0..66ec398c874 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
@@ -107,3 +107,48 @@ def get_downloadable(
         retry_interval=retry_interval,
     )
     return downloaded_path
+
+
+def has_transparent_background(img):
+    """判断图片是否有背景"""
+    if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
+        # Check for any pixel with alpha channel less than 255 (fully opaque)
+        alpha = img.convert("RGBA").split()[-1]
+        if alpha.getextrema()[0] < 255:
+            return True
+    return False
+
+
+def add_white_background(img):
+    """
+    给透明背景的图，加个白色背景
+    """
+    if img.mode != "RGBA":
+        img = img.convert("RGBA")
+    # 创建一个白色背景的图像，尺寸与原图一致
+    img_white_background = Image.new("RGBA", img.size, (255, 255, 255))
+
+    # 将原图粘贴到白色背景上
+    img_white_background.paste(img, (0, 0), img)
+
+    return img_white_background
+
+
+def change_I16_to_L(img):
+    """
+    将图片从I;16模式转换为L模式
+    """
+    # 由于I模式的point函数只支持加减乘，所以下面的* (1 / 256)不能改成除法
+    return img.point(lambda i: i * (1 / 256)).convert("L")
+
+
+def process_transparent_image(img):
+    try:
+        if img.mode == "I;16":
+            img = change_I16_to_L(img)
+        if has_transparent_background(img):
+            img = add_white_background(img)
+    except Exception:
+        pass
+
+    return img.convert("RGB")
diff --git a/fastdeploy/input/paddleocr_vl_processor/process.py b/fastdeploy/input/paddleocr_vl_processor/process.py
index 97cc6ebc82a..0d0e17a6dd6 100644
--- a/fastdeploy/input/paddleocr_vl_processor/process.py
+++ b/fastdeploy/input/paddleocr_vl_processor/process.py
@@ -26,6 +26,9 @@
 from fastdeploy.engine.request import ImagePosition
 from fastdeploy.entrypoints.chat_utils import parse_chat_messages
 from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
+from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import (
+    process_transparent_image,
+)
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
 from fastdeploy.utils import data_processor_logger
@@ -349,7 +352,8 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
             - Adds image token IDs and type markers
             - Generates appropriate position embeddings
         """
-        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
+        img = process_transparent_image(img)
+        ret = self.image_processor.preprocess(images=[img])
         num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
         grid_thw = ret["grid_thw"].tolist()
 
diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py
index 050489ad036..477899a6687 100644
--- a/fastdeploy/input/qwen_vl_processor/process.py
+++ b/fastdeploy/input/qwen_vl_processor/process.py
@@ -26,6 +26,9 @@
 from fastdeploy.engine.request import ImagePosition
 from fastdeploy.entrypoints.chat_utils import parse_chat_messages
 from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
+from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import (
+    process_transparent_image,
+)
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
 from fastdeploy.utils import data_processor_logger
@@ -346,7 +349,8 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
             - Adds image token IDs and type markers
             - Generates appropriate position embeddings
         """
-        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
+        img = process_transparent_image(img)
+        ret = self.image_processor.preprocess(images=[img])
         num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
         grid_thw = ret["grid_thw"].tolist()
 

From 6108b17e1d817cfb32c6fe153d049dc149dd6a8a Mon Sep 17 00:00:00 2001
From: ApplEOFDiscord <wwy640130@163.com>
Date: Tue, 4 Nov 2025 22:11:17 +0800
Subject: [PATCH 2/5] translate inline comments to English

---
 fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
index 66ec398c874..c9c832291f3 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
@@ -125,10 +125,10 @@ def add_white_background(img):
     """
     if img.mode != "RGBA":
         img = img.convert("RGBA")
-    # 创建一个白色背景的图像，尺寸与原图一致
+    # Create an image with white background, which is the same size as the original image
     img_white_background = Image.new("RGBA", img.size, (255, 255, 255))
 
-    # 将原图粘贴到白色背景上
+    # Paste the original image to white background
     img_white_background.paste(img, (0, 0), img)
 
     return img_white_background
@@ -138,7 +138,7 @@ def change_I16_to_L(img):
     """
     将图片从I;16模式转换为L模式
     """
-    # 由于I模式的point函数只支持加减乘，所以下面的* (1 / 256)不能改成除法
+    # Only supports addition/subtraction/multiplication, * (1 / 256) cannot be replaced with division
     return img.point(lambda i: i * (1 / 256)).convert("L")
 
 

From cf8841b5503789a6a931a5f7313b4308a1a5c1e9 Mon Sep 17 00:00:00 2001
From: ApplEOFDiscord <wwy640130@163.com>
Date: Wed, 5 Nov 2025 11:37:23 +0800
Subject: [PATCH 3/5] process transparency at downloading

---
 .../input/ernie4_5_vl_processor/process.py    |  2 -
 .../ernie4_5_vl_processor/utils/io_utils.py   | 45 -------------------
 .../input/paddleocr_vl_processor/process.py   |  6 +--
 fastdeploy/input/qwen_vl_processor/process.py |  6 +--
 fastdeploy/multimodal/image.py                |  3 ++
 5 files changed, 5 insertions(+), 57 deletions(-)

diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index c83abf7c0b0..4ccdf287f20 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -36,7 +36,6 @@
 
 from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
 from .process_video import read_frames_decord, read_video_decord
-from .utils.io_utils import process_transparent_image
 from .utils.render_timestamp import render_frame_timestamp
 
 
@@ -350,7 +349,6 @@ def _add_text(self, tokens, outputs: Dict) -> None:
         outputs["cur_position"] += len(tokens)
 
     def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
-        img = process_transparent_image(img)
         patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
             img.height,
             img.width,
diff --git a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
index c9c832291f3..1535b64d4f0 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/utils/io_utils.py
@@ -107,48 +107,3 @@ def get_downloadable(
         retry_interval=retry_interval,
     )
     return downloaded_path
-
-
-def has_transparent_background(img):
-    """判断图片是否有背景"""
-    if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
-        # Check for any pixel with alpha channel less than 255 (fully opaque)
-        alpha = img.convert("RGBA").split()[-1]
-        if alpha.getextrema()[0] < 255:
-            return True
-    return False
-
-
-def add_white_background(img):
-    """
-    给透明背景的图，加个白色背景
-    """
-    if img.mode != "RGBA":
-        img = img.convert("RGBA")
-    # Create an image with white background, which is the same size as the original image
-    img_white_background = Image.new("RGBA", img.size, (255, 255, 255))
-
-    # Paste the original image to white background
-    img_white_background.paste(img, (0, 0), img)
-
-    return img_white_background
-
-
-def change_I16_to_L(img):
-    """
-    将图片从I;16模式转换为L模式
-    """
-    # Only supports addition/subtraction/multiplication, * (1 / 256) cannot be replaced with division
-    return img.point(lambda i: i * (1 / 256)).convert("L")
-
-
-def process_transparent_image(img):
-    try:
-        if img.mode == "I;16":
-            img = change_I16_to_L(img)
-        if has_transparent_background(img):
-            img = add_white_background(img)
-    except Exception:
-        pass
-
-    return img.convert("RGB")
diff --git a/fastdeploy/input/paddleocr_vl_processor/process.py b/fastdeploy/input/paddleocr_vl_processor/process.py
index 0d0e17a6dd6..97cc6ebc82a 100644
--- a/fastdeploy/input/paddleocr_vl_processor/process.py
+++ b/fastdeploy/input/paddleocr_vl_processor/process.py
@@ -26,9 +26,6 @@
 from fastdeploy.engine.request import ImagePosition
 from fastdeploy.entrypoints.chat_utils import parse_chat_messages
 from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import (
-    process_transparent_image,
-)
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
 from fastdeploy.utils import data_processor_logger
@@ -352,8 +349,7 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
             - Adds image token IDs and type markers
             - Generates appropriate position embeddings
         """
-        img = process_transparent_image(img)
-        ret = self.image_processor.preprocess(images=[img])
+        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
         num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
         grid_thw = ret["grid_thw"].tolist()
 
diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py
index 477899a6687..050489ad036 100644
--- a/fastdeploy/input/qwen_vl_processor/process.py
+++ b/fastdeploy/input/qwen_vl_processor/process.py
@@ -26,9 +26,6 @@
 from fastdeploy.engine.request import ImagePosition
 from fastdeploy.entrypoints.chat_utils import parse_chat_messages
 from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.ernie4_5_vl_processor.utils.io_utils import (
-    process_transparent_image,
-)
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
 from fastdeploy.utils import data_processor_logger
@@ -349,8 +346,7 @@ def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
             - Adds image token IDs and type markers
             - Generates appropriate position embeddings
         """
-        img = process_transparent_image(img)
-        ret = self.image_processor.preprocess(images=[img])
+        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
         num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
         grid_thw = ret["grid_thw"].tolist()
 
diff --git a/fastdeploy/multimodal/image.py b/fastdeploy/multimodal/image.py
index cfbc40de088..97e73fc37d1 100644
--- a/fastdeploy/multimodal/image.py
+++ b/fastdeploy/multimodal/image.py
@@ -22,6 +22,7 @@
 from PIL import Image
 
 from .base import MediaIO
+from .utils import process_transparency
 
 
 class ImageMediaIO(MediaIO[Image.Image]):
@@ -59,6 +60,7 @@ def load_bytes(self, data: bytes) -> Image.Image:
         """
         image = Image.open(BytesIO(data))
         image.load()
+        image = process_transparency(image)
         return image.convert(self.image_mode)
 
     def load_base64(self, media_type: str, data: str) -> Image.Image:
@@ -93,6 +95,7 @@ def load_file(self, filepath: str) -> Image.Image:
         """
         image = Image.open(filepath)
         image.load()
+        image = process_transparency(image)
         return image.convert(self.image_mode)
 
     def load_file_request(self, request: Any) -> Image.Image:

From 47a7ca08f8b9598040ebdbb485944d8e5d0303cd Mon Sep 17 00:00:00 2001
From: ApplEOFDiscord <wwy640130@163.com>
Date: Wed, 5 Nov 2025 13:05:42 +0800
Subject: [PATCH 4/5] fix: process transparency in load_file_request

---
 fastdeploy/multimodal/image.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastdeploy/multimodal/image.py b/fastdeploy/multimodal/image.py
index 97e73fc37d1..ce54ba4d965 100644
--- a/fastdeploy/multimodal/image.py
+++ b/fastdeploy/multimodal/image.py
@@ -115,6 +115,7 @@ def load_file_request(self, request: Any) -> Image.Image:
         """
         image = Image.open(requests.get(request, stream=True).raw)
         image.load()
+        image = process_transparency(image)
         return image.convert(self.image_mode)
 
     def encode_base64(

From 2ad700130be7ee438d8689cb0fdd29dc1556745a Mon Sep 17 00:00:00 2001
From: ApplEOFDiscord <wwy640130@163.com>
Date: Wed, 5 Nov 2025 14:24:30 +0800
Subject: [PATCH 5/5] remove unused code

---
 fastdeploy/multimodal/utils.py | 122 ++-------------------------------
 1 file changed, 4 insertions(+), 118 deletions(-)

diff --git a/fastdeploy/multimodal/utils.py b/fastdeploy/multimodal/utils.py
index fa67be2a383..fa3ad4cbe22 100644
--- a/fastdeploy/multimodal/utils.py
+++ b/fastdeploy/multimodal/utils.py
@@ -14,141 +14,27 @@
 # limitations under the License.
 """
 
-import base64
-import io
-import ipaddress
-import mimetypes
-import os
-import socket
-import subprocess
-import tempfile
-from urllib.parse import urlparse
-
-import cairosvg
-import pyheif
-import requests
-from pdf2image import convert_from_path
 from PIL import Image, ImageOps
 
 from fastdeploy.utils import data_processor_logger
 
 
-def process_image_data(image_data, mime_type, url):
-    """处理不同类型的图像数据并返回 PIL 图像对象"""
-
-    if mime_type in ["image/heif", "image/heic"] or url.lower().endswith(".heif") or url.lower().endswith(".heic"):
-        heif_file = pyheif.read(image_data)
-        pil_image = Image.frombytes(
-            heif_file.mode,
-            heif_file.size,
-            heif_file.data,
-            "raw",
-            heif_file.mode,
-            heif_file.stride,
-        )
-    elif mime_type == "application/pdf" or url.lower().endswith(".pdf"):
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
-            temp_pdf.write(image_data.getvalue())
-            temp_pdf_path = temp_pdf.name
-        images = convert_from_path(temp_pdf_path)
-        pil_image = images[0]
-        os.remove(temp_pdf_path)
-    elif mime_type == "image/svg+xml" or url.lower().endswith(".svg"):
-        png_data = cairosvg.svg2png(bytestring=image_data.getvalue())
-        pil_image = Image.open(io.BytesIO(png_data))
-    elif mime_type in [
-        "application/postscript",
-        "application/illustrator",
-    ] or url.lower().endswith(".ai"):
-        with (
-            tempfile.NamedTemporaryFile(delete=False, suffix=".ai") as ai_temp,
-            tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as pdf_temp,
-        ):
-            ai_temp_path = ai_temp.name
-            pdf_temp_path = pdf_temp.name
-            ai_temp.write(image_data.getvalue())
-            ai_temp.close()
-            subprocess.run(
-                ["inkscape", ai_temp_path, "--export-pdf=" + pdf_temp_path],
-                check=True,
-            )
-            images = convert_from_path(pdf_temp_path)
-            pil_image = images[0]
-            os.remove(ai_temp_path)
-            os.remove(pdf_temp_path)
-
-    elif mime_type == "image/gif" or url.lower().endswith(".gif"):
-        pil_image = Image.open(image_data)
-    else:
-        pil_image = Image.open(image_data)
-
-    return pil_image
-
-
-def http_to_pil_image(url):
-    """http_to_pil_image"""
-
-    response = requests.get(url)
-    if response.status_code != 200:
-        raise Exception("Failed to download the image from URL.")
-    image_data = io.BytesIO(response.content)
-
-    mime_type = response.headers.get("Content-Type")
-    if mime_type is None:
-        mime_type, _ = mimetypes.guess_type(url)
-
-    data_processor_logger.info(f"Detected MIME type: {mime_type}")  # 调试信息
-    pil_image = process_image_data(image_data, mime_type, url)
-
-    return pil_image
-
-
-def base64_to_pil_image(base64_string):
-    """base64_to_pil_image"""
-    image_bytes = base64.b64decode(base64_string)
-    buffer = io.BytesIO(image_bytes)
-    pil_image = Image.open(buffer)
-    return pil_image
-
-
-def is_public_url(url):
-    """判断是否公网url"""
-    try:
-        # 解析URL
-        parsed_url = urlparse(url)
-        hostname = parsed_url.hostname
-        if hostname is None:
-            return False
-        # 尝试将域名解析为IP地址
-        ip_address = socket.gethostbyname(hostname)
-        # 转换为IP地址对象
-        ip_obj = ipaddress.ip_address(ip_address)
-        # 判断是否为私有IP或保留IP地址
-        if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_link_local or ip_obj.is_reserved:
-            return False
-        else:
-            return True
-    except Exception as e:
-        print(f"Error checking URL: {e}")
-        return False
-
-
 def process_transparency(image):
     """process transparency."""
 
     def _is_transparent(image):
-        # 检查图片是否有alpha通道
+        # Check if image has alpha channel
         if image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info):
-            # 获取alpha通道
+            # Get alpha channel
             alpha = image.convert("RGBA").split()[-1]
-            # 如果alpha通道中存在0，说明图片有透明部分
+            # If the minimum alpha value is below 255, the image has at least one non-opaque pixel
             if alpha.getextrema()[0] < 255:
                 return True
         return False
 
     def _convert_transparent_paste(image):
         width, height = image.size
-        new_image = Image.new("RGB", (width, height), (255, 255, 255))  # 生成一张白色底图
+        new_image = Image.new("RGB", (width, height), (255, 255, 255))  # Generate an image with white background
         new_image.paste(image, (0, 0), image)
         return new_image