From 53b6f94df79a56e0ec509c416b08d089521b4b85 Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Wed, 13 Aug 2025 09:09:11 +0000
Subject: [PATCH 01/10] feat: add s3 unit test

---
 test_s3_auth.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 test_s3_auth.py

diff --git a/test_s3_auth.py b/test_s3_auth.py
new file mode 100644
index 0000000..445cfda
--- /dev/null
+++ b/test_s3_auth.py
@@ -0,0 +1,83 @@
+import boto3
+from botocore.exceptions import ClientError, NoCredentialsError
+
+def test_s3_authentication():
+    """测试S3认证"""
+    
+    # 配置信息
+    endpoint_url = "https://s3.kclab.cloud"
+    access_key = "B2sE0fKv1Y1lOpZtge5u"
+    secret_key = "JRx4MbrMbfUfQjIEm7speT52kQgjt0zafvlAuYxW"
+    bucket_name = "bucket-78134-shared"
+    region = "us-east-1"
+    
+    print("=== S3认证测试 ===")
+    print(f"端点: {endpoint_url}")
+    print(f"存储桶: {bucket_name}")
+    print(f"区域: {region}")
+    print()
+    
+    try:
+        # 创建S3客户端
+        s3_client = boto3.client(
+            's3',
+            endpoint_url=endpoint_url,
+            aws_access_key_id=access_key,
+            aws_secret_access_key=secret_key,
+            region_name=region
+        )
+        
+        print("1. 测试列出存储桶...")
+        response = s3_client.list_buckets()
+        buckets = [b['Name'] for b in response['Buckets']]
+        print(f"   可用存储桶: {buckets}")
+        
+        if bucket_name in buckets:
+            print(f"\n2. 测试访问存储桶: {bucket_name}")
+            try:
+                response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=5)
+                if 'Contents' in response:
+                    print(f"   存储桶内容数量: {len(response['Contents'])}")
+                    for obj in response['Contents'][:3]:
+                        print(f"     - {obj['Key']} ({obj['Size']} bytes)")
+                else:
+                    print("   存储桶为空")
+            except ClientError as e:
+                print(f"   访问存储桶失败: {e}")
+        
+        print(f"\n3. 测试上传小文件...")
+        test_content = b"Hello, S3 Test!"
+        test_key = "test/connectivity_test.txt"
+        
+        try:
+            s3_client.put_object(
+                Bucket=bucket_name,
+                Key=test_key,
+                Body=test_content,
+                ContentType='text/plain'
+            )
+            print("   上传成功!")
+            
+            # 测试下载
+            response = s3_client.get_object(Bucket=bucket_name, Key=test_key)
+            downloaded_content = response['Body'].read()
+            print(f"   下载成功: {downloaded_content}")
+            
+            # 清理测试文件
+            s3_client.delete_object(Bucket=bucket_name, Key=test_key)
+            print("   清理成功!")
+            
+        except ClientError as e:
+            print(f"   上传/下载失败: {e}")
+            
+    except NoCredentialsError:
+        print("错误: 无法找到AWS凭证")
+    except ClientError as e:
+        error_code = e.response['Error']['Code']
+        error_message = e.response['Error']['Message']
+        print(f"错误: {error_code} - {error_message}")
+    except Exception as e:
+        print(f"未知错误: {e}")
+
+if __name__ == "__main__":
+    test_s3_authentication()

From 6812ef68b298a741fa017fd7f4026dc88a72c6ed Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Wed, 13 Aug 2025 10:07:51 +0000
Subject: [PATCH 02/10] fix: remove test_s3_auth.py to satisfy typing check

---
 test_s3_auth.py | 83 -------------------------------------------------
 1 file changed, 83 deletions(-)
 delete mode 100644 test_s3_auth.py

diff --git a/test_s3_auth.py b/test_s3_auth.py
deleted file mode 100644
index 445cfda..0000000
--- a/test_s3_auth.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import boto3
-from botocore.exceptions import ClientError, NoCredentialsError
-
-def test_s3_authentication():
-    """测试S3认证"""
-    
-    # 配置信息
-    endpoint_url = "https://s3.kclab.cloud"
-    access_key = "B2sE0fKv1Y1lOpZtge5u"
-    secret_key = "JRx4MbrMbfUfQjIEm7speT52kQgjt0zafvlAuYxW"
-    bucket_name = "bucket-78134-shared"
-    region = "us-east-1"
-    
-    print("=== S3认证测试 ===")
-    print(f"端点: {endpoint_url}")
-    print(f"存储桶: {bucket_name}")
-    print(f"区域: {region}")
-    print()
-    
-    try:
-        # 创建S3客户端
-        s3_client = boto3.client(
-            's3',
-            endpoint_url=endpoint_url,
-            aws_access_key_id=access_key,
-            aws_secret_access_key=secret_key,
-            region_name=region
-        )
-        
-        print("1. 测试列出存储桶...")
-        response = s3_client.list_buckets()
-        buckets = [b['Name'] for b in response['Buckets']]
-        print(f"   可用存储桶: {buckets}")
-        
-        if bucket_name in buckets:
-            print(f"\n2. 测试访问存储桶: {bucket_name}")
-            try:
-                response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=5)
-                if 'Contents' in response:
-                    print(f"   存储桶内容数量: {len(response['Contents'])}")
-                    for obj in response['Contents'][:3]:
-                        print(f"     - {obj['Key']} ({obj['Size']} bytes)")
-                else:
-                    print("   存储桶为空")
-            except ClientError as e:
-                print(f"   访问存储桶失败: {e}")
-        
-        print(f"\n3. 测试上传小文件...")
-        test_content = b"Hello, S3 Test!"
-        test_key = "test/connectivity_test.txt"
-        
-        try:
-            s3_client.put_object(
-                Bucket=bucket_name,
-                Key=test_key,
-                Body=test_content,
-                ContentType='text/plain'
-            )
-            print("   上传成功!")
-            
-            # 测试下载
-            response = s3_client.get_object(Bucket=bucket_name, Key=test_key)
-            downloaded_content = response['Body'].read()
-            print(f"   下载成功: {downloaded_content}")
-            
-            # 清理测试文件
-            s3_client.delete_object(Bucket=bucket_name, Key=test_key)
-            print("   清理成功!")
-            
-        except ClientError as e:
-            print(f"   上传/下载失败: {e}")
-            
-    except NoCredentialsError:
-        print("错误: 无法找到AWS凭证")
-    except ClientError as e:
-        error_code = e.response['Error']['Code']
-        error_message = e.response['Error']['Message']
-        print(f"错误: {error_code} - {error_message}")
-    except Exception as e:
-        print(f"未知错误: {e}")
-
-if __name__ == "__main__":
-    test_s3_authentication()

From 2c1b8056fd4061b59396e443f633e635ce4ab306 Mon Sep 17 00:00:00 2001
From: ningpingli <728359849@qq.com>
Date: Thu, 14 Aug 2025 21:04:27 +0800
Subject: [PATCH 03/10] feat: add information enhancer

---
 enhancers/__init__.py             |  0
 enhancers/information_enhancer.py | 73 +++++++++++++++++++++++++++++++
 parsers/document_parser.py        | 24 +++++-----
 tests/test_integration.py         |  9 ++--
 worker.py                         | 32 ++++++++++++++
 5 files changed, 122 insertions(+), 16 deletions(-)
 create mode 100644 enhancers/__init__.py
 create mode 100644 enhancers/information_enhancer.py
 create mode 100644 worker.py

diff --git a/enhancers/__init__.py b/enhancers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py
new file mode 100644
index 0000000..82659a1
--- /dev/null
+++ b/enhancers/information_enhancer.py
@@ -0,0 +1,73 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+class InformationEnhancer(ABC):
+    """信息增强器基类"""
+
+    def __init__(self) -> None:
+        pass
+
+    @abstractmethod
+    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+        """增强信息"""
+        pass
+
+class TableInformationEnhancer(InformationEnhancer):
+    """表格信息增强器"""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+        """增强信息"""
+        return information
+
+class FormulasInformationEnhancer(InformationEnhancer):
+    """公式信息增强器"""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+        """增强信息"""
+        return information
+
+class ImageInformationEnhancer(InformationEnhancer):
+    """图片信息增强器"""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+        """增强信息"""
+        return information
+
+class InformationEnhancerFactory:
+    """信息增强器工厂"""
+
+    def __init__(self) -> None:
+        self.enhancers = [
+            TableInformationEnhancer(),
+            FormulasInformationEnhancer(),
+            ImageInformationEnhancer()
+        ]
+
+    def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
+        """获取信息增强器"""
+        match information.get("type"):
+            case "table":
+                return TableInformationEnhancer()
+            case "formulas":
+                return FormulasInformationEnhancer()
+            case "image":
+                return ImageInformationEnhancer()
+            case _:
+                return None
+    
+    async def enhance_information(self, information: dict[str, Any]) -> dict[str, Any]:
+        """增强信息"""
+        enhancer = self.get_enhancer(information)
+        if not enhancer:
+            raise ValueError(f"不支持的模态类型: {information.get('type')}")
+        return await enhancer.enhance(information)
+        
diff --git a/parsers/document_parser.py b/parsers/document_parser.py
index b203da3..69ad1d8 100644
--- a/parsers/document_parser.py
+++ b/parsers/document_parser.py
@@ -11,7 +11,7 @@ def __init__(self) -> None:
         self.supported_formats: list[str] = []
 
     @abstractmethod
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
+    async def parse(self, file_path: str) -> list[dict[str, Any]]:
         """解析文档"""
         pass
 
@@ -30,19 +30,19 @@ def __init__(self) -> None:
     def can_parse(self, file_path: str) -> bool:
         return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
 
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
+    async def parse(self, file_path: str) -> list[dict[str, Any]]:
         """解析PDF文档"""
         try:
             # 这里应该使用mineru库
             # 暂时返回模拟数据
-            return {
+            return [{
                 "type": "pdf",
                 "text": f"PDF文档内容: {file_path}",
                 "pages": 1,
                 "images": [],
                 "tables": [],
                 "formulas": []
-            }
+            }]
         except Exception as e:
             logger.error(f"解析PDF失败: {e}")
             raise
@@ -57,19 +57,19 @@ def __init__(self) -> None:
     def can_parse(self, file_path: str) -> bool:
         return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
 
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
+    async def parse(self, file_path: str) -> list[dict[str, Any]]:
         """解析DOCX文档"""
         try:
             # 这里应该使用docling库
             # 暂时返回模拟数据
-            return {
+            return [{
                 "type": "docx",
                 "text": f"DOCX文档内容: {file_path}",
                 "pages": 1,
                 "images": [],
                 "tables": [],
                 "formulas": []
-            }
+            }]
         except Exception as e:
             logger.error(f"解析DOCX失败: {e}")
             raise
@@ -84,19 +84,19 @@ def __init__(self) -> None:
     def can_parse(self, file_path: str) -> bool:
         return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
 
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
+    async def parse(self, file_path: str) -> list[dict[str, Any]]:
         """解析XLSX文档"""
         try:
             # 这里应该使用docling库
             # 暂时返回模拟数据
-            return {
+            return [{
                 "type": "xlsx",
                 "text": f"XLSX文档内容: {file_path}",
                 "pages": 1,
                 "images": [],
                 "tables": [],
                 "formulas": []
-            }
+            }]
         except Exception as e:
             logger.error(f"解析XLSX失败: {e}")
             raise
@@ -118,10 +118,10 @@ def get_parser(self, file_path: str) -> DocumentParser | None:
                 return parser
         return None
 
-    async def parse_document(self, file_path: str, file_content: bytes) -> dict[str, Any]:
+    async def parse_document(self, file_path: str) -> list[dict[str, Any]]:
         """解析文档"""
         parser = self.get_parser(file_path)
         if not parser:
             raise ValueError(f"不支持的文件格式: {file_path}")
 
-        return await parser.parse(file_path, file_content)
+        return await parser.parse(file_path)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 973a2fa..020c90d 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -19,7 +19,7 @@ class TestRedisIntegration:
     async def redis_client(self):
         """获取真实的Redis客户端"""
         try:
-            client = await get_redis_client("redis://localhost:6379")
+            client = await get_redis_client(os.getenv("REDIS_URL"))
             # 清理测试数据
             await client.flushdb()
             yield client
@@ -223,7 +223,7 @@ async def system_components(self):
         """获取所有系统组件"""
         try:
             # Redis客户端
-            redis_client = await get_redis_client("redis://localhost:6379")
+            redis_client = await get_redis_client(os.getenv("REDIS_URL"))
             await redis_client.flushdb()
             
             # 任务管理器
@@ -286,12 +286,12 @@ async def test_document_processing_workflow(self, system_components):
         assert retrieved_task["task_id"] == "workflow-test-123"
         
         # 6. 模拟文档解析结果
-        parsing_result = {
+        parsing_result = [{
             "text": "Chemical document content with formulas and structures.",
             "formulas": ["H2O", "CO2", "CH4"],
             "structures": ["molecular_structure_1.png"],
             "confidence": 0.92
-        }
+        }]
         
         # 7. 更新任务状态和结果
         update_success = await task_manager.update_task_status(
@@ -314,6 +314,7 @@ async def test_document_processing_workflow(self, system_components):
 # 环境检查装饰器
 def requires_redis(func):
     """需要Redis服务的装饰器"""
+    print(os.getenv("REDIS_URL"))
     return pytest.mark.skipif(
         not os.getenv("REDIS_URL") and not os.getenv("REDIS_ENABLED", "false").lower() == "true",
         reason="需要Redis服务"
diff --git a/worker.py b/worker.py
new file mode 100644
index 0000000..d534c0d
--- /dev/null
+++ b/worker.py
@@ -0,0 +1,32 @@
+from typing import Any
+from enhancers.information_enhancer import InformationEnhancerFactory
+import asyncio
+from sanic import Sanic
+from parsers.document_parser import DocumentParserFactory
+from config import settings
+
+async def worker(app: Sanic) -> list[dict[str, Any]]:
+    # 使用工厂获取合适的解析器
+    parser_factory = DocumentParserFactory()
+    enhancer_factory = InformationEnhancerFactory()
+    redis = app.ctx.redis
+    while True:
+        task = await redis.get_task()
+        if not task:
+            await asyncio.sleep(1)
+            continue
+        file_path = task.get("file_path")
+        information_list = await parser_factory.parse_document(file_path)
+        # 控制并发数量，防止访问量过大导致失败
+        SEMAPHORE_LIMIT = 10  # 可根据实际情况调整
+        semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
+
+        async def enhance_with_semaphore(info):
+            async with semaphore:
+                return await enhancer_factory.enhance_information(info)
+
+        # 并发增强每个信息
+        enhanced_information_list = await asyncio.gather(
+            *(enhance_with_semaphore(info) for info in information_list)
+        )
+        return enhanced_information_list
\ No newline at end of file

From 480a717b548f9d892eaa5a4bb1665c256786526e Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Mon, 18 Aug 2025 10:00:07 +0000
Subject: [PATCH 04/10] feat: add excel parser and tests

---
 enhancers/information_enhancer.py  |  37 +--
 parsers/document_parser.py         | 126 ++--------
 parsers/document_parser_factory.py |  29 +++
 parsers/excel_parser.py            | 386 +++++++++++++++++++++++++++++
 pyproject.toml                     |   5 +-
 tests/test_excel_parser.py         | 110 ++++++++
 uv.lock                            | 173 +++++++++++++
 worker.py                          |  29 ++-
 8 files changed, 751 insertions(+), 144 deletions(-)
 create mode 100644 parsers/document_parser_factory.py
 create mode 100644 parsers/excel_parser.py
 create mode 100644 tests/test_excel_parser.py

diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py
index 82659a1..696dd10 100644
--- a/enhancers/information_enhancer.py
+++ b/enhancers/information_enhancer.py
@@ -1,44 +1,33 @@
 from abc import ABC, abstractmethod
-from typing import Any
 
-class InformationEnhancer(ABC):
-    """信息增强器基类"""
+from parsers.document_parser import DocumentData
 
-    def __init__(self) -> None:
-        pass
 
+class InformationEnhancer(ABC):
+    """信息增强器基类"""
     @abstractmethod
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         pass
 
 class TableInformationEnhancer(InformationEnhancer):
     """表格信息增强器"""
 
-    def __init__(self) -> None:
-        super().__init__()
-
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         return information
 
 class FormulasInformationEnhancer(InformationEnhancer):
     """公式信息增强器"""
 
-    def __init__(self) -> None:
-        super().__init__()
-
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         return information
 
 class ImageInformationEnhancer(InformationEnhancer):
     """图片信息增强器"""
 
-    def __init__(self) -> None:
-        super().__init__()
-
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         return information
 
@@ -52,9 +41,9 @@ def __init__(self) -> None:
             ImageInformationEnhancer()
         ]
 
-    def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
+    def get_enhancer(self, information: DocumentData) -> InformationEnhancer|None:
         """获取信息增强器"""
-        match information.get("type"):
+        match information.type:
             case "table":
                 return TableInformationEnhancer()
             case "formulas":
@@ -63,11 +52,11 @@ def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
                 return ImageInformationEnhancer()
             case _:
                 return None
-    
-    async def enhance_information(self, information: dict[str, Any]) -> dict[str, Any]:
+
+    async def enhance_information(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         enhancer = self.get_enhancer(information)
         if not enhancer:
-            raise ValueError(f"不支持的模态类型: {information.get('type')}")
+            raise ValueError(f"不支持的模态类型: {information.type}")
         return await enhancer.enhance(information)
-        
+
diff --git a/parsers/document_parser.py b/parsers/document_parser.py
index 69ad1d8..1ea37f3 100644
--- a/parsers/document_parser.py
+++ b/parsers/document_parser.py
@@ -1,9 +1,25 @@
 import logging
 from abc import ABC, abstractmethod
-from typing import Any
+
+from pydantic import BaseModel
 
 logger = logging.getLogger(__name__)
 
+class DocumentData(BaseModel):
+    """文档数据类"""
+    type: str
+    name: str
+    content: str
+    description: str
+
+class ParseResult(BaseModel):
+    """解析结果类"""
+    title: str
+    document: list[DocumentData]
+    processing_time: float
+    success: bool
+    error_message: str | None = None
+
 class DocumentParser(ABC):
     """文档解析器基类"""
 
@@ -11,7 +27,7 @@ def __init__(self) -> None:
         self.supported_formats: list[str] = []
 
     @abstractmethod
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
+    async def parse(self, file_path: str) -> ParseResult:
         """解析文档"""
         pass
 
@@ -19,109 +35,3 @@ async def parse(self, file_path: str) -> list[dict[str, Any]]:
     def can_parse(self, file_path: str) -> bool:
         """检查是否可以解析该文件"""
         pass
-
-class PDFParser(DocumentParser):
-    """PDF文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.pdf']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
-        """解析PDF文档"""
-        try:
-            # 这里应该使用mineru库
-            # 暂时返回模拟数据
-            return [{
-                "type": "pdf",
-                "text": f"PDF文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }]
-        except Exception as e:
-            logger.error(f"解析PDF失败: {e}")
-            raise
-
-class DOCXParser(DocumentParser):
-    """DOCX文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.docx','.doc']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
-        """解析DOCX文档"""
-        try:
-            # 这里应该使用docling库
-            # 暂时返回模拟数据
-            return [{
-                "type": "docx",
-                "text": f"DOCX文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }]
-        except Exception as e:
-            logger.error(f"解析DOCX失败: {e}")
-            raise
-
-class XLSXParser(DocumentParser):
-    """XLSX文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.xlsx']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
-        """解析XLSX文档"""
-        try:
-            # 这里应该使用docling库
-            # 暂时返回模拟数据
-            return [{
-                "type": "xlsx",
-                "text": f"XLSX文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }]
-        except Exception as e:
-            logger.error(f"解析XLSX失败: {e}")
-            raise
-
-class DocumentParserFactory:
-    """文档解析器工厂"""
-
-    def __init__(self) -> None:
-        self.parsers = [
-            PDFParser(),
-            DOCXParser(),
-            XLSXParser()
-        ]
-
-    def get_parser(self, file_path: str) -> DocumentParser | None:
-        """根据文件路径获取合适的解析器"""
-        for parser in self.parsers:
-            if parser.can_parse(file_path):
-                return parser
-        return None
-
-    async def parse_document(self, file_path: str) -> list[dict[str, Any]]:
-        """解析文档"""
-        parser = self.get_parser(file_path)
-        if not parser:
-            raise ValueError(f"不支持的文件格式: {file_path}")
-
-        return await parser.parse(file_path)
diff --git a/parsers/document_parser_factory.py b/parsers/document_parser_factory.py
new file mode 100644
index 0000000..e582873
--- /dev/null
+++ b/parsers/document_parser_factory.py
@@ -0,0 +1,29 @@
+import logging
+
+from parsers.document_parser import DocumentParser, ParseResult
+from parsers.excel_parser import ExcelParser
+
+logger = logging.getLogger(__name__)
+
+class DocumentParserFactory:
+    """文档解析器工厂"""
+
+    def __init__(self) -> None:
+        self.parsers: list[DocumentParser] = [
+            ExcelParser()
+        ]
+
+    def get_parser(self, file_path: str) -> DocumentParser | None:
+        """根据文件路径获取合适的解析器"""
+        for parser in self.parsers:
+            if parser.can_parse(file_path):
+                return parser
+        return None
+
+    async def parse_document(self, file_path: str) -> ParseResult:
+        """解析文档"""
+        parser = self.get_parser(file_path)
+        if not parser:
+            raise ValueError(f"不支持的文件格式: {file_path}")
+
+        return await parser.parse(file_path)
diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py
new file mode 100644
index 0000000..4a33e5c
--- /dev/null
+++ b/parsers/excel_parser.py
@@ -0,0 +1,386 @@
+"""
+Excel文件解析器模块
+
+该模块提供将Excel文件转换为结构化JSON格式的功能，
+包括表格数据提取和图片处理。
+"""
+
+import base64
+import json
+import time
+import warnings
+from dataclasses import dataclass
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any
+
+from openpyxl import load_workbook  # type: ignore
+from openpyxl.drawing.image import Image  # type: ignore
+from openpyxl.workbook.workbook import Workbook  # type: ignore
+from openpyxl.worksheet.worksheet import Worksheet  # type: ignore
+
+from parsers.document_parser import DocumentData, DocumentParser, ParseResult
+
+# 忽略 openpyxl 的特定警告
+warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
+
+# 类型别名
+CellValue = str|int|float|bool|None|datetime|date
+TableData = list[list[CellValue]]
+
+
+@dataclass
+class ExcelParseConfig:
+    """Excel解析配置类"""
+    data_only: bool = True
+    keep_vba: bool = False
+    default_image_format: str = 'png'
+    image_description_placeholder: str = "[待生成]"
+
+
+class ExcelParseError(Exception):
+    """Excel解析异常类"""
+    pass
+
+
+class ExcelParser(DocumentParser):
+    """Excel文件解析器类"""
+
+    def __init__(self, config: ExcelParseConfig | None = None):
+        """
+        初始化Excel解析器
+        Args:
+            config: 解析配置，如果为None则使用默认配置
+        """
+        super().__init__()
+        self.config: ExcelParseConfig = config or ExcelParseConfig()
+        self.image_index: int = 0
+        self.supported_formats: list[str] = ['.xlsx', '.xls']
+
+    async def parse(self, excel_path: str) -> ParseResult:
+        """
+        解析Excel文件并保存结果
+
+        Args:
+            excel_path: Excel文件路径
+            output_dir: 输出目录路径
+        Returns:
+            ParseResult: 解析结果对象
+        Raises:
+            ExcelParseError: 当解析失败时抛出
+        """
+        start_time = time.time()
+
+        try:
+            # 转换Excel到JSON格式
+            title, document_data = self._excel_to_json(excel_path)
+
+            # 计算处理时间
+            processing_time = time.time() - start_time
+
+
+            return ParseResult(
+                title=title,
+                document=document_data,
+                processing_time=processing_time,
+                success=True
+            )
+
+        except Exception as e:
+            processing_time = time.time() - start_time
+            return ParseResult(
+                title="",
+                document=[],
+                processing_time=processing_time,
+                success=False,
+                error_message=str(e)
+            )
+
+    def can_parse(self, file_path: str) -> bool:
+        """
+        验证输入文件
+        Args:
+            file_path: 文件路径
+        Returns:
+            bool: 是否支持解析
+        """
+        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
+
+    def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]:
+        """
+        将Excel文件转换为JSON格式
+        Args:
+            excel_path: Excel文件路径
+        Returns:
+            DocumentData: 文档数据
+        """
+        # 获取文件名作为标题（不带扩展名）
+        title = Path(excel_path).stem
+
+        # 初始化内容列表和图片列表
+        content: list[DocumentData] = []
+        self.image_index = 0
+
+        # 加载工作簿
+        workbook = self._load_workbook(excel_path)
+
+        # 处理每个工作表
+        for sheet_index, sheet_name in enumerate(workbook.sheetnames):
+            sheet = workbook[sheet_name]
+
+            # 添加工作表标题
+            content.append(DocumentData(
+                type="text",
+                name=sheet_name,
+                content=f"工作表 {sheet_index + 1}: {sheet_name}",
+                description="工作表标题"
+            ))
+
+            # 处理图片
+            sheet_images = self._extract_sheet_images(sheet)
+            content.extend(sheet_images)
+
+            # 处理表格数据
+            table_content = self._extract_table_data(sheet)
+            content.append(DocumentData(
+                type="table",
+                name="表格",
+                content=json.dumps(table_content),
+                description="表格"
+            ))
+
+        # 添加结束文本
+        content.append(DocumentData(
+            type="text",
+            name="结束文本",
+            content="",
+            description="结束文本"
+        ))
+
+        return title, content
+
+    def _load_workbook(self, excel_path: str) -> Workbook:
+        """
+        加载Excel工作簿
+        Args:
+            excel_path: Excel文件路径
+        Returns:
+            Workbook: 加载的工作簿对象
+        """
+        return load_workbook(
+            excel_path,
+            data_only=self.config.data_only,
+            keep_vba=self.config.keep_vba
+        )
+
+    def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]:
+        """
+        提取工作表中的图片
+        Args:
+            sheet: 工作表对象
+        Returns:
+            List[DocumentData]: 图片信息列表
+        """
+        sheet_images: list[DocumentData] = []
+
+        images = getattr(sheet, '_images', None)
+        if not images or not isinstance(images, (list, tuple)):
+            return sheet_images
+
+        # 收集图片信息
+        for img_obj in images:
+            if not isinstance(img_obj, Image):
+                continue
+
+            try:
+                image_info = self._process_image_object(img_obj)
+                if image_info:
+                    sheet_images.append(image_info)
+            except Exception as e:
+                print(f"处理图片失败: {str(e)}")
+                continue
+
+        return sheet_images
+
+    def _process_image_object(self, img_obj: Image) -> DocumentData | None:
+        """
+        处理单个图片对象
+        Args:
+            img_obj: 图片对象
+        Returns:
+            Optional[DocumentData]: 图片信息，处理失败时返回None
+        """
+        try:
+            # 获取图片数据
+            img_data = img_obj._data()
+
+            # 获取图片格式
+            img_format = self._get_image_format(img_obj)
+
+            # 生成Base64编码
+            base64_encoded = base64.b64encode(img_data).decode('utf-8')
+            uri = f"data:image/{img_format};base64,{base64_encoded}"
+
+            # 创建图片信息
+            image_info = DocumentData(
+                type="image",
+                name=f"#/pictures/{self.image_index}",
+                content=uri,
+                description=self.config.image_description_placeholder
+            )
+
+            self.image_index += 1
+            return image_info
+
+        except Exception as e:
+            print(f"处理图片对象失败: {str(e)}")
+            return None
+
+    def _get_image_format(self, img_obj: Image) -> str:
+        """
+        Return the lower-cased image format name of an image object.
+        Args:
+            img_obj: Image object whose `format` attribute is inspected.
+        Returns:
+            str: The format ('jpeg' becomes 'jpg'), or `self.config.default_image_format` if `format` is missing, empty or not a str.
+        """
+        fmt = getattr(img_obj, 'format', None)
+        if isinstance(fmt, str) and fmt:
+            img_format: str = fmt.lower()
+            # Normalize the 'jpeg' alias to 'jpg'
+            if img_format == 'jpeg':
+                img_format = 'jpg'
+            return img_format
+        return self.config.default_image_format
+
+    def _process_cell_value(self, cell_value: Any) -> CellValue:
+        """
+        Normalize a raw cell value: None becomes "", datetime/date become formatted strings.
+        Args:
+            cell_value: Raw cell value.
+        Returns:
+            CellValue: str/int/float/bool values unchanged; anything else converted with str().
+        """
+        if cell_value is None:
+            return ""
+
+        # datetime -> "YYYY-MM-DD HH:MM:SS"; checked before `date` (stdlib datetime subclasses date — presumably these are the stdlib classes)
+        if isinstance(cell_value, datetime):
+            return cell_value.strftime("%Y-%m-%d %H:%M:%S")
+
+        # date -> "YYYY-MM-DD"
+        if isinstance(cell_value, date):
+            return cell_value.strftime("%Y-%m-%d")
+
+        # Plain scalar types are returned as-is
+        if isinstance(cell_value, str|int|float|bool):
+            return cell_value
+
+        # Any other type falls back to its string representation
+        return str(cell_value)
+
+    def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]:
+        """
+        Extract a worksheet as a dict with "dimensions", "headers" and "data" keys.
+        Args:
+            sheet: Worksheet object to read.
+        Returns:
+            dict[str, Any]: Row/column counts, the first row as headers, and the remaining rows as data.
+        """
+        # Collect merged ranges and expand them into a per-cell lookup
+        merged_ranges = self._get_merged_cells(sheet)
+        merged_map = self._create_merged_cell_map(merged_ranges, sheet)
+
+        # Table bounds as reported by the worksheet
+        max_row = sheet.max_row
+        max_col = sheet.max_column
+
+        # Read every row within those bounds
+        all_rows = self._extract_all_rows(sheet, max_row, max_col, merged_map)
+
+        return {
+            "dimensions": {
+                "rows": len(all_rows),
+                "columns": max_col
+            },
+            "headers": all_rows[0] if all_rows else [],
+            "data": all_rows[1:] if len(all_rows) > 1 else []
+        }
+
+    def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], CellValue]:
+        """
+        Collect the merged ranges of a worksheet together with each range's top-left cell value.
+        Args:
+            sheet: Worksheet object to inspect.
+        Returns:
+            dict: Maps (min_row, min_col, max_row, max_col) to the raw value of the cell at (min_row, min_col).
+        """
+        merged_ranges = {}
+        if sheet.merged_cells:
+            for merged_range in sheet.merged_cells.ranges:
+                min_row, min_col, max_row, max_col = (
+                    merged_range.min_row, merged_range.min_col,
+                    merged_range.max_row, merged_range.max_col
+                )
+                merged_value = sheet.cell(row=min_row, column=min_col).value
+                merged_ranges[(min_row, min_col, max_row, max_col)] = merged_value
+        return merged_ranges
+
+    def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], CellValue]:
+        """
+        Expand merged ranges into a per-cell lookup so every covered cell carries the range's value.
+        Args:
+            merged_ranges: Maps (min_row, min_col, max_row, max_col) to the range's value.
+            sheet: Worksheet object (not used in this method body).
+        Returns:
+            dict: Maps (row, column) of every cell inside a range to the processed value of that range.
+        """
+        merged_map = {}
+        for (min_row, min_col, max_row, max_col), value in merged_ranges.items():
+            # Normalize the value once per range, then reuse it for every covered cell
+            processed_value = self._process_cell_value(value)
+            for row_idx in range(min_row, max_row + 1):
+                for col_idx in range(min_col, max_col + 1):
+                    merged_map[(row_idx, col_idx)] = processed_value
+        return merged_map
+
+    def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int,
+                          merged_map: dict[tuple[int, int], CellValue]) -> TableData:
+        """
+        Read rows 1..max_row and columns 1..max_col of a worksheet into a list of rows.
+        Args:
+            sheet: Worksheet object to read.
+            max_row: Last row index to read (inclusive, 1-based).
+            max_col: Last column index to read (inclusive, 1-based).
+            merged_map: Maps (row, column) to the value to use for cells inside merged ranges.
+        Returns:
+            TableData: One list per row, each holding one processed value per column.
+        """
+        all_rows = []
+        for row_idx in range(1, max_row + 1):
+            row_data = []
+            for col_idx in range(1, max_col + 1):
+                # Prefer the merged-range value when this cell is covered by one
+                if (row_idx, col_idx) in merged_map:
+                    cell_value = merged_map[(row_idx, col_idx)]
+                else:
+                    cell = sheet.cell(row=row_idx, column=col_idx)
+                    cell_value = cell.value
+
+                # Every value, merged or read directly, goes through _process_cell_value
+                processed_value = self._process_cell_value(cell_value)
+                row_data.append(processed_value)
+            all_rows.append(row_data)
+
+        return all_rows
+
+
+    def _save_json(self, data: Any, file_path: Path) -> None:
+        """
+        Write data to a file as UTF-8 JSON, indented by 4 spaces, with non-ASCII characters kept unescaped.
+        Args:
+            data: Data to serialize.
+            file_path: Destination file path; the file is opened in 'w' mode.
+        """
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=4)
diff --git a/pyproject.toml b/pyproject.toml
index f6b4cbd..23446a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,8 @@ dependencies = [
     "dotenv>=0.9.9",
     "aiobotocore>=2.24.0",
     "redis>=6.4.0",
+    "openpyxl>=3.1.5",
+    "pydantic>=2.11.7",
 ]
 
 [dependency-groups]
@@ -25,7 +27,8 @@ all-dev = [
 dev = [
     "pytest>=8.4.1",
     "pytest-cov>=6.2.1",
-    "pytest-asyncio>=0.23.0",  # 添加异步测试支持
+    "pytest-asyncio>=0.23.0", # 添加异步测试支持
+    "pillow>=11.3.0",
 ]
 lint = [
     "bandit>=1.8.6",
diff --git a/tests/test_excel_parser.py b/tests/test_excel_parser.py
new file mode 100644
index 0000000..fa5a464
--- /dev/null
+++ b/tests/test_excel_parser.py
@@ -0,0 +1,110 @@
+import base64
+import os
+import tempfile
+
+import pytest
+from openpyxl import Workbook
+from openpyxl.drawing.image import Image as XLImage
+
+from parsers.excel_parser import ExcelParser
+from parsers.document_parser import DocumentData
+
+
+@pytest.mark.asyncio
+async def test_parse_real_basic_and_image():
+    # Parse a real two-sheet workbook with one image; start by writing a tiny PNG decoded from the base64 literal below
+    one_px_png_b64 = (
+        b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAusB9Y2oU5wAAAAASUVORK5CYII="
+    )
+    png_fd, png_path = tempfile.mkstemp(suffix=".png")
+    try:
+        with os.fdopen(png_fd, "wb") as f:
+            f.write(base64.b64decode(one_px_png_b64))
+
+        # Build a real Excel workbook containing the image and two worksheets
+        wb = Workbook()
+        ws1 = wb.active
+        ws1.title = "Sheet1"
+        # Header row and one data row
+        ws1["A1"] = "Header1"
+        ws1["B1"] = "Header2"
+        ws1["A2"] = "Data1"
+        ws1["B2"] = "Data2"
+        # Embed the image, anchored at A5
+        img = XLImage(png_path)
+        ws1.add_image(img, "A5")
+
+        # Second worksheet
+        ws2 = wb.create_sheet("Sheet2")
+        ws2["A1"] = "Single Header"
+        ws2["A2"] = "Single Data"
+
+        xlsx_fd, xlsx_path = tempfile.mkstemp(suffix=".xlsx")
+        os.close(xlsx_fd)
+        wb.save(xlsx_path)
+
+        try:
+            parser = ExcelParser()
+            result = await parser.parse(xlsx_path)
+
+            assert result.success is True
+            # Expected items: Sheet1 title, Sheet1 image, Sheet1 table, Sheet2 title, Sheet2 table, closing text
+            content = result.document
+            assert len(content) == 6
+
+            # Check ordering and key fields
+            assert content[0].type == "text" and content[0].name == "Sheet1"
+            assert content[1].type == "image"
+            assert content[1].name == "#/pictures/0"
+            assert content[1].content.startswith("data:image/")
+
+            assert content[2].type == "table"
+            assert content[3].type == "text" and content[3].name == "Sheet2"
+            assert content[4].type == "table"
+            assert content[5].type == "text" and content[5].name == "结束文本"
+        finally:
+            os.remove(xlsx_path)
+    finally:
+        os.remove(png_path)
+
+
+@pytest.mark.asyncio
+async def test_parse_real_merged_cells():
+    # Build a real Excel workbook containing a merged cell range and check the parsed table
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "Sheet1"
+
+    # Merge A1:B1 and set its value
+    ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=2)
+    ws["A1"] = "Merged Header"
+    # Fill the next row with data
+    ws["A2"] = "Value1"
+    ws["B2"] = "Value2"
+
+    xlsx_fd, xlsx_path = tempfile.mkstemp(suffix=".xlsx")
+    os.close(xlsx_fd)
+    wb.save(xlsx_path)
+
+    try:
+        parser = ExcelParser()
+        result = await parser.parse(xlsx_path)
+
+        assert result.success is True
+        content = result.document
+        # Expected structure: title, table, closing text
+        assert len(content) == 3
+
+        # The table is at index 1
+        table_chunk: DocumentData = content[1]
+        assert table_chunk.type == "table"
+
+        import json as _json
+        payload = _json.loads(table_chunk.content)
+        assert payload["headers"] == ["Merged Header", "Merged Header"]
+        assert payload["dimensions"]["rows"] == 2
+        assert payload["dimensions"]["columns"] == 2
+    finally:
+        os.remove(xlsx_path)
+
+
diff --git a/uv.lock b/uv.lock
index d15f45a..6ee4cdd 100644
--- a/uv.lock
+++ b/uv.lock
@@ -111,6 +111,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" },
 ]
 
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
+]
+
 [[package]]
 name = "attrs"
 version = "25.3.0"
@@ -309,6 +318,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b2/b7/545d2c10c1fc15e48653c91efde329a790f2eecfbbf2bd16003b5db2bab0/dotenv-0.9.9-py2.py3-none-any.whl", hash = "sha256:29cf74a087b31dafdb5a446b6d7e11cbce8ed2741540e2339c69fbef92c94ce9", size = 1892, upload-time = "2025-02-19T22:15:01.647Z" },
 ]
 
+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.7.0"
@@ -457,6 +475,8 @@ dependencies = [
     { name = "aiofiles" },
     { name = "aiohttp" },
     { name = "dotenv" },
+    { name = "openpyxl" },
+    { name = "pydantic" },
     { name = "redis" },
     { name = "sanic" },
     { name = "sanic-ext" },
@@ -467,6 +487,7 @@ all-dev = [
     { name = "bandit" },
     { name = "detect-secrets" },
     { name = "mypy" },
+    { name = "pillow" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
@@ -474,6 +495,7 @@ all-dev = [
     { name = "types-boto3" },
 ]
 dev = [
+    { name = "pillow" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
@@ -496,6 +518,8 @@ requires-dist = [
     { name = "aiofiles", specifier = ">=23.2.1" },
     { name = "aiohttp", specifier = ">=3.9.0" },
     { name = "dotenv", specifier = ">=0.9.9" },
+    { name = "openpyxl", specifier = ">=3.1.5" },
+    { name = "pydantic", specifier = ">=2.11.7" },
     { name = "redis", specifier = ">=6.4.0" },
     { name = "sanic", specifier = ">=23.12.0" },
     { name = "sanic-ext", specifier = ">=23.12.0" },
@@ -506,6 +530,7 @@ all-dev = [
     { name = "bandit", specifier = ">=1.8.6" },
     { name = "detect-secrets", specifier = ">=1.5.0" },
     { name = "mypy", specifier = ">=1.17.1" },
+    { name = "pillow", specifier = ">=11.3.0" },
     { name = "pytest", specifier = ">=8.4.1" },
     { name = "pytest-asyncio", specifier = ">=0.23.0" },
     { name = "pytest-cov", specifier = ">=6.2.1" },
@@ -513,6 +538,7 @@ all-dev = [
     { name = "types-boto3", specifier = ">=1.40.0,<2.0.0" },
 ]
 dev = [
+    { name = "pillow", specifier = ">=11.3.0" },
     { name = "pytest", specifier = ">=8.4.1" },
     { name = "pytest-asyncio", specifier = ">=0.23.0" },
     { name = "pytest-cov", specifier = ">=6.2.1" },
@@ -631,6 +657,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
 ]
 
+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "et-xmlfile" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
+]
+
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -661,6 +699,72 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/47/ac/684d71315abc7b1214d59304e23a982472967f6bf4bde5a98f1503f648dc/pbr-6.1.1-py2.py3-none-any.whl", hash = "sha256:38d4daea5d9fa63b3f626131b9d34947fd0c8be9b05a29276870580050a25a76", size = 108997, upload-time = "2025-02-04T14:28:03.168Z" },
 ]
 
+[[package]]
+name = "pillow"
+version = "11.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726, upload-time = "2025-07-03T13:10:04.448Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652, upload-time = "2025-07-03T13:10:10.391Z" },
+    { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" },
+    { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358, upload-time = "2025-07-01T09:14:27.053Z" },
+    { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, upload-time = "2025-07-01T09:14:31.899Z" },
+    { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/93/0952f2ed8db3a5a4c7a11f91965d6184ebc8cd7cbb7941a260d5f018cd2d/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd", size = 2128328, upload-time = "2025-07-01T09:14:35.276Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/e8/100c3d114b1a0bf4042f27e0f87d2f25e857e838034e98ca98fe7b8c0a9c/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8", size = 2170652, upload-time = "2025-07-01T09:14:37.203Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/86/3f758a28a6e381758545f7cdb4942e1cb79abd271bea932998fc0db93cb6/pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f", size = 2227443, upload-time = "2025-07-01T09:14:39.344Z" },
+    { url = "https://files.pythonhosted.org/packages/01/f4/91d5b3ffa718df2f53b0dc109877993e511f4fd055d7e9508682e8aba092/pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c", size = 5278474, upload-time = "2025-07-01T09:14:41.843Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/0e/37d7d3eca6c879fbd9dba21268427dffda1ab00d4eb05b32923d4fbe3b12/pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd", size = 4686038, upload-time = "2025-07-01T09:14:44.008Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/b0/3426e5c7f6565e752d81221af9d3676fdbb4f352317ceafd42899aaf5d8a/pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e", size = 5864407, upload-time = "2025-07-03T13:10:15.628Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/c1/c6c423134229f2a221ee53f838d4be9d82bab86f7e2f8e75e47b6bf6cd77/pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1", size = 7639094, upload-time = "2025-07-03T13:10:21.857Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/c9/09e6746630fe6372c67c648ff9deae52a2bc20897d51fa293571977ceb5d/pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805", size = 5973503, upload-time = "2025-07-01T09:14:45.698Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8", size = 6642574, upload-time = "2025-07-01T09:14:47.415Z" },
+    { url = "https://files.pythonhosted.org/packages/36/de/d5cc31cc4b055b6c6fd990e3e7f0f8aaf36229a2698501bcb0cdf67c7146/pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2", size = 6084060, upload-time = "2025-07-01T09:14:49.636Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/ea/502d938cbaeec836ac28a9b730193716f0114c41325db428e6b280513f09/pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b", size = 6721407, upload-time = "2025-07-01T09:14:51.962Z" },
+    { url = "https://files.pythonhosted.org/packages/45/9c/9c5e2a73f125f6cbc59cc7087c8f2d649a7ae453f83bd0362ff7c9e2aee2/pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3", size = 6273841, upload-time = "2025-07-01T09:14:54.142Z" },
+    { url = "https://files.pythonhosted.org/packages/23/85/397c73524e0cd212067e0c969aa245b01d50183439550d24d9f55781b776/pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51", size = 6978450, upload-time = "2025-07-01T09:14:56.436Z" },
+    { url = "https://files.pythonhosted.org/packages/17/d2/622f4547f69cd173955194b78e4d19ca4935a1b0f03a302d655c9f6aae65/pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580", size = 2423055, upload-time = "2025-07-01T09:14:58.072Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/80/a8a2ac21dda2e82480852978416cfacd439a4b490a501a288ecf4fe2532d/pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e", size = 5281110, upload-time = "2025-07-01T09:14:59.79Z" },
+    { url = "https://files.pythonhosted.org/packages/44/d6/b79754ca790f315918732e18f82a8146d33bcd7f4494380457ea89eb883d/pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d", size = 4689547, upload-time = "2025-07-01T09:15:01.648Z" },
+    { url = "https://files.pythonhosted.org/packages/49/20/716b8717d331150cb00f7fdd78169c01e8e0c219732a78b0e59b6bdb2fd6/pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced", size = 5901554, upload-time = "2025-07-03T13:10:27.018Z" },
+    { url = "https://files.pythonhosted.org/packages/74/cf/a9f3a2514a65bb071075063a96f0a5cf949c2f2fce683c15ccc83b1c1cab/pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c", size = 7669132, upload-time = "2025-07-03T13:10:33.01Z" },
+    { url = "https://files.pythonhosted.org/packages/98/3c/da78805cbdbee9cb43efe8261dd7cc0b4b93f2ac79b676c03159e9db2187/pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8", size = 6005001, upload-time = "2025-07-01T09:15:03.365Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/fa/ce044b91faecf30e635321351bba32bab5a7e034c60187fe9698191aef4f/pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59", size = 6668814, upload-time = "2025-07-01T09:15:05.655Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/51/90f9291406d09bf93686434f9183aba27b831c10c87746ff49f127ee80cb/pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe", size = 6113124, upload-time = "2025-07-01T09:15:07.358Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/5a/6fec59b1dfb619234f7636d4157d11fb4e196caeee220232a8d2ec48488d/pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c", size = 6747186, upload-time = "2025-07-01T09:15:09.317Z" },
+    { url = "https://files.pythonhosted.org/packages/49/6b/00187a044f98255225f172de653941e61da37104a9ea60e4f6887717e2b5/pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788", size = 6277546, upload-time = "2025-07-01T09:15:11.311Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/5c/6caaba7e261c0d75bab23be79f1d06b5ad2a2ae49f028ccec801b0e853d6/pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31", size = 6985102, upload-time = "2025-07-01T09:15:13.164Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/7e/b623008460c09a0cb38263c93b828c666493caee2eb34ff67f778b87e58c/pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e", size = 2424803, upload-time = "2025-07-01T09:15:15.695Z" },
+    { url = "https://files.pythonhosted.org/packages/73/f4/04905af42837292ed86cb1b1dabe03dce1edc008ef14c473c5c7e1443c5d/pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12", size = 5278520, upload-time = "2025-07-01T09:15:17.429Z" },
+    { url = "https://files.pythonhosted.org/packages/41/b0/33d79e377a336247df6348a54e6d2a2b85d644ca202555e3faa0cf811ecc/pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a", size = 4686116, upload-time = "2025-07-01T09:15:19.423Z" },
+    { url = "https://files.pythonhosted.org/packages/49/2d/ed8bc0ab219ae8768f529597d9509d184fe8a6c4741a6864fea334d25f3f/pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632", size = 5864597, upload-time = "2025-07-03T13:10:38.404Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/3d/b932bb4225c80b58dfadaca9d42d08d0b7064d2d1791b6a237f87f661834/pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673", size = 7638246, upload-time = "2025-07-03T13:10:44.987Z" },
+    { url = "https://files.pythonhosted.org/packages/09/b5/0487044b7c096f1b48f0d7ad416472c02e0e4bf6919541b111efd3cae690/pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027", size = 5973336, upload-time = "2025-07-01T09:15:21.237Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/2d/524f9318f6cbfcc79fbc004801ea6b607ec3f843977652fdee4857a7568b/pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77", size = 6642699, upload-time = "2025-07-01T09:15:23.186Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/d2/a9a4f280c6aefedce1e8f615baaa5474e0701d86dd6f1dede66726462bbd/pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874", size = 6083789, upload-time = "2025-07-01T09:15:25.1Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/54/86b0cd9dbb683a9d5e960b66c7379e821a19be4ac5810e2e5a715c09a0c0/pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a", size = 6720386, upload-time = "2025-07-01T09:15:27.378Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/95/88efcaf384c3588e24259c4203b909cbe3e3c2d887af9e938c2022c9dd48/pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214", size = 6370911, upload-time = "2025-07-01T09:15:29.294Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/cc/934e5820850ec5eb107e7b1a72dd278140731c669f396110ebc326f2a503/pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635", size = 7117383, upload-time = "2025-07-01T09:15:31.128Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/e9/9c0a616a71da2a5d163aa37405e8aced9a906d574b4a214bede134e731bc/pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6", size = 2511385, upload-time = "2025-07-01T09:15:33.328Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/33/c88376898aff369658b225262cd4f2659b13e8178e7534df9e6e1fa289f6/pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae", size = 5281129, upload-time = "2025-07-01T09:15:35.194Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/70/d376247fb36f1844b42910911c83a02d5544ebd2a8bad9efcc0f707ea774/pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653", size = 4689580, upload-time = "2025-07-01T09:15:37.114Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/1c/537e930496149fbac69efd2fc4329035bbe2e5475b4165439e3be9cb183b/pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6", size = 5902860, upload-time = "2025-07-03T13:10:50.248Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/57/80f53264954dcefeebcf9dae6e3eb1daea1b488f0be8b8fef12f79a3eb10/pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36", size = 7670694, upload-time = "2025-07-03T13:10:56.432Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ff/4727d3b71a8578b4587d9c276e90efad2d6fe0335fd76742a6da08132e8c/pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b", size = 6005888, upload-time = "2025-07-01T09:15:39.436Z" },
+    { url = "https://files.pythonhosted.org/packages/05/ae/716592277934f85d3be51d7256f3636672d7b1abfafdc42cf3f8cbd4b4c8/pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477", size = 6670330, upload-time = "2025-07-01T09:15:41.269Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/bb/7fe6cddcc8827b01b1a9766f5fdeb7418680744f9082035bdbabecf1d57f/pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50", size = 6114089, upload-time = "2025-07-01T09:15:43.13Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/f5/06bfaa444c8e80f1a8e4bff98da9c83b37b5be3b1deaa43d27a0db37ef84/pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b", size = 6748206, upload-time = "2025-07-01T09:15:44.937Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" },
+    { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
+]
+
 [[package]]
 name = "pluggy"
 version = "1.6.0"
@@ -727,6 +831,63 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" },
 ]
 
+[[package]]
+name = "pydantic"
+version = "2.11.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-types" },
+    { name = "pydantic-core" },
+    { name = "typing-extensions" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.33.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" },
+    { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" },
+    { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" },
+    { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" },
+    { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" },
+    { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" },
+    { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" },
+    { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" },
+    { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" },
+    { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
+]
+
 [[package]]
 name = "pygments"
 version = "2.19.2"
@@ -1012,6 +1173,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" },
 ]
 
+[[package]]
+name = "typing-inspection"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },
+]
+
 [[package]]
 name = "ujson"
 version = "5.10.0"
diff --git a/worker.py b/worker.py
index d534c0d..35d440a 100644
--- a/worker.py
+++ b/worker.py
@@ -1,11 +1,14 @@
-from typing import Any
-from enhancers.information_enhancer import InformationEnhancerFactory
 import asyncio
+from typing import Any
+
 from sanic import Sanic
-from parsers.document_parser import DocumentParserFactory
-from config import settings
 
-async def worker(app: Sanic) -> list[dict[str, Any]]:
+from enhancers.information_enhancer import InformationEnhancerFactory
+from parsers.document_parser import DocumentData
+from parsers.document_parser_factory import DocumentParserFactory
+
+
+async def worker(app: Sanic) -> dict[str, Any]:
     # 使用工厂获取合适的解析器
     parser_factory = DocumentParserFactory()
     enhancer_factory = InformationEnhancerFactory()
@@ -16,17 +19,21 @@ async def worker(app: Sanic) -> list[dict[str, Any]]:
             await asyncio.sleep(1)
             continue
         file_path = task.get("file_path")
-        information_list = await parser_factory.parse_document(file_path)
+        parse_result = await parser_factory.parse_document(file_path)
+        if not parse_result.success:
+            continue
+        chunk_list = parse_result.document
         # 控制并发数量，防止访问量过大导致失败
         SEMAPHORE_LIMIT = 10  # 可根据实际情况调整
         semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
 
-        async def enhance_with_semaphore(info):
+        async def enhance_with_semaphore(chunk: DocumentData, semaphore: asyncio.Semaphore) -> DocumentData:
             async with semaphore:
-                return await enhancer_factory.enhance_information(info)
+                return await enhancer_factory.enhance_information(chunk)
 
         # 并发增强每个信息
-        enhanced_information_list = await asyncio.gather(
-            *(enhance_with_semaphore(info) for info in information_list)
+        enhanced_chunk_list = await asyncio.gather(
+            *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list)
         )
-        return enhanced_information_list
\ No newline at end of file
+        parse_result.document = enhanced_chunk_list
+        return parse_result.model_dump(mode="json")

From ec7b394cc81aa1af58764dc0945591caa540ec86 Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Tue, 19 Aug 2025 06:57:53 +0000
Subject: [PATCH 05/10] fix: satisfy ruff

---
 parsers/excel_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py
index 4a33e5c..b34419b 100644
--- a/parsers/excel_parser.py
+++ b/parsers/excel_parser.py
@@ -184,7 +184,7 @@ def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]:
         sheet_images: list[DocumentData] = []
 
         images = getattr(sheet, '_images', None)
-        if not images or not isinstance(images, (list, tuple)):
+        if not images or not isinstance(images, list|tuple):
             return sheet_images
 
         # 收集图片信息

From 8d9874e78b9e74ceda05bfb29fa05b4169a7e3b5 Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Tue, 19 Aug 2025 11:07:34 +0000
Subject: [PATCH 06/10] feat: add document data models

---
 parsers/base_models.py             | 45 ++++++++++++++++++++++++++++
 parsers/document_parser.py         | 48 +++++++++++++-----------------
 parsers/document_parser_factory.py | 29 ------------------
 parsers/excel_parser.py            | 40 +++++++++++--------------
 storage/s3_client.py               |  4 +++
 tests/test_excel_parser.py         |  8 ++---
 6 files changed, 91 insertions(+), 83 deletions(-)
 create mode 100644 parsers/base_models.py
 delete mode 100644 parsers/document_parser_factory.py

diff --git a/parsers/base_models.py b/parsers/base_models.py
new file mode 100644
index 0000000..c9fece7
--- /dev/null
+++ b/parsers/base_models.py
@@ -0,0 +1,45 @@
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+class ChunkType(str, Enum):
+    """块类型"""
+    TEXT = "text"
+    IMAGE = "image"
+    TABLE = "table"
+    FORMULA = "formula"
+
+class ChunkData(BaseModel):
+    """块数据类"""
+    type: ChunkType
+    name: str
+    content: str = ""
+    description: str = ""
+
+class DocumentData(BaseModel):
+    """解析结果类"""
+    title: str = ""
+    chunks: list[ChunkData] = []
+    processing_time: float = 0
+    success: bool
+    error_message: str | None = None
+
+class DocumentParser(ABC):
+    """文档解析器基类"""
+
+    def __init__(self) -> None:
+        self.supported_formats: list[str] = []
+
+    @abstractmethod
+    async def parse(self, file_path: str) -> DocumentData:
+        """解析文档"""
+        pass
+
+    @abstractmethod
+    def can_parse(self, file_path: str) -> bool:
+        """检查是否可以解析该文件"""
+        pass
diff --git a/parsers/document_parser.py b/parsers/document_parser.py
index 1ea37f3..e582873 100644
--- a/parsers/document_parser.py
+++ b/parsers/document_parser.py
@@ -1,37 +1,29 @@
 import logging
-from abc import ABC, abstractmethod
 
-from pydantic import BaseModel
+from parsers.document_parser import DocumentParser, ParseResult
+from parsers.excel_parser import ExcelParser
 
 logger = logging.getLogger(__name__)
 
-class DocumentData(BaseModel):
-    """文档数据类"""
-    type: str
-    name: str
-    content: str
-    description: str
-
-class ParseResult(BaseModel):
-    """解析结果类"""
-    title: str
-    document: list[DocumentData]
-    processing_time: float
-    success: bool
-    error_message: str | None = None
-
-class DocumentParser(ABC):
-    """文档解析器基类"""
+class DocumentParserFactory:
+    """文档解析器工厂"""
 
     def __init__(self) -> None:
-        self.supported_formats: list[str] = []
-
-    @abstractmethod
-    async def parse(self, file_path: str) -> ParseResult:
+        self.parsers: list[DocumentParser] = [
+            ExcelParser()
+        ]
+
+    def get_parser(self, file_path: str) -> DocumentParser | None:
+        """根据文件路径获取合适的解析器"""
+        for parser in self.parsers:
+            if parser.can_parse(file_path):
+                return parser
+        return None
+
+    async def parse_document(self, file_path: str) -> ParseResult:
         """解析文档"""
-        pass
+        parser = self.get_parser(file_path)
+        if not parser:
+            raise ValueError(f"不支持的文件格式: {file_path}")
 
-    @abstractmethod
-    def can_parse(self, file_path: str) -> bool:
-        """检查是否可以解析该文件"""
-        pass
+        return await parser.parse(file_path)
diff --git a/parsers/document_parser_factory.py b/parsers/document_parser_factory.py
deleted file mode 100644
index e582873..0000000
--- a/parsers/document_parser_factory.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import logging
-
-from parsers.document_parser import DocumentParser, ParseResult
-from parsers.excel_parser import ExcelParser
-
-logger = logging.getLogger(__name__)
-
-class DocumentParserFactory:
-    """文档解析器工厂"""
-
-    def __init__(self) -> None:
-        self.parsers: list[DocumentParser] = [
-            ExcelParser()
-        ]
-
-    def get_parser(self, file_path: str) -> DocumentParser | None:
-        """根据文件路径获取合适的解析器"""
-        for parser in self.parsers:
-            if parser.can_parse(file_path):
-                return parser
-        return None
-
-    async def parse_document(self, file_path: str) -> ParseResult:
-        """解析文档"""
-        parser = self.get_parser(file_path)
-        if not parser:
-            raise ValueError(f"不支持的文件格式: {file_path}")
-
-        return await parser.parse(file_path)
diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py
index b34419b..eb7cd00 100644
--- a/parsers/excel_parser.py
+++ b/parsers/excel_parser.py
@@ -19,7 +19,7 @@
 from openpyxl.workbook.workbook import Workbook  # type: ignore
 from openpyxl.worksheet.worksheet import Worksheet  # type: ignore
 
-from parsers.document_parser import DocumentData, DocumentParser, ParseResult
+from parsers.base_models import ChunkData, ChunkType, DocumentData, DocumentParser
 
 # 忽略 openpyxl 的特定警告
 warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
@@ -57,7 +57,7 @@ def __init__(self, config: ExcelParseConfig | None = None):
         self.image_index: int = 0
         self.supported_formats: list[str] = ['.xlsx', '.xls']
 
-    async def parse(self, excel_path: str) -> ParseResult:
+    async def parse(self, excel_path: str) -> DocumentData:
         """
         解析Excel文件并保存结果
 
@@ -79,19 +79,16 @@ async def parse(self, excel_path: str) -> ParseResult:
             processing_time = time.time() - start_time
 
 
-            return ParseResult(
+            return DocumentData(
                 title=title,
-                document=document_data,
+                chunks=document_data,
                 processing_time=processing_time,
                 success=True
             )
 
         except Exception as e:
             processing_time = time.time() - start_time
-            return ParseResult(
-                title="",
-                document=[],
-                processing_time=processing_time,
+            return DocumentData(
                 success=False,
                 error_message=str(e)
             )
@@ -106,7 +103,7 @@ def can_parse(self, file_path: str) -> bool:
         """
         return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
 
-    def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]:
+    def _excel_to_json(self, excel_path: str) -> tuple[str, list[ChunkData]]:
         """
         将Excel文件转换为JSON格式
         Args:
@@ -118,7 +115,7 @@ def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]:
         title = Path(excel_path).stem
 
         # 初始化内容列表和图片列表
-        content: list[DocumentData] = []
+        content: list[ChunkData] = []
         self.image_index = 0
 
         # 加载工作簿
@@ -129,8 +126,8 @@ def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]:
             sheet = workbook[sheet_name]
 
             # 添加工作表标题
-            content.append(DocumentData(
-                type="text",
+            content.append(ChunkData(
+                type=ChunkType.TEXT,
                 name=sheet_name,
                 content=f"工作表 {sheet_index + 1}: {sheet_name}",
                 description="工作表标题"
@@ -142,16 +139,16 @@ def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]:
 
             # 处理表格数据
             table_content = self._extract_table_data(sheet)
-            content.append(DocumentData(
-                type="table",
+            content.append(ChunkData(
+                type=ChunkType.TABLE,
                 name="表格",
                 content=json.dumps(table_content),
                 description="表格"
             ))
 
         # 添加结束文本
-        content.append(DocumentData(
-            type="text",
+        content.append(ChunkData(
+            type=ChunkType.TEXT,
             name="结束文本",
             content="",
             description="结束文本"
@@ -173,7 +170,7 @@ def _load_workbook(self, excel_path: str) -> Workbook:
             keep_vba=self.config.keep_vba
         )
 
-    def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]:
+    def _extract_sheet_images(self, sheet: Worksheet) -> list[ChunkData]:
         """
         提取工作表中的图片
         Args:
@@ -181,7 +178,7 @@ def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]:
         Returns:
             List[DocumentData]: 图片信息列表
         """
-        sheet_images: list[DocumentData] = []
+        sheet_images: list[ChunkData] = []
 
         images = getattr(sheet, '_images', None)
         if not images or not isinstance(images, list|tuple):
@@ -202,7 +199,7 @@ def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]:
 
         return sheet_images
 
-    def _process_image_object(self, img_obj: Image) -> DocumentData | None:
+    def _process_image_object(self, img_obj: Image) -> ChunkData | None:
         """
         处理单个图片对象
         Args:
@@ -222,8 +219,8 @@ def _process_image_object(self, img_obj: Image) -> DocumentData | None:
             uri = f"data:image/{img_format};base64,{base64_encoded}"
 
             # 创建图片信息
-            image_info = DocumentData(
-                type="image",
+            image_info = ChunkData(
+                type=ChunkType.IMAGE,
                 name=f"#/pictures/{self.image_index}",
                 content=uri,
                 description=self.config.image_description_placeholder
@@ -374,7 +371,6 @@ def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int,
 
         return all_rows
 
-
     def _save_json(self, data: Any, file_path: Path) -> None:
         """
         保存JSON数据到文件
diff --git a/storage/s3_client.py b/storage/s3_client.py
index da19e34..7cd41ba 100644
--- a/storage/s3_client.py
+++ b/storage/s3_client.py
@@ -40,6 +40,10 @@ def __init__(self) -> None:
         super().__init__("S3 client not initialized")
 
 
+class S3Error(Exception):
+    """S3操作异常"""
+    pass
+
 class AsyncS3Client:
     def __init__(self,
                  endpoint_url: str | None,
diff --git a/tests/test_excel_parser.py b/tests/test_excel_parser.py
index fa5a464..b8f0775 100644
--- a/tests/test_excel_parser.py
+++ b/tests/test_excel_parser.py
@@ -7,7 +7,7 @@
 from openpyxl.drawing.image import Image as XLImage
 
 from parsers.excel_parser import ExcelParser
-from parsers.document_parser import DocumentData
+from parsers.base_models import ChunkData
 
 
 @pytest.mark.asyncio
@@ -49,7 +49,7 @@ async def test_parse_real_basic_and_image():
 
             assert result.success is True
             # 内容：Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格、结束文本
-            content = result.document
+            content = result.chunks
             assert len(content) == 6
 
             # 校验顺序与关键字段
@@ -91,12 +91,12 @@ async def test_parse_real_merged_cells():
         result = await parser.parse(xlsx_path)
 
         assert result.success is True
-        content = result.document
+        content = result.chunks
         # 结构：标题、表格、结束文本
         assert len(content) == 3
 
         # 表格在索引1
-        table_chunk: DocumentData = content[1]
+        table_chunk: ChunkData = content[1]
         assert table_chunk.type == "table"
 
         import json as _json

From b2da9e91cebe32f83716c45680bab41235fadd28 Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Tue, 19 Aug 2025 11:26:02 +0000
Subject: [PATCH 07/10] fix: satisfy mypy

---
 enhancers/information_enhancer.py | 20 ++++++++++----------
 parsers/document_parser.py        |  4 ++--
 worker.py                         | 10 +++++-----
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py
index 696dd10..aeb8758 100644
--- a/enhancers/information_enhancer.py
+++ b/enhancers/information_enhancer.py
@@ -1,33 +1,33 @@
 from abc import ABC, abstractmethod
 
-from parsers.document_parser import DocumentData
+from parsers.base_models import ChunkData, ChunkType
 
 
 class InformationEnhancer(ABC):
     """信息增强器基类"""
     @abstractmethod
-    async def enhance(self, information: DocumentData) -> DocumentData:
+    async def enhance(self, information: ChunkData) -> ChunkData:
         """增强信息"""
         pass
 
 class TableInformationEnhancer(InformationEnhancer):
     """表格信息增强器"""
 
-    async def enhance(self, information: DocumentData) -> DocumentData:
+    async def enhance(self, information: ChunkData) -> ChunkData:
         """增强信息"""
         return information
 
 class FormulasInformationEnhancer(InformationEnhancer):
     """公式信息增强器"""
 
-    async def enhance(self, information: DocumentData) -> DocumentData:
+    async def enhance(self, information: ChunkData) -> ChunkData:
         """增强信息"""
         return information
 
 class ImageInformationEnhancer(InformationEnhancer):
     """图片信息增强器"""
 
-    async def enhance(self, information: DocumentData) -> DocumentData:
+    async def enhance(self, information: ChunkData) -> ChunkData:
         """增强信息"""
         return information
 
@@ -41,19 +41,19 @@ def __init__(self) -> None:
             ImageInformationEnhancer()
         ]
 
-    def get_enhancer(self, information: DocumentData) -> InformationEnhancer|None:
+    def get_enhancer(self, information: ChunkData) -> InformationEnhancer|None:
         """获取信息增强器"""
         match information.type:
-            case "table":
+            case ChunkType.TABLE:
                 return TableInformationEnhancer()
-            case "formulas":
+            case ChunkType.FORMULA:
                 return FormulasInformationEnhancer()
-            case "image":
+            case ChunkType.IMAGE:
                 return ImageInformationEnhancer()
             case _:
                 return None
 
-    async def enhance_information(self, information: DocumentData) -> DocumentData:
+    async def enhance_information(self, information: ChunkData) -> ChunkData:
         """增强信息"""
         enhancer = self.get_enhancer(information)
         if not enhancer:
diff --git a/parsers/document_parser.py b/parsers/document_parser.py
index e582873..782d019 100644
--- a/parsers/document_parser.py
+++ b/parsers/document_parser.py
@@ -1,6 +1,6 @@
 import logging
 
-from parsers.document_parser import DocumentParser, ParseResult
+from parsers.base_models import DocumentData, DocumentParser
 from parsers.excel_parser import ExcelParser
 
 logger = logging.getLogger(__name__)
@@ -20,7 +20,7 @@ def get_parser(self, file_path: str) -> DocumentParser | None:
                 return parser
         return None
 
-    async def parse_document(self, file_path: str) -> ParseResult:
+    async def parse_document(self, file_path: str) -> DocumentData:
         """解析文档"""
         parser = self.get_parser(file_path)
         if not parser:
diff --git a/worker.py b/worker.py
index 35d440a..1400cb9 100644
--- a/worker.py
+++ b/worker.py
@@ -4,8 +4,8 @@
 from sanic import Sanic
 
 from enhancers.information_enhancer import InformationEnhancerFactory
-from parsers.document_parser import DocumentData
-from parsers.document_parser_factory import DocumentParserFactory
+from parsers.document_parser import DocumentParserFactory
+from parsers.base_models import ChunkData
 
 
 async def worker(app: Sanic) -> dict[str, Any]:
@@ -22,12 +22,12 @@ async def worker(app: Sanic) -> dict[str, Any]:
         parse_result = await parser_factory.parse_document(file_path)
         if not parse_result.success:
             continue
-        chunk_list = parse_result.document
+        chunk_list = parse_result.chunks
         # 控制并发数量，防止访问量过大导致失败
         SEMAPHORE_LIMIT = 10  # 可根据实际情况调整
         semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
 
-        async def enhance_with_semaphore(chunk: DocumentData, semaphore: asyncio.Semaphore) -> DocumentData:
+        async def enhance_with_semaphore(chunk: ChunkData, semaphore: asyncio.Semaphore) -> ChunkData:
             async with semaphore:
                 return await enhancer_factory.enhance_information(chunk)
 
@@ -35,5 +35,5 @@ async def enhance_with_semaphore(chunk: DocumentData, semaphore: asyncio.Semapho
         enhanced_chunk_list = await asyncio.gather(
             *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list)
         )
-        parse_result.document = enhanced_chunk_list
+        parse_result.chunks = enhanced_chunk_list
         return parse_result.model_dump(mode="json")

From da016a14ae4606ce35852d425d6a32fac3c9861f Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Tue, 19 Aug 2025 12:09:08 +0000
Subject: [PATCH 08/10] fix: conform to ruff import ordering

---
 worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/worker.py b/worker.py
index 1400cb9..9fc6507 100644
--- a/worker.py
+++ b/worker.py
@@ -4,8 +4,8 @@
 from sanic import Sanic
 
 from enhancers.information_enhancer import InformationEnhancerFactory
-from parsers.document_parser import DocumentParserFactory
 from parsers.base_models import ChunkData
+from parsers.document_parser import DocumentParserFactory
 
 
 async def worker(app: Sanic) -> dict[str, Any]:

From 940ef02f7923e0af3c1259ff383cbb602d08ea89 Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Tue, 19 Aug 2025 13:40:28 +0000
Subject: [PATCH 09/10] fix: refine document data model

---
 parsers/base_models.py     |  17 +++-
 parsers/excel_parser.py    | 168 ++++++++++++++++---------------------
 tests/test_excel_parser.py |  44 +++++-----
 worker.py                  |   7 +-
 4 files changed, 113 insertions(+), 123 deletions(-)

diff --git a/parsers/base_models.py b/parsers/base_models.py
index c9fece7..6a68289 100644
--- a/parsers/base_models.py
+++ b/parsers/base_models.py
@@ -1,11 +1,13 @@
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum
+from typing import Any
 
 from pydantic import BaseModel
 
 logger = logging.getLogger(__name__)
 
+
 class ChunkType(str, Enum):
     """块类型"""
     TEXT = "text"
@@ -13,17 +15,28 @@ class ChunkType(str, Enum):
     TABLE = "table"
     FORMULA = "formula"
 
+class TableDataItem(BaseModel):
+    """表格数据类"""
+    rows: int  # 行数
+    columns: int  # 列数
+    row_headers: list[Any] = []  # 行头
+    column_headers: list[Any] = []  # 列头
+    data: list[list[str]] = []  # 数据
+
 class ChunkData(BaseModel):
     """块数据类"""
     type: ChunkType
     name: str
-    content: str = ""
+    content: str|TableDataItem = ""
     description: str = ""
 
 class DocumentData(BaseModel):
     """解析结果类"""
     title: str = ""
-    chunks: list[ChunkData] = []
+    texts: list[ChunkData] = []
+    tables: list[ChunkData] = []
+    images: list[ChunkData] = []
+    formulas: list[ChunkData] = []
     processing_time: float = 0
     success: bool
     error_message: str | None = None
diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py
index eb7cd00..e333faa 100644
--- a/parsers/excel_parser.py
+++ b/parsers/excel_parser.py
@@ -19,7 +19,13 @@
 from openpyxl.workbook.workbook import Workbook  # type: ignore
 from openpyxl.worksheet.worksheet import Worksheet  # type: ignore
 
-from parsers.base_models import ChunkData, ChunkType, DocumentData, DocumentParser
+from parsers.base_models import (
+    ChunkData,
+    ChunkType,
+    DocumentData,
+    DocumentParser,
+    TableDataItem,
+)
 
 # 忽略 openpyxl 的特定警告
 warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
@@ -57,105 +63,77 @@ def __init__(self, config: ExcelParseConfig | None = None):
         self.image_index: int = 0
         self.supported_formats: list[str] = ['.xlsx', '.xls']
 
-    async def parse(self, excel_path: str) -> DocumentData:
+    def can_parse(self, file_path: str) -> bool:
+        """
+        验证输入文件
+        Args:
+            file_path: 文件路径
+        Returns:
+            bool: 是否支持解析
         """
-        解析Excel文件并保存结果
+        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
 
+    async def parse(self, excel_path: str) -> DocumentData:
+        """
+        Parse an Excel workbook into a DocumentData (texts, tables, images).
         Args:
             excel_path: Excel文件路径
-            output_dir: 输出目录路径
         Returns:
-            ParseResult: 解析结果对象
-        Raises:
-            ExcelParseError: 当解析失败时抛出
+            DocumentData: 文档数据
         """
+        # Start timing so processing_time is reported on both success and failure
         start_time = time.time()
 
         try:
-            # 转换Excel到JSON格式
-            title, document_data = self._excel_to_json(excel_path)
-
-            # 计算处理时间
+            # 初始化内容列表和图片列表
+            texts: list[ChunkData] = []
+            tables: list[ChunkData] = []
+            images: list[ChunkData] = []
+
+            # 加载工作簿
+            workbook = self._load_workbook(excel_path)
+
+            # 处理每个工作表
+            for sheet_index, sheet_name in enumerate(workbook.sheetnames):
+                sheet = workbook[sheet_name]
+
+                # 添加工作表标题
+                texts.append(ChunkData(
+                    type=ChunkType.TEXT,
+                    name=sheet_name,
+                    content=f"工作表 {sheet_index + 1}: {sheet_name}",
+                    description="工作表标题"
+                ))
+
+                # 处理图片
+                sheet_images = self._extract_sheet_images(sheet)
+                images.extend(sheet_images)
+
+                # 处理表格数据
+                table_content = self._extract_table_data(sheet)
+                tables.append(ChunkData(
+                    type=ChunkType.TABLE,
+                    name="表格",
+                    content=table_content,
+                    description="表格"
+                ))
             processing_time = time.time() - start_time
-
-
             return DocumentData(
-                title=title,
-                chunks=document_data,
+                title=Path(excel_path).stem,
+                texts=texts,
+                tables=tables,
+                images=images,
                 processing_time=processing_time,
                 success=True
             )
-
         except Exception as e:
             processing_time = time.time() - start_time
             return DocumentData(
                 success=False,
-                error_message=str(e)
+                error_message=str(e),
+                processing_time=processing_time
             )
 
-    def can_parse(self, file_path: str) -> bool:
-        """
-        验证输入文件
-        Args:
-            file_path: 文件路径
-        Returns:
-            bool: 是否支持解析
-        """
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    def _excel_to_json(self, excel_path: str) -> tuple[str, list[ChunkData]]:
-        """
-        将Excel文件转换为JSON格式
-        Args:
-            excel_path: Excel文件路径
-        Returns:
-            DocumentData: 文档数据
-        """
-        # 获取文件名作为标题（不带扩展名）
-        title = Path(excel_path).stem
-
-        # 初始化内容列表和图片列表
-        content: list[ChunkData] = []
-        self.image_index = 0
-
-        # 加载工作簿
-        workbook = self._load_workbook(excel_path)
-
-        # 处理每个工作表
-        for sheet_index, sheet_name in enumerate(workbook.sheetnames):
-            sheet = workbook[sheet_name]
-
-            # 添加工作表标题
-            content.append(ChunkData(
-                type=ChunkType.TEXT,
-                name=sheet_name,
-                content=f"工作表 {sheet_index + 1}: {sheet_name}",
-                description="工作表标题"
-            ))
-
-            # 处理图片
-            sheet_images = self._extract_sheet_images(sheet)
-            content.extend(sheet_images)
-
-            # 处理表格数据
-            table_content = self._extract_table_data(sheet)
-            content.append(ChunkData(
-                type=ChunkType.TABLE,
-                name="表格",
-                content=json.dumps(table_content),
-                description="表格"
-            ))
-
-        # 添加结束文本
-        content.append(ChunkData(
-            type=ChunkType.TEXT,
-            name="结束文本",
-            content="",
-            description="结束文本"
-        ))
-
-        return title, content
-
     def _load_workbook(self, excel_path: str) -> Workbook:
         """
         加载Excel工作簿
@@ -250,13 +228,13 @@ def _get_image_format(self, img_obj: Image) -> str:
             return img_format
         return self.config.default_image_format
 
-    def _process_cell_value(self, cell_value: Any) -> CellValue:
+    def _process_cell_value(self, cell_value: Any) -> str:
         """
         预处理单元格值，将datetime对象转换为字符串
         Args:
             cell_value: 原始单元格值
         Returns:
-            CellValue: 处理后的单元格值
+            str: 处理后的单元格值
         """
         if cell_value is None:
             return ""
@@ -269,14 +247,10 @@ def _process_cell_value(self, cell_value: Any) -> CellValue:
         if isinstance(cell_value, date):
             return cell_value.strftime("%Y-%m-%d")
 
-        # 处理其他类型
-        if isinstance(cell_value, str|int|float|bool):
-            return cell_value
-
         # 对于其他类型，转换为字符串
         return str(cell_value)
 
-    def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]:
+    def _extract_table_data(self, sheet: Worksheet) -> TableDataItem:
         """
         提取表格数据
         Args:
@@ -295,16 +269,14 @@ def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]:
         # 提取所有数据
         all_rows = self._extract_all_rows(sheet, max_row, max_col, merged_map)
 
-        return {
-            "dimensions": {
-                "rows": len(all_rows),
-                "columns": max_col
-            },
-            "headers": all_rows[0] if all_rows else [],
-            "data": all_rows[1:] if len(all_rows) > 1 else []
-        }
+        return TableDataItem(
+            rows=len(all_rows),
+            columns=max_col,
+            row_headers=all_rows[0] if all_rows else [],
+            data=all_rows[1:] if len(all_rows) > 1 else []
+        )
 
-    def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], CellValue]:
+    def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], str]:
         """
         获取合并单元格信息
         Args:
@@ -323,7 +295,7 @@ def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int],
                 merged_ranges[(min_row, min_col, max_row, max_col)] = merged_value
         return merged_ranges
 
-    def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], CellValue]:
+    def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], str]:
         """
         创建合并单元格映射
         Args:
@@ -342,7 +314,7 @@ def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict
         return merged_map
 
     def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int,
-                          merged_map: dict[tuple[int, int], CellValue]) -> TableData:
+                          merged_map: dict[tuple[int, int], str]) -> list[list[str]]:
         """
         提取所有行数据
         Args:
diff --git a/tests/test_excel_parser.py b/tests/test_excel_parser.py
index b8f0775..7a72ba8 100644
--- a/tests/test_excel_parser.py
+++ b/tests/test_excel_parser.py
@@ -48,20 +48,22 @@ async def test_parse_real_basic_and_image():
             result = await parser.parse(xlsx_path)
 
             assert result.success is True
-            # 内容：Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格、结束文本
-            content = result.chunks
-            assert len(content) == 6
+            # 内容：Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格
+            content = result.tables
+            assert len(content) == 2
+
+            assert len(result.images) == 1
+            assert len(result.texts) == 2
 
             # 校验顺序与关键字段
-            assert content[0].type == "text" and content[0].name == "Sheet1"
-            assert content[1].type == "image"
-            assert content[1].name == "#/pictures/0"
-            assert content[1].content.startswith("data:image/")
-
-            assert content[2].type == "table"
-            assert content[3].type == "text" and content[3].name == "Sheet2"
-            assert content[4].type == "table"
-            assert content[5].type == "text" and content[5].name == "结束文本"
+            assert result.texts[0].type == "text" and result.texts[0].name == "Sheet1"
+            assert result.images[0].type == "image"
+            assert result.images[0].name == "#/pictures/0"
+            assert result.images[0].content.startswith("data:image/")
+
+            assert result.tables[0].type == "table"
+            assert result.texts[1].type == "text" and result.texts[1].name == "Sheet2"
+            assert result.tables[1].type == "table"
         finally:
             os.remove(xlsx_path)
     finally:
@@ -91,19 +93,19 @@ async def test_parse_real_merged_cells():
         result = await parser.parse(xlsx_path)
 
         assert result.success is True
-        content = result.chunks
-        # 结构：标题、表格、结束文本
-        assert len(content) == 3
+        # 结构：标题、表格
+        assert len(result.tables) == 1
+        assert len(result.texts) == 1
 
         # 表格在索引1
-        table_chunk: ChunkData = content[1]
+        table_chunk: ChunkData = result.tables[0]
         assert table_chunk.type == "table"
 
-        import json as _json
-        payload = _json.loads(table_chunk.content)
-        assert payload["headers"] == ["Merged Header", "Merged Header"]
-        assert payload["dimensions"]["rows"] == 2
-        assert payload["dimensions"]["columns"] == 2
+        payload = table_chunk.content
+        assert payload.row_headers == ["Merged Header", "Merged Header"]
+        assert payload.data == [["Value1", "Value2"]]
+        assert payload.rows == 2
+        assert payload.columns == 2
     finally:
         os.remove(xlsx_path)
 
diff --git a/worker.py b/worker.py
index 9fc6507..d59cea6 100644
--- a/worker.py
+++ b/worker.py
@@ -22,7 +22,7 @@ async def worker(app: Sanic) -> dict[str, Any]:
         parse_result = await parser_factory.parse_document(file_path)
         if not parse_result.success:
             continue
-        chunk_list = parse_result.chunks
+        chunk_list = parse_result.texts + parse_result.tables + parse_result.images + parse_result.formulas
         # 控制并发数量，防止访问量过大导致失败
         SEMAPHORE_LIMIT = 10  # 可根据实际情况调整
         semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
@@ -35,5 +35,8 @@ async def enhance_with_semaphore(chunk: ChunkData, semaphore: asyncio.Semaphore)
         enhanced_chunk_list = await asyncio.gather(
             *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list)
         )
-        parse_result.chunks = enhanced_chunk_list
+        parse_result.texts = enhanced_chunk_list[:len(parse_result.texts)]
+        parse_result.tables = enhanced_chunk_list[len(parse_result.texts):len(parse_result.texts) + len(parse_result.tables)]
+        parse_result.images = enhanced_chunk_list[len(parse_result.texts) + len(parse_result.tables):len(parse_result.texts) + len(parse_result.tables) + len(parse_result.images)]
+        parse_result.formulas = enhanced_chunk_list[len(parse_result.texts) + len(parse_result.tables) + len(parse_result.images):]
         return parse_result.model_dump(mode="json")

From d5e71b0b3b88365b7bd73a665aa021255a4429b9 Mon Sep 17 00:00:00 2001
From: liningping <728359849@qq.com>
Date: Tue, 19 Aug 2025 13:59:53 +0000
Subject: [PATCH 10/10] fix: add Field

---
 parsers/base_models.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/parsers/base_models.py b/parsers/base_models.py
index 6a68289..6b6b310 100644
--- a/parsers/base_models.py
+++ b/parsers/base_models.py
@@ -3,7 +3,7 @@
 from enum import Enum
 from typing import Any
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 
@@ -19,9 +19,9 @@ class TableDataItem(BaseModel):
     """表格数据类"""
     rows: int  # 行数
     columns: int  # 列数
-    row_headers: list[Any] = []  # 行头
-    column_headers: list[Any] = []  # 列头
-    data: list[list[str]] = []  # 数据
+    row_headers: list[Any] = Field(default_factory=list)  # 行头
+    column_headers: list[Any] = Field(default_factory=list)  # 列头
+    data: list[list[str]] = Field(default_factory=list)  # 数据
 
 class ChunkData(BaseModel):
     """块数据类"""
@@ -33,10 +33,10 @@ class ChunkData(BaseModel):
 class DocumentData(BaseModel):
     """解析结果类"""
     title: str = ""
-    texts: list[ChunkData] = []
-    tables: list[ChunkData] = []
-    images: list[ChunkData] = []
-    formulas: list[ChunkData] = []
+    texts: list[ChunkData] = Field(default_factory=list)
+    tables: list[ChunkData] = Field(default_factory=list)
+    images: list[ChunkData] = Field(default_factory=list)
+    formulas: list[ChunkData] = Field(default_factory=list)
     processing_time: float = 0
     success: bool
     error_message: str | None = None
@@ -45,7 +45,7 @@ class DocumentParser(ABC):
     """文档解析器基类"""
 
     def __init__(self) -> None:
-        self.supported_formats: list[str] = []
+        self.supported_formats: list[str] = []  # plain list: pydantic Field() is only valid as a BaseModel class-level default
 
     @abstractmethod
     async def parse(self, file_path: str) -> DocumentData:
@@ -55,4 +55,4 @@ async def parse(self, file_path: str) -> DocumentData:
     @abstractmethod
     def can_parse(self, file_path: str) -> bool:
         """检查是否可以解析该文件"""
-        pass
+        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)