From 53b6f94df79a56e0ec509c416b08d089521b4b85 Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Wed, 13 Aug 2025 09:09:11 +0000 Subject: [PATCH 01/10] feat: add s3 unit test --- test_s3_auth.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 test_s3_auth.py diff --git a/test_s3_auth.py b/test_s3_auth.py new file mode 100644 index 0000000..445cfda --- /dev/null +++ b/test_s3_auth.py @@ -0,0 +1,83 @@ +import boto3 +from botocore.exceptions import ClientError, NoCredentialsError + +def test_s3_authentication(): + """测试S3认证""" + + # 配置信息 + endpoint_url = "https://s3.kclab.cloud" + access_key = "B2sE0fKv1Y1lOpZtge5u" + secret_key = "JRx4MbrMbfUfQjIEm7speT52kQgjt0zafvlAuYxW" + bucket_name = "bucket-78134-shared" + region = "us-east-1" + + print("=== S3认证测试 ===") + print(f"端点: {endpoint_url}") + print(f"存储桶: {bucket_name}") + print(f"区域: {region}") + print() + + try: + # 创建S3客户端 + s3_client = boto3.client( + 's3', + endpoint_url=endpoint_url, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + region_name=region + ) + + print("1. 测试列出存储桶...") + response = s3_client.list_buckets() + buckets = [b['Name'] for b in response['Buckets']] + print(f" 可用存储桶: {buckets}") + + if bucket_name in buckets: + print(f"\n2. 测试访问存储桶: {bucket_name}") + try: + response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=5) + if 'Contents' in response: + print(f" 存储桶内容数量: {len(response['Contents'])}") + for obj in response['Contents'][:3]: + print(f" - {obj['Key']} ({obj['Size']} bytes)") + else: + print(" 存储桶为空") + except ClientError as e: + print(f" 访问存储桶失败: {e}") + + print(f"\n3. 测试上传小文件...") + test_content = b"Hello, S3 Test!" + test_key = "test/connectivity_test.txt" + + try: + s3_client.put_object( + Bucket=bucket_name, + Key=test_key, + Body=test_content, + ContentType='text/plain' + ) + print(" 上传成功!") + + # 测试下载 + response = s3_client.get_object(Bucket=bucket_name, Key=test_key) + downloaded_content = response['Body'].read() + print(f" 下载成功: {downloaded_content}") + + # 清理测试文件 + s3_client.delete_object(Bucket=bucket_name, Key=test_key) + print(" 清理成功!") + + except ClientError as e: + print(f" 上传/下载失败: {e}") + + except NoCredentialsError: + print("错误: 无法找到AWS凭证") + except ClientError as e: + error_code = e.response['Error']['Code'] + error_message = e.response['Error']['Message'] + print(f"错误: {error_code} - {error_message}") + except Exception as e: + print(f"未知错误: {e}") + +if __name__ == "__main__": + test_s3_authentication() From 6812ef68b298a741fa017fd7f4026dc88a72c6ed Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Wed, 13 Aug 2025 10:07:51 +0000 Subject: [PATCH 02/10] fix: comfortable typing check --- test_s3_auth.py | 83 ------------------------------------------------- 1 file changed, 83 deletions(-) delete mode 100644 test_s3_auth.py diff --git a/test_s3_auth.py b/test_s3_auth.py deleted file mode 100644 index 445cfda..0000000 --- a/test_s3_auth.py +++ /dev/null @@ -1,83 +0,0 @@ -import boto3 -from botocore.exceptions import ClientError, NoCredentialsError - -def test_s3_authentication(): - """测试S3认证""" - - # 配置信息 - endpoint_url = "https://s3.kclab.cloud" - access_key = "B2sE0fKv1Y1lOpZtge5u" - secret_key = "JRx4MbrMbfUfQjIEm7speT52kQgjt0zafvlAuYxW" - bucket_name = "bucket-78134-shared" - region = "us-east-1" - - print("=== S3认证测试 ===") - print(f"端点: {endpoint_url}") - print(f"存储桶: {bucket_name}") - print(f"区域: {region}") - print() - - try: - # 创建S3客户端 - s3_client = boto3.client( - 's3', - endpoint_url=endpoint_url, - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - region_name=region - ) - - print("1. 测试列出存储桶...") - response = s3_client.list_buckets() - buckets = [b['Name'] for b in response['Buckets']] - print(f" 可用存储桶: {buckets}") - - if bucket_name in buckets: - print(f"\n2. 测试访问存储桶: {bucket_name}") - try: - response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=5) - if 'Contents' in response: - print(f" 存储桶内容数量: {len(response['Contents'])}") - for obj in response['Contents'][:3]: - print(f" - {obj['Key']} ({obj['Size']} bytes)") - else: - print(" 存储桶为空") - except ClientError as e: - print(f" 访问存储桶失败: {e}") - - print(f"\n3. 测试上传小文件...") - test_content = b"Hello, S3 Test!" - test_key = "test/connectivity_test.txt" - - try: - s3_client.put_object( - Bucket=bucket_name, - Key=test_key, - Body=test_content, - ContentType='text/plain' - ) - print(" 上传成功!") - - # 测试下载 - response = s3_client.get_object(Bucket=bucket_name, Key=test_key) - downloaded_content = response['Body'].read() - print(f" 下载成功: {downloaded_content}") - - # 清理测试文件 - s3_client.delete_object(Bucket=bucket_name, Key=test_key) - print(" 清理成功!") - - except ClientError as e: - print(f" 上传/下载失败: {e}") - - except NoCredentialsError: - print("错误: 无法找到AWS凭证") - except ClientError as e: - error_code = e.response['Error']['Code'] - error_message = e.response['Error']['Message'] - print(f"错误: {error_code} - {error_message}") - except Exception as e: - print(f"未知错误: {e}") - -if __name__ == "__main__": - test_s3_authentication() From 2c1b8056fd4061b59396e443f633e635ce4ab306 Mon Sep 17 00:00:00 2001 From: ningpingli <728359849@qq.com> Date: Thu, 14 Aug 2025 21:04:27 +0800 Subject: [PATCH 03/10] feat: add information enhancer --- enhancers/__init__.py | 0 enhancers/information_enhancer.py | 73 +++++++++++++++++++++++++++++++ parsers/document_parser.py | 24 +++++----- tests/test_integration.py | 9 ++-- worker.py | 32 ++++++++++++++ 5 files changed, 122 insertions(+), 16 deletions(-) create mode 100644 enhancers/__init__.py create mode 100644 enhancers/information_enhancer.py create mode 100644 worker.py diff --git a/enhancers/__init__.py b/enhancers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py new file mode 100644 index 0000000..82659a1 --- /dev/null +++ b/enhancers/information_enhancer.py @@ -0,0 +1,73 @@ +from abc import ABC, abstractmethod +from typing import Any + +class InformationEnhancer(ABC): + """信息增强器基类""" + + def __init__(self) -> None: + pass + + @abstractmethod + async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + """增强信息""" + pass + +class TableInformationEnhancer(InformationEnhancer): + """表格信息增强器""" + + def __init__(self) -> None: + super().__init__() + + async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + """增强信息""" + return information + +class FormulasInformationEnhancer(InformationEnhancer): + """公式信息增强器""" + + def __init__(self) -> None: + super().__init__() + + async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + """增强信息""" + return information + +class ImageInformationEnhancer(InformationEnhancer): + """图片信息增强器""" + + def __init__(self) -> None: + super().__init__() + + async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + """增强信息""" + return information + +class InformationEnhancerFactory: + """信息增强器工厂""" + + def __init__(self) -> None: + self.enhancers = [ + TableInformationEnhancer(), + FormulasInformationEnhancer(), + ImageInformationEnhancer() + ] + + def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer: + """获取信息增强器""" + match information.get("type"): + case "table": + return TableInformationEnhancer() + case "formulas": + return FormulasInformationEnhancer() + case "image": + return ImageInformationEnhancer() + case _: + return None + + async def enhance_information(self, information: dict[str, Any]) -> dict[str, Any]: + """增强信息""" + enhancer = self.get_enhancer(information) + if not enhancer: + raise ValueError(f"不支持的模态类型: {information.get('type')}") + return await enhancer.enhance(information) + diff --git a/parsers/document_parser.py b/parsers/document_parser.py index b203da3..69ad1d8 100644 --- a/parsers/document_parser.py +++ b/parsers/document_parser.py @@ -11,7 +11,7 @@ def __init__(self) -> None: self.supported_formats: list[str] = [] @abstractmethod - async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]: + async def parse(self, file_path: str) -> list[dict[str, Any]]: """解析文档""" pass @@ -30,19 +30,19 @@ def __init__(self) -> None: def can_parse(self, file_path: str) -> bool: return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]: + async def parse(self, file_path: str) -> list[dict[str, Any]]: """解析PDF文档""" try: # 这里应该使用mineru库 # 暂时返回模拟数据 - return { + return [{ "type": "pdf", "text": f"PDF文档内容: {file_path}", "pages": 1, "images": [], "tables": [], "formulas": [] - } + }] except Exception as e: logger.error(f"解析PDF失败: {e}") raise @@ -57,19 +57,19 @@ def __init__(self) -> None: def can_parse(self, file_path: str) -> bool: return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]: + async def parse(self, file_path: str) -> list[dict[str, Any]]: """解析DOCX文档""" try: # 这里应该使用docling库 # 暂时返回模拟数据 - return { + return [{ "type": "docx", "text": f"DOCX文档内容: {file_path}", "pages": 1, "images": [], "tables": [], "formulas": [] - } + }] except Exception as e: logger.error(f"解析DOCX失败: {e}") raise @@ -84,19 +84,19 @@ def __init__(self) -> None: def can_parse(self, file_path: str) -> bool: return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]: + async def parse(self, file_path: str) -> list[dict[str, Any]]: """解析XLSX文档""" try: # 这里应该使用docling库 # 暂时返回模拟数据 - return { + return [{ "type": "xlsx", "text": f"XLSX文档内容: {file_path}", "pages": 1, "images": [], "tables": [], "formulas": [] - } + }] except Exception as e: logger.error(f"解析XLSX失败: {e}") raise @@ -118,10 +118,10 @@ def get_parser(self, file_path: str) -> DocumentParser | None: return parser return None - async def parse_document(self, file_path: str, file_content: bytes) -> dict[str, Any]: + async def parse_document(self, file_path: str) -> list[dict[str, Any]]: """解析文档""" parser = self.get_parser(file_path) if not parser: raise ValueError(f"不支持的文件格式: {file_path}") - return await parser.parse(file_path, file_content) + return await parser.parse(file_path) diff --git a/tests/test_integration.py b/tests/test_integration.py index 973a2fa..020c90d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -19,7 +19,7 @@ class TestRedisIntegration: async def redis_client(self): """获取真实的Redis客户端""" try: - client = await get_redis_client("redis://localhost:6379") + client = await get_redis_client(os.getenv("REDIS_URL")) # 清理测试数据 await client.flushdb() yield client @@ -223,7 +223,7 @@ async def system_components(self): """获取所有系统组件""" try: # Redis客户端 - redis_client = await get_redis_client("redis://localhost:6379") + redis_client = await get_redis_client(os.getenv("REDIS_URL")) await redis_client.flushdb() # 任务管理器 @@ -286,12 +286,12 @@ async def test_document_processing_workflow(self, system_components): assert retrieved_task["task_id"] == "workflow-test-123" # 6. 模拟文档解析结果 - parsing_result = { + parsing_result = [{ "text": "Chemical document content with formulas and structures.", "formulas": ["H2O", "CO2", "CH4"], "structures": ["molecular_structure_1.png"], "confidence": 0.92 - } + }] # 7. 更新任务状态和结果 update_success = await task_manager.update_task_status( @@ -314,6 +314,7 @@ async def test_document_processing_workflow(self, system_components): # 环境检查装饰器 def requires_redis(func): """需要Redis服务的装饰器""" + print(os.getenv("REDIS_URL")) return pytest.mark.skipif( not os.getenv("REDIS_URL") and not os.getenv("REDIS_ENABLED", "false").lower() == "true", reason="需要Redis服务" diff --git a/worker.py b/worker.py new file mode 100644 index 0000000..d534c0d --- /dev/null +++ b/worker.py @@ -0,0 +1,32 @@ +from typing import Any +from enhancers.information_enhancer import InformationEnhancerFactory +import asyncio +from sanic import Sanic +from parsers.document_parser import DocumentParserFactory +from config import settings + +async def worker(app: Sanic) -> list[dict[str, Any]]: + # 使用工厂获取合适的解析器 + parser_factory = DocumentParserFactory() + enhancer_factory = InformationEnhancerFactory() + redis = app.ctx.redis + while True: + task = await redis.get_task() + if not task: + await asyncio.sleep(1) + continue + file_path = task.get("file_path") + information_list = await parser_factory.parse_document(file_path) + # 控制并发数量,防止访问量过大导致失败 + SEMAPHORE_LIMIT = 10 # 可根据实际情况调整 + semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT) + + async def enhance_with_semaphore(info): + async with semaphore: + return await enhancer_factory.enhance_information(info) + + # 并发增强每个信息 + enhanced_information_list = await asyncio.gather( + *(enhance_with_semaphore(info) for info in information_list) + ) + return enhanced_information_list \ No newline at end of file From 480a717b548f9d892eaa5a4bb1665c256786526e Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Mon, 18 Aug 2025 10:00:07 +0000 Subject: [PATCH 04/10] feat: add excel parser test --- enhancers/information_enhancer.py | 37 +-- parsers/document_parser.py | 126 ++-------- parsers/document_parser_factory.py | 29 +++ parsers/excel_parser.py | 386 +++++++++++++++++++++++++++++ pyproject.toml | 5 +- tests/test_excel_parser.py | 110 ++++++++ uv.lock | 173 +++++++++++++ worker.py | 29 ++- 8 files changed, 751 insertions(+), 144 deletions(-) create mode 100644 parsers/document_parser_factory.py create mode 100644 parsers/excel_parser.py create mode 100644 tests/test_excel_parser.py diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py index 82659a1..696dd10 100644 --- a/enhancers/information_enhancer.py +++ b/enhancers/information_enhancer.py @@ -1,44 +1,33 @@ from abc import ABC, abstractmethod -from typing import Any -class InformationEnhancer(ABC): - """信息增强器基类""" +from parsers.document_parser import DocumentData - def __init__(self) -> None: - pass +class InformationEnhancer(ABC): + """信息增强器基类""" @abstractmethod - async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + async def enhance(self, information: DocumentData) -> DocumentData: """增强信息""" pass class TableInformationEnhancer(InformationEnhancer): """表格信息增强器""" - def __init__(self) -> None: - super().__init__() - - async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + async def enhance(self, information: DocumentData) -> DocumentData: """增强信息""" return information class FormulasInformationEnhancer(InformationEnhancer): """公式信息增强器""" - def __init__(self) -> None: - super().__init__() - - async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + async def enhance(self, information: DocumentData) -> DocumentData: """增强信息""" return information class ImageInformationEnhancer(InformationEnhancer): """图片信息增强器""" - def __init__(self) -> None: - super().__init__() - - async def enhance(self, information: dict[str, Any]) -> dict[str, Any]: + async def enhance(self, information: DocumentData) -> DocumentData: """增强信息""" return information @@ -52,9 +41,9 @@ def __init__(self) -> None: ImageInformationEnhancer() ] - def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer: + def get_enhancer(self, information: DocumentData) -> InformationEnhancer|None: """获取信息增强器""" - match information.get("type"): + match information.type: case "table": return TableInformationEnhancer() case "formulas": @@ -63,11 +52,11 @@ def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer: return ImageInformationEnhancer() case _: return None - - async def enhance_information(self, information: dict[str, Any]) -> dict[str, Any]: + + async def enhance_information(self, information: DocumentData) -> DocumentData: """增强信息""" enhancer = self.get_enhancer(information) if not enhancer: - raise ValueError(f"不支持的模态类型: {information.get('type')}") + raise ValueError(f"不支持的模态类型: {information.type}") return await enhancer.enhance(information) - + diff --git a/parsers/document_parser.py b/parsers/document_parser.py index 69ad1d8..1ea37f3 100644 --- a/parsers/document_parser.py +++ b/parsers/document_parser.py @@ -1,9 +1,25 @@ import logging from abc import ABC, abstractmethod -from typing import Any + +from pydantic import BaseModel logger = logging.getLogger(__name__) +class DocumentData(BaseModel): + """文档数据类""" + type: str + name: str + content: str + description: str + +class ParseResult(BaseModel): + """解析结果类""" + title: str + document: list[DocumentData] + processing_time: float + success: bool + error_message: str | None = None + class DocumentParser(ABC): """文档解析器基类""" @@ -11,7 +27,7 @@ def __init__(self) -> None: self.supported_formats: list[str] = [] @abstractmethod - async def parse(self, file_path: str) -> list[dict[str, Any]]: + async def parse(self, file_path: str) -> ParseResult: """解析文档""" pass @@ -19,109 +35,3 @@ async def parse(self, file_path: str) -> list[dict[str, Any]]: def can_parse(self, file_path: str) -> bool: """检查是否可以解析该文件""" pass - -class PDFParser(DocumentParser): - """PDF文档解析器""" - - def __init__(self) -> None: - super().__init__() - self.supported_formats = ['.pdf'] - - def can_parse(self, file_path: str) -> bool: - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - - async def parse(self, file_path: str) -> list[dict[str, Any]]: - """解析PDF文档""" - try: - # 这里应该使用mineru库 - # 暂时返回模拟数据 - return [{ - "type": "pdf", - "text": f"PDF文档内容: {file_path}", - "pages": 1, - "images": [], - "tables": [], - "formulas": [] - }] - except Exception as e: - logger.error(f"解析PDF失败: {e}") - raise - -class DOCXParser(DocumentParser): - """DOCX文档解析器""" - - def __init__(self) -> None: - super().__init__() - self.supported_formats = ['.docx','.doc'] - - def can_parse(self, file_path: str) -> bool: - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - - async def parse(self, file_path: str) -> list[dict[str, Any]]: - """解析DOCX文档""" - try: - # 这里应该使用docling库 - # 暂时返回模拟数据 - return [{ - "type": "docx", - "text": f"DOCX文档内容: {file_path}", - "pages": 1, - "images": [], - "tables": [], - "formulas": [] - }] - except Exception as e: - logger.error(f"解析DOCX失败: {e}") - raise - -class XLSXParser(DocumentParser): - """XLSX文档解析器""" - - def __init__(self) -> None: - super().__init__() - self.supported_formats = ['.xlsx'] - - def can_parse(self, file_path: str) -> bool: - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - - async def parse(self, file_path: str) -> list[dict[str, Any]]: - """解析XLSX文档""" - try: - # 这里应该使用docling库 - # 暂时返回模拟数据 - return [{ - "type": "xlsx", - "text": f"XLSX文档内容: {file_path}", - "pages": 1, - "images": [], - "tables": [], - "formulas": [] - }] - except Exception as e: - logger.error(f"解析XLSX失败: {e}") - raise - -class DocumentParserFactory: - """文档解析器工厂""" - - def __init__(self) -> None: - self.parsers = [ - PDFParser(), - DOCXParser(), - XLSXParser() - ] - - def get_parser(self, file_path: str) -> DocumentParser | None: - """根据文件路径获取合适的解析器""" - for parser in self.parsers: - if parser.can_parse(file_path): - return parser - return None - - async def parse_document(self, file_path: str) -> list[dict[str, Any]]: - """解析文档""" - parser = self.get_parser(file_path) - if not parser: - raise ValueError(f"不支持的文件格式: {file_path}") - - return await parser.parse(file_path) diff --git a/parsers/document_parser_factory.py b/parsers/document_parser_factory.py new file mode 100644 index 0000000..e582873 --- /dev/null +++ b/parsers/document_parser_factory.py @@ -0,0 +1,29 @@ +import logging + +from parsers.document_parser import DocumentParser, ParseResult +from parsers.excel_parser import ExcelParser + +logger = logging.getLogger(__name__) + +class DocumentParserFactory: + """文档解析器工厂""" + + def __init__(self) -> None: + self.parsers: list[DocumentParser] = [ + ExcelParser() + ] + + def get_parser(self, file_path: str) -> DocumentParser | None: + """根据文件路径获取合适的解析器""" + for parser in self.parsers: + if parser.can_parse(file_path): + return parser + return None + + async def parse_document(self, file_path: str) -> ParseResult: + """解析文档""" + parser = self.get_parser(file_path) + if not parser: + raise ValueError(f"不支持的文件格式: {file_path}") + + return await parser.parse(file_path) diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py new file mode 100644 index 0000000..4a33e5c --- /dev/null +++ b/parsers/excel_parser.py @@ -0,0 +1,386 @@ +""" +Excel文件解析器模块 + +该模块提供将Excel文件转换为结构化JSON格式的功能, +包括表格数据提取和图片处理。 +""" + +import base64 +import json +import time +import warnings +from dataclasses import dataclass +from datetime import date, datetime +from pathlib import Path +from typing import Any + +from openpyxl import load_workbook # type: ignore +from openpyxl.drawing.image import Image # type: ignore +from openpyxl.workbook.workbook import Workbook # type: ignore +from openpyxl.worksheet.worksheet import Worksheet # type: ignore + +from parsers.document_parser import DocumentData, DocumentParser, ParseResult + +# 忽略 openpyxl 的特定警告 +warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl') + +# 类型别名 +CellValue = str|int|float|bool|None|datetime|date +TableData = list[list[CellValue]] + + +@dataclass +class ExcelParseConfig: + """Excel解析配置类""" + data_only: bool = True + keep_vba: bool = False + default_image_format: str = 'png' + image_description_placeholder: str = "[待生成]" + + +class ExcelParseError(Exception): + """Excel解析异常类""" + pass + + +class ExcelParser(DocumentParser): + """Excel文件解析器类""" + + def __init__(self, config: ExcelParseConfig | None = None): + """ + 初始化Excel解析器 + Args: + config: 解析配置,如果为None则使用默认配置 + """ + super().__init__() + self.config: ExcelParseConfig = config or ExcelParseConfig() + self.image_index: int = 0 + self.supported_formats: list[str] = ['.xlsx', '.xls'] + + async def parse(self, excel_path: str) -> ParseResult: + """ + 解析Excel文件并保存结果 + + Args: + excel_path: Excel文件路径 + output_dir: 输出目录路径 + Returns: + ParseResult: 解析结果对象 + Raises: + ExcelParseError: 当解析失败时抛出 + """ + start_time = time.time() + + try: + # 转换Excel到JSON格式 + title, document_data = self._excel_to_json(excel_path) + + # 计算处理时间 + processing_time = time.time() - start_time + + + return ParseResult( + title=title, + document=document_data, + processing_time=processing_time, + success=True + ) + + except Exception as e: + processing_time = time.time() - start_time + return ParseResult( + title="", + document=[], + processing_time=processing_time, + success=False, + error_message=str(e) + ) + + def can_parse(self, file_path: str) -> bool: + """ + 验证输入文件 + Args: + file_path: 文件路径 + Returns: + bool: 是否支持解析 + """ + return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) + + def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]: + """ + 将Excel文件转换为JSON格式 + Args: + excel_path: Excel文件路径 + Returns: + DocumentData: 文档数据 + """ + # 获取文件名作为标题(不带扩展名) + title = Path(excel_path).stem + + # 初始化内容列表和图片列表 + content: list[DocumentData] = [] + self.image_index = 0 + + # 加载工作簿 + workbook = self._load_workbook(excel_path) + + # 处理每个工作表 + for sheet_index, sheet_name in enumerate(workbook.sheetnames): + sheet = workbook[sheet_name] + + # 添加工作表标题 + content.append(DocumentData( + type="text", + name=sheet_name, + content=f"工作表 {sheet_index + 1}: {sheet_name}", + description="工作表标题" + )) + + # 处理图片 + sheet_images = self._extract_sheet_images(sheet) + content.extend(sheet_images) + + # 处理表格数据 + table_content = self._extract_table_data(sheet) + content.append(DocumentData( + type="table", + name="表格", + content=json.dumps(table_content), + description="表格" + )) + + # 添加结束文本 + content.append(DocumentData( + type="text", + name="结束文本", + content="", + description="结束文本" + )) + + return title, content + + def _load_workbook(self, excel_path: str) -> Workbook: + """ + 加载Excel工作簿 + Args: + excel_path: Excel文件路径 + Returns: + Workbook: 加载的工作簿对象 + """ + return load_workbook( + excel_path, + data_only=self.config.data_only, + keep_vba=self.config.keep_vba + ) + + def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]: + """ + 提取工作表中的图片 + Args: + sheet: 工作表对象 + Returns: + List[DocumentData]: 图片信息列表 + """ + sheet_images: list[DocumentData] = [] + + images = getattr(sheet, '_images', None) + if not images or not isinstance(images, (list, tuple)): + return sheet_images + + # 收集图片信息 + for img_obj in images: + if not isinstance(img_obj, Image): + continue + + try: + image_info = self._process_image_object(img_obj) + if image_info: + sheet_images.append(image_info) + except Exception as e: + print(f"处理图片失败: {str(e)}") + continue + + return sheet_images + + def _process_image_object(self, img_obj: Image) -> DocumentData | None: + """ + 处理单个图片对象 + Args: + img_obj: 图片对象 + Returns: + Optional[DocumentData]: 图片信息,处理失败时返回None + """ + try: + # 获取图片数据 + img_data = img_obj._data() + + # 获取图片格式 + img_format = self._get_image_format(img_obj) + + # 生成Base64编码 + base64_encoded = base64.b64encode(img_data).decode('utf-8') + uri = f"data:image/{img_format};base64,{base64_encoded}" + + # 创建图片信息 + image_info = DocumentData( + type="image", + name=f"#/pictures/{self.image_index}", + content=uri, + description=self.config.image_description_placeholder + ) + + self.image_index += 1 + return image_info + + except Exception as e: + print(f"处理图片对象失败: {str(e)}") + return None + + def _get_image_format(self, img_obj: Image) -> str: + """ + 获取图片格式 + Args: + img_obj: 图片对象 + Returns: + str: 图片格式 + """ + fmt = getattr(img_obj, 'format', None) + if isinstance(fmt, str) and fmt: + img_format: str = fmt.lower() + # 处理JPEG格式的别名 + if img_format == 'jpeg': + img_format = 'jpg' + return img_format + return self.config.default_image_format + + def _process_cell_value(self, cell_value: Any) -> CellValue: + """ + 预处理单元格值,将datetime对象转换为字符串 + Args: + cell_value: 原始单元格值 + Returns: + CellValue: 处理后的单元格值 + """ + if cell_value is None: + return "" + + # 处理datetime对象,转换为ISO格式字符串 + if isinstance(cell_value, datetime): + return cell_value.strftime("%Y-%m-%d %H:%M:%S") + + # 处理date对象,转换为日期字符串 + if isinstance(cell_value, date): + return cell_value.strftime("%Y-%m-%d") + + # 处理其他类型 + if isinstance(cell_value, str|int|float|bool): + return cell_value + + # 对于其他类型,转换为字符串 + return str(cell_value) + + def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]: + """ + 提取表格数据 + Args: + sheet: 工作表对象 + Returns: + Dict[str, Any]: 表格数据 + """ + # 获取合并单元格信息 + merged_ranges = self._get_merged_cells(sheet) + merged_map = self._create_merged_cell_map(merged_ranges, sheet) + + # 计算表格维度 + max_row = sheet.max_row + max_col = sheet.max_column + + # 提取所有数据 + all_rows = self._extract_all_rows(sheet, max_row, max_col, merged_map) + + return { + "dimensions": { + "rows": len(all_rows), + "columns": max_col + }, + "headers": all_rows[0] if all_rows else [], + "data": all_rows[1:] if len(all_rows) > 1 else [] + } + + def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], CellValue]: + """ + 获取合并单元格信息 + Args: + sheet: 工作表对象 + Returns: + Dict: 合并单元格映射 + """ + merged_ranges = {} + if sheet.merged_cells: + for merged_range in sheet.merged_cells.ranges: + min_row, min_col, max_row, max_col = ( + merged_range.min_row, merged_range.min_col, + merged_range.max_row, merged_range.max_col + ) + merged_value = sheet.cell(row=min_row, column=min_col).value + merged_ranges[(min_row, min_col, max_row, max_col)] = merged_value + return merged_ranges + + def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], CellValue]: + """ + 创建合并单元格映射 + Args: + merged_ranges: 合并单元格范围 + sheet: 工作表对象 + Returns: + Dict: 合并单元格映射 + """ + merged_map = {} + for (min_row, min_col, max_row, max_col), value in merged_ranges.items(): + # 预处理合并单元格的值 + processed_value = self._process_cell_value(value) + for row_idx in range(min_row, max_row + 1): + for col_idx in range(min_col, max_col + 1): + merged_map[(row_idx, col_idx)] = processed_value + return merged_map + + def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int, + merged_map: dict[tuple[int, int], CellValue]) -> TableData: + """ + 提取所有行数据 + Args: + sheet: 工作表对象 + max_row: 最大行数 + max_col: 最大列数 + merged_map: 合并单元格映射 + Returns: + TableData: 所有行数据 + """ + all_rows = [] + for row_idx in range(1, max_row + 1): + row_data = [] + for col_idx in range(1, max_col + 1): + # 检查是否是合并单元格 + if (row_idx, col_idx) in merged_map: + cell_value = merged_map[(row_idx, col_idx)] + else: + cell = sheet.cell(row=row_idx, column=col_idx) + cell_value = cell.value + + # 预处理单元格值 + processed_value = self._process_cell_value(cell_value) + row_data.append(processed_value) + all_rows.append(row_data) + + return all_rows + + + def _save_json(self, data: Any, file_path: Path) -> None: + """ + 保存JSON数据到文件 + Args: + data: 要保存的数据 + file_path: 文件路径 + """ + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/pyproject.toml b/pyproject.toml index f6b4cbd..23446a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "dotenv>=0.9.9", "aiobotocore>=2.24.0", "redis>=6.4.0", + "openpyxl>=3.1.5", + "pydantic>=2.11.7", ] [dependency-groups] @@ -25,7 +27,8 @@ all-dev = [ dev = [ "pytest>=8.4.1", "pytest-cov>=6.2.1", - "pytest-asyncio>=0.23.0", # 添加异步测试支持 + "pytest-asyncio>=0.23.0", # 添加异步测试支持 + "pillow>=11.3.0", ] lint = [ "bandit>=1.8.6", diff --git a/tests/test_excel_parser.py b/tests/test_excel_parser.py new file mode 100644 index 0000000..fa5a464 --- /dev/null +++ b/tests/test_excel_parser.py @@ -0,0 +1,110 @@ +import base64 +import os +import tempfile + +import pytest +from openpyxl import Workbook +from openpyxl.drawing.image import Image as XLImage + +from parsers.excel_parser import ExcelParser +from parsers.document_parser import DocumentData + + +@pytest.mark.asyncio +async def test_parse_real_basic_and_image(): + # 准备临时PNG图片(1x1透明像素) + one_px_png_b64 = ( + b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAusB9Y2oU5wAAAAASUVORK5CYII=" + ) + png_fd, png_path = tempfile.mkstemp(suffix=".png") + try: + with os.fdopen(png_fd, "wb") as f: + f.write(base64.b64decode(one_px_png_b64)) + + # 构建包含图片与两个工作表的真实Excel文件 + wb = Workbook() + ws1 = wb.active + ws1.title = "Sheet1" + # 表头与数据 + ws1["A1"] = "Header1" + ws1["B1"] = "Header2" + ws1["A2"] = "Data1" + ws1["B2"] = "Data2" + # 插入图片 + img = XLImage(png_path) + ws1.add_image(img, "A5") + + # 第二个工作表 + ws2 = wb.create_sheet("Sheet2") + ws2["A1"] = "Single Header" + ws2["A2"] = "Single Data" + + xlsx_fd, xlsx_path = tempfile.mkstemp(suffix=".xlsx") + os.close(xlsx_fd) + wb.save(xlsx_path) + + try: + parser = ExcelParser() + result = await parser.parse(xlsx_path) + + assert result.success is True + # 内容:Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格、结束文本 + content = result.document + assert len(content) == 6 + + # 校验顺序与关键字段 + assert content[0].type == "text" and content[0].name == "Sheet1" + assert content[1].type == "image" + assert content[1].name == "#/pictures/0" + assert content[1].content.startswith("data:image/") + + assert content[2].type == "table" + assert content[3].type == "text" and content[3].name == "Sheet2" + assert content[4].type == "table" + assert content[5].type == "text" and content[5].name == "结束文本" + finally: + os.remove(xlsx_path) + finally: + os.remove(png_path) + + +@pytest.mark.asyncio +async def test_parse_real_merged_cells(): + # 构建包含合并单元格的真实Excel文件 + wb = Workbook() + ws = wb.active + ws.title = "Sheet1" + + # 合并 A1:B1 并设置值 + ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=2) + ws["A1"] = "Merged Header" + # 填充下一行数据 + ws["A2"] = "Value1" + ws["B2"] = "Value2" + + xlsx_fd, xlsx_path = tempfile.mkstemp(suffix=".xlsx") + os.close(xlsx_fd) + wb.save(xlsx_path) + + try: + parser = ExcelParser() + result = await parser.parse(xlsx_path) + + assert result.success is True + content = result.document + # 结构:标题、表格、结束文本 + assert len(content) == 3 + + # 表格在索引1 + table_chunk: DocumentData = content[1] + assert table_chunk.type == "table" + + import json as _json + payload = _json.loads(table_chunk.content) + assert payload["headers"] == ["Merged Header", "Merged Header"] + assert payload["dimensions"]["rows"] == 2 + assert payload["dimensions"]["columns"] == 2 + finally: + os.remove(xlsx_path) + + diff --git a/uv.lock b/uv.lock index d15f45a..6ee4cdd 100644 --- a/uv.lock +++ b/uv.lock @@ -111,6 +111,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + [[package]] name = "attrs" version = "25.3.0" @@ -309,6 +318,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/b7/545d2c10c1fc15e48653c91efde329a790f2eecfbbf2bd16003b5db2bab0/dotenv-0.9.9-py2.py3-none-any.whl", hash = "sha256:29cf74a087b31dafdb5a446b6d7e11cbce8ed2741540e2339c69fbef92c94ce9", size = 1892, upload-time = "2025-02-19T22:15:01.647Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "frozenlist" version = "1.7.0" @@ -457,6 +475,8 @@ dependencies = [ { name = "aiofiles" }, { name = "aiohttp" }, { name = "dotenv" }, + { name = "openpyxl" }, + { name = "pydantic" }, { name = "redis" }, { name = "sanic" }, { name = "sanic-ext" }, @@ -467,6 +487,7 @@ all-dev = [ { name = "bandit" }, { name = "detect-secrets" }, { name = "mypy" }, + { name = "pillow" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, @@ -474,6 +495,7 @@ all-dev = [ { name = "types-boto3" }, ] dev = [ + { name = "pillow" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, @@ -496,6 +518,8 @@ requires-dist = [ { name = "aiofiles", specifier = ">=23.2.1" }, { name = "aiohttp", specifier = ">=3.9.0" }, { name = "dotenv", specifier = ">=0.9.9" }, + { name = "openpyxl", specifier = ">=3.1.5" }, + { name = "pydantic", specifier = ">=2.11.7" }, { name = "redis", specifier = ">=6.4.0" }, { name = "sanic", specifier = ">=23.12.0" }, { name = "sanic-ext", specifier = ">=23.12.0" }, @@ -506,6 +530,7 @@ all-dev = [ { name = "bandit", specifier = ">=1.8.6" }, { name = "detect-secrets", specifier = ">=1.5.0" }, { name = "mypy", specifier = ">=1.17.1" }, + { name = "pillow", specifier = ">=11.3.0" }, { name = "pytest", specifier = ">=8.4.1" }, { name = "pytest-asyncio", specifier = ">=0.23.0" }, { name = "pytest-cov", specifier = ">=6.2.1" }, @@ -513,6 +538,7 @@ all-dev = [ { name = "types-boto3", specifier = ">=1.40.0,<2.0.0" }, ] dev = [ + { name = "pillow", specifier = ">=11.3.0" }, { name = "pytest", specifier = ">=8.4.1" }, { name = "pytest-asyncio", specifier = ">=0.23.0" }, { name = "pytest-cov", specifier = ">=6.2.1" }, @@ -631,6 +657,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -661,6 +699,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/ac/684d71315abc7b1214d59304e23a982472967f6bf4bde5a98f1503f648dc/pbr-6.1.1-py2.py3-none-any.whl", hash = "sha256:38d4daea5d9fa63b3f626131b9d34947fd0c8be9b05a29276870580050a25a76", size = 108997, upload-time = "2025-02-04T14:28:03.168Z" }, ] +[[package]] +name = "pillow" +version = "11.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/d0d6dea55cd152ce3d6767bb38a8fc10e33796ba4ba210cbab9354b6d238/pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523", size = 47113069, upload-time = "2025-07-01T09:16:30.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/fe/1bc9b3ee13f68487a99ac9529968035cca2f0a51ec36892060edcc51d06a/pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4", size = 5278800, upload-time = "2025-07-01T09:14:17.648Z" }, + { url = "https://files.pythonhosted.org/packages/2c/32/7e2ac19b5713657384cec55f89065fb306b06af008cfd87e572035b27119/pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69", size = 4686296, upload-time = "2025-07-01T09:14:19.828Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1e/b9e12bbe6e4c2220effebc09ea0923a07a6da1e1f1bfbc8d7d29a01ce32b/pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d", size = 5871726, upload-time = "2025-07-03T13:10:04.448Z" }, + { url = "https://files.pythonhosted.org/packages/8d/33/e9200d2bd7ba00dc3ddb78df1198a6e80d7669cce6c2bdbeb2530a74ec58/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6", size = 7644652, upload-time = "2025-07-03T13:10:10.391Z" }, + { url = "https://files.pythonhosted.org/packages/41/f1/6f2427a26fc683e00d985bc391bdd76d8dd4e92fac33d841127eb8fb2313/pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7", size = 5977787, upload-time = "2025-07-01T09:14:21.63Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c9/06dd4a38974e24f932ff5f98ea3c546ce3f8c995d3f0985f8e5ba48bba19/pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024", size = 6645236, upload-time = "2025-07-01T09:14:23.321Z" }, + { url = "https://files.pythonhosted.org/packages/40/e7/848f69fb79843b3d91241bad658e9c14f39a32f71a301bcd1d139416d1be/pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809", size = 6086950, upload-time = "2025-07-01T09:14:25.237Z" }, + { url = "https://files.pythonhosted.org/packages/0b/1a/7cff92e695a2a29ac1958c2a0fe4c0b2393b60aac13b04a4fe2735cad52d/pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d", size = 6723358, upload-time = "2025-07-01T09:14:27.053Z" }, + { url = "https://files.pythonhosted.org/packages/26/7d/73699ad77895f69edff76b0f332acc3d497f22f5d75e5360f78cbcaff248/pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149", size = 6275079, upload-time = "2025-07-01T09:14:30.104Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d", size = 6986324, upload-time = "2025-07-01T09:14:31.899Z" }, + { url = "https://files.pythonhosted.org/packages/16/8f/b13447d1bf0b1f7467ce7d86f6e6edf66c0ad7cf44cf5c87a37f9bed9936/pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542", size = 2423067, upload-time = "2025-07-01T09:14:33.709Z" }, + { url = "https://files.pythonhosted.org/packages/1e/93/0952f2ed8db3a5a4c7a11f91965d6184ebc8cd7cbb7941a260d5f018cd2d/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd", size = 2128328, upload-time = "2025-07-01T09:14:35.276Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e8/100c3d114b1a0bf4042f27e0f87d2f25e857e838034e98ca98fe7b8c0a9c/pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8", size = 2170652, upload-time = "2025-07-01T09:14:37.203Z" }, + { url = "https://files.pythonhosted.org/packages/aa/86/3f758a28a6e381758545f7cdb4942e1cb79abd271bea932998fc0db93cb6/pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f", size = 2227443, upload-time = "2025-07-01T09:14:39.344Z" }, + { url = "https://files.pythonhosted.org/packages/01/f4/91d5b3ffa718df2f53b0dc109877993e511f4fd055d7e9508682e8aba092/pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c", size = 5278474, upload-time = "2025-07-01T09:14:41.843Z" }, + { url = "https://files.pythonhosted.org/packages/f9/0e/37d7d3eca6c879fbd9dba21268427dffda1ab00d4eb05b32923d4fbe3b12/pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd", size = 4686038, upload-time = "2025-07-01T09:14:44.008Z" }, + { url = "https://files.pythonhosted.org/packages/ff/b0/3426e5c7f6565e752d81221af9d3676fdbb4f352317ceafd42899aaf5d8a/pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e", size = 5864407, upload-time = "2025-07-03T13:10:15.628Z" }, + { url = "https://files.pythonhosted.org/packages/fc/c1/c6c423134229f2a221ee53f838d4be9d82bab86f7e2f8e75e47b6bf6cd77/pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1", size = 7639094, upload-time = "2025-07-03T13:10:21.857Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c9/09e6746630fe6372c67c648ff9deae52a2bc20897d51fa293571977ceb5d/pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805", size = 5973503, upload-time = "2025-07-01T09:14:45.698Z" }, + { url = "https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8", size = 6642574, upload-time = "2025-07-01T09:14:47.415Z" }, + { url = "https://files.pythonhosted.org/packages/36/de/d5cc31cc4b055b6c6fd990e3e7f0f8aaf36229a2698501bcb0cdf67c7146/pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2", size = 6084060, upload-time = "2025-07-01T09:14:49.636Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ea/502d938cbaeec836ac28a9b730193716f0114c41325db428e6b280513f09/pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b", size = 6721407, upload-time = "2025-07-01T09:14:51.962Z" }, + { url = "https://files.pythonhosted.org/packages/45/9c/9c5e2a73f125f6cbc59cc7087c8f2d649a7ae453f83bd0362ff7c9e2aee2/pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3", size = 6273841, upload-time = "2025-07-01T09:14:54.142Z" }, + { url = "https://files.pythonhosted.org/packages/23/85/397c73524e0cd212067e0c969aa245b01d50183439550d24d9f55781b776/pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51", size = 6978450, upload-time = "2025-07-01T09:14:56.436Z" }, + { url = "https://files.pythonhosted.org/packages/17/d2/622f4547f69cd173955194b78e4d19ca4935a1b0f03a302d655c9f6aae65/pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580", size = 2423055, upload-time = "2025-07-01T09:14:58.072Z" }, + { url = "https://files.pythonhosted.org/packages/dd/80/a8a2ac21dda2e82480852978416cfacd439a4b490a501a288ecf4fe2532d/pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e", size = 5281110, upload-time = "2025-07-01T09:14:59.79Z" }, + { url = "https://files.pythonhosted.org/packages/44/d6/b79754ca790f315918732e18f82a8146d33bcd7f4494380457ea89eb883d/pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d", size = 4689547, upload-time = "2025-07-01T09:15:01.648Z" }, + { url = "https://files.pythonhosted.org/packages/49/20/716b8717d331150cb00f7fdd78169c01e8e0c219732a78b0e59b6bdb2fd6/pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced", size = 5901554, upload-time = "2025-07-03T13:10:27.018Z" }, + { url = "https://files.pythonhosted.org/packages/74/cf/a9f3a2514a65bb071075063a96f0a5cf949c2f2fce683c15ccc83b1c1cab/pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c", size = 7669132, upload-time = "2025-07-03T13:10:33.01Z" }, + { url = "https://files.pythonhosted.org/packages/98/3c/da78805cbdbee9cb43efe8261dd7cc0b4b93f2ac79b676c03159e9db2187/pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8", size = 6005001, upload-time = "2025-07-01T09:15:03.365Z" }, + { url = "https://files.pythonhosted.org/packages/6c/fa/ce044b91faecf30e635321351bba32bab5a7e034c60187fe9698191aef4f/pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59", size = 6668814, upload-time = "2025-07-01T09:15:05.655Z" }, + { url = "https://files.pythonhosted.org/packages/7b/51/90f9291406d09bf93686434f9183aba27b831c10c87746ff49f127ee80cb/pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe", size = 6113124, upload-time = "2025-07-01T09:15:07.358Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5a/6fec59b1dfb619234f7636d4157d11fb4e196caeee220232a8d2ec48488d/pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c", size = 6747186, upload-time = "2025-07-01T09:15:09.317Z" }, + { url = "https://files.pythonhosted.org/packages/49/6b/00187a044f98255225f172de653941e61da37104a9ea60e4f6887717e2b5/pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788", size = 6277546, upload-time = "2025-07-01T09:15:11.311Z" }, + { url = "https://files.pythonhosted.org/packages/e8/5c/6caaba7e261c0d75bab23be79f1d06b5ad2a2ae49f028ccec801b0e853d6/pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31", size = 6985102, upload-time = "2025-07-01T09:15:13.164Z" }, + { url = "https://files.pythonhosted.org/packages/f3/7e/b623008460c09a0cb38263c93b828c666493caee2eb34ff67f778b87e58c/pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e", size = 2424803, upload-time = "2025-07-01T09:15:15.695Z" }, + { url = "https://files.pythonhosted.org/packages/73/f4/04905af42837292ed86cb1b1dabe03dce1edc008ef14c473c5c7e1443c5d/pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12", size = 5278520, upload-time = "2025-07-01T09:15:17.429Z" }, + { url = "https://files.pythonhosted.org/packages/41/b0/33d79e377a336247df6348a54e6d2a2b85d644ca202555e3faa0cf811ecc/pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a", size = 4686116, upload-time = "2025-07-01T09:15:19.423Z" }, + { url = "https://files.pythonhosted.org/packages/49/2d/ed8bc0ab219ae8768f529597d9509d184fe8a6c4741a6864fea334d25f3f/pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632", size = 5864597, upload-time = "2025-07-03T13:10:38.404Z" }, + { url = "https://files.pythonhosted.org/packages/b5/3d/b932bb4225c80b58dfadaca9d42d08d0b7064d2d1791b6a237f87f661834/pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673", size = 7638246, upload-time = "2025-07-03T13:10:44.987Z" }, + { url = "https://files.pythonhosted.org/packages/09/b5/0487044b7c096f1b48f0d7ad416472c02e0e4bf6919541b111efd3cae690/pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027", size = 5973336, upload-time = "2025-07-01T09:15:21.237Z" }, + { url = "https://files.pythonhosted.org/packages/a8/2d/524f9318f6cbfcc79fbc004801ea6b607ec3f843977652fdee4857a7568b/pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77", size = 6642699, upload-time = "2025-07-01T09:15:23.186Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d2/a9a4f280c6aefedce1e8f615baaa5474e0701d86dd6f1dede66726462bbd/pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874", size = 6083789, upload-time = "2025-07-01T09:15:25.1Z" }, + { url = "https://files.pythonhosted.org/packages/fe/54/86b0cd9dbb683a9d5e960b66c7379e821a19be4ac5810e2e5a715c09a0c0/pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a", size = 6720386, upload-time = "2025-07-01T09:15:27.378Z" }, + { url = "https://files.pythonhosted.org/packages/e7/95/88efcaf384c3588e24259c4203b909cbe3e3c2d887af9e938c2022c9dd48/pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214", size = 6370911, upload-time = "2025-07-01T09:15:29.294Z" }, + { url = "https://files.pythonhosted.org/packages/2e/cc/934e5820850ec5eb107e7b1a72dd278140731c669f396110ebc326f2a503/pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635", size = 7117383, upload-time = "2025-07-01T09:15:31.128Z" }, + { url = "https://files.pythonhosted.org/packages/d6/e9/9c0a616a71da2a5d163aa37405e8aced9a906d574b4a214bede134e731bc/pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6", size = 2511385, upload-time = "2025-07-01T09:15:33.328Z" }, + { url = "https://files.pythonhosted.org/packages/1a/33/c88376898aff369658b225262cd4f2659b13e8178e7534df9e6e1fa289f6/pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae", size = 5281129, upload-time = "2025-07-01T09:15:35.194Z" }, + { url = "https://files.pythonhosted.org/packages/1f/70/d376247fb36f1844b42910911c83a02d5544ebd2a8bad9efcc0f707ea774/pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653", size = 4689580, upload-time = "2025-07-01T09:15:37.114Z" }, + { url = "https://files.pythonhosted.org/packages/eb/1c/537e930496149fbac69efd2fc4329035bbe2e5475b4165439e3be9cb183b/pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6", size = 5902860, upload-time = "2025-07-03T13:10:50.248Z" }, + { url = "https://files.pythonhosted.org/packages/bd/57/80f53264954dcefeebcf9dae6e3eb1daea1b488f0be8b8fef12f79a3eb10/pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36", size = 7670694, upload-time = "2025-07-03T13:10:56.432Z" }, + { url = "https://files.pythonhosted.org/packages/70/ff/4727d3b71a8578b4587d9c276e90efad2d6fe0335fd76742a6da08132e8c/pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b", size = 6005888, upload-time = "2025-07-01T09:15:39.436Z" }, + { url = "https://files.pythonhosted.org/packages/05/ae/716592277934f85d3be51d7256f3636672d7b1abfafdc42cf3f8cbd4b4c8/pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477", size = 6670330, upload-time = "2025-07-01T09:15:41.269Z" }, + { url = "https://files.pythonhosted.org/packages/e7/bb/7fe6cddcc8827b01b1a9766f5fdeb7418680744f9082035bdbabecf1d57f/pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50", size = 6114089, upload-time = "2025-07-01T09:15:43.13Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f5/06bfaa444c8e80f1a8e4bff98da9c83b37b5be3b1deaa43d27a0db37ef84/pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b", size = 6748206, upload-time = "2025-07-01T09:15:44.937Z" }, + { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" }, + { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" }, + { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -727,6 +831,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] +[[package]] +name = "pydantic" +version = "2.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.33.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, + { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" }, + { url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" }, + { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" }, + { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" }, + { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" }, + { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" }, + { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" }, + { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, + { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, + { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, + { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, + { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, + { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, + { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, + { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, + { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, + { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, + { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, + { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" }, + { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, + { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -1012,6 +1173,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76", size = 43906, upload-time = "2025-07-04T13:28:32.743Z" }, ] +[[package]] +name = "typing-inspection" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" }, +] + [[package]] name = "ujson" version = "5.10.0" diff --git a/worker.py b/worker.py index d534c0d..35d440a 100644 --- a/worker.py +++ b/worker.py @@ -1,11 +1,14 @@ -from typing import Any -from enhancers.information_enhancer import InformationEnhancerFactory import asyncio +from typing import Any + from sanic import Sanic -from parsers.document_parser import DocumentParserFactory -from config import settings -async def worker(app: Sanic) -> list[dict[str, Any]]: +from enhancers.information_enhancer import InformationEnhancerFactory +from parsers.document_parser import DocumentData +from parsers.document_parser_factory import DocumentParserFactory + + +async def worker(app: Sanic) -> dict[str, Any]: # 使用工厂获取合适的解析器 parser_factory = DocumentParserFactory() enhancer_factory = InformationEnhancerFactory() @@ -16,17 +19,21 @@ async def worker(app: Sanic) -> list[dict[str, Any]]: await asyncio.sleep(1) continue file_path = task.get("file_path") - information_list = await parser_factory.parse_document(file_path) + parse_result = await parser_factory.parse_document(file_path) + if not parse_result.success: + continue + chunk_list = parse_result.document # 控制并发数量,防止访问量过大导致失败 SEMAPHORE_LIMIT = 10 # 可根据实际情况调整 semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT) - async def enhance_with_semaphore(info): + async def enhance_with_semaphore(chunk: DocumentData, semaphore: asyncio.Semaphore) -> DocumentData: async with semaphore: - return await enhancer_factory.enhance_information(info) + return await enhancer_factory.enhance_information(chunk) # 并发增强每个信息 - enhanced_information_list = await asyncio.gather( - *(enhance_with_semaphore(info) for info in information_list) + enhanced_chunk_list = await asyncio.gather( + *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list) ) - return enhanced_information_list \ No newline at end of file + parse_result.document = enhanced_chunk_list + return parse_result.model_dump(mode="json") From ec7b394cc81aa1af58764dc0945591caa540ec86 Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Tue, 19 Aug 2025 06:57:53 +0000 Subject: [PATCH 05/10] fix:confort ruff --- parsers/excel_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py index 4a33e5c..b34419b 100644 --- a/parsers/excel_parser.py +++ b/parsers/excel_parser.py @@ -184,7 +184,7 @@ def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]: sheet_images: list[DocumentData] = [] images = getattr(sheet, '_images', None) - if not images or not isinstance(images, (list, tuple)): + if not images or not isinstance(images, list|tuple): return sheet_images # 收集图片信息 From 8d9874e78b9e74ceda05bfb29fa05b4169a7e3b5 Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Tue, 19 Aug 2025 11:07:34 +0000 Subject: [PATCH 06/10] feat: add document data models --- parsers/base_models.py | 45 ++++++++++++++++++++++++++++ parsers/document_parser.py | 48 +++++++++++++----------------- parsers/document_parser_factory.py | 29 ------------------ parsers/excel_parser.py | 40 +++++++++++-------------- storage/s3_client.py | 4 +++ tests/test_excel_parser.py | 8 ++--- 6 files changed, 91 insertions(+), 83 deletions(-) create mode 100644 parsers/base_models.py delete mode 100644 parsers/document_parser_factory.py diff --git a/parsers/base_models.py b/parsers/base_models.py new file mode 100644 index 0000000..c9fece7 --- /dev/null +++ b/parsers/base_models.py @@ -0,0 +1,45 @@ +import logging +from abc import ABC, abstractmethod +from enum import Enum + +from pydantic import BaseModel + +logger = logging.getLogger(__name__) + +class ChunkType(str, Enum): + """块类型""" + TEXT = "text" + IMAGE = "image" + TABLE = "table" + FORMULA = "formula" + +class ChunkData(BaseModel): + """块数据类""" + type: ChunkType + name: str + content: str = "" + description: str = "" + +class DocumentData(BaseModel): + """解析结果类""" + title: str = "" + chunks: list[ChunkData] = [] + processing_time: float = 0 + success: bool + error_message: str | None = None + +class DocumentParser(ABC): + """文档解析器基类""" + + def __init__(self) -> None: + self.supported_formats: list[str] = [] + + @abstractmethod + async def parse(self, file_path: str) -> DocumentData: + """解析文档""" + pass + + @abstractmethod + def can_parse(self, file_path: str) -> bool: + """检查是否可以解析该文件""" + pass diff --git a/parsers/document_parser.py b/parsers/document_parser.py index 1ea37f3..e582873 100644 --- a/parsers/document_parser.py +++ b/parsers/document_parser.py @@ -1,37 +1,29 @@ import logging -from abc import ABC, abstractmethod -from pydantic import BaseModel +from parsers.document_parser import DocumentParser, ParseResult +from parsers.excel_parser import ExcelParser logger = logging.getLogger(__name__) -class DocumentData(BaseModel): - """文档数据类""" - type: str - name: str - content: str - description: str - -class ParseResult(BaseModel): - """解析结果类""" - title: str - document: list[DocumentData] - processing_time: float - success: bool - error_message: str | None = None - -class DocumentParser(ABC): - """文档解析器基类""" +class DocumentParserFactory: + """文档解析器工厂""" def __init__(self) -> None: - self.supported_formats: list[str] = [] - - @abstractmethod - async def parse(self, file_path: str) -> ParseResult: + self.parsers: list[DocumentParser] = [ + ExcelParser() + ] + + def get_parser(self, file_path: str) -> DocumentParser | None: + """根据文件路径获取合适的解析器""" + for parser in self.parsers: + if parser.can_parse(file_path): + return parser + return None + + async def parse_document(self, file_path: str) -> ParseResult: """解析文档""" - pass + parser = self.get_parser(file_path) + if not parser: + raise ValueError(f"不支持的文件格式: {file_path}") - @abstractmethod - def can_parse(self, file_path: str) -> bool: - """检查是否可以解析该文件""" - pass + return await parser.parse(file_path) diff --git a/parsers/document_parser_factory.py b/parsers/document_parser_factory.py deleted file mode 100644 index e582873..0000000 --- a/parsers/document_parser_factory.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging - -from parsers.document_parser import DocumentParser, ParseResult -from parsers.excel_parser import ExcelParser - -logger = logging.getLogger(__name__) - -class DocumentParserFactory: - """文档解析器工厂""" - - def __init__(self) -> None: - self.parsers: list[DocumentParser] = [ - ExcelParser() - ] - - def get_parser(self, file_path: str) -> DocumentParser | None: - """根据文件路径获取合适的解析器""" - for parser in self.parsers: - if parser.can_parse(file_path): - return parser - return None - - async def parse_document(self, file_path: str) -> ParseResult: - """解析文档""" - parser = self.get_parser(file_path) - if not parser: - raise ValueError(f"不支持的文件格式: {file_path}") - - return await parser.parse(file_path) diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py index b34419b..eb7cd00 100644 --- a/parsers/excel_parser.py +++ b/parsers/excel_parser.py @@ -19,7 +19,7 @@ from openpyxl.workbook.workbook import Workbook # type: ignore from openpyxl.worksheet.worksheet import Worksheet # type: ignore -from parsers.document_parser import DocumentData, DocumentParser, ParseResult +from parsers.base_models import ChunkData, ChunkType, DocumentData, DocumentParser # 忽略 openpyxl 的特定警告 warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl') @@ -57,7 +57,7 @@ def __init__(self, config: ExcelParseConfig | None = None): self.image_index: int = 0 self.supported_formats: list[str] = ['.xlsx', '.xls'] - async def parse(self, excel_path: str) -> ParseResult: + async def parse(self, excel_path: str) -> DocumentData: """ 解析Excel文件并保存结果 @@ -79,19 +79,16 @@ async def parse(self, excel_path: str) -> ParseResult: processing_time = time.time() - start_time - return ParseResult( + return DocumentData( title=title, - document=document_data, + chunks=document_data, processing_time=processing_time, success=True ) except Exception as e: processing_time = time.time() - start_time - return ParseResult( - title="", - document=[], - processing_time=processing_time, + return DocumentData( success=False, error_message=str(e) ) @@ -106,7 +103,7 @@ def can_parse(self, file_path: str) -> bool: """ return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]: + def _excel_to_json(self, excel_path: str) -> tuple[str, list[ChunkData]]: """ 将Excel文件转换为JSON格式 Args: @@ -118,7 +115,7 @@ def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]: title = Path(excel_path).stem # 初始化内容列表和图片列表 - content: list[DocumentData] = [] + content: list[ChunkData] = [] self.image_index = 0 # 加载工作簿 @@ -129,8 +126,8 @@ def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]: sheet = workbook[sheet_name] # 添加工作表标题 - content.append(DocumentData( - type="text", + content.append(ChunkData( + type=ChunkType.TEXT, name=sheet_name, content=f"工作表 {sheet_index + 1}: {sheet_name}", description="工作表标题" @@ -142,16 +139,16 @@ def _excel_to_json(self, excel_path: str) -> tuple[str, list[DocumentData]]: # 处理表格数据 table_content = self._extract_table_data(sheet) - content.append(DocumentData( - type="table", + content.append(ChunkData( + type=ChunkType.TABLE, name="表格", content=json.dumps(table_content), description="表格" )) # 添加结束文本 - content.append(DocumentData( - type="text", + content.append(ChunkData( + type=ChunkType.TEXT, name="结束文本", content="", description="结束文本" @@ -173,7 +170,7 @@ def _load_workbook(self, excel_path: str) -> Workbook: keep_vba=self.config.keep_vba ) - def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]: + def _extract_sheet_images(self, sheet: Worksheet) -> list[ChunkData]: """ 提取工作表中的图片 Args: @@ -181,7 +178,7 @@ def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]: Returns: List[DocumentData]: 图片信息列表 """ - sheet_images: list[DocumentData] = [] + sheet_images: list[ChunkData] = [] images = getattr(sheet, '_images', None) if not images or not isinstance(images, list|tuple): @@ -202,7 +199,7 @@ def _extract_sheet_images(self, sheet: Worksheet) -> list[DocumentData]: return sheet_images - def _process_image_object(self, img_obj: Image) -> DocumentData | None: + def _process_image_object(self, img_obj: Image) -> ChunkData | None: """ 处理单个图片对象 Args: @@ -222,8 +219,8 @@ def _process_image_object(self, img_obj: Image) -> DocumentData | None: uri = f"data:image/{img_format};base64,{base64_encoded}" # 创建图片信息 - image_info = DocumentData( - type="image", + image_info = ChunkData( + type=ChunkType.IMAGE, name=f"#/pictures/{self.image_index}", content=uri, description=self.config.image_description_placeholder @@ -374,7 +371,6 @@ def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int, return all_rows - def _save_json(self, data: Any, file_path: Path) -> None: """ 保存JSON数据到文件 diff --git a/storage/s3_client.py b/storage/s3_client.py index da19e34..7cd41ba 100644 --- a/storage/s3_client.py +++ b/storage/s3_client.py @@ -40,6 +40,10 @@ def __init__(self) -> None: super().__init__("S3 client not initialized") +class S3Error(Exception): + """S3操作异常""" + pass + class AsyncS3Client: def __init__(self, endpoint_url: str | None, diff --git a/tests/test_excel_parser.py b/tests/test_excel_parser.py index fa5a464..b8f0775 100644 --- a/tests/test_excel_parser.py +++ b/tests/test_excel_parser.py @@ -7,7 +7,7 @@ from openpyxl.drawing.image import Image as XLImage from parsers.excel_parser import ExcelParser -from parsers.document_parser import DocumentData +from parsers.base_models import ChunkData @pytest.mark.asyncio @@ -49,7 +49,7 @@ async def test_parse_real_basic_and_image(): assert result.success is True # 内容:Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格、结束文本 - content = result.document + content = result.chunks assert len(content) == 6 # 校验顺序与关键字段 @@ -91,12 +91,12 @@ async def test_parse_real_merged_cells(): result = await parser.parse(xlsx_path) assert result.success is True - content = result.document + content = result.chunks # 结构:标题、表格、结束文本 assert len(content) == 3 # 表格在索引1 - table_chunk: DocumentData = content[1] + table_chunk: ChunkData = content[1] assert table_chunk.type == "table" import json as _json From b2da9e91cebe32f83716c45680bab41235fadd28 Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Tue, 19 Aug 2025 11:26:02 +0000 Subject: [PATCH 07/10] fix: comfort mypy --- enhancers/information_enhancer.py | 20 ++++++++++---------- parsers/document_parser.py | 4 ++-- worker.py | 10 +++++----- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py index 696dd10..aeb8758 100644 --- a/enhancers/information_enhancer.py +++ b/enhancers/information_enhancer.py @@ -1,33 +1,33 @@ from abc import ABC, abstractmethod -from parsers.document_parser import DocumentData +from parsers.base_models import ChunkData, ChunkType class InformationEnhancer(ABC): """信息增强器基类""" @abstractmethod - async def enhance(self, information: DocumentData) -> DocumentData: + async def enhance(self, information: ChunkData) -> ChunkData: """增强信息""" pass class TableInformationEnhancer(InformationEnhancer): """表格信息增强器""" - async def enhance(self, information: DocumentData) -> DocumentData: + async def enhance(self, information: ChunkData) -> ChunkData: """增强信息""" return information class FormulasInformationEnhancer(InformationEnhancer): """公式信息增强器""" - async def enhance(self, information: DocumentData) -> DocumentData: + async def enhance(self, information: ChunkData) -> ChunkData: """增强信息""" return information class ImageInformationEnhancer(InformationEnhancer): """图片信息增强器""" - async def enhance(self, information: DocumentData) -> DocumentData: + async def enhance(self, information: ChunkData) -> ChunkData: """增强信息""" return information @@ -41,19 +41,19 @@ def __init__(self) -> None: ImageInformationEnhancer() ] - def get_enhancer(self, information: DocumentData) -> InformationEnhancer|None: + def get_enhancer(self, information: ChunkData) -> InformationEnhancer|None: """获取信息增强器""" match information.type: - case "table": + case ChunkType.TABLE: return TableInformationEnhancer() - case "formulas": + case ChunkType.FORMULA: return FormulasInformationEnhancer() - case "image": + case ChunkType.IMAGE: return ImageInformationEnhancer() case _: return None - async def enhance_information(self, information: DocumentData) -> DocumentData: + async def enhance_information(self, information: ChunkData) -> ChunkData: """增强信息""" enhancer = self.get_enhancer(information) if not enhancer: diff --git a/parsers/document_parser.py b/parsers/document_parser.py index e582873..782d019 100644 --- a/parsers/document_parser.py +++ b/parsers/document_parser.py @@ -1,6 +1,6 @@ import logging -from parsers.document_parser import DocumentParser, ParseResult +from parsers.base_models import DocumentData, DocumentParser from parsers.excel_parser import ExcelParser logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ def get_parser(self, file_path: str) -> DocumentParser | None: return parser return None - async def parse_document(self, file_path: str) -> ParseResult: + async def parse_document(self, file_path: str) -> DocumentData: """解析文档""" parser = self.get_parser(file_path) if not parser: diff --git a/worker.py b/worker.py index 35d440a..1400cb9 100644 --- a/worker.py +++ b/worker.py @@ -4,8 +4,8 @@ from sanic import Sanic from enhancers.information_enhancer import InformationEnhancerFactory -from parsers.document_parser import DocumentData -from parsers.document_parser_factory import DocumentParserFactory +from parsers.document_parser import DocumentParserFactory +from parsers.base_models import ChunkData async def worker(app: Sanic) -> dict[str, Any]: @@ -22,12 +22,12 @@ async def worker(app: Sanic) -> dict[str, Any]: parse_result = await parser_factory.parse_document(file_path) if not parse_result.success: continue - chunk_list = parse_result.document + chunk_list = parse_result.chunks # 控制并发数量,防止访问量过大导致失败 SEMAPHORE_LIMIT = 10 # 可根据实际情况调整 semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT) - async def enhance_with_semaphore(chunk: DocumentData, semaphore: asyncio.Semaphore) -> DocumentData: + async def enhance_with_semaphore(chunk: ChunkData, semaphore: asyncio.Semaphore) -> ChunkData: async with semaphore: return await enhancer_factory.enhance_information(chunk) @@ -35,5 +35,5 @@ async def enhance_with_semaphore(chunk: DocumentData, semaphore: asyncio.Semapho enhanced_chunk_list = await asyncio.gather( *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list) ) - parse_result.document = enhanced_chunk_list + parse_result.chunks = enhanced_chunk_list return parse_result.model_dump(mode="json") From da016a14ae4606ce35852d425d6a32fac3c9861f Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Tue, 19 Aug 2025 12:09:08 +0000 Subject: [PATCH 08/10] fix: comfort ruff --- worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker.py b/worker.py index 1400cb9..9fc6507 100644 --- a/worker.py +++ b/worker.py @@ -4,8 +4,8 @@ from sanic import Sanic from enhancers.information_enhancer import InformationEnhancerFactory -from parsers.document_parser import DocumentParserFactory from parsers.base_models import ChunkData +from parsers.document_parser import DocumentParserFactory async def worker(app: Sanic) -> dict[str, Any]: From 940ef02f7923e0af3c1259ff383cbb602d08ea89 Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Tue, 19 Aug 2025 13:40:28 +0000 Subject: [PATCH 09/10] fix: refine document data model --- parsers/base_models.py | 17 +++- parsers/excel_parser.py | 168 ++++++++++++++++--------------------- tests/test_excel_parser.py | 44 +++++----- worker.py | 7 +- 4 files changed, 113 insertions(+), 123 deletions(-) diff --git a/parsers/base_models.py b/parsers/base_models.py index c9fece7..6a68289 100644 --- a/parsers/base_models.py +++ b/parsers/base_models.py @@ -1,11 +1,13 @@ import logging from abc import ABC, abstractmethod from enum import Enum +from typing import Any from pydantic import BaseModel logger = logging.getLogger(__name__) + class ChunkType(str, Enum): """块类型""" TEXT = "text" @@ -13,17 +15,28 @@ class ChunkType(str, Enum): TABLE = "table" FORMULA = "formula" +class TableDataItem(BaseModel): + """表格数据类""" + rows: int # 行数 + columns: int # 列数 + row_headers: list[Any] = [] # 行头 + column_headers: list[Any] = [] # 列头 + data: list[list[str]] = [] # 数据 + class ChunkData(BaseModel): """块数据类""" type: ChunkType name: str - content: str = "" + content: str|TableDataItem = "" description: str = "" class DocumentData(BaseModel): """解析结果类""" title: str = "" - chunks: list[ChunkData] = [] + texts: list[ChunkData] = [] + tables: list[ChunkData] = [] + images: list[ChunkData] = [] + formulas: list[ChunkData] = [] processing_time: float = 0 success: bool error_message: str | None = None diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py index eb7cd00..e333faa 100644 --- a/parsers/excel_parser.py +++ b/parsers/excel_parser.py @@ -19,7 +19,13 @@ from openpyxl.workbook.workbook import Workbook # type: ignore from openpyxl.worksheet.worksheet import Worksheet # type: ignore -from parsers.base_models import ChunkData, ChunkType, DocumentData, DocumentParser +from parsers.base_models import ( + ChunkData, + ChunkType, + DocumentData, + DocumentParser, + TableDataItem, +) # 忽略 openpyxl 的特定警告 warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl') @@ -57,105 +63,77 @@ def __init__(self, config: ExcelParseConfig | None = None): self.image_index: int = 0 self.supported_formats: list[str] = ['.xlsx', '.xls'] - async def parse(self, excel_path: str) -> DocumentData: + def can_parse(self, file_path: str) -> bool: + """ + 验证输入文件 + Args: + file_path: 文件路径 + Returns: + bool: 是否支持解析 """ - 解析Excel文件并保存结果 + return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) + async def parse(self, excel_path: str) -> DocumentData: + """ + 将Excel文件转换为JSON格式 Args: excel_path: Excel文件路径 - output_dir: 输出目录路径 Returns: - ParseResult: 解析结果对象 - Raises: - ExcelParseError: 当解析失败时抛出 + DocumentData: 文档数据 """ + # 获取文件名作为标题(不带扩展名) start_time = time.time() try: - # 转换Excel到JSON格式 - title, document_data = self._excel_to_json(excel_path) - - # 计算处理时间 + # 初始化内容列表和图片列表 + texts: list[ChunkData] = [] + tables: list[ChunkData] = [] + images: list[ChunkData] = [] + + # 加载工作簿 + workbook = self._load_workbook(excel_path) + + # 处理每个工作表 + for sheet_index, sheet_name in enumerate(workbook.sheetnames): + sheet = workbook[sheet_name] + + # 添加工作表标题 + texts.append(ChunkData( + type=ChunkType.TEXT, + name=sheet_name, + content=f"工作表 {sheet_index + 1}: {sheet_name}", + description="工作表标题" + )) + + # 处理图片 + sheet_images = self._extract_sheet_images(sheet) + images.extend(sheet_images) + + # 处理表格数据 + table_content = self._extract_table_data(sheet) + tables.append(ChunkData( + type=ChunkType.TABLE, + name="表格", + content=table_content, + description="表格" + )) processing_time = time.time() - start_time - - return DocumentData( - title=title, - chunks=document_data, + title=Path(excel_path).stem, + texts=texts, + tables=tables, + images=images, processing_time=processing_time, success=True ) - except Exception as e: processing_time = time.time() - start_time return DocumentData( success=False, - error_message=str(e) + error_message=str(e), + processing_time=processing_time ) - def can_parse(self, file_path: str) -> bool: - """ - 验证输入文件 - Args: - file_path: 文件路径 - Returns: - bool: 是否支持解析 - """ - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) - - def _excel_to_json(self, excel_path: str) -> tuple[str, list[ChunkData]]: - """ - 将Excel文件转换为JSON格式 - Args: - excel_path: Excel文件路径 - Returns: - DocumentData: 文档数据 - """ - # 获取文件名作为标题(不带扩展名) - title = Path(excel_path).stem - - # 初始化内容列表和图片列表 - content: list[ChunkData] = [] - self.image_index = 0 - - # 加载工作簿 - workbook = self._load_workbook(excel_path) - - # 处理每个工作表 - for sheet_index, sheet_name in enumerate(workbook.sheetnames): - sheet = workbook[sheet_name] - - # 添加工作表标题 - content.append(ChunkData( - type=ChunkType.TEXT, - name=sheet_name, - content=f"工作表 {sheet_index + 1}: {sheet_name}", - description="工作表标题" - )) - - # 处理图片 - sheet_images = self._extract_sheet_images(sheet) - content.extend(sheet_images) - - # 处理表格数据 - table_content = self._extract_table_data(sheet) - content.append(ChunkData( - type=ChunkType.TABLE, - name="表格", - content=json.dumps(table_content), - description="表格" - )) - - # 添加结束文本 - content.append(ChunkData( - type=ChunkType.TEXT, - name="结束文本", - content="", - description="结束文本" - )) - - return title, content - def _load_workbook(self, excel_path: str) -> Workbook: """ 加载Excel工作簿 @@ -250,13 +228,13 @@ def _get_image_format(self, img_obj: Image) -> str: return img_format return self.config.default_image_format - def _process_cell_value(self, cell_value: Any) -> CellValue: + def _process_cell_value(self, cell_value: Any) -> str: """ 预处理单元格值,将datetime对象转换为字符串 Args: cell_value: 原始单元格值 Returns: - CellValue: 处理后的单元格值 + str: 处理后的单元格值 """ if cell_value is None: return "" @@ -269,14 +247,10 @@ def _process_cell_value(self, cell_value: Any) -> CellValue: if isinstance(cell_value, date): return cell_value.strftime("%Y-%m-%d") - # 处理其他类型 - if isinstance(cell_value, str|int|float|bool): - return cell_value - # 对于其他类型,转换为字符串 return str(cell_value) - def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]: + def _extract_table_data(self, sheet: Worksheet) -> TableDataItem: """ 提取表格数据 Args: @@ -295,16 +269,14 @@ def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]: # 提取所有数据 all_rows = self._extract_all_rows(sheet, max_row, max_col, merged_map) - return { - "dimensions": { - "rows": len(all_rows), - "columns": max_col - }, - "headers": all_rows[0] if all_rows else [], - "data": all_rows[1:] if len(all_rows) > 1 else [] - } + return TableDataItem( + rows=len(all_rows), + columns=max_col, + row_headers=all_rows[0] if all_rows else [], + data=all_rows[1:] if len(all_rows) > 1 else [] + ) - def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], CellValue]: + def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], str]: """ 获取合并单元格信息 Args: @@ -323,7 +295,7 @@ def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], merged_ranges[(min_row, min_col, max_row, max_col)] = merged_value return merged_ranges - def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], CellValue]: + def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], str]: """ 创建合并单元格映射 Args: @@ -342,7 +314,7 @@ def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict return merged_map def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int, - merged_map: dict[tuple[int, int], CellValue]) -> TableData: + merged_map: dict[tuple[int, int], str]) -> list[list[str]]: """ 提取所有行数据 Args: diff --git a/tests/test_excel_parser.py b/tests/test_excel_parser.py index b8f0775..7a72ba8 100644 --- a/tests/test_excel_parser.py +++ b/tests/test_excel_parser.py @@ -48,20 +48,22 @@ async def test_parse_real_basic_and_image(): result = await parser.parse(xlsx_path) assert result.success is True - # 内容:Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格、结束文本 - content = result.chunks - assert len(content) == 6 + # 内容:Sheet1标题、Sheet1图片、Sheet1表格、Sheet2标题、Sheet2表格 + content = result.tables + assert len(content) == 2 + + assert len(result.images) == 1 + assert len(result.texts) == 2 # 校验顺序与关键字段 - assert content[0].type == "text" and content[0].name == "Sheet1" - assert content[1].type == "image" - assert content[1].name == "#/pictures/0" - assert content[1].content.startswith("data:image/") - - assert content[2].type == "table" - assert content[3].type == "text" and content[3].name == "Sheet2" - assert content[4].type == "table" - assert content[5].type == "text" and content[5].name == "结束文本" + assert result.texts[0].type == "text" and result.texts[0].name == "Sheet1" + assert result.images[0].type == "image" + assert result.images[0].name == "#/pictures/0" + assert result.images[0].content.startswith("data:image/") + + assert result.tables[0].type == "table" + assert result.texts[1].type == "text" and result.texts[1].name == "Sheet2" + assert result.tables[1].type == "table" finally: os.remove(xlsx_path) finally: @@ -91,19 +93,19 @@ async def test_parse_real_merged_cells(): result = await parser.parse(xlsx_path) assert result.success is True - content = result.chunks - # 结构:标题、表格、结束文本 - assert len(content) == 3 + # 结构:标题、表格 + assert len(result.tables) == 1 + assert len(result.texts) == 1 # 表格在索引1 - table_chunk: ChunkData = content[1] + table_chunk: ChunkData = result.tables[0] assert table_chunk.type == "table" - import json as _json - payload = _json.loads(table_chunk.content) - assert payload["headers"] == ["Merged Header", "Merged Header"] - assert payload["dimensions"]["rows"] == 2 - assert payload["dimensions"]["columns"] == 2 + payload = table_chunk.content + assert payload.row_headers == ["Merged Header", "Merged Header"] + assert payload.data == [["Value1", "Value2"]] + assert payload.rows == 2 + assert payload.columns == 2 finally: os.remove(xlsx_path) diff --git a/worker.py b/worker.py index 9fc6507..d59cea6 100644 --- a/worker.py +++ b/worker.py @@ -22,7 +22,7 @@ async def worker(app: Sanic) -> dict[str, Any]: parse_result = await parser_factory.parse_document(file_path) if not parse_result.success: continue - chunk_list = parse_result.chunks + chunk_list = parse_result.texts + parse_result.tables + parse_result.images + parse_result.formulas # 控制并发数量,防止访问量过大导致失败 SEMAPHORE_LIMIT = 10 # 可根据实际情况调整 semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT) @@ -35,5 +35,8 @@ async def enhance_with_semaphore(chunk: ChunkData, semaphore: asyncio.Semaphore) enhanced_chunk_list = await asyncio.gather( *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list) ) - parse_result.chunks = enhanced_chunk_list + parse_result.texts = enhanced_chunk_list[:len(parse_result.texts)] + parse_result.tables = enhanced_chunk_list[len(parse_result.texts):len(parse_result.texts) + len(parse_result.tables)] + parse_result.images = enhanced_chunk_list[len(parse_result.texts) + len(parse_result.tables):len(parse_result.texts) + len(parse_result.tables) + len(parse_result.images)] + parse_result.formulas = enhanced_chunk_list[len(parse_result.texts) + len(parse_result.tables) + len(parse_result.images):] return parse_result.model_dump(mode="json") From d5e71b0b3b88365b7bd73a665aa021255a4429b9 Mon Sep 17 00:00:00 2001 From: liningping <728359849@qq.com> Date: Tue, 19 Aug 2025 13:59:53 +0000 Subject: [PATCH 10/10] fix: add Field --- parsers/base_models.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/parsers/base_models.py b/parsers/base_models.py index 6a68289..6b6b310 100644 --- a/parsers/base_models.py +++ b/parsers/base_models.py @@ -3,7 +3,7 @@ from enum import Enum from typing import Any -from pydantic import BaseModel +from pydantic import BaseModel, Field logger = logging.getLogger(__name__) @@ -19,9 +19,9 @@ class TableDataItem(BaseModel): """表格数据类""" rows: int # 行数 columns: int # 列数 - row_headers: list[Any] = [] # 行头 - column_headers: list[Any] = [] # 列头 - data: list[list[str]] = [] # 数据 + row_headers: list[Any] = Field(default_factory=list) # 行头 + column_headers: list[Any] = Field(default_factory=list) # 列头 + data: list[list[str]] = Field(default_factory=list) # 数据 class ChunkData(BaseModel): """块数据类""" @@ -33,10 +33,10 @@ class ChunkData(BaseModel): class DocumentData(BaseModel): """解析结果类""" title: str = "" - texts: list[ChunkData] = [] - tables: list[ChunkData] = [] - images: list[ChunkData] = [] - formulas: list[ChunkData] = [] + texts: list[ChunkData] = Field(default_factory=list) + tables: list[ChunkData] = Field(default_factory=list) + images: list[ChunkData] = Field(default_factory=list) + formulas: list[ChunkData] = Field(default_factory=list) processing_time: float = 0 success: bool error_message: str | None = None @@ -45,7 +45,7 @@ class DocumentParser(ABC): """文档解析器基类""" def __init__(self) -> None: - self.supported_formats: list[str] = [] + self.supported_formats: list[str] = Field(default_factory=list) @abstractmethod async def parse(self, file_path: str) -> DocumentData: @@ -55,4 +55,4 @@ async def parse(self, file_path: str) -> DocumentData: @abstractmethod def can_parse(self, file_path: str) -> bool: """检查是否可以解析该文件""" - pass + return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)