Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,4 @@ cython_debug/
# PyPI configuration file
.pypirc

examples/
26 changes: 26 additions & 0 deletions parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,27 @@
# Parsers package

from .base_models import DocumentData, DocumentParser
from .parser_registry import (
PARSER_REGISTRY,
get_parser,
get_supported_formats,
list_registered_parsers,
register_parser,
)

# Public API of the parsers package: the names imported at the top of this
# module plus the `load_all_parsers` function defined in this module.
__all__ = [
'DocumentData',
'DocumentParser',
'PARSER_REGISTRY',
'register_parser',
'get_parser',
'get_supported_formats',
'list_registered_parsers',
'load_all_parsers',
]

def load_all_parsers() -> list[str]:
    """Import every bundled parser module and return the parser class names.

    The parser modules are imported inside the function rather than at
    package import time. Both parser classes are decorated with
    ``@register_parser`` in their own modules, so importing them here is
    presumably what adds them to the registry (confirm against
    ``parser_registry``).

    Returns:
        The class names of the loaded parsers, DOCX parser first.
    """
    from .docx_parser import DocxDocumentParser
    from .excel_parser import ExcelParser

    loaded_classes = (DocxDocumentParser, ExcelParser)
    return [parser_cls.__name__ for parser_cls in loaded_classes]
16 changes: 4 additions & 12 deletions parsers/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ class TableDataItem(BaseModel):
class ChunkData(BaseModel):
"""块数据类"""
type: ChunkType
name: str
content: str|TableDataItem = ""
description: str = ""
name: str|None = None
content: str|TableDataItem|None = None
description: str|None = None

class DocumentData(BaseModel):
"""解析结果类"""
title: str = ""
title: str|None = None
texts: list[ChunkData] = Field(default_factory=list)
tables: list[ChunkData] = Field(default_factory=list)
images: list[ChunkData] = Field(default_factory=list)
Expand All @@ -44,15 +44,7 @@ class DocumentData(BaseModel):
class DocumentParser(ABC):
"""文档解析器基类"""

def __init__(self) -> None:
self.supported_formats: list[str] = Field(default_factory=list)

@abstractmethod
async def parse(self, file_path: str) -> DocumentData:
"""解析文档"""
pass

@abstractmethod
def can_parse(self, file_path: str) -> bool:
"""检查是否可以解析该文件"""
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
29 changes: 0 additions & 29 deletions parsers/document_parser.py

This file was deleted.

227 changes: 227 additions & 0 deletions parsers/docx_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
"""
DOCX文档解析器模块

该模块提供使用Docling库解析DOCX文档并提取结构化内容的功能。
支持标题、段落、列表、表格和图片的识别与输出。
"""

import asyncio
import logging
import time

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, WordFormatOption
from docling.pipeline.simple_pipeline import SimplePipeline
from docling_core.types.doc.document import (
CodeItem,
DocItemLabel,
DoclingDocument,
FormulaItem,
ListItem,
PictureItem,
SectionHeaderItem,
TableItem,
TextItem,
TitleItem,
)

from parsers.base_models import (
ChunkData,
ChunkType,
DocumentData,
DocumentParser,
TableDataItem,
)
from parsers.parser_registry import register_parser

logger = logging.getLogger(__name__)


@register_parser(['.docx'])
class DocxDocumentParser(DocumentParser):
    """DOCX document parser backed by Docling.

    Converts a DOCX file with Docling's ``SimplePipeline`` and maps the
    resulting document onto the project's ``DocumentData`` model (title,
    text chunks, tables and images). Exposes the asynchronous ``parse``
    interface required by ``DocumentParser``.
    """

    def __init__(self) -> None:
        """Create the Docling converter, restricted to DOCX input."""
        super().__init__()
        self._converter = DocumentConverter(
            format_options={InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline)},
            allowed_formats=[InputFormat.DOCX],
        )
        logger.debug("DocxDocumentParser initialized with SimplePipeline")

    async def parse(self, file_path: str) -> DocumentData:
        """Parse a DOCX file without blocking the event loop.

        The synchronous Docling conversion runs in the loop's default
        executor. Any exception is logged with its traceback and reported
        through the returned object instead of being raised.

        Args:
            file_path: Path of the DOCX file to parse.

        Returns:
            DocumentData: On success, the title, texts, tables, images and
            processing time with ``success=True``. On failure,
            ``success=False`` with ``error_message`` and processing time.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()
        try:
            # Inside a coroutine the running loop is the one to use.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(None, self._converter.convert, file_path)
            doc_data = result.document

            # Read collections defensively instead of patching attributes
            # onto the third-party document object.
            pictures = getattr(doc_data, 'pictures', None) or []
            doc_tables = getattr(doc_data, 'tables', None) or []
            doc_texts = getattr(doc_data, 'texts', None) or []

            title = self._extract_title(doc_data)
            images = self._extract_images(pictures, doc_data)
            tables = self._extract_tables(doc_tables)
            texts = self._extract_texts(doc_texts)

            processing_time = time.perf_counter() - started
            logger.info(
                "Successfully parsed DOCX: %s (took %.2fs)", file_path, processing_time
            )
            return DocumentData(
                title=title,
                texts=texts,
                tables=tables,
                images=images,
                processing_time=processing_time,
                success=True,
            )

        except Exception as e:
            # Top-level boundary: convert every failure into a result object.
            processing_time = time.perf_counter() - started
            # logger.exception records the full traceback.
            logger.exception(
                "Failed to parse DOCX file %s: %s: %s", file_path, type(e).__name__, e
            )
            return DocumentData(
                success=False,
                error_message=str(e),
                processing_time=processing_time,
            )

    def _extract_images(
        self,
        pictures: list[PictureItem],
        doc: DoclingDocument | None = None,
    ) -> list[ChunkData]:
        """Convert Docling picture items into image chunks.

        Args:
            pictures: Picture items of the converted document.
            doc: The owning document. When given, caption references are
                resolved to their text via ``caption_text``; when omitted,
                the first caption reference is stringified as a fallback.

        Returns:
            list[ChunkData]: One ``ChunkType.IMAGE`` chunk per picture, with
            the image URI as content and the caption as description.
        """
        image_items: list[ChunkData] = []
        for idx, picture in enumerate(pictures):
            image_ref = getattr(picture, 'image', None)
            image_uri = ""
            if image_ref and hasattr(image_ref, 'uri'):
                image_uri = str(image_ref.uri)

            caption = ""
            captions = getattr(picture, 'captions', None)
            if captions:
                resolve_caption = getattr(picture, 'caption_text', None)
                if doc is not None and callable(resolve_caption):
                    # Captions are references into the document; resolve
                    # them to the actual caption text.
                    caption = resolve_caption(doc)
                else:
                    caption = str(captions[0])

            image_items.append(
                ChunkData(
                    type=ChunkType.IMAGE,
                    name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
                    content=image_uri,
                    description=caption,
                )
            )

        return image_items

    def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
        """Convert Docling table items into table chunks.

        Tables without data or with an empty grid are skipped.

        Args:
            tables: Table items of the converted document.

        Returns:
            list[ChunkData]: One ``ChunkType.TABLE`` chunk per usable table,
            with a ``TableDataItem`` as content.
        """
        # Guard against a missing or non-iterable argument.
        if not tables or not hasattr(tables, '__iter__'):
            return []

        table_items: list[ChunkData] = []
        for table in tables:
            table_data_src = getattr(table, 'data', None)
            grid = getattr(table_data_src, 'grid', None)
            if not grid:
                continue

            # NOTE(review): headers are read from the first grid row only, and
            # that row is always excluded from `data` - confirm this is the
            # intended layout for tables without a header row.
            first_row = grid[0]
            row_headers = [cell.text for cell in first_row if cell.row_header]
            column_headers = [cell.text for cell in first_row if cell.column_header]
            body_rows = [[cell.text for cell in row] for row in grid[1:]]

            table_data = TableDataItem(
                rows=table_data_src.num_rows,
                columns=table_data_src.num_cols,
                row_headers=row_headers,
                column_headers=column_headers,
                data=body_rows,
            )
            table_items.append(
                ChunkData(
                    type=ChunkType.TABLE,
                    name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
                    content=table_data,
                )
            )

        return table_items

    def _extract_title(self, doc_data: DoclingDocument) -> str:
        """Return the document title.

        Uses the text of the first item labelled ``DocItemLabel.TITLE``. If
        there is no such item, or its text is empty, falls back to the
        document name.

        Args:
            doc_data: The converted Docling document.

        Returns:
            str: The title text or the document name.
        """
        doc_texts = getattr(doc_data, 'texts', None) or []
        title = next(
            (
                item.text
                for item in doc_texts
                if getattr(item, 'label', None) == DocItemLabel.TITLE
            ),
            "",
        )
        if title:
            return title
        return getattr(doc_data, 'name', 'Unknown Document')

    def _extract_texts(
        self,
        texts: list[TitleItem | SectionHeaderItem | ListItem | CodeItem | FormulaItem | TextItem],
    ) -> list[ChunkData]:
        """Convert Docling text items into text and formula chunks.

        Items without a label, or with missing or empty text, are skipped.

        Args:
            texts: Text items of the converted document.

        Returns:
            list[ChunkData]: ``ChunkType.FORMULA`` chunks for formula items
            and ``ChunkType.TEXT`` chunks for everything else.
        """
        text_items: list[ChunkData] = []

        for item in texts:
            if not hasattr(item, 'label'):
                continue
            # Truthiness also covers a None text, which len() would crash on.
            if not getattr(item, 'text', None):
                continue

            if item.label == DocItemLabel.FORMULA:
                chunk_type = ChunkType.FORMULA
                chunk_name = item.self_ref or f"formula-{len(text_items)}"
            else:
                chunk_type = ChunkType.TEXT
                chunk_name = f"text-{len(text_items)}"

            text_items.append(
                ChunkData(type=chunk_type, name=chunk_name, content=item.text)
            )

        return text_items
21 changes: 6 additions & 15 deletions parsers/excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
DocumentParser,
TableDataItem,
)
from parsers.parser_registry import register_parser

# 忽略 openpyxl 的特定警告
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
Expand All @@ -49,6 +50,7 @@ class ExcelParseError(Exception):
pass


@register_parser(['.xlsx', '.xls'])
class ExcelParser(DocumentParser):
"""Excel文件解析器类"""

Expand All @@ -61,19 +63,8 @@ def __init__(self, config: ExcelParseConfig | None = None):
super().__init__()
self.config: ExcelParseConfig = config or ExcelParseConfig()
self.image_index: int = 0
self.supported_formats: list[str] = ['.xlsx', '.xls']

def can_parse(self, file_path: str) -> bool:
"""
验证输入文件
Args:
file_path: 文件路径
Returns:
bool: 是否支持解析
"""
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)

async def parse(self, excel_path: str) -> DocumentData:
async def parse(self, file_path: str) -> DocumentData:
"""
将Excel文件转换为JSON格式
Args:
Expand All @@ -91,7 +82,7 @@ async def parse(self, excel_path: str) -> DocumentData:
images: list[ChunkData] = []

# 加载工作簿
workbook = self._load_workbook(excel_path)
workbook = self._load_workbook(file_path)

# 处理每个工作表
for sheet_index, sheet_name in enumerate(workbook.sheetnames):
Expand Down Expand Up @@ -119,7 +110,7 @@ async def parse(self, excel_path: str) -> DocumentData:
))
processing_time = time.time() - start_time
return DocumentData(
title=Path(excel_path).stem,
title=Path(file_path).stem,
texts=texts,
tables=tables,
images=images,
Expand Down Expand Up @@ -183,7 +174,7 @@ def _process_image_object(self, img_obj: Image) -> ChunkData | None:
Args:
img_obj: 图片对象
Returns:
Optional[DocumentData]: 图片信息,处理失败时返回None
ChunkData|None: 图片信息,处理失败时返回None
"""
try:
# 获取图片数据
Expand Down
Loading
Loading