In [2]:
# 相关包与其路径初配置
import sys
from pathlib import Path

# Working directory of the kernel. Path() is resolved against the cwd, not against
# the location of this notebook file.
current_path = Path().absolute()
# Three levels up is expected to be the project root that holds the importable
# packages — TODO confirm against the repository layout.
root_path = current_path.parent.parent.parent
# Register the root only once: re-running this cell must not pile up duplicate
# sys.path entries.
if str(root_path) not in sys.path:
    sys.path.append(str(root_path))

In [None]:
# 重新导入模块
from doc_analysis.docx_parser._01_xml_loader import DocxXMLLoader
from doc_analysis.docx_parser._02_xml_parser import DocxXMLParser


In [4]:
# Input document selection.
# NOTE(review): these are absolute paths on one developer's Windows machine; point
# CASE_DIR at your own copy of the test documents before running.
CASE_DIR = "C:/Users/huiwa/Downloads/文本分析测试/CaseTest"

# Alternative inputs — keep exactly one assignment uncommented to switch documents.
#doc_path = f"{CASE_DIR}/case3_北京铁运投标人须知.docx"
doc_path = f"{CASE_DIR}/case8：招标文件-第1包：一级压榨花生油.docx"
#doc_path = f"{CASE_DIR}/case 9 _样式测试.docx"
#doc_path = "C:/Users/huiwa/Documents/_CursorChat_Extra/Langchain_Jupyter/.LoRA微调/招标文件_微调库/WORD/[1].docx"
#doc_path = f"{CASE_DIR}/Case6_中国区零食包框架供应商采购项目招采文件.docx"

In [5]:
# 1. Load the document: build a loader for doc_path and pull out its raw content.
loader = DocxXMLLoader(doc_path)
# raw_content is the input handed to DocxXMLParser in the next step.
raw_content = loader.extract_raw()

2025-02-13 14:51:25,983 - doc_analysis.docx_parser._01_xml_loader - INFO - 从 C:\Users\huiwa\Downloads\文本分析测试\CaseTest\case8：招标文件-第1包：一级压榨花生油.docx 成功提取 raw XML content


In [6]:
# 2. Create the parser from the raw content returned by loader.extract_raw().
parser = DocxXMLParser(raw_content)

2025-02-13 14:51:27,580 - doc_analysis.docx_parser._02_xml_parser - INFO - Successfully parsed XML content
2025-02-13 14:51:27,580 - doc_analysis.docx_parser._02_xml_parser - INFO - DocxXMLParser initialized successfully


In [7]:
# Sanity check: display the concrete class of the parser object.
type(parser)

doc_analysis.docx_parser._02_xml_parser.DocxXMLParser

In [8]:
# 3. Smoke-test the basic XPath query helper.
print("\n=== XPath查询测试 ===")

# Paragraph query: collect the elements matched by //w:p.
paragraphs = parser.xpath('//w:p')
print(f"找到的段落数量: {len(paragraphs)}")

# Preview the first paragraph, if there is one, capped at 100 characters.
if paragraphs:
    first_para_text = parser.get_element_text(paragraphs[0])
    # Append the ellipsis only when text was actually cut off. The previous version
    # always printed "..." — even for an empty paragraph — which read as truncation.
    preview_suffix = "..." if len(first_para_text) > 100 else ""
    print(f"\n第一个段落的文本: {first_para_text[:100]}{preview_suffix}")


=== XPath查询测试 ===
找到的段落数量: 2143

第一个段落的文本: ...


In [9]:
# 4. Style query test: runs only when parser.styles is set.
if parser.styles is not None:
    # Query //w:style, passing parser.styles as the second argument to xpath().
    styles = parser.xpath('//w:style', parser.styles)
    print(f"\n找到的样式数量: {len(styles)}")
    
    # Print the styleId attribute of each previewed style.
    for style in styles[:5]:  # only the first 5 styles
        style_id = parser.get_attribute(style, 'styleId')
        print(f"样式ID: {style_id}")



找到的样式数量: 7
样式ID: 1
样式ID: 5
样式ID: 3
样式ID: 2
样式ID: 4


In [10]:
# 5. Document-structure smoke test: count the elements matched by four XPath queries.
print("\n=== 文档结构测试 ===")

# Run the queries in their original order and bind each result to its own name.
sections, tables, pictures, charts = [
    parser.xpath(query)
    for query in ('//w:sectPr', '//w:tbl', '//pic:pic', '//c:chart')
]

# Report how many elements each query returned.
print(f"节数量: {len(sections)}")
print(f"表格数量: {len(tables)}")
print(f"图片数量: {len(pictures)}")
print(f"图表数量: {len(charts)}")


=== 文档结构测试 ===
节数量: 63
表格数量: 25
图片数量: 1
图表数量: 0


In [13]:
from typing import Dict, Any 
def print_tree(tree: Dict[str, Any], indent: str = '') -> None:
    """Pretty-print a nested structure tree to stdout.

    Keys are processed in the mapping's own iteration order, so the layout of
    the output follows the order in which the keys appear in each node.

    Recognised keys:
        document / styles / numbering: print a "=== NAME ===" banner, then
            render the value at the same indent.
        tag: start a new line with the node name.
        attributes: append " (k=v, ...)" when the mapping is non-empty.
        children_count: append " [total: N]".
        children: render every child two spaces deeper.
        note: print the note on its own line, two spaces deeper.

    Any other key, and any ``tree`` that is not a dict, is silently ignored.

    Args:
        tree: Node mapping to render.
        indent: Whitespace prefix for the current nesting level.
    """
    # Guard clause: only mappings carry anything printable.
    if not isinstance(tree, dict):
        return

    for field, payload in tree.items():
        if field in ('document', 'styles', 'numbering'):
            # Top-level part: banner first, then its content at the same depth.
            print(f"\n{indent}=== {field.upper()} ===")
            print_tree(payload, indent)
            continue

        if field == 'tag':
            # No trailing newline so attributes/count can follow on the same line.
            print(f"\n{indent}{payload}", end='')
            continue

        if field == 'attributes' and payload:
            rendered = ', '.join(f'{name}={val}' for name, val in payload.items())
            print(f" ({rendered})", end='')
            continue

        if field == 'children':
            for node in payload:
                print_tree(node, indent + '  ')
            continue

        if field == 'children_count':
            print(f" [total: {payload}]", end='')
            continue

        if field == 'note':
            print(f"\n{indent}  {payload}")

In [14]:
# Fetch the document structure tree and print it.
print("=== 文档结构树 ===")
# max_depth / max_children presumably cap the nesting depth and the number of
# children listed per node — TODO confirm in DocxXMLParser.get_structure_tree.
structure = parser.get_structure_tree(max_depth=10, max_children=10)
print_tree(structure)

=== 文档结构树 ===

=== DOCUMENT ===

document (Ignorable=w14 w15 wp14) [total: 1]
  body [total: 889]
    p (paraId=6E013A57) [total: 1]
      pPr [total: 2]
        spacing (line=256, lineRule=auto)
        rPr [total: 2]
          rFonts (ascii=Arial)
          sz (val=21)
    p (paraId=560195EE) [total: 1]
      pPr [total: 2]
        spacing (line=256, lineRule=auto)
        rPr [total: 2]
          rFonts (ascii=Arial)
          sz (val=21)
    p (paraId=5D6E8FBC) [total: 1]
      pPr [total: 2]
        spacing (line=257, lineRule=auto)
        rPr [total: 2]
          rFonts (ascii=Arial)
          sz (val=21)
    p (paraId=551EBE2B) [total: 1]
      pPr [total: 2]
        spacing (line=257, lineRule=auto)
        rPr [total: 2]
          rFonts (ascii=Arial)
          sz (val=21)
    p (paraId=4C2906CD) [total: 1]
      pPr [total: 2]
        spacing (line=257, lineRule=auto)
        rPr [total: 2]
          rFonts (ascii=Arial)
          sz (val=21)
    p (paraId=66779002) [total: 