# <font color="red">STEP 1: 文档提取测试</font>

In [33]:
# 初始化
import django_setup

In [34]:
# 导入相关模型与工具：get_user_model, DocumentAnalysis, Project, FileRecord, DocxExtractorStep 以及 pipeline 数据类型
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps._01_extract_docx_elements import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData, DocxElements, OutlineAnalysisResult

In [None]:
# Prepare the user, project and file_record needed by the tests
# (per the original note, the project and the file_record are linked).
User = get_user_model()

# Load test data that is expected to already exist in the database.

# Existing user, looked up by phone number.
user = User.objects.get(phone='18501771516')
print(f"用户: {user.phone}")

# Existing project, looked up by project name.
project = Project.objects.get(project_name='测试项目1')
print(f"项目: {project.project_name}")

# Existing file record, looked up by id.
# NOTE(review): file_record is not referenced again in the visible cells
# (the analysis below is linked to new_file_record instead) — confirm it is still needed.
file_record = FileRecord.objects.get(id='3')
print(f"文件: {file_record.name}")

In [None]:
# Delete any existing analysis titled "测试分析A" so the following cells start from a clean state.
DocumentAnalysis.objects.filter(title="测试分析A").delete()

In [None]:
# 1. Create a new document analysis instance titled "测试分析A",
#    owned by the user and attached to the project loaded earlier.
docx_analysis = DocumentAnalysis.objects.create(
    project=project,
    title="测试分析A",
    created_by=user,
    analysis_questions=["投标要求", "评分标准"]  # sample analysis questions
)
print(f"创建文档分析: {docx_analysis.title} (ID: {docx_analysis.id})")

In [None]:
# 2. Upload a real DOCX file.

# 2.1 Path of the source document (an absolute path on the author's machine).
doc_path = "C:/Users/huiwa/Downloads/文本分析测试/CaseTest/case8：招标文件-第1包：一级压榨花生油.docx"

# 2.2 Read the file's bytes and wrap them in an in-memory upload object.
with open(doc_path, 'rb') as f:
    file_content = f.read()
test_file = SimpleUploadedFile(
    "test_doc.docx",
    file_content,
    content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
print(f"文件大小: {test_file.size}")

# 2.3 Create a new file record that stores the uploaded file object.
new_file_record = FileRecord.objects.create(
    name="test_doc.docx",
    file=test_file,  # the test file prepared in step 2.2
    owner=user,
    size = test_file.size
)
print(f"创建文件记录: {new_file_record.name}")

In [39]:
# 3. Attach the newly created file record to the analysis.
try:
    docx_analysis.update_file_record(new_file_record)
except Exception as e:
    # Best-effort: any failure is only printed, not re-raised.
    # NOTE(review): if this fails, the extraction cell below presumably runs
    # without a linked file — check the printed message before continuing.
    print(f"关联文件失败: {str(e)}")

In [None]:
# 4. Run the element-extraction step and check what was stored in the database.
print("\n===== 最终状态检查 =====")
print(f"开始分析前-状态: {docx_analysis.status}")
# start_analysis() is currently disabled, so the "after" line below should
# report the same status as the "before" line above.
#docx_analysis.start_analysis()
print(f"开始分析后-状态: {docx_analysis.status}")

# Build the extraction step under test.
docx_extractor = DocxExtractorStep()

try:
    # Wrap the analysis instance as the step's input.
    input_data = ModelData(model=DocumentAnalysis, instance=docx_analysis)

    # Run the extraction; only this part is expected to raise ValidationError.
    docx_elements = docx_extractor.process(input_data)
except ValidationError as e:
    print("文档提取失败:", str(e))
    # Reload the analysis to show the status and error message it now carries.
    failed_analysis = DocumentAnalysis.objects.get(id=docx_analysis.id)
    print("分析状态:", failed_analysis.status)
    print("错误信息:", failed_analysis.error_message)
else:
    # Success path kept out of the try body so unrelated errors are not masked.
    element_count = len(docx_elements)
    print("文档提取成功！提取到的元素数量:", element_count)

    # Guard the sample print: indexing [0] on an empty result would raise.
    if element_count:
        print("第一个元素示例:", docx_elements[0])
    else:
        print("第一个元素示例:", "(无元素)")

    # Reload the analysis to inspect what was saved.
    saved_analysis = DocumentAnalysis.objects.get(id=docx_analysis.id)
    print("保存的提取结果:", saved_analysis.extracted_elements)


# <font color="red">STEP 2: outline 分析测试</font>

In [None]:
# 初始化
import django_setup

In [2]:
# 导入相关模型与工具：get_user_model, DocumentAnalysis, Project, FileRecord, DocxExtractorStep, DocxOutlineAnalyzerStep 以及 pipeline 数据类型
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps._01_extract_docx_elements import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData, DocxElements, OutlineAnalysisResult
from apps.doc_analysis.steps._02_outline_analysis import DocxOutlineAnalyzerStep
from pprint import pprint 

In [None]:
# Load the "测试分析A" analysis directly and read its extracted_elements.

saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
extracted_elements = saved_analysis.extracted_elements
# Show the entry at index 10 of the 'elements' list as a sample
# (assumes at least 11 elements were extracted — TODO confirm for other documents).
pprint(extracted_elements['elements'][10])

In [4]:
# 创建 DocxElements 实例 from models.py 的 extracted_elements
docx_elements = DocxElements.from_model(extracted_elements)

In [5]:
# 直接测试 outline_analyzer.py
outline_analyzer = DocxOutlineAnalyzerStep()


In [6]:
input_datas = outline_analyzer.prepare_requests_data(docx_elements)

In [None]:
print(type(input_datas))
print(input_datas[0])
print(input_datas[1])


In [None]:
simulated_prompt, formatted_messages = outline_analyzer.simulate_prompt(input_datas[0])
pprint(formatted_messages)


In [None]:
analysis_result = outline_analyzer.process(docx_elements)

In [None]:
# 打印输出的结果
print(type(analysis_result))
print(type(analysis_result.analysis_result))
print(type(analysis_result.analysis_result.result))
print(type(analysis_result.analysis_result.result[0]))
pprint(analysis_result)


In [None]:
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
pprint(saved_analysis.outline_analysis_result)


# <font color="red">STEP 3: 测试outline_improvement.py</font>


In [None]:
# 初始化
import django_setup

In [2]:
# 导入相关模型
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps._01_extract_docx_elements import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData, DocxElements, OutlineAnalysisResult
from apps.doc_analysis.steps._02_outline_analysis import DocxOutlineAnalyzerStep
from apps.doc_analysis.steps._03_outline_improvement import OutlineImprovementStep
from pprint import pprint 

In [3]:
# 直接引用"测试分析A", 并获取其docx_elements 和 outline_analysis_result

saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
docx_elements = DocxElements.from_model(saved_analysis.extracted_elements)
outline_analysis_result = OutlineAnalysisResult.from_model(saved_analysis.outline_analysis_result)

In [None]:
# 打印输出的结果
print(type(outline_analysis_result.analysis_result))
print(type(outline_analysis_result.analysis_result.result))
print(type(outline_analysis_result.analysis_result.result[0]))
pprint(outline_analysis_result)


## 分步骤测试

In [5]:
# 构建实例
outline_improver = OutlineImprovementStep()

In [None]:
# titles 提取
titles_to_improve = outline_improver.extract_titles_from_analysis(outline_analysis_result)
pprint(titles_to_improve)


In [8]:
# 改进元素
improved_elements = outline_improver.improve_document_elements(docx_elements, titles_to_improve)

In [None]:
# 打印改进后的数据
pprint(improved_elements)

## 集成测试

In [None]:
# 模拟用户确认
outline_analysis_result.user_confirm = True

pprint(outline_analysis_result)


In [6]:
# End-to-end test of _03_outline_improvement.py: run OutlineImprovementStep.process
# on the (docx_elements, outline_analysis_result) pair.
outline_improver = OutlineImprovementStep()
improved_docx_elements = outline_improver.process((docx_elements, outline_analysis_result))

In [None]:
# 打印原来的数据
pprint(docx_elements)

In [None]:
# 打印改进后的数据
pprint(improved_docx_elements)

# <font color="red">STEP 4: 构建文档树DocxTree</font>


In [None]:
# 初始化
import django_setup

In [2]:
# show_structure函数（）
from pprint import pprint
from typing import Any

def show_structure(data: Any, name: str = "data", max_depth: int = None) -> None:
    """Print the type, length (when available) and pretty-printed structure of *data*.

    Args:
        data: Object to inspect.
        name: Label printed in the header line.
        max_depth: Forwarded to ``pprint`` as ``depth``; ``None`` means no
            depth limit.
    """
    separator = "=" * 50
    print(f"\n{separator}")
    print(f"检查对象: {name}")
    print(f"类型: {type(data)}")

    # hasattr(data, '__len__') is also true for classes such as ``list``,
    # for which len() raises TypeError. Attempt len() directly and skip the
    # length line when the object does not support it.
    try:
        length = len(data)
    except TypeError:
        length = None
    if length is not None:
        print(f"长度: {length}")

    print("\n结构:")
    pprint(data, depth=max_depth, width=100)
    print(f"{separator}\n")


In [3]:
# 导入相关模型
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import ModelData, ImprovedDocxElements, SimpleDocxNode, DocxTree
from apps.doc_analysis.steps._04_build_docxtree import BuildDocxTree
from pprint import pprint 

In [4]:
# 直接引用"测试分析A", 并获取 improved_docx_elements
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
improved_docx_elements = ImprovedDocxElements.from_model(saved_analysis.improved_docx_elements)
#elements = improved_docx_elements.elements
#show_structure(elements)


## 构建文档树

In [None]:
# 构建文档树
build_docx_tree = BuildDocxTree()
docx_tree = build_docx_tree.process(improved_docx_elements)
show_structure(docx_tree)


In [None]:
# 按顺序打印所有节点
pprint(docx_tree._ordered_nodes)

In [None]:
# 按树结构打印所有节点
docx_tree.print_tree()

In [None]:
print(docx_tree.format_titles())

In [None]:
# 检查文档树在models.py里的存储情况
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
print(type(saved_analysis.docxtree))
pprint(saved_analysis.docxtree)


## 测试从数据库提取文档树

In [None]:
# Round trip: rebuild a DocxTree object from the value stored on the model's docxtree field.
docx_tree_from_model = DocxTree.from_model(saved_analysis.docxtree)
print(type(docx_tree_from_model))
pprint(docx_tree_from_model)

In [None]:
# 还原 ： 按顺序打印所有节点
pprint(docx_tree_from_model._ordered_nodes)

In [None]:
# 还原： 按树结构打印所有节点
docx_tree_from_model.print_tree()

In [None]:
print(docx_tree_from_model.format_titles())

## 测试文档树方法：查找节点，打印树，标题格式化

In [None]:
# 查找某个节点 find_node()， 测试 .find_node() 方法，比如node_id = 78
node = docx_tree.find_node(78)
pprint(node)

In [None]:
# 打印某个节点以下的文档树结构
docx_tree.print_tree(124)


In [None]:
# 格式化 标题结构 用于大模型分析
titles_nodes = docx_tree.format_titles()
pprint(titles_nodes)

## 测试添加节点

In [None]:
# 添加 节点
docx_tree.add_title_node(
    content = '>>>> 插入新节点',
    level = 2,
    after_node_id = 150 
)

In [None]:
# 打印结果 - 看顺序列表里是否有插入？
pprint(docx_tree._ordered_nodes[140:160])

In [None]:
# 打印文档树
docx_tree.print_tree(124)

## 测试调整节点类型

In [19]:
# 节点改变层级
docx_tree.convert_to_title_node(156,2)

In [None]:
# Print the result - inspect positions 140-159 of the ordered node list to check the node converted above.
pprint(docx_tree._ordered_nodes[140:160])

In [None]:
# 打印文档树
docx_tree.print_tree(124)

# <font color=red>  STEP 5: 测试文档树的使用</font>

In [1]:
# 初始化
import django_setup

Development settings loaded
INSTALLED_APPS: ['django.contrib.admin', 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', 'rest_framework', 'corsheaders', 'storages', 'apps.authentication', 'apps.files', 'apps.projects', 'apps.doc_analysis', 'apps.chat', 'django_filters', 'drf_spectacular', 'rest_framework_simplejwt.token_blacklist', 'django_celery_results', 'django_celery_beat']


INFO 2025-02-25 01:59:53,492 storage default_storage 的类型: COSStorage


Settings从哪里加载？: config.settings.development
项目根目录对么？: C:\Users\huiwa\Documents\_All_Projects\BidPilot_new\backend
文件存储settings对么？: apps.files.storage.COSStorage
文件default_storage对么？: COSStorage

已经安装的应用 Installed Apps 完整了么？:
- django.contrib.admin
- django.contrib.auth
- django.contrib.contenttypes
- django.contrib.sessions
- django.contrib.messages
- django.contrib.staticfiles
- rest_framework
- corsheaders
- storages
- apps.authentication
- apps.files
- apps.projects
- apps.doc_analysis
- apps.chat
- django_filters
- drf_spectacular
- rest_framework_simplejwt.token_blacklist
- django_celery_results
- django_celery_beat


In [2]:
# 导入相关模型
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import ModelData,DocxTree, DocxTreeMoreTitles
from apps.doc_analysis.steps._04_build_docxtree import BuildDocxTree
from apps.doc_analysis.steps._05_more_subtitles import MoreSubTitlesStep
from pprint import pprint 

In [3]:
# 直接引用"测试分析A", 并获取 doc_tree
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
docx_tree = DocxTree.from_model(saved_analysis.docxtree)
print(type(docx_tree))
print(docx_tree.format_titles())

<class 'apps.doc_analysis.pipeline.types.DocxTree'>
    第一章  招标公告 [Level:1] [ID:46] [Tokens:2316]
    第二章 招标需求 [Level:1] [ID:95] [Tokens:1629]
    第三章  投标人须知 [Level:1] [ID:124] [Tokens:10019]
        前附表 [Level:2] [ID:125] [Tokens:2735]
        一、总则 [Level:2] [ID:130] [Tokens:1197]
        二、招标文件 [Level:2] [ID:159] [Tokens:635]
        三、投标文件的编制 [Level:2] [ID:175] [Tokens:3227]
        四、开标 [Level:2] [ID:256] [Tokens:379]
        五、评标 [Level:2] [ID:267] [Tokens:1191]
        六、定标 [Level:2] [ID:294] [Tokens:168]
        七、合同授予 [Level:2] [ID:298] [Tokens:82]
        八、履约保证金 [Level:2] [ID:300] [Tokens:54]
        九、招标代理服务费的收取 [Level:2] [ID:302] [Tokens:340]
    第四章  评标办法及评分标准 [Level:1] [ID:308] [Tokens:7041]
        前言 [Level:2] [ID:1740419863] [Tokens:2090]
        附件一：资格审查表 [Level:2] [ID:352] [Tokens:1031]
        附件二：符合性审查表 [Level:2] [ID:355] [Tokens:498]
        附件三：评分标准 [Level:2] [ID:357] [Tokens:3406]
    第五章 合同条款及格式 [Level:1] [ID:366] [Tokens:7692]
    第六章  投标文件格式 [Level:1] [ID:501

In [4]:
leaf_titles = docx_tree.get_leaf_titles()
pprint(leaf_titles)

[Node(46, '第一章  招标公告...', [title], [Lvl: 1], [48 children]),
 Node(95, '第二章 招标需求...', [title], [Lvl: 1], [28 children]),
 Node(125, '前附表...', [title], [Lvl: 2], [4 children]),
 Node(130, '一、总则...', [title], [Lvl: 2], [28 children]),
 Node(159, '二、招标文件...', [title], [Lvl: 2], [15 children]),
 Node(175, '三、投标文件的编制...', [title], [Lvl: 2], [80 children]),
 Node(256, '四、开标...', [title], [Lvl: 2], [10 children]),
 Node(267, '五、评标...', [title], [Lvl: 2], [26 children]),
 Node(294, '六、定标...', [title], [Lvl: 2], [3 children]),
 Node(298, '七、合同授予...', [title], [Lvl: 2], [1 children]),
 Node(300, '八、履约保证金...', [title], [Lvl: 2], [1 children]),
 Node(302, '九、招标代理服务费的收取...', [title], [Lvl: 2], [5 children]),
 Node(1740419863, '前言...', [title], [Lvl: 2], [43 children]),
 Node(352, '附件一：资格审查表...', [title], [Lvl: 2], [2 children]),
 Node(355, '附件二：符合性审查表...', [title], [Lvl: 2], [1 children]),
 Node(357, '附件三：评分标准...', [title], [Lvl: 2], [8 children]),
 Node(366, '第五章 合同条款及格式...', [title], [Lvl: 1], [1

In [5]:
# 创建子标题分析 实例
subtitles_analyzer = MoreSubTitlesStep()
# 检查 data_inputs
data_inputs = subtitles_analyzer.prepare_requests_data(docx_tree)
# 检查prompt
simulated_prompt, formatted_messages = subtitles_analyzer.simulate_prompt(data_inputs[0])
print(formatted_messages)

[{'role': 'system', 'content': '你是一个专业的招标文档分析助手，帮助用户分析文档的结构和内容。'}, {'role': 'human', 'content': '\n# Task\n识别文档中当前标题的直接下级子标题。\n\n# Requirements\n判断依据:\n- 段落表达完整的主题概念\n- 与当前标题存在直接的从属关系\n- 具有标题的特征格式\n\n注意:\n- 仅识别直接下级标题\n- 忽略更深层级的内容\n- 保持原章节编号不变\n\n# Output\n## Rules\n- 输出JSON格式的层级结构\n- 每个子标题包含其文本内容和层级信息\n- 保留原始段落与新识别子标题的对应关系\n- 空内容使用[]\n\n## Format\n\n请按以下JSON格式输出分析结果：\n{\n    "titles_to_detail": [\n        {\n            "title": "标题内容",\n            "ID": "标题ID",\n            "level": "标题层级"\n        }\n    ]\n}\n\n\n# Input\n<h1>第一章  招标公告[ID:46]</h1>\n<path> 第一章  招标公告</path>\n<hr/>\n<p>根据《中华人民共和国招标投标法》、《中华人民共和国招标投标法实施条 例》等规定， 北京中外建工程管理有限公司受招标人委托，就北京京铁运恒采购 供应站有限公司 2024 年端午节物资采购项目（第一包：一级压榨花生油）进行  国内公开招标采购，欢迎合格的投标人前来投标。 [ID:47]</p>\n<p>一、项目编号：DLXM-2024-148-01 [ID:48]</p>\n<p>二、采购组织类型：委托代理 [ID:49]</p>\n<p>三、采购方式：公开招标 [ID:50]</p>\n<p>四、招标最高限价：1840 万元 [ID:51]</p>\n<p>五、采购内容、数量及简要技术要求 [ID:52]</p>\n<table>[表格内容]</table>\n<p>六、合格投标人的资格要求(本项目采用资格后审) [ID:54]</p>\n<p>（1）在中华人民共和国境内注册的、具有独立承担民事责任的能

In [None]:
# Run the sub-title analysis step on the document tree.
# NOTE(review): "analysis_reuslts" is a misspelling of "analysis_results";
# if renamed, update every later cell that uses it at the same time.
analysis_reuslts = subtitles_analyzer.process(docx_tree)

In [None]:
print(type(analysis_reuslts))
pprint(analysis_reuslts)

In [None]:
# 检查 more_subtitles 在models.py里的存储情况
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
print(type(saved_analysis.more_subtitles))
pprint(saved_analysis.more_subtitles)

In [13]:
# 测试 从models.py提取 more_subtitles, 并转换为DocxTreeMoreTitles格式
more_subtitles_from_model = DocxTreeMoreTitles.from_model(saved_analysis.more_subtitles)
print(type(more_subtitles_from_model))
pprint(more_subtitles_from_model)