# <font color="red">STEP 1: 文档提取测试</font>

In [33]:
# 初始化
import django_setup

In [34]:
# 导入测试所需的模型、异常与流水线类型：get_user_model, DocumentAnalysis, InvalidStatusTransition, Project, FileRecord, DocxExtractorStep 等
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps._01_extract_docx_elements import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData, DocxElements, OutlineAnalysisResult

In [None]:
# Look up the pre-existing user, project and file record used by the tests below.
User = get_user_model()

# Lookup keys of the records that must already exist in the database.
test_user_phone = "18501771516"
test_project_name = "测试项目1"
test_file_record_id = "3"

# Existing user
user = User.objects.get(phone=test_user_phone)
print(f"用户: {user.phone}")

# Existing project
project = Project.objects.get(project_name=test_project_name)
print(f"项目: {project.project_name}")

# Existing file record
file_record = FileRecord.objects.get(id=test_file_record_id)
print(f"文件: {file_record.name}")

In [None]:
# Delete every existing analysis titled "测试分析A" so the following cells start clean.
DocumentAnalysis.objects.filter(title="测试分析A").delete()

In [None]:
# 1. Create a fresh document analysis instance titled "测试分析A".
analysis_fields = {
    "project": project,
    "title": "测试分析A",
    "created_by": user,
    # Sample analysis questions
    "analysis_questions": ["投标要求", "评分标准"],
}
docx_analysis = DocumentAnalysis.objects.create(**analysis_fields)
print(f"创建文档分析: {docx_analysis.title} (ID: {docx_analysis.id})")

In [None]:
# 2. Upload a real DOCX file

# 2.1 Location of the sample tender document on the local machine
doc_path = "C:/Users/huiwa/Downloads/文本分析测试/CaseTest/case8：招标文件-第1包：一级压榨花生油.docx"

# Name and MIME type under which the sample is uploaded
upload_name = "test_doc.docx"
docx_mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# 2.2 Read the raw bytes and wrap them in an in-memory upload object
with open(doc_path, 'rb') as docx_handle:
    file_content = docx_handle.read()
test_file = SimpleUploadedFile(upload_name, file_content, content_type=docx_mime_type)
print(f"文件大小: {test_file.size}")

# 2.3 Create the file record that stores the uploaded file object
record_fields = {
    "name": upload_name,
    "file": test_file,  # the upload object prepared above
    "owner": user,
    "size": test_file.size,
}
new_file_record = FileRecord.objects.create(**record_fields)
print(f"创建文件记录: {new_file_record.name}")

In [39]:
# 3. Attach the newly created file record to the analysis.
# Any failure is reported on stdout rather than aborting the notebook run.
try:
    docx_analysis.update_file_record(new_file_record)
except Exception as link_error:
    print(f"关联文件失败: {link_error!s}")

In [None]:
# 4. Run the extraction step on the analysis and inspect what ends up in the database.
print("\n===== 最终状态检查 =====")
print(f"开始分析前-状态: {docx_analysis.status}")
# NOTE: start_analysis() is currently disabled, so the two status lines
# print the same value.
#docx_analysis.start_analysis()
print(f"开始分析后-状态: {docx_analysis.status}")

# Step under test
docx_extractor = DocxExtractorStep()

try:
    # Wrap the model class and the instance as the step's input.
    input_data = ModelData(model=DocumentAnalysis, instance=docx_analysis)

    # Run the extraction.
    docx_elements = docx_extractor.process(input_data)

    # Report the result. Indexing an empty result would raise IndexError,
    # which the ValidationError handler below does not catch, so the sample
    # element is only printed when at least one element was extracted.
    element_count = len(docx_elements)
    print("文档提取成功！提取到的元素数量:", element_count)
    if element_count:
        print("第一个元素示例:", docx_elements[0])
    else:
        print("未提取到任何元素，跳过示例打印")

    # Re-read the analysis from the database and show the stored elements.
    saved_analysis = DocumentAnalysis.objects.get(id=docx_analysis.id)
    print("保存的提取结果:", saved_analysis.extracted_elements)

except ValidationError as e:
    print("文档提取失败:", str(e))
    # Re-read the analysis to show the status and error message it recorded.
    failed_analysis = DocumentAnalysis.objects.get(id=docx_analysis.id)
    print("分析状态:", failed_analysis.status)
    print("错误信息:", failed_analysis.error_message)


# <font color="red">STEP 2: outline 分析测试</font>

In [1]:
# 初始化
import django_setup

Development settings loaded
INSTALLED_APPS: ['django.contrib.admin', 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', 'rest_framework', 'corsheaders', 'storages', 'apps.authentication', 'apps.files', 'apps.projects', 'apps.doc_analysis', 'apps.chat', 'django_filters', 'drf_spectacular', 'rest_framework_simplejwt.token_blacklist', 'django_celery_results', 'django_celery_beat']


INFO 2025-02-24 15:32:04,139 storage default_storage 的类型: COSStorage


Settings从哪里加载？: config.settings.development
项目根目录对么？: C:\Users\huiwa\Documents\_All_Projects\BidPilot_new\backend
文件存储settings对么？: apps.files.storage.COSStorage
文件default_storage对么？: COSStorage

已经安装的应用 Installed Apps 完整了么？:
- django.contrib.admin
- django.contrib.auth
- django.contrib.contenttypes
- django.contrib.sessions
- django.contrib.messages
- django.contrib.staticfiles
- rest_framework
- corsheaders
- storages
- apps.authentication
- apps.files
- apps.projects
- apps.doc_analysis
- apps.chat
- django_filters
- drf_spectacular
- rest_framework_simplejwt.token_blacklist
- django_celery_results
- django_celery_beat


In [2]:
# 导入测试所需的模型、异常、流水线类型与分析步骤：get_user_model, DocumentAnalysis, Project, FileRecord, DocxExtractorStep, DocxOutlineAnalyzerStep 等
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps._01_extract_docx_elements import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData, DocxElements, OutlineAnalysisResult
from apps.doc_analysis.steps._02_outline_analysis import DocxOutlineAnalyzerStep
from pprint import pprint 

In [3]:
# 直接引用"测试分析A", 并获取其extracted_elements

saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
extracted_elements = saved_analysis.extracted_elements
pprint(extracted_elements['elements'][10])

{'content': '前附表',
 'is_toc': True,
 'position': 11,
 'toc_level': 2,
 'type': 'paragraph'}


In [4]:
# 创建 DocxElements 实例 from models.py 的 extracted_elements
docx_elements = DocxElements.from_model(extracted_elements)

In [5]:
# 直接测试 outline_analyzer.py
outline_analyzer = DocxOutlineAnalyzerStep()


In [6]:
input_datas = outline_analyzer.prepare_requests_data(docx_elements)

In [7]:
print(type(input_datas))
print(input_datas[0])
print(input_datas[1])


<class 'list'>

## TOC TITLE LIST
title:第一章 招标公告, level:1, position:8
title:第二章 招标需求, level:1, position:9
title:第三章 投标人须知, level:1, position:10
title:第四章 评标办法及评分标准, level:1, position:21
title:第五章 合同条款及格式, level:1, position:25
title:第六章 投标文件格式, level:1, position:26

## HEADING TITLE LIST
title:北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目, level:1, position:1
title:招标文件, level:1, position:2
title:第一章 招标公告, level:1, position:46
title:第二章 招标需求, level:1, position:95
title:第三章 投标人须知, level:1, position:124
title:第四章 评标办法及评分标准, level:1, position:308
title:第五章 合同条款及格式, level:1, position:366
title:第六章 投标文件格式, level:1, position:501


## TOC TITLE LIST
title:前附表, level:2, position:11
title:一、总则, level:2, position:12
title:二、招标文件, level:2, position:13
title:三、投标文件的编制, level:2, position:14
title:四、开标, level:2, position:15
title:五、评标, level:2, position:16
title:六、定标, level:2, position:17
title:七、合同授予, level:2, position:18
title:八、履约保证金, level:2, position:19
title:九、招标代理服务费的收取, level:2, position:20
title:附件一：资格审查表

In [8]:
simulated_prompt, formatted_messages = outline_analyzer.simulate_prompt(input_datas[0])
pprint(formatted_messages)


[{'content': '你是一个专业的招标文档分析助手，帮助用户分析文档的结构和内容。', 'role': 'system'},
 {'content': '\n'
             '# Task\n'
             '分析招标文档的目录结构和正文标题之间的一致性\n'
             '\n'
             '# Requirements\n'
             '- 比对目录中的标题和正文中的实际标题\n'
             '- 忽略标点符号和空格的差异\n'
             '- 仅匹配标题的实际文本内容\n'
             '- 分别罗列出"目录中存在但正文中不存在"和"正文中存在但目录中不存在"的标题\n'
             '\n'
             '# Output\n'
             '## Rules\n'
             '- 只输出JSON格式的结果\n'
             '- 不使用Markdown格式\n'
             '- 确保JSON格式严格有效\n'
             '- 空元素使用[]\n'
             '\n'
             '## Format\n'
             '\n'
             '请严格按照以下JSON格式输出目录分析结果，不要包含任何额外的解释或说明：\n'
             '{\n'
             '    "toc_only_titles": [\n'
             '        {\n'
             '            "title": "标题内容",\n'
             '            "position": "目录中的位置",\n'
             '            "level": "标题层级"\n'
             '        }\n'
             '    ],\n'
             '    "heading_only_titles": [\n'
    

In [9]:
analysis_result = outline_analyzer.process(docx_elements)

HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
{
{
       " "toc_only_titles":HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
{
    [],
    "heading "toc_only_titles": [],
    "heading_only_titles": []
toc_only_titles":}_only_titles": [
        {
            "title": "北京 [],
    "heading京铁运恒_only_titles": []
}采购供应站有限公司 2024 年端午节物资采购项目",
            "position": "1",
            "level": "1"
        },
        {
            "title": "招标文件",
            "position": "2",
            "level": "1"
        }
    ]
}

In [10]:
# Show the type of each nesting level of the result, then the whole object.
print(type(analysis_result))
batch_result = analysis_result.analysis_result
print(type(batch_result))
batch_items = batch_result.result
print(type(batch_items))
print(type(batch_items[0]))
pprint(analysis_result)


<class 'apps.doc_analysis.pipeline.types.OutlineAnalysisResult'>
<class 'apps.doc_analysis.LLM_services._llm_data_types.BatchResult'>
<class 'list'>
<class 'dict'>
OutlineAnalysisResult(document_analysis=ModelData(model=<class 'apps.doc_analysis.models.DocumentAnalysis'>,
                                                  instance=<DocumentAnalysis: 测试分析A - 待分析>),
                      analysis_result=BatchResult(result=[{'distribution': {'{\n    "toc_only_titles": [],\n    "heading_only_titles": [\n        {\n            "title": "北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目",\n            "position": "1",\n            "level": "1"\n        },\n        {\n            "title": "招标文件",\n            "position": "2",\n            "level": "1"\n        }\n    ]\n}': 1.0},
                                                           'probability': 1.0,
                                                           'sample_count': 1,
                                                           'task_id': 0,
     

In [11]:
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
pprint(saved_analysis.outline_analysis_result)


{'analysis_result': {'approach': 'asyncio',
                     'error': None,
                     'probability': None,
                     'repeat_count': None,
                     'request_index': -1,
                     'result': [{'distribution': {'{\n    "toc_only_titles": [],\n    "heading_only_titles": [\n        {\n            "title": "北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目",\n            "position": "1",\n            "level": "1"\n        },\n        {\n            "title": "招标文件",\n            "position": "2",\n            "level": "1"\n        }\n    ]\n}': 1.0},
                                 'probability': 1.0,
                                 'sample_count': 1,
                                 'task_id': 0,
                                 'value': '{\n'
                                          '    "toc_only_titles": [],\n'
                                          '    "heading_only_titles": [\n'
                                          '        {\n'
                

# <font color="red">STEP 3: 测试outline_improvement.py</font>


In [None]:
# 初始化
import django_setup

In [5]:
# 导入相关模型
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps._01_extract_docx_elements import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData, DocxElements, OutlineAnalysisResult
from apps.doc_analysis.steps._02_outline_analysis import DocxOutlineAnalyzerStep
from apps.doc_analysis.steps._03_outline_improvement import OutlineImprovementStep
from pprint import pprint 

In [6]:
# 直接引用"测试分析A", 并获取其docx_elements 和 outline_analysis_result

saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
docx_elements = DocxElements.from_model(saved_analysis.extracted_elements)
outline_analysis_result = OutlineAnalysisResult.from_model(saved_analysis.outline_analysis_result)

In [None]:
# Show the type of each nesting level of the restored result, then the whole object.
restored_batch = outline_analysis_result.analysis_result
print(type(restored_batch))
restored_items = restored_batch.result
print(type(restored_items))
print(type(restored_items[0]))
pprint(outline_analysis_result)


## 分步骤测试

In [6]:
# 构建实例
outline_improver = OutlineImprovementStep()

In [None]:
# titles 提取
titles_to_improve = outline_improver.extract_titles_from_analysis(outline_analysis_result)
pprint(titles_to_improve)


In [8]:
# 改进元素
improved_elements = outline_improver.improve_document_elements(docx_elements, titles_to_improve)

In [None]:
# 打印改进后的数据
pprint(improved_elements)

## 集成测试

In [None]:
# Simulate the user confirming the analysis: flag the result itself and
# every element in both title groups as confirmed.
outline_analysis_result.user_confirm = True
for group_attr in ("heading_only_elements", "toc_only_elements"):
    for element in getattr(outline_analysis_result, group_attr):
        element['user_confirm'] = True

pprint(outline_analysis_result)


In [9]:
# End-to-end test of OutlineImprovementStep (steps/_03_outline_improvement.py):
# pass the (docx_elements, outline_analysis_result) tuple to process() and keep the result.
outline_improver = OutlineImprovementStep()
improved_docx_elements = outline_improver.process((docx_elements, outline_analysis_result))

In [None]:
# 打印原来的数据
pprint(docx_elements)

In [None]:
# 打印改进后的数据
pprint(improved_docx_elements)

# <font color="red">STEP 4: 构建文档树 DocxTree</font>


In [12]:
# 初始化
import django_setup

In [13]:
# show_structure函数（）
from pprint import pprint
from typing import Any, Optional

def show_structure(data: Any, name: str = "data", max_depth: Optional[int] = None) -> None:
    """Print the type, length (when available) and pretty-printed structure of *data*.

    Args:
        data: Object to inspect.
        name: Label printed in the header line.
        max_depth: Forwarded to ``pprint`` as ``depth``; ``None`` means no limit.
    """
    separator = "=" * 50
    print(f"\n{separator}")
    print(f"检查对象: {name}")
    print(f"类型: {type(data)}")

    # Checking hasattr(data, '__len__') is not enough: classes such as ``list``
    # expose the attribute but len() on them raises TypeError. Try the call
    # and skip the length line when the object has no usable length.
    try:
        length = len(data)
    except TypeError:
        length = None
    if length is not None:
        print(f"长度: {length}")

    print("\n结构:")
    pprint(data, depth=max_depth, width=100)
    print(f"{separator}\n")


In [14]:
# 导入相关模型
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import ModelData, ImprovedDocxElements, SimpleDocxNode, DocxTree
from apps.doc_analysis.steps._04_build_docxtree import BuildDocxTree
from pprint import pprint 

In [15]:
# 直接引用"测试分析A", 并获取 improved_docx_elements
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
improved_docx_elements = ImprovedDocxElements.from_model(saved_analysis.improved_docx_elements)
#elements = improved_docx_elements.elements
#show_structure(elements)


## 构建文档树

In [None]:
# 构建文档树
build_docx_tree = BuildDocxTree()
docx_tree = build_docx_tree.process(improved_docx_elements)
show_structure(docx_tree)


In [None]:
# 按顺序打印所有节点
pprint(docx_tree._ordered_nodes)

In [None]:
# 按树结构打印所有节点
docx_tree.print_tree()

In [None]:
# 检查文档树在models.py里的存储情况
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
print(type(saved_analysis.docxtree))
pprint(saved_analysis.docxtree)


## 测试从数据库提取文档树

In [None]:
# 还原：将 models.py 中存储的 docxtree 数据还原为 DocxTree 对象
docx_tree_from_model = DocxTree.from_model(saved_analysis.docxtree)
print(type(docx_tree_from_model))
pprint(docx_tree_from_model)

In [None]:
# 还原 ： 按顺序打印所有节点
pprint(docx_tree_from_model._ordered_nodes)

In [None]:
# 还原： 按树结构打印所有节点
docx_tree_from_model.print_tree()

## 测试文档树方法：查找节点，打印树，标题格式化

In [None]:
# 查找某个节点 find_node()， 测试 .find_node() 方法，比如node_id = 78
node = docx_tree.find_node(78)
pprint(node)

In [None]:
# 打印某个节点以下的文档树结构
docx_tree.print_tree(124)


In [None]:
# 格式化 标题结构 用于大模型分析
titles_nodes = docx_tree.format_titles()
pprint(titles_nodes)

## 测试添加节点

In [None]:
# Add a title node to the tree (level 2, placed via after_node_id=150).
docx_tree.add_title_node(content='>>>> 插入新节点', level=2, after_node_id=150)

In [None]:
# 打印结果 - 看顺序列表里是否有插入？
pprint(docx_tree._ordered_nodes[140:160])

In [None]:
# 打印文档树
docx_tree.print_tree(124)

## 测试调整节点类型

In [19]:
# Convert node 156 into a title node. The second argument (2) is presumably the
# target title level — TODO confirm against DocxTree.convert_to_title_node.
docx_tree.convert_to_title_node(156,2)

In [None]:
# Print a slice of the ordered node list to check the effect of the
# convert_to_title_node call in the previous cell.
# NOTE(review): assumes list positions 140-159 include node 156 — confirm.
pprint(docx_tree._ordered_nodes[140:160])

In [None]:
# 打印文档树
docx_tree.print_tree(124)

# <font color="red">STEP 5: 测试文档树的使用</font>

In [None]:
# 初始化
import django_setup

In [2]:
# 导入相关模型
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import ModelData,DocxTree
from apps.doc_analysis.steps._04_build_docxtree import BuildDocxTree
from pprint import pprint 

In [None]:
# 直接引用"测试分析A", 并获取 doc_tree
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
doc_tree = DocxTree.from_model(saved_analysis.docxtree)
print(type(doc_tree))
pprint(doc_tree)

In [None]:
llm_input = doc_tree.format_for_llm()
print(type(llm_input))
print(llm_input)


In [None]:
titles = doc_tree.format_titles()
print(type(titles))
print(titles)


In [None]:
titles_json = doc_tree.titles_to_json()
print(type(titles_json))
pprint(titles_json)
