In [None]:
import django_setup

In [22]:
# Import the models and helpers used below: get_user_model, DocumentAnalysis, Project, FileRecord, SimpleUploadedFile, ValidationError, DocxExtractorStep, ModelData and DocxElements
from django.contrib.auth import get_user_model
from apps.doc_analysis.models import DocumentAnalysis, InvalidStatusTransition
from apps.projects.models import Project
from apps.files.models import FileRecord
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.exceptions import ValidationError
from apps.doc_analysis.steps.docx_extractor import DocxExtractorStep
from apps.doc_analysis.pipeline.types import ModelData,DocxElements

In [None]:
# Load the user, project and file record this notebook works with.
# Nothing is created here: each lookup uses .get(), so it expects the row to
# already exist in the database.
# (The original note says the project and the file record are linked to each
# other; that link is not checked in this cell.)
User = get_user_model()

# Fetch the pre-existing test data

# Existing user, looked up by phone number
user = User.objects.get(phone='18501771516')
print(f"用户: {user.phone}")

# Existing project, looked up by project name
project = Project.objects.get(project_name='测试项目1')
print(f"项目: {project.project_name}")

# Existing file record, looked up by id
file_record = FileRecord.objects.get(id='3')
print(f"文件: {file_record.name}")

In [None]:
# Delete any DocumentAnalysis rows titled "测试分析A" left over from earlier runs,
# so the creation cell below starts from a clean state.
DocumentAnalysis.objects.filter(title="测试分析A").delete()

In [None]:
# 1. Create a new DocumentAnalysis row titled "测试分析A", attached to the
#    project and user loaded earlier in the notebook.
docx_analysis = DocumentAnalysis.objects.create(
    title="测试分析A",
    project=project,
    created_by=user,
    # Sample analysis questions
    analysis_questions=["投标要求", "评分标准"],
)
print(f"创建文档分析: {docx_analysis.title} (ID: {docx_analysis.id})")

In [None]:
# 2. Upload a real DOCX file

# 2.1 Location of the sample document on the local machine
doc_path = "C:/Users/huiwa/Downloads/文本分析测试/CaseTest/case8：招标文件-第1包：一级压榨花生油.docx"

# 2.2 Read the raw bytes and wrap them in an in-memory upload object
with open(doc_path, "rb") as f:
    file_content = f.read()

test_file = SimpleUploadedFile(
    name="test_doc.docx",
    content=file_content,
    content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
print(f"文件大小: {test_file.size}")

# 2.3 Create a new file record that stores the uploaded file object
new_file_record = FileRecord.objects.create(
    name="test_doc.docx",
    owner=user,
    size=test_file.size,
    file=test_file,  # the in-memory upload prepared in step 2.2
)
print(f"创建文件记录: {new_file_record.name}")

In [15]:
# 3. Attach the newly created file record to the analysis.
# Any failure is only printed, so the notebook carries on to the next cell.
# NOTE(review): `Exception` is broad. InvalidStatusTransition is imported at the
# top of the notebook but not used in the visible cells - possibly the error
# this call was meant to handle; confirm before narrowing.
try:
    docx_analysis.update_file_record(new_file_record)
except Exception as exc:
    print(f"关联文件失败: {str(exc)}")

In [None]:
# 4. Run the DOCX extraction step and inspect what ends up on the analysis row.
# NOTE: the start_analysis() call below is commented out, so the "before" and
# "after" status lines currently print the same value.
print("\n===== 最终状态检查 =====")
print(f"开始分析前-状态: {docx_analysis.status}")
#docx_analysis.start_analysis()
print(f"开始分析后-状态: {docx_analysis.status}")

# Initialise the extraction step
docx_extractor = DocxExtractorStep()

# Reset the result holder up front: when this cell is re-run and extraction
# fails, the cells below must not silently display a stale `saved_analysis`
# left over from an earlier successful execution.
saved_analysis = None

try:
    # Wrap the model class and the analysis instance as the step's input
    input_data = ModelData(model=DocumentAnalysis, instance=docx_analysis)

    # Run the document extraction
    extracted_elements = docx_extractor.process(input_data)

    # Report the extraction result
    print("文档提取成功！提取到的元素数量:", len(extracted_elements))
    # Only show a sample when something was extracted; indexing [0] on an
    # empty result would raise IndexError, which this cell does not handle.
    if len(extracted_elements) > 0:
        print("第一个元素示例:", extracted_elements[0])

    # Re-read the analysis row and show its stored extracted_elements
    # (presumably persisted by the extraction step - verify in DocxExtractorStep)
    saved_analysis = DocumentAnalysis.objects.get(id=docx_analysis.id)
    print("保存的提取结果:", saved_analysis.extracted_elements)

except ValidationError as e:
    print("文档提取失败:", str(e))
    # Re-read the row to show the status and error message recorded on it
    failed_analysis = DocumentAnalysis.objects.get(id=docx_analysis.id)
    print("分析状态:", failed_analysis.status)
    print("错误信息:", failed_analysis.error_message)





In [None]:
# Pretty-print the extracted elements stored on the re-read analysis row.
# Relies on `saved_analysis` having been assigned by the extraction cell above.
from pprint import pprint 
pprint(saved_analysis.extracted_elements)

In [None]:
# Wrap the stored elements in DocxElements and pretty-print what get_headings()
# returns. Relies on `saved_analysis` and `pprint` from the earlier cells.
headings = DocxElements(saved_analysis.extracted_elements).get_headings()
pprint(headings)