# <font color="red"> 1. 测试 GenericLLMService</font>

In [17]:
# 初始化
import django_setup

In [20]:
# 导入相关模型
from apps.doc_analysis.LLM_services._generic_llm_services import GenericLLMService
from apps.doc_analysis.LLM_services._llm_data_types import LLMConfig, LLMRequest
import os, nest_asyncio
nest_asyncio.apply()

In [22]:
# Build the LLM configuration used by the GenericLLMService test below.
config = LLMConfig(
    llm_model_name = "qwen-plus",
    temperature = 0.7,
    top_p =  0.8,
    streaming = True,
    # Read the key from the environment so it never lands in the notebook;
    # os.getenv returns None when ALIBABA_API_KEY is unset.
    api_key = os.getenv("ALIBABA_API_KEY"),
    base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1",
    max_workers = 4,
    timeout = 30,  # presumably seconds -- TODO confirm against LLMConfig
    retry_times = 3
)

In [None]:
# 2. 创建服务实例
service = GenericLLMService(config)

In [31]:
# 3. Set the prompt template.
# The {context}, {requirement} and {output_format} placeholders use the same
# names as the keyword arguments passed to LLMRequest in a later cell.
prompt_template = """
请根据以下内容回答问题：
{context}

问题：{requirement}

输出格式：{output_format}
"""

# Attach the template to the service instance.
service.prompt_template = prompt_template

In [32]:
# 4. 准备测试数据
context = """
人工智能（AI）是计算机科学的一个分支，它企图了解智能的实质，并生产出一种新的能以人类智能相似的方式做出反应的智能机器。
"""
requirement = "请简要解释什么是人工智能"

output_format = "请用中文回答问题"

In [33]:
# 5. 创建请求对象
request = LLMRequest(context=context, requirement=requirement, output_format=output_format)

In [34]:
# 6. Define the async smoke test for the service
async def test_service():
    """Send the prepared ``request`` through ``service`` and print the outcome.

    Uses the notebook-level ``service`` and ``request`` objects. Any exception
    raised while processing is caught and its message is printed instead of
    being propagated.
    """
    try:
        print("处理结果：", await service.process(request))
    except Exception as exc:
        print("发生错误：", str(exc))

In [None]:
# 7. 运行测试
await test_service()

# <font color="red"> 2. 测试 Generic/BatchLLMService, _template</font>

In [None]:
# 初始化
import django_setup

In [2]:
# 获得docx_elements 用于 outline 分析测试
# 导入相关模型：get_user_model, Project, FileRecord, DocumentAnalysis, FileProjectLink, ProjectHistory
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import DocxElements, OutlineAnalysisResult
from pprint import pprint 
import os, json

# Look up the saved analysis titled "测试分析A" and read its extracted_elements
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
extracted_elements = saved_analysis.extracted_elements

# Build a DocxElements instance from the model's extracted_elements
docx_elements = DocxElements.from_model(extracted_elements)

## GenericLLMService，_template

In [8]:
# call llm service
from apps.doc_analysis.LLM_services._Template import TemplateLLMAnalyzer

In [5]:
# 数据准备
formatted_toc = docx_elements.format_toc_chapters()
formatted_headings = docx_elements.format_heading_chapters()

In [None]:
# =========== build context, requirement, output_format ===========
print("========= build context ==========")
def _build_context(formatted_toc: str, formatted_headings: str) -> str:
    return f"""
1. 目录标题列表：从文档目录中提取的标题
2. 正文标题列表：从文档正文中提取的标题

数据格式：
"[文档位置], 标题层级, 标题内容"

数据内容：
1. <目录标题列表>：
{formatted_toc}

2. <正文标题列表>：
{formatted_headings}
"""

context = _build_context(formatted_toc, formatted_headings)
print(context)

# ====================================================================
print("========= build requirement ==========")
# build requirement， 参数 requirement
def _build_requirement() -> str:
    """
    构建大模型分析任务要求
    """
    return """
请对比<目录标题列表>和<正文标题列表>的标题内容
找出以下三类不同标题项：
1. 目录列表里有，但正文里没有的标题项
2. 目录列表里没有，但正文里有的标题项

请注意：只比对标题内容，不比对[文档位置] 和 标题层级
"""

requirement = _build_requirement()
print(requirement)

# ====================================================================
print("========= build output_format ==========")
# build output_format, 参数 output_format
def _build_output_format() -> str:
    """Return the output-format instructions for the model.

    Delegates to ``OutlineAnalysisResult.get_prompt_specification()``.
    """
    specification = OutlineAnalysisResult.get_prompt_specification()
    return specification

output_format = _build_output_format()
print(output_format)


In [None]:
# 测试单独的分析 .analyze()
result = await TemplateLLMAnalyzer.analyze(context, requirement, output_format)

## BatchLLMService, _template

In [10]:
# Data preparation
# Extract the TOC title lists and the body heading lists in their formatted (non-JSON) form
# 1. TOC title lists
toc_chapters = docx_elements.format_toc_chapters()
toc_sections = docx_elements.format_toc_sections()
toc_subsections = docx_elements.format_toc_subsections()

# 2. Body heading lists
heading_chapters = docx_elements.format_heading_chapters()
heading_sections = docx_elements.format_heading_sections()
heading_subsections = docx_elements.format_heading_subsections()

In [11]:
# Build contexts, requirements and output_formats for the batch call
# One context per outline level
chapter_context = TemplateLLMAnalyzer.build_context(toc_chapters, heading_chapters)
section_context = TemplateLLMAnalyzer.build_context(toc_sections, heading_sections)
subsection_context = TemplateLLMAnalyzer.build_context(toc_subsections, heading_subsections)

requirement = TemplateLLMAnalyzer.build_requirement()
output_format = TemplateLLMAnalyzer.build_output_format()

# NOTE(review): the chapter context is submitted three times; section_context and
# subsection_context are built above but not used in this batch. Presumably
# intentional so the merge / voting cells below compare repeated runs of one
# input -- confirm.
contexts = [chapter_context, chapter_context, chapter_context]
requirements = [requirement, requirement, requirement]
output_formats = [output_format, output_format, output_format]


In [None]:
# 测试批量分析 .batch_analyze()
results = await TemplateLLMAnalyzer.batch_analyze(contexts, requirements, output_formats)

In [None]:
# 打印结果
print(f"结果类型：{type(results)}, 长度：{len(results)}个 {type(results[0])}")
pprint(results)

### 模拟输入 requests

In [27]:
from apps.doc_analysis.LLM_services._llm_data_types import BatchResult, LLMRequest

In [None]:
# 模拟 和 打印 requests 
requests = LLMRequest.create_batch(contexts,requirements,output_formats)
print(f"结果类型：{type(requests)}, 长度：{len(requests)}个 {type(requests[0])}")
pprint(requests)

### 结果合并 BatchResult.merge() 

In [None]:
# 打印 BatchResults
print(f"结果类型：{type(results)}, 长度：{len(results)}个 {type(results[0])}")
pprint(results)

In [None]:
# 打印合并的结果
merged_result = BatchResult.merge(results)
print(f"结果类型：{type(merged_result)}")
pprint(merged_result)

### 多路投票

In [38]:
voted_result = BatchResult.merge_with_probability(results)

In [None]:
# 打印投票结果
print(f"结果类型：{type(voted_result)}")
pprint(voted_result)

# <font color="red"> 3. 测试 outline_llm_analyzer.py </font>


In [None]:
# 初始化
import django_setup

In [2]:
# 获得docx_elements 用于 outline 分析测试
# 导入相关模型：get_user_model, Project, FileRecord, DocumentAnalysis, FileProjectLink, ProjectHistory
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import DocxElements, OutlineAnalysisResult
from pprint import pprint 
import os, json

# Look up the saved analysis titled "测试分析A" and read its extracted_elements
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
extracted_elements = saved_analysis.extracted_elements

# Build a DocxElements instance from the model's extracted_elements
docx_elements = DocxElements.from_model(extracted_elements)

In [3]:
# 导入 OutlineLLMAnalyzer
from apps.doc_analysis.LLM_services.outline_llm_analyzer import OutlineLLMAnalyzer
from apps.doc_analysis.LLM_services._llm_data_types import BatchResult

In [4]:
# Data preparation
# Extract the TOC title lists and the body heading lists in their formatted (non-JSON) form
# 1. TOC title lists
toc_chapters = docx_elements.format_toc_chapters()
toc_sections = docx_elements.format_toc_sections()
toc_subsections = docx_elements.format_toc_subsections()

# 2. Body heading lists
heading_chapters = docx_elements.format_heading_chapters()
heading_sections = docx_elements.format_heading_sections()
heading_subsections = docx_elements.format_heading_subsections()

In [5]:
# Build contexts, requirements and output_formats for the batch call
# One context per outline level
chapter_context = OutlineLLMAnalyzer.build_context(toc_chapters, heading_chapters)
section_context = OutlineLLMAnalyzer.build_context(toc_sections, heading_sections)
subsection_context = OutlineLLMAnalyzer.build_context(toc_subsections, heading_subsections)

requirement = OutlineLLMAnalyzer.build_requirement()
output_format = OutlineLLMAnalyzer.build_output_format()

# NOTE(review): the chapter context is submitted three times; section_context and
# subsection_context are built above but not used in this batch. Presumably
# intentional so the voting cell below compares repeated runs of one input --
# confirm.
contexts = [chapter_context, chapter_context, chapter_context]
requirements = [requirement, requirement, requirement]
output_formats = [output_format, output_format, output_format]

In [None]:
print(chapter_context)

In [None]:
# Test batch analysis via .batch_analyze(), overriding the model with "qwen-turbo"
results = await OutlineLLMAnalyzer.batch_analyze(
    contexts=contexts, 
    requirements=requirements, 
    output_formats=output_formats,
    model_name = "qwen-turbo"
    )

In [None]:
# 打印结果
print(f"结果类型：{type(results)}, 长度：{len(results)}个 {type(results[0])}")
pprint(results)

In [None]:
# 多路投票 1
voted_result1 = BatchResult.merge_with_probability(results)
# 打印投票结果
print(f"结果类型：{type(voted_result1)}")
pprint(voted_result1)

In [None]:
# 模拟 单个 PROMPT 输入

# --------  构建用户输入  --------
from apps.doc_analysis.LLM_services._llm_data_types import LLMRequest

sim_request = LLMRequest(
                context = chapter_context,
                requirement = requirement,
                output_format = output_format
                )

# ------- 模拟大模型prompt输入  -------
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
sim_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        "你是一个专业的文档分析助手，需要严格按照用户要求处理和分析文档内容。"
    ),
    HumanMessagePromptTemplate.from_template(
        OutlineLLMAnalyzer.build_prompt_template(),  # 植入prompt模板
        input_variables=["context", 
                         "requirement", 
                        "output_format",
                        ]
    )
])
#print(request.__dict__)
formatted_prompt = await sim_prompt.ainvoke(sim_request.__dict__)

# ------  打印大模型的prompt输入  ------
#pprint(formatted_prompt)
formatted_messages = formatted_prompt.to_messages()
# 转换为字典列表
messages_dict = [
    {
        "role": message.type,
        "content": message.content
    } for message in formatted_messages
]
import json
prompt_json = json.dumps(messages_dict, ensure_ascii=False, indent=2) 
pprint(prompt_json)


# <font color="red"> 4. 测试 request 重复三次组成 group </font>

In [1]:
# 初始化
import django_setup

Development settings loaded
INSTALLED_APPS: ['django.contrib.admin', 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', 'rest_framework', 'corsheaders', 'storages', 'apps.authentication', 'apps.files', 'apps.projects', 'apps.doc_analysis', 'apps.chat', 'django_filters', 'drf_spectacular', 'rest_framework_simplejwt.token_blacklist', 'django_celery_results', 'django_celery_beat']


INFO 2025-02-22 22:22:51,081 storage default_storage 的类型: COSStorage


Settings从哪里加载？: config.settings.development
项目根目录对么？: C:\Users\huiwa\Documents\_All_Projects\BidPilot_new\backend
文件存储settings对么？: apps.files.storage.COSStorage
文件default_storage对么？: COSStorage

已经安装的应用 Installed Apps 完整了么？:
- django.contrib.admin
- django.contrib.auth
- django.contrib.contenttypes
- django.contrib.sessions
- django.contrib.messages
- django.contrib.staticfiles
- rest_framework
- corsheaders
- storages
- apps.authentication
- apps.files
- apps.projects
- apps.doc_analysis
- apps.chat
- django_filters
- drf_spectacular
- rest_framework_simplejwt.token_blacklist
- django_celery_results
- django_celery_beat


In [2]:
# 获得docx_elements 用于 outline 分析测试
# 导入相关模型：get_user_model, Project, FileRecord, DocumentAnalysis, FileProjectLink, ProjectHistory
from apps.doc_analysis.models import DocumentAnalysis
from apps.doc_analysis.pipeline.types import DocxElements, OutlineAnalysisResult
from pprint import pprint 
import os, json

# Look up the saved analysis titled "测试分析A" and read its extracted_elements
saved_analysis = DocumentAnalysis.objects.get(title="测试分析A")
extracted_elements = saved_analysis.extracted_elements

# Build a DocxElements instance from the model's extracted_elements
docx_elements = DocxElements.from_model(extracted_elements)

In [3]:
# 导入 OutlineLLMAnalyzer
from apps.doc_analysis.LLM_services.outline_llm_analyzer import OutlineLLMAnalyzer
from apps.doc_analysis.LLM_services._llm_data_types import BatchResult, LLMRequest

In [4]:
# Data preparation
# Extract the TOC title lists and the body heading lists in their formatted (non-JSON) form
# 1. TOC title lists
toc_chapters = docx_elements.format_toc_chapters()
# NOTE(review): [:118] appears to slice the formatted string by character count,
# not by entry; the printed output of the next cell shows three section entries
# left. Presumably trimmed to keep the test prompt small -- confirm.
toc_sections = docx_elements.format_toc_sections()[:118]
toc_subsections = docx_elements.format_toc_subsections()

# 2. Body heading lists
heading_chapters = docx_elements.format_heading_chapters()
# NOTE(review): same character-count trimming as toc_sections, with a limit of 120.
heading_sections = docx_elements.format_heading_sections()[:120]
heading_subsections = docx_elements.format_heading_subsections()

In [5]:
print(toc_sections)
print(heading_sections)

"position":11, "Level":2, "title":前附表
"position":12, "Level":2, "title":一、总则
"position":13, "Level":2, "title":二、招标文件

"position":125, "Level":2, "title":前附表
"position":130, "Level":2, "title":一、总则
"position":159, "Level":2, "title":二、招标文件


In [6]:
# Build contexts, requirements and output_formats for the batch call
# One context per outline level
chapter_context = OutlineLLMAnalyzer.build_context(toc_chapters, heading_chapters)
section_context = OutlineLLMAnalyzer.build_context(toc_sections, heading_sections)
subsection_context = OutlineLLMAnalyzer.build_context(toc_subsections, heading_subsections)

requirement = OutlineLLMAnalyzer.build_requirement()
output_format = OutlineLLMAnalyzer.build_output_format()

# Each outline level is its own task here; the requirement and output format
# are shared by all three.
contexts = [chapter_context, section_context, subsection_context]
requirements = [requirement, requirement, requirement]
output_formats = [output_format, output_format, output_format]

## Repeated Inputs

In [6]:
# Build the request groups: every (context, requirement, output_format) triple
# is turned into several identical requests that share one group_id.
REPEATS_PER_GROUP = 3  # number of identical requests issued per group

requests = []
# Loop targets carry a "group_" prefix so they do not overwrite the
# notebook-level `requirement` / `output_format` values built in an earlier cell.
for group_id, (group_context, group_requirement, group_output_format) in enumerate(
    zip(contexts, requirements, output_formats)
):
    for _ in range(REPEATS_PER_GROUP):
        request = LLMRequest(
            context=group_context,
            requirement=group_requirement,
            output_format=group_output_format,
            group_id=group_id,  # requests sharing a group_id are repeats of the same input
        )
        requests.append(request)
                


In [None]:
# Print the generated requests
print(type(requests))
# Read each request through the comprehension variable rather than the
# notebook-level `request` left over from the cell above, so every entry
# reflects its own request instead of the last one built.
requests_dict = [llm_request.__dict__ for llm_request in requests]
pprint(requests_dict)

## BatchLLMService 处理带group_id的LLMRequest

In [None]:
# Test batch analysis via .batch_analyze_with_repeats(), called with repeats=3
results = await OutlineLLMAnalyzer.batch_analyze_with_repeats(
    contexts=contexts, 
    requirements=requirements, 
    output_formats=output_formats,
    repeats = 3
    )

In [None]:
pprint(results)

In [13]:
final_results = BatchResult.merge_hybrid(results)

In [None]:
pprint(final_results)

In [7]:
# Test batch analysis via .batch_analyze_with_repeats(), called with repeats=1
results2 = await OutlineLLMAnalyzer.batch_analyze_with_repeats(
    contexts=contexts, 
    requirements=requirements, 
    output_formats=output_formats,
    repeats = 1
    )

批量处理请求: 3 个任务
Final prompt:
messages=[SystemMessage(content='你是一个专业的文档分析助手，需要严格按照用户要求处理和分析文档内容。', additional_kwargs={}, response_metadata={}), HumanMessage(content='\n请为我分析招标文档的目录结构和正文标题的一致性。\n\n### 输入数据说明: \n\n1. 目录标题列表：从文档目录中提取的标题\n2. 正文标题列表：从文档正文中提取的标题\n\n数据格式：\n"[文档位置], 标题层级, 标题内容"\n\n数据内容：\n1. <目录标题列表>：\n"position":8, "Level":1, "title":第一章  招标公告\n"position":9, "Level":1, "title":第二章  招标需求\n"position":10, "Level":1, "title":第三章  投标人须知\n"position":21, "Level":1, "title":第四章  评标办法及评分标准\n"position":25, "Level":1, "title":第五章  合同条款及格式\n"position":26, "Level":1, "title":第六章  投标文件格式\n\n2. <正文标题列表>：\n"position":1, "Level":1, "title":北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目\n"position":2, "Level":1, "title":招标文件\n"position":46, "Level":1, "title":第一章  招标公告\n"position":95, "Level":1, "title":第二章 招标需求\n"position":124, "Level":1, "title":第三章  投标人须知\n"position":308, "Level":1, "title":第四章  评标办法及评分标准\n"position":366, "Level":1, "title":第五章 合同条款及格式\n"position":501, "Level":1, "title":第六章  投标文件格式\n\n\n##

In [11]:
pprint(results2)

[BatchResult(result='{\n'
                    '    "toc_only_titles": [],\n'
                    '    "heading_only_titles": [\n'
                    '        {\n'
                    '            "title": "北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目",\n'
                    '            "position": "1",\n'
                    '            "level": "1"\n'
                    '        },\n'
                    '        {\n'
                    '            "title": "招标文件",\n'
                    '            "position": "2",\n'
                    '            "level": "1"\n'
                    '        }\n'
                    '    ]\n'
                    '}',
             success=True,
             error=None,
             request_index=0,
             approach='asyncio',
             task_id=0,
             probability=None,
             repeat_count=None),
 BatchResult(result='{\n'
                    '    "toc_only_titles": [],\n'
                    '    "heading_only_titles": []\n'
       

In [9]:
final_results2 = BatchResult.merge_hybrid(results2)

In [12]:
pprint(final_results2)

BatchResult(result=[{'distribution': {'{\n    "toc_only_titles": [],\n    "heading_only_titles": [\n        {\n            "title": "北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目",\n            "position": "1",\n            "level": "1"\n        },\n        {\n            "title": "招标文件",\n            "position": "2",\n            "level": "1"\n        }\n    ]\n}': 1.0},
                     'probability': 1.0,
                     'sample_count': 1,
                     'task_id': 0,
                     'value': '{\n'
                              '    "toc_only_titles": [],\n'
                              '    "heading_only_titles": [\n'
                              '        {\n'
                              '            "title": "北京京铁运恒采购供应站有限公司 2024 '
                              '年端午节物资采购项目",\n'
                              '            "position": "1",\n'
                              '            "level": "1"\n'
                              '        },\n'
                             

In [13]:
final_results2.result

[{'value': '{\n    "toc_only_titles": [],\n    "heading_only_titles": [\n        {\n            "title": "北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目",\n            "position": "1",\n            "level": "1"\n        },\n        {\n            "title": "招标文件",\n            "position": "2",\n            "level": "1"\n        }\n    ]\n}',
  'probability': 1.0,
  'distribution': {'{\n    "toc_only_titles": [],\n    "heading_only_titles": [\n        {\n            "title": "北京京铁运恒采购供应站有限公司 2024 年端午节物资采购项目",\n            "position": "1",\n            "level": "1"\n        },\n        {\n            "title": "招标文件",\n            "position": "2",\n            "level": "1"\n        }\n    ]\n}': 1.0},
  'sample_count': 1,
  'task_id': 0},
 {'value': '{\n    "toc_only_titles": [],\n    "heading_only_titles": []\n}',
  'probability': 1.0,
  'distribution': {'{\n    "toc_only_titles": [],\n    "heading_only_titles": []\n}': 1.0},
  'sample_count': 1,
  'task_id': 1},
 {'value': '{\n    "toc_only_titles": []

In [14]:
final_results2.success

True