In [None]:
# 初始化
import django_setup

In [2]:
from apps.projects.models import Project, Task, TaskStatus, TaskType
from typing import List
from pprint import pprint

In [None]:
# Simulate how the celery task obtains the project: look it up by name.
# NOTE: Project.objects.get raises if zero or several projects match this name.
current_project = Project.objects.get(project_name='测试项目1')
print(f"用于测试的项目: {current_project.project_name}")
print(f"项目包含的文件: {current_project.files.all()}")


In [4]:
from apps.projects.services._02_outline_analysis import DocxOutlineAnalyzerStep
# Instantiate the outline-analysis step that the following cell runs.
outline_analysis_step = DocxOutlineAnalyzerStep()


In [None]:
# Run the outline-analysis step against the test project.
outline_analysis_step.process(current_project)

In [None]:
# Fetch the project's DOCX extraction task and print its raw result.
task = Task.objects.get(stage__project=current_project, type=TaskType.DOCX_EXTRACTION_TASK)
print(task.result_raw)


#### 模拟 `_02_outline_analysis.py`

In [None]:
# validate_input 处获取 项目信息
def validate_input(data: Project) -> bool:
    """Check that the project has a DOCX extraction task with Tiptap content.

    Args:
        data: The project to validate.

    Returns:
        True when the project's DOCX_EXTRACTION_TASK exists and its
        ``docx_tiptap`` is truthy; False otherwise.

    Raises:
        Task.MultipleObjectsReturned: If more than one matching task exists.
    """
    try:
        # Query by the ``data`` argument rather than a notebook-level global,
        # so the function validates the project it was actually given.
        task = Task.objects.get(
            stage__project=data,
            type=TaskType.DOCX_EXTRACTION_TASK,
        )
    except Task.DoesNotExist:
        # ``get`` raises instead of returning None when nothing matches.
        return False
    return bool(task.docx_tiptap)

# Run the validation against the test project and print the boolean result.
print(validate_input(current_project))

In [5]:
# Load the project's DOCX extraction task for use by the cells below.
task = Task.objects.get(stage__project=current_project, type=TaskType.DOCX_EXTRACTION_TASK)

In [None]:
# Pretty-print the task's docx_tiptap payload.
pprint(task.docx_tiptap)


In [7]:
import tiktoken
def count_tokens(text: str) -> int:
    """Return how many tokens ``text`` encodes to under the gpt-3.5-turbo encoding."""
    tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_ids = tokenizer.encode(text)
    return len(token_ids)


In [None]:
## ------- prepare requests data -------
# Exercise the Tiptap helper: extract indexed paragraphs from the task's docx_tiptap.
from apps.projects.tiptap.helpers import TiptapUtils
# NOTE(review): the meaning of the second argument (50) is not visible here — confirm against TiptapUtils.
paragraphs, index_path_map = TiptapUtils.extract_indexed_paragraphs(task.docx_tiptap, 50)
# Print the token count of the stringified paragraphs, then the paragraphs themselves.
print(count_tokens(str(paragraphs)))
print(paragraphs)


In [None]:
# Inspect the Python types of the two values returned by extract_indexed_paragraphs.
print(type(index_path_map), type(paragraphs))

In [10]:
def output_format_required() -> str:
    """
    Return the output-format instructions that are handed to the LLM.

    The text asks for Markdown output in which headings use the matching
    number of ``#`` characters and each heading keeps its source index as an
    ``<!-- index: xxx -->`` comment, followed by one input example and the
    corresponding output example.
    """
    format_spec = """

生成 Markdown：
标题使用相应级别的 Markdown 语法（#）
保留 index 信息，使用 <!-- index: xxx --> 注释格式

输入示例： 

[
  {"content": "第六章 投标文件格式", "index": 484},
  {"content": "6.1 评标方法", "index": 512},
  {"content": "6.1.1 资格审查", "index": 530},
  {"content": "本项目采用综合评分法", "index": 540}
]


输出示例：

<!-- index: 484 -->
# 第六章 投标文件格式

<!-- index: 512 -->
## 6.1 评标方法

<!-- index: 530 -->
### 6.1.1 资格审查

"""
    return format_spec





In [11]:
def build_prompt_template() -> str:
    """Return the prompt template for heading detection.

    The template contains two placeholders, ``{output_format}`` and
    ``{data_input}``, which are left unfilled here.
    """
    template = """
你是一个擅长文档结构分析的 AI，接下来我会提供一些文本内容，每条数据包含 content（文本内容）和 index（索引）。你的任务是：
识别标题：判断文本是否是一个章节标题（例如“第X章”、“X.X”、“X.X.X” 等， 也可能是其他格式）。
确定层级：
“第X章” → H1（#）
“X.X” → H2（##）
“X.X.X” → H3（###）
如果不是标题，则忽略



## Format
{output_format}

# Input
{data_input}
"""
    return template

In [12]:
from apps._tools.LLM_services._llm_data_types import LLMConfig
import os

In [13]:
def build_llm_config(model_name: str) -> LLMConfig:
    """Build the LLMConfig for ``model_name`` (e.g. qwen-plus).

    The API key is read from the ``ALIBABA_API_KEY`` environment variable;
    all other settings are fixed values defined below.
    """
    settings = {
        "llm_model_name": model_name,
        "temperature": 0.7,
        "top_p": 0.8,
        "streaming": True,
        "api_key": os.getenv("ALIBABA_API_KEY"),
        "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
        "max_workers": 4,
        "timeout": 30,
        "retry_times": 3,
    }
    return LLMConfig(**settings)

# Notebook-level config instance inspected by the conversion cells below.
llm_config = build_llm_config(model_name="qwen-max-0125")

In [None]:
# Convert the config with to_model() and show the resulting type and value.
llm_config_to_model = llm_config.to_model()
print(type(llm_config_to_model))
print(llm_config_to_model)

In [None]:
# Rebuild an LLMConfig from that model with from_model() and show its type and value.
llm_config_from_model = LLMConfig.from_model(llm_config_to_model)
print(type(llm_config_from_model))
print(llm_config_from_model)


In [16]:
# LLM CONFIG
import os, asyncio, nest_asyncio
nest_asyncio.apply()
from apps._tools.LLM_services._llm_data_types import BatchResult, LLMConfig
from apps._tools.LLM_services.llm_service import LLMService
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

def build_llm_config(model_name: str) -> LLMConfig:
    """Build the LLMConfig for ``model_name`` (e.g. qwen-plus).

    The API key is read from the ``ALIBABA_API_KEY`` environment variable;
    all other settings are fixed values defined below.

    NOTE: an identical function of the same name is defined in an earlier
    cell; when cells run in order, this definition shadows that one.
    """
    settings = {
        "llm_model_name": model_name,
        "temperature": 0.7,
        "top_p": 0.8,
        "streaming": True,
        "api_key": os.getenv("ALIBABA_API_KEY"),
        "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
        "max_workers": 4,
        "timeout": 30,
        "retry_times": 3,
    }
    return LLMConfig(**settings)

def llm_analyze(data_inputs: List[str], repeats: int = 1):
    """Run the LLM analysis over ``data_inputs`` and return its result.

    Builds an LLMService from the qwen-max-0125 config, the prompt template
    and the output-format text, then drives its async ``analyze`` call to
    completion with ``asyncio.run``.

    Args:
        data_inputs: Passed to ``LLMService.analyze`` as ``data_input``.
        repeats: Accepted but not used anywhere in this function.

    Returns:
        Whatever ``LLMService.analyze`` returns.
    """
    service = LLMService(
        config=build_llm_config(model_name="qwen-max-0125"),
        prompt_template=build_prompt_template(),
        output_format=output_format_required(),
    )

    # Wrap the awaitable in a coroutine so asyncio.run can drive it.
    async def _run_analysis():
        return await service.analyze(data_input=data_inputs)

    return asyncio.run(_run_analysis())

In [None]:
# Use the extracted paragraphs as the LLM input and print the raw analysis result.
data_inputs = paragraphs


raw_results = llm_analyze(
    data_inputs=data_inputs
)
print(raw_results)

In [None]:
# Check the Python type of the value returned by llm_analyze.
print(type(raw_results))


In [13]:
# from apps.projects.tiptap.client import TiptapClient
# client = TiptapClient()   
# md = client.json_to_markdown(findings)
# pprint(md)


In [14]:
# prepare_requests_data 处获取 项目信息

# def prepare_requests_data(data) -> List[str]:
#     """
#     准备大模型分析所需的数据 data_inputs, 通常是List[str] 格式
#     """

#     # 1. 提取目录标题列表
#     toc_chapters = data.format_toc_chapters()
#     toc_sections = data.format_toc_sections()#[:118]
#     toc_subsections = data.format_toc_subsections()

#     # 2. 提取正文标题列表
#     heading_chapters = data.format_heading_chapters()
#     heading_sections = data.format_heading_sections()#[:120]
#     heading_subsections = data.format_heading_subsections()

#     # 3. 构建数据输入
#     data_input1 = self._build_data_input(toc_chapters, heading_chapters)
#     data_input2 = self._build_data_input(toc_sections, heading_sections)
#     data_input3 = self._build_data_input(toc_subsections, heading_subsections)

#     data_inputs = [data_input1, data_input2, data_input3]

#     return data_inputs

# data_inputs = prepare_requests_data(current_project)
