From 1d74dfb675a3f8bd875668400e7d0a516175387a Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Fri, 14 Nov 2025 14:11:24 +0800 Subject: [PATCH 01/10] fix merge bug --- backend/app/api/v1/endpoints/pipelines.py | 120 ++++++++++++++ backend/app/core/config.py | 1 + backend/app/schemas/pipelines.py | 132 +++++++++++---- backend/app/services/pipeline_registry.py | 186 ++++++++++++++++++++++ 4 files changed, 409 insertions(+), 30 deletions(-) create mode 100644 backend/app/services/pipeline_registry.py diff --git a/backend/app/api/v1/endpoints/pipelines.py b/backend/app/api/v1/endpoints/pipelines.py index e69de29..31fdd1e 100644 --- a/backend/app/api/v1/endpoints/pipelines.py +++ b/backend/app/api/v1/endpoints/pipelines.py @@ -0,0 +1,120 @@ +from typing import List +from fastapi import APIRouter, HTTPException, BackgroundTasks, Request +from app.schemas.pipelines import ( + PipelineIn, + PipelineOut, + PipelineExecutionRequest, + PipelineExecutionResult +) +from app.services.pipeline_registry import PipelineRegistry, _PIPELINE_REGISTRY +from app.api.v1.resp import ok, created +from app.api.v1.envelope import ApiResponse +from app.core.logger_setup import get_logger + +# 配置日志 +logger = get_logger(__name__) + +# 创建路由器 +router = APIRouter(tags=["pipelines"]) + +# CRUD操作API +@router.get("/", response_model=ApiResponse[List[PipelineOut]], operation_id="list_pipelines", summary="返回所有注册的Pipeline列表") +def list_pipelines(request: Request): + try: + logger.info(f"Request: {request.method} {request.url.path}") + pipelines = _PIPELINE_REGISTRY.list_pipelines() + logger.info(f"Successfully listed {len(pipelines)} pipelines") + return ok(pipelines) + except Exception as e: + logger.error(f"Failed to list pipelines: {str(e)}", exc_info=True) + raise HTTPException(500, f"Failed to list pipelines: {str(e)}") + +@router.post("/", response_model=ApiResponse[PipelineOut], operation_id="create_pipeline", summary="创建一个新的Pipeline") +def create_pipeline(request: Request, payload: PipelineIn): + try: + logger.info(f"Request: {request.method} {request.url.path}, Pipeline name: {payload.name}") + pipeline = _PIPELINE_SERVICE.create_pipeline(payload) + return created(pipeline) + except ValueError as e: + logger.error(f"Invalid pipeline configuration: {str(e)}", exc_info=True) + raise HTTPException(400, f"Invalid pipeline configuration: {str(e)}") + except Exception as e: + logger.error(f"Failed to create pipeline: {str(e)}", exc_info=True) + raise HTTPException(400, f"Failed to create pipeline: {str(e)}") + +@router.get("/{pipeline_id}", response_model=ApiResponse[PipelineOut], operation_id="get_pipeline", summary="根据ID获取Pipeline详情") +def get_pipeline(pipeline_id: str): + pipeline = _PIPELINE_SERVICE.get_pipeline(pipeline_id) + if not pipeline: + raise HTTPException(404, f"Pipeline with id {pipeline_id} not found") + return ok(pipeline) + +@router.put("/{pipeline_id}", response_model=ApiResponse[PipelineOut], operation_id="update_pipeline", summary="更新指定的Pipeline") +def update_pipeline(pipeline_id: str, payload: PipelineIn): + try: + updated_pipeline = _PIPELINE_SERVICE.update_pipeline(pipeline_id, payload) + return ok(updated_pipeline) + except ValueError as e: + logger.error(f"Failed to update pipeline: {str(e)}") + raise HTTPException(404, str(e)) + except Exception as e: + logger.error(f"Failed to update pipeline {pipeline_id}: {e}") + raise HTTPException(400, f"Failed to update pipeline: {e}") + +@router.delete("/{pipeline_id}", operation_id="delete_pipeline", summary="删除指定的Pipeline") +def delete_pipeline(pipeline_id: str): + try: + success = _PIPELINE_SERVICE.delete_pipeline(pipeline_id) + if not success: + raise HTTPException(404, f"Pipeline with id {pipeline_id} not found") + return ok(message=f"Pipeline {pipeline_id} deleted successfully") + except HTTPException: + raise + except Exception as e: + logger.error(f"Failed to delete pipeline {pipeline_id}: {e}") + raise HTTPException(500, f"Failed to delete pipeline: {e}") + +# Pipeline执行API +@router.post("/execute", response_model=ApiResponse[PipelineExecutionResult], operation_id="execute_pipeline", summary="执行Pipeline") +async def execute_pipeline(request: Request, payload: PipelineExecutionRequest, background_tasks: BackgroundTasks): + try: + logger.info(f"Request: {request.method} {request.url.path}") + + # 调用服务层开始执行 + execution_id, pipeline_config, initial_result = _PIPELINE_SERVICE.start_execution( + pipeline_id=payload.pipeline_id, + config=payload.config + ) + + # 在后台异步执行Pipeline + background_tasks.add_task( + _PIPELINE_SERVICE.execute_pipeline_task, + execution_id, + pipeline_config + ) + + return ok(initial_result, message="Pipeline execution started") + except ValueError as e: + logger.error(f"Invalid execution request: {str(e)}") + raise HTTPException(400, str(e)) + except HTTPException: + raise + except Exception as e: + logger.error(f"Failed to start pipeline execution: {e}") + raise HTTPException(400, f"Failed to start pipeline execution: {e}") + +@router.get("/execution/{execution_id}", response_model=ApiResponse[PipelineExecutionResult], operation_id="get_execution_result", summary="获取Pipeline执行结果") +def get_execution_result(execution_id: str): + result = _PIPELINE_SERVICE.get_execution_result(execution_id) + if not result: + raise HTTPException(404, f"Execution with id {execution_id} not found") + return ok(result) + +@router.get("/executions", response_model=ApiResponse[List[PipelineExecutionResult]], operation_id="list_executions", summary="列出所有Pipeline执行记录") +def list_executions(): + try: + executions = _PIPELINE_SERVICE.list_executions() + return ok(executions) + except Exception as e: + logger.error(f"Failed to list executions: {e}") + raise HTTPException(500, f"Failed to list executions: {e}") \ No newline at end of file diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 97a1ac8..ca57364 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -5,6 +5,7 @@ class Settings(BaseSettings): ENV: str = "dev" DATA_REGISTRY: str = "data/registry.yaml" TASK_REGISTRY: str = "data/task_registry.yaml" + PIPELINE_REGISTRY: str = "data/pipeline_registry.json" DataFlow_CORE_DIR: str = "data/dataflow_core" settings = Settings() diff --git a/backend/app/schemas/pipelines.py b/backend/app/schemas/pipelines.py index 80af960..d0bce52 100644 --- a/backend/app/schemas/pipelines.py +++ b/backend/app/schemas/pipelines.py @@ -1,33 +1,105 @@ from enum import Enum -from typing import List, Union -from pydantic import BaseModel, Field -from typing_extensions import Annotated -import os -from app.core.config import settings +from typing import List, Dict, Any, Optional, Union +from pydantic import BaseModel, Field, field_validator + + class Pipeline(str, Enum): + """Pipeline类型枚举""" undefined = "undefined" - Text2SQL = "text2sql" - Reasoning = "reasoning" - CodeGeneration = "code_generation" - Translation = "translation" - # 可继续扩展 - - -def generate_pipeline_enum(path: str): - """根据文件夹自动生成 Enum""" - - items = [ - name for name in os.listdir(path) - if os.path.isdir(os.path.join(path, name)) - ] - if not items: - raise ValueError(f"No pipeline found in {path}") - return Enum("Pipeline", {name: name for name in items}) - -# Pipeline = generate_pipeline_enum(os.path.join(settings.DataFlow_CORE_DIR, "example_data")) - -# print("Generated Pipeline Enum with items:", list(Pipeline)) -# OneOrManyPipelines = Annotated[ -# Union[Pipeline, List[Pipeline]], -# Field(description="选择一个或多个 pipeline") -# ] \ No newline at end of file + agentic_rag = "agentic_rag" + chemistry = "chemistry" + code = "code" + conversation = "conversation" + core_speech = "core_speech" + db = "db" + core_text = "core_text" + core_vision = "core_vision" + general_text = "general_text" + knowledge_cleaning = "knowledge_cleaning" + text2sql = "text2sql" + reasoning = "reasoning" + code_generation = "code_generation" + translation = "translation" + text_pt = "text_pt" + text_sft = "text_sft" + + +class ExecutionStatus(str, Enum): + """Pipeline执行状态枚举""" + queued = "queued" + running = "running" + completed = "completed" + failed = "failed" + + +class PipelineOperator(BaseModel): + """Pipeline算子模型""" + name: str = Field(..., description="算子名称") + params: Dict[str, Any] = Field(default_factory=dict, description="算子参数配置") + + @field_validator('name') + def validate_operator_name(cls, v: str) -> str: + """验证算子名称格式""" + if not v.replace('_', '').isalnum(): + raise ValueError('Operator name can only contain letters, numbers and underscores') + # 后续可以补充从可用算子集中验证算子名称是否存在 + return v + + +class PipelineConfig(BaseModel): + """Pipeline配置模型""" + input_dataset: str = Field(..., description="输入数据集ID") + # 用 list 的顺序代表算子执行顺序 + operators: List[PipelineOperator] = Field(default_factory=list, description="算子执行序列") + run_config: Dict[str, Any] = Field(default_factory=dict, description="运行时配置参数") + + @field_validator('operators') + def validate_operators(cls, v: List[PipelineOperator]) -> List[PipelineOperator]: + """确保至少有一个算子""" + if not v: + raise ValueError('Pipeline must have at least one operator') + return v + + +class PipelineIn(BaseModel): + """创建/更新Pipeline的请求模型""" + name: str = Field(..., description="Pipeline名称") + config: PipelineConfig = Field(..., description="Pipeline详细配置") + tags: List[str] = Field(default_factory=list, description="标签列表,用于分类和搜索") + + +class PipelineOut(BaseModel): + """Pipeline响应模型, 包含完整信息""" + id: str = Field(..., description="Pipeline唯一标识符") + name: str = Field(..., description="Pipeline名称") + config: PipelineConfig = Field(..., description="Pipeline配置") + tags: List[str] = Field(default_factory=list, description="标签列表") + created_at: str = Field(..., description="创建时间") + updated_at: str = Field(..., description="更新时间") + status: ExecutionStatus = Field(..., description="当前执行状态") + + class Config: + from_attributes = True + + +class PipelineExecutionRequest(BaseModel): + """Pipeline执行请求模型""" + pipeline_id: Optional[str] = Field(None, description="预定义Pipeline ID") + config: Optional[PipelineConfig] = Field(None, description="自定义Pipeline配置") + + @field_validator('pipeline_id', 'config') + def validate_at_least_one(cls, v, info): + """确保至少提供pipeline_id或config之一""" + if info.data.get('pipeline_id') is None and info.data.get('config') is None: + raise ValueError('Either pipeline_id or config must be provided') + return v + + +class PipelineExecutionResult(BaseModel): + """Pipeline执行结果模型""" + execution_id: str = Field(..., description="执行会话唯一标识符") + status: ExecutionStatus = Field(..., description="执行状态") + output: Dict[str, Any] = Field(default_factory=dict, description="执行输出结果") + logs: List[str] = Field(default_factory=list, description="执行日志列表") + started_at: Optional[str] = Field(None, description="执行开始时间") + completed_at: Optional[str] = Field(None, description="执行完成时间") \ No newline at end of file diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py new file mode 100644 index 0000000..cc24a27 --- /dev/null +++ b/backend/app/services/pipeline_registry.py @@ -0,0 +1,186 @@ +import json +import uuid +import datetime +from typing import List, Optional +from app.core.logger_setup import get_logger +from app.schemas.pipelines import ( + PipelineIn, + PipelineOut, + PipelineConfig, + PipelineExecutionResult +) + +logger = get_logger(__name__) + + +class PipelineRegistry: + def __init__(self): + self._pipeline_registry = {} + self._execution_results = {} + + def get_current_time(self): + """获取当前时间的ISO格式字符串""" + return datetime.datetime.now().isoformat() + + def list_pipelines(self) -> List[PipelineOut]: + """列出所有注册的Pipeline""" + return list(self._pipeline_registry.values()) + + def create_pipeline(self, pipeline_data: PipelineIn) -> PipelineOut: + """创建一个新的Pipeline""" + # 生成唯一ID + pipeline_id = str(uuid.uuid4()) + current_time = self.get_current_time() + + # 创建PipelineOut对象 + pipeline = PipelineOut( + id=pipeline_id, + name=pipeline_data.name, + config=pipeline_data.config, + tags=pipeline_data.tags, + created_at=current_time, + updated_at=current_time + ) + + # 保存到注册表 + self._pipeline_registry[pipeline_id] = pipeline + logger.info(f"Successfully created pipeline: {pipeline_id} with name: {pipeline_data.name}") + return pipeline + + def get_pipeline(self, pipeline_id: str) -> Optional[PipelineOut]: + """根据ID获取Pipeline""" + return self._pipeline_registry.get(pipeline_id) + + def update_pipeline(self, pipeline_id: str, pipeline_data: PipelineIn) -> PipelineOut: + """更新指定的Pipeline""" + if pipeline_id not in self._pipeline_registry: + raise ValueError(f"Pipeline with id {pipeline_id} not found") + + current_pipeline = self._pipeline_registry[pipeline_id] + # 更新字段,保留创建时间 + updated_pipeline = PipelineOut( + id=pipeline_id, + name=pipeline_data.name, + config=pipeline_data.config, + tags=pipeline_data.tags, + created_at=current_pipeline.created_at, + updated_at=self.get_current_time() + ) + + self._pipeline_registry[pipeline_id] = updated_pipeline + logger.info(f"Updated pipeline: {pipeline_id}") + return updated_pipeline + + def delete_pipeline(self, pipeline_id: str) -> bool: + """删除指定的Pipeline""" + if pipeline_id not in self._pipeline_registry: + return False + + del self._pipeline_registry[pipeline_id] + logger.info(f"Deleted pipeline: {pipeline_id}") + return True + + async def execute_pipeline_task(self, execution_id: str, pipeline_config: PipelineConfig): + """异步执行Pipeline的任务""" + logs = [] + output = {} + status = "running" + logger.info(f"Starting background execution task for execution_id: {execution_id}") + + try: + # 初始化执行结果 + self._execution_results[execution_id] = PipelineExecutionResult( + execution_id=execution_id, + status=status, + output=output, + logs=logs + ) + + # 记录开始执行 + logs.append(f"[{self.get_current_time()}] Starting pipeline execution") + logs.append(f"[{self.get_current_time()}] Input dataset: {pipeline_config.input_dataset}") + logs.append(f"[{self.get_current_time()}] Run config: {json.dumps(pipeline_config.run_config)}") + + # 模拟数据加载 + logs.append(f"[{self.get_current_time()}] Loading dataset: {pipeline_config.input_dataset}") + + # 按顺序执行算子 + current_data = {"dataset_id": pipeline_config.input_dataset, "data": {}} + for i, operator in enumerate(pipeline_config.operators): + logs.append(f"[{self.get_current_time()}] Executing operator {i+1}/{len(pipeline_config.operators)}: {operator.name}") + logs.append(f"[{self.get_current_time()}] Operator params: {json.dumps(operator.params)}") + + try: + # 模拟算子执行 + current_data["data"] = { + "operator": operator.name, + "params": operator.params, + "output": f"Processed by {operator.name}" + } + logs.append(f"[{self.get_current_time()}] Operator {operator.name} executed successfully") + except Exception as op_error: + error_msg = f"Operator {operator.name} failed: {op_error}" + logs.append(f"[{self.get_current_time()}] ERROR: {error_msg}") + status = "failed" + output["error"] = error_msg + break + + # 更新执行结果 + if status != "failed": + status = "completed" + output["result"] = current_data + logs.append(f"[{self.get_current_time()}] Pipeline execution completed successfully") + + except Exception as e: + status = "failed" + error_msg = f"Pipeline execution failed: {e}" + logs.append(f"[{self.get_current_time()}] ERROR: {error_msg}") + output["error"] = error_msg + + # 保存最终结果 + self._execution_results[execution_id] = PipelineExecutionResult( + execution_id=execution_id, + status=status, + output=output, + logs=logs + ) + + def start_execution(self, pipeline_id: Optional[str] = None, config: Optional[PipelineConfig] = None) -> tuple[str, PipelineExecutionResult]: + """开始执行Pipeline""" + # 获取Pipeline配置 + if pipeline_id: + pipeline = self.get_pipeline(pipeline_id) + if not pipeline: + raise ValueError(f"Pipeline with id {pipeline_id} not found") + pipeline_config = pipeline.config + logger.info(f"Executing predefined pipeline: {pipeline_id}") + else: + if not config: + raise ValueError("Either pipeline_id or config must be provided") + pipeline_config = config + logger.info("Executing pipeline with provided config") + + # 生成执行ID + execution_id = str(uuid.uuid4()) + + # 返回初始结果 + initial_result = PipelineExecutionResult( + execution_id=execution_id, + status="queued", + output={}, + logs=[f"[{self.get_current_time()}] Pipeline execution queued"] + ) + + return execution_id, pipeline_config, initial_result + + def get_execution_result(self, execution_id: str) -> Optional[PipelineExecutionResult]: + """获取Pipeline执行结果""" + return self._execution_results.get(execution_id) + + def list_executions(self) -> List[PipelineExecutionResult]: + """列出所有Pipeline执行记录""" + return list(self._execution_results.values()) + + +# 创建全局服务实例 +_PIPELINE_REGISTRY = PipelineRegistry() From 42a6db88344a8f46102b07d1b12190c5e76ad462 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Fri, 14 Nov 2025 14:06:14 +0800 Subject: [PATCH 02/10] fix bug in logger --- backend/app/api/v1/endpoints/pipelines.py | 16 ++++++++-------- backend/app/api/v1/router.py | 2 ++ backend/app/core/logger_setup.py | 16 +++++++++++++++- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/backend/app/api/v1/endpoints/pipelines.py b/backend/app/api/v1/endpoints/pipelines.py index 31fdd1e..350434b 100644 --- a/backend/app/api/v1/endpoints/pipelines.py +++ b/backend/app/api/v1/endpoints/pipelines.py @@ -33,7 +33,7 @@ def list_pipelines(request: Request): def create_pipeline(request: Request, payload: PipelineIn): try: logger.info(f"Request: {request.method} {request.url.path}, Pipeline name: {payload.name}") - pipeline = _PIPELINE_SERVICE.create_pipeline(payload) + pipeline = _PIPELINE_REGISTRY.create_pipeline(payload) return created(pipeline) except ValueError as e: logger.error(f"Invalid pipeline configuration: {str(e)}", exc_info=True) @@ -44,7 +44,7 @@ def create_pipeline(request: Request, payload: PipelineIn): @router.get("/{pipeline_id}", response_model=ApiResponse[PipelineOut], operation_id="get_pipeline", summary="根据ID获取Pipeline详情") def get_pipeline(pipeline_id: str): - pipeline = _PIPELINE_SERVICE.get_pipeline(pipeline_id) + pipeline = _PIPELINE_REGISTRY.get_pipeline(pipeline_id) if not pipeline: raise HTTPException(404, f"Pipeline with id {pipeline_id} not found") return ok(pipeline) @@ -52,7 +52,7 @@ def get_pipeline(pipeline_id: str): @router.put("/{pipeline_id}", response_model=ApiResponse[PipelineOut], operation_id="update_pipeline", summary="更新指定的Pipeline") def update_pipeline(pipeline_id: str, payload: PipelineIn): try: - updated_pipeline = _PIPELINE_SERVICE.update_pipeline(pipeline_id, payload) + updated_pipeline = _PIPELINE_REGISTRY.update_pipeline(pipeline_id, payload) return ok(updated_pipeline) except ValueError as e: logger.error(f"Failed to update pipeline: {str(e)}") @@ -64,7 +64,7 @@ def update_pipeline(pipeline_id: str, payload: PipelineIn): @router.delete("/{pipeline_id}", operation_id="delete_pipeline", summary="删除指定的Pipeline") def delete_pipeline(pipeline_id: str): try: - success = _PIPELINE_SERVICE.delete_pipeline(pipeline_id) + success = _PIPELINE_REGISTRY.delete_pipeline(pipeline_id) if not success: raise HTTPException(404, f"Pipeline with id {pipeline_id} not found") return ok(message=f"Pipeline {pipeline_id} deleted successfully") @@ -81,14 +81,14 @@ async def execute_pipeline(request: Request, payload: PipelineExecutionRequest, logger.info(f"Request: {request.method} {request.url.path}") # 调用服务层开始执行 - execution_id, pipeline_config, initial_result = _PIPELINE_SERVICE.start_execution( + execution_id, pipeline_config, initial_result = _PIPELINE_REGISTRY.start_execution( pipeline_id=payload.pipeline_id, config=payload.config ) # 在后台异步执行Pipeline background_tasks.add_task( - _PIPELINE_SERVICE.execute_pipeline_task, + _PIPELINE_REGISTRY.execute_pipeline_task, execution_id, pipeline_config ) @@ -105,7 +105,7 @@ async def execute_pipeline(request: Request, payload: PipelineExecutionRequest, @router.get("/execution/{execution_id}", response_model=ApiResponse[PipelineExecutionResult], operation_id="get_execution_result", summary="获取Pipeline执行结果") def get_execution_result(execution_id: str): - result = _PIPELINE_SERVICE.get_execution_result(execution_id) + result = _PIPELINE_REGISTRY.get_execution_result(execution_id) if not result: raise HTTPException(404, f"Execution with id {execution_id} not found") return ok(result) @@ -113,7 +113,7 @@ def get_execution_result(execution_id: str): @router.get("/executions", response_model=ApiResponse[List[PipelineExecutionResult]], operation_id="list_executions", summary="列出所有Pipeline执行记录") def list_executions(): try: - executions = _PIPELINE_SERVICE.list_executions() + executions = _PIPELINE_REGISTRY.list_executions() return ok(executions) except Exception as e: logger.error(f"Failed to list executions: {e}") diff --git a/backend/app/api/v1/router.py b/backend/app/api/v1/router.py index 09f5ebf..1da62ec 100644 --- a/backend/app/api/v1/router.py +++ b/backend/app/api/v1/router.py @@ -3,10 +3,12 @@ from .endpoints import datasets from .endpoints import operators from .endpoints import tasks +from .endpoints import pipelines api_router = APIRouter() # api_router.include_router(health.router, prefix="/health") api_router.include_router(datasets.router, prefix="/datasets") api_router.include_router(operators.router, prefix="/operators") api_router.include_router(tasks.router, prefix="/tasks") +api_router.include_router(pipelines.router, prefix="/pipelines") # api_router.include_router(models.router, prefix="/models") # api_router.include_router(inference.router, prefix="/inference") diff --git a/backend/app/core/logger_setup.py b/backend/app/core/logger_setup.py index 0dc7a4f..e43873d 100644 --- a/backend/app/core/logger_setup.py +++ b/backend/app/core/logger_setup.py @@ -2,11 +2,25 @@ import sys def setup_logging(): - pass + """设置日志配置""" + # 移除默认的处理器 + logger.remove() + # 添加一个新的处理器,输出到控制台 + logger.add( + sys.stdout, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + level="INFO" + ) + +def get_logger(name=None): + """获取logger实例,支持指定模块名""" + # 对于loguru来说,name参数不是必需的,但为了兼容调用方,我们保持接口一致 + return logger # if __name__ == "__main__": # setup_logging() +# logger = get_logger() # logger.info("Logging is set up and ready to go!") # logger.warning("This is a warning with colors!") # logger.error("This is an error message.") From c736ce835dcb8c612d4afa4e998700fb113500cb Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Fri, 14 Nov 2025 14:48:17 +0800 Subject: [PATCH 03/10] test the pipeline api and fix bug --- backend/app/api/v1/endpoints/pipelines.py | 4 +- backend/app/services/pipeline_registry.py | 14 +- backend/tests/test_pipeline_registry.py | 352 ++++++++++++++++++++++ 3 files changed, 364 insertions(+), 6 deletions(-) create mode 100644 backend/tests/test_pipeline_registry.py diff --git a/backend/app/api/v1/endpoints/pipelines.py b/backend/app/api/v1/endpoints/pipelines.py index 350434b..601bfcb 100644 --- a/backend/app/api/v1/endpoints/pipelines.py +++ b/backend/app/api/v1/endpoints/pipelines.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Dict from fastapi import APIRouter, HTTPException, BackgroundTasks, Request from app.schemas.pipelines import ( PipelineIn, @@ -61,7 +61,7 @@ def update_pipeline(pipeline_id: str, payload: PipelineIn): logger.error(f"Failed to update pipeline {pipeline_id}: {e}") raise HTTPException(400, f"Failed to update pipeline: {e}") -@router.delete("/{pipeline_id}", operation_id="delete_pipeline", summary="删除指定的Pipeline") +@router.delete("/{pipeline_id}", response_model=ApiResponse[Dict], operation_id="delete_pipeline", summary="删除指定的Pipeline") def delete_pipeline(pipeline_id: str): try: success = _PIPELINE_REGISTRY.delete_pipeline(pipeline_id) diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index cc24a27..6e27075 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -7,7 +7,8 @@ PipelineIn, PipelineOut, PipelineConfig, - PipelineExecutionResult + PipelineExecutionResult, + ExecutionStatus ) logger = get_logger(__name__) @@ -39,7 +40,8 @@ def create_pipeline(self, pipeline_data: PipelineIn) -> PipelineOut: config=pipeline_data.config, tags=pipeline_data.tags, created_at=current_time, - updated_at=current_time + updated_at=current_time, + status=ExecutionStatus.queued ) # 保存到注册表 @@ -57,14 +59,15 @@ def update_pipeline(self, pipeline_id: str, pipeline_data: PipelineIn) -> Pipeli raise ValueError(f"Pipeline with id {pipeline_id} not found") current_pipeline = self._pipeline_registry[pipeline_id] - # 更新字段,保留创建时间 + # 更新字段,保留创建时间和状态 updated_pipeline = PipelineOut( id=pipeline_id, name=pipeline_data.name, config=pipeline_data.config, tags=pipeline_data.tags, created_at=current_pipeline.created_at, - updated_at=self.get_current_time() + updated_at=self.get_current_time(), + status=current_pipeline.status ) self._pipeline_registry[pipeline_id] = updated_pipeline @@ -171,6 +174,9 @@ def start_execution(self, pipeline_id: Optional[str] = None, config: Optional[Pi logs=[f"[{self.get_current_time()}] Pipeline execution queued"] ) + # 保存执行结果到_execution_results字典 + self._execution_results[execution_id] = initial_result + return execution_id, pipeline_config, initial_result def get_execution_result(self, execution_id: str) -> Optional[PipelineExecutionResult]: diff --git a/backend/tests/test_pipeline_registry.py b/backend/tests/test_pipeline_registry.py new file mode 100644 index 0000000..9643835 --- /dev/null +++ b/backend/tests/test_pipeline_registry.py @@ -0,0 +1,352 @@ +""" +Pipeline注册表功能测试 + +使用 pytest 运行: + pytest tests/test_pipeline_registry.py -v + 或 + pytest tests/test_pipeline_registry.py::test_create_pipeline -v # 运行单个测试 +""" +import pytest +from typing import Dict, Any +from app.services.pipeline_registry import PipelineRegistry +from app.schemas.pipelines import ( + PipelineIn, + PipelineConfig, + PipelineOperator, + PipelineExecutionResult, + ExecutionStatus +) + + +# 测试数据 fixtures +@pytest.fixture +def pipeline_registry(): + """创建一个新的PipelineRegistry实例""" + return PipelineRegistry() + + +@pytest.fixture +def sample_pipeline_operator(): + """创建一个示例PipelineOperator""" + return PipelineOperator( + name="data_processor", + params={"batch_size": 32, "shuffle": True} + ) + + +@pytest.fixture +def sample_pipeline_config(sample_pipeline_operator): + """创建一个示例PipelineConfig""" + return PipelineConfig( + input_dataset="test_dataset_123", + operators=[sample_pipeline_operator], + run_config={"max_workers": 4, "timeout": 3600} + ) + + +@pytest.fixture +def sample_pipeline_data(sample_pipeline_config): + """创建示例Pipeline输入数据""" + return PipelineIn( + name="测试Pipeline", + config=sample_pipeline_config, + tags=["test", "nlp"] + ) + + +@pytest.fixture +def created_pipeline(pipeline_registry, sample_pipeline_data): + """创建并返回一个已创建的Pipeline""" + return pipeline_registry.create_pipeline(sample_pipeline_data) + + +class TestPipelineRegistry: + """Pipeline注册表测试类""" + + def test_create_registry(self): + """测试创建Pipeline注册表实例""" + registry = PipelineRegistry() + assert registry is not None + assert hasattr(registry, '_pipeline_registry') + assert hasattr(registry, '_execution_results') + + def test_create_pipeline(self, pipeline_registry, sample_pipeline_data): + """测试创建Pipeline""" + pipeline = pipeline_registry.create_pipeline(sample_pipeline_data) + + assert pipeline is not None + # 简单验证返回对象的类型,确保它有预期的属性 + assert hasattr(pipeline, 'id') + assert hasattr(pipeline, 'name') + assert hasattr(pipeline, 'config') + assert hasattr(pipeline, 'tags') + assert hasattr(pipeline, 'created_at') + assert hasattr(pipeline, 'updated_at') + assert hasattr(pipeline, 'status') + + assert pipeline.id is not None + assert pipeline.name == sample_pipeline_data.name + assert pipeline.config.input_dataset == sample_pipeline_data.config.input_dataset + assert pipeline.tags == sample_pipeline_data.tags + assert pipeline.created_at is not None + assert pipeline.updated_at is not None + assert pipeline.status == ExecutionStatus.queued + + def test_get_pipeline(self, pipeline_registry, created_pipeline): + """测试获取Pipeline详情""" + pipeline_id = created_pipeline.id + retrieved_pipeline = pipeline_registry.get_pipeline(pipeline_id) + + assert retrieved_pipeline is not None + assert retrieved_pipeline.id == pipeline_id + assert retrieved_pipeline.name == created_pipeline.name + assert retrieved_pipeline.config.input_dataset == created_pipeline.config.input_dataset + + def test_get_nonexistent_pipeline(self, pipeline_registry): + """测试获取不存在的Pipeline""" + pipeline = pipeline_registry.get_pipeline("nonexistent_id") + assert pipeline is None + + def test_update_pipeline(self, pipeline_registry, created_pipeline, sample_pipeline_data): + """测试更新Pipeline""" + pipeline_id = created_pipeline.id + + # 修改数据 + updated_data = PipelineIn( + name="更新后的Pipeline", + config=sample_pipeline_data.config, + tags=["updated", "test"] + ) + + updated_pipeline = pipeline_registry.update_pipeline(pipeline_id, updated_data) + + assert updated_pipeline.id == pipeline_id + assert updated_pipeline.name == "更新后的Pipeline" + assert updated_pipeline.tags == ["updated", "test"] + assert updated_pipeline.created_at == created_pipeline.created_at # 创建时间不变 + assert updated_pipeline.updated_at != created_pipeline.updated_at # 更新时间改变 + + def test_update_nonexistent_pipeline(self, pipeline_registry, sample_pipeline_data): + """测试更新不存在的Pipeline""" + with pytest.raises(ValueError): + pipeline_registry.update_pipeline("nonexistent_id", sample_pipeline_data) + + def test_delete_pipeline(self, pipeline_registry, created_pipeline): + """测试删除Pipeline""" + pipeline_id = created_pipeline.id + + # 确认Pipeline存在 + assert pipeline_registry.get_pipeline(pipeline_id) is not None + + # 删除Pipeline + result = pipeline_registry.delete_pipeline(pipeline_id) + assert result is True + + # 确认Pipeline已删除 + assert pipeline_registry.get_pipeline(pipeline_id) is None + + def test_delete_nonexistent_pipeline(self, pipeline_registry): + """测试删除不存在的Pipeline""" + result = pipeline_registry.delete_pipeline("nonexistent_id") + assert result is False + + def test_list_pipelines(self, pipeline_registry, sample_pipeline_data): + """测试列出所有Pipelines""" + # 创建多个Pipelines + pipeline1 = pipeline_registry.create_pipeline(sample_pipeline_data) + pipeline2 = pipeline_registry.create_pipeline( + PipelineIn( + name="Pipeline 2", + config=sample_pipeline_data.config, + tags=["tag2"] + ) + ) + + all_pipelines = pipeline_registry.list_pipelines() + assert len(all_pipelines) >= 2 + + # 验证创建的Pipelines都在列表中 + pipeline_ids = [p.id for p in all_pipelines] + assert pipeline1.id in pipeline_ids + assert pipeline2.id in pipeline_ids + + def test_start_execution_with_pipeline_id(self, pipeline_registry, created_pipeline): + """测试使用pipeline_id开始执行""" + execution_id, pipeline_config, initial_result = pipeline_registry.start_execution( + pipeline_id=created_pipeline.id + ) + + assert execution_id is not None + assert pipeline_config.input_dataset == created_pipeline.config.input_dataset + assert initial_result.execution_id == execution_id + assert initial_result.status == ExecutionStatus.queued + + def test_start_execution_with_config(self, pipeline_registry, sample_pipeline_config): + """测试使用config开始执行""" + execution_id, pipeline_config, initial_result = pipeline_registry.start_execution( + config=sample_pipeline_config + ) + + assert execution_id is not None + assert pipeline_config.input_dataset == sample_pipeline_config.input_dataset + assert initial_result.execution_id == execution_id + assert initial_result.status == ExecutionStatus.queued + + def test_start_execution_without_required_params(self, pipeline_registry): + """测试不提供必要参数的情况""" + with pytest.raises(ValueError): + pipeline_registry.start_execution() + + def test_start_execution_with_nonexistent_pipeline_id(self, pipeline_registry): + """测试使用不存在的pipeline_id""" + with pytest.raises(ValueError): + pipeline_registry.start_execution(pipeline_id="nonexistent_id") + + def test_get_execution_result(self, pipeline_registry, created_pipeline): + """测试获取执行结果""" + execution_id, _, _ = pipeline_registry.start_execution( + pipeline_id=created_pipeline.id + ) + + # 初始时应该能获取到状态为queued的结果 + result = pipeline_registry.get_execution_result(execution_id) + assert result is not None + assert result.execution_id == execution_id + assert result.status == ExecutionStatus.queued + + def test_get_nonexistent_execution_result(self, pipeline_registry): + """测试获取不存在的执行结果""" + result = pipeline_registry.get_execution_result("nonexistent_execution_id") + assert result is None + + def test_list_executions(self, pipeline_registry, created_pipeline, sample_pipeline_config): + """测试列出所有执行记录""" + # 创建多个执行记录 + execution_id1, _, _ = pipeline_registry.start_execution( + pipeline_id=created_pipeline.id + ) + execution_id2, _, _ = pipeline_registry.start_execution( + config=sample_pipeline_config + ) + + all_executions = pipeline_registry.list_executions() + assert len(all_executions) >= 2 + + # 验证创建的执行记录都在列表中 + execution_ids = [e.execution_id for e in all_executions] + assert execution_id1 in execution_ids + assert execution_id2 in execution_ids + + +class TestPipelineRegistryEdgeCases: + """Pipeline注册表边界情况测试""" + + def test_create_pipeline_with_complex_config(self, pipeline_registry): + """测试使用复杂配置创建Pipeline""" + # 创建多个算子的配置 + complex_config = PipelineConfig( + input_dataset="complex_dataset", + operators=[ + PipelineOperator(name="operator1", params={"param1": "value1"}), + PipelineOperator(name="operator2", params={"param2": "value2"}), + PipelineOperator(name="operator3", params={"param3": "value3"}) + ], + run_config={"complex_param": {"nested": "value"}} + ) + + pipeline_data = PipelineIn( + name="复杂Pipeline", + config=complex_config, + tags=["complex", "test"] + ) + + pipeline = pipeline_registry.create_pipeline(pipeline_data) + assert pipeline is not None + assert len(pipeline.config.operators) == 3 + assert pipeline.config.run_config == {"complex_param": {"nested": "value"}} + + def test_pipeline_lifecycle(self, pipeline_registry, sample_pipeline_data, sample_pipeline_config): + """测试完整的Pipeline生命周期""" + # 1. 创建Pipeline + pipeline = pipeline_registry.create_pipeline(sample_pipeline_data) + pipeline_id = pipeline.id + + # 2. 获取Pipeline + retrieved_pipeline = pipeline_registry.get_pipeline(pipeline_id) + assert retrieved_pipeline is not None + + # 3. 更新Pipeline + updated_data = PipelineIn( + name="更新后的Pipeline", + config=sample_pipeline_config, + tags=["updated"] + ) + updated_pipeline = pipeline_registry.update_pipeline(pipeline_id, updated_data) + assert updated_pipeline.name == "更新后的Pipeline" + + # 4. 执行Pipeline + execution_id, _, _ = pipeline_registry.start_execution(pipeline_id=pipeline_id) + + # 5. 删除Pipeline + delete_result = pipeline_registry.delete_pipeline(pipeline_id) + assert delete_result is True + + def test_concurrent_pipeline_creation(self, pipeline_registry, sample_pipeline_data): + """测试并发创建多个Pipelines""" + pipelines = [] + for i in range(5): + pipeline = pipeline_registry.create_pipeline( + PipelineIn( + name=f"Pipeline_{i}", + config=sample_pipeline_data.config, + tags=[f"tag_{i}"] + ) + ) + pipelines.append(pipeline) + + # 验证所有Pipelines都有唯一ID + pipeline_ids = [p.id for p in pipelines] + assert len(pipeline_ids) == len(set(pipeline_ids)) # 确保ID唯一 + + # 验证所有Pipelines都能被获取 + for pipeline in pipelines: + retrieved = pipeline_registry.get_pipeline(pipeline.id) + assert retrieved is not None + assert retrieved.id == pipeline.id + + +@pytest.mark.asyncio +async def test_execute_pipeline_task(pipeline_registry, sample_pipeline_config): + """测试异步执行Pipeline任务""" + execution_id = "test_execution_async" + + # 执行任务 + await pipeline_registry.execute_pipeline_task(execution_id, sample_pipeline_config) + + # 获取执行结果 + result = pipeline_registry.get_execution_result(execution_id) + + assert result is not None + assert result.execution_id == execution_id + assert result.status in [ExecutionStatus.completed, ExecutionStatus.failed] + assert len(result.logs) > 0 + + +@pytest.mark.parametrize("status_value", ["queued", "running", "completed", "failed"]) +@pytest.mark.asyncio +async def test_different_execution_statuses(pipeline_registry, sample_pipeline_config, status_value): + """参数化测试:测试不同的执行状态""" + execution_id = f"test_status_{status_value}" + + # 先创建一个初始状态的执行结果 + initial_result = PipelineExecutionResult( + execution_id=execution_id, + status=status_value, + output={}, + logs=[f"测试状态: {status_value}"] + ) + pipeline_registry._execution_results[execution_id] = initial_result + + # 获取并验证状态 + result = pipeline_registry.get_execution_result(execution_id) + assert result.status == status_value From c93c785d9a0053720b8ef056d33ce2905a0a1a0b Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 24 Nov 2025 11:33:23 +0800 Subject: [PATCH 04/10] fix bug --- backend/app/core/config.py | 2 +- backend/app/services/pipeline_registry.py | 272 +++++++++++++++------- 2 files changed, 183 insertions(+), 91 deletions(-) diff --git a/backend/app/core/config.py b/backend/app/core/config.py index ca57364..3a7b3b9 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -5,7 +5,7 @@ class Settings(BaseSettings): ENV: str = "dev" DATA_REGISTRY: str = "data/registry.yaml" TASK_REGISTRY: str = "data/task_registry.yaml" - PIPELINE_REGISTRY: str = "data/pipeline_registry.json" + PIPELINE_REGISTRY: str = "data/pipeline_registry.yaml" DataFlow_CORE_DIR: str = "data/dataflow_core" settings = Settings() diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index 6e27075..7dcf38a 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -1,89 +1,174 @@ import json import uuid import datetime -from typing import List, Optional +import os +import yaml +from typing import List, Optional, Dict, Any, Tuple from app.core.logger_setup import get_logger -from app.schemas.pipelines import ( - PipelineIn, - PipelineOut, - PipelineConfig, - PipelineExecutionResult, - ExecutionStatus -) +from app.core.config import settings logger = get_logger(__name__) - class PipelineRegistry: - def __init__(self): - self._pipeline_registry = {} - self._execution_results = {} + def __init__(self, path: str | None = None): + """初始化Pipeline注册表""" + self.path = path or settings.PIPELINE_REGISTRY + self._ensure() + def _ensure(self): + """确保注册表文件存在,并加载api_pipelines目录中的所有py文件""" + if not os.path.exists(self.path): + os.makedirs(os.path.dirname(self.path), exist_ok=True) + + # 创建初始数据结构 + initial_data = {"pipelines": {}, "executions": {}} + + # 尝试加载api_pipelines目录中的py文件 + try: + # 直接使用settings中的路径作为相对路径的开头 + api_pipelines_dir = os.path.join(settings.DataFlow_CORE_DIR, "api_pipelines") + + logger.info(f"Checking for API pipelines in: {api_pipelines_dir}") + + # 如果目录存在,扫描所有py文件 + if os.path.exists(api_pipelines_dir): + logger.info(f"API pipelines directory found, scanning for Python files") + + # 获取当前时间 + current_time = self.get_current_time() + + # 遍历目录中的所有py文件 + for filename in os.listdir(api_pipelines_dir): + if filename.endswith(".py") and not filename.startswith("__"): + # 生成pipeline_id + pipeline_id = f"api_pipeline_{filename[:-3]}" + + # 创建pipeline配置 + pipeline_data = { + "id": pipeline_id, + "name": filename[:-3].replace("_", " ").title(), + "config": { + "file_path": os.path.join(api_pipelines_dir, filename), + "module_name": f"{settings.DataFlow_CORE_DIR.replace('/', '.')}.api_pipelines.{filename[:-3]}", + "type": "api_pipeline" + }, + "tags": ["api"], + "created_at": current_time, + "updated_at": current_time, + "status": "active" + } + + # 添加到初始数据中 + initial_data["pipelines"][pipeline_id] = pipeline_data + logger.info(f"Added API pipeline: {pipeline_data['name']} ({pipeline_id})") + + logger.info(f"Successfully loaded {len(initial_data['pipelines'])} API pipelines") + else: + logger.warning(f"API pipelines directory not found: {api_pipelines_dir}") + except Exception as e: + logger.error(f"Error loading API pipelines: {e}", exc_info=True) + # 即使出错,仍然创建基本的注册表文件 + + # 写入初始数据到文件 + with open(self.path, "w", encoding="utf-8") as f: + yaml.safe_dump(initial_data, f, allow_unicode=True) + + def _read(self) -> Dict: + """读取注册表文件""" + with open(self.path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) or {"pipelines": {}, "executions": {}} + + def _write(self, data: Dict): + """写入注册表文件""" + with open(self.path, "w", encoding="utf-8") as f: + yaml.safe_dump(data, f, allow_unicode=True, sort_keys=False) + + def get_current_time(self): """获取当前时间的ISO格式字符串""" return datetime.datetime.now().isoformat() - def list_pipelines(self) -> List[PipelineOut]: + def list_pipelines(self) -> List[Dict[str, Any]]: """列出所有注册的Pipeline""" - return list(self._pipeline_registry.values()) + data = self._read() + # 直接返回字典列表,不需要转换为对象 + return list(data.get("pipelines", {}).values()) - def create_pipeline(self, pipeline_data: PipelineIn) -> PipelineOut: + def create_pipeline(self, pipeline_data: Dict[str, Any]) -> Dict[str, Any]: """创建一个新的Pipeline""" + data = self._read() + # 生成唯一ID pipeline_id = str(uuid.uuid4()) current_time = self.get_current_time() - # 创建PipelineOut对象 - pipeline = PipelineOut( - id=pipeline_id, - name=pipeline_data.name, - config=pipeline_data.config, - tags=pipeline_data.tags, - created_at=current_time, - updated_at=current_time, - status=ExecutionStatus.queued - ) - - # 保存到注册表 - self._pipeline_registry[pipeline_id] = pipeline - logger.info(f"Successfully created pipeline: {pipeline_id} with name: {pipeline_data.name}") + # 直接创建字典表示的pipeline + pipeline = { + "id": pipeline_id, + "name": pipeline_data.get("name", ""), + "config": pipeline_data.get("config", {}), + "tags": pipeline_data.get("tags", []), + "created_at": current_time, + "updated_at": current_time, + "status": "queued" + } + + # 直接保存到文件 + data["pipelines"][pipeline_id] = pipeline + self._write(data) + + logger.info(f"Successfully created pipeline: {pipeline_id} with name: {pipeline_data.get('name', '')}") return pipeline - def get_pipeline(self, pipeline_id: str) -> Optional[PipelineOut]: + def get_pipeline(self, pipeline_id: str) -> Optional[Dict[str, Any]]: """根据ID获取Pipeline""" - return self._pipeline_registry.get(pipeline_id) + data = self._read() + pipeline_data = data.get("pipelines", {}).get(pipeline_id) + if pipeline_data: + return pipeline_data.copy() # 返回副本避免修改原数据 + return None - def update_pipeline(self, pipeline_id: str, pipeline_data: PipelineIn) -> PipelineOut: + def update_pipeline(self, pipeline_id: str, pipeline_data: Dict[str, Any]) -> Dict[str, Any]: """更新指定的Pipeline""" - if pipeline_id not in self._pipeline_registry: + data = self._read() + + if pipeline_id not in data.get("pipelines", {}): raise ValueError(f"Pipeline with id {pipeline_id} not found") - current_pipeline = self._pipeline_registry[pipeline_id] + # 获取当前Pipeline数据并直接更新 + updated_pipeline = data["pipelines"][pipeline_id].copy() + # 更新字段,保留创建时间和状态 - updated_pipeline = PipelineOut( - id=pipeline_id, - name=pipeline_data.name, - config=pipeline_data.config, - tags=pipeline_data.tags, - created_at=current_pipeline.created_at, - updated_at=self.get_current_time(), - status=current_pipeline.status - ) - - self._pipeline_registry[pipeline_id] = updated_pipeline + updated_pipeline.update({ + "name": pipeline_data.get("name", updated_pipeline.get("name", "")), + "config": pipeline_data.get("config", updated_pipeline.get("config", {})), + "tags": pipeline_data.get("tags", updated_pipeline.get("tags", [])), + "updated_at": self.get_current_time() + # 保持created_at和status不变 + }) + + # 直接保存到文件 + data["pipelines"][pipeline_id] = updated_pipeline + self._write(data) + logger.info(f"Updated pipeline: {pipeline_id}") return updated_pipeline def delete_pipeline(self, pipeline_id: str) -> bool: """删除指定的Pipeline""" - if pipeline_id not in self._pipeline_registry: + data = self._read() + + if pipeline_id not in data.get("pipelines", {}): return False - del self._pipeline_registry[pipeline_id] + # 直接从文件删除 + del data["pipelines"][pipeline_id] + self._write(data) + logger.info(f"Deleted pipeline: {pipeline_id}") return True - async def execute_pipeline_task(self, execution_id: str, pipeline_config: PipelineConfig): + async def execute_pipeline_task(self, execution_id: str, pipeline_config: Dict[str, Any]): """异步执行Pipeline的任务""" logs = [] output = {} @@ -91,38 +176,33 @@ async def execute_pipeline_task(self, execution_id: str, pipeline_config: Pipeli logger.info(f"Starting background execution task for execution_id: {execution_id}") try: - # 初始化执行结果 - self._execution_results[execution_id] = PipelineExecutionResult( - execution_id=execution_id, - status=status, - output=output, - logs=logs - ) - # 记录开始执行 logs.append(f"[{self.get_current_time()}] Starting pipeline execution") - logs.append(f"[{self.get_current_time()}] Input dataset: {pipeline_config.input_dataset}") - logs.append(f"[{self.get_current_time()}] Run config: {json.dumps(pipeline_config.run_config)}") + logs.append(f"[{self.get_current_time()}] Input dataset: {pipeline_config.get('input_dataset', '')}") + logs.append(f"[{self.get_current_time()}] Run config: {json.dumps(pipeline_config.get('run_config', {}))}") # 模拟数据加载 - logs.append(f"[{self.get_current_time()}] Loading dataset: {pipeline_config.input_dataset}") + logs.append(f"[{self.get_current_time()}] Loading dataset: {pipeline_config.get('input_dataset', '')}") # 按顺序执行算子 - current_data = {"dataset_id": pipeline_config.input_dataset, "data": {}} - for i, operator in enumerate(pipeline_config.operators): - logs.append(f"[{self.get_current_time()}] Executing operator {i+1}/{len(pipeline_config.operators)}: {operator.name}") - logs.append(f"[{self.get_current_time()}] Operator params: {json.dumps(operator.params)}") + current_data = {"dataset_id": pipeline_config.get('input_dataset', ''), "data": {}} + operators = pipeline_config.get('operators', []) + for i, operator in enumerate(operators): + op_name = operator.get('name', 'Unknown') + op_params = operator.get('params', {}) + logs.append(f"[{self.get_current_time()}] Executing operator {i+1}/{len(operators)}: {op_name}") + logs.append(f"[{self.get_current_time()}] Operator params: {json.dumps(op_params)}") try: # 模拟算子执行 current_data["data"] = { - "operator": operator.name, - "params": operator.params, - "output": f"Processed by {operator.name}" + "operator": op_name, + "params": op_params, + "output": f"Processed by {op_name}" } - logs.append(f"[{self.get_current_time()}] Operator {operator.name} executed successfully") + logs.append(f"[{self.get_current_time()}] Operator {op_name} executed successfully") except Exception as op_error: - error_msg = f"Operator {operator.name} failed: {op_error}" + error_msg = f"Operator {op_name} failed: {op_error}" logs.append(f"[{self.get_current_time()}] ERROR: {error_msg}") status = "failed" output["error"] = error_msg @@ -140,22 +220,26 @@ async def execute_pipeline_task(self, execution_id: str, pipeline_config: Pipeli logs.append(f"[{self.get_current_time()}] ERROR: {error_msg}") output["error"] = error_msg - # 保存最终结果 - self._execution_results[execution_id] = PipelineExecutionResult( - execution_id=execution_id, - status=status, - output=output, - logs=logs - ) + # 直接保存执行结果到文件 + execution_result = { + "execution_id": execution_id, + "status": status, + "output": output, + "logs": logs + } + + data = self._read() + data["executions"][execution_id] = execution_result + self._write(data) - def start_execution(self, pipeline_id: Optional[str] = None, config: Optional[PipelineConfig] = None) -> tuple[str, PipelineExecutionResult]: + def start_execution(self, pipeline_id: Optional[str] = None, config: Optional[Dict[str, Any]] = None) -> Tuple[str, Dict[str, Any], Dict[str, Any]]: """开始执行Pipeline""" # 获取Pipeline配置 if pipeline_id: pipeline = self.get_pipeline(pipeline_id) if not pipeline: raise ValueError(f"Pipeline with id {pipeline_id} not found") - pipeline_config = pipeline.config + pipeline_config = pipeline.get("config", {}) logger.info(f"Executing predefined pipeline: {pipeline_id}") else: if not config: @@ -166,26 +250,34 @@ def start_execution(self, pipeline_id: Optional[str] = None, config: Optional[Pi # 生成执行ID execution_id = str(uuid.uuid4()) - # 返回初始结果 - initial_result = PipelineExecutionResult( - execution_id=execution_id, - status="queued", - output={}, - logs=[f"[{self.get_current_time()}] Pipeline execution queued"] - ) + # 创建初始结果 + initial_result = { + "execution_id": execution_id, + "status": "queued", + "output": {}, + "logs": [f"[{self.get_current_time()}] Pipeline execution queued"] + } - # 保存执行结果到_execution_results字典 - self._execution_results[execution_id] = initial_result + # 直接保存到文件 + data = self._read() + data["executions"][execution_id] = initial_result + self._write(data) return execution_id, pipeline_config, initial_result - def get_execution_result(self, execution_id: str) -> Optional[PipelineExecutionResult]: + def get_execution_result(self, execution_id: str) -> Optional[Dict[str, Any]]: """获取Pipeline执行结果""" - return self._execution_results.get(execution_id) + data = self._read() + execution_data = data.get("executions", {}).get(execution_id) + if execution_data: + return execution_data.copy() # 返回副本避免修改原数据 + return None - def list_executions(self) -> List[PipelineExecutionResult]: + def list_executions(self) -> List[Dict[str, Any]]: """列出所有Pipeline执行记录""" - return list(self._execution_results.values()) + data = self._read() + # 直接返回字典列表,不需要转换为对象 + return list(data.get("executions", {}).values()) # 创建全局服务实例 From c1c7ed285b329593cda4d5511a17df4143740e9d Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 24 Nov 2025 12:21:04 +0800 Subject: [PATCH 05/10] fix pipelines --- backend/app/api/v1/endpoints/pipelines.py | 6 ++++-- backend/app/services/pipeline_registry.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/app/api/v1/endpoints/pipelines.py b/backend/app/api/v1/endpoints/pipelines.py index 601bfcb..633cc80 100644 --- a/backend/app/api/v1/endpoints/pipelines.py +++ b/backend/app/api/v1/endpoints/pipelines.py @@ -33,7 +33,8 @@ def list_pipelines(request: Request): def create_pipeline(request: Request, payload: PipelineIn): try: logger.info(f"Request: {request.method} {request.url.path}, Pipeline name: {payload.name}") - pipeline = _PIPELINE_REGISTRY.create_pipeline(payload) + pipeline_in_data = payload.model_dump() + pipeline = _PIPELINE_REGISTRY.create_pipeline(pipeline_in_data) return created(pipeline) except ValueError as e: logger.error(f"Invalid pipeline configuration: {str(e)}", exc_info=True) @@ -52,7 +53,8 @@ def get_pipeline(pipeline_id: str): @router.put("/{pipeline_id}", response_model=ApiResponse[PipelineOut], operation_id="update_pipeline", summary="更新指定的Pipeline") def update_pipeline(pipeline_id: str, payload: PipelineIn): try: - updated_pipeline = _PIPELINE_REGISTRY.update_pipeline(pipeline_id, payload) + pipeline_in_data = payload.model_dump() + updated_pipeline = _PIPELINE_REGISTRY.update_pipeline(pipeline_id, pipeline_in_data) return ok(updated_pipeline) except ValueError as e: logger.error(f"Failed to update pipeline: {str(e)}") diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index 7dcf38a..9a0bef2 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -50,12 +50,13 @@ def _ensure(self): "config": { "file_path": os.path.join(api_pipelines_dir, filename), "module_name": f"{settings.DataFlow_CORE_DIR.replace('/', '.')}.api_pipelines.{filename[:-3]}", - "type": "api_pipeline" + "type": "api_pipeline", + "input_dataset": "" }, "tags": ["api"], "created_at": current_time, "updated_at": current_time, - "status": "active" + "status": "queued" } # 添加到初始数据中 From dbbeccdea58bba3babcd7892ce79ad0e2579bd0e Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 1 Dec 2025 13:23:02 +0800 Subject: [PATCH 06/10] 1117 --- backend/app/schemas/pipelines.py | 18 +- backend/app/services/pipeline_registry.py | 1 - backend/resources/ops.json | 18050 ++++++++++++++++++++ 3 files changed, 18059 insertions(+), 10 deletions(-) create mode 100644 backend/resources/ops.json diff --git a/backend/app/schemas/pipelines.py b/backend/app/schemas/pipelines.py index d0bce52..101d344 100644 --- a/backend/app/schemas/pipelines.py +++ b/backend/app/schemas/pipelines.py @@ -1,7 +1,7 @@ from enum import Enum from typing import List, Dict, Any, Optional, Union from pydantic import BaseModel, Field, field_validator - +from app.schemas.operator import OperatorDetailSchema class Pipeline(str, Enum): """Pipeline类型枚举""" @@ -32,18 +32,18 @@ class ExecutionStatus(str, Enum): failed = "failed" -class PipelineOperator(BaseModel): +class PipelineOperator(OperatorDetailSchema): # 画布上的pipeline类 """Pipeline算子模型""" name: str = Field(..., description="算子名称") params: Dict[str, Any] = Field(default_factory=dict, description="算子参数配置") - @field_validator('name') - def validate_operator_name(cls, v: str) -> str: - """验证算子名称格式""" - if not v.replace('_', '').isalnum(): - raise ValueError('Operator name can only contain letters, numbers and underscores') - # 后续可以补充从可用算子集中验证算子名称是否存在 - return v + # @field_validator('name') + # def validate_operator_name(cls, v: str) -> str: + # """验证算子名称格式""" + # if not v.replace('_', '').isalnum(): + # raise ValueError('Operator name can only contain letters, numbers and underscores') + # # 后续可以补充从可用算子集中验证算子名称是否存在 + # return v class PipelineConfig(BaseModel): diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index 9a0bef2..c627bb9 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -50,7 +50,6 @@ def _ensure(self): "config": { "file_path": os.path.join(api_pipelines_dir, filename), "module_name": f"{settings.DataFlow_CORE_DIR.replace('/', '.')}.api_pipelines.{filename[:-3]}", - "type": "api_pipeline", "input_dataset": "" }, "tags": ["api"], diff --git a/backend/resources/ops.json b/backend/resources/ops.json new file mode 100644 index 0000000..231f235 --- /dev/null +++ b/backend/resources/ops.json @@ -0,0 +1,18050 @@ +{ + "agentic_rag": [ + { + "node": 1, + "name": "AgenticRAGQAF1SampleEvaluator", + "description": "用于评估预测答案与多个参考答案之间的 F1 分数", + "type": { + "level_1": "agentic_rag", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_prediction_key", + "default": "refined_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_ground_truth_key", + "default": "golden_doc_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "F1Score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 2, + "name": "AgenticRAGAtomicTaskGenerator", + "description": "该算子用于为提供的文本内容生成合适的高质量问题与可验证答案。\n\n输入参数:\n- input_key: 输入文本内容字段名(默认值:\"prompts\")\n- output_question_key: 输出问题字段名(默认值:\"question\")\n- output_answer_key: 输出答案字段名(默认值:\"answer\")\n- output_refined_answer_key: 输出精炼答案字段名(默认值:\"refined_answer\")\n- output_optional_answer_key: 输出可替代精炼答案字段名(默认值:\"optional_answer\")\n- output_golden_doc_answer_key: 输出黄金文档回答字段名(默认值:\"golden_doc_answer\")\n", + "type": { + "level_1": "agentic_rag", + "level_2": "generate" + }, + "allowed_prompts": [ + "AtomicTaskGeneratorGetIdentifierPrompt", + "AtomicTaskGeneratorGetConlcusionPrompt", + "AtomicTaskGeneratorQuestionPrompt", + "AtomicTaskGeneratorCleanQAPrompt", + "AtomicTaskGeneratorAnswerPrompt", + "AtomicTaskGeneratorRecallScorePrompt", + "AtomicTaskGeneratorOptionalAnswerPrompt", + "AtomicTaskGeneratorGoldenDocAnswerPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "data_num", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_per_task", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_question", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "prompts", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_key", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_refined_answer_key", + "default": "refined_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_optional_answer_key", + "default": "optional_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_llm_answer_key", + "default": "llm_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_golden_doc_answer_key", + "default": "golden_doc_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 3, + "name": "AgenticRAGDepthQAGenerator", + "description": "该算子以已有问答生成更深度的问题。\n\n输入参数:\n- input_key: 输入字段名(默认值:\"question\")\n- output_key: 输出字段名(默认值:\"depth_question\")\n", + "type": { + "level_1": "agentic_rag", + "level_2": "generate" + }, + "allowed_prompts": [ + "DepthQAGeneratorGetIdentifierPrompt", + "DepthQAGeneratorBackwardTaskPrompt", + "DepthQAGeneratorSupersetCheckPrompt", + "DepthQAGeneratorQuestionPrompt", + "DepthQAGeneratorAnswerPrompt", + "DepthQAGeneratorRecallScorePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "n_rounds", + "default": 2, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "depth_question", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 4, + "name": "AgenticRAGWidthQAGenerator", + "description": "该算子用于结合两个问答,生成新的问题。\n\n输入参数:\n- input_question_key: 输入问题字段名(默认值:\"question\")\n- input_identifier_key: 输入标识符字段名(默认值:\"identifier\")\n- input_answer_key: 输入答案字段名(默认值:\"answer\")\n- output_question_key: 输出问题字段名(默认值:\"generated_width_task\")\n", + "type": { + "level_1": "agentic_rag", + "level_2": "generate" + }, + "allowed_prompts": [ + "WidthQAGeneratorMergePrompt", + "WidthQAGeneratorOriginCheckPrompt", + "WidthQAGeneratorQuestionVerifyPrompt", + "WidthQAGeneratorAnswerPrompt", + "WidthQAGeneratorRecallScorePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_identifier_key", + "default": "identifier", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_key", + "default": "generated_width_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "chemistry": [ + { + "node": 5, + "name": "ExtractSmilesFromTextGenerator", + "description": "ExtractSmilesFromText 用于从 OCR 文本中抽取或解析化学分子的 SMILES 表达式。算子会根据给定的提示模板(prompt_template),结合文本内容和(可选的)单体缩写信息,调用大语言模型完成解析与结构化,并将结果以 JSON 格式写回到指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- prompt_template:提示词模板对象,用于构造模型输入\n- input_content_key: OCR 文本的列名(默认 'text')\n- input_abbreviation_key:包含缩写/单体信息的列名(默认 'abbreviations'),可为空\n- output_key:写回抽取结果的列名(默认 'synth_smiles')\n\n输出参数:\n- DataFrame,其中 output_key 列为模型返回并经 JSON 解析后的 SMILES 结构\n- 返回 output_key,供后续算子引用\n\n备注:\n- 模型输出会尝试解析为 JSON;若解析失败,将返回 [] 并记录失败次数。", + "type": { + "level_1": "chemistry", + "level_2": "generate" + }, + "allowed_prompts": [ + "ExtractSmilesFromTextPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_content_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_abbreviation_key", + "default": "abbreviations", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "synth_smiles", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 6, + "name": "SmilesEquivalenceDatasetEvaluator", + "description": "评估 golden_label 与 synth_smiles 的 SMILES 等价性并计算分数。逐块输出 final_result、块内得分与准确率,并统计全局总分。", + "type": { + "level_1": "chemistry", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_golden_key", + "default": "golden_label", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_synth_key", + "default": "synth_smiles", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "final_result", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "code": [ + { + "node": 7, + "name": "CodeAutoGeneratedSampleEvaluator", + "description": "基于自动生成标记评估代码样本,检测文件头部的自动生成标记。\n\n评估指标:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分 (0-1,1表示非自动生成)\n\n输入要求:需要包含'lines'列\n\n输出参数:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "is_generated_func", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 8, + "name": "CodeAutoGeneratedFilter", + "description": "基于CodeAutoGeneratedSampleEvaluator的得分过滤自动生成的代码文件,确保只保留人工编写的代码。\n\n评估指标:\n- 自动生成标记数量:检测文件前5行中的自动生成标记\n- 检测标记:'auto-generated', 'autogenerated', 'automatically generated'等\n- 综合自动生成得分:0-1,1表示非自动生成\n- 支持外部检测函数进行额外验证\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'列)\n- output_key: 输出标签字段名 (默认: 'auto_generated_filter_label')\n- min_score: 最小自动生成得分阈值 (默认: 1.0)\n- max_score: 最大自动生成得分阈值 (默认: 1.0)\n- is_generated_func: 可选的外部检测函数\n\n输出参数:\n- 过滤后的DataFrame,仅保留自动生成得分在指定范围内的代码样本\n- 返回包含自动生成得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "is_generated_func", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "auto_generated_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 9, + "name": "CodeDocumentQualitySampleEvaluator", + "description": "基于综合文档级质量指标评估代码样本,包括内容长度、重复模式、字符组成和文本熵值。\n\n评估指标:\n- CodeDocumentQualityCharCount: 字符数\n- CodeDocumentQualityWordCount: 词数\n- CodeDocumentQualityDuplicateLinesRatio: 重复行比例\n- CodeDocumentQualityDuplicateNgramRatio: n-gram重复比例\n- CodeDocumentQualityCurlyBracketRatio: 花括号比例\n- CodeDocumentQualityAllCapsRatio: 全大写单词比例\n- CodeDocumentQualityEntropy: 单字符熵值\n- CodeDocumentQualityScore: 综合文档质量得分 (0-1,1表示通过所有质量检查)\n\n输入要求:需要包含'text'、'filename'、'language'列\n\n输出参数:\n- 各种质量指标的数值\n- CodeDocumentQualityScore: 综合文档质量得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "thresholds", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 10, + "name": "CodeDocumentQualityFilter", + "description": "基于CodeDocumentQualitySampleEvaluator的得分应用综合文档级质量过滤规则,移除低质量代码和文本样本。\n\n评估指标:\n- 内容长度:字符数、词数、行数范围检查\n- 重复模式:重复行比例、2-10gram重复比例\n- 字符组成:花括号比例、全大写单词比例\n- 文本熵值:单字符熵值检查\n- 综合文档质量得分:0-1,1表示通过所有质量检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'、'filename'、'language'列)\n- output_key: 输出标签字段名 (默认: 'doc_quality_filter_label')\n- min_score: 最小文档质量得分阈值 (默认: 1.0)\n- max_score: 最大文档质量得分阈值 (默认: 1.0)\n- thresholds: 可选的阈值字典,用于覆盖默认阈值\n\n输出参数:\n- 过滤后的DataFrame,仅保留文档质量得分在指定范围内的样本\n- 返回包含文档质量得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "thresholds", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "doc_quality_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 11, + "name": "CodeEncodedDataSampleEvaluator", + "description": "基于编码数据模式评估代码样本,检测Base64、十六进制和Unicode转义序列。\n\n评估指标:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分 (0-1,1表示通过编码数据检查)\n\n输入要求:需要包含'text'列\n\n输出参数:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 12, + "name": "CodeEncodedDataFilter", + "description": "基于CodeEncodedDataSampleEvaluator的得分过滤代码样本,移除二进制内容和自动生成代码。\n\n评估指标:\n- Base64编码数据比例:检测连续64+字符的Base64字符串\n- 十六进制数据比例:检测8+个连续的十六进制对\n- Unicode转义序列比例:检测8+个连续的\\uXXXX序列\n- 综合编码数据得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'列)\n- output_key: 输出标签字段名 (默认: 'encoded_data_filter_label')\n- min_score: 最小编码数据得分阈值 (默认: 1.0)\n- max_score: 最大编码数据得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留编码数据得分在指定范围内的代码样本\n- 返回包含编码数据得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "encoded_data_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 13, + "name": "CodeFileTypeContentFilter", + "description": "基于文件类型和内容特征直接过滤代码样本,针对不同文件格式应用特定规则。\n\n过滤规则:\n- Text/JSON/YAML/Graphviz文件:行数 > 512 行\n- HTML文件:可见文本长度 < 100字符 或 可见文本比例 < 20%\n- Text文件:文件名不符合文档规范(非readme/notes/todo等)\n\n输入参数:\n- input_key: 输入字段名(需要包含'filetype'、'filename'、'line_count'等列)\n- output_key: 输出标签字段名 (默认: 'file_type_content_filter_label')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合文件类型规则的样本\n- 返回包含输出标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "file_type_content_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 14, + "name": "CodeLengthSampleEvaluator", + "description": "基于代码长度特征评估代码样本,分析总行数、平均行长和最大行长。\n\n评估指标:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分 (0-1,1表示通过所有长度检查)\n\n输入要求:需要包含'lines'和'language'列\n\n输出参数:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 15, + "name": "CodeLengthSampleFilter", + "description": "基于CodeLengthSampleEvaluator的得分过滤代码样本,移除超大文件和格式不良的代码。\n\n评估指标:\n- 总行数:检查是否超过100,000行\n- 平均行长:普通语言>100字符,特殊语言>100,000字符\n- 最大行长:普通语言>1,000字符\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'和'language'列)\n- output_key: 输出标签字段名 (默认: 'length_filter_label')\n- min_score: 最小长度得分阈值 (默认: 1.0)\n- max_score: 最大长度得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留长度得分在指定范围内的代码样本\n- 返回包含长度得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "length_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 16, + "name": "CodeQualitySampleEvaluator", + "description": "该算子用于评估生成的代码片段与其源指令的匹配质量,并输出分数和反馈。\n\n输入参数:\n- input_instruction_key: 包含人类指令的字段名 (默认: 'generated_instruction')\n- input_code_key: 包含生成代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_score_key: 用于存储质量分数的字段名 (默认: 'quality_score')\n- output_feedback_key: 用于存储质量反馈的字段名 (默认: 'quality_feedback')\n", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [ + "CodeQualityEvaluatorPrompt", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_code_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_score_key", + "default": "quality_score", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_feedback_key", + "default": "quality_feedback", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 17, + "name": "CodeQualityScoreFilter", + "description": "基于LLM生成的代码质量分数过滤代码样本,评估正确性、完整性、清晰度、最佳实践和效率。\n\n评估维度:\n- 正确性:代码语法和逻辑是否正确\n- 完整性:代码是否完整实现功能\n- 清晰度:代码是否清晰易懂\n- 最佳实践:是否遵循编程最佳实践\n- 效率:代码执行效率如何\n\n输入参数:\n- input_code_key: 输入代码字段名\n- input_instruction_key: 输入指令字段名\n- output_score_key: 输出打分字段名 (默认: 'quality_score')\n- output_feedback_key: 输出反馈字段名 (默认: 'quality_feedback')\n- output_key: 输出过滤标签字段名 (默认: 'quality_score_filter_label')\n- min_score: 最小质量分数阈值 (默认: 7)\n- max_score: 最大质量分数阈值 (默认: 10)\n\n输出参数:\n- 过滤后的DataFrame,仅保留质量分数在指定范围内的代码样本\n- 返回包含质量分数标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_score", + "default": 7, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_code_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_score_key", + "default": "quality_score", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_feedback_key", + "default": "quality_feedback", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "quality_score_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 18, + "name": "CodeGenericScoreFilter", + "description": "基于数值分数列直接过滤数据集,提供灵活的阈值比较方法。\n\n比较方法:\n- greater_equal: 分数 >= 阈值\n- greater: 分数 > 阈值\n- less_equal: 分数 <= 阈值\n- less: 分数 < 阈值\n- equal: 分数 = 阈值\n\n输入参数:\n- input_key: 包含分数的字段名\n- output_key: 输出标签字段名 (默认: 'generic_score_filter_label')\n- score_threshold: 分数阈值 (默认: 8)\n- filter_method: 比较方法 (默认: 'greater_equal')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合分数条件的样本\n- 返回包含输出标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "score_threshold", + "default": 8, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "filter_method", + "default": "greater_equal", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generic_score_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 19, + "name": "CodeTextCompositionSampleEvaluator", + "description": "基于字符组成评估代码样本,分析字母字符和字母数字字符的比例。\n\n评估指标:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分 (0-1,1表示通过字符组成检查)\n\n输入要求:需要包含'text'和'language'列\n\n输出参数:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 20, + "name": "CodeTextCompositionFilter", + "description": "基于CodeTextCompositionSampleEvaluator的得分过滤代码样本,移除二进制文件、加密内容和不可读文本。\n\n评估指标:\n- 字母字符比例:普通语言需要>=25%\n- 字母数字字符比例:汇编语言需要>=25%\n- 综合字符组成得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'和'language'列)\n- output_key: 输出标签字段名 (默认: 'text_composition_filter_label')\n- min_score: 最小字符组成得分阈值 (默认: 1.0)\n- max_score: 最大字符组成得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留字符组成得分在指定范围内的代码样本\n- 返回包含字符组成得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "text_composition_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 21, + "name": "CodeCodeToInstructionGenerator", + "description": "该算子用于分析代码片段并反向生成可能产生该代码的人类指令。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeCodeToInstructionGeneratorPrompt", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "code", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_instruction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 22, + "name": "CodeInstructionToCodeGenerator", + "description": "该算子根据给定的人类指令生成相应的代码片段。\n\n输入参数:\n- input_key: 包含人类指令的字段名 (默认: 'instruction')\n输出参数:\n- output_key: 用于存储生成代码的字段名 (默认: 'generated_code')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeInstructionToCodeGeneratorPrompt", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_code", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 23, + "name": "CodeEnhancementInstructionGenerator", + "description": "该算子用于增强人类指令,将不同输出格式的任务统一为生成完整函数。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeInstructionEnhancement", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "messages", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_instruction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 24, + "name": "CodeInstructionGenerator", + "description": "该算子用于生成新的指令,从数据池中随机抽取few-shot样本,生成类似难度的指令。\n\n输入参数:\n- input_key: 包含原始指令的字段名 (默认: 'prompt')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeInstructionGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_few_shot", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_generate", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "prompt", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_instruction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 25, + "name": "CodeSandboxSampleEvaluator", + "description": "该算子在一个安全的沙箱环境中执行代码片段以验证其正确性。\n\n输入参数:\n- input_code_key: 包含待执行代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_status_key: 用于存储执行状态 ('PASS' 或 'FAIL') 的字段名 (默认: 'sandbox_status')\n- output_log_key: 用于存储执行日志或错误信息的字段名 (默认: 'sandbox_log')\n", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "language", + "default": "python", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "timeout_length", + "default": 15, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_process_isolation", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_status_key", + "default": "sandbox_status", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_log_key", + "default": "sandbox_log", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "conversations": [ + { + "node": 26, + "name": "ScenarioExtractGenerator", + "description": "从对话内容中提取场景信息,使用LLM服务分析对话并生成场景描述。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_chat_key:对话内容字段名\n- output_key:输出场景字段名,默认'scenario'\n输出参数:\n- 包含提取场景信息的DataFrame\n- 包含输出字段名的列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ExtractScenarioPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_chat_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "scenario", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 27, + "name": "ScenarioExpandGenerator", + "description": "基于原始场景生成新的替代场景,使用LLM服务重写或改写原有场景内容。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:原始场景字段名\n- output_key:生成的新场景字段名,默认'modified_scenario'\n输出参数:\n- 包含生成新场景的DataFrame\n- 包含输出字段名的列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ExpandScenarioPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_scenario_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "modified_scenario", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 28, + "name": "AtomTaskGenerator", + "description": "根据输入的场景信息,使用LLM服务生成对应的原子任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:场景字段名\n- output_key:原子任务的输出字段名,默认'atom_task'\n输出参数:\n- 包含原子任务的DataFrame\n- 包含输出字段名的列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "FuncAtomicTaskGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_scenario_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "atom_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 29, + "name": "SequentialTaskGenerator", + "description": "根据输入的原子任务,使用LLM服务生成该任务的后继任务和两者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含后继任务和组合任务的DataFrame\n- 输出字段名的列表(后继任务字段和组合任务字段)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "SequentialTaskGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_subsequent_task_key", + "default": "subsequent_task", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_composition_task_key", + "default": "composition_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 30, + "name": "ParaSeqTaskGenerator", + "description": "基于原子任务,使用LLM服务生成三个任务类型:并行任务、后继任务以及这三者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_parallel_task_key:并行任务输出字段名,默认'parallel_task'\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含并行任务、后继任务与组合任务的DataFrame\n- 输出字段名列表(并行任务、后继任务、组合任务)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ParathenSeqTaskGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_parallel_task_key", + "default": "parallel_task", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_subsequent_task_key", + "default": "subsequent_task", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_composition_task_key", + "default": "composition_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 31, + "name": "FunctionGenerator", + "description": "基于组合任务及其相关子任务,使用LLM服务生成对应的函数列表。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:函数列表输出字段名,默认'functions'\n输出参数:\n- 包含函数定义或函数列表的DataFrame\n- 输出字段名的列表(函数列表字段)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "FuncGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_composition_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sub_tasks_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "functions", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 32, + "name": "MultiTurnConversationGenerator", + "description": "根据组合任务及其子任务函数,使用LLM服务模拟多轮对话过程,由User、Assistant和Tool三个Agent协同生成完整的对话数据。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:任务字段名(组合任务)\n- input_sub_tasks_keys:子任务字段名列表\n- input_functions_key:子任务函数字段名\n- output_conversations_key:输出对话字段名,默认'conversations'\n输出参数:\n- 包含已完成的多轮对话记录的DataFrame\n- 输出字段名(对话字段名)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ConversationUserPrompt", + "ConversationAssistantPrompt", + "ConversationToolPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sub_tasks_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_functions_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_conversations_key", + "default": "conversations", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 33, + "name": "ConsistentChatGenerator", + "description": "根据预置主题和人类意图,两阶段从0合成多轮对话格式数据(合成数量大于9000时建议增加标签数量)。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_dialogs_per_intent:每个意图生成的对话数量,默认20\n- num_turns_per_dialog:每个对话的轮次数量,默认6\n- temperature:生成温度,控制输出随机性,默认0.9\n输出参数:\n- 包含category和conversation字段的DataFrame,其中conversation为多轮对话列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ConsistentQueryPrompt", + "ConsistentResponsePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_dialogs_per_intent", + "default": 20, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_turns_per_dialog", + "default": 6, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "temperature", + "default": 0.9, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 34, + "name": "FuncCallConversationSampleEvaluator", + "description": "对对话样本进行打分评估:使用 LLM 服务根据预设评分提示词对每条对话进行评分,并将结果写回数据流。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- input_conversation_key:DataFrame 中对话内容字段名,默认 'conversations'\n- output_score_key:评分结果输出字段名,默认 'score'\n处理流程:\n- 读取存储中的 DataFrame\n- 将每条对话重组为评分提示词并调用 LLM 生成评分(JSON)\n- 解析 JSON,提取 'score' 字段写入 DataFrame;解析失败则回退为 0\n输出参数:\n- 包含评分结果列的 DataFrame\n- 包含输出字段名的列表(仅 'score' 或自定义的输出列名)", + "type": { + "level_1": "conversations", + "level_2": "eval" + }, + "allowed_prompts": [ + "ConversationEvalPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_conversation_key", + "default": "conversations", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_score_key", + "default": "score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 35, + "name": "CompositionTaskFilter", + "description": "根据组合任务及其子任务,使用LLM服务判断组合任务是否具备可行性与完备性,从而进行可运行任务的筛选。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:可运行标签的输出字段名,默认'runable_label'\n输出参数:\n- 仅包含可运行组合任务的数据DataFrame\n- 包含输出字段名的列表(可运行标签字段)", + "type": { + "level_1": "conversations", + "level_2": "filter" + }, + "allowed_prompts": [ + "CompositionTaskFilterPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_composition_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sub_tasks_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "runable_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "core_speech": [ + { + "node": 36, + "name": "Speech2TextGenerator", + "description": "该算子用于将语音内容转录为文本。它接收语音文件路径或URL,使用大语言模型进行转录,并将转录结果保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant'\n- input_key:输入语音文件路径或URL的字段名,默认为'raw_content'\n- output_key:输出转录文本的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含转录文本的新列", + "type": { + "level_1": "core_speech", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "core_text": [ + { + "node": 37, + "name": "PromptedGenerator", + "description": "基于用户提供的提示词(prompt)生成数据。结合系统提示词和输入内容生成符合要求的输出文本。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,定义模型行为,默认为'You are a helpful agent.'\n- input_key:输入内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "json_schema", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 38, + "name": "PairedPromptedGenerator", + "description": "PairedPromptedGenerator:基于两列配对输入(input_key_1 与 input_key_2)进行成对提示生成。\n算子会将 system_prompt 与每行的两列文本按固定模板拼接后,调用 LLM 服务批量生成结果,并将模型输出写回到 DataFrame 的指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象(实现 LLMServingABC 接口)\n- system_prompt:系统提示词(默认 'You are a helpful agent.')。该提示会放在每条样本前缀, 用于约束模型的角色与输出风格。\n- input_key_1:第一列输入字段名(默认 'input_key_1')\n- input_key_2:第二列输入字段名(默认 'input_key_2')\n- output_key:输出字段名(默认 'generated_content')\n\n处理逻辑:\n1) 从 storage 中读取名为 'dataframe' 的 DataFrame;\n2) 对于每一行,若 input_key_1 与 input_key_2 均非空,则按模板:\n system_prompt + input_key_1 + 值 + '\\n' + input_key_2 + 值\n 构造 LLM 输入;\n3) 批量调用 llm_serving.generate_from_input 生成文本;\n4) 将生成结果写入 DataFrame 的 output_key 列并保存。\n\n输出:\n- 返回写入了生成结果的新 DataFrame(由 storage 管理保存),\n- 返回 output_key 以便后续算子引用。", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key_1", + "default": "input_key_1", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key_2", + "default": "input_key_2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 39, + "name": "RandomDomainKnowledgeRowGenerator", + "description": "N/A (调用失败)", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [ + "SFTFromScratchGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "generation_num", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "domain_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 40, + "name": "Text2QAGenerator", + "description": "该算子用于为给定的文档片段生成种子QA对。\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- prompt_key: 包含提示词的字段名\n- output_quesion_key: 包含生成问题的字段名\n- output_answer_key: 包含生成答案的字段名\n", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2QAAutoPromptGeneratorPrompt", + "Text2QASeedQuestionGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_num", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_prompt_key", + "default": "generated_prompt", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_quesion_key", + "default": "generated_question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_key", + "default": "generated_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 41, + "name": "Text2MultiHopQAGenerator", + "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2MultiHopQAGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "seed", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_q", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "cleaned_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "QA_pairs", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_meta_key", + "default": "QA_metadata", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 42, + "name": "EmbeddingGenerator", + "description": "EmbeddingGenerator算子用于从输入文本生成向量表示(embedding),通常用于语义检索、聚类或下游模型输入等任务。\n\n输入参数:\n- embedding_serving:Embedding服务对象,需实现LLMServingABC接口,用于生成文本的向量表示\n- input_key:输入文本字段名,默认为'text'\n- output_key:输出向量字段名,默认为'embeddings'\n\n输出参数:\n- 包含文本向量的DataFrame,每行对应一个输入文本的embedding\n- 返回输出字段名(如'embeddings'),可供后续算子引用", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "embedding_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "embeddings", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 43, + "name": "RetrievalGenerator", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "json_schema", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 44, + "name": "BenchDatasetEvaluator", + "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估语义相似度,仅输入预测答案与标准答案\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "eval_result_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "compare_method", + "default": "match", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant specialized in evaluating answer correctness.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_test_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_answer_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 45, + "name": "BenchDatasetEvaluatorQuestion", + "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- input_question_key:问题字段名(语义匹配模式下必需)\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "eval_result_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "compare_method", + "default": "match", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant specialized in evaluating answer correctness.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_test_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_answer_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 46, + "name": "Text2QASampleEvaluator", + "description": "该算子用于为给的的文档片段生成种子QA对打分\n\n输入参数:\n- input_question_key: Field name containing the generated question\n- input_answer_key: Field name containing the generated answer\n- output_question_quality_key: Field name containing the question quality grade\n- output_question_quality_feedback_key: Field name containing the question quality feedback\n- output_answer_alignment_key: Field name containing the answer alignment grade\n- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n- output_answer_verifiability_key: Field name containing the answer verifiability grade\n- output_downstream_value_key: Field name containing the downstream value grade\n- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [ + "Text2QAQuestionQualityPrompt", + "Text2QAAnswerAlignmentPrompt", + "Text2QAAnswerVerifiabilityPrompt", + "Text2QADownstreamValuePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "generated_question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "generated_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_quality_key", + "default": "question_quality_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_quality_feedback_key", + "default": "question_quality_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_alignment_key", + "default": "answer_alignment_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_alignment_feedback_key", + "default": "answer_alignment_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_verifiability_key", + "default": "answer_verifiability_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_verifiability_feedback_key", + "default": "answer_verifiability_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_downstream_value_key", + "default": "downstream_value_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_downstream_value_feedback_key", + "default": "downstream_value_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 47, + "name": "PromptedEvaluator", + "description": "PromptedEvaluator:使用 LLM 根据系统提示词对数据质量进行评分,并将评分写回 DataFrame(同时通过 storage 持久化)。模型应只输出分数(整数)。\n功能:对每行输入文本生成一个评分。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口。\n- system_prompt:系统提示词(默认:'Please evaluate the quality of this data on a scale from 1 to 5.')。\n- input_key:输入文本所在列名(默认:'raw_content')。\n- output_key:评分结果写入的列名(默认:'eval')。\n输出:\n- 返回输出列名(用于后续算子引用),评分结果已写回并保存。", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "Please evaluate the quality of this data on a scale from 1 to 5.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "eval", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 48, + "name": "PromptedFilter", + "description": "PromptedFilter 使用内置的 PromptedEvaluator 对输入数据进行数值化打分,并根据指定的分数区间(min_score 到 max_score,闭区间)筛选出符合条件的样本。默认情况下打分范围是 1–5,但用户可以通过 system_prompt 自定义其他评分规则。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,定义评估规范(可选,默认 'Please evaluate the quality of this data on a scale from 1 to 5.')\n- input_key:待评估文本所在列名(默认 'raw_content')\n- output_key:写回打分结果的列名(默认 'eval',若已存在将被覆盖)\n- min_score:筛选的最小分(默认 5)\n- max_score:筛选的最大分(默认 5)\n\n输出参数:\n- 过滤后的 DataFrame(仅保留分数位于 [min_score, max_score] 的行)\n- 返回 output_key 以供后续算子引用\n\n备注:\n- 默认打分区间是 1–5,但可根据实际 prompt 改变。", + "type": { + "level_1": "core_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "Please evaluate the quality of this data on a scale from 1 to 5.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_score", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "eval", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 49, + "name": "KCenterGreedyFilter", + "description": "该算子用于从大量的文档片段中选取部分文档片段,用于后续生成种子QA对\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- embedding_model_path: 嵌入模型路径\n- num_samples: 选取的文档片段数量\n- method: 选择方法,随机或k-center-greedy\n\n", + "type": { + "level_1": "core_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "num_samples", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "embedding_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 50, + "name": "GeneralFilter", + "description": "该算子支持通过多个自定义函数对 DataFrame 进行灵活过滤。\n\n每条过滤规则是一个函数(例如 lambda 表达式),接受一个 DataFrame 并返回一个布尔类型的 Series,用于指定保留哪些行。\n\n输入参数:\n- filter_rules:一个函数列表,每个函数形式为 lambda df: ...,需返回一个与 df 长度一致的布尔 Series。所有规则之间采用与(AND)关系组合。\n\n示例:\n - lambda df: df['score'] > 0.5\n - lambda df: df['label'].isin(['A', 'B'])", + "type": { + "level_1": "core_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "filter_rules", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 51, + "name": "PromptedRefiner", + "description": "PromptedRefiner 根据给定的 system_prompt 对指定列的文本进行改写/润色/规范化,并将结果**就地写回**同一列(覆盖原内容)。其做法是对每一行拼接 `system_prompt + raw_content` 作为模型输入,批量生成改写结果。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,用于描述改写目标与风格(默认 'You are a helpful agent.')\n- input_key:要改写的文本列名(默认 'raw_content'),改写后会覆盖该列\n\n输出参数:\n- 覆盖后的 DataFrame(同名列被改写后的文本)\n- 无返回值(结果已通过 DataFlowStorage 写出)\n\n备注:\n- 该算子**覆盖** input_key 列;若需保留原文,建议先拷贝到新列。\n- 期望每行在 input_key 列提供可用文本;空值将不会生成对应输入,如与行数不匹配可能导致赋值报错。", + "type": { + "level_1": "core_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 52, + "name": "PandasOperator", + "description": "该算子支持通过多个自定义函数对 DataFrame 进行任意操作(如添加列、重命名、排序等)。\n\n每个函数(通常为 lambda 表达式)接受一个 DataFrame 并返回一个修改后的 DataFrame。\n\n输入参数:\n- process_fn:一个函数列表,每个函数形式为 lambda df: ...,必须返回一个 DataFrame。\n\n示例:\n - lambda df: df.assign(score2=df['score'] * 2)\n - lambda df: df.sort_values('score', ascending=False)", + "type": { + "level_1": "core_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "process_fn", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "core_vision": [ + { + "node": 53, + "name": "PromptedVQAGenerator", + "description": "该算子用于视觉问答生成,接收包含图像和问题的输入内容,使用大语言模型生成回答,并将生成的回答保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant.'\n- input_key:输入内容的字段名,默认为'raw_content'\n- output_key:输出生成内容的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含生成回答的新列", + "type": { + "level_1": "core_vision", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "db": [ + { + "node": 54, + "name": "DBOperator", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "db", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "expr", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "general_text": [ + { + "node": 55, + "name": "ColonEndFilter", + "description": "该算子用于检查文本是否以冒号结尾,常用于判断问题是否为不完整的提问。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'{类名小写}_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 56, + "name": "SentenceNumberFilter", + "description": "该算子用于检查文本中的句子数量是否在指定范围内,使用正则表达式匹配句子结束符号(。!?.!?)进行分割。\n初始化参数:\n- min_sentences:最小句子数量阈值,默认为3\n- max_sentences:最大句子数量阈值,默认为7500\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'sentence_number_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_sentences", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_sentences", + "default": 7500, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "sentence_number_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 57, + "name": "LineEndWithEllipsisFilter", + "description": "该算子用于检测并过滤以省略号(...)或(……)结尾的文本行,常用于识别不完整的表述。\n初始化参数:\n- threshold:以省略号结尾的行数比率阈值,默认为0.3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_end_with_ellipsis_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "line_end_with_ellipsis_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 58, + "name": "ContentNullFilter", + "description": "该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'content_null_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "content_null_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 59, + "name": "SymbolWordRatioFilter", + "description": "该算子用于检查文本中特定符号(#, ..., …)与单词数量的比率是否超过阈值,过滤符号使用过多的文本。\n初始化参数:\n- threshold:符号与单词比率阈值,默认为0.4\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'symbol_word_ratio_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.4, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "symbol_word_ratio_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 60, + "name": "AlphaWordsFilter", + "description": "该算子用于验证文本中字母单词的比率是否达到阈值,支持NLTK分词或简单空格分割两种模式。\n初始化参数:\n- threshold:字母单词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'alpha_words_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "alpha_words_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 61, + "name": "HtmlEntityFilter", + "description": "该算子用于检测并过滤包含HTML实体(如&、<、>等)的文本,确保内容不包含标记语言元素。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'html_entity_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "html_entity_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 62, + "name": "IDCardFilter", + "description": "该算子用于检测并过滤包含身份证相关术语的文本,使用正则表达式匹配身份证号码模式以保护敏感信息。\n初始化参数:\n- threshold:身份证相关词汇匹配次数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'id_card_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "id_card_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 63, + "name": "NoPuncFilter", + "description": "该算子用于确保文本包含足够的标点符号,通过统计句子间最大单词数量进行过滤。\n初始化参数:\n- threshold:句子间最大单词数量阈值,默认为112\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'no_punc_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 112, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "no_punc_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 64, + "name": "SpecialCharacterFilter", + "description": "该算子用于移除包含特殊/unicode字符的文本,使用预定义模式检测非标准字符以确保文本规范性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'special_character_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "special_character_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 65, + "name": "WatermarkFilter", + "description": "该算子用于检测并移除包含版权/水印内容的文本,使用指定关键词列表识别受保护内容。\n初始化参数:\n- watermarks:水印关键词列表,默认为['Copyright', 'Watermark', 'Confidential']\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'watermark_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "watermarks", + "default": [ + "Copyright", + "Watermark", + "Confidential" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "watermark_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 66, + "name": "MeanWordLengthFilter", + "description": "该算子用于检查文本中单词的平均长度是否在指定范围内,通过字符总数除以单词数量计算平均值。\n初始化参数:\n- min_length:最小平均单词长度,默认为3\n- max_length:最大平均单词长度,默认为10\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'mean_word_length_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_length", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "mean_word_length_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 67, + "name": "StopWordFilter", + "description": "该算子用于验证文本中停用词的比率是否高于阈值,使用NLTK分词器进行单词分割和停用词识别。\n初始化参数:\n- threshold:停用词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'stop_word_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "stop_word_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 68, + "name": "CurlyBracketFilter", + "description": "该算子用于检测文本中是否存在过多的花括号使用,通过花括号数量与文本长度的比率进行过滤。\n初始化参数:\n- threshold:花括号比率阈值,默认为0.025\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'curly_bracket_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.025, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "curly_bracket_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 69, + "name": "CapitalWordsFilter", + "description": "该算子用于检查文本中大写单词的比率是否超过阈值,支持可选的分词器进行单词识别。\n初始化参数:\n- threshold:大写单词比率阈值,默认为0.2\n- use_tokenizer:是否使用NLTK分词器,默认为False\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'capital_words_filter'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.2, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "capital_words_filter", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 70, + "name": "LoremIpsumFilter", + "description": "该算子用于检测并过滤包含占位文本(如'lorem ipsum')的文本,使用正则表达式模式匹配并结合阈值过滤。\n初始化参数:\n- threshold:'lorem ipsum'出现次数与文本长度的比率阈值,默认为3e-8\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'loremipsum_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 3e-08, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "loremipsum_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 71, + "name": "UniqueWordsFilter", + "description": "该算子用于检查文本中唯一单词的比率是否达到阈值,通过集合操作计算唯一单词数量与总单词数量的比率。\n初始化参数:\n- threshold:最小唯一单词比率阈值,默认为0.1\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'unique_words_filter'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.1, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "unique_words_filter", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 72, + "name": "CharNumberFilter", + "description": "该算子用于验证文本在去除空白字符后的字符数量是否达到最小阈值。\n初始化参数:\n- threshold:最小字符数量阈值,默认为100\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'char_number_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "char_number_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 73, + "name": "LineStartWithBulletpointFilter", + "description": "该算子用于检测并过滤以各种项目符号符号开头的文本行,使用Unicode字符匹配结合比率阈值进行过滤。\n初始化参数:\n- threshold:以项目符号开头的行数比率阈值,默认为0.9\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_start_with_bullet_point_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.9, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "line_start_with_bullet_point_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 74, + "name": "LineWithJavascriptFilter", + "description": "该算子用于识别并过滤包含'javascript'引用的文本,通过关键词匹配和阈值判断进行内容过滤。\n初始化参数:\n- threshold:不包含'javascript'的最小行数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_with_javascript_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "line_with_javascript_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 75, + "name": "LangkitSampleEvaluator", + "description": "使用Langkit工具包计算文本统计信息,帮助评估文本结构复杂性和可读性。提取多种语言特征,包括句子长度、词汇多样性、情感倾向等。\n\n输出参数:\n- LangkitNumSentencesScore: 句子数量\n- LangkitNumWordsScore: 单词数量\n- LangkitAvgWordLengthScore: 平均单词长度\n- LangkitFleschReadingEaseScore: 可读性评分(Flesch公式)\n- LangkitSentimentScore: 情感倾向(-1到1之间)", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 76, + "name": "LangkitFilter", + "description": "基于LangkitScorer打分器的得分对数据进行过滤。使用Langkit工具包计算11种文本统计信息,帮助评估文本结构复杂性和可读性。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含11个语言统计指标\n- max_scores:各指标的最大阈值字典,包含11个语言统计指标\n- metrics_to_keep:需要保留的评估指标列表\n输出参数:\n- 过滤后的DataFrame,仅保留所有指标都在指定范围内的文本\n- 返回包含各指标标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_scores", + "default": { + "flesch_reading_ease": 0, + "automated_readability_index": 0, + "aggregate_reading_level": 0, + "syllable_count": 32.0, + "lexicon_count": 23.0, + "sentence_count": 1.0, + "character_count": 118.0, + "letter_count": 109.0, + "polysyllable_count": 0.0, + "monosyllable_count": 13.0, + "difficult_words": 4.0 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_scores", + "default": { + "flesch_reading_ease": 100, + "automated_readability_index": 100, + "aggregate_reading_level": 100, + "syllable_count": 2331.9, + "lexicon_count": 1554.0, + "sentence_count": 89.1, + "character_count": 7466.3, + "letter_count": 7193.0, + "polysyllable_count": 216.4, + "monosyllable_count": 1044.1, + "difficult_words": 213.4 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "metrics_to_keep", + "default": [ + "flesch_reading_ease", + "automated_readability_index", + "aggregate_reading_level", + "syllable_count", + "lexicon_count", + "sentence_count", + "character_count", + "letter_count", + "polysyllable_count", + "monosyllable_count", + "difficult_words" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_keys", + "default": [ + "flesch_reading_ease", + "automated_readability_index", + "aggregate_reading_level", + "syllable_count", + "lexicon_count", + "sentence_count", + "character_count", + "letter_count", + "polysyllable_count", + "monosyllable_count", + "difficult_words" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 77, + "name": "LexicalDiversitySampleEvaluator", + "description": "使用MTLD(词汇多样性测量)和HDD(移动平均类型-标记比)方法计算文本词汇多样性。\n\n功能说明:\n- MTLD(词汇多样性测量):通过计算维持特定TTR阈值所需的单词数量来评估词汇多样性\n- HDD(移动平均类型-标记比):基于样本的词汇丰富度估计\n\n输入要求:文本长度需大于50个单词\n\n输出参数:\n- LexicalDiversityMTLDScore: MTLD多样性得分(值越高表示多样性越好)\n- LexicalDiversityHD-DScore: HDD多样性得分(值越高表示多样性越好)", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 78, + "name": "LexicalDiversityFilter", + "description": "基于LexicalDiversityScorer打分器的得分对数据进行过滤。使用MTLD(移动平均类型-令牌比)和HDD(超几何分布多样性)两种方法计算词汇多样性,高分代表更丰富的词汇使用。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含'mtld'和'hdd'\n- max_scores:各指标的最大阈值字典,包含'mtld'和'hdd'\n输出参数:\n- 过滤后的DataFrame,仅保留词汇多样性在指定范围内的文本\n- 返回包含各指标标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_scores", + "default": { + "mtld": 50, + "hdd": 0.8 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_scores", + "default": { + "mtld": 99999, + "hdd": 1.0 + }, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_keys", + "default": [ + "mtld", + "hdd" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 79, + "name": "NgramSampleEvaluator", + "description": "计算文本中n-gram的重复比例,评估文本冗余度。通过比较唯一n-gram数量与总n-gram数量的比值来衡量文本原创性。\n\n初始化参数:\n- ngrams: n-gram的长度,默认为5\n\n输出参数:\n- NgramScore: n-gram重复比例得分(0到1之间,得分越高表示重复比例越低)", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "ngrams", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "NgramScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 80, + "name": "NgramFilter", + "description": "基于NgramScorer打分器的得分对数据进行过滤。计算文本中n-gram的重复比例,得分越高表示重复比例越低,文本冗余度越小。\n输入参数:\n- min_score:最小n-gram得分阈值\n- max_score:最大n-gram得分阈值\n- ngrams:n-gram的n值\n输出参数:\n- 过滤后的DataFrame,仅保留n-gram得分在指定范围内的文本\n- 返回包含n-gram得分字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.8, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "ngrams", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "NgramScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 81, + "name": "PresidioSampleEvaluator", + "description": "使用Microsoft Presidio模型识别文本中的个人身份信息(PII),返回检测到的PII实体数量。支持多种实体类型如姓名、邮箱、电话号码等,基于dslim/bert-base-NER模型实现。适用于评估文本的隐私安全风险。\n输入参数:\n- text: 待检测的文本字符串\n- lang: 语言类型,默认为'en'\n输出参数:\n- int: 检测到的PII实体数量", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PresidioScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 82, + "name": "PresidioFilter", + "description": "基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\n支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\n输入参数:\n- min_score:保留样本的最小PII数量阈值,默认为0\n- max_score:保留样本的最大PII数量阈值,默认为5\n- lang:文本语言,默认为'en'\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\n- 返回包含输出字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PresidioScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 83, + "name": "BlocklistFilter", + "description": "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- language:语言代码,默认为'zh'\n- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n- threshold:匹配次数阈值,默认为1\n- use_tokenizer:是否使用分词器,默认为True\n- tokenizer:分词器对象,默认为None\n输出参数:\n- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "language", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "threshold", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "blocklist_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 84, + "name": "HashDeduplicateFilter", + "description": "使用多种哈希函数对文本进行精确去重,支持md5、sha256或xxh3算法。通过计算文本的哈希值识别重复数据。\n\n初始化参数:\n- hash_func: 哈希函数名称,可选'md5'、'sha256'或'xxh3',默认为'md5'\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据\n算法特点:\n- md5: 128位哈希值,平衡速度和唯一性\n- sha256: 256位哈希值,更高安全性,速度较慢\n- xxh3: 128位哈希值,最快的哈希算法", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "hash_func", + "default": "md5", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 85, + "name": "LanguageFilter", + "description": "使用FastText语言识别模型过滤数据。下载并加载预训练的FastText语言识别模型,检查文本的语言是否在允许的语言列表中。\n输入参数:\n- allowed_languages:允许的语言标签列表\n- model_cache_dir:模型缓存目录路径\n输出参数:\n- 过滤后的DataFrame,仅保留语言在允许列表中的文本\n- 返回包含语言标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "allowed_languages", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "language_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 86, + "name": "LLMLanguageFilter", + "description": "使用大语言模型识别语言并过滤数据", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "allowed_languages", + "default": [ + "en" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "language_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 87, + "name": "MinHashDeduplicateFilter", + "description": "结合MinHash与LSH(局部敏感哈希)实现高效近似去重。将文本转换为MinHash签名,使用LSH快速查找相似文本,实现大规模数据集的近似去重。\n输入参数:\n- num_perm:生成MinHash签名的排列数\n- threshold:相似度阈值,超过此阈值判定为相似文本\n- use_n_gram:是否使用n-gram分词\n- ngram:n-gram的n值\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "num_perm", + "default": 128, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "threshold", + "default": 0.9, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_n_gram", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "ngram", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 88, + "name": "NgramHashDeduplicateFilter", + "description": "结合n-gram技术与哈希算法识别相似文本,实现近似去重。将文本分割为多个n-gram片段,计算每个片段的哈希值,通过比较哈希集合的相似度来判断文本相似性。\n输入参数:\n- n_gram:将文本分割的片段数量\n- hash_func:哈希函数类型,支持'md5'、'sha256'和'xxh3'\n- diff_size:哈希集合差异阈值,小于此值判定为相似文本\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "n_gram", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "hash_func", + "default": "md5", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "diff_size", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 89, + "name": "PerspectiveSampleEvaluator", + "description": "使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- serving:Perspective API服务对象\n- input_key:输入文本字段名\n- output_key:输出得分字段名,默认'PerspectiveScore'\n输出参数:\n- 包含毒性评估得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerspectiveScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 90, + "name": "PerspectiveFilter", + "description": "基于PerspectiveScorer打分器的得分对数据进行过滤使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- min_score:最小毒性得分阈值\n- max_score:最大毒性得分阈值\n输出参数:\n- 过滤后的DataFrame,仅保留毒性得分在指定范围内的文本\n- 返回包含毒性得分字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 0.5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerspectiveScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 91, + "name": "SemDeduplicateFilter", + "description": "基于BERT语义相似度识别语义重复文本,执行近似去重操作。通过计算文本嵌入向量间的余弦相似度,识别语义相似的文本并保留唯一样本。\n支持多字段组合作为去重依据,可有效去除内容相似但表述不同的重复数据,提高数据集多样性。\n输入参数:\n- eps:相似度阈值,值越小表示允许的相似度越低,默认为0.05(即余弦相似度大于0.95视为重复)\n- model_name:预训练模型名称,默认为'sentence-transformers/all-MiniLM-L6-v2'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:模型运行设备,默认为'cuda'\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留语义不重复的样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "eps", + "default": 0.05, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "sentence-transformers/all-MiniLM-L6-v2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 92, + "name": "SimHashDeduplicateFilter", + "description": "使用SimHash算法通过汉明距离识别相似文本,执行近似去重操作。将文本转换为固定长度的指纹,通过计算指纹间的汉明距离判断文本相似度。\n相比语义去重速度更快,适合大规模数据集的快速去重预处理,尤其适用于检测字符层面相似的文本。\n输入参数:\n- fingerprint_size:指纹长度,默认为64位\n- bound:相似度阈值,值越小表示允许的相似度越低,默认为0.1(即相似度大于0.9视为重复)\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留相似性低于阈值的唯一样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "fingerprint_size", + "default": 64, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "bound", + "default": 0.1, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 93, + "name": "WordNumberFilter", + "description": "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- min_words:最小单词数量阈值,默认为5\n- max_words:最大单词数量阈值,默认为100\n输出参数:\n- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_words", + "default": 20, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_words", + "default": 100000, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "word_number_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 94, + "name": "HtmlEntityRefiner", + "description": "去除文本中的HTML实体,包括标准实体(如 、<)和各种变体形式(全角符号、中文分号等)。支持自定义需要移除的HTML实体列表。输入参数:\n- html_entities:需要移除的HTML实体列表,默认为包含常见实体的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含移除HTML实体后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "html_entities", + "default": [ + "nbsp", + "lt", + "gt", + "amp", + "quot", + "apos", + "hellip", + "ndash", + "mdash", + "lsquo", + "rsquo", + "ldquo", + "rdquo" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 95, + "name": "HtmlUrlRemoverRefiner", + "description": "去除文本中的URL链接和HTML标签,净化文本内容。使用正则表达式匹配并移除各种形式的URL和HTML标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含净化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 96, + "name": "LowercaseRefiner", + "description": "将文本字段中的所有大写字符转换为小写,统一文本格式。对指定字段的文本内容进行全小写处理。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含小写转换后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 97, + "name": "NERRefiner", + "description": "使用命名实体识别(NER)技术识别并屏蔽文本中的特定实体。使用spaCy的'en_core_web_sm'模型识别实体,并将其替换为对应的实体类型标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含实体屏蔽后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 98, + "name": "PIIAnonymizeRefiner", + "description": "使用Presidio和BERT-NER模型识别并匿名化文本中的个人身份信息(PII)。支持多种PII类型的检测和匿名化处理。输入参数:\n- lang:语言代码,默认为'en'\n- device:运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- model_name:NER模型名称,默认为'dslim/bert-base-NER'\n- input_key:输入文本字段名\n输出参数:\n- 包含匿名化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "dslim/bert-base-NER", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 99, + "name": "ReferenceRemoverRefiner", + "description": "删除文本中未闭合的引用标签和引用链接,包括标签和{{cite}}模板的各种完整和不完整形式。净化文本中的引用标记。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含移除引用标记后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 100, + "name": "RemoveContractionsRefiner", + "description": "该算子用于扩展文本中的英语缩写词,将缩写形式转换为完整形式(例如将\"can't\"扩展为\"cannot\")。\n使用contractions库进行缩写词扩展,提高文本标准化程度。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含扩展缩写词后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 101, + "name": "RemoveEmojiRefiner", + "description": "该算子用于去除文本中的Unicode图像表情符号,包括表情符号、杂项符号、交通符号、旗帜等各类图像符号。\n通过正则表达式匹配Unicode表情符号范围,实现高效过滤。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除表情符号的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 102, + "name": "RemoveEmoticonsRefiner", + "description": "该算子用于移除文本中的文本型表情符号,例如':-)'、':D'、':('等字符组合表情。\n基于预定义的表情符号字典进行匹配替换,支持多种常见文本表情模式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除文本表情的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 103, + "name": "RemoveExtraSpacesRefiner", + "description": "该算子用于移除文本中的多余空格,将连续的多个空格替换为单个空格,并去除文本前后的空白字符。\n通过字符串分割和连接实现空格标准化,提高文本格式一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化空格的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 104, + "name": "RemoveImageRefsRefiner", + "description": "该算子用于去除文本中的图片引用格式,包括Markdown图片链接、图片编号、特殊符号组合等图像引用模式。\n通过多模式正则表达式匹配,识别并移除多种图片引用格式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除图片引用的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 105, + "name": "RemoveNumberRefiner", + "description": "该算子用于移除文本中的数字字符,包括0-9的阿拉伯数字。\n通过字符过滤实现数字移除,保留纯文本内容。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除数字的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 106, + "name": "RemovePunctuationRefiner", + "description": "该算子用于移除文本中的标点符号,包括英文标点符号集合中的所有符号。\n使用string.punctuation定义的标点集合进行过滤,实现文本去标点处理。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 107, + "name": "RemoveRepetitionsPunctuationRefiner", + "description": "该算子用于移除文本中重复的标点符号,例如将\"!!!\"变为\"!\",\",,\"变为\",\"。\n通过正则表达式匹配连续重复的标点符号,替换为单个符号。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 108, + "name": "RemoveStopwordsRefiner", + "description": "该算子用于移除文本中的英语停用词(如\"the\",\"is\",\"in\"等无实际意义的高频词汇)。\n使用NLTK库的stopwords语料库进行停用词过滤,提高文本特征密度。\n输入参数:\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除停用词的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 109, + "name": "SpellingCorrectionRefiner", + "description": "该算子用于通过SymSpell算法对文本中的拼写错误进行纠正,支持自定义编辑距离和词典路径。\n若本地词典不存在则自动下载,使用近似字符串匹配实现拼写纠错功能。\n输入参数:\n- max_edit_distance:最大编辑距离,默认为2\n- prefix_length:前缀长度,默认为7\n- dictionary_path:词典路径,默认为'frequency_dictionary_en_82_765.txt'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含纠正拼写错误的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "max_edit_distance", + "default": 2, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prefix_length", + "default": 7, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dictionary_path", + "default": "frequency_dictionary_en_82_765.txt", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 110, + "name": "StemmingLemmatizationRefiner", + "description": "该算子用于对文本进行词干提取或词形还原处理,将词语转换为其基本形式。\n支持两种处理方式:Porter词干提取(stemming)和WordNet词形还原(lemmatization),可通过参数选择。\n输入参数:\n- method:处理方法,可选'stemming'或'lemmatization',默认为'stemming'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含词干/词形还原后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "method", + "default": "stemming", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 111, + "name": "TextNormalizationRefiner", + "description": "该算子用于规范化文本中的日期格式和货币格式,统一为标准表示形式。\n日期格式统一转换为'YYYY-MM-DD'形式,货币格式转换为'金额 USD'形式,提高数据一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含格式规范化的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 112, + "name": "BertSampleEvaluator", + "description": "使用BERTScore评估生成文本与参考文本的相似度,基于上下文嵌入计算P/R/F1分数。\n输入参数:\n- lang:语言类型,默认为'en'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BertScore'\n输出参数:\n- 包含F1相似度得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "BertScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 113, + "name": "BleuSampleEvaluator", + "description": "计算BLEU分数评估生成文本与参考文本的n-gram重叠度,支持1-4元语法分析。\n输入参数:\n- n:最大n-gram长度,默认为4\n- eff:参考长度计算方式,可选'shortest'/'average'/'longest',默认为'average'\n- special_reflen:特殊参考长度,默认为None\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BleuScore'\n输出参数:\n- 包含BLEU得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "n", + "default": 4, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "eff", + "default": "average", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "special_reflen", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "BleuScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 114, + "name": "CiderSampleEvaluator", + "description": "使用CIDEr指标评估生成文本与参考文本的相似度,基于TF-IDF加权的n-gram重叠度。\n输入参数:\n- n:最大n-gram长度,默认为4\n- sigma:高斯惩罚参数,默认为6.0\n- df_mode:文档频率模式,默认为'coco-val-df'\n- idf_path:IDF文件路径,默认为预训练COCO数据集IDF\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'CiderScore'\n输出参数:\n- 包含CIDEr得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "n", + "default": 4, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "sigma", + "default": 6.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "df_mode", + "default": "coco-val-df", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "idf_path", + "default": "./dataflow/operators/general_pt/eval/cider/coco-val-df.p", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "CiderScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 115, + "name": "Task2VecDatasetEvaluator", + "description": "使用Task2Vec方法评估数据集的多样性,通过计算样本嵌入的余弦距离矩阵来量化多样性。\n输入参数:\n- device:计算设备,默认为'cuda'\n- sample_nums:采样次数,默认为10\n- sample_size:每次采样样本数,默认为1\n- method:嵌入方法,可选'montecarlo'或'variational',默认为'montecarlo'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n输出参数:\n- Task2VecDiversityScore:多样性得分\n- ConfidenceInterval:置信区间", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "sample_nums", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "sample_size", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "method", + "default": "montecarlo", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 116, + "name": "VendiDatasetEvaluator", + "description": "通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。\n输入参数:\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n输出参数:\n- BERTVendiScore:基于BERT的多样性得分\n- SimCSEVendiScore:基于SimCSE的多样性得分", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "knowledge_cleaning": [ + { + "node": 117, + "name": "KBCChunkGenerator", + "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "chunk_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "chunk_overlap", + "default": 50, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "split_method", + "default": "token", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_tokens_per_chunk", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "tokenizer_name", + "default": "bert-base-uncased", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "raw_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 118, + "name": "KBCChunkGeneratorBatch", + "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "chunk_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "chunk_overlap", + "default": 50, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "split_method", + "default": "token", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_tokens_per_chunk", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "tokenizer_name", + "default": "bert-base-uncased", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 119, + "name": "FileOrURLToMarkdownConverter", + "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "url", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "raw_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "intermediate_dir", + "default": "intermediate", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "mineru_backend", + "default": "vlm-sglang-engine", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 120, + "name": "FileOrURLToMarkdownConverterBatch", + "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "intermediate_dir", + "default": "intermediate", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "mineru_backend", + "default": "vlm-sglang-engine", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "source", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "text_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 121, + "name": "KBCTextCleaner", + "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改\n\n输入格式示例:\n
\n

标题文本

\n

正文段落,包括特殊符号,例如“弯引号”、–破折号等

\n \"示意图\"\n 链接文本\n
代码片段
\n ...\n
\n\n输出格式示例:\n标题文本\n\n正文段落,包括特殊符号,例如\"直引号\"、-破折号等\n\n[Image: 示例图 example.jpg]\n\n链接文本\n\n代码片段\n\n[结构保持,语义保留,敏感信息脱敏处理(如手机号、保密标记等)]", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [ + "KnowledgeCleanerPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "cleaned_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 122, + "name": "KBCTextCleanerBatch", + "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [ + "KnowledgeCleanerPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "cleaned_chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 123, + "name": "KBCMultiHopQAGeneratorBatch", + "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2MultiHopQAGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "seed", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "enhanced_chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 124, + "name": "QAExtractor", + "description": "QA对提取器 - 将嵌套的QA_pairs转换为Alpaca微调格式\n\n核心功能:\n从结构化的QA对数据中提取问答内容,自动整合推理步骤和支持事实,\n输出符合Stanford Alpaca标准的instruction-input-output格式。\n\n初始化参数:\n• qa_key: QA对的字段名 (默认: 'QA_pairs')\n• output_json_file: 输出JSON文件路径 (可选,不指定则只更新DataFrame)\n• instruction: 统一的指令前缀 (默认: 'Please answer the following question...')\n\n运行参数 (input_key):\n• None - 包含所有字段 (question + reasoning_steps + supporting_facts)\n• '' - 空字符串,不包含额外上下文\n• 'reasoning_steps' - 只包含推理步骤\n• 'question,reasoning_steps' - 逗号分隔多个字段\n• ['question', 'supporting_facts'] - 列表格式\n\n输出字段:\n• instruction: 问题指令\n• input: 上下文信息 (根据input_key动态拼接)\n• output: 答案\n\n适用场景: 知识库QA微调、领域问答模型训练", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "qa_key", + "default": "QA_pairs", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_json_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "instruction", + "default": "Please answer the following question based on the provided information.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "reasoning": [ + { + "node": 125, + "name": "ReasoningAnswerGenerator", + "description": "该算子用于为给定问题生成答案,调用大语言模型进行推理。\n输入参数:\n- llm_serving:LLM服务实例,用于生成答案\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- output_key:生成的答案字段,默认'generated_cot'", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathAnswerGeneratorPrompt", + "GeneralAnswerGeneratorPrompt", + "DiyAnswerGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 126, + "name": "ReasoningQuestionGenerator", + "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathQuestionSynthesisPrompt", + "GeneralQuestionSynthesisPrompt", + "DiyQuestionSynthesisPrompt" + ], + "parameter": { + "init": [ + { + "name": "num_prompts", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_synth_or_input_flag", + "default": "Synth_or_Input", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 127, + "name": "ReasoningAnswerExtractionQwenMathEvalGenerator", + "description": "该算子用于从数学问题回答中提取规范化答案表达式,进行字符串清洗、单位处理和格式标准化。\n\n输入参数:\n- input_key:输入数据字段名\n- answer_key:原始答案字段名\n- output_key:处理后的答案字段名\n- unit_texts:需要过滤的单位文本列表\n\n输出参数:\n- output_key:标准化后的数学表达式字段", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "dataset_name", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "pseudo_correct_solution_example", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "extraction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 128, + "name": "ReasoningPseudoAnswerGenerator", + "description": "该算子生成多个候选答案并通过统计选择最优解,实现伪答案生成。\n\n输入参数:\n- input_file:输入文件路径\n- output_file:输出文件路径\n- max_times:最大生成次数\n- selection_mode:统计选择模式(frequency/consistency)\n\n输出参数:\n- final_answer:最终选择答案字段\n- candidate_answers:候选答案列表字段", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathAnswerGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_times", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_answer", + "default": "pseudo_answers", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_answer_value", + "default": "pseudo_answer_value", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_solutions", + "default": "pseudo_solutions", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_correct_solution_example", + "default": "pseudo_correct_solution_example", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 129, + "name": "ReasoningPretrainFormatConvertGenerator", + "description": "该算子用于将SFT格式数据转换为预训练格式。\n\n输入参数:\n- read_key_question:问题字段名\n- read_key_answer:答案字段名\n- output_key:输出文本字段名\n\n输出参数:\n- output_key:输出文本字段名,包含问题和答案的拼接结果\n- 输出文件:转换后的预训练格式数据文件路径", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_read_key_question", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_read_key_answer", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 130, + "name": "ReasoningQuestionFusionGenerator", + "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathQuestionParallelFusionGeneratorPrompt", + "MathQuestionSequentialFusionGeneratorPrompt", + "MathQuestionConditionFusionGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "num_prompts", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_problem_1", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_problem_2", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 131, + "name": "ReasoningCategoryDatasetEvaluator", + "description": "该算子用于统计数据集中的类别信息,包括主类别和次类别的分布情况。它计算每个类别的样本数量,并返回类别分布的统计结果。\n输入参数:\n- input_primary_category_key:主类别字段名,默认为'primary_category'\n- input_secondary_category_key:次类别字段名,默认为'secondary_category'\n输出参数:\n- 返回包含类别统计信息的字典,主类别作为键,值为包含该类别样本数量和次类别分布的字典", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_primary_category_key", + "default": "primary_category", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_secondary_category_key", + "default": "secondary_category", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 132, + "name": "ReasoningDifficultyDatasetEvaluator", + "description": "该算子用于统计数据集中的难度信息,计算不同难度级别的样本数量分布。它统计每个难度级别的样本数量,并返回难度分布的统计结果。\n输入参数:\n- input_diffulty_key:难度分数字段名,默认为'difficulty_score'\n输出参数:\n- 返回包含难度统计信息的字典,难度级别作为键,值为该难度级别的样本数量", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_diffulty_key", + "default": "difficulty_score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 133, + "name": "ReasoningTokenDatasetEvaluator", + "description": "该算子用于统计数据集中问题和回答的token信息,包括token数量的最小值、最大值、平均值和中位数等统计指标。它使用指定的tokenizer对文本进行编码,并计算token长度的分布情况。\n输入参数:\n- input_question_key:问题文本字段名\n- input_answer_key:回答文本字段名\n- model_name_or_path:tokenizer模型名称或路径\n输出参数:\n- 返回包含token统计信息的字典,包括问题和回答的token数量的零值计数、最小值、最大值、平均值和中位数", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_name_or_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 134, + "name": "ReasoningQuestionCategorySampleEvaluator", + "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [ + "MathQuestionCategoryPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "question_category", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 135, + "name": "ReasoningQuestionDifficultySampleEvaluator", + "description": "该算子用于评估问题的难度等级。通过大语言模型分析问题复杂度,输出1-10级的难度评分。\n\n输入参数:\n- eval_stage:评估阶段标识\n- read_min/max_score:分数过滤阈值\n- 其他参数同ReasoningCategoryDatasetEvaluator\n\n输出参数:\n- difficulty_score:数值型难度评分(1-10)", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [ + "MathQuestionDifficultyPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "difficulty_score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 136, + "name": "ReasoningQuestionSolvableSampleEvaluator", + "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [ + "MathQuestionEvaluatorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 137, + "name": "ReasoningAnswerFormatterFilter", + "description": "该算子用于检查答案格式是否符合规范,主要验证数学答案是否包含正确的\\boxed{}标记。\n\n输入参数:\n- input_key:输入字段名\n- result_key:结果字段名\n\n输出参数:\n- 通过格式检查返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 138, + "name": "ReasoningAnswerGroundTruthFilter", + "description": "该算子用于对比预测答案与标准答案的匹配度,支持精确匹配和数学验证两种方式。\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(exact/math_verify)\n\n输出参数:\n- 匹配成功返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "compare_method", + "default": "math_verify", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_test_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_answer_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 139, + "name": "ReasoningAnswerNgramFilter", + "description": "该算子基于n-gram重复率过滤答案,检测回答中的重复模式。\n\n输入参数:\n- min_score:最小可接受分数\n- max_score:最大可接受分数\n- ngrams:n-gram大小\n\n输出参数:\n- 分数在范围内返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "ngrams", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 140, + "name": "ReasoningAnswerPipelineRootFilter", + "description": "答案处理流程根节点,负责将输入数据根据有无真实标签GT分发到不同处理分支。\n\n输入参数:\n- input_file:输入文件路径\n- output_dir:输出目录路径\n- branch_config:分支配置参数\n- parallel_workers:并行工作线程数\n\n输出参数:\n- 多个输出文件路径(根据分支配置生成)", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 141, + "name": "ReasoningAnswerTokenLengthFilter", + "description": "该算子根据token数量过滤过长的答案。\n\n输入参数:\n- max_answer_token_length:最大token数\n- tokenizer_dir:分词器路径\n- read_min/max_score:分数范围\n\n输出参数:\n- 长度合规返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "max_answer_token_length", + "default": 8192, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "tokenizer_dir", + "default": "Qwen/Qwen2.5-0.5B-Instruct", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 142, + "name": "ReasoningQuestionFilter", + "description": "该算子用于对问题进行正确性检查,包括格式是否规范、语义是否合理、条件是否矛盾以及是否具备充分信息可解。调用大语言模型依次执行四阶段判断,最终返回每个问题是否合格的二分类结果(保留合格样本)。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建检查提示词\n- input_key:输入问题字段名,默认为'math_problem'\n输出参数:\n- 过滤后的DataFrame,仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [ + "MathQuestionFilterPrompt", + "GeneralQuestionFilterPrompt", + "DiyQuestionFilterPrompt" + ], + "parameter": { + "init": [ + { + "name": "system_prompt", + "default": "You are a helpful assistant.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "math_problem", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 143, + "name": "ReasoningAnswerModelJudgeFilter", + "description": "该算子用于对答案进行正确性评判,通过比较当前答案与参考答案的语义一致性,判断答案是否正确。调用大语言模型进行语义理解和判断,最终返回每个答案是否正确的二分类结果。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建评判提示词\n- keep_all_samples:是否保留所有样本,默认为False(仅保留正确答案)\n- question_key:问题字段名,默认为'question'\n- answer_key:当前答案字段名,默认为'answer'\n- reference_key:参考答案字段名,默认为'reference_answer'\n输出参数:\n- DataFrame,包含原始数据和判断结果(answer_match_result字段)\n- 如果keep_all_samples为False,则仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [ + "AnswerJudgePromptQuestion", + "AnswerJudgePrompt" + ], + "parameter": { + "init": [ + { + "name": "system_prompt", + "default": "You are a helpful assistant specialized in evaluating answer correctness.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "keep_all_samples", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": "reference_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "text2sql": [ + { + "node": 144, + "name": "SQLConsistencyFilter", + "description": "对条目进行过滤,检测SQL和自然语言问题是否对应,即判断SQL是否能解决该问题。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n- input_question_key: 输入问题列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "filter" + }, + "allowed_prompts": [ + "SQLConsistencyFilterPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 145, + "name": "SQLExecutionFilter", + "description": "对条目进行过滤,在数据库中执行SQL,筛选掉不可执行的条目。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 146, + "name": "SQLGenerator", + "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "SelectSQLGeneratorPrompt", + "SelectVecSQLGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "generate_num", + "default": 300, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 147, + "name": "SQLByColumnGenerator", + "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "SelectSQLGeneratorPrompt", + "SelectVecSQLGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "generate_num", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 148, + "name": "SQLVariationGenerator", + "description": "对于每个条目,基于已有的SQL,指导模型生成SQL的变种,即在原有SQL的基础上,进行数据替换、函数变换、难度变换等操作,生成更加丰富的SQL。\n\n输入参数:\n- input_sql_key: SQL列名\n- input_db_id_key: 数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "SQLVariationGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_variations", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 149, + "name": "Text2SQLCoTGenerator", + "description": "对于每个条目,生成从自然语言问题和数据库Schema到SQL的CoT长链路推理过程。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_question_key: 输入问题列名\n- input_db_id_key: 输入数据库ID列名\n\n输出参数:\n- output_cot_key: 输出CoT列名", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2SQLCotGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_evidence_key", + "default": "evidence", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_cot_key", + "default": "cot_reasoning", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 150, + "name": "Text2SQLPromptGenerator", + "description": "从数据库提取Schema信息,结合自然语言问题生成提示词。其中提示词模版支持自定义。\n\n输入参数:\n- input_question_key: 问题列名\n- input_db_id_key: 数据库ID列名\n- output_prompt_key: 输出prompt列名\n\n输出参数:\n- output_prompt_key: 生成的prompt", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2SQLPromptGeneratorPrompt", + "Text2VecSQLPromptGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_evidence_key", + "default": "evidence", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_prompt_key", + "default": "prompt", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 151, + "name": "Text2SQLQuestionGenerator", + "description": "对于每个条目,如果自然语言问题为空,生成SQL对应的自然语言问题。为保证正确,生成多个候选问题,并选择最优的。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 数据库ID列名\n\n输出参数:\n- output_question_key: 输出问题列名", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2SQLQuestionGeneratorPrompt", + "Text2VecSQLQuestionGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "embedding_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "question_candidates_num", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_evidence_key", + "default": "evidence", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 152, + "name": "SQLComponentClassifier", + "description": "根据SQL的组件数量和复杂度,评估SQL的难度。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", + "type": { + "level_1": "text2sql", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "difficulty_thresholds", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "difficulty_labels", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_difficulty_key", + "default": "sql_component_difficulty", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 153, + "name": "SQLExecutionClassifier", + "description": "让模型根据自然语言问题、数据库Schema和提示词,多次生成SQL,通过生成SQL的准确率,评估该问题对于模型的难度。\n\n输入参数:\n- input_db_id_key: 输入数据库ID列名\n- input_sql_key: 输入SQL列名\n- input_prompt_key: 输入prompt列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", + "type": { + "level_1": "text2sql", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_generations", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "difficulty_thresholds", + "default": [ + 2, + 5, + 9 + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "difficulty_labels", + "default": [ + "extra", + "hard", + "medium", + "easy" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_prompt_key", + "default": "rl_prompt", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_difficulty_key", + "default": "sql_execution_difficulty", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "text_pt": [ + { + "node": 154, + "name": "CCNetDeduplicateFilter", + "description": "CCNet去重方法,基于SHA-1哈希算法的前N位进行重复识别,实现精确去重。\n\n初始化参数:\n- bit_length: 哈希值的位数,默认为64位\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "bit_length", + "default": 64, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 155, + "name": "DebertaV3SampleEvaluator", + "description": "基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量并返回分类结果。\n输入参数:\n- model_name:预训练模型名称\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n- output_key:输出分类结果字段名,默认为'Debertav3Score'\n输出参数:\n- 包含文本质量分类结果的DataFrame", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_name", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "Debertav3Score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 156, + "name": "DebertaV3Filter", + "description": "基于DebertaV3Scorer打分器的得分对数据进行过滤。使用Nvidia Deberta V3模型的质量分类器评估文本质量。\n\n初始化参数:\n- allowed_scores: 允许通过的分数列表,默认为['Medium', 'High']\n- model_name: 模型名称,默认为'nvidia/quality-classifier-deberta'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n- batch_size: 批处理大小,默认为16\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'Debertav3Score'\n\n过滤逻辑:保留分类结果在allowed_scores列表中的数据", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "allowed_scores", + "default": [ + "Medium", + "High" + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "nvidia/quality-classifier-deberta", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "batch_size", + "default": 16, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "Debertav3Score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 157, + "name": "FineWebEduSampleEvaluator", + "description": "基于Fineweb-Edu分类器评估文本的教育价值。该分类器使用预训练的序列分类模型对文本进行评估,返回0-1之间的分数,分数越高表示文本的教育价值越高。适用于筛选具有教育意义的文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 0-1之间的教育价值分数,越高表示教育价值越大", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "FinewebEduScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 158, + "name": "FineWebEduFilter", + "description": "基于FineWebEduScorer打分器的得分对数据进行过滤。Fineweb-Edu是一个用于评估文本教育价值的分类器。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'FinewebEduScore'\n\n评分标准:0-5分,分数越高表示文本具有越高的教育价值\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 2.5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10000, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "FinewebEduScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 159, + "name": "PairQualSampleEvaluator", + "description": "基于BGE模型和GPT成对比较数据训练的文本质量评分器,支持中英文输入。通过对文本进行单样本评估,返回0-1之间的质量分数,分数越高表示文本质量越好。模型分为英文版本(zks2856/PairQual-Scorer-en)和中文版本(zks2856/PairQual-Scorer-zh)。\n输入参数:\n- text: 待评估的文本字符串\n- lang: 语言类型,可选'en'或'zh'\n输出参数:\n- float: 0-1之间的质量分数,越高表示质量越好", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PairQualScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 160, + "name": "PairQualFilter", + "description": "基于PairQualScorer打分器的得分对数据进行过滤。基于BGE模型,使用GPT对文本成对比较打分后训练而成的双语文本质量评分器,得分越高表示质量越高。\n输入参数:\n- min_score:最小质量得分阈值\n- max_score:最大质量得分阈值\n- model_cache_dir:模型缓存目录路径\n- lang:文本语言类型\n输出参数:\n- 过滤后的DataFrame,仅保留质量得分在指定范围内的文本\n- 返回包含质量得分字段名的列表", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10000, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PairQualScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 161, + "name": "PerplexitySampleEvaluator", + "description": "基于Huggingface语言模型计算文本的困惑度(Perplexity),困惑度越低表示文本的流畅性和可理解性越高。输入参数:\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- float: 困惑度值,越低表示文本流畅性越好", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_name", + "default": "gpt2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 162, + "name": "PerplexityFilter", + "description": "基于PerplexityScorer打分器的得分对数据进行过滤。基于Huggingface模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。\n输入参数:\n- min_score:最小困惑度阈值\n- max_score:最大困惑度阈值\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- 过滤后的DataFrame,仅保留困惑度在指定范围内的文本\n- 返回包含困惑度得分字段名的列表", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 10.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 500.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "gpt2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 163, + "name": "QuratingSampleEvaluator", + "description": "通过Qurating模型(princeton-nlp/QuRater-1.3B)从四个维度评估文本质量:写作风格(writing_style)、所需专业程度(required_expertise)、事实与趣闻(facts_and_trivia)和教育价值(educational_value)。每个维度返回0-1之间的分数,综合评估文本的整体质量。\n输入参数:\n- text: 待评估的文本字符串\n- labels: 评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n输出参数:\n- dict: 包含各维度分数的字典,键为维度名称,值为0-1之间的分数", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "map_batch_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_workers", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device_batch_size", + "default": 16, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "labels", + "default": [ + "writing_style", + "required_expertise", + "facts_and_trivia", + "educational_value" + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 164, + "name": "QuratingFilter", + "description": "基于QuratingScorer打分器的得分对数据进行过滤。通过Qurating模型从四个维度评估文本质量:写作风格、所需专业知识、事实与 trivia 内容、教育价值。\n每个维度评分范围为0-9分,综合判断文本质量,可用于筛选高质量教育类或知识类内容。\n输入参数:\n- min_scores:各维度保留样本的最小分数阈值,默认为{'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n- max_scores:各维度保留样本的最大分数阈值,默认为{'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n- map_batch_size:映射批次大小,默认为512\n- num_workers:数据加载工作进程数,默认为1\n- device_batch_size:设备批次大小,默认为16\n- device:模型运行设备,默认为'cuda'\n- labels:评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留所有维度分数均在对应阈值范围内的样本\n- 返回包含各维度过滤结果字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_scores", + "default": { + "writing_style": 0, + "required_expertise": 0, + "facts_and_trivia": 0, + "educational_value": 0 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_scores", + "default": { + "writing_style": 9, + "required_expertise": 9, + "facts_and_trivia": 9, + "educational_value": 9 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "map_batch_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_workers", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device_batch_size", + "default": 16, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "labels", + "default": [ + "writing_style", + "required_expertise", + "facts_and_trivia", + "educational_value" + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 165, + "name": "TextbookSampleEvaluator", + "description": "基于FastText分类器(kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2)评估文本的教育价值,将文本分为低(Low)、中(Mid)、高(High)三个等级,并映射为1.0、3.0、5.0的分数。适用于筛选适合作为教材的高质量文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 教育价值分数,可能值为1.0(低)、3.0(中)、5.0(高)", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TextbookScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 166, + "name": "TextbookFilter", + "description": "基于TextbookScorer打分器的得分对数据进行过滤。使用FastText分类器评估文本的教育价值,判断文本是否适合作为教材内容。\n分类器经过训练可识别具有教育意义、结构清晰、知识准确的文本,适用于构建教育类数据集。\n输入参数:\n- min_score:保留样本的最小教育价值分数阈值,默认为0.99\n- max_score:保留样本的最大教育价值分数阈值,默认为1.0\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n- output_key:教育价值分数字段名,默认为'TextbookScore'\n输出参数:\n- 过滤后的DataFrame,仅保留教育价值分数在[min_score, max_score]范围内的样本\n- 返回包含教育价值分数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.99, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TextbookScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 167, + "name": "Phi4QAGenerator", + "description": "基于给定文档内容,生成预训练格式的多轮对话问答数据。将原始文档内容转换为适合语言模型预训练的对话格式数据。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入文档内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含原始内容和生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", + "type": { + "level_1": "text_pt", + "level_2": "generate" + }, + "allowed_prompts": [ + "Phi4QAGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 168, + "name": "MetaSampleEvaluator", + "description": "通过LLM评估文本的多个元属性,包括文本结构、多样性与复杂性、流畅性与可理解性、安全性、教育价值以及内容准确性与有效性。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimensions:评估维度列表,每个维度对应的字典中包含dimension_name,description,和示例字段:\n * dimension_name:维度名称\n * description:维度的描述\n * example_list:包含示例文本和得分的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含6个评估维度得分的DataFrame,列名为:Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [ + "MetaPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dimensions", + "default": [ + { + "dimension_name": "Text Structure", + "description": "Evaluate the surface-level quality of the text, including spelling accuracy, grammar, vocabulary richness, and sentence structure.", + "example_list": [ + { + "text": "The experimental procedure was meticulously documented, with each variable clearly defined.", + "score": "5" + }, + { + "text": "teh data was wrong and we dont no why it happen like that", + "score": "2" + } + ] + }, + { + "dimension_name": "Diversity and Complexity", + "description": "Assess how rich and conceptually varied the content is, and whether it requires expert or deep reasoning to understand.", + "example_list": [ + { + "text": "This article compares Bayesian inference and frequentist approaches in statistical modeling, highlighting theoretical and practical trade-offs.", + "score": "5" + }, + { + "text": "Dogs are pets. They bark. They are friendly.", + "score": "2" + } + ] + }, + { + "dimension_name": "Fluency and Understandability", + "description": "Evaluate whether the text flows naturally, is easy to follow, and avoids awkward or disjointed phrasing.", + "example_list": [ + { + "text": "Despite initial challenges, the team successfully completed the deployment by adhering to a revised strategy.", + "score": "5" + }, + { + "text": "The problem was and then fixed by something happens deployment successful maybe.", + "score": "2" + } + ] + }, + { + "dimension_name": "Safety", + "description": "Identify whether the text contains profanities, hate speech, or excessive personally identifiable information (PII).", + "example_list": [ + { + "text": "The software collects anonymous usage data to improve performance.", + "score": "5" + }, + { + "text": "You idiot, your address 123 Main St will be posted online.", + "score": "1" + } + ] + }, + { + "dimension_name": "Educational Value", + "description": "Determine whether the text provides insight, stimulates thinking, or offers meaningful learning potential.", + "example_list": [ + { + "text": "Understanding the principles of thermodynamics allows engineers to design more efficient engines.", + "score": "5" + }, + { + "text": "The sky is blue. Water is wet. This is how it is.", + "score": "2" + } + ] + }, + { + "dimension_name": "Content Accuracy and Effectiveness", + "description": "Assess the truthfulness, relevance, and practical usefulness of the content.", + "example_list": [ + { + "text": "Newton's second law states that F = ma, which explains the relationship between force, mass, and acceleration.", + "score": "5" + }, + { + "text": "The Earth is flat and doesn't rotate around the Sun.", + "score": "1" + } + ] + } + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "text_sft": [ + { + "node": 169, + "name": "AlpagasusSampleEvaluator", + "description": "通过调用GPT评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimension:评估维度,默认为'quality'\n- input_instruction_key:指令字段名\n- input_input_key:输入文本字段名\n- input_output_key:输出文本字段名\n- output_key:输出得分字段名,默认'AlpagasusScore'\n输出参数:\n- 包含评估得分的DataFrame", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [ + "AlpagasusPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dimension", + "default": "quality", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "AlpagasusScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 170, + "name": "DeitaQualitySampleEvaluator", + "description": "基于Llama模型的Deita指令质量评估器,通过生成1-6分的质量评分评估指令质量。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaQualityScore'\n输出参数:\n- 包含指令质量评分的DataFrame(1-6分)", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaQualityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 171, + "name": "DeitaComplexitySampleEvaluator", + "description": "基于Llama模型的Deita指令复杂性评估器,通过生成1-6分的复杂性评分评估指令难度。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaComplexityScore'\n输出参数:\n- 包含指令复杂性评分的DataFrame(1-6分)", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaComplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 172, + "name": "InstagSampleEvaluator", + "description": "使用Instag评分器评估指令的内容多样性和意图标签。通过分析指令文本生成相关标签,标签数量越多表示内容多样性越大,同时返回标签的详细解释。基于OFA-Sys/InsTagger模型实现。\n输入参数:\n- query: 待评估的指令文本\n输出参数:\n- int: 标签数量(内容多样性指标)\n- list: 包含标签和解释的字典列表", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_new_tokens", + "default": 1024, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "temperature", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "do_sample", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_return_sequences", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "return_dict_in_generate", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "InstagScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 173, + "name": "RMSampleEvaluator", + "description": "基于人类偏好数据训练的奖励模型(OpenAssistant/reward-model-deberta-v3-large-v2)对文本质量进行打分,高分代表质量较高。模型输入为指令和响应文本对,输出0-1之间的奖励分数,反映人类对文本质量的偏好判断。\n输入参数:\n- instruction: 指令文本字符串\n- output: 响应文本字符串\n输出参数:\n- float: 0-1之间的奖励分数,越高表示质量越好", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "RMScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 174, + "name": "SuperfilteringSampleEvaluator", + "description": "使用Superfiltering方法评估指令的跟随难度,基于GPT-2模型计算条件困惑度与独立困惑度的比值,得分越高表示指令越难跟随。该方法通过比较指令条件下的响应困惑度与独立响应困惑度,评估指令的清晰度和跟随难度。\n输入参数:\n- instruction: 指令文本\n- input_text: 输入文本(可选)\n- output: 响应文本\n输出参数:\n- float: 困惑度比值,越高表示指令跟随难度越大", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "SuperfilteringScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 175, + "name": "TreeinstructSampleEvaluator", + "description": "通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:指令字段名\n- output_key:输出得分字段名,默认'TreeinstructScore'\n输出参数:\n- 包含指令复杂性得分的DataFrame", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [ + "TreeinstructPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TreeinstructScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 176, + "name": "AlpagasusFilter", + "description": "基于AlpagasusScorer打分器的得分对数据进行过滤。通过调用GPT模型评估指令的质量,返回一个质量得分。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3\n- max_score: 最高分数阈值,默认为5\n- llm_serving: LLM服务实例\n- dimension: 评估维度,默认为'quality'(质量)\n\n运行参数:\n- input_instruction_key: 输入指令字段名\n- input_input_key: 输入内容字段名\n- input_output_key: 输出内容字段名\n- output_key: 输出分数字段名,默认为'AlpagasusScore'\n\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dimension", + "default": "quality", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "AlpagasusScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 177, + "name": "DeitaQualityFilter", + "description": "基于DeitaQualityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令质量评估器,评估指令的质量高低。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaQualityScore'\n\n评分标准:1-6分,分数越高表示指令质量越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 2.5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10000.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaQualityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 178, + "name": "DeitaComplexityFilter", + "description": "基于DeitaComplexityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令复杂性评估器,评估指令的复杂程度。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3.0\n- max_score: 最高分数阈值,默认为5.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaComplexityScore'\n\n评分标准:1-6分,分数越高表示指令复杂性越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 3.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaComplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 179, + "name": "InstagFilter", + "description": "基于InstagScorer打分器的过滤算子。使用预训练的Instag模型对指令进行分析,返回标签的数量来评估指令的内容多样性。参数包括模型缓存目录(model_cache_dir)、计算设备(device)和最大新生成标记数(max_new_tokens)。过滤范围由min_score和max_score参数控制,标签越多表示内容多样性越大。", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_new_tokens", + "default": 1024, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "InstagScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 180, + "name": "RMFilter", + "description": "基于RMScorer打分器的得分对数据进行过滤。使用基于人类偏好数据训练的奖励模型对文本质量进行评分,高分代表质量较高。\n奖励模型能够评估文本的相关性、有用性、无害性等人类偏好指标,可用于筛选符合人类价值观的高质量文本。\n输入参数:\n- min_score:保留样本的最小奖励分数阈值,默认为0.2\n- max_score:保留样本的最大奖励分数阈值,默认为0.8\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_output_key:输出字段名,默认为'output'\n输出参数:\n- 过滤后的DataFrame,仅保留奖励分数在[min_score, max_score]范围内的样本\n- 返回包含奖励分数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.2, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 0.8, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "RMScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 181, + "name": "SuperfilteringFilter", + "description": "使用Superfiltering评分器过滤掉低质量数据。基于GPT-2模型计算困惑度比值来评估指令跟随难度,比值越低表示指令越容易被模型理解和执行。\n适用于筛选适合特定模型能力的指令数据,提高模型训练效率和效果。\n输入参数:\n- min_score:保留样本的最小分数阈值,默认为0.0\n- max_score:保留样本的最大分数阈值,默认为1.0\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:文本最大长度,默认为512\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_input_key:输入字段名,默认为'input'\n- input_output_key:输出字段名,默认为'output'\n- output_key:过滤结果分数字段名,默认为'SuperfilteringScore'\n输出参数:\n- 过滤后的DataFrame,仅保留分数在[min_score, max_score]范围内的样本\n- 返回包含过滤结果分数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": "input", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "SuperfilteringScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 182, + "name": "TreeinstructFilter", + "description": "基于TreeinstructScore打分器的得分对数据进行过滤。通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n适用于筛选特定复杂度范围内的指令数据,平衡数据集难度分布,优化模型训练效果。\n输入参数:\n- min_score:保留样本的最小语法树节点数阈值,默认为7\n- max_score:保留样本的最大语法树节点数阈值,默认为100\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入指令字段名\n- output_key:语法树节点数字段名,默认为'TreeinstructScore'\n输出参数:\n- 过滤后的DataFrame,仅保留语法树节点数在[min_score, max_score]范围内的样本\n- 返回包含语法树节点数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 7, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TreeinstructScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 183, + "name": "CondorGenerator", + "description": "基于预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量)。第一阶段生成不同难度级别的问题,第二阶段为每个问题生成对应的答案。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_samples:生成样本总数,建议小于5000,默认值为15\n输出参数:\n- 包含'difficulty'、'instruction'和'output'字段的DataFrame\n- 返回生成的DataFrame用于后续处理", + "type": { + "level_1": "text_sft", + "level_2": "generate" + }, + "allowed_prompts": [ + "CondorQuestionPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_samples", + "default": 15, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_task_diversity", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 184, + "name": "SFTGeneratorSeed", + "description": "基于给定文档内容,生成监督微调格式的问答数据。并支持用户自定义生成内容要求。从原始文档中提取信息,生成符合SFT格式的指令-响应对。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- custom_prompt:用户自定义提示词\n- input_key:输入文档内容字段名,默认为'raw_content'\n- max_tokens:生成文本的最大token数,默认为4096\n输出参数:\n- 包含'instruction'、'output'和'raw_content'字段的DataFrame\n- 返回包含'instruction'和'output'字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "custom_prompt", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 185, + "name": "CondorRefiner", + "description": "两阶段优化指令回复质量:第一阶段调用API生成对回复的评论,第二阶段利用评论调用API改写回复,提升指令对质量。通过迭代优化提高问答对的整体质量。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:输入指令字段名,默认为'instruction'\n- input_output_key:输入回复字段名,默认为'output'\n输出参数:\n- 包含优化后回复的DataFrame\n- 返回包含优化后回复字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "refine" + }, + "allowed_prompts": [ + "CondorCritiquePrompt", + "CondorRefinePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "vqa": [ + { + "node": 186, + "name": "VQAExtractPdf2Img", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "dpi", + "default": 300, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_pdf_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 187, + "name": "VQAExtractDocLayoutMinerU", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "mineru_backend", + "default": "vlm-transformers", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_pdf_file_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 188, + "name": "VQAExtractPicExtractor", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [ + "VQAExtractPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "interleaved", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_subject", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 189, + "name": "VQAExtractQAPairExtractor", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_vqa_extract_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_qa_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 190, + "name": "VQAExtractTag2Img", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "layout_prefix", + "default": "doclay_page_", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "image_prefix", + "default": "page_", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_json", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_pdf_image_dir", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_dir", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_qa_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_qa_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_md_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 191, + "name": "VQAClipHeader", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_image_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_prefix", + "default": "doclay", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 192, + "name": "VQAConcatenateImages", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ], + "Default": [ + { + "node": 1, + "name": "AgenticRAGQAF1SampleEvaluator", + "description": "用于评估预测答案与多个参考答案之间的 F1 分数", + "type": { + "level_1": "agentic_rag", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_prediction_key", + "default": "refined_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_ground_truth_key", + "default": "golden_doc_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "F1Score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 2, + "name": "AgenticRAGAtomicTaskGenerator", + "description": "该算子用于为提供的文本内容生成合适的高质量问题与可验证答案。\n\n输入参数:\n- input_key: 输入文本内容字段名(默认值:\"prompts\")\n- output_question_key: 输出问题字段名(默认值:\"question\")\n- output_answer_key: 输出答案字段名(默认值:\"answer\")\n- output_refined_answer_key: 输出精炼答案字段名(默认值:\"refined_answer\")\n- output_optional_answer_key: 输出可替代精炼答案字段名(默认值:\"optional_answer\")\n- output_golden_doc_answer_key: 输出黄金文档回答字段名(默认值:\"golden_doc_answer\")\n", + "type": { + "level_1": "agentic_rag", + "level_2": "generate" + }, + "allowed_prompts": [ + "AtomicTaskGeneratorGetIdentifierPrompt", + "AtomicTaskGeneratorGetConlcusionPrompt", + "AtomicTaskGeneratorQuestionPrompt", + "AtomicTaskGeneratorCleanQAPrompt", + "AtomicTaskGeneratorAnswerPrompt", + "AtomicTaskGeneratorRecallScorePrompt", + "AtomicTaskGeneratorOptionalAnswerPrompt", + "AtomicTaskGeneratorGoldenDocAnswerPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "data_num", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_per_task", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_question", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "prompts", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_key", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_refined_answer_key", + "default": "refined_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_optional_answer_key", + "default": "optional_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_llm_answer_key", + "default": "llm_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_golden_doc_answer_key", + "default": "golden_doc_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 3, + "name": "AgenticRAGDepthQAGenerator", + "description": "该算子以已有问答生成更深度的问题。\n\n输入参数:\n- input_key: 输入字段名(默认值:\"question\")\n- output_key: 输出字段名(默认值:\"depth_question\")\n", + "type": { + "level_1": "agentic_rag", + "level_2": "generate" + }, + "allowed_prompts": [ + "DepthQAGeneratorGetIdentifierPrompt", + "DepthQAGeneratorBackwardTaskPrompt", + "DepthQAGeneratorSupersetCheckPrompt", + "DepthQAGeneratorQuestionPrompt", + "DepthQAGeneratorAnswerPrompt", + "DepthQAGeneratorRecallScorePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "n_rounds", + "default": 2, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "depth_question", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 4, + "name": "AgenticRAGWidthQAGenerator", + "description": "该算子用于结合两个问答,生成新的问题。\n\n输入参数:\n- input_question_key: 输入问题字段名(默认值:\"question\")\n- input_identifier_key: 输入标识符字段名(默认值:\"identifier\")\n- input_answer_key: 输入答案字段名(默认值:\"answer\")\n- output_question_key: 输出问题字段名(默认值:\"generated_width_task\")\n", + "type": { + "level_1": "agentic_rag", + "level_2": "generate" + }, + "allowed_prompts": [ + "WidthQAGeneratorMergePrompt", + "WidthQAGeneratorOriginCheckPrompt", + "WidthQAGeneratorQuestionVerifyPrompt", + "WidthQAGeneratorAnswerPrompt", + "WidthQAGeneratorRecallScorePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_identifier_key", + "default": "identifier", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_key", + "default": "generated_width_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 5, + "name": "ExtractSmilesFromTextGenerator", + "description": "ExtractSmilesFromText 用于从 OCR 文本中抽取或解析化学分子的 SMILES 表达式。算子会根据给定的提示模板(prompt_template),结合文本内容和(可选的)单体缩写信息,调用大语言模型完成解析与结构化,并将结果以 JSON 格式写回到指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- prompt_template:提示词模板对象,用于构造模型输入\n- input_content_key: OCR 文本的列名(默认 'text')\n- input_abbreviation_key:包含缩写/单体信息的列名(默认 'abbreviations'),可为空\n- output_key:写回抽取结果的列名(默认 'synth_smiles')\n\n输出参数:\n- DataFrame,其中 output_key 列为模型返回并经 JSON 解析后的 SMILES 结构\n- 返回 output_key,供后续算子引用\n\n备注:\n- 模型输出会尝试解析为 JSON;若解析失败,将返回 [] 并记录失败次数。", + "type": { + "level_1": "chemistry", + "level_2": "generate" + }, + "allowed_prompts": [ + "ExtractSmilesFromTextPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_content_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_abbreviation_key", + "default": "abbreviations", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "synth_smiles", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 6, + "name": "SmilesEquivalenceDatasetEvaluator", + "description": "评估 golden_label 与 synth_smiles 的 SMILES 等价性并计算分数。逐块输出 final_result、块内得分与准确率,并统计全局总分。", + "type": { + "level_1": "chemistry", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_golden_key", + "default": "golden_label", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_synth_key", + "default": "synth_smiles", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "final_result", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 7, + "name": "CodeAutoGeneratedSampleEvaluator", + "description": "基于自动生成标记评估代码样本,检测文件头部的自动生成标记。\n\n评估指标:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分 (0-1,1表示非自动生成)\n\n输入要求:需要包含'lines'列\n\n输出参数:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "is_generated_func", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 8, + "name": "CodeAutoGeneratedFilter", + "description": "基于CodeAutoGeneratedSampleEvaluator的得分过滤自动生成的代码文件,确保只保留人工编写的代码。\n\n评估指标:\n- 自动生成标记数量:检测文件前5行中的自动生成标记\n- 检测标记:'auto-generated', 'autogenerated', 'automatically generated'等\n- 综合自动生成得分:0-1,1表示非自动生成\n- 支持外部检测函数进行额外验证\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'列)\n- output_key: 输出标签字段名 (默认: 'auto_generated_filter_label')\n- min_score: 最小自动生成得分阈值 (默认: 1.0)\n- max_score: 最大自动生成得分阈值 (默认: 1.0)\n- is_generated_func: 可选的外部检测函数\n\n输出参数:\n- 过滤后的DataFrame,仅保留自动生成得分在指定范围内的代码样本\n- 返回包含自动生成得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "is_generated_func", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "auto_generated_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 9, + "name": "CodeDocumentQualitySampleEvaluator", + "description": "基于综合文档级质量指标评估代码样本,包括内容长度、重复模式、字符组成和文本熵值。\n\n评估指标:\n- CodeDocumentQualityCharCount: 字符数\n- CodeDocumentQualityWordCount: 词数\n- CodeDocumentQualityDuplicateLinesRatio: 重复行比例\n- CodeDocumentQualityDuplicateNgramRatio: n-gram重复比例\n- CodeDocumentQualityCurlyBracketRatio: 花括号比例\n- CodeDocumentQualityAllCapsRatio: 全大写单词比例\n- CodeDocumentQualityEntropy: 单字符熵值\n- CodeDocumentQualityScore: 综合文档质量得分 (0-1,1表示通过所有质量检查)\n\n输入要求:需要包含'text'、'filename'、'language'列\n\n输出参数:\n- 各种质量指标的数值\n- CodeDocumentQualityScore: 综合文档质量得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "thresholds", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 10, + "name": "CodeDocumentQualityFilter", + "description": "基于CodeDocumentQualitySampleEvaluator的得分应用综合文档级质量过滤规则,移除低质量代码和文本样本。\n\n评估指标:\n- 内容长度:字符数、词数、行数范围检查\n- 重复模式:重复行比例、2-10gram重复比例\n- 字符组成:花括号比例、全大写单词比例\n- 文本熵值:单字符熵值检查\n- 综合文档质量得分:0-1,1表示通过所有质量检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'、'filename'、'language'列)\n- output_key: 输出标签字段名 (默认: 'doc_quality_filter_label')\n- min_score: 最小文档质量得分阈值 (默认: 1.0)\n- max_score: 最大文档质量得分阈值 (默认: 1.0)\n- thresholds: 可选的阈值字典,用于覆盖默认阈值\n\n输出参数:\n- 过滤后的DataFrame,仅保留文档质量得分在指定范围内的样本\n- 返回包含文档质量得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "thresholds", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "doc_quality_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 11, + "name": "CodeEncodedDataSampleEvaluator", + "description": "基于编码数据模式评估代码样本,检测Base64、十六进制和Unicode转义序列。\n\n评估指标:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分 (0-1,1表示通过编码数据检查)\n\n输入要求:需要包含'text'列\n\n输出参数:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 12, + "name": "CodeEncodedDataFilter", + "description": "基于CodeEncodedDataSampleEvaluator的得分过滤代码样本,移除二进制内容和自动生成代码。\n\n评估指标:\n- Base64编码数据比例:检测连续64+字符的Base64字符串\n- 十六进制数据比例:检测8+个连续的十六进制对\n- Unicode转义序列比例:检测8+个连续的\\uXXXX序列\n- 综合编码数据得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'列)\n- output_key: 输出标签字段名 (默认: 'encoded_data_filter_label')\n- min_score: 最小编码数据得分阈值 (默认: 1.0)\n- max_score: 最大编码数据得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留编码数据得分在指定范围内的代码样本\n- 返回包含编码数据得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "encoded_data_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 13, + "name": "CodeFileTypeContentFilter", + "description": "基于文件类型和内容特征直接过滤代码样本,针对不同文件格式应用特定规则。\n\n过滤规则:\n- Text/JSON/YAML/Graphviz文件:行数 > 512 行\n- HTML文件:可见文本长度 < 100字符 或 可见文本比例 < 20%\n- Text文件:文件名不符合文档规范(非readme/notes/todo等)\n\n输入参数:\n- input_key: 输入字段名(需要包含'filetype'、'filename'、'line_count'等列)\n- output_key: 输出标签字段名 (默认: 'file_type_content_filter_label')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合文件类型规则的样本\n- 返回包含输出标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "file_type_content_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 14, + "name": "CodeLengthSampleEvaluator", + "description": "基于代码长度特征评估代码样本,分析总行数、平均行长和最大行长。\n\n评估指标:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分 (0-1,1表示通过所有长度检查)\n\n输入要求:需要包含'lines'和'language'列\n\n输出参数:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 15, + "name": "CodeLengthSampleFilter", + "description": "基于CodeLengthSampleEvaluator的得分过滤代码样本,移除超大文件和格式不良的代码。\n\n评估指标:\n- 总行数:检查是否超过100,000行\n- 平均行长:普通语言>100字符,特殊语言>100,000字符\n- 最大行长:普通语言>1,000字符\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'和'language'列)\n- output_key: 输出标签字段名 (默认: 'length_filter_label')\n- min_score: 最小长度得分阈值 (默认: 1.0)\n- max_score: 最大长度得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留长度得分在指定范围内的代码样本\n- 返回包含长度得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "length_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 16, + "name": "CodeQualitySampleEvaluator", + "description": "该算子用于评估生成的代码片段与其源指令的匹配质量,并输出分数和反馈。\n\n输入参数:\n- input_instruction_key: 包含人类指令的字段名 (默认: 'generated_instruction')\n- input_code_key: 包含生成代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_score_key: 用于存储质量分数的字段名 (默认: 'quality_score')\n- output_feedback_key: 用于存储质量反馈的字段名 (默认: 'quality_feedback')\n", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [ + "CodeQualityEvaluatorPrompt", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_code_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_score_key", + "default": "quality_score", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_feedback_key", + "default": "quality_feedback", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 17, + "name": "CodeQualityScoreFilter", + "description": "基于LLM生成的代码质量分数过滤代码样本,评估正确性、完整性、清晰度、最佳实践和效率。\n\n评估维度:\n- 正确性:代码语法和逻辑是否正确\n- 完整性:代码是否完整实现功能\n- 清晰度:代码是否清晰易懂\n- 最佳实践:是否遵循编程最佳实践\n- 效率:代码执行效率如何\n\n输入参数:\n- input_code_key: 输入代码字段名\n- input_instruction_key: 输入指令字段名\n- output_score_key: 输出打分字段名 (默认: 'quality_score')\n- output_feedback_key: 输出反馈字段名 (默认: 'quality_feedback')\n- output_key: 输出过滤标签字段名 (默认: 'quality_score_filter_label')\n- min_score: 最小质量分数阈值 (默认: 7)\n- max_score: 最大质量分数阈值 (默认: 10)\n\n输出参数:\n- 过滤后的DataFrame,仅保留质量分数在指定范围内的代码样本\n- 返回包含质量分数标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_score", + "default": 7, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_code_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_score_key", + "default": "quality_score", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_feedback_key", + "default": "quality_feedback", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "quality_score_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 18, + "name": "CodeGenericScoreFilter", + "description": "基于数值分数列直接过滤数据集,提供灵活的阈值比较方法。\n\n比较方法:\n- greater_equal: 分数 >= 阈值\n- greater: 分数 > 阈值\n- less_equal: 分数 <= 阈值\n- less: 分数 < 阈值\n- equal: 分数 = 阈值\n\n输入参数:\n- input_key: 包含分数的字段名\n- output_key: 输出标签字段名 (默认: 'generic_score_filter_label')\n- score_threshold: 分数阈值 (默认: 8)\n- filter_method: 比较方法 (默认: 'greater_equal')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合分数条件的样本\n- 返回包含输出标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "score_threshold", + "default": 8, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "filter_method", + "default": "greater_equal", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generic_score_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 19, + "name": "CodeTextCompositionSampleEvaluator", + "description": "基于字符组成评估代码样本,分析字母字符和字母数字字符的比例。\n\n评估指标:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分 (0-1,1表示通过字符组成检查)\n\n输入要求:需要包含'text'和'language'列\n\n输出参数:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 20, + "name": "CodeTextCompositionFilter", + "description": "基于CodeTextCompositionSampleEvaluator的得分过滤代码样本,移除二进制文件、加密内容和不可读文本。\n\n评估指标:\n- 字母字符比例:普通语言需要>=25%\n- 字母数字字符比例:汇编语言需要>=25%\n- 综合字符组成得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'和'language'列)\n- output_key: 输出标签字段名 (默认: 'text_composition_filter_label')\n- min_score: 最小字符组成得分阈值 (默认: 1.0)\n- max_score: 最大字符组成得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留字符组成得分在指定范围内的代码样本\n- 返回包含字符组成得分标签字段名的列表", + "type": { + "level_1": "code", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "text_composition_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 21, + "name": "CodeCodeToInstructionGenerator", + "description": "该算子用于分析代码片段并反向生成可能产生该代码的人类指令。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeCodeToInstructionGeneratorPrompt", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "code", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_instruction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 22, + "name": "CodeInstructionToCodeGenerator", + "description": "该算子根据给定的人类指令生成相应的代码片段。\n\n输入参数:\n- input_key: 包含人类指令的字段名 (默认: 'instruction')\n输出参数:\n- output_key: 用于存储生成代码的字段名 (默认: 'generated_code')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeInstructionToCodeGeneratorPrompt", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_code", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 23, + "name": "CodeEnhancementInstructionGenerator", + "description": "该算子用于增强人类指令,将不同输出格式的任务统一为生成完整函数。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeInstructionEnhancement", + "DiyCodePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "messages", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_instruction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 24, + "name": "CodeInstructionGenerator", + "description": "该算子用于生成新的指令,从数据池中随机抽取few-shot样本,生成类似难度的指令。\n\n输入参数:\n- input_key: 包含原始指令的字段名 (默认: 'prompt')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", + "type": { + "level_1": "code", + "level_2": "generate" + }, + "allowed_prompts": [ + "CodeInstructionGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_few_shot", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_generate", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "prompt", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_instruction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 25, + "name": "CodeSandboxSampleEvaluator", + "description": "该算子在一个安全的沙箱环境中执行代码片段以验证其正确性。\n\n输入参数:\n- input_code_key: 包含待执行代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_status_key: 用于存储执行状态 ('PASS' 或 'FAIL') 的字段名 (默认: 'sandbox_status')\n- output_log_key: 用于存储执行日志或错误信息的字段名 (默认: 'sandbox_log')\n", + "type": { + "level_1": "code", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "language", + "default": "python", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "timeout_length", + "default": 15, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_process_isolation", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_status_key", + "default": "sandbox_status", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_log_key", + "default": "sandbox_log", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 26, + "name": "ScenarioExtractGenerator", + "description": "从对话内容中提取场景信息,使用LLM服务分析对话并生成场景描述。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_chat_key:对话内容字段名\n- output_key:输出场景字段名,默认'scenario'\n输出参数:\n- 包含提取场景信息的DataFrame\n- 包含输出字段名的列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ExtractScenarioPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_chat_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "scenario", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 27, + "name": "ScenarioExpandGenerator", + "description": "基于原始场景生成新的替代场景,使用LLM服务重写或改写原有场景内容。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:原始场景字段名\n- output_key:生成的新场景字段名,默认'modified_scenario'\n输出参数:\n- 包含生成新场景的DataFrame\n- 包含输出字段名的列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ExpandScenarioPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_scenario_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "modified_scenario", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 28, + "name": "AtomTaskGenerator", + "description": "根据输入的场景信息,使用LLM服务生成对应的原子任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:场景字段名\n- output_key:原子任务的输出字段名,默认'atom_task'\n输出参数:\n- 包含原子任务的DataFrame\n- 包含输出字段名的列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "FuncAtomicTaskGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_scenario_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "atom_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 29, + "name": "SequentialTaskGenerator", + "description": "根据输入的原子任务,使用LLM服务生成该任务的后继任务和两者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含后继任务和组合任务的DataFrame\n- 输出字段名的列表(后继任务字段和组合任务字段)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "SequentialTaskGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_subsequent_task_key", + "default": "subsequent_task", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_composition_task_key", + "default": "composition_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 30, + "name": "ParaSeqTaskGenerator", + "description": "基于原子任务,使用LLM服务生成三个任务类型:并行任务、后继任务以及这三者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_parallel_task_key:并行任务输出字段名,默认'parallel_task'\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含并行任务、后继任务与组合任务的DataFrame\n- 输出字段名列表(并行任务、后继任务、组合任务)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ParathenSeqTaskGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_parallel_task_key", + "default": "parallel_task", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_subsequent_task_key", + "default": "subsequent_task", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_composition_task_key", + "default": "composition_task", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 31, + "name": "FunctionGenerator", + "description": "基于组合任务及其相关子任务,使用LLM服务生成对应的函数列表。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:函数列表输出字段名,默认'functions'\n输出参数:\n- 包含函数定义或函数列表的DataFrame\n- 输出字段名的列表(函数列表字段)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "FuncGeneratePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_composition_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sub_tasks_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "functions", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 32, + "name": "MultiTurnConversationGenerator", + "description": "根据组合任务及其子任务函数,使用LLM服务模拟多轮对话过程,由User、Assistant和Tool三个Agent协同生成完整的对话数据。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:任务字段名(组合任务)\n- input_sub_tasks_keys:子任务字段名列表\n- input_functions_key:子任务函数字段名\n- output_conversations_key:输出对话字段名,默认'conversations'\n输出参数:\n- 包含已完成的多轮对话记录的DataFrame\n- 输出字段名(对话字段名)", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ConversationUserPrompt", + "ConversationAssistantPrompt", + "ConversationToolPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sub_tasks_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_functions_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_conversations_key", + "default": "conversations", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 33, + "name": "ConsistentChatGenerator", + "description": "根据预置主题和人类意图,两阶段从0合成多轮对话格式数据(合成数量大于9000时建议增加标签数量)。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_dialogs_per_intent:每个意图生成的对话数量,默认20\n- num_turns_per_dialog:每个对话的轮次数量,默认6\n- temperature:生成温度,控制输出随机性,默认0.9\n输出参数:\n- 包含category和conversation字段的DataFrame,其中conversation为多轮对话列表", + "type": { + "level_1": "conversations", + "level_2": "generate" + }, + "allowed_prompts": [ + "ConsistentQueryPrompt", + "ConsistentResponsePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_dialogs_per_intent", + "default": 20, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_turns_per_dialog", + "default": 6, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "temperature", + "default": 0.9, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 34, + "name": "FuncCallConversationSampleEvaluator", + "description": "对对话样本进行打分评估:使用 LLM 服务根据预设评分提示词对每条对话进行评分,并将结果写回数据流。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- input_conversation_key:DataFrame 中对话内容字段名,默认 'conversations'\n- output_score_key:评分结果输出字段名,默认 'score'\n处理流程:\n- 读取存储中的 DataFrame\n- 将每条对话重组为评分提示词并调用 LLM 生成评分(JSON)\n- 解析 JSON,提取 'score' 字段写入 DataFrame;解析失败则回退为 0\n输出参数:\n- 包含评分结果列的 DataFrame\n- 包含输出字段名的列表(仅 'score' 或自定义的输出列名)", + "type": { + "level_1": "conversations", + "level_2": "eval" + }, + "allowed_prompts": [ + "ConversationEvalPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_conversation_key", + "default": "conversations", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_score_key", + "default": "score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 35, + "name": "CompositionTaskFilter", + "description": "根据组合任务及其子任务,使用LLM服务判断组合任务是否具备可行性与完备性,从而进行可运行任务的筛选。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:可运行标签的输出字段名,默认'runable_label'\n输出参数:\n- 仅包含可运行组合任务的数据DataFrame\n- 包含输出字段名的列表(可运行标签字段)", + "type": { + "level_1": "conversations", + "level_2": "filter" + }, + "allowed_prompts": [ + "CompositionTaskFilterPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_composition_task_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sub_tasks_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "runable_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 36, + "name": "Speech2TextGenerator", + "description": "该算子用于将语音内容转录为文本。它接收语音文件路径或URL,使用大语言模型进行转录,并将转录结果保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant'\n- input_key:输入语音文件路径或URL的字段名,默认为'raw_content'\n- output_key:输出转录文本的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含转录文本的新列", + "type": { + "level_1": "core_speech", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 37, + "name": "PromptedGenerator", + "description": "基于用户提供的提示词(prompt)生成数据。结合系统提示词和输入内容生成符合要求的输出文本。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,定义模型行为,默认为'You are a helpful agent.'\n- input_key:输入内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "json_schema", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 38, + "name": "PairedPromptedGenerator", + "description": "PairedPromptedGenerator:基于两列配对输入(input_key_1 与 input_key_2)进行成对提示生成。\n算子会将 system_prompt 与每行的两列文本按固定模板拼接后,调用 LLM 服务批量生成结果,并将模型输出写回到 DataFrame 的指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象(实现 LLMServingABC 接口)\n- system_prompt:系统提示词(默认 'You are a helpful agent.')。该提示会放在每条样本前缀, 用于约束模型的角色与输出风格。\n- input_key_1:第一列输入字段名(默认 'input_key_1')\n- input_key_2:第二列输入字段名(默认 'input_key_2')\n- output_key:输出字段名(默认 'generated_content')\n\n处理逻辑:\n1) 从 storage 中读取名为 'dataframe' 的 DataFrame;\n2) 对于每一行,若 input_key_1 与 input_key_2 均非空,则按模板:\n system_prompt + input_key_1 + 值 + '\\n' + input_key_2 + 值\n 构造 LLM 输入;\n3) 批量调用 llm_serving.generate_from_input 生成文本;\n4) 将生成结果写入 DataFrame 的 output_key 列并保存。\n\n输出:\n- 返回写入了生成结果的新 DataFrame(由 storage 管理保存),\n- 返回 output_key 以便后续算子引用。", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key_1", + "default": "input_key_1", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key_2", + "default": "input_key_2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 39, + "name": "RandomDomainKnowledgeRowGenerator", + "description": "N/A (调用失败)", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [ + "SFTFromScratchGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "generation_num", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "domain_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 40, + "name": "Text2QAGenerator", + "description": "该算子用于为给定的文档片段生成种子QA对。\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- prompt_key: 包含提示词的字段名\n- output_quesion_key: 包含生成问题的字段名\n- output_answer_key: 包含生成答案的字段名\n", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2QAAutoPromptGeneratorPrompt", + "Text2QASeedQuestionGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_num", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_prompt_key", + "default": "generated_prompt", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_quesion_key", + "default": "generated_question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_key", + "default": "generated_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 41, + "name": "Text2MultiHopQAGenerator", + "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2MultiHopQAGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "seed", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_q", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "cleaned_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "QA_pairs", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_meta_key", + "default": "QA_metadata", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 42, + "name": "EmbeddingGenerator", + "description": "EmbeddingGenerator算子用于从输入文本生成向量表示(embedding),通常用于语义检索、聚类或下游模型输入等任务。\n\n输入参数:\n- embedding_serving:Embedding服务对象,需实现LLMServingABC接口,用于生成文本的向量表示\n- input_key:输入文本字段名,默认为'text'\n- output_key:输出向量字段名,默认为'embeddings'\n\n输出参数:\n- 包含文本向量的DataFrame,每行对应一个输入文本的embedding\n- 返回输出字段名(如'embeddings'),可供后续算子引用", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "embedding_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "embeddings", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 43, + "name": "RetrievalGenerator", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "core_text", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "json_schema", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 44, + "name": "BenchDatasetEvaluator", + "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估语义相似度,仅输入预测答案与标准答案\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "eval_result_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "compare_method", + "default": "match", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant specialized in evaluating answer correctness.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_test_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_answer_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 45, + "name": "BenchDatasetEvaluatorQuestion", + "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- input_question_key:问题字段名(语义匹配模式下必需)\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "eval_result_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "compare_method", + "default": "match", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant specialized in evaluating answer correctness.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_test_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_answer_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 46, + "name": "Text2QASampleEvaluator", + "description": "该算子用于为给的的文档片段生成种子QA对打分\n\n输入参数:\n- input_question_key: Field name containing the generated question\n- input_answer_key: Field name containing the generated answer\n- output_question_quality_key: Field name containing the question quality grade\n- output_question_quality_feedback_key: Field name containing the question quality feedback\n- output_answer_alignment_key: Field name containing the answer alignment grade\n- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n- output_answer_verifiability_key: Field name containing the answer verifiability grade\n- output_downstream_value_key: Field name containing the downstream value grade\n- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [ + "Text2QAQuestionQualityPrompt", + "Text2QAAnswerAlignmentPrompt", + "Text2QAAnswerVerifiabilityPrompt", + "Text2QADownstreamValuePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "generated_question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "generated_answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_quality_key", + "default": "question_quality_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_quality_feedback_key", + "default": "question_quality_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_alignment_key", + "default": "answer_alignment_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_alignment_feedback_key", + "default": "answer_alignment_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_verifiability_key", + "default": "answer_verifiability_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_answer_verifiability_feedback_key", + "default": "answer_verifiability_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_downstream_value_key", + "default": "downstream_value_grades", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_downstream_value_feedback_key", + "default": "downstream_value_feedbacks", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 47, + "name": "PromptedEvaluator", + "description": "PromptedEvaluator:使用 LLM 根据系统提示词对数据质量进行评分,并将评分写回 DataFrame(同时通过 storage 持久化)。模型应只输出分数(整数)。\n功能:对每行输入文本生成一个评分。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口。\n- system_prompt:系统提示词(默认:'Please evaluate the quality of this data on a scale from 1 to 5.')。\n- input_key:输入文本所在列名(默认:'raw_content')。\n- output_key:评分结果写入的列名(默认:'eval')。\n输出:\n- 返回输出列名(用于后续算子引用),评分结果已写回并保存。", + "type": { + "level_1": "core_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "Please evaluate the quality of this data on a scale from 1 to 5.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "eval", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 48, + "name": "PromptedFilter", + "description": "PromptedFilter 使用内置的 PromptedEvaluator 对输入数据进行数值化打分,并根据指定的分数区间(min_score 到 max_score,闭区间)筛选出符合条件的样本。默认情况下打分范围是 1–5,但用户可以通过 system_prompt 自定义其他评分规则。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,定义评估规范(可选,默认 'Please evaluate the quality of this data on a scale from 1 to 5.')\n- input_key:待评估文本所在列名(默认 'raw_content')\n- output_key:写回打分结果的列名(默认 'eval',若已存在将被覆盖)\n- min_score:筛选的最小分(默认 5)\n- max_score:筛选的最大分(默认 5)\n\n输出参数:\n- 过滤后的 DataFrame(仅保留分数位于 [min_score, max_score] 的行)\n- 返回 output_key 以供后续算子引用\n\n备注:\n- 默认打分区间是 1–5,但可根据实际 prompt 改变。", + "type": { + "level_1": "core_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "Please evaluate the quality of this data on a scale from 1 to 5.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_score", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "eval", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 49, + "name": "KCenterGreedyFilter", + "description": "该算子用于从大量的文档片段中选取部分文档片段,用于后续生成种子QA对\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- embedding_model_path: 嵌入模型路径\n- num_samples: 选取的文档片段数量\n- method: 选择方法,随机或k-center-greedy\n\n", + "type": { + "level_1": "core_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "num_samples", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "embedding_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 50, + "name": "GeneralFilter", + "description": "该算子支持通过多个自定义函数对 DataFrame 进行灵活过滤。\n\n每条过滤规则是一个函数(例如 lambda 表达式),接受一个 DataFrame 并返回一个布尔类型的 Series,用于指定保留哪些行。\n\n输入参数:\n- filter_rules:一个函数列表,每个函数形式为 lambda df: ...,需返回一个与 df 长度一致的布尔 Series。所有规则之间采用与(AND)关系组合。\n\n示例:\n - lambda df: df['score'] > 0.5\n - lambda df: df['label'].isin(['A', 'B'])", + "type": { + "level_1": "core_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "filter_rules", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 51, + "name": "PromptedRefiner", + "description": "PromptedRefiner 根据给定的 system_prompt 对指定列的文本进行改写/润色/规范化,并将结果**就地写回**同一列(覆盖原内容)。其做法是对每一行拼接 `system_prompt + raw_content` 作为模型输入,批量生成改写结果。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,用于描述改写目标与风格(默认 'You are a helpful agent.')\n- input_key:要改写的文本列名(默认 'raw_content'),改写后会覆盖该列\n\n输出参数:\n- 覆盖后的 DataFrame(同名列被改写后的文本)\n- 无返回值(结果已通过 DataFlowStorage 写出)\n\n备注:\n- 该算子**覆盖** input_key 列;若需保留原文,建议先拷贝到新列。\n- 期望每行在 input_key 列提供可用文本;空值将不会生成对应输入,如与行数不匹配可能导致赋值报错。", + "type": { + "level_1": "core_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful agent.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 52, + "name": "PandasOperator", + "description": "该算子支持通过多个自定义函数对 DataFrame 进行任意操作(如添加列、重命名、排序等)。\n\n每个函数(通常为 lambda 表达式)接受一个 DataFrame 并返回一个修改后的 DataFrame。\n\n输入参数:\n- process_fn:一个函数列表,每个函数形式为 lambda df: ...,必须返回一个 DataFrame。\n\n示例:\n - lambda df: df.assign(score2=df['score'] * 2)\n - lambda df: df.sort_values('score', ascending=False)", + "type": { + "level_1": "core_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "process_fn", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 53, + "name": "PromptedVQAGenerator", + "description": "该算子用于视觉问答生成,接收包含图像和问题的输入内容,使用大语言模型生成回答,并将生成的回答保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant.'\n- input_key:输入内容的字段名,默认为'raw_content'\n- output_key:输出生成内容的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含生成回答的新列", + "type": { + "level_1": "core_vision", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "system_prompt", + "default": "You are a helpful assistant.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 54, + "name": "DBOperator", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "db", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "expr", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 55, + "name": "ColonEndFilter", + "description": "该算子用于检查文本是否以冒号结尾,常用于判断问题是否为不完整的提问。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'{类名小写}_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 56, + "name": "SentenceNumberFilter", + "description": "该算子用于检查文本中的句子数量是否在指定范围内,使用正则表达式匹配句子结束符号(。!?.!?)进行分割。\n初始化参数:\n- min_sentences:最小句子数量阈值,默认为3\n- max_sentences:最大句子数量阈值,默认为7500\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'sentence_number_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_sentences", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_sentences", + "default": 7500, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "sentence_number_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 57, + "name": "LineEndWithEllipsisFilter", + "description": "该算子用于检测并过滤以省略号(...)或(……)结尾的文本行,常用于识别不完整的表述。\n初始化参数:\n- threshold:以省略号结尾的行数比率阈值,默认为0.3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_end_with_ellipsis_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "line_end_with_ellipsis_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 58, + "name": "ContentNullFilter", + "description": "该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'content_null_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "content_null_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 59, + "name": "SymbolWordRatioFilter", + "description": "该算子用于检查文本中特定符号(#, ..., …)与单词数量的比率是否超过阈值,过滤符号使用过多的文本。\n初始化参数:\n- threshold:符号与单词比率阈值,默认为0.4\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'symbol_word_ratio_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.4, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "symbol_word_ratio_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 60, + "name": "AlphaWordsFilter", + "description": "该算子用于验证文本中字母单词的比率是否达到阈值,支持NLTK分词或简单空格分割两种模式。\n初始化参数:\n- threshold:字母单词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'alpha_words_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "alpha_words_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 61, + "name": "HtmlEntityFilter", + "description": "该算子用于检测并过滤包含HTML实体(如&、<、>等)的文本,确保内容不包含标记语言元素。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'html_entity_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "html_entity_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 62, + "name": "IDCardFilter", + "description": "该算子用于检测并过滤包含身份证相关术语的文本,使用正则表达式匹配身份证号码模式以保护敏感信息。\n初始化参数:\n- threshold:身份证相关词汇匹配次数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'id_card_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "id_card_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 63, + "name": "NoPuncFilter", + "description": "该算子用于确保文本包含足够的标点符号,通过统计句子间最大单词数量进行过滤。\n初始化参数:\n- threshold:句子间最大单词数量阈值,默认为112\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'no_punc_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 112, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "no_punc_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 64, + "name": "SpecialCharacterFilter", + "description": "该算子用于移除包含特殊/unicode字符的文本,使用预定义模式检测非标准字符以确保文本规范性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'special_character_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "special_character_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 65, + "name": "WatermarkFilter", + "description": "该算子用于检测并移除包含版权/水印内容的文本,使用指定关键词列表识别受保护内容。\n初始化参数:\n- watermarks:水印关键词列表,默认为['Copyright', 'Watermark', 'Confidential']\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'watermark_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "watermarks", + "default": [ + "Copyright", + "Watermark", + "Confidential" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "watermark_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 66, + "name": "MeanWordLengthFilter", + "description": "该算子用于检查文本中单词的平均长度是否在指定范围内,通过字符总数除以单词数量计算平均值。\n初始化参数:\n- min_length:最小平均单词长度,默认为3\n- max_length:最大平均单词长度,默认为10\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'mean_word_length_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_length", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "mean_word_length_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 67, + "name": "StopWordFilter", + "description": "该算子用于验证文本中停用词的比率是否高于阈值,使用NLTK分词器进行单词分割和停用词识别。\n初始化参数:\n- threshold:停用词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'stop_word_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "stop_word_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 68, + "name": "CurlyBracketFilter", + "description": "该算子用于检测文本中是否存在过多的花括号使用,通过花括号数量与文本长度的比率进行过滤。\n初始化参数:\n- threshold:花括号比率阈值,默认为0.025\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'curly_bracket_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.025, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "curly_bracket_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 69, + "name": "CapitalWordsFilter", + "description": "该算子用于检查文本中大写单词的比率是否超过阈值,支持可选的分词器进行单词识别。\n初始化参数:\n- threshold:大写单词比率阈值,默认为0.2\n- use_tokenizer:是否使用NLTK分词器,默认为False\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'capital_words_filter'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.2, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "capital_words_filter", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 70, + "name": "LoremIpsumFilter", + "description": "该算子用于检测并过滤包含占位文本(如'lorem ipsum')的文本,使用正则表达式模式匹配并结合阈值过滤。\n初始化参数:\n- threshold:'lorem ipsum'出现次数与文本长度的比率阈值,默认为3e-8\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'loremipsum_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 3e-08, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "loremipsum_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 71, + "name": "UniqueWordsFilter", + "description": "该算子用于检查文本中唯一单词的比率是否达到阈值,通过集合操作计算唯一单词数量与总单词数量的比率。\n初始化参数:\n- threshold:最小唯一单词比率阈值,默认为0.1\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'unique_words_filter'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.1, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "unique_words_filter", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 72, + "name": "CharNumberFilter", + "description": "该算子用于验证文本在去除空白字符后的字符数量是否达到最小阈值。\n初始化参数:\n- threshold:最小字符数量阈值,默认为100\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'char_number_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "char_number_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 73, + "name": "LineStartWithBulletpointFilter", + "description": "该算子用于检测并过滤以各种项目符号符号开头的文本行,使用Unicode字符匹配结合比率阈值进行过滤。\n初始化参数:\n- threshold:以项目符号开头的行数比率阈值,默认为0.9\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_start_with_bullet_point_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 0.9, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "line_start_with_bullet_point_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 74, + "name": "LineWithJavascriptFilter", + "description": "该算子用于识别并过滤包含'javascript'引用的文本,通过关键词匹配和阈值判断进行内容过滤。\n初始化参数:\n- threshold:不包含'javascript'的最小行数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_with_javascript_filter_label'\n返回值:\n- 包含output_key的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "threshold", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "line_with_javascript_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 75, + "name": "LangkitSampleEvaluator", + "description": "使用Langkit工具包计算文本统计信息,帮助评估文本结构复杂性和可读性。提取多种语言特征,包括句子长度、词汇多样性、情感倾向等。\n\n输出参数:\n- LangkitNumSentencesScore: 句子数量\n- LangkitNumWordsScore: 单词数量\n- LangkitAvgWordLengthScore: 平均单词长度\n- LangkitFleschReadingEaseScore: 可读性评分(Flesch公式)\n- LangkitSentimentScore: 情感倾向(-1到1之间)", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 76, + "name": "LangkitFilter", + "description": "基于LangkitScorer打分器的得分对数据进行过滤。使用Langkit工具包计算11种文本统计信息,帮助评估文本结构复杂性和可读性。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含11个语言统计指标\n- max_scores:各指标的最大阈值字典,包含11个语言统计指标\n- metrics_to_keep:需要保留的评估指标列表\n输出参数:\n- 过滤后的DataFrame,仅保留所有指标都在指定范围内的文本\n- 返回包含各指标标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_scores", + "default": { + "flesch_reading_ease": 0, + "automated_readability_index": 0, + "aggregate_reading_level": 0, + "syllable_count": 32.0, + "lexicon_count": 23.0, + "sentence_count": 1.0, + "character_count": 118.0, + "letter_count": 109.0, + "polysyllable_count": 0.0, + "monosyllable_count": 13.0, + "difficult_words": 4.0 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_scores", + "default": { + "flesch_reading_ease": 100, + "automated_readability_index": 100, + "aggregate_reading_level": 100, + "syllable_count": 2331.9, + "lexicon_count": 1554.0, + "sentence_count": 89.1, + "character_count": 7466.3, + "letter_count": 7193.0, + "polysyllable_count": 216.4, + "monosyllable_count": 1044.1, + "difficult_words": 213.4 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "metrics_to_keep", + "default": [ + "flesch_reading_ease", + "automated_readability_index", + "aggregate_reading_level", + "syllable_count", + "lexicon_count", + "sentence_count", + "character_count", + "letter_count", + "polysyllable_count", + "monosyllable_count", + "difficult_words" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_keys", + "default": [ + "flesch_reading_ease", + "automated_readability_index", + "aggregate_reading_level", + "syllable_count", + "lexicon_count", + "sentence_count", + "character_count", + "letter_count", + "polysyllable_count", + "monosyllable_count", + "difficult_words" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 77, + "name": "LexicalDiversitySampleEvaluator", + "description": "使用MTLD(词汇多样性测量)和HDD(移动平均类型-标记比)方法计算文本词汇多样性。\n\n功能说明:\n- MTLD(词汇多样性测量):通过计算维持特定TTR阈值所需的单词数量来评估词汇多样性\n- HDD(移动平均类型-标记比):基于样本的词汇丰富度估计\n\n输入要求:文本长度需大于50个单词\n\n输出参数:\n- LexicalDiversityMTLDScore: MTLD多样性得分(值越高表示多样性越好)\n- LexicalDiversityHD-DScore: HDD多样性得分(值越高表示多样性越好)", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 78, + "name": "LexicalDiversityFilter", + "description": "基于LexicalDiversityScorer打分器的得分对数据进行过滤。使用MTLD(移动平均类型-令牌比)和HDD(超几何分布多样性)两种方法计算词汇多样性,高分代表更丰富的词汇使用。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含'mtld'和'hdd'\n- max_scores:各指标的最大阈值字典,包含'mtld'和'hdd'\n输出参数:\n- 过滤后的DataFrame,仅保留词汇多样性在指定范围内的文本\n- 返回包含各指标标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_scores", + "default": { + "mtld": 50, + "hdd": 0.8 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_scores", + "default": { + "mtld": 99999, + "hdd": 1.0 + }, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_keys", + "default": [ + "mtld", + "hdd" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 79, + "name": "NgramSampleEvaluator", + "description": "计算文本中n-gram的重复比例,评估文本冗余度。通过比较唯一n-gram数量与总n-gram数量的比值来衡量文本原创性。\n\n初始化参数:\n- ngrams: n-gram的长度,默认为5\n\n输出参数:\n- NgramScore: n-gram重复比例得分(0到1之间,得分越高表示重复比例越低)", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "ngrams", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "NgramScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 80, + "name": "NgramFilter", + "description": "基于NgramScorer打分器的得分对数据进行过滤。计算文本中n-gram的重复比例,得分越高表示重复比例越低,文本冗余度越小。\n输入参数:\n- min_score:最小n-gram得分阈值\n- max_score:最大n-gram得分阈值\n- ngrams:n-gram的n值\n输出参数:\n- 过滤后的DataFrame,仅保留n-gram得分在指定范围内的文本\n- 返回包含n-gram得分字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.8, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "ngrams", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "NgramScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 81, + "name": "PresidioSampleEvaluator", + "description": "使用Microsoft Presidio模型识别文本中的个人身份信息(PII),返回检测到的PII实体数量。支持多种实体类型如姓名、邮箱、电话号码等,基于dslim/bert-base-NER模型实现。适用于评估文本的隐私安全风险。\n输入参数:\n- text: 待检测的文本字符串\n- lang: 语言类型,默认为'en'\n输出参数:\n- int: 检测到的PII实体数量", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PresidioScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 82, + "name": "PresidioFilter", + "description": "基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\n支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\n输入参数:\n- min_score:保留样本的最小PII数量阈值,默认为0\n- max_score:保留样本的最大PII数量阈值,默认为5\n- lang:文本语言,默认为'en'\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\n- 返回包含输出字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PresidioScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 83, + "name": "BlocklistFilter", + "description": "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- language:语言代码,默认为'zh'\n- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n- threshold:匹配次数阈值,默认为1\n- use_tokenizer:是否使用分词器,默认为True\n- tokenizer:分词器对象,默认为None\n输出参数:\n- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "language", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "threshold", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_tokenizer", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "blocklist_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 84, + "name": "HashDeduplicateFilter", + "description": "使用多种哈希函数对文本进行精确去重,支持md5、sha256或xxh3算法。通过计算文本的哈希值识别重复数据。\n\n初始化参数:\n- hash_func: 哈希函数名称,可选'md5'、'sha256'或'xxh3',默认为'md5'\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据\n算法特点:\n- md5: 128位哈希值,平衡速度和唯一性\n- sha256: 256位哈希值,更高安全性,速度较慢\n- xxh3: 128位哈希值,最快的哈希算法", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "hash_func", + "default": "md5", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 85, + "name": "LanguageFilter", + "description": "使用FastText语言识别模型过滤数据。下载并加载预训练的FastText语言识别模型,检查文本的语言是否在允许的语言列表中。\n输入参数:\n- allowed_languages:允许的语言标签列表\n- model_cache_dir:模型缓存目录路径\n输出参数:\n- 过滤后的DataFrame,仅保留语言在允许列表中的文本\n- 返回包含语言标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "allowed_languages", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "language_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 86, + "name": "LLMLanguageFilter", + "description": "使用大语言模型识别语言并过滤数据", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "allowed_languages", + "default": [ + "en" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "language_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 87, + "name": "MinHashDeduplicateFilter", + "description": "结合MinHash与LSH(局部敏感哈希)实现高效近似去重。将文本转换为MinHash签名,使用LSH快速查找相似文本,实现大规模数据集的近似去重。\n输入参数:\n- num_perm:生成MinHash签名的排列数\n- threshold:相似度阈值,超过此阈值判定为相似文本\n- use_n_gram:是否使用n-gram分词\n- ngram:n-gram的n值\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "num_perm", + "default": 128, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "threshold", + "default": 0.9, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_n_gram", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "ngram", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 88, + "name": "NgramHashDeduplicateFilter", + "description": "结合n-gram技术与哈希算法识别相似文本,实现近似去重。将文本分割为多个n-gram片段,计算每个片段的哈希值,通过比较哈希集合的相似度来判断文本相似性。\n输入参数:\n- n_gram:将文本分割的片段数量\n- hash_func:哈希函数类型,支持'md5'、'sha256'和'xxh3'\n- diff_size:哈希集合差异阈值,小于此值判定为相似文本\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "n_gram", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "hash_func", + "default": "md5", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "diff_size", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 89, + "name": "PerspectiveSampleEvaluator", + "description": "使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- serving:Perspective API服务对象\n- input_key:输入文本字段名\n- output_key:输出得分字段名,默认'PerspectiveScore'\n输出参数:\n- 包含毒性评估得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerspectiveScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 90, + "name": "PerspectiveFilter", + "description": "基于PerspectiveScorer打分器的得分对数据进行过滤使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- min_score:最小毒性得分阈值\n- max_score:最大毒性得分阈值\n输出参数:\n- 过滤后的DataFrame,仅保留毒性得分在指定范围内的文本\n- 返回包含毒性得分字段名的列表", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 0.5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerspectiveScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 91, + "name": "SemDeduplicateFilter", + "description": "基于BERT语义相似度识别语义重复文本,执行近似去重操作。通过计算文本嵌入向量间的余弦相似度,识别语义相似的文本并保留唯一样本。\n支持多字段组合作为去重依据,可有效去除内容相似但表述不同的重复数据,提高数据集多样性。\n输入参数:\n- eps:相似度阈值,值越小表示允许的相似度越低,默认为0.05(即余弦相似度大于0.95视为重复)\n- model_name:预训练模型名称,默认为'sentence-transformers/all-MiniLM-L6-v2'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:模型运行设备,默认为'cuda'\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留语义不重复的样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "eps", + "default": 0.05, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "sentence-transformers/all-MiniLM-L6-v2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 92, + "name": "SimHashDeduplicateFilter", + "description": "使用SimHash算法通过汉明距离识别相似文本,执行近似去重操作。将文本转换为固定长度的指纹,通过计算指纹间的汉明距离判断文本相似度。\n相比语义去重速度更快,适合大规模数据集的快速去重预处理,尤其适用于检测字符层面相似的文本。\n输入参数:\n- fingerprint_size:指纹长度,默认为64位\n- bound:相似度阈值,值越小表示允许的相似度越低,默认为0.1(即相似度大于0.9视为重复)\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留相似性低于阈值的唯一样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "fingerprint_size", + "default": 64, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "bound", + "default": 0.1, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 93, + "name": "WordNumberFilter", + "description": "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- min_words:最小单词数量阈值,默认为5\n- max_words:最大单词数量阈值,默认为100\n输出参数:\n- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_words", + "default": 20, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_words", + "default": 100000, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "word_number_filter_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 94, + "name": "HtmlEntityRefiner", + "description": "去除文本中的HTML实体,包括标准实体(如 、<)和各种变体形式(全角符号、中文分号等)。支持自定义需要移除的HTML实体列表。输入参数:\n- html_entities:需要移除的HTML实体列表,默认为包含常见实体的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含移除HTML实体后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "html_entities", + "default": [ + "nbsp", + "lt", + "gt", + "amp", + "quot", + "apos", + "hellip", + "ndash", + "mdash", + "lsquo", + "rsquo", + "ldquo", + "rdquo" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 95, + "name": "HtmlUrlRemoverRefiner", + "description": "去除文本中的URL链接和HTML标签,净化文本内容。使用正则表达式匹配并移除各种形式的URL和HTML标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含净化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 96, + "name": "LowercaseRefiner", + "description": "将文本字段中的所有大写字符转换为小写,统一文本格式。对指定字段的文本内容进行全小写处理。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含小写转换后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 97, + "name": "NERRefiner", + "description": "使用命名实体识别(NER)技术识别并屏蔽文本中的特定实体。使用spaCy的'en_core_web_sm'模型识别实体,并将其替换为对应的实体类型标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含实体屏蔽后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 98, + "name": "PIIAnonymizeRefiner", + "description": "使用Presidio和BERT-NER模型识别并匿名化文本中的个人身份信息(PII)。支持多种PII类型的检测和匿名化处理。输入参数:\n- lang:语言代码,默认为'en'\n- device:运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- model_name:NER模型名称,默认为'dslim/bert-base-NER'\n- input_key:输入文本字段名\n输出参数:\n- 包含匿名化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "dslim/bert-base-NER", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 99, + "name": "ReferenceRemoverRefiner", + "description": "删除文本中未闭合的引用标签和引用链接,包括标签和{{cite}}模板的各种完整和不完整形式。净化文本中的引用标记。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含移除引用标记后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 100, + "name": "RemoveContractionsRefiner", + "description": "该算子用于扩展文本中的英语缩写词,将缩写形式转换为完整形式(例如将\"can't\"扩展为\"cannot\")。\n使用contractions库进行缩写词扩展,提高文本标准化程度。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含扩展缩写词后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 101, + "name": "RemoveEmojiRefiner", + "description": "该算子用于去除文本中的Unicode图像表情符号,包括表情符号、杂项符号、交通符号、旗帜等各类图像符号。\n通过正则表达式匹配Unicode表情符号范围,实现高效过滤。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除表情符号的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 102, + "name": "RemoveEmoticonsRefiner", + "description": "该算子用于移除文本中的文本型表情符号,例如':-)'、':D'、':('等字符组合表情。\n基于预定义的表情符号字典进行匹配替换,支持多种常见文本表情模式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除文本表情的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 103, + "name": "RemoveExtraSpacesRefiner", + "description": "该算子用于移除文本中的多余空格,将连续的多个空格替换为单个空格,并去除文本前后的空白字符。\n通过字符串分割和连接实现空格标准化,提高文本格式一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化空格的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 104, + "name": "RemoveImageRefsRefiner", + "description": "该算子用于去除文本中的图片引用格式,包括Markdown图片链接、图片编号、特殊符号组合等图像引用模式。\n通过多模式正则表达式匹配,识别并移除多种图片引用格式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除图片引用的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 105, + "name": "RemoveNumberRefiner", + "description": "该算子用于移除文本中的数字字符,包括0-9的阿拉伯数字。\n通过字符过滤实现数字移除,保留纯文本内容。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除数字的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 106, + "name": "RemovePunctuationRefiner", + "description": "该算子用于移除文本中的标点符号,包括英文标点符号集合中的所有符号。\n使用string.punctuation定义的标点集合进行过滤,实现文本去标点处理。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 107, + "name": "RemoveRepetitionsPunctuationRefiner", + "description": "该算子用于移除文本中重复的标点符号,例如将\"!!!\"变为\"!\",\",,\"变为\",\"。\n通过正则表达式匹配连续重复的标点符号,替换为单个符号。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 108, + "name": "RemoveStopwordsRefiner", + "description": "该算子用于移除文本中的英语停用词(如\"the\",\"is\",\"in\"等无实际意义的高频词汇)。\n使用NLTK库的stopwords语料库进行停用词过滤,提高文本特征密度。\n输入参数:\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除停用词的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 109, + "name": "SpellingCorrectionRefiner", + "description": "该算子用于通过SymSpell算法对文本中的拼写错误进行纠正,支持自定义编辑距离和词典路径。\n若本地词典不存在则自动下载,使用近似字符串匹配实现拼写纠错功能。\n输入参数:\n- max_edit_distance:最大编辑距离,默认为2\n- prefix_length:前缀长度,默认为7\n- dictionary_path:词典路径,默认为'frequency_dictionary_en_82_765.txt'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含纠正拼写错误的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "max_edit_distance", + "default": 2, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prefix_length", + "default": 7, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dictionary_path", + "default": "frequency_dictionary_en_82_765.txt", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 110, + "name": "StemmingLemmatizationRefiner", + "description": "该算子用于对文本进行词干提取或词形还原处理,将词语转换为其基本形式。\n支持两种处理方式:Porter词干提取(stemming)和WordNet词形还原(lemmatization),可通过参数选择。\n输入参数:\n- method:处理方法,可选'stemming'或'lemmatization',默认为'stemming'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含词干/词形还原后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "method", + "default": "stemming", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 111, + "name": "TextNormalizationRefiner", + "description": "该算子用于规范化文本中的日期格式和货币格式,统一为标准表示形式。\n日期格式统一转换为'YYYY-MM-DD'形式,货币格式转换为'金额 USD'形式,提高数据一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含格式规范化的文本\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "general_text", + "level_2": "refine" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 112, + "name": "BertSampleEvaluator", + "description": "使用BERTScore评估生成文本与参考文本的相似度,基于上下文嵌入计算P/R/F1分数。\n输入参数:\n- lang:语言类型,默认为'en'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BertScore'\n输出参数:\n- 包含F1相似度得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "BertScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 113, + "name": "BleuSampleEvaluator", + "description": "计算BLEU分数评估生成文本与参考文本的n-gram重叠度,支持1-4元语法分析。\n输入参数:\n- n:最大n-gram长度,默认为4\n- eff:参考长度计算方式,可选'shortest'/'average'/'longest',默认为'average'\n- special_reflen:特殊参考长度,默认为None\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BleuScore'\n输出参数:\n- 包含BLEU得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "n", + "default": 4, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "eff", + "default": "average", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "special_reflen", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "BleuScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 114, + "name": "CiderSampleEvaluator", + "description": "使用CIDEr指标评估生成文本与参考文本的相似度,基于TF-IDF加权的n-gram重叠度。\n输入参数:\n- n:最大n-gram长度,默认为4\n- sigma:高斯惩罚参数,默认为6.0\n- df_mode:文档频率模式,默认为'coco-val-df'\n- idf_path:IDF文件路径,默认为预训练COCO数据集IDF\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'CiderScore'\n输出参数:\n- 包含CIDEr得分的DataFrame", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "n", + "default": 4, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "sigma", + "default": 6.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "df_mode", + "default": "coco-val-df", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "idf_path", + "default": "./dataflow/operators/general_pt/eval/cider/coco-val-df.p", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "CiderScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 115, + "name": "Task2VecDatasetEvaluator", + "description": "使用Task2Vec方法评估数据集的多样性,通过计算样本嵌入的余弦距离矩阵来量化多样性。\n输入参数:\n- device:计算设备,默认为'cuda'\n- sample_nums:采样次数,默认为10\n- sample_size:每次采样样本数,默认为1\n- method:嵌入方法,可选'montecarlo'或'variational',默认为'montecarlo'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n输出参数:\n- Task2VecDiversityScore:多样性得分\n- ConfidenceInterval:置信区间", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "sample_nums", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "sample_size", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "method", + "default": "montecarlo", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 116, + "name": "VendiDatasetEvaluator", + "description": "通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。\n输入参数:\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n输出参数:\n- BERTVendiScore:基于BERT的多样性得分\n- SimCSEVendiScore:基于SimCSE的多样性得分", + "type": { + "level_1": "general_text", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 117, + "name": "KBCChunkGenerator", + "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "chunk_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "chunk_overlap", + "default": 50, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "split_method", + "default": "token", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_tokens_per_chunk", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "tokenizer_name", + "default": "bert-base-uncased", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "raw_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 118, + "name": "KBCChunkGeneratorBatch", + "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "chunk_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "chunk_overlap", + "default": 50, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "split_method", + "default": "token", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "min_tokens_per_chunk", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "tokenizer_name", + "default": "bert-base-uncased", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "text_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 119, + "name": "FileOrURLToMarkdownConverter", + "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "url", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "raw_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "intermediate_dir", + "default": "intermediate", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "mineru_backend", + "default": "vlm-sglang-engine", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 120, + "name": "FileOrURLToMarkdownConverterBatch", + "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "intermediate_dir", + "default": "intermediate", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "mineru_backend", + "default": "vlm-sglang-engine", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "source", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "text_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 121, + "name": "KBCTextCleaner", + "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改\n\n输入格式示例:\n
\n

标题文本

\n

正文段落,包括特殊符号,例如“弯引号”、–破折号等

\n \"示意图\"\n 链接文本\n
代码片段
\n ...\n
\n\n输出格式示例:\n标题文本\n\n正文段落,包括特殊符号,例如\"直引号\"、-破折号等\n\n[Image: 示例图 example.jpg]\n\n链接文本\n\n代码片段\n\n[结构保持,语义保留,敏感信息脱敏处理(如手机号、保密标记等)]", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [ + "KnowledgeCleanerPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "cleaned_chunk", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 122, + "name": "KBCTextCleanerBatch", + "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [ + "KnowledgeCleanerPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "cleaned_chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 123, + "name": "KBCMultiHopQAGeneratorBatch", + "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2MultiHopQAGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "seed", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "enhanced_chunk_path", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 124, + "name": "QAExtractor", + "description": "QA对提取器 - 将嵌套的QA_pairs转换为Alpaca微调格式\n\n核心功能:\n从结构化的QA对数据中提取问答内容,自动整合推理步骤和支持事实,\n输出符合Stanford Alpaca标准的instruction-input-output格式。\n\n初始化参数:\n• qa_key: QA对的字段名 (默认: 'QA_pairs')\n• output_json_file: 输出JSON文件路径 (可选,不指定则只更新DataFrame)\n• instruction: 统一的指令前缀 (默认: 'Please answer the following question...')\n\n运行参数 (input_key):\n• None - 包含所有字段 (question + reasoning_steps + supporting_facts)\n• '' - 空字符串,不包含额外上下文\n• 'reasoning_steps' - 只包含推理步骤\n• 'question,reasoning_steps' - 逗号分隔多个字段\n• ['question', 'supporting_facts'] - 列表格式\n\n输出字段:\n• instruction: 问题指令\n• input: 上下文信息 (根据input_key动态拼接)\n• output: 答案\n\n适用场景: 知识库QA微调、领域问答模型训练", + "type": { + "level_1": "knowledge_cleaning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "qa_key", + "default": "QA_pairs", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_json_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "instruction", + "default": "Please answer the following question based on the provided information.", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 125, + "name": "ReasoningAnswerGenerator", + "description": "该算子用于为给定问题生成答案,调用大语言模型进行推理。\n输入参数:\n- llm_serving:LLM服务实例,用于生成答案\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- output_key:生成的答案字段,默认'generated_cot'", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathAnswerGeneratorPrompt", + "GeneralAnswerGeneratorPrompt", + "DiyAnswerGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 126, + "name": "ReasoningQuestionGenerator", + "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathQuestionSynthesisPrompt", + "GeneralQuestionSynthesisPrompt", + "DiyQuestionSynthesisPrompt" + ], + "parameter": { + "init": [ + { + "name": "num_prompts", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_synth_or_input_flag", + "default": "Synth_or_Input", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 127, + "name": "ReasoningAnswerExtractionQwenMathEvalGenerator", + "description": "该算子用于从数学问题回答中提取规范化答案表达式,进行字符串清洗、单位处理和格式标准化。\n\n输入参数:\n- input_key:输入数据字段名\n- answer_key:原始答案字段名\n- output_key:处理后的答案字段名\n- unit_texts:需要过滤的单位文本列表\n\n输出参数:\n- output_key:标准化后的数学表达式字段", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "dataset_name", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "pseudo_correct_solution_example", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "extraction", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 128, + "name": "ReasoningPseudoAnswerGenerator", + "description": "该算子生成多个候选答案并通过统计选择最优解,实现伪答案生成。\n\n输入参数:\n- input_file:输入文件路径\n- output_file:输出文件路径\n- max_times:最大生成次数\n- selection_mode:统计选择模式(frequency/consistency)\n\n输出参数:\n- final_answer:最终选择答案字段\n- candidate_answers:候选答案列表字段", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathAnswerGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_times", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_answer", + "default": "pseudo_answers", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_answer_value", + "default": "pseudo_answer_value", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_solutions", + "default": "pseudo_solutions", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key_correct_solution_example", + "default": "pseudo_correct_solution_example", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 129, + "name": "ReasoningPretrainFormatConvertGenerator", + "description": "该算子用于将SFT格式数据转换为预训练格式。\n\n输入参数:\n- read_key_question:问题字段名\n- read_key_answer:答案字段名\n- output_key:输出文本字段名\n\n输出参数:\n- output_key:输出文本字段名,包含问题和答案的拼接结果\n- 输出文件:转换后的预训练格式数据文件路径", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_read_key_question", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_read_key_answer", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "text", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 130, + "name": "ReasoningQuestionFusionGenerator", + "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", + "type": { + "level_1": "reasoning", + "level_2": "generate" + }, + "allowed_prompts": [ + "MathQuestionParallelFusionGeneratorPrompt", + "MathQuestionSequentialFusionGeneratorPrompt", + "MathQuestionConditionFusionGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "num_prompts", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_problem_1", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_problem_2", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 131, + "name": "ReasoningCategoryDatasetEvaluator", + "description": "该算子用于统计数据集中的类别信息,包括主类别和次类别的分布情况。它计算每个类别的样本数量,并返回类别分布的统计结果。\n输入参数:\n- input_primary_category_key:主类别字段名,默认为'primary_category'\n- input_secondary_category_key:次类别字段名,默认为'secondary_category'\n输出参数:\n- 返回包含类别统计信息的字典,主类别作为键,值为包含该类别样本数量和次类别分布的字典", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_primary_category_key", + "default": "primary_category", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_secondary_category_key", + "default": "secondary_category", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 132, + "name": "ReasoningDifficultyDatasetEvaluator", + "description": "该算子用于统计数据集中的难度信息,计算不同难度级别的样本数量分布。它统计每个难度级别的样本数量,并返回难度分布的统计结果。\n输入参数:\n- input_diffulty_key:难度分数字段名,默认为'difficulty_score'\n输出参数:\n- 返回包含难度统计信息的字典,难度级别作为键,值为该难度级别的样本数量", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_diffulty_key", + "default": "difficulty_score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 133, + "name": "ReasoningTokenDatasetEvaluator", + "description": "该算子用于统计数据集中问题和回答的token信息,包括token数量的最小值、最大值、平均值和中位数等统计指标。它使用指定的tokenizer对文本进行编码,并计算token长度的分布情况。\n输入参数:\n- input_question_key:问题文本字段名\n- input_answer_key:回答文本字段名\n- model_name_or_path:tokenizer模型名称或路径\n输出参数:\n- 返回包含token统计信息的字典,包括问题和回答的token数量的零值计数、最小值、最大值、平均值和中位数", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_name_or_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 134, + "name": "ReasoningQuestionCategorySampleEvaluator", + "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [ + "MathQuestionCategoryPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "question_category", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 135, + "name": "ReasoningQuestionDifficultySampleEvaluator", + "description": "该算子用于评估问题的难度等级。通过大语言模型分析问题复杂度,输出1-10级的难度评分。\n\n输入参数:\n- eval_stage:评估阶段标识\n- read_min/max_score:分数过滤阈值\n- 其他参数同ReasoningCategoryDatasetEvaluator\n\n输出参数:\n- difficulty_score:数值型难度评分(1-10)", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [ + "MathQuestionDifficultyPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "difficulty_score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 136, + "name": "ReasoningQuestionSolvableSampleEvaluator", + "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", + "type": { + "level_1": "reasoning", + "level_2": "eval" + }, + "allowed_prompts": [ + "MathQuestionEvaluatorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 137, + "name": "ReasoningAnswerFormatterFilter", + "description": "该算子用于检查答案格式是否符合规范,主要验证数学答案是否包含正确的\\boxed{}标记。\n\n输入参数:\n- input_key:输入字段名\n- result_key:结果字段名\n\n输出参数:\n- 通过格式检查返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 138, + "name": "ReasoningAnswerGroundTruthFilter", + "description": "该算子用于对比预测答案与标准答案的匹配度,支持精确匹配和数学验证两种方式。\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(exact/math_verify)\n\n输出参数:\n- 匹配成功返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "compare_method", + "default": "math_verify", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_test_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_answer_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 139, + "name": "ReasoningAnswerNgramFilter", + "description": "该算子基于n-gram重复率过滤答案,检测回答中的重复模式。\n\n输入参数:\n- min_score:最小可接受分数\n- max_score:最大可接受分数\n- ngrams:n-gram大小\n\n输出参数:\n- 分数在范围内返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "ngrams", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 140, + "name": "ReasoningAnswerPipelineRootFilter", + "description": "答案处理流程根节点,负责将输入数据根据有无真实标签GT分发到不同处理分支。\n\n输入参数:\n- input_file:输入文件路径\n- output_dir:输出目录路径\n- branch_config:分支配置参数\n- parallel_workers:并行工作线程数\n\n输出参数:\n- 多个输出文件路径(根据分支配置生成)", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_gt_key", + "default": "golden_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 141, + "name": "ReasoningAnswerTokenLengthFilter", + "description": "该算子根据token数量过滤过长的答案。\n\n输入参数:\n- max_answer_token_length:最大token数\n- tokenizer_dir:分词器路径\n- read_min/max_score:分数范围\n\n输出参数:\n- 长度合规返回1,否则返回0", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "max_answer_token_length", + "default": 8192, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "tokenizer_dir", + "default": "Qwen/Qwen2.5-0.5B-Instruct", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "generated_cot", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 142, + "name": "ReasoningQuestionFilter", + "description": "该算子用于对问题进行正确性检查,包括格式是否规范、语义是否合理、条件是否矛盾以及是否具备充分信息可解。调用大语言模型依次执行四阶段判断,最终返回每个问题是否合格的二分类结果(保留合格样本)。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建检查提示词\n- input_key:输入问题字段名,默认为'math_problem'\n输出参数:\n- 过滤后的DataFrame,仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [ + "MathQuestionFilterPrompt", + "GeneralQuestionFilterPrompt", + "DiyQuestionFilterPrompt" + ], + "parameter": { + "init": [ + { + "name": "system_prompt", + "default": "You are a helpful assistant.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "math_problem", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 143, + "name": "ReasoningAnswerModelJudgeFilter", + "description": "该算子用于对答案进行正确性评判,通过比较当前答案与参考答案的语义一致性,判断答案是否正确。调用大语言模型进行语义理解和判断,最终返回每个答案是否正确的二分类结果。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建评判提示词\n- keep_all_samples:是否保留所有样本,默认为False(仅保留正确答案)\n- question_key:问题字段名,默认为'question'\n- answer_key:当前答案字段名,默认为'answer'\n- reference_key:参考答案字段名,默认为'reference_answer'\n输出参数:\n- DataFrame,包含原始数据和判断结果(answer_match_result字段)\n- 如果keep_all_samples为False,则仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", + "type": { + "level_1": "reasoning", + "level_2": "filter" + }, + "allowed_prompts": [ + "AnswerJudgePromptQuestion", + "AnswerJudgePrompt" + ], + "parameter": { + "init": [ + { + "name": "system_prompt", + "default": "You are a helpful assistant specialized in evaluating answer correctness.", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": "", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "keep_all_samples", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_answer_key", + "default": "answer", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_reference_key", + "default": "reference_answer", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 144, + "name": "SQLConsistencyFilter", + "description": "对条目进行过滤,检测SQL和自然语言问题是否对应,即判断SQL是否能解决该问题。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n- input_question_key: 输入问题列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "filter" + }, + "allowed_prompts": [ + "SQLConsistencyFilterPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 145, + "name": "SQLExecutionFilter", + "description": "对条目进行过滤,在数据库中执行SQL,筛选掉不可执行的条目。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 146, + "name": "SQLGenerator", + "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "SelectSQLGeneratorPrompt", + "SelectVecSQLGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "generate_num", + "default": 300, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 147, + "name": "SQLByColumnGenerator", + "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "SelectSQLGeneratorPrompt", + "SelectVecSQLGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "generate_num", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 148, + "name": "SQLVariationGenerator", + "description": "对于每个条目,基于已有的SQL,指导模型生成SQL的变种,即在原有SQL的基础上,进行数据替换、函数变换、难度变换等操作,生成更加丰富的SQL。\n\n输入参数:\n- input_sql_key: SQL列名\n- input_db_id_key: 数据库ID列名\n\n", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "SQLVariationGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_variations", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 149, + "name": "Text2SQLCoTGenerator", + "description": "对于每个条目,生成从自然语言问题和数据库Schema到SQL的CoT长链路推理过程。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_question_key: 输入问题列名\n- input_db_id_key: 输入数据库ID列名\n\n输出参数:\n- output_cot_key: 输出CoT列名", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2SQLCotGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_evidence_key", + "default": "evidence", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_cot_key", + "default": "cot_reasoning", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 150, + "name": "Text2SQLPromptGenerator", + "description": "从数据库提取Schema信息,结合自然语言问题生成提示词。其中提示词模版支持自定义。\n\n输入参数:\n- input_question_key: 问题列名\n- input_db_id_key: 数据库ID列名\n- output_prompt_key: 输出prompt列名\n\n输出参数:\n- output_prompt_key: 生成的prompt", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2SQLPromptGeneratorPrompt", + "Text2VecSQLPromptGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_evidence_key", + "default": "evidence", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_prompt_key", + "default": "prompt", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 151, + "name": "Text2SQLQuestionGenerator", + "description": "对于每个条目,如果自然语言问题为空,生成SQL对应的自然语言问题。为保证正确,生成多个候选问题,并选择最优的。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 数据库ID列名\n\n输出参数:\n- output_question_key: 输出问题列名", + "type": { + "level_1": "text2sql", + "level_2": "generate" + }, + "allowed_prompts": [ + "Text2SQLQuestionGeneratorPrompt", + "Text2VecSQLQuestionGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "embedding_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "question_candidates_num", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "prompt_template", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "sql", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_question_key", + "default": "question", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_evidence_key", + "default": "evidence", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 152, + "name": "SQLComponentClassifier", + "description": "根据SQL的组件数量和复杂度,评估SQL的难度。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", + "type": { + "level_1": "text2sql", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "difficulty_thresholds", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "difficulty_labels", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_difficulty_key", + "default": "sql_component_difficulty", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 153, + "name": "SQLExecutionClassifier", + "description": "让模型根据自然语言问题、数据库Schema和提示词,多次生成SQL,通过生成SQL的准确率,评估该问题对于模型的难度。\n\n输入参数:\n- input_db_id_key: 输入数据库ID列名\n- input_sql_key: 输入SQL列名\n- input_prompt_key: 输入prompt列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", + "type": { + "level_1": "text2sql", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "database_manager", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_generations", + "default": 10, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "difficulty_thresholds", + "default": [ + 2, + 5, + 9 + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "difficulty_labels", + "default": [ + "extra", + "hard", + "medium", + "easy" + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_db_id_key", + "default": "db_id", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_sql_key", + "default": "SQL", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_prompt_key", + "default": "rl_prompt", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_difficulty_key", + "default": "sql_execution_difficulty", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 154, + "name": "CCNetDeduplicateFilter", + "description": "CCNet去重方法,基于SHA-1哈希算法的前N位进行重复识别,实现精确去重。\n\n初始化参数:\n- bit_length: 哈希值的位数,默认为64位\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "bit_length", + "default": 64, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_keys", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "minhash_deduplicated_label", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 155, + "name": "DebertaV3SampleEvaluator", + "description": "基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量并返回分类结果。\n输入参数:\n- model_name:预训练模型名称\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n- output_key:输出分类结果字段名,默认为'Debertav3Score'\n输出参数:\n- 包含文本质量分类结果的DataFrame", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_name", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "Debertav3Score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 156, + "name": "DebertaV3Filter", + "description": "基于DebertaV3Scorer打分器的得分对数据进行过滤。使用Nvidia Deberta V3模型的质量分类器评估文本质量。\n\n初始化参数:\n- allowed_scores: 允许通过的分数列表,默认为['Medium', 'High']\n- model_name: 模型名称,默认为'nvidia/quality-classifier-deberta'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n- batch_size: 批处理大小,默认为16\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'Debertav3Score'\n\n过滤逻辑:保留分类结果在allowed_scores列表中的数据", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "allowed_scores", + "default": [ + "Medium", + "High" + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "nvidia/quality-classifier-deberta", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "batch_size", + "default": 16, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "Debertav3Score", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 157, + "name": "FineWebEduSampleEvaluator", + "description": "基于Fineweb-Edu分类器评估文本的教育价值。该分类器使用预训练的序列分类模型对文本进行评估,返回0-1之间的分数,分数越高表示文本的教育价值越高。适用于筛选具有教育意义的文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 0-1之间的教育价值分数,越高表示教育价值越大", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "FinewebEduScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 158, + "name": "FineWebEduFilter", + "description": "基于FineWebEduScorer打分器的得分对数据进行过滤。Fineweb-Edu是一个用于评估文本教育价值的分类器。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'FinewebEduScore'\n\n评分标准:0-5分,分数越高表示文本具有越高的教育价值\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 2.5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10000, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "FinewebEduScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 159, + "name": "PairQualSampleEvaluator", + "description": "基于BGE模型和GPT成对比较数据训练的文本质量评分器,支持中英文输入。通过对文本进行单样本评估,返回0-1之间的质量分数,分数越高表示文本质量越好。模型分为英文版本(zks2856/PairQual-Scorer-en)和中文版本(zks2856/PairQual-Scorer-zh)。\n输入参数:\n- text: 待评估的文本字符串\n- lang: 语言类型,可选'en'或'zh'\n输出参数:\n- float: 0-1之间的质量分数,越高表示质量越好", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PairQualScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 160, + "name": "PairQualFilter", + "description": "基于PairQualScorer打分器的得分对数据进行过滤。基于BGE模型,使用GPT对文本成对比较打分后训练而成的双语文本质量评分器,得分越高表示质量越高。\n输入参数:\n- min_score:最小质量得分阈值\n- max_score:最大质量得分阈值\n- model_cache_dir:模型缓存目录路径\n- lang:文本语言类型\n输出参数:\n- 过滤后的DataFrame,仅保留质量得分在指定范围内的文本\n- 返回包含质量得分字段名的列表", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10000, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "lang", + "default": "en", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PairQualScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 161, + "name": "PerplexitySampleEvaluator", + "description": "基于Huggingface语言模型计算文本的困惑度(Perplexity),困惑度越低表示文本的流畅性和可理解性越高。输入参数:\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- float: 困惑度值,越低表示文本流畅性越好", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_name", + "default": "gpt2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 162, + "name": "PerplexityFilter", + "description": "基于PerplexityScorer打分器的得分对数据进行过滤。基于Huggingface模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。\n输入参数:\n- min_score:最小困惑度阈值\n- max_score:最大困惑度阈值\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- 过滤后的DataFrame,仅保留困惑度在指定范围内的文本\n- 返回包含困惑度得分字段名的列表", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 10.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 500.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_name", + "default": "gpt2", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "PerplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 163, + "name": "QuratingSampleEvaluator", + "description": "通过Qurating模型(princeton-nlp/QuRater-1.3B)从四个维度评估文本质量:写作风格(writing_style)、所需专业程度(required_expertise)、事实与趣闻(facts_and_trivia)和教育价值(educational_value)。每个维度返回0-1之间的分数,综合评估文本的整体质量。\n输入参数:\n- text: 待评估的文本字符串\n- labels: 评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n输出参数:\n- dict: 包含各维度分数的字典,键为维度名称,值为0-1之间的分数", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "map_batch_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_workers", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device_batch_size", + "default": 16, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "labels", + "default": [ + "writing_style", + "required_expertise", + "facts_and_trivia", + "educational_value" + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 164, + "name": "QuratingFilter", + "description": "基于QuratingScorer打分器的得分对数据进行过滤。通过Qurating模型从四个维度评估文本质量:写作风格、所需专业知识、事实与 trivia 内容、教育价值。\n每个维度评分范围为0-9分,综合判断文本质量,可用于筛选高质量教育类或知识类内容。\n输入参数:\n- min_scores:各维度保留样本的最小分数阈值,默认为{'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n- max_scores:各维度保留样本的最大分数阈值,默认为{'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n- map_batch_size:映射批次大小,默认为512\n- num_workers:数据加载工作进程数,默认为1\n- device_batch_size:设备批次大小,默认为16\n- device:模型运行设备,默认为'cuda'\n- labels:评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留所有维度分数均在对应阈值范围内的样本\n- 返回包含各维度过滤结果字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_scores", + "default": { + "writing_style": 0, + "required_expertise": 0, + "facts_and_trivia": 0, + "educational_value": 0 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_scores", + "default": { + "writing_style": 9, + "required_expertise": 9, + "facts_and_trivia": 9, + "educational_value": 9 + }, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "map_batch_size", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_workers", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device_batch_size", + "default": 16, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "labels", + "default": [ + "writing_style", + "required_expertise", + "facts_and_trivia", + "educational_value" + ], + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 165, + "name": "TextbookSampleEvaluator", + "description": "基于FastText分类器(kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2)评估文本的教育价值,将文本分为低(Low)、中(Mid)、高(High)三个等级,并映射为1.0、3.0、5.0的分数。适用于筛选适合作为教材的高质量文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 教育价值分数,可能值为1.0(低)、3.0(中)、5.0(高)", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TextbookScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 166, + "name": "TextbookFilter", + "description": "基于TextbookScorer打分器的得分对数据进行过滤。使用FastText分类器评估文本的教育价值,判断文本是否适合作为教材内容。\n分类器经过训练可识别具有教育意义、结构清晰、知识准确的文本,适用于构建教育类数据集。\n输入参数:\n- min_score:保留样本的最小教育价值分数阈值,默认为0.99\n- max_score:保留样本的最大教育价值分数阈值,默认为1.0\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n- output_key:教育价值分数字段名,默认为'TextbookScore'\n输出参数:\n- 过滤后的DataFrame,仅保留教育价值分数在[min_score, max_score]范围内的样本\n- 返回包含教育价值分数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_pt", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.99, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TextbookScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 167, + "name": "Phi4QAGenerator", + "description": "基于给定文档内容,生成预训练格式的多轮对话问答数据。将原始文档内容转换为适合语言模型预训练的对话格式数据。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入文档内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含原始内容和生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", + "type": { + "level_1": "text_pt", + "level_2": "generate" + }, + "allowed_prompts": [ + "Phi4QAGeneratorPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "generated_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 168, + "name": "MetaSampleEvaluator", + "description": "通过LLM评估文本的多个元属性,包括文本结构、多样性与复杂性、流畅性与可理解性、安全性、教育价值以及内容准确性与有效性。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimensions:评估维度列表,每个维度对应的字典中包含dimension_name,description,和示例字段:\n * dimension_name:维度名称\n * description:维度的描述\n * example_list:包含示例文本和得分的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含6个评估维度得分的DataFrame,列名为:Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness", + "type": { + "level_1": "text_pt", + "level_2": "eval" + }, + "allowed_prompts": [ + "MetaPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dimensions", + "default": [ + { + "dimension_name": "Text Structure", + "description": "Evaluate the surface-level quality of the text, including spelling accuracy, grammar, vocabulary richness, and sentence structure.", + "example_list": [ + { + "text": "The experimental procedure was meticulously documented, with each variable clearly defined.", + "score": "5" + }, + { + "text": "teh data was wrong and we dont no why it happen like that", + "score": "2" + } + ] + }, + { + "dimension_name": "Diversity and Complexity", + "description": "Assess how rich and conceptually varied the content is, and whether it requires expert or deep reasoning to understand.", + "example_list": [ + { + "text": "This article compares Bayesian inference and frequentist approaches in statistical modeling, highlighting theoretical and practical trade-offs.", + "score": "5" + }, + { + "text": "Dogs are pets. They bark. They are friendly.", + "score": "2" + } + ] + }, + { + "dimension_name": "Fluency and Understandability", + "description": "Evaluate whether the text flows naturally, is easy to follow, and avoids awkward or disjointed phrasing.", + "example_list": [ + { + "text": "Despite initial challenges, the team successfully completed the deployment by adhering to a revised strategy.", + "score": "5" + }, + { + "text": "The problem was and then fixed by something happens deployment successful maybe.", + "score": "2" + } + ] + }, + { + "dimension_name": "Safety", + "description": "Identify whether the text contains profanities, hate speech, or excessive personally identifiable information (PII).", + "example_list": [ + { + "text": "The software collects anonymous usage data to improve performance.", + "score": "5" + }, + { + "text": "You idiot, your address 123 Main St will be posted online.", + "score": "1" + } + ] + }, + { + "dimension_name": "Educational Value", + "description": "Determine whether the text provides insight, stimulates thinking, or offers meaningful learning potential.", + "example_list": [ + { + "text": "Understanding the principles of thermodynamics allows engineers to design more efficient engines.", + "score": "5" + }, + { + "text": "The sky is blue. Water is wet. This is how it is.", + "score": "2" + } + ] + }, + { + "dimension_name": "Content Accuracy and Effectiveness", + "description": "Assess the truthfulness, relevance, and practical usefulness of the content.", + "example_list": [ + { + "text": "Newton's second law states that F = ma, which explains the relationship between force, mass, and acceleration.", + "score": "5" + }, + { + "text": "The Earth is flat and doesn't rotate around the Sun.", + "score": "1" + } + ] + } + ], + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 169, + "name": "AlpagasusSampleEvaluator", + "description": "通过调用GPT评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimension:评估维度,默认为'quality'\n- input_instruction_key:指令字段名\n- input_input_key:输入文本字段名\n- input_output_key:输出文本字段名\n- output_key:输出得分字段名,默认'AlpagasusScore'\n输出参数:\n- 包含评估得分的DataFrame", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [ + "AlpagasusPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dimension", + "default": "quality", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "AlpagasusScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 170, + "name": "DeitaQualitySampleEvaluator", + "description": "基于Llama模型的Deita指令质量评估器,通过生成1-6分的质量评分评估指令质量。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaQualityScore'\n输出参数:\n- 包含指令质量评分的DataFrame(1-6分)", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaQualityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 171, + "name": "DeitaComplexitySampleEvaluator", + "description": "基于Llama模型的Deita指令复杂性评估器,通过生成1-6分的复杂性评分评估指令难度。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaComplexityScore'\n输出参数:\n- 包含指令复杂性评分的DataFrame(1-6分)", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaComplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 172, + "name": "InstagSampleEvaluator", + "description": "使用Instag评分器评估指令的内容多样性和意图标签。通过分析指令文本生成相关标签,标签数量越多表示内容多样性越大,同时返回标签的详细解释。基于OFA-Sys/InsTagger模型实现。\n输入参数:\n- query: 待评估的指令文本\n输出参数:\n- int: 标签数量(内容多样性指标)\n- list: 包含标签和解释的字典列表", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_new_tokens", + "default": 1024, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "temperature", + "default": 0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "do_sample", + "default": false, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_return_sequences", + "default": 1, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "return_dict_in_generate", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "InstagScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 173, + "name": "RMSampleEvaluator", + "description": "基于人类偏好数据训练的奖励模型(OpenAssistant/reward-model-deberta-v3-large-v2)对文本质量进行打分,高分代表质量较高。模型输入为指令和响应文本对,输出0-1之间的奖励分数,反映人类对文本质量的偏好判断。\n输入参数:\n- instruction: 指令文本字符串\n- output: 响应文本字符串\n输出参数:\n- float: 0-1之间的奖励分数,越高表示质量越好", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "RMScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 174, + "name": "SuperfilteringSampleEvaluator", + "description": "使用Superfiltering方法评估指令的跟随难度,基于GPT-2模型计算条件困惑度与独立困惑度的比值,得分越高表示指令越难跟随。该方法通过比较指令条件下的响应困惑度与独立响应困惑度,评估指令的清晰度和跟随难度。\n输入参数:\n- instruction: 指令文本\n- input_text: 输入文本(可选)\n- output: 响应文本\n输出参数:\n- float: 困惑度比值,越高表示指令跟随难度越大", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "SuperfilteringScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 175, + "name": "TreeinstructSampleEvaluator", + "description": "通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:指令字段名\n- output_key:输出得分字段名,默认'TreeinstructScore'\n输出参数:\n- 包含指令复杂性得分的DataFrame", + "type": { + "level_1": "text_sft", + "level_2": "eval" + }, + "allowed_prompts": [ + "TreeinstructPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TreeinstructScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 176, + "name": "AlpagasusFilter", + "description": "基于AlpagasusScorer打分器的得分对数据进行过滤。通过调用GPT模型评估指令的质量,返回一个质量得分。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3\n- max_score: 最高分数阈值,默认为5\n- llm_serving: LLM服务实例\n- dimension: 评估维度,默认为'quality'(质量)\n\n运行参数:\n- input_instruction_key: 输入指令字段名\n- input_input_key: 输入内容字段名\n- input_output_key: 输出内容字段名\n- output_key: 输出分数字段名,默认为'AlpagasusScore'\n\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 3, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "dimension", + "default": "quality", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "AlpagasusScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 177, + "name": "DeitaQualityFilter", + "description": "基于DeitaQualityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令质量评估器,评估指令的质量高低。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaQualityScore'\n\n评分标准:1-6分,分数越高表示指令质量越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 2.5, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 10000.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaQualityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 178, + "name": "DeitaComplexityFilter", + "description": "基于DeitaComplexityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令复杂性评估器,评估指令的复杂程度。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3.0\n- max_score: 最高分数阈值,默认为5.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaComplexityScore'\n\n评分标准:1-6分,分数越高表示指令复杂性越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 3.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 5.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "DeitaComplexityScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 179, + "name": "InstagFilter", + "description": "基于InstagScorer打分器的过滤算子。使用预训练的Instag模型对指令进行分析,返回标签的数量来评估指令的内容多样性。参数包括模型缓存目录(model_cache_dir)、计算设备(device)和最大新生成标记数(max_new_tokens)。过滤范围由min_score和max_score参数控制,标签越多表示内容多样性越大。", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_new_tokens", + "default": 1024, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "InstagScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 180, + "name": "RMFilter", + "description": "基于RMScorer打分器的得分对数据进行过滤。使用基于人类偏好数据训练的奖励模型对文本质量进行评分,高分代表质量较高。\n奖励模型能够评估文本的相关性、有用性、无害性等人类偏好指标,可用于筛选符合人类价值观的高质量文本。\n输入参数:\n- min_score:保留样本的最小奖励分数阈值,默认为0.2\n- max_score:保留样本的最大奖励分数阈值,默认为0.8\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_output_key:输出字段名,默认为'output'\n输出参数:\n- 过滤后的DataFrame,仅保留奖励分数在[min_score, max_score]范围内的样本\n- 返回包含奖励分数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.2, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 0.8, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "RMScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 181, + "name": "SuperfilteringFilter", + "description": "使用Superfiltering评分器过滤掉低质量数据。基于GPT-2模型计算困惑度比值来评估指令跟随难度,比值越低表示指令越容易被模型理解和执行。\n适用于筛选适合特定模型能力的指令数据,提高模型训练效率和效果。\n输入参数:\n- min_score:保留样本的最小分数阈值,默认为0.0\n- max_score:保留样本的最大分数阈值,默认为1.0\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:文本最大长度,默认为512\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_input_key:输入字段名,默认为'input'\n- input_output_key:输出字段名,默认为'output'\n- output_key:过滤结果分数字段名,默认为'SuperfilteringScore'\n输出参数:\n- 过滤后的DataFrame,仅保留分数在[min_score, max_score]范围内的样本\n- 返回包含过滤结果分数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 0.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 1.0, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "device", + "default": "cuda", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "model_cache_dir", + "default": "./dataflow_cache", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_length", + "default": 512, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_input_key", + "default": "input", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "SuperfilteringScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 182, + "name": "TreeinstructFilter", + "description": "基于TreeinstructScore打分器的得分对数据进行过滤。通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n适用于筛选特定复杂度范围内的指令数据,平衡数据集难度分布,优化模型训练效果。\n输入参数:\n- min_score:保留样本的最小语法树节点数阈值,默认为7\n- max_score:保留样本的最大语法树节点数阈值,默认为100\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入指令字段名\n- output_key:语法树节点数字段名,默认为'TreeinstructScore'\n输出参数:\n- 过滤后的DataFrame,仅保留语法树节点数在[min_score, max_score]范围内的样本\n- 返回包含语法树节点数字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "filter" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "min_score", + "default": 7, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "max_score", + "default": 100, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_key", + "default": "TreeinstructScore", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 183, + "name": "CondorGenerator", + "description": "基于预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量)。第一阶段生成不同难度级别的问题,第二阶段为每个问题生成对应的答案。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_samples:生成样本总数,建议小于5000,默认值为15\n输出参数:\n- 包含'difficulty'、'instruction'和'output'字段的DataFrame\n- 返回生成的DataFrame用于后续处理", + "type": { + "level_1": "text_sft", + "level_2": "generate" + }, + "allowed_prompts": [ + "CondorQuestionPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "num_samples", + "default": 15, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "use_task_diversity", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 184, + "name": "SFTGeneratorSeed", + "description": "基于给定文档内容,生成监督微调格式的问答数据。并支持用户自定义生成内容要求。从原始文档中提取信息,生成符合SFT格式的指令-响应对。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- custom_prompt:用户自定义提示词\n- input_key:输入文档内容字段名,默认为'raw_content'\n- max_tokens:生成文本的最大token数,默认为4096\n输出参数:\n- 包含'instruction'、'output'和'raw_content'字段的DataFrame\n- 返回包含'instruction'和'output'字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "custom_prompt", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_key", + "default": "raw_content", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 185, + "name": "CondorRefiner", + "description": "两阶段优化指令回复质量:第一阶段调用API生成对回复的评论,第二阶段利用评论调用API改写回复,提升指令对质量。通过迭代优化提高问答对的整体质量。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:输入指令字段名,默认为'instruction'\n- input_output_key:输入回复字段名,默认为'output'\n输出参数:\n- 包含优化后回复的DataFrame\n- 返回包含优化后回复字段名的列表,用于后续算子引用", + "type": { + "level_1": "text_sft", + "level_2": "refine" + }, + "allowed_prompts": [ + "CondorCritiquePrompt", + "CondorRefinePrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_instruction_key", + "default": "instruction", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_output_key", + "default": "output", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 186, + "name": "VQAExtractPdf2Img", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "dpi", + "default": 300, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_pdf_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 187, + "name": "VQAExtractDocLayoutMinerU", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "mineru_backend", + "default": "vlm-transformers", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_pdf_file_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 188, + "name": "VQAExtractPicExtractor", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [ + "VQAExtractPrompt" + ], + "parameter": { + "init": [ + { + "name": "llm_serving", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "interleaved", + "default": true, + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_subject", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 189, + "name": "VQAExtractQAPairExtractor", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_vqa_extract_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_qa_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 190, + "name": "VQAExtractTag2Img", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [ + { + "name": "layout_prefix", + "default": "doclay_page_", + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "image_prefix", + "default": "page_", + "kind": "POSITIONAL_OR_KEYWORD" + } + ], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_json", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_pdf_image_dir", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_dir", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_qa_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_qa_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_md_file", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 191, + "name": "VQAClipHeader", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_image_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_path", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_layout_prefix", + "default": "doclay", + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + }, + { + "node": 192, + "name": "VQAConcatenateImages", + "description": "N/A (非 staticmethod)", + "type": { + "level_1": "vqa", + "level_2": "generate" + }, + "allowed_prompts": [], + "parameter": { + "init": [], + "run": [ + { + "name": "storage", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "input_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + }, + { + "name": "output_image_folder", + "default": null, + "kind": "POSITIONAL_OR_KEYWORD" + } + ] + }, + "required": "", + "depends_on": [], + "mode": "" + } + ] +} \ No newline at end of file From 28916fda67b33148aad6742660c25b17f4871a24 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 1 Dec 2025 13:38:42 +0800 Subject: [PATCH 07/10] 1201 first --- backend/app/schemas/pipelines.py | 3 +-- backend/app/services/pipeline_registry.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/schemas/pipelines.py b/backend/app/schemas/pipelines.py index 101d344..3188a72 100644 --- a/backend/app/schemas/pipelines.py +++ b/backend/app/schemas/pipelines.py @@ -36,7 +36,7 @@ class PipelineOperator(OperatorDetailSchema): # 画布上的pipeline类 """Pipeline算子模型""" name: str = Field(..., description="算子名称") params: Dict[str, Any] = Field(default_factory=dict, description="算子参数配置") - + location: tuple[int, int] = Field(default=(0, 0), description="算子在画布上的位置, 包含x和y两个坐标值") # @field_validator('name') # def validate_operator_name(cls, v: str) -> str: # """验证算子名称格式""" @@ -51,7 +51,6 @@ class PipelineConfig(BaseModel): input_dataset: str = Field(..., description="输入数据集ID") # 用 list 的顺序代表算子执行顺序 operators: List[PipelineOperator] = Field(default_factory=list, description="算子执行序列") - run_config: Dict[str, Any] = Field(default_factory=dict, description="运行时配置参数") @field_validator('operators') def validate_operators(cls, v: List[PipelineOperator]) -> List[PipelineOperator]: diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index c627bb9..9e3a838 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -50,7 +50,8 @@ def _ensure(self): "config": { "file_path": os.path.join(api_pipelines_dir, filename), "module_name": f"{settings.DataFlow_CORE_DIR.replace('/', '.')}.api_pipelines.{filename[:-3]}", - "input_dataset": "" + "input_dataset": "", + "operators": [], }, "tags": ["api"], "created_at": current_time, From 5d7365bc422079a95927d9d3b2d7864ed0fe7e12 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 1 Dec 2025 16:42:58 +0800 Subject: [PATCH 08/10] pipeline registry --- backend/app/schemas/pipelines.py | 27 ++++++++++++----------- backend/app/services/pipeline_registry.py | 1 - 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/backend/app/schemas/pipelines.py b/backend/app/schemas/pipelines.py index 3188a72..e508354 100644 --- a/backend/app/schemas/pipelines.py +++ b/backend/app/schemas/pipelines.py @@ -2,6 +2,7 @@ from typing import List, Dict, Any, Optional, Union from pydantic import BaseModel, Field, field_validator from app.schemas.operator import OperatorDetailSchema +from dataflow.utils.storage import FileStorage class Pipeline(str, Enum): """Pipeline类型枚举""" @@ -45,19 +46,19 @@ class PipelineOperator(OperatorDetailSchema): # 画布上的pipeline类 # # 后续可以补充从可用算子集中验证算子名称是否存在 # return v - class PipelineConfig(BaseModel): """Pipeline配置模型""" + file_path: str = Field(..., description="Pipeline文件路径") input_dataset: str = Field(..., description="输入数据集ID") # 用 list 的顺序代表算子执行顺序 operators: List[PipelineOperator] = Field(default_factory=list, description="算子执行序列") - @field_validator('operators') - def validate_operators(cls, v: List[PipelineOperator]) -> List[PipelineOperator]: - """确保至少有一个算子""" - if not v: - raise ValueError('Pipeline must have at least one operator') - return v + # @field_validator('operators') + # def validate_operators(cls, v: List[PipelineOperator]) -> List[PipelineOperator]: + # """确保至少有一个算子""" + # if not v: + # raise ValueError('Pipeline must have at least one operator') + # return v class PipelineIn(BaseModel): @@ -86,12 +87,12 @@ class PipelineExecutionRequest(BaseModel): pipeline_id: Optional[str] = Field(None, description="预定义Pipeline ID") config: Optional[PipelineConfig] = Field(None, description="自定义Pipeline配置") - @field_validator('pipeline_id', 'config') - def validate_at_least_one(cls, v, info): - """确保至少提供pipeline_id或config之一""" - if info.data.get('pipeline_id') is None and info.data.get('config') is None: - raise ValueError('Either pipeline_id or config must be provided') - return v + # @field_validator('pipeline_id', 'config') + # def validate_at_least_one(cls, v, info): + # """确保至少提供pipeline_id或config之一""" + # if info.data.get('pipeline_id') is None and info.data.get('config') is None: + # raise ValueError('Either pipeline_id or config must be provided') + # return v class PipelineExecutionResult(BaseModel): diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index 9e3a838..aa08aa3 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -49,7 +49,6 @@ def _ensure(self): "name": filename[:-3].replace("_", " ").title(), "config": { "file_path": os.path.join(api_pipelines_dir, filename), - "module_name": f"{settings.DataFlow_CORE_DIR.replace('/', '.')}.api_pipelines.{filename[:-3]}", "input_dataset": "", "operators": [], }, From 040f9f9c950f5c4a3c09f74d6c62612abb76e88f Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 1 Dec 2025 16:51:45 +0800 Subject: [PATCH 09/10] remove resources --- backend/resources/ops.json | 18050 ----------------------------------- 1 file changed, 18050 deletions(-) delete mode 100644 backend/resources/ops.json diff --git a/backend/resources/ops.json b/backend/resources/ops.json deleted file mode 100644 index 231f235..0000000 --- a/backend/resources/ops.json +++ /dev/null @@ -1,18050 +0,0 @@ -{ - "agentic_rag": [ - { - "node": 1, - "name": "AgenticRAGQAF1SampleEvaluator", - "description": "用于评估预测答案与多个参考答案之间的 F1 分数", - "type": { - "level_1": "agentic_rag", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_prediction_key", - "default": "refined_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_ground_truth_key", - "default": "golden_doc_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "F1Score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 2, - "name": "AgenticRAGAtomicTaskGenerator", - "description": "该算子用于为提供的文本内容生成合适的高质量问题与可验证答案。\n\n输入参数:\n- input_key: 输入文本内容字段名(默认值:\"prompts\")\n- output_question_key: 输出问题字段名(默认值:\"question\")\n- output_answer_key: 输出答案字段名(默认值:\"answer\")\n- output_refined_answer_key: 输出精炼答案字段名(默认值:\"refined_answer\")\n- output_optional_answer_key: 输出可替代精炼答案字段名(默认值:\"optional_answer\")\n- output_golden_doc_answer_key: 输出黄金文档回答字段名(默认值:\"golden_doc_answer\")\n", - "type": { - "level_1": "agentic_rag", - "level_2": "generate" - }, - "allowed_prompts": [ - "AtomicTaskGeneratorGetIdentifierPrompt", - "AtomicTaskGeneratorGetConlcusionPrompt", - "AtomicTaskGeneratorQuestionPrompt", - "AtomicTaskGeneratorCleanQAPrompt", - "AtomicTaskGeneratorAnswerPrompt", - "AtomicTaskGeneratorRecallScorePrompt", - "AtomicTaskGeneratorOptionalAnswerPrompt", - "AtomicTaskGeneratorGoldenDocAnswerPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "data_num", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_per_task", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_question", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "prompts", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_key", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_refined_answer_key", - "default": "refined_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_optional_answer_key", - "default": "optional_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_llm_answer_key", - "default": "llm_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_golden_doc_answer_key", - "default": "golden_doc_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 3, - "name": "AgenticRAGDepthQAGenerator", - "description": "该算子以已有问答生成更深度的问题。\n\n输入参数:\n- input_key: 输入字段名(默认值:\"question\")\n- output_key: 输出字段名(默认值:\"depth_question\")\n", - "type": { - "level_1": "agentic_rag", - "level_2": "generate" - }, - "allowed_prompts": [ - "DepthQAGeneratorGetIdentifierPrompt", - "DepthQAGeneratorBackwardTaskPrompt", - "DepthQAGeneratorSupersetCheckPrompt", - "DepthQAGeneratorQuestionPrompt", - "DepthQAGeneratorAnswerPrompt", - "DepthQAGeneratorRecallScorePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "n_rounds", - "default": 2, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "depth_question", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 4, - "name": "AgenticRAGWidthQAGenerator", - "description": "该算子用于结合两个问答,生成新的问题。\n\n输入参数:\n- input_question_key: 输入问题字段名(默认值:\"question\")\n- input_identifier_key: 输入标识符字段名(默认值:\"identifier\")\n- input_answer_key: 输入答案字段名(默认值:\"answer\")\n- output_question_key: 输出问题字段名(默认值:\"generated_width_task\")\n", - "type": { - "level_1": "agentic_rag", - "level_2": "generate" - }, - "allowed_prompts": [ - "WidthQAGeneratorMergePrompt", - "WidthQAGeneratorOriginCheckPrompt", - "WidthQAGeneratorQuestionVerifyPrompt", - "WidthQAGeneratorAnswerPrompt", - "WidthQAGeneratorRecallScorePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_identifier_key", - "default": "identifier", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_key", - "default": "generated_width_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "chemistry": [ - { - "node": 5, - "name": "ExtractSmilesFromTextGenerator", - "description": "ExtractSmilesFromText 用于从 OCR 文本中抽取或解析化学分子的 SMILES 表达式。算子会根据给定的提示模板(prompt_template),结合文本内容和(可选的)单体缩写信息,调用大语言模型完成解析与结构化,并将结果以 JSON 格式写回到指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- prompt_template:提示词模板对象,用于构造模型输入\n- input_content_key: OCR 文本的列名(默认 'text')\n- input_abbreviation_key:包含缩写/单体信息的列名(默认 'abbreviations'),可为空\n- output_key:写回抽取结果的列名(默认 'synth_smiles')\n\n输出参数:\n- DataFrame,其中 output_key 列为模型返回并经 JSON 解析后的 SMILES 结构\n- 返回 output_key,供后续算子引用\n\n备注:\n- 模型输出会尝试解析为 JSON;若解析失败,将返回 [] 并记录失败次数。", - "type": { - "level_1": "chemistry", - "level_2": "generate" - }, - "allowed_prompts": [ - "ExtractSmilesFromTextPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_content_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_abbreviation_key", - "default": "abbreviations", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "synth_smiles", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 6, - "name": "SmilesEquivalenceDatasetEvaluator", - "description": "评估 golden_label 与 synth_smiles 的 SMILES 等价性并计算分数。逐块输出 final_result、块内得分与准确率,并统计全局总分。", - "type": { - "level_1": "chemistry", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_golden_key", - "default": "golden_label", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_synth_key", - "default": "synth_smiles", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "final_result", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "code": [ - { - "node": 7, - "name": "CodeAutoGeneratedSampleEvaluator", - "description": "基于自动生成标记评估代码样本,检测文件头部的自动生成标记。\n\n评估指标:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分 (0-1,1表示非自动生成)\n\n输入要求:需要包含'lines'列\n\n输出参数:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "is_generated_func", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 8, - "name": "CodeAutoGeneratedFilter", - "description": "基于CodeAutoGeneratedSampleEvaluator的得分过滤自动生成的代码文件,确保只保留人工编写的代码。\n\n评估指标:\n- 自动生成标记数量:检测文件前5行中的自动生成标记\n- 检测标记:'auto-generated', 'autogenerated', 'automatically generated'等\n- 综合自动生成得分:0-1,1表示非自动生成\n- 支持外部检测函数进行额外验证\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'列)\n- output_key: 输出标签字段名 (默认: 'auto_generated_filter_label')\n- min_score: 最小自动生成得分阈值 (默认: 1.0)\n- max_score: 最大自动生成得分阈值 (默认: 1.0)\n- is_generated_func: 可选的外部检测函数\n\n输出参数:\n- 过滤后的DataFrame,仅保留自动生成得分在指定范围内的代码样本\n- 返回包含自动生成得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "is_generated_func", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "auto_generated_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 9, - "name": "CodeDocumentQualitySampleEvaluator", - "description": "基于综合文档级质量指标评估代码样本,包括内容长度、重复模式、字符组成和文本熵值。\n\n评估指标:\n- CodeDocumentQualityCharCount: 字符数\n- CodeDocumentQualityWordCount: 词数\n- CodeDocumentQualityDuplicateLinesRatio: 重复行比例\n- CodeDocumentQualityDuplicateNgramRatio: n-gram重复比例\n- CodeDocumentQualityCurlyBracketRatio: 花括号比例\n- CodeDocumentQualityAllCapsRatio: 全大写单词比例\n- CodeDocumentQualityEntropy: 单字符熵值\n- CodeDocumentQualityScore: 综合文档质量得分 (0-1,1表示通过所有质量检查)\n\n输入要求:需要包含'text'、'filename'、'language'列\n\n输出参数:\n- 各种质量指标的数值\n- CodeDocumentQualityScore: 综合文档质量得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "thresholds", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 10, - "name": "CodeDocumentQualityFilter", - "description": "基于CodeDocumentQualitySampleEvaluator的得分应用综合文档级质量过滤规则,移除低质量代码和文本样本。\n\n评估指标:\n- 内容长度:字符数、词数、行数范围检查\n- 重复模式:重复行比例、2-10gram重复比例\n- 字符组成:花括号比例、全大写单词比例\n- 文本熵值:单字符熵值检查\n- 综合文档质量得分:0-1,1表示通过所有质量检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'、'filename'、'language'列)\n- output_key: 输出标签字段名 (默认: 'doc_quality_filter_label')\n- min_score: 最小文档质量得分阈值 (默认: 1.0)\n- max_score: 最大文档质量得分阈值 (默认: 1.0)\n- thresholds: 可选的阈值字典,用于覆盖默认阈值\n\n输出参数:\n- 过滤后的DataFrame,仅保留文档质量得分在指定范围内的样本\n- 返回包含文档质量得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "thresholds", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "doc_quality_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 11, - "name": "CodeEncodedDataSampleEvaluator", - "description": "基于编码数据模式评估代码样本,检测Base64、十六进制和Unicode转义序列。\n\n评估指标:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分 (0-1,1表示通过编码数据检查)\n\n输入要求:需要包含'text'列\n\n输出参数:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 12, - "name": "CodeEncodedDataFilter", - "description": "基于CodeEncodedDataSampleEvaluator的得分过滤代码样本,移除二进制内容和自动生成代码。\n\n评估指标:\n- Base64编码数据比例:检测连续64+字符的Base64字符串\n- 十六进制数据比例:检测8+个连续的十六进制对\n- Unicode转义序列比例:检测8+个连续的\\uXXXX序列\n- 综合编码数据得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'列)\n- output_key: 输出标签字段名 (默认: 'encoded_data_filter_label')\n- min_score: 最小编码数据得分阈值 (默认: 1.0)\n- max_score: 最大编码数据得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留编码数据得分在指定范围内的代码样本\n- 返回包含编码数据得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "encoded_data_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 13, - "name": "CodeFileTypeContentFilter", - "description": "基于文件类型和内容特征直接过滤代码样本,针对不同文件格式应用特定规则。\n\n过滤规则:\n- Text/JSON/YAML/Graphviz文件:行数 > 512 行\n- HTML文件:可见文本长度 < 100字符 或 可见文本比例 < 20%\n- Text文件:文件名不符合文档规范(非readme/notes/todo等)\n\n输入参数:\n- input_key: 输入字段名(需要包含'filetype'、'filename'、'line_count'等列)\n- output_key: 输出标签字段名 (默认: 'file_type_content_filter_label')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合文件类型规则的样本\n- 返回包含输出标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "file_type_content_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 14, - "name": "CodeLengthSampleEvaluator", - "description": "基于代码长度特征评估代码样本,分析总行数、平均行长和最大行长。\n\n评估指标:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分 (0-1,1表示通过所有长度检查)\n\n输入要求:需要包含'lines'和'language'列\n\n输出参数:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 15, - "name": "CodeLengthSampleFilter", - "description": "基于CodeLengthSampleEvaluator的得分过滤代码样本,移除超大文件和格式不良的代码。\n\n评估指标:\n- 总行数:检查是否超过100,000行\n- 平均行长:普通语言>100字符,特殊语言>100,000字符\n- 最大行长:普通语言>1,000字符\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'和'language'列)\n- output_key: 输出标签字段名 (默认: 'length_filter_label')\n- min_score: 最小长度得分阈值 (默认: 1.0)\n- max_score: 最大长度得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留长度得分在指定范围内的代码样本\n- 返回包含长度得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "length_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 16, - "name": "CodeQualitySampleEvaluator", - "description": "该算子用于评估生成的代码片段与其源指令的匹配质量,并输出分数和反馈。\n\n输入参数:\n- input_instruction_key: 包含人类指令的字段名 (默认: 'generated_instruction')\n- input_code_key: 包含生成代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_score_key: 用于存储质量分数的字段名 (默认: 'quality_score')\n- output_feedback_key: 用于存储质量反馈的字段名 (默认: 'quality_feedback')\n", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [ - "CodeQualityEvaluatorPrompt", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_code_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_score_key", - "default": "quality_score", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_feedback_key", - "default": "quality_feedback", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 17, - "name": "CodeQualityScoreFilter", - "description": "基于LLM生成的代码质量分数过滤代码样本,评估正确性、完整性、清晰度、最佳实践和效率。\n\n评估维度:\n- 正确性:代码语法和逻辑是否正确\n- 完整性:代码是否完整实现功能\n- 清晰度:代码是否清晰易懂\n- 最佳实践:是否遵循编程最佳实践\n- 效率:代码执行效率如何\n\n输入参数:\n- input_code_key: 输入代码字段名\n- input_instruction_key: 输入指令字段名\n- output_score_key: 输出打分字段名 (默认: 'quality_score')\n- output_feedback_key: 输出反馈字段名 (默认: 'quality_feedback')\n- output_key: 输出过滤标签字段名 (默认: 'quality_score_filter_label')\n- min_score: 最小质量分数阈值 (默认: 7)\n- max_score: 最大质量分数阈值 (默认: 10)\n\n输出参数:\n- 过滤后的DataFrame,仅保留质量分数在指定范围内的代码样本\n- 返回包含质量分数标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_score", - "default": 7, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_code_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_score_key", - "default": "quality_score", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_feedback_key", - "default": "quality_feedback", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "quality_score_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 18, - "name": "CodeGenericScoreFilter", - "description": "基于数值分数列直接过滤数据集,提供灵活的阈值比较方法。\n\n比较方法:\n- greater_equal: 分数 >= 阈值\n- greater: 分数 > 阈值\n- less_equal: 分数 <= 阈值\n- less: 分数 < 阈值\n- equal: 分数 = 阈值\n\n输入参数:\n- input_key: 包含分数的字段名\n- output_key: 输出标签字段名 (默认: 'generic_score_filter_label')\n- score_threshold: 分数阈值 (默认: 8)\n- filter_method: 比较方法 (默认: 'greater_equal')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合分数条件的样本\n- 返回包含输出标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "score_threshold", - "default": 8, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "filter_method", - "default": "greater_equal", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generic_score_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 19, - "name": "CodeTextCompositionSampleEvaluator", - "description": "基于字符组成评估代码样本,分析字母字符和字母数字字符的比例。\n\n评估指标:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分 (0-1,1表示通过字符组成检查)\n\n输入要求:需要包含'text'和'language'列\n\n输出参数:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 20, - "name": "CodeTextCompositionFilter", - "description": "基于CodeTextCompositionSampleEvaluator的得分过滤代码样本,移除二进制文件、加密内容和不可读文本。\n\n评估指标:\n- 字母字符比例:普通语言需要>=25%\n- 字母数字字符比例:汇编语言需要>=25%\n- 综合字符组成得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'和'language'列)\n- output_key: 输出标签字段名 (默认: 'text_composition_filter_label')\n- min_score: 最小字符组成得分阈值 (默认: 1.0)\n- max_score: 最大字符组成得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留字符组成得分在指定范围内的代码样本\n- 返回包含字符组成得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "text_composition_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 21, - "name": "CodeCodeToInstructionGenerator", - "description": "该算子用于分析代码片段并反向生成可能产生该代码的人类指令。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeCodeToInstructionGeneratorPrompt", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "code", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_instruction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 22, - "name": "CodeInstructionToCodeGenerator", - "description": "该算子根据给定的人类指令生成相应的代码片段。\n\n输入参数:\n- input_key: 包含人类指令的字段名 (默认: 'instruction')\n输出参数:\n- output_key: 用于存储生成代码的字段名 (默认: 'generated_code')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeInstructionToCodeGeneratorPrompt", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_code", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 23, - "name": "CodeEnhancementInstructionGenerator", - "description": "该算子用于增强人类指令,将不同输出格式的任务统一为生成完整函数。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeInstructionEnhancement", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "messages", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_instruction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 24, - "name": "CodeInstructionGenerator", - "description": "该算子用于生成新的指令,从数据池中随机抽取few-shot样本,生成类似难度的指令。\n\n输入参数:\n- input_key: 包含原始指令的字段名 (默认: 'prompt')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeInstructionGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_few_shot", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_generate", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "prompt", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_instruction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 25, - "name": "CodeSandboxSampleEvaluator", - "description": "该算子在一个安全的沙箱环境中执行代码片段以验证其正确性。\n\n输入参数:\n- input_code_key: 包含待执行代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_status_key: 用于存储执行状态 ('PASS' 或 'FAIL') 的字段名 (默认: 'sandbox_status')\n- output_log_key: 用于存储执行日志或错误信息的字段名 (默认: 'sandbox_log')\n", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "language", - "default": "python", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "timeout_length", - "default": 15, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_process_isolation", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_status_key", - "default": "sandbox_status", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_log_key", - "default": "sandbox_log", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "conversations": [ - { - "node": 26, - "name": "ScenarioExtractGenerator", - "description": "从对话内容中提取场景信息,使用LLM服务分析对话并生成场景描述。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_chat_key:对话内容字段名\n- output_key:输出场景字段名,默认'scenario'\n输出参数:\n- 包含提取场景信息的DataFrame\n- 包含输出字段名的列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ExtractScenarioPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_chat_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "scenario", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 27, - "name": "ScenarioExpandGenerator", - "description": "基于原始场景生成新的替代场景,使用LLM服务重写或改写原有场景内容。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:原始场景字段名\n- output_key:生成的新场景字段名,默认'modified_scenario'\n输出参数:\n- 包含生成新场景的DataFrame\n- 包含输出字段名的列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ExpandScenarioPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_scenario_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "modified_scenario", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 28, - "name": "AtomTaskGenerator", - "description": "根据输入的场景信息,使用LLM服务生成对应的原子任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:场景字段名\n- output_key:原子任务的输出字段名,默认'atom_task'\n输出参数:\n- 包含原子任务的DataFrame\n- 包含输出字段名的列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "FuncAtomicTaskGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_scenario_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "atom_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 29, - "name": "SequentialTaskGenerator", - "description": "根据输入的原子任务,使用LLM服务生成该任务的后继任务和两者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含后继任务和组合任务的DataFrame\n- 输出字段名的列表(后继任务字段和组合任务字段)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "SequentialTaskGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_subsequent_task_key", - "default": "subsequent_task", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_composition_task_key", - "default": "composition_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 30, - "name": "ParaSeqTaskGenerator", - "description": "基于原子任务,使用LLM服务生成三个任务类型:并行任务、后继任务以及这三者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_parallel_task_key:并行任务输出字段名,默认'parallel_task'\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含并行任务、后继任务与组合任务的DataFrame\n- 输出字段名列表(并行任务、后继任务、组合任务)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ParathenSeqTaskGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_parallel_task_key", - "default": "parallel_task", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_subsequent_task_key", - "default": "subsequent_task", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_composition_task_key", - "default": "composition_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 31, - "name": "FunctionGenerator", - "description": "基于组合任务及其相关子任务,使用LLM服务生成对应的函数列表。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:函数列表输出字段名,默认'functions'\n输出参数:\n- 包含函数定义或函数列表的DataFrame\n- 输出字段名的列表(函数列表字段)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "FuncGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_composition_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sub_tasks_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "functions", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 32, - "name": "MultiTurnConversationGenerator", - "description": "根据组合任务及其子任务函数,使用LLM服务模拟多轮对话过程,由User、Assistant和Tool三个Agent协同生成完整的对话数据。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:任务字段名(组合任务)\n- input_sub_tasks_keys:子任务字段名列表\n- input_functions_key:子任务函数字段名\n- output_conversations_key:输出对话字段名,默认'conversations'\n输出参数:\n- 包含已完成的多轮对话记录的DataFrame\n- 输出字段名(对话字段名)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ConversationUserPrompt", - "ConversationAssistantPrompt", - "ConversationToolPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sub_tasks_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_functions_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_conversations_key", - "default": "conversations", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 33, - "name": "ConsistentChatGenerator", - "description": "根据预置主题和人类意图,两阶段从0合成多轮对话格式数据(合成数量大于9000时建议增加标签数量)。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_dialogs_per_intent:每个意图生成的对话数量,默认20\n- num_turns_per_dialog:每个对话的轮次数量,默认6\n- temperature:生成温度,控制输出随机性,默认0.9\n输出参数:\n- 包含category和conversation字段的DataFrame,其中conversation为多轮对话列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ConsistentQueryPrompt", - "ConsistentResponsePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_dialogs_per_intent", - "default": 20, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_turns_per_dialog", - "default": 6, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "temperature", - "default": 0.9, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 34, - "name": "FuncCallConversationSampleEvaluator", - "description": "对对话样本进行打分评估:使用 LLM 服务根据预设评分提示词对每条对话进行评分,并将结果写回数据流。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- input_conversation_key:DataFrame 中对话内容字段名,默认 'conversations'\n- output_score_key:评分结果输出字段名,默认 'score'\n处理流程:\n- 读取存储中的 DataFrame\n- 将每条对话重组为评分提示词并调用 LLM 生成评分(JSON)\n- 解析 JSON,提取 'score' 字段写入 DataFrame;解析失败则回退为 0\n输出参数:\n- 包含评分结果列的 DataFrame\n- 包含输出字段名的列表(仅 'score' 或自定义的输出列名)", - "type": { - "level_1": "conversations", - "level_2": "eval" - }, - "allowed_prompts": [ - "ConversationEvalPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_conversation_key", - "default": "conversations", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_score_key", - "default": "score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 35, - "name": "CompositionTaskFilter", - "description": "根据组合任务及其子任务,使用LLM服务判断组合任务是否具备可行性与完备性,从而进行可运行任务的筛选。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:可运行标签的输出字段名,默认'runable_label'\n输出参数:\n- 仅包含可运行组合任务的数据DataFrame\n- 包含输出字段名的列表(可运行标签字段)", - "type": { - "level_1": "conversations", - "level_2": "filter" - }, - "allowed_prompts": [ - "CompositionTaskFilterPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_composition_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sub_tasks_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "runable_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "core_speech": [ - { - "node": 36, - "name": "Speech2TextGenerator", - "description": "该算子用于将语音内容转录为文本。它接收语音文件路径或URL,使用大语言模型进行转录,并将转录结果保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant'\n- input_key:输入语音文件路径或URL的字段名,默认为'raw_content'\n- output_key:输出转录文本的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含转录文本的新列", - "type": { - "level_1": "core_speech", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "core_text": [ - { - "node": 37, - "name": "PromptedGenerator", - "description": "基于用户提供的提示词(prompt)生成数据。结合系统提示词和输入内容生成符合要求的输出文本。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,定义模型行为,默认为'You are a helpful agent.'\n- input_key:输入内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "json_schema", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 38, - "name": "PairedPromptedGenerator", - "description": "PairedPromptedGenerator:基于两列配对输入(input_key_1 与 input_key_2)进行成对提示生成。\n算子会将 system_prompt 与每行的两列文本按固定模板拼接后,调用 LLM 服务批量生成结果,并将模型输出写回到 DataFrame 的指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象(实现 LLMServingABC 接口)\n- system_prompt:系统提示词(默认 'You are a helpful agent.')。该提示会放在每条样本前缀, 用于约束模型的角色与输出风格。\n- input_key_1:第一列输入字段名(默认 'input_key_1')\n- input_key_2:第二列输入字段名(默认 'input_key_2')\n- output_key:输出字段名(默认 'generated_content')\n\n处理逻辑:\n1) 从 storage 中读取名为 'dataframe' 的 DataFrame;\n2) 对于每一行,若 input_key_1 与 input_key_2 均非空,则按模板:\n system_prompt + input_key_1 + 值 + '\\n' + input_key_2 + 值\n 构造 LLM 输入;\n3) 批量调用 llm_serving.generate_from_input 生成文本;\n4) 将生成结果写入 DataFrame 的 output_key 列并保存。\n\n输出:\n- 返回写入了生成结果的新 DataFrame(由 storage 管理保存),\n- 返回 output_key 以便后续算子引用。", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key_1", - "default": "input_key_1", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key_2", - "default": "input_key_2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 39, - "name": "RandomDomainKnowledgeRowGenerator", - "description": "N/A (调用失败)", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [ - "SFTFromScratchGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "generation_num", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "domain_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 40, - "name": "Text2QAGenerator", - "description": "该算子用于为给定的文档片段生成种子QA对。\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- prompt_key: 包含提示词的字段名\n- output_quesion_key: 包含生成问题的字段名\n- output_answer_key: 包含生成答案的字段名\n", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2QAAutoPromptGeneratorPrompt", - "Text2QASeedQuestionGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_num", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_prompt_key", - "default": "generated_prompt", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_quesion_key", - "default": "generated_question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_key", - "default": "generated_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 41, - "name": "Text2MultiHopQAGenerator", - "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2MultiHopQAGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "seed", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_q", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "cleaned_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "QA_pairs", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_meta_key", - "default": "QA_metadata", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 42, - "name": "EmbeddingGenerator", - "description": "EmbeddingGenerator算子用于从输入文本生成向量表示(embedding),通常用于语义检索、聚类或下游模型输入等任务。\n\n输入参数:\n- embedding_serving:Embedding服务对象,需实现LLMServingABC接口,用于生成文本的向量表示\n- input_key:输入文本字段名,默认为'text'\n- output_key:输出向量字段名,默认为'embeddings'\n\n输出参数:\n- 包含文本向量的DataFrame,每行对应一个输入文本的embedding\n- 返回输出字段名(如'embeddings'),可供后续算子引用", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "embedding_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "embeddings", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 43, - "name": "RetrievalGenerator", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "json_schema", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 44, - "name": "BenchDatasetEvaluator", - "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估语义相似度,仅输入预测答案与标准答案\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "eval_result_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "compare_method", - "default": "match", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant specialized in evaluating answer correctness.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_test_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_answer_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 45, - "name": "BenchDatasetEvaluatorQuestion", - "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- input_question_key:问题字段名(语义匹配模式下必需)\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "eval_result_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "compare_method", - "default": "match", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant specialized in evaluating answer correctness.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_test_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_answer_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 46, - "name": "Text2QASampleEvaluator", - "description": "该算子用于为给的的文档片段生成种子QA对打分\n\n输入参数:\n- input_question_key: Field name containing the generated question\n- input_answer_key: Field name containing the generated answer\n- output_question_quality_key: Field name containing the question quality grade\n- output_question_quality_feedback_key: Field name containing the question quality feedback\n- output_answer_alignment_key: Field name containing the answer alignment grade\n- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n- output_answer_verifiability_key: Field name containing the answer verifiability grade\n- output_downstream_value_key: Field name containing the downstream value grade\n- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [ - "Text2QAQuestionQualityPrompt", - "Text2QAAnswerAlignmentPrompt", - "Text2QAAnswerVerifiabilityPrompt", - "Text2QADownstreamValuePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "generated_question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "generated_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_quality_key", - "default": "question_quality_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_quality_feedback_key", - "default": "question_quality_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_alignment_key", - "default": "answer_alignment_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_alignment_feedback_key", - "default": "answer_alignment_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_verifiability_key", - "default": "answer_verifiability_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_verifiability_feedback_key", - "default": "answer_verifiability_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_downstream_value_key", - "default": "downstream_value_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_downstream_value_feedback_key", - "default": "downstream_value_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 47, - "name": "PromptedEvaluator", - "description": "PromptedEvaluator:使用 LLM 根据系统提示词对数据质量进行评分,并将评分写回 DataFrame(同时通过 storage 持久化)。模型应只输出分数(整数)。\n功能:对每行输入文本生成一个评分。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口。\n- system_prompt:系统提示词(默认:'Please evaluate the quality of this data on a scale from 1 to 5.')。\n- input_key:输入文本所在列名(默认:'raw_content')。\n- output_key:评分结果写入的列名(默认:'eval')。\n输出:\n- 返回输出列名(用于后续算子引用),评分结果已写回并保存。", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "Please evaluate the quality of this data on a scale from 1 to 5.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "eval", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 48, - "name": "PromptedFilter", - "description": "PromptedFilter 使用内置的 PromptedEvaluator 对输入数据进行数值化打分,并根据指定的分数区间(min_score 到 max_score,闭区间)筛选出符合条件的样本。默认情况下打分范围是 1–5,但用户可以通过 system_prompt 自定义其他评分规则。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,定义评估规范(可选,默认 'Please evaluate the quality of this data on a scale from 1 to 5.')\n- input_key:待评估文本所在列名(默认 'raw_content')\n- output_key:写回打分结果的列名(默认 'eval',若已存在将被覆盖)\n- min_score:筛选的最小分(默认 5)\n- max_score:筛选的最大分(默认 5)\n\n输出参数:\n- 过滤后的 DataFrame(仅保留分数位于 [min_score, max_score] 的行)\n- 返回 output_key 以供后续算子引用\n\n备注:\n- 默认打分区间是 1–5,但可根据实际 prompt 改变。", - "type": { - "level_1": "core_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "Please evaluate the quality of this data on a scale from 1 to 5.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_score", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "eval", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 49, - "name": "KCenterGreedyFilter", - "description": "该算子用于从大量的文档片段中选取部分文档片段,用于后续生成种子QA对\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- embedding_model_path: 嵌入模型路径\n- num_samples: 选取的文档片段数量\n- method: 选择方法,随机或k-center-greedy\n\n", - "type": { - "level_1": "core_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "num_samples", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "embedding_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 50, - "name": "GeneralFilter", - "description": "该算子支持通过多个自定义函数对 DataFrame 进行灵活过滤。\n\n每条过滤规则是一个函数(例如 lambda 表达式),接受一个 DataFrame 并返回一个布尔类型的 Series,用于指定保留哪些行。\n\n输入参数:\n- filter_rules:一个函数列表,每个函数形式为 lambda df: ...,需返回一个与 df 长度一致的布尔 Series。所有规则之间采用与(AND)关系组合。\n\n示例:\n - lambda df: df['score'] > 0.5\n - lambda df: df['label'].isin(['A', 'B'])", - "type": { - "level_1": "core_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "filter_rules", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 51, - "name": "PromptedRefiner", - "description": "PromptedRefiner 根据给定的 system_prompt 对指定列的文本进行改写/润色/规范化,并将结果**就地写回**同一列(覆盖原内容)。其做法是对每一行拼接 `system_prompt + raw_content` 作为模型输入,批量生成改写结果。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,用于描述改写目标与风格(默认 'You are a helpful agent.')\n- input_key:要改写的文本列名(默认 'raw_content'),改写后会覆盖该列\n\n输出参数:\n- 覆盖后的 DataFrame(同名列被改写后的文本)\n- 无返回值(结果已通过 DataFlowStorage 写出)\n\n备注:\n- 该算子**覆盖** input_key 列;若需保留原文,建议先拷贝到新列。\n- 期望每行在 input_key 列提供可用文本;空值将不会生成对应输入,如与行数不匹配可能导致赋值报错。", - "type": { - "level_1": "core_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 52, - "name": "PandasOperator", - "description": "该算子支持通过多个自定义函数对 DataFrame 进行任意操作(如添加列、重命名、排序等)。\n\n每个函数(通常为 lambda 表达式)接受一个 DataFrame 并返回一个修改后的 DataFrame。\n\n输入参数:\n- process_fn:一个函数列表,每个函数形式为 lambda df: ...,必须返回一个 DataFrame。\n\n示例:\n - lambda df: df.assign(score2=df['score'] * 2)\n - lambda df: df.sort_values('score', ascending=False)", - "type": { - "level_1": "core_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "process_fn", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "core_vision": [ - { - "node": 53, - "name": "PromptedVQAGenerator", - "description": "该算子用于视觉问答生成,接收包含图像和问题的输入内容,使用大语言模型生成回答,并将生成的回答保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant.'\n- input_key:输入内容的字段名,默认为'raw_content'\n- output_key:输出生成内容的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含生成回答的新列", - "type": { - "level_1": "core_vision", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "db": [ - { - "node": 54, - "name": "DBOperator", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "db", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "expr", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "general_text": [ - { - "node": 55, - "name": "ColonEndFilter", - "description": "该算子用于检查文本是否以冒号结尾,常用于判断问题是否为不完整的提问。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'{类名小写}_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 56, - "name": "SentenceNumberFilter", - "description": "该算子用于检查文本中的句子数量是否在指定范围内,使用正则表达式匹配句子结束符号(。!?.!?)进行分割。\n初始化参数:\n- min_sentences:最小句子数量阈值,默认为3\n- max_sentences:最大句子数量阈值,默认为7500\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'sentence_number_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_sentences", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_sentences", - "default": 7500, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "sentence_number_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 57, - "name": "LineEndWithEllipsisFilter", - "description": "该算子用于检测并过滤以省略号(...)或(……)结尾的文本行,常用于识别不完整的表述。\n初始化参数:\n- threshold:以省略号结尾的行数比率阈值,默认为0.3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_end_with_ellipsis_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "line_end_with_ellipsis_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 58, - "name": "ContentNullFilter", - "description": "该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'content_null_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "content_null_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 59, - "name": "SymbolWordRatioFilter", - "description": "该算子用于检查文本中特定符号(#, ..., …)与单词数量的比率是否超过阈值,过滤符号使用过多的文本。\n初始化参数:\n- threshold:符号与单词比率阈值,默认为0.4\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'symbol_word_ratio_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.4, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "symbol_word_ratio_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 60, - "name": "AlphaWordsFilter", - "description": "该算子用于验证文本中字母单词的比率是否达到阈值,支持NLTK分词或简单空格分割两种模式。\n初始化参数:\n- threshold:字母单词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'alpha_words_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "alpha_words_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 61, - "name": "HtmlEntityFilter", - "description": "该算子用于检测并过滤包含HTML实体(如&、<、>等)的文本,确保内容不包含标记语言元素。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'html_entity_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "html_entity_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 62, - "name": "IDCardFilter", - "description": "该算子用于检测并过滤包含身份证相关术语的文本,使用正则表达式匹配身份证号码模式以保护敏感信息。\n初始化参数:\n- threshold:身份证相关词汇匹配次数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'id_card_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "id_card_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 63, - "name": "NoPuncFilter", - "description": "该算子用于确保文本包含足够的标点符号,通过统计句子间最大单词数量进行过滤。\n初始化参数:\n- threshold:句子间最大单词数量阈值,默认为112\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'no_punc_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 112, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "no_punc_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 64, - "name": "SpecialCharacterFilter", - "description": "该算子用于移除包含特殊/unicode字符的文本,使用预定义模式检测非标准字符以确保文本规范性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'special_character_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "special_character_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 65, - "name": "WatermarkFilter", - "description": "该算子用于检测并移除包含版权/水印内容的文本,使用指定关键词列表识别受保护内容。\n初始化参数:\n- watermarks:水印关键词列表,默认为['Copyright', 'Watermark', 'Confidential']\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'watermark_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "watermarks", - "default": [ - "Copyright", - "Watermark", - "Confidential" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "watermark_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 66, - "name": "MeanWordLengthFilter", - "description": "该算子用于检查文本中单词的平均长度是否在指定范围内,通过字符总数除以单词数量计算平均值。\n初始化参数:\n- min_length:最小平均单词长度,默认为3\n- max_length:最大平均单词长度,默认为10\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'mean_word_length_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_length", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "mean_word_length_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 67, - "name": "StopWordFilter", - "description": "该算子用于验证文本中停用词的比率是否高于阈值,使用NLTK分词器进行单词分割和停用词识别。\n初始化参数:\n- threshold:停用词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'stop_word_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "stop_word_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 68, - "name": "CurlyBracketFilter", - "description": "该算子用于检测文本中是否存在过多的花括号使用,通过花括号数量与文本长度的比率进行过滤。\n初始化参数:\n- threshold:花括号比率阈值,默认为0.025\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'curly_bracket_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.025, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "curly_bracket_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 69, - "name": "CapitalWordsFilter", - "description": "该算子用于检查文本中大写单词的比率是否超过阈值,支持可选的分词器进行单词识别。\n初始化参数:\n- threshold:大写单词比率阈值,默认为0.2\n- use_tokenizer:是否使用NLTK分词器,默认为False\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'capital_words_filter'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.2, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "capital_words_filter", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 70, - "name": "LoremIpsumFilter", - "description": "该算子用于检测并过滤包含占位文本(如'lorem ipsum')的文本,使用正则表达式模式匹配并结合阈值过滤。\n初始化参数:\n- threshold:'lorem ipsum'出现次数与文本长度的比率阈值,默认为3e-8\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'loremipsum_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 3e-08, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "loremipsum_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 71, - "name": "UniqueWordsFilter", - "description": "该算子用于检查文本中唯一单词的比率是否达到阈值,通过集合操作计算唯一单词数量与总单词数量的比率。\n初始化参数:\n- threshold:最小唯一单词比率阈值,默认为0.1\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'unique_words_filter'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.1, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "unique_words_filter", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 72, - "name": "CharNumberFilter", - "description": "该算子用于验证文本在去除空白字符后的字符数量是否达到最小阈值。\n初始化参数:\n- threshold:最小字符数量阈值,默认为100\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'char_number_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "char_number_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 73, - "name": "LineStartWithBulletpointFilter", - "description": "该算子用于检测并过滤以各种项目符号符号开头的文本行,使用Unicode字符匹配结合比率阈值进行过滤。\n初始化参数:\n- threshold:以项目符号开头的行数比率阈值,默认为0.9\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_start_with_bullet_point_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.9, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "line_start_with_bullet_point_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 74, - "name": "LineWithJavascriptFilter", - "description": "该算子用于识别并过滤包含'javascript'引用的文本,通过关键词匹配和阈值判断进行内容过滤。\n初始化参数:\n- threshold:不包含'javascript'的最小行数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_with_javascript_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "line_with_javascript_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 75, - "name": "LangkitSampleEvaluator", - "description": "使用Langkit工具包计算文本统计信息,帮助评估文本结构复杂性和可读性。提取多种语言特征,包括句子长度、词汇多样性、情感倾向等。\n\n输出参数:\n- LangkitNumSentencesScore: 句子数量\n- LangkitNumWordsScore: 单词数量\n- LangkitAvgWordLengthScore: 平均单词长度\n- LangkitFleschReadingEaseScore: 可读性评分(Flesch公式)\n- LangkitSentimentScore: 情感倾向(-1到1之间)", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 76, - "name": "LangkitFilter", - "description": "基于LangkitScorer打分器的得分对数据进行过滤。使用Langkit工具包计算11种文本统计信息,帮助评估文本结构复杂性和可读性。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含11个语言统计指标\n- max_scores:各指标的最大阈值字典,包含11个语言统计指标\n- metrics_to_keep:需要保留的评估指标列表\n输出参数:\n- 过滤后的DataFrame,仅保留所有指标都在指定范围内的文本\n- 返回包含各指标标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_scores", - "default": { - "flesch_reading_ease": 0, - "automated_readability_index": 0, - "aggregate_reading_level": 0, - "syllable_count": 32.0, - "lexicon_count": 23.0, - "sentence_count": 1.0, - "character_count": 118.0, - "letter_count": 109.0, - "polysyllable_count": 0.0, - "monosyllable_count": 13.0, - "difficult_words": 4.0 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_scores", - "default": { - "flesch_reading_ease": 100, - "automated_readability_index": 100, - "aggregate_reading_level": 100, - "syllable_count": 2331.9, - "lexicon_count": 1554.0, - "sentence_count": 89.1, - "character_count": 7466.3, - "letter_count": 7193.0, - "polysyllable_count": 216.4, - "monosyllable_count": 1044.1, - "difficult_words": 213.4 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "metrics_to_keep", - "default": [ - "flesch_reading_ease", - "automated_readability_index", - "aggregate_reading_level", - "syllable_count", - "lexicon_count", - "sentence_count", - "character_count", - "letter_count", - "polysyllable_count", - "monosyllable_count", - "difficult_words" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_keys", - "default": [ - "flesch_reading_ease", - "automated_readability_index", - "aggregate_reading_level", - "syllable_count", - "lexicon_count", - "sentence_count", - "character_count", - "letter_count", - "polysyllable_count", - "monosyllable_count", - "difficult_words" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 77, - "name": "LexicalDiversitySampleEvaluator", - "description": "使用MTLD(词汇多样性测量)和HDD(移动平均类型-标记比)方法计算文本词汇多样性。\n\n功能说明:\n- MTLD(词汇多样性测量):通过计算维持特定TTR阈值所需的单词数量来评估词汇多样性\n- HDD(移动平均类型-标记比):基于样本的词汇丰富度估计\n\n输入要求:文本长度需大于50个单词\n\n输出参数:\n- LexicalDiversityMTLDScore: MTLD多样性得分(值越高表示多样性越好)\n- LexicalDiversityHD-DScore: HDD多样性得分(值越高表示多样性越好)", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 78, - "name": "LexicalDiversityFilter", - "description": "基于LexicalDiversityScorer打分器的得分对数据进行过滤。使用MTLD(移动平均类型-令牌比)和HDD(超几何分布多样性)两种方法计算词汇多样性,高分代表更丰富的词汇使用。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含'mtld'和'hdd'\n- max_scores:各指标的最大阈值字典,包含'mtld'和'hdd'\n输出参数:\n- 过滤后的DataFrame,仅保留词汇多样性在指定范围内的文本\n- 返回包含各指标标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_scores", - "default": { - "mtld": 50, - "hdd": 0.8 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_scores", - "default": { - "mtld": 99999, - "hdd": 1.0 - }, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_keys", - "default": [ - "mtld", - "hdd" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 79, - "name": "NgramSampleEvaluator", - "description": "计算文本中n-gram的重复比例,评估文本冗余度。通过比较唯一n-gram数量与总n-gram数量的比值来衡量文本原创性。\n\n初始化参数:\n- ngrams: n-gram的长度,默认为5\n\n输出参数:\n- NgramScore: n-gram重复比例得分(0到1之间,得分越高表示重复比例越低)", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "ngrams", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "NgramScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 80, - "name": "NgramFilter", - "description": "基于NgramScorer打分器的得分对数据进行过滤。计算文本中n-gram的重复比例,得分越高表示重复比例越低,文本冗余度越小。\n输入参数:\n- min_score:最小n-gram得分阈值\n- max_score:最大n-gram得分阈值\n- ngrams:n-gram的n值\n输出参数:\n- 过滤后的DataFrame,仅保留n-gram得分在指定范围内的文本\n- 返回包含n-gram得分字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.8, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "ngrams", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "NgramScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 81, - "name": "PresidioSampleEvaluator", - "description": "使用Microsoft Presidio模型识别文本中的个人身份信息(PII),返回检测到的PII实体数量。支持多种实体类型如姓名、邮箱、电话号码等,基于dslim/bert-base-NER模型实现。适用于评估文本的隐私安全风险。\n输入参数:\n- text: 待检测的文本字符串\n- lang: 语言类型,默认为'en'\n输出参数:\n- int: 检测到的PII实体数量", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PresidioScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 82, - "name": "PresidioFilter", - "description": "基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\n支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\n输入参数:\n- min_score:保留样本的最小PII数量阈值,默认为0\n- max_score:保留样本的最大PII数量阈值,默认为5\n- lang:文本语言,默认为'en'\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\n- 返回包含输出字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PresidioScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 83, - "name": "BlocklistFilter", - "description": "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- language:语言代码,默认为'zh'\n- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n- threshold:匹配次数阈值,默认为1\n- use_tokenizer:是否使用分词器,默认为True\n- tokenizer:分词器对象,默认为None\n输出参数:\n- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "language", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "threshold", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "blocklist_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 84, - "name": "HashDeduplicateFilter", - "description": "使用多种哈希函数对文本进行精确去重,支持md5、sha256或xxh3算法。通过计算文本的哈希值识别重复数据。\n\n初始化参数:\n- hash_func: 哈希函数名称,可选'md5'、'sha256'或'xxh3',默认为'md5'\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据\n算法特点:\n- md5: 128位哈希值,平衡速度和唯一性\n- sha256: 256位哈希值,更高安全性,速度较慢\n- xxh3: 128位哈希值,最快的哈希算法", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "hash_func", - "default": "md5", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 85, - "name": "LanguageFilter", - "description": "使用FastText语言识别模型过滤数据。下载并加载预训练的FastText语言识别模型,检查文本的语言是否在允许的语言列表中。\n输入参数:\n- allowed_languages:允许的语言标签列表\n- model_cache_dir:模型缓存目录路径\n输出参数:\n- 过滤后的DataFrame,仅保留语言在允许列表中的文本\n- 返回包含语言标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "allowed_languages", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "language_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 86, - "name": "LLMLanguageFilter", - "description": "使用大语言模型识别语言并过滤数据", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "allowed_languages", - "default": [ - "en" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "language_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 87, - "name": "MinHashDeduplicateFilter", - "description": "结合MinHash与LSH(局部敏感哈希)实现高效近似去重。将文本转换为MinHash签名,使用LSH快速查找相似文本,实现大规模数据集的近似去重。\n输入参数:\n- num_perm:生成MinHash签名的排列数\n- threshold:相似度阈值,超过此阈值判定为相似文本\n- use_n_gram:是否使用n-gram分词\n- ngram:n-gram的n值\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "num_perm", - "default": 128, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "threshold", - "default": 0.9, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_n_gram", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "ngram", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 88, - "name": "NgramHashDeduplicateFilter", - "description": "结合n-gram技术与哈希算法识别相似文本,实现近似去重。将文本分割为多个n-gram片段,计算每个片段的哈希值,通过比较哈希集合的相似度来判断文本相似性。\n输入参数:\n- n_gram:将文本分割的片段数量\n- hash_func:哈希函数类型,支持'md5'、'sha256'和'xxh3'\n- diff_size:哈希集合差异阈值,小于此值判定为相似文本\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "n_gram", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "hash_func", - "default": "md5", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "diff_size", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 89, - "name": "PerspectiveSampleEvaluator", - "description": "使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- serving:Perspective API服务对象\n- input_key:输入文本字段名\n- output_key:输出得分字段名,默认'PerspectiveScore'\n输出参数:\n- 包含毒性评估得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerspectiveScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 90, - "name": "PerspectiveFilter", - "description": "基于PerspectiveScorer打分器的得分对数据进行过滤使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- min_score:最小毒性得分阈值\n- max_score:最大毒性得分阈值\n输出参数:\n- 过滤后的DataFrame,仅保留毒性得分在指定范围内的文本\n- 返回包含毒性得分字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 0.5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerspectiveScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 91, - "name": "SemDeduplicateFilter", - "description": "基于BERT语义相似度识别语义重复文本,执行近似去重操作。通过计算文本嵌入向量间的余弦相似度,识别语义相似的文本并保留唯一样本。\n支持多字段组合作为去重依据,可有效去除内容相似但表述不同的重复数据,提高数据集多样性。\n输入参数:\n- eps:相似度阈值,值越小表示允许的相似度越低,默认为0.05(即余弦相似度大于0.95视为重复)\n- model_name:预训练模型名称,默认为'sentence-transformers/all-MiniLM-L6-v2'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:模型运行设备,默认为'cuda'\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留语义不重复的样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "eps", - "default": 0.05, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "sentence-transformers/all-MiniLM-L6-v2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 92, - "name": "SimHashDeduplicateFilter", - "description": "使用SimHash算法通过汉明距离识别相似文本,执行近似去重操作。将文本转换为固定长度的指纹,通过计算指纹间的汉明距离判断文本相似度。\n相比语义去重速度更快,适合大规模数据集的快速去重预处理,尤其适用于检测字符层面相似的文本。\n输入参数:\n- fingerprint_size:指纹长度,默认为64位\n- bound:相似度阈值,值越小表示允许的相似度越低,默认为0.1(即相似度大于0.9视为重复)\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留相似性低于阈值的唯一样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "fingerprint_size", - "default": 64, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "bound", - "default": 0.1, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 93, - "name": "WordNumberFilter", - "description": "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- min_words:最小单词数量阈值,默认为5\n- max_words:最大单词数量阈值,默认为100\n输出参数:\n- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_words", - "default": 20, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_words", - "default": 100000, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "word_number_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 94, - "name": "HtmlEntityRefiner", - "description": "去除文本中的HTML实体,包括标准实体(如 、<)和各种变体形式(全角符号、中文分号等)。支持自定义需要移除的HTML实体列表。输入参数:\n- html_entities:需要移除的HTML实体列表,默认为包含常见实体的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含移除HTML实体后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "html_entities", - "default": [ - "nbsp", - "lt", - "gt", - "amp", - "quot", - "apos", - "hellip", - "ndash", - "mdash", - "lsquo", - "rsquo", - "ldquo", - "rdquo" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 95, - "name": "HtmlUrlRemoverRefiner", - "description": "去除文本中的URL链接和HTML标签,净化文本内容。使用正则表达式匹配并移除各种形式的URL和HTML标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含净化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 96, - "name": "LowercaseRefiner", - "description": "将文本字段中的所有大写字符转换为小写,统一文本格式。对指定字段的文本内容进行全小写处理。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含小写转换后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 97, - "name": "NERRefiner", - "description": "使用命名实体识别(NER)技术识别并屏蔽文本中的特定实体。使用spaCy的'en_core_web_sm'模型识别实体,并将其替换为对应的实体类型标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含实体屏蔽后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 98, - "name": "PIIAnonymizeRefiner", - "description": "使用Presidio和BERT-NER模型识别并匿名化文本中的个人身份信息(PII)。支持多种PII类型的检测和匿名化处理。输入参数:\n- lang:语言代码,默认为'en'\n- device:运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- model_name:NER模型名称,默认为'dslim/bert-base-NER'\n- input_key:输入文本字段名\n输出参数:\n- 包含匿名化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "dslim/bert-base-NER", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 99, - "name": "ReferenceRemoverRefiner", - "description": "删除文本中未闭合的引用标签和引用链接,包括标签和{{cite}}模板的各种完整和不完整形式。净化文本中的引用标记。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含移除引用标记后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 100, - "name": "RemoveContractionsRefiner", - "description": "该算子用于扩展文本中的英语缩写词,将缩写形式转换为完整形式(例如将\"can't\"扩展为\"cannot\")。\n使用contractions库进行缩写词扩展,提高文本标准化程度。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含扩展缩写词后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 101, - "name": "RemoveEmojiRefiner", - "description": "该算子用于去除文本中的Unicode图像表情符号,包括表情符号、杂项符号、交通符号、旗帜等各类图像符号。\n通过正则表达式匹配Unicode表情符号范围,实现高效过滤。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除表情符号的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 102, - "name": "RemoveEmoticonsRefiner", - "description": "该算子用于移除文本中的文本型表情符号,例如':-)'、':D'、':('等字符组合表情。\n基于预定义的表情符号字典进行匹配替换,支持多种常见文本表情模式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除文本表情的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 103, - "name": "RemoveExtraSpacesRefiner", - "description": "该算子用于移除文本中的多余空格,将连续的多个空格替换为单个空格,并去除文本前后的空白字符。\n通过字符串分割和连接实现空格标准化,提高文本格式一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化空格的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 104, - "name": "RemoveImageRefsRefiner", - "description": "该算子用于去除文本中的图片引用格式,包括Markdown图片链接、图片编号、特殊符号组合等图像引用模式。\n通过多模式正则表达式匹配,识别并移除多种图片引用格式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除图片引用的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 105, - "name": "RemoveNumberRefiner", - "description": "该算子用于移除文本中的数字字符,包括0-9的阿拉伯数字。\n通过字符过滤实现数字移除,保留纯文本内容。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除数字的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 106, - "name": "RemovePunctuationRefiner", - "description": "该算子用于移除文本中的标点符号,包括英文标点符号集合中的所有符号。\n使用string.punctuation定义的标点集合进行过滤,实现文本去标点处理。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 107, - "name": "RemoveRepetitionsPunctuationRefiner", - "description": "该算子用于移除文本中重复的标点符号,例如将\"!!!\"变为\"!\",\",,\"变为\",\"。\n通过正则表达式匹配连续重复的标点符号,替换为单个符号。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 108, - "name": "RemoveStopwordsRefiner", - "description": "该算子用于移除文本中的英语停用词(如\"the\",\"is\",\"in\"等无实际意义的高频词汇)。\n使用NLTK库的stopwords语料库进行停用词过滤,提高文本特征密度。\n输入参数:\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除停用词的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 109, - "name": "SpellingCorrectionRefiner", - "description": "该算子用于通过SymSpell算法对文本中的拼写错误进行纠正,支持自定义编辑距离和词典路径。\n若本地词典不存在则自动下载,使用近似字符串匹配实现拼写纠错功能。\n输入参数:\n- max_edit_distance:最大编辑距离,默认为2\n- prefix_length:前缀长度,默认为7\n- dictionary_path:词典路径,默认为'frequency_dictionary_en_82_765.txt'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含纠正拼写错误的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "max_edit_distance", - "default": 2, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prefix_length", - "default": 7, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dictionary_path", - "default": "frequency_dictionary_en_82_765.txt", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 110, - "name": "StemmingLemmatizationRefiner", - "description": "该算子用于对文本进行词干提取或词形还原处理,将词语转换为其基本形式。\n支持两种处理方式:Porter词干提取(stemming)和WordNet词形还原(lemmatization),可通过参数选择。\n输入参数:\n- method:处理方法,可选'stemming'或'lemmatization',默认为'stemming'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含词干/词形还原后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "method", - "default": "stemming", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 111, - "name": "TextNormalizationRefiner", - "description": "该算子用于规范化文本中的日期格式和货币格式,统一为标准表示形式。\n日期格式统一转换为'YYYY-MM-DD'形式,货币格式转换为'金额 USD'形式,提高数据一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含格式规范化的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 112, - "name": "BertSampleEvaluator", - "description": "使用BERTScore评估生成文本与参考文本的相似度,基于上下文嵌入计算P/R/F1分数。\n输入参数:\n- lang:语言类型,默认为'en'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BertScore'\n输出参数:\n- 包含F1相似度得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "BertScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 113, - "name": "BleuSampleEvaluator", - "description": "计算BLEU分数评估生成文本与参考文本的n-gram重叠度,支持1-4元语法分析。\n输入参数:\n- n:最大n-gram长度,默认为4\n- eff:参考长度计算方式,可选'shortest'/'average'/'longest',默认为'average'\n- special_reflen:特殊参考长度,默认为None\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BleuScore'\n输出参数:\n- 包含BLEU得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "n", - "default": 4, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "eff", - "default": "average", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "special_reflen", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "BleuScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 114, - "name": "CiderSampleEvaluator", - "description": "使用CIDEr指标评估生成文本与参考文本的相似度,基于TF-IDF加权的n-gram重叠度。\n输入参数:\n- n:最大n-gram长度,默认为4\n- sigma:高斯惩罚参数,默认为6.0\n- df_mode:文档频率模式,默认为'coco-val-df'\n- idf_path:IDF文件路径,默认为预训练COCO数据集IDF\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'CiderScore'\n输出参数:\n- 包含CIDEr得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "n", - "default": 4, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "sigma", - "default": 6.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "df_mode", - "default": "coco-val-df", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "idf_path", - "default": "./dataflow/operators/general_pt/eval/cider/coco-val-df.p", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "CiderScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 115, - "name": "Task2VecDatasetEvaluator", - "description": "使用Task2Vec方法评估数据集的多样性,通过计算样本嵌入的余弦距离矩阵来量化多样性。\n输入参数:\n- device:计算设备,默认为'cuda'\n- sample_nums:采样次数,默认为10\n- sample_size:每次采样样本数,默认为1\n- method:嵌入方法,可选'montecarlo'或'variational',默认为'montecarlo'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n输出参数:\n- Task2VecDiversityScore:多样性得分\n- ConfidenceInterval:置信区间", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "sample_nums", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "sample_size", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "method", - "default": "montecarlo", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 116, - "name": "VendiDatasetEvaluator", - "description": "通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。\n输入参数:\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n输出参数:\n- BERTVendiScore:基于BERT的多样性得分\n- SimCSEVendiScore:基于SimCSE的多样性得分", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "knowledge_cleaning": [ - { - "node": 117, - "name": "KBCChunkGenerator", - "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "chunk_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "chunk_overlap", - "default": 50, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "split_method", - "default": "token", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_tokens_per_chunk", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "tokenizer_name", - "default": "bert-base-uncased", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "raw_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 118, - "name": "KBCChunkGeneratorBatch", - "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "chunk_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "chunk_overlap", - "default": 50, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "split_method", - "default": "token", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_tokens_per_chunk", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "tokenizer_name", - "default": "bert-base-uncased", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 119, - "name": "FileOrURLToMarkdownConverter", - "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "url", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "raw_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "intermediate_dir", - "default": "intermediate", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "mineru_backend", - "default": "vlm-sglang-engine", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 120, - "name": "FileOrURLToMarkdownConverterBatch", - "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "intermediate_dir", - "default": "intermediate", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "mineru_backend", - "default": "vlm-sglang-engine", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "source", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "text_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 121, - "name": "KBCTextCleaner", - "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改\n\n输入格式示例:\n
\n

标题文本

\n

正文段落,包括特殊符号,例如“弯引号”、–破折号等

\n \"示意图\"\n 链接文本\n
代码片段
\n ...\n
\n\n输出格式示例:\n标题文本\n\n正文段落,包括特殊符号,例如\"直引号\"、-破折号等\n\n[Image: 示例图 example.jpg]\n\n链接文本\n\n代码片段\n\n[结构保持,语义保留,敏感信息脱敏处理(如手机号、保密标记等)]", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [ - "KnowledgeCleanerPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "cleaned_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 122, - "name": "KBCTextCleanerBatch", - "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [ - "KnowledgeCleanerPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "cleaned_chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 123, - "name": "KBCMultiHopQAGeneratorBatch", - "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2MultiHopQAGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "seed", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "enhanced_chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 124, - "name": "QAExtractor", - "description": "QA对提取器 - 将嵌套的QA_pairs转换为Alpaca微调格式\n\n核心功能:\n从结构化的QA对数据中提取问答内容,自动整合推理步骤和支持事实,\n输出符合Stanford Alpaca标准的instruction-input-output格式。\n\n初始化参数:\n• qa_key: QA对的字段名 (默认: 'QA_pairs')\n• output_json_file: 输出JSON文件路径 (可选,不指定则只更新DataFrame)\n• instruction: 统一的指令前缀 (默认: 'Please answer the following question...')\n\n运行参数 (input_key):\n• None - 包含所有字段 (question + reasoning_steps + supporting_facts)\n• '' - 空字符串,不包含额外上下文\n• 'reasoning_steps' - 只包含推理步骤\n• 'question,reasoning_steps' - 逗号分隔多个字段\n• ['question', 'supporting_facts'] - 列表格式\n\n输出字段:\n• instruction: 问题指令\n• input: 上下文信息 (根据input_key动态拼接)\n• output: 答案\n\n适用场景: 知识库QA微调、领域问答模型训练", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "qa_key", - "default": "QA_pairs", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_json_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "instruction", - "default": "Please answer the following question based on the provided information.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "reasoning": [ - { - "node": 125, - "name": "ReasoningAnswerGenerator", - "description": "该算子用于为给定问题生成答案,调用大语言模型进行推理。\n输入参数:\n- llm_serving:LLM服务实例,用于生成答案\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- output_key:生成的答案字段,默认'generated_cot'", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathAnswerGeneratorPrompt", - "GeneralAnswerGeneratorPrompt", - "DiyAnswerGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 126, - "name": "ReasoningQuestionGenerator", - "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathQuestionSynthesisPrompt", - "GeneralQuestionSynthesisPrompt", - "DiyQuestionSynthesisPrompt" - ], - "parameter": { - "init": [ - { - "name": "num_prompts", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_synth_or_input_flag", - "default": "Synth_or_Input", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 127, - "name": "ReasoningAnswerExtractionQwenMathEvalGenerator", - "description": "该算子用于从数学问题回答中提取规范化答案表达式,进行字符串清洗、单位处理和格式标准化。\n\n输入参数:\n- input_key:输入数据字段名\n- answer_key:原始答案字段名\n- output_key:处理后的答案字段名\n- unit_texts:需要过滤的单位文本列表\n\n输出参数:\n- output_key:标准化后的数学表达式字段", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "dataset_name", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "pseudo_correct_solution_example", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "extraction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 128, - "name": "ReasoningPseudoAnswerGenerator", - "description": "该算子生成多个候选答案并通过统计选择最优解,实现伪答案生成。\n\n输入参数:\n- input_file:输入文件路径\n- output_file:输出文件路径\n- max_times:最大生成次数\n- selection_mode:统计选择模式(frequency/consistency)\n\n输出参数:\n- final_answer:最终选择答案字段\n- candidate_answers:候选答案列表字段", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathAnswerGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_times", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_answer", - "default": "pseudo_answers", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_answer_value", - "default": "pseudo_answer_value", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_solutions", - "default": "pseudo_solutions", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_correct_solution_example", - "default": "pseudo_correct_solution_example", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 129, - "name": "ReasoningPretrainFormatConvertGenerator", - "description": "该算子用于将SFT格式数据转换为预训练格式。\n\n输入参数:\n- read_key_question:问题字段名\n- read_key_answer:答案字段名\n- output_key:输出文本字段名\n\n输出参数:\n- output_key:输出文本字段名,包含问题和答案的拼接结果\n- 输出文件:转换后的预训练格式数据文件路径", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_read_key_question", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_read_key_answer", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 130, - "name": "ReasoningQuestionFusionGenerator", - "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathQuestionParallelFusionGeneratorPrompt", - "MathQuestionSequentialFusionGeneratorPrompt", - "MathQuestionConditionFusionGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "num_prompts", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_problem_1", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_problem_2", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 131, - "name": "ReasoningCategoryDatasetEvaluator", - "description": "该算子用于统计数据集中的类别信息,包括主类别和次类别的分布情况。它计算每个类别的样本数量,并返回类别分布的统计结果。\n输入参数:\n- input_primary_category_key:主类别字段名,默认为'primary_category'\n- input_secondary_category_key:次类别字段名,默认为'secondary_category'\n输出参数:\n- 返回包含类别统计信息的字典,主类别作为键,值为包含该类别样本数量和次类别分布的字典", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_primary_category_key", - "default": "primary_category", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_secondary_category_key", - "default": "secondary_category", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 132, - "name": "ReasoningDifficultyDatasetEvaluator", - "description": "该算子用于统计数据集中的难度信息,计算不同难度级别的样本数量分布。它统计每个难度级别的样本数量,并返回难度分布的统计结果。\n输入参数:\n- input_diffulty_key:难度分数字段名,默认为'difficulty_score'\n输出参数:\n- 返回包含难度统计信息的字典,难度级别作为键,值为该难度级别的样本数量", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_diffulty_key", - "default": "difficulty_score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 133, - "name": "ReasoningTokenDatasetEvaluator", - "description": "该算子用于统计数据集中问题和回答的token信息,包括token数量的最小值、最大值、平均值和中位数等统计指标。它使用指定的tokenizer对文本进行编码,并计算token长度的分布情况。\n输入参数:\n- input_question_key:问题文本字段名\n- input_answer_key:回答文本字段名\n- model_name_or_path:tokenizer模型名称或路径\n输出参数:\n- 返回包含token统计信息的字典,包括问题和回答的token数量的零值计数、最小值、最大值、平均值和中位数", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_name_or_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 134, - "name": "ReasoningQuestionCategorySampleEvaluator", - "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [ - "MathQuestionCategoryPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "question_category", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 135, - "name": "ReasoningQuestionDifficultySampleEvaluator", - "description": "该算子用于评估问题的难度等级。通过大语言模型分析问题复杂度,输出1-10级的难度评分。\n\n输入参数:\n- eval_stage:评估阶段标识\n- read_min/max_score:分数过滤阈值\n- 其他参数同ReasoningCategoryDatasetEvaluator\n\n输出参数:\n- difficulty_score:数值型难度评分(1-10)", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [ - "MathQuestionDifficultyPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "difficulty_score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 136, - "name": "ReasoningQuestionSolvableSampleEvaluator", - "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [ - "MathQuestionEvaluatorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 137, - "name": "ReasoningAnswerFormatterFilter", - "description": "该算子用于检查答案格式是否符合规范,主要验证数学答案是否包含正确的\\boxed{}标记。\n\n输入参数:\n- input_key:输入字段名\n- result_key:结果字段名\n\n输出参数:\n- 通过格式检查返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 138, - "name": "ReasoningAnswerGroundTruthFilter", - "description": "该算子用于对比预测答案与标准答案的匹配度,支持精确匹配和数学验证两种方式。\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(exact/math_verify)\n\n输出参数:\n- 匹配成功返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "compare_method", - "default": "math_verify", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_test_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_answer_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 139, - "name": "ReasoningAnswerNgramFilter", - "description": "该算子基于n-gram重复率过滤答案,检测回答中的重复模式。\n\n输入参数:\n- min_score:最小可接受分数\n- max_score:最大可接受分数\n- ngrams:n-gram大小\n\n输出参数:\n- 分数在范围内返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "ngrams", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 140, - "name": "ReasoningAnswerPipelineRootFilter", - "description": "答案处理流程根节点,负责将输入数据根据有无真实标签GT分发到不同处理分支。\n\n输入参数:\n- input_file:输入文件路径\n- output_dir:输出目录路径\n- branch_config:分支配置参数\n- parallel_workers:并行工作线程数\n\n输出参数:\n- 多个输出文件路径(根据分支配置生成)", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 141, - "name": "ReasoningAnswerTokenLengthFilter", - "description": "该算子根据token数量过滤过长的答案。\n\n输入参数:\n- max_answer_token_length:最大token数\n- tokenizer_dir:分词器路径\n- read_min/max_score:分数范围\n\n输出参数:\n- 长度合规返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "max_answer_token_length", - "default": 8192, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "tokenizer_dir", - "default": "Qwen/Qwen2.5-0.5B-Instruct", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 142, - "name": "ReasoningQuestionFilter", - "description": "该算子用于对问题进行正确性检查,包括格式是否规范、语义是否合理、条件是否矛盾以及是否具备充分信息可解。调用大语言模型依次执行四阶段判断,最终返回每个问题是否合格的二分类结果(保留合格样本)。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建检查提示词\n- input_key:输入问题字段名,默认为'math_problem'\n输出参数:\n- 过滤后的DataFrame,仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [ - "MathQuestionFilterPrompt", - "GeneralQuestionFilterPrompt", - "DiyQuestionFilterPrompt" - ], - "parameter": { - "init": [ - { - "name": "system_prompt", - "default": "You are a helpful assistant.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "math_problem", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 143, - "name": "ReasoningAnswerModelJudgeFilter", - "description": "该算子用于对答案进行正确性评判,通过比较当前答案与参考答案的语义一致性,判断答案是否正确。调用大语言模型进行语义理解和判断,最终返回每个答案是否正确的二分类结果。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建评判提示词\n- keep_all_samples:是否保留所有样本,默认为False(仅保留正确答案)\n- question_key:问题字段名,默认为'question'\n- answer_key:当前答案字段名,默认为'answer'\n- reference_key:参考答案字段名,默认为'reference_answer'\n输出参数:\n- DataFrame,包含原始数据和判断结果(answer_match_result字段)\n- 如果keep_all_samples为False,则仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [ - "AnswerJudgePromptQuestion", - "AnswerJudgePrompt" - ], - "parameter": { - "init": [ - { - "name": "system_prompt", - "default": "You are a helpful assistant specialized in evaluating answer correctness.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "keep_all_samples", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": "reference_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "text2sql": [ - { - "node": 144, - "name": "SQLConsistencyFilter", - "description": "对条目进行过滤,检测SQL和自然语言问题是否对应,即判断SQL是否能解决该问题。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n- input_question_key: 输入问题列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "filter" - }, - "allowed_prompts": [ - "SQLConsistencyFilterPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 145, - "name": "SQLExecutionFilter", - "description": "对条目进行过滤,在数据库中执行SQL,筛选掉不可执行的条目。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 146, - "name": "SQLGenerator", - "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "SelectSQLGeneratorPrompt", - "SelectVecSQLGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "generate_num", - "default": 300, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 147, - "name": "SQLByColumnGenerator", - "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "SelectSQLGeneratorPrompt", - "SelectVecSQLGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "generate_num", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 148, - "name": "SQLVariationGenerator", - "description": "对于每个条目,基于已有的SQL,指导模型生成SQL的变种,即在原有SQL的基础上,进行数据替换、函数变换、难度变换等操作,生成更加丰富的SQL。\n\n输入参数:\n- input_sql_key: SQL列名\n- input_db_id_key: 数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "SQLVariationGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_variations", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 149, - "name": "Text2SQLCoTGenerator", - "description": "对于每个条目,生成从自然语言问题和数据库Schema到SQL的CoT长链路推理过程。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_question_key: 输入问题列名\n- input_db_id_key: 输入数据库ID列名\n\n输出参数:\n- output_cot_key: 输出CoT列名", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2SQLCotGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_evidence_key", - "default": "evidence", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_cot_key", - "default": "cot_reasoning", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 150, - "name": "Text2SQLPromptGenerator", - "description": "从数据库提取Schema信息,结合自然语言问题生成提示词。其中提示词模版支持自定义。\n\n输入参数:\n- input_question_key: 问题列名\n- input_db_id_key: 数据库ID列名\n- output_prompt_key: 输出prompt列名\n\n输出参数:\n- output_prompt_key: 生成的prompt", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2SQLPromptGeneratorPrompt", - "Text2VecSQLPromptGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_evidence_key", - "default": "evidence", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_prompt_key", - "default": "prompt", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 151, - "name": "Text2SQLQuestionGenerator", - "description": "对于每个条目,如果自然语言问题为空,生成SQL对应的自然语言问题。为保证正确,生成多个候选问题,并选择最优的。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 数据库ID列名\n\n输出参数:\n- output_question_key: 输出问题列名", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2SQLQuestionGeneratorPrompt", - "Text2VecSQLQuestionGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "embedding_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "question_candidates_num", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_evidence_key", - "default": "evidence", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 152, - "name": "SQLComponentClassifier", - "description": "根据SQL的组件数量和复杂度,评估SQL的难度。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", - "type": { - "level_1": "text2sql", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "difficulty_thresholds", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "difficulty_labels", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_difficulty_key", - "default": "sql_component_difficulty", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 153, - "name": "SQLExecutionClassifier", - "description": "让模型根据自然语言问题、数据库Schema和提示词,多次生成SQL,通过生成SQL的准确率,评估该问题对于模型的难度。\n\n输入参数:\n- input_db_id_key: 输入数据库ID列名\n- input_sql_key: 输入SQL列名\n- input_prompt_key: 输入prompt列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", - "type": { - "level_1": "text2sql", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_generations", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "difficulty_thresholds", - "default": [ - 2, - 5, - 9 - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "difficulty_labels", - "default": [ - "extra", - "hard", - "medium", - "easy" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_prompt_key", - "default": "rl_prompt", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_difficulty_key", - "default": "sql_execution_difficulty", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "text_pt": [ - { - "node": 154, - "name": "CCNetDeduplicateFilter", - "description": "CCNet去重方法,基于SHA-1哈希算法的前N位进行重复识别,实现精确去重。\n\n初始化参数:\n- bit_length: 哈希值的位数,默认为64位\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "bit_length", - "default": 64, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 155, - "name": "DebertaV3SampleEvaluator", - "description": "基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量并返回分类结果。\n输入参数:\n- model_name:预训练模型名称\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n- output_key:输出分类结果字段名,默认为'Debertav3Score'\n输出参数:\n- 包含文本质量分类结果的DataFrame", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_name", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "Debertav3Score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 156, - "name": "DebertaV3Filter", - "description": "基于DebertaV3Scorer打分器的得分对数据进行过滤。使用Nvidia Deberta V3模型的质量分类器评估文本质量。\n\n初始化参数:\n- allowed_scores: 允许通过的分数列表,默认为['Medium', 'High']\n- model_name: 模型名称,默认为'nvidia/quality-classifier-deberta'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n- batch_size: 批处理大小,默认为16\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'Debertav3Score'\n\n过滤逻辑:保留分类结果在allowed_scores列表中的数据", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "allowed_scores", - "default": [ - "Medium", - "High" - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "nvidia/quality-classifier-deberta", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "batch_size", - "default": 16, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "Debertav3Score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 157, - "name": "FineWebEduSampleEvaluator", - "description": "基于Fineweb-Edu分类器评估文本的教育价值。该分类器使用预训练的序列分类模型对文本进行评估,返回0-1之间的分数,分数越高表示文本的教育价值越高。适用于筛选具有教育意义的文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 0-1之间的教育价值分数,越高表示教育价值越大", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "FinewebEduScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 158, - "name": "FineWebEduFilter", - "description": "基于FineWebEduScorer打分器的得分对数据进行过滤。Fineweb-Edu是一个用于评估文本教育价值的分类器。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'FinewebEduScore'\n\n评分标准:0-5分,分数越高表示文本具有越高的教育价值\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 2.5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10000, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "FinewebEduScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 159, - "name": "PairQualSampleEvaluator", - "description": "基于BGE模型和GPT成对比较数据训练的文本质量评分器,支持中英文输入。通过对文本进行单样本评估,返回0-1之间的质量分数,分数越高表示文本质量越好。模型分为英文版本(zks2856/PairQual-Scorer-en)和中文版本(zks2856/PairQual-Scorer-zh)。\n输入参数:\n- text: 待评估的文本字符串\n- lang: 语言类型,可选'en'或'zh'\n输出参数:\n- float: 0-1之间的质量分数,越高表示质量越好", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PairQualScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 160, - "name": "PairQualFilter", - "description": "基于PairQualScorer打分器的得分对数据进行过滤。基于BGE模型,使用GPT对文本成对比较打分后训练而成的双语文本质量评分器,得分越高表示质量越高。\n输入参数:\n- min_score:最小质量得分阈值\n- max_score:最大质量得分阈值\n- model_cache_dir:模型缓存目录路径\n- lang:文本语言类型\n输出参数:\n- 过滤后的DataFrame,仅保留质量得分在指定范围内的文本\n- 返回包含质量得分字段名的列表", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10000, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PairQualScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 161, - "name": "PerplexitySampleEvaluator", - "description": "基于Huggingface语言模型计算文本的困惑度(Perplexity),困惑度越低表示文本的流畅性和可理解性越高。输入参数:\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- float: 困惑度值,越低表示文本流畅性越好", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_name", - "default": "gpt2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 162, - "name": "PerplexityFilter", - "description": "基于PerplexityScorer打分器的得分对数据进行过滤。基于Huggingface模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。\n输入参数:\n- min_score:最小困惑度阈值\n- max_score:最大困惑度阈值\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- 过滤后的DataFrame,仅保留困惑度在指定范围内的文本\n- 返回包含困惑度得分字段名的列表", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 10.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 500.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "gpt2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 163, - "name": "QuratingSampleEvaluator", - "description": "通过Qurating模型(princeton-nlp/QuRater-1.3B)从四个维度评估文本质量:写作风格(writing_style)、所需专业程度(required_expertise)、事实与趣闻(facts_and_trivia)和教育价值(educational_value)。每个维度返回0-1之间的分数,综合评估文本的整体质量。\n输入参数:\n- text: 待评估的文本字符串\n- labels: 评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n输出参数:\n- dict: 包含各维度分数的字典,键为维度名称,值为0-1之间的分数", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "map_batch_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_workers", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device_batch_size", - "default": 16, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "labels", - "default": [ - "writing_style", - "required_expertise", - "facts_and_trivia", - "educational_value" - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 164, - "name": "QuratingFilter", - "description": "基于QuratingScorer打分器的得分对数据进行过滤。通过Qurating模型从四个维度评估文本质量:写作风格、所需专业知识、事实与 trivia 内容、教育价值。\n每个维度评分范围为0-9分,综合判断文本质量,可用于筛选高质量教育类或知识类内容。\n输入参数:\n- min_scores:各维度保留样本的最小分数阈值,默认为{'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n- max_scores:各维度保留样本的最大分数阈值,默认为{'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n- map_batch_size:映射批次大小,默认为512\n- num_workers:数据加载工作进程数,默认为1\n- device_batch_size:设备批次大小,默认为16\n- device:模型运行设备,默认为'cuda'\n- labels:评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留所有维度分数均在对应阈值范围内的样本\n- 返回包含各维度过滤结果字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_scores", - "default": { - "writing_style": 0, - "required_expertise": 0, - "facts_and_trivia": 0, - "educational_value": 0 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_scores", - "default": { - "writing_style": 9, - "required_expertise": 9, - "facts_and_trivia": 9, - "educational_value": 9 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "map_batch_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_workers", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device_batch_size", - "default": 16, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "labels", - "default": [ - "writing_style", - "required_expertise", - "facts_and_trivia", - "educational_value" - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 165, - "name": "TextbookSampleEvaluator", - "description": "基于FastText分类器(kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2)评估文本的教育价值,将文本分为低(Low)、中(Mid)、高(High)三个等级,并映射为1.0、3.0、5.0的分数。适用于筛选适合作为教材的高质量文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 教育价值分数,可能值为1.0(低)、3.0(中)、5.0(高)", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TextbookScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 166, - "name": "TextbookFilter", - "description": "基于TextbookScorer打分器的得分对数据进行过滤。使用FastText分类器评估文本的教育价值,判断文本是否适合作为教材内容。\n分类器经过训练可识别具有教育意义、结构清晰、知识准确的文本,适用于构建教育类数据集。\n输入参数:\n- min_score:保留样本的最小教育价值分数阈值,默认为0.99\n- max_score:保留样本的最大教育价值分数阈值,默认为1.0\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n- output_key:教育价值分数字段名,默认为'TextbookScore'\n输出参数:\n- 过滤后的DataFrame,仅保留教育价值分数在[min_score, max_score]范围内的样本\n- 返回包含教育价值分数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.99, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TextbookScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 167, - "name": "Phi4QAGenerator", - "description": "基于给定文档内容,生成预训练格式的多轮对话问答数据。将原始文档内容转换为适合语言模型预训练的对话格式数据。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入文档内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含原始内容和生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", - "type": { - "level_1": "text_pt", - "level_2": "generate" - }, - "allowed_prompts": [ - "Phi4QAGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 168, - "name": "MetaSampleEvaluator", - "description": "通过LLM评估文本的多个元属性,包括文本结构、多样性与复杂性、流畅性与可理解性、安全性、教育价值以及内容准确性与有效性。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimensions:评估维度列表,每个维度对应的字典中包含dimension_name,description,和示例字段:\n * dimension_name:维度名称\n * description:维度的描述\n * example_list:包含示例文本和得分的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含6个评估维度得分的DataFrame,列名为:Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [ - "MetaPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dimensions", - "default": [ - { - "dimension_name": "Text Structure", - "description": "Evaluate the surface-level quality of the text, including spelling accuracy, grammar, vocabulary richness, and sentence structure.", - "example_list": [ - { - "text": "The experimental procedure was meticulously documented, with each variable clearly defined.", - "score": "5" - }, - { - "text": "teh data was wrong and we dont no why it happen like that", - "score": "2" - } - ] - }, - { - "dimension_name": "Diversity and Complexity", - "description": "Assess how rich and conceptually varied the content is, and whether it requires expert or deep reasoning to understand.", - "example_list": [ - { - "text": "This article compares Bayesian inference and frequentist approaches in statistical modeling, highlighting theoretical and practical trade-offs.", - "score": "5" - }, - { - "text": "Dogs are pets. They bark. They are friendly.", - "score": "2" - } - ] - }, - { - "dimension_name": "Fluency and Understandability", - "description": "Evaluate whether the text flows naturally, is easy to follow, and avoids awkward or disjointed phrasing.", - "example_list": [ - { - "text": "Despite initial challenges, the team successfully completed the deployment by adhering to a revised strategy.", - "score": "5" - }, - { - "text": "The problem was and then fixed by something happens deployment successful maybe.", - "score": "2" - } - ] - }, - { - "dimension_name": "Safety", - "description": "Identify whether the text contains profanities, hate speech, or excessive personally identifiable information (PII).", - "example_list": [ - { - "text": "The software collects anonymous usage data to improve performance.", - "score": "5" - }, - { - "text": "You idiot, your address 123 Main St will be posted online.", - "score": "1" - } - ] - }, - { - "dimension_name": "Educational Value", - "description": "Determine whether the text provides insight, stimulates thinking, or offers meaningful learning potential.", - "example_list": [ - { - "text": "Understanding the principles of thermodynamics allows engineers to design more efficient engines.", - "score": "5" - }, - { - "text": "The sky is blue. Water is wet. This is how it is.", - "score": "2" - } - ] - }, - { - "dimension_name": "Content Accuracy and Effectiveness", - "description": "Assess the truthfulness, relevance, and practical usefulness of the content.", - "example_list": [ - { - "text": "Newton's second law states that F = ma, which explains the relationship between force, mass, and acceleration.", - "score": "5" - }, - { - "text": "The Earth is flat and doesn't rotate around the Sun.", - "score": "1" - } - ] - } - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "text_sft": [ - { - "node": 169, - "name": "AlpagasusSampleEvaluator", - "description": "通过调用GPT评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimension:评估维度,默认为'quality'\n- input_instruction_key:指令字段名\n- input_input_key:输入文本字段名\n- input_output_key:输出文本字段名\n- output_key:输出得分字段名,默认'AlpagasusScore'\n输出参数:\n- 包含评估得分的DataFrame", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [ - "AlpagasusPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dimension", - "default": "quality", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "AlpagasusScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 170, - "name": "DeitaQualitySampleEvaluator", - "description": "基于Llama模型的Deita指令质量评估器,通过生成1-6分的质量评分评估指令质量。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaQualityScore'\n输出参数:\n- 包含指令质量评分的DataFrame(1-6分)", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaQualityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 171, - "name": "DeitaComplexitySampleEvaluator", - "description": "基于Llama模型的Deita指令复杂性评估器,通过生成1-6分的复杂性评分评估指令难度。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaComplexityScore'\n输出参数:\n- 包含指令复杂性评分的DataFrame(1-6分)", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaComplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 172, - "name": "InstagSampleEvaluator", - "description": "使用Instag评分器评估指令的内容多样性和意图标签。通过分析指令文本生成相关标签,标签数量越多表示内容多样性越大,同时返回标签的详细解释。基于OFA-Sys/InsTagger模型实现。\n输入参数:\n- query: 待评估的指令文本\n输出参数:\n- int: 标签数量(内容多样性指标)\n- list: 包含标签和解释的字典列表", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_new_tokens", - "default": 1024, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "temperature", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "do_sample", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_return_sequences", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "return_dict_in_generate", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "InstagScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 173, - "name": "RMSampleEvaluator", - "description": "基于人类偏好数据训练的奖励模型(OpenAssistant/reward-model-deberta-v3-large-v2)对文本质量进行打分,高分代表质量较高。模型输入为指令和响应文本对,输出0-1之间的奖励分数,反映人类对文本质量的偏好判断。\n输入参数:\n- instruction: 指令文本字符串\n- output: 响应文本字符串\n输出参数:\n- float: 0-1之间的奖励分数,越高表示质量越好", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "RMScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 174, - "name": "SuperfilteringSampleEvaluator", - "description": "使用Superfiltering方法评估指令的跟随难度,基于GPT-2模型计算条件困惑度与独立困惑度的比值,得分越高表示指令越难跟随。该方法通过比较指令条件下的响应困惑度与独立响应困惑度,评估指令的清晰度和跟随难度。\n输入参数:\n- instruction: 指令文本\n- input_text: 输入文本(可选)\n- output: 响应文本\n输出参数:\n- float: 困惑度比值,越高表示指令跟随难度越大", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "SuperfilteringScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 175, - "name": "TreeinstructSampleEvaluator", - "description": "通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:指令字段名\n- output_key:输出得分字段名,默认'TreeinstructScore'\n输出参数:\n- 包含指令复杂性得分的DataFrame", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [ - "TreeinstructPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TreeinstructScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 176, - "name": "AlpagasusFilter", - "description": "基于AlpagasusScorer打分器的得分对数据进行过滤。通过调用GPT模型评估指令的质量,返回一个质量得分。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3\n- max_score: 最高分数阈值,默认为5\n- llm_serving: LLM服务实例\n- dimension: 评估维度,默认为'quality'(质量)\n\n运行参数:\n- input_instruction_key: 输入指令字段名\n- input_input_key: 输入内容字段名\n- input_output_key: 输出内容字段名\n- output_key: 输出分数字段名,默认为'AlpagasusScore'\n\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dimension", - "default": "quality", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "AlpagasusScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 177, - "name": "DeitaQualityFilter", - "description": "基于DeitaQualityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令质量评估器,评估指令的质量高低。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaQualityScore'\n\n评分标准:1-6分,分数越高表示指令质量越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 2.5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10000.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaQualityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 178, - "name": "DeitaComplexityFilter", - "description": "基于DeitaComplexityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令复杂性评估器,评估指令的复杂程度。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3.0\n- max_score: 最高分数阈值,默认为5.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaComplexityScore'\n\n评分标准:1-6分,分数越高表示指令复杂性越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 3.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaComplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 179, - "name": "InstagFilter", - "description": "基于InstagScorer打分器的过滤算子。使用预训练的Instag模型对指令进行分析,返回标签的数量来评估指令的内容多样性。参数包括模型缓存目录(model_cache_dir)、计算设备(device)和最大新生成标记数(max_new_tokens)。过滤范围由min_score和max_score参数控制,标签越多表示内容多样性越大。", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_new_tokens", - "default": 1024, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "InstagScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 180, - "name": "RMFilter", - "description": "基于RMScorer打分器的得分对数据进行过滤。使用基于人类偏好数据训练的奖励模型对文本质量进行评分,高分代表质量较高。\n奖励模型能够评估文本的相关性、有用性、无害性等人类偏好指标,可用于筛选符合人类价值观的高质量文本。\n输入参数:\n- min_score:保留样本的最小奖励分数阈值,默认为0.2\n- max_score:保留样本的最大奖励分数阈值,默认为0.8\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_output_key:输出字段名,默认为'output'\n输出参数:\n- 过滤后的DataFrame,仅保留奖励分数在[min_score, max_score]范围内的样本\n- 返回包含奖励分数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.2, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 0.8, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "RMScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 181, - "name": "SuperfilteringFilter", - "description": "使用Superfiltering评分器过滤掉低质量数据。基于GPT-2模型计算困惑度比值来评估指令跟随难度,比值越低表示指令越容易被模型理解和执行。\n适用于筛选适合特定模型能力的指令数据,提高模型训练效率和效果。\n输入参数:\n- min_score:保留样本的最小分数阈值,默认为0.0\n- max_score:保留样本的最大分数阈值,默认为1.0\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:文本最大长度,默认为512\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_input_key:输入字段名,默认为'input'\n- input_output_key:输出字段名,默认为'output'\n- output_key:过滤结果分数字段名,默认为'SuperfilteringScore'\n输出参数:\n- 过滤后的DataFrame,仅保留分数在[min_score, max_score]范围内的样本\n- 返回包含过滤结果分数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": "input", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "SuperfilteringScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 182, - "name": "TreeinstructFilter", - "description": "基于TreeinstructScore打分器的得分对数据进行过滤。通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n适用于筛选特定复杂度范围内的指令数据,平衡数据集难度分布,优化模型训练效果。\n输入参数:\n- min_score:保留样本的最小语法树节点数阈值,默认为7\n- max_score:保留样本的最大语法树节点数阈值,默认为100\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入指令字段名\n- output_key:语法树节点数字段名,默认为'TreeinstructScore'\n输出参数:\n- 过滤后的DataFrame,仅保留语法树节点数在[min_score, max_score]范围内的样本\n- 返回包含语法树节点数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 7, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TreeinstructScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 183, - "name": "CondorGenerator", - "description": "基于预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量)。第一阶段生成不同难度级别的问题,第二阶段为每个问题生成对应的答案。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_samples:生成样本总数,建议小于5000,默认值为15\n输出参数:\n- 包含'difficulty'、'instruction'和'output'字段的DataFrame\n- 返回生成的DataFrame用于后续处理", - "type": { - "level_1": "text_sft", - "level_2": "generate" - }, - "allowed_prompts": [ - "CondorQuestionPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_samples", - "default": 15, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_task_diversity", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 184, - "name": "SFTGeneratorSeed", - "description": "基于给定文档内容,生成监督微调格式的问答数据。并支持用户自定义生成内容要求。从原始文档中提取信息,生成符合SFT格式的指令-响应对。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- custom_prompt:用户自定义提示词\n- input_key:输入文档内容字段名,默认为'raw_content'\n- max_tokens:生成文本的最大token数,默认为4096\n输出参数:\n- 包含'instruction'、'output'和'raw_content'字段的DataFrame\n- 返回包含'instruction'和'output'字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "custom_prompt", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 185, - "name": "CondorRefiner", - "description": "两阶段优化指令回复质量:第一阶段调用API生成对回复的评论,第二阶段利用评论调用API改写回复,提升指令对质量。通过迭代优化提高问答对的整体质量。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:输入指令字段名,默认为'instruction'\n- input_output_key:输入回复字段名,默认为'output'\n输出参数:\n- 包含优化后回复的DataFrame\n- 返回包含优化后回复字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "refine" - }, - "allowed_prompts": [ - "CondorCritiquePrompt", - "CondorRefinePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "vqa": [ - { - "node": 186, - "name": "VQAExtractPdf2Img", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "dpi", - "default": 300, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_pdf_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 187, - "name": "VQAExtractDocLayoutMinerU", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "mineru_backend", - "default": "vlm-transformers", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_pdf_file_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 188, - "name": "VQAExtractPicExtractor", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [ - "VQAExtractPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "interleaved", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_subject", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 189, - "name": "VQAExtractQAPairExtractor", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_vqa_extract_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_qa_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 190, - "name": "VQAExtractTag2Img", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "layout_prefix", - "default": "doclay_page_", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "image_prefix", - "default": "page_", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_json", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_pdf_image_dir", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_dir", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_qa_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_qa_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_md_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 191, - "name": "VQAClipHeader", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_image_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_prefix", - "default": "doclay", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 192, - "name": "VQAConcatenateImages", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ], - "Default": [ - { - "node": 1, - "name": "AgenticRAGQAF1SampleEvaluator", - "description": "用于评估预测答案与多个参考答案之间的 F1 分数", - "type": { - "level_1": "agentic_rag", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_prediction_key", - "default": "refined_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_ground_truth_key", - "default": "golden_doc_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "F1Score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 2, - "name": "AgenticRAGAtomicTaskGenerator", - "description": "该算子用于为提供的文本内容生成合适的高质量问题与可验证答案。\n\n输入参数:\n- input_key: 输入文本内容字段名(默认值:\"prompts\")\n- output_question_key: 输出问题字段名(默认值:\"question\")\n- output_answer_key: 输出答案字段名(默认值:\"answer\")\n- output_refined_answer_key: 输出精炼答案字段名(默认值:\"refined_answer\")\n- output_optional_answer_key: 输出可替代精炼答案字段名(默认值:\"optional_answer\")\n- output_golden_doc_answer_key: 输出黄金文档回答字段名(默认值:\"golden_doc_answer\")\n", - "type": { - "level_1": "agentic_rag", - "level_2": "generate" - }, - "allowed_prompts": [ - "AtomicTaskGeneratorGetIdentifierPrompt", - "AtomicTaskGeneratorGetConlcusionPrompt", - "AtomicTaskGeneratorQuestionPrompt", - "AtomicTaskGeneratorCleanQAPrompt", - "AtomicTaskGeneratorAnswerPrompt", - "AtomicTaskGeneratorRecallScorePrompt", - "AtomicTaskGeneratorOptionalAnswerPrompt", - "AtomicTaskGeneratorGoldenDocAnswerPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "data_num", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_per_task", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_question", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "prompts", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_key", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_refined_answer_key", - "default": "refined_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_optional_answer_key", - "default": "optional_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_llm_answer_key", - "default": "llm_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_golden_doc_answer_key", - "default": "golden_doc_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 3, - "name": "AgenticRAGDepthQAGenerator", - "description": "该算子以已有问答生成更深度的问题。\n\n输入参数:\n- input_key: 输入字段名(默认值:\"question\")\n- output_key: 输出字段名(默认值:\"depth_question\")\n", - "type": { - "level_1": "agentic_rag", - "level_2": "generate" - }, - "allowed_prompts": [ - "DepthQAGeneratorGetIdentifierPrompt", - "DepthQAGeneratorBackwardTaskPrompt", - "DepthQAGeneratorSupersetCheckPrompt", - "DepthQAGeneratorQuestionPrompt", - "DepthQAGeneratorAnswerPrompt", - "DepthQAGeneratorRecallScorePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "n_rounds", - "default": 2, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "depth_question", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 4, - "name": "AgenticRAGWidthQAGenerator", - "description": "该算子用于结合两个问答,生成新的问题。\n\n输入参数:\n- input_question_key: 输入问题字段名(默认值:\"question\")\n- input_identifier_key: 输入标识符字段名(默认值:\"identifier\")\n- input_answer_key: 输入答案字段名(默认值:\"answer\")\n- output_question_key: 输出问题字段名(默认值:\"generated_width_task\")\n", - "type": { - "level_1": "agentic_rag", - "level_2": "generate" - }, - "allowed_prompts": [ - "WidthQAGeneratorMergePrompt", - "WidthQAGeneratorOriginCheckPrompt", - "WidthQAGeneratorQuestionVerifyPrompt", - "WidthQAGeneratorAnswerPrompt", - "WidthQAGeneratorRecallScorePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_identifier_key", - "default": "identifier", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_key", - "default": "generated_width_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 5, - "name": "ExtractSmilesFromTextGenerator", - "description": "ExtractSmilesFromText 用于从 OCR 文本中抽取或解析化学分子的 SMILES 表达式。算子会根据给定的提示模板(prompt_template),结合文本内容和(可选的)单体缩写信息,调用大语言模型完成解析与结构化,并将结果以 JSON 格式写回到指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- prompt_template:提示词模板对象,用于构造模型输入\n- input_content_key: OCR 文本的列名(默认 'text')\n- input_abbreviation_key:包含缩写/单体信息的列名(默认 'abbreviations'),可为空\n- output_key:写回抽取结果的列名(默认 'synth_smiles')\n\n输出参数:\n- DataFrame,其中 output_key 列为模型返回并经 JSON 解析后的 SMILES 结构\n- 返回 output_key,供后续算子引用\n\n备注:\n- 模型输出会尝试解析为 JSON;若解析失败,将返回 [] 并记录失败次数。", - "type": { - "level_1": "chemistry", - "level_2": "generate" - }, - "allowed_prompts": [ - "ExtractSmilesFromTextPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_content_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_abbreviation_key", - "default": "abbreviations", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "synth_smiles", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 6, - "name": "SmilesEquivalenceDatasetEvaluator", - "description": "评估 golden_label 与 synth_smiles 的 SMILES 等价性并计算分数。逐块输出 final_result、块内得分与准确率,并统计全局总分。", - "type": { - "level_1": "chemistry", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_golden_key", - "default": "golden_label", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_synth_key", - "default": "synth_smiles", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "final_result", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 7, - "name": "CodeAutoGeneratedSampleEvaluator", - "description": "基于自动生成标记评估代码样本,检测文件头部的自动生成标记。\n\n评估指标:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分 (0-1,1表示非自动生成)\n\n输入要求:需要包含'lines'列\n\n输出参数:\n- CodeAutoGeneratedMarkerCount: 检测到的自动生成标记数量\n- CodeAutoGeneratedScore: 综合自动生成得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "is_generated_func", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 8, - "name": "CodeAutoGeneratedFilter", - "description": "基于CodeAutoGeneratedSampleEvaluator的得分过滤自动生成的代码文件,确保只保留人工编写的代码。\n\n评估指标:\n- 自动生成标记数量:检测文件前5行中的自动生成标记\n- 检测标记:'auto-generated', 'autogenerated', 'automatically generated'等\n- 综合自动生成得分:0-1,1表示非自动生成\n- 支持外部检测函数进行额外验证\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'列)\n- output_key: 输出标签字段名 (默认: 'auto_generated_filter_label')\n- min_score: 最小自动生成得分阈值 (默认: 1.0)\n- max_score: 最大自动生成得分阈值 (默认: 1.0)\n- is_generated_func: 可选的外部检测函数\n\n输出参数:\n- 过滤后的DataFrame,仅保留自动生成得分在指定范围内的代码样本\n- 返回包含自动生成得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "is_generated_func", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "auto_generated_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 9, - "name": "CodeDocumentQualitySampleEvaluator", - "description": "基于综合文档级质量指标评估代码样本,包括内容长度、重复模式、字符组成和文本熵值。\n\n评估指标:\n- CodeDocumentQualityCharCount: 字符数\n- CodeDocumentQualityWordCount: 词数\n- CodeDocumentQualityDuplicateLinesRatio: 重复行比例\n- CodeDocumentQualityDuplicateNgramRatio: n-gram重复比例\n- CodeDocumentQualityCurlyBracketRatio: 花括号比例\n- CodeDocumentQualityAllCapsRatio: 全大写单词比例\n- CodeDocumentQualityEntropy: 单字符熵值\n- CodeDocumentQualityScore: 综合文档质量得分 (0-1,1表示通过所有质量检查)\n\n输入要求:需要包含'text'、'filename'、'language'列\n\n输出参数:\n- 各种质量指标的数值\n- CodeDocumentQualityScore: 综合文档质量得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "thresholds", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 10, - "name": "CodeDocumentQualityFilter", - "description": "基于CodeDocumentQualitySampleEvaluator的得分应用综合文档级质量过滤规则,移除低质量代码和文本样本。\n\n评估指标:\n- 内容长度:字符数、词数、行数范围检查\n- 重复模式:重复行比例、2-10gram重复比例\n- 字符组成:花括号比例、全大写单词比例\n- 文本熵值:单字符熵值检查\n- 综合文档质量得分:0-1,1表示通过所有质量检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'、'filename'、'language'列)\n- output_key: 输出标签字段名 (默认: 'doc_quality_filter_label')\n- min_score: 最小文档质量得分阈值 (默认: 1.0)\n- max_score: 最大文档质量得分阈值 (默认: 1.0)\n- thresholds: 可选的阈值字典,用于覆盖默认阈值\n\n输出参数:\n- 过滤后的DataFrame,仅保留文档质量得分在指定范围内的样本\n- 返回包含文档质量得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "thresholds", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "doc_quality_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 11, - "name": "CodeEncodedDataSampleEvaluator", - "description": "基于编码数据模式评估代码样本,检测Base64、十六进制和Unicode转义序列。\n\n评估指标:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分 (0-1,1表示通过编码数据检查)\n\n输入要求:需要包含'text'列\n\n输出参数:\n- CodeEncodedDataBase64Ratio: Base64编码数据比例\n- CodeEncodedDataHexRatio: 十六进制数据比例\n- CodeEncodedDataUnicodeRatio: Unicode转义序列比例\n- CodeEncodedDataScore: 综合编码数据得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 12, - "name": "CodeEncodedDataFilter", - "description": "基于CodeEncodedDataSampleEvaluator的得分过滤代码样本,移除二进制内容和自动生成代码。\n\n评估指标:\n- Base64编码数据比例:检测连续64+字符的Base64字符串\n- 十六进制数据比例:检测8+个连续的十六进制对\n- Unicode转义序列比例:检测8+个连续的\\uXXXX序列\n- 综合编码数据得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'列)\n- output_key: 输出标签字段名 (默认: 'encoded_data_filter_label')\n- min_score: 最小编码数据得分阈值 (默认: 1.0)\n- max_score: 最大编码数据得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留编码数据得分在指定范围内的代码样本\n- 返回包含编码数据得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "encoded_data_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 13, - "name": "CodeFileTypeContentFilter", - "description": "基于文件类型和内容特征直接过滤代码样本,针对不同文件格式应用特定规则。\n\n过滤规则:\n- Text/JSON/YAML/Graphviz文件:行数 > 512 行\n- HTML文件:可见文本长度 < 100字符 或 可见文本比例 < 20%\n- Text文件:文件名不符合文档规范(非readme/notes/todo等)\n\n输入参数:\n- input_key: 输入字段名(需要包含'filetype'、'filename'、'line_count'等列)\n- output_key: 输出标签字段名 (默认: 'file_type_content_filter_label')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合文件类型规则的样本\n- 返回包含输出标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "file_type_content_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 14, - "name": "CodeLengthSampleEvaluator", - "description": "基于代码长度特征评估代码样本,分析总行数、平均行长和最大行长。\n\n评估指标:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分 (0-1,1表示通过所有长度检查)\n\n输入要求:需要包含'lines'和'language'列\n\n输出参数:\n- CodeLengthTotalLines: 总行数\n- CodeLengthAvgLineLength: 平均行长\n- CodeLengthMaxLineLength: 最大行长\n- CodeLengthScore: 综合长度得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 15, - "name": "CodeLengthSampleFilter", - "description": "基于CodeLengthSampleEvaluator的得分过滤代码样本,移除超大文件和格式不良的代码。\n\n评估指标:\n- 总行数:检查是否超过100,000行\n- 平均行长:普通语言>100字符,特殊语言>100,000字符\n- 最大行长:普通语言>1,000字符\n\n输入参数:\n- input_key: 输入字段名(需要包含'lines'和'language'列)\n- output_key: 输出标签字段名 (默认: 'length_filter_label')\n- min_score: 最小长度得分阈值 (默认: 1.0)\n- max_score: 最大长度得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留长度得分在指定范围内的代码样本\n- 返回包含长度得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "length_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 16, - "name": "CodeQualitySampleEvaluator", - "description": "该算子用于评估生成的代码片段与其源指令的匹配质量,并输出分数和反馈。\n\n输入参数:\n- input_instruction_key: 包含人类指令的字段名 (默认: 'generated_instruction')\n- input_code_key: 包含生成代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_score_key: 用于存储质量分数的字段名 (默认: 'quality_score')\n- output_feedback_key: 用于存储质量反馈的字段名 (默认: 'quality_feedback')\n", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [ - "CodeQualityEvaluatorPrompt", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_code_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_score_key", - "default": "quality_score", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_feedback_key", - "default": "quality_feedback", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 17, - "name": "CodeQualityScoreFilter", - "description": "基于LLM生成的代码质量分数过滤代码样本,评估正确性、完整性、清晰度、最佳实践和效率。\n\n评估维度:\n- 正确性:代码语法和逻辑是否正确\n- 完整性:代码是否完整实现功能\n- 清晰度:代码是否清晰易懂\n- 最佳实践:是否遵循编程最佳实践\n- 效率:代码执行效率如何\n\n输入参数:\n- input_code_key: 输入代码字段名\n- input_instruction_key: 输入指令字段名\n- output_score_key: 输出打分字段名 (默认: 'quality_score')\n- output_feedback_key: 输出反馈字段名 (默认: 'quality_feedback')\n- output_key: 输出过滤标签字段名 (默认: 'quality_score_filter_label')\n- min_score: 最小质量分数阈值 (默认: 7)\n- max_score: 最大质量分数阈值 (默认: 10)\n\n输出参数:\n- 过滤后的DataFrame,仅保留质量分数在指定范围内的代码样本\n- 返回包含质量分数标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_score", - "default": 7, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_code_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_score_key", - "default": "quality_score", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_feedback_key", - "default": "quality_feedback", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "quality_score_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 18, - "name": "CodeGenericScoreFilter", - "description": "基于数值分数列直接过滤数据集,提供灵活的阈值比较方法。\n\n比较方法:\n- greater_equal: 分数 >= 阈值\n- greater: 分数 > 阈值\n- less_equal: 分数 <= 阈值\n- less: 分数 < 阈值\n- equal: 分数 = 阈值\n\n输入参数:\n- input_key: 包含分数的字段名\n- output_key: 输出标签字段名 (默认: 'generic_score_filter_label')\n- score_threshold: 分数阈值 (默认: 8)\n- filter_method: 比较方法 (默认: 'greater_equal')\n\n输出参数:\n- 过滤后的DataFrame,仅保留符合分数条件的样本\n- 返回包含输出标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "score_threshold", - "default": 8, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "filter_method", - "default": "greater_equal", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generic_score_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 19, - "name": "CodeTextCompositionSampleEvaluator", - "description": "基于字符组成评估代码样本,分析字母字符和字母数字字符的比例。\n\n评估指标:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分 (0-1,1表示通过字符组成检查)\n\n输入要求:需要包含'text'和'language'列\n\n输出参数:\n- CodeTextCompositionAlphaRatio: 字母字符比例\n- CodeTextCompositionAlnumRatio: 字母数字字符比例\n- CodeTextCompositionScore: 综合字符组成得分", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 20, - "name": "CodeTextCompositionFilter", - "description": "基于CodeTextCompositionSampleEvaluator的得分过滤代码样本,移除二进制文件、加密内容和不可读文本。\n\n评估指标:\n- 字母字符比例:普通语言需要>=25%\n- 字母数字字符比例:汇编语言需要>=25%\n- 综合字符组成得分:0-1,1表示通过检查\n\n输入参数:\n- input_key: 输入字段名(需要包含'text'和'language'列)\n- output_key: 输出标签字段名 (默认: 'text_composition_filter_label')\n- min_score: 最小字符组成得分阈值 (默认: 1.0)\n- max_score: 最大字符组成得分阈值 (默认: 1.0)\n\n输出参数:\n- 过滤后的DataFrame,仅保留字符组成得分在指定范围内的代码样本\n- 返回包含字符组成得分标签字段名的列表", - "type": { - "level_1": "code", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "text_composition_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 21, - "name": "CodeCodeToInstructionGenerator", - "description": "该算子用于分析代码片段并反向生成可能产生该代码的人类指令。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeCodeToInstructionGeneratorPrompt", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "code", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_instruction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 22, - "name": "CodeInstructionToCodeGenerator", - "description": "该算子根据给定的人类指令生成相应的代码片段。\n\n输入参数:\n- input_key: 包含人类指令的字段名 (默认: 'instruction')\n输出参数:\n- output_key: 用于存储生成代码的字段名 (默认: 'generated_code')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeInstructionToCodeGeneratorPrompt", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_code", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 23, - "name": "CodeEnhancementInstructionGenerator", - "description": "该算子用于增强人类指令,将不同输出格式的任务统一为生成完整函数。\n\n输入参数:\n- input_key: 包含原始代码片段的字段名 (默认: 'code')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeInstructionEnhancement", - "DiyCodePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "messages", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_instruction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 24, - "name": "CodeInstructionGenerator", - "description": "该算子用于生成新的指令,从数据池中随机抽取few-shot样本,生成类似难度的指令。\n\n输入参数:\n- input_key: 包含原始指令的字段名 (默认: 'prompt')\n输出参数:\n- output_key: 用于存储生成指令的字段名 (默认: 'generated_instruction')\n", - "type": { - "level_1": "code", - "level_2": "generate" - }, - "allowed_prompts": [ - "CodeInstructionGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_few_shot", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_generate", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "prompt", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_instruction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 25, - "name": "CodeSandboxSampleEvaluator", - "description": "该算子在一个安全的沙箱环境中执行代码片段以验证其正确性。\n\n输入参数:\n- input_code_key: 包含待执行代码的字段名 (默认: 'generated_code')\n输出参数:\n- output_status_key: 用于存储执行状态 ('PASS' 或 'FAIL') 的字段名 (默认: 'sandbox_status')\n- output_log_key: 用于存储执行日志或错误信息的字段名 (默认: 'sandbox_log')\n", - "type": { - "level_1": "code", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "language", - "default": "python", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "timeout_length", - "default": 15, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_process_isolation", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_status_key", - "default": "sandbox_status", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_log_key", - "default": "sandbox_log", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 26, - "name": "ScenarioExtractGenerator", - "description": "从对话内容中提取场景信息,使用LLM服务分析对话并生成场景描述。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_chat_key:对话内容字段名\n- output_key:输出场景字段名,默认'scenario'\n输出参数:\n- 包含提取场景信息的DataFrame\n- 包含输出字段名的列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ExtractScenarioPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_chat_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "scenario", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 27, - "name": "ScenarioExpandGenerator", - "description": "基于原始场景生成新的替代场景,使用LLM服务重写或改写原有场景内容。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:原始场景字段名\n- output_key:生成的新场景字段名,默认'modified_scenario'\n输出参数:\n- 包含生成新场景的DataFrame\n- 包含输出字段名的列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ExpandScenarioPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_scenario_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "modified_scenario", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 28, - "name": "AtomTaskGenerator", - "description": "根据输入的场景信息,使用LLM服务生成对应的原子任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_scenario_key:场景字段名\n- output_key:原子任务的输出字段名,默认'atom_task'\n输出参数:\n- 包含原子任务的DataFrame\n- 包含输出字段名的列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "FuncAtomicTaskGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_scenario_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "atom_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 29, - "name": "SequentialTaskGenerator", - "description": "根据输入的原子任务,使用LLM服务生成该任务的后继任务和两者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含后继任务和组合任务的DataFrame\n- 输出字段名的列表(后继任务字段和组合任务字段)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "SequentialTaskGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_subsequent_task_key", - "default": "subsequent_task", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_composition_task_key", - "default": "composition_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 30, - "name": "ParaSeqTaskGenerator", - "description": "基于原子任务,使用LLM服务生成三个任务类型:并行任务、后继任务以及这三者的组合任务。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:原子任务字段名\n- output_parallel_task_key:并行任务输出字段名,默认'parallel_task'\n- output_subsequent_task_key:后继任务输出字段名,默认'subsequent_task'\n- output_composition_task_key:组合任务输出字段名,默认'composition_task'\n输出参数:\n- 包含并行任务、后继任务与组合任务的DataFrame\n- 输出字段名列表(并行任务、后继任务、组合任务)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ParathenSeqTaskGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_parallel_task_key", - "default": "parallel_task", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_subsequent_task_key", - "default": "subsequent_task", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_composition_task_key", - "default": "composition_task", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 31, - "name": "FunctionGenerator", - "description": "基于组合任务及其相关子任务,使用LLM服务生成对应的函数列表。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:函数列表输出字段名,默认'functions'\n输出参数:\n- 包含函数定义或函数列表的DataFrame\n- 输出字段名的列表(函数列表字段)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "FuncGeneratePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_composition_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sub_tasks_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "functions", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 32, - "name": "MultiTurnConversationGenerator", - "description": "根据组合任务及其子任务函数,使用LLM服务模拟多轮对话过程,由User、Assistant和Tool三个Agent协同生成完整的对话数据。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_task_key:任务字段名(组合任务)\n- input_sub_tasks_keys:子任务字段名列表\n- input_functions_key:子任务函数字段名\n- output_conversations_key:输出对话字段名,默认'conversations'\n输出参数:\n- 包含已完成的多轮对话记录的DataFrame\n- 输出字段名(对话字段名)", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ConversationUserPrompt", - "ConversationAssistantPrompt", - "ConversationToolPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sub_tasks_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_functions_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_conversations_key", - "default": "conversations", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 33, - "name": "ConsistentChatGenerator", - "description": "根据预置主题和人类意图,两阶段从0合成多轮对话格式数据(合成数量大于9000时建议增加标签数量)。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_dialogs_per_intent:每个意图生成的对话数量,默认20\n- num_turns_per_dialog:每个对话的轮次数量,默认6\n- temperature:生成温度,控制输出随机性,默认0.9\n输出参数:\n- 包含category和conversation字段的DataFrame,其中conversation为多轮对话列表", - "type": { - "level_1": "conversations", - "level_2": "generate" - }, - "allowed_prompts": [ - "ConsistentQueryPrompt", - "ConsistentResponsePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_dialogs_per_intent", - "default": 20, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_turns_per_dialog", - "default": 6, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "temperature", - "default": 0.9, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 34, - "name": "FuncCallConversationSampleEvaluator", - "description": "对对话样本进行打分评估:使用 LLM 服务根据预设评分提示词对每条对话进行评分,并将结果写回数据流。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- input_conversation_key:DataFrame 中对话内容字段名,默认 'conversations'\n- output_score_key:评分结果输出字段名,默认 'score'\n处理流程:\n- 读取存储中的 DataFrame\n- 将每条对话重组为评分提示词并调用 LLM 生成评分(JSON)\n- 解析 JSON,提取 'score' 字段写入 DataFrame;解析失败则回退为 0\n输出参数:\n- 包含评分结果列的 DataFrame\n- 包含输出字段名的列表(仅 'score' 或自定义的输出列名)", - "type": { - "level_1": "conversations", - "level_2": "eval" - }, - "allowed_prompts": [ - "ConversationEvalPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_conversation_key", - "default": "conversations", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_score_key", - "default": "score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 35, - "name": "CompositionTaskFilter", - "description": "根据组合任务及其子任务,使用LLM服务判断组合任务是否具备可行性与完备性,从而进行可运行任务的筛选。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_composition_task_key:组合任务字段名\n- input_sub_tasks_keys:子任务字段名列表(如原子任务、并行任务、后继任务等)\n- output_key:可运行标签的输出字段名,默认'runable_label'\n输出参数:\n- 仅包含可运行组合任务的数据DataFrame\n- 包含输出字段名的列表(可运行标签字段)", - "type": { - "level_1": "conversations", - "level_2": "filter" - }, - "allowed_prompts": [ - "CompositionTaskFilterPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_composition_task_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sub_tasks_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "runable_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 36, - "name": "Speech2TextGenerator", - "description": "该算子用于将语音内容转录为文本。它接收语音文件路径或URL,使用大语言模型进行转录,并将转录结果保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant'\n- input_key:输入语音文件路径或URL的字段名,默认为'raw_content'\n- output_key:输出转录文本的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含转录文本的新列", - "type": { - "level_1": "core_speech", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 37, - "name": "PromptedGenerator", - "description": "基于用户提供的提示词(prompt)生成数据。结合系统提示词和输入内容生成符合要求的输出文本。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,定义模型行为,默认为'You are a helpful agent.'\n- input_key:输入内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "json_schema", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 38, - "name": "PairedPromptedGenerator", - "description": "PairedPromptedGenerator:基于两列配对输入(input_key_1 与 input_key_2)进行成对提示生成。\n算子会将 system_prompt 与每行的两列文本按固定模板拼接后,调用 LLM 服务批量生成结果,并将模型输出写回到 DataFrame 的指定列。\n\n输入参数:\n- llm_serving:LLM 服务对象(实现 LLMServingABC 接口)\n- system_prompt:系统提示词(默认 'You are a helpful agent.')。该提示会放在每条样本前缀, 用于约束模型的角色与输出风格。\n- input_key_1:第一列输入字段名(默认 'input_key_1')\n- input_key_2:第二列输入字段名(默认 'input_key_2')\n- output_key:输出字段名(默认 'generated_content')\n\n处理逻辑:\n1) 从 storage 中读取名为 'dataframe' 的 DataFrame;\n2) 对于每一行,若 input_key_1 与 input_key_2 均非空,则按模板:\n system_prompt + input_key_1 + 值 + '\\n' + input_key_2 + 值\n 构造 LLM 输入;\n3) 批量调用 llm_serving.generate_from_input 生成文本;\n4) 将生成结果写入 DataFrame 的 output_key 列并保存。\n\n输出:\n- 返回写入了生成结果的新 DataFrame(由 storage 管理保存),\n- 返回 output_key 以便后续算子引用。", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key_1", - "default": "input_key_1", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key_2", - "default": "input_key_2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 39, - "name": "RandomDomainKnowledgeRowGenerator", - "description": "N/A (调用失败)", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [ - "SFTFromScratchGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "generation_num", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "domain_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 40, - "name": "Text2QAGenerator", - "description": "该算子用于为给定的文档片段生成种子QA对。\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- prompt_key: 包含提示词的字段名\n- output_quesion_key: 包含生成问题的字段名\n- output_answer_key: 包含生成答案的字段名\n", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2QAAutoPromptGeneratorPrompt", - "Text2QASeedQuestionGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_num", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_prompt_key", - "default": "generated_prompt", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_quesion_key", - "default": "generated_question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_key", - "default": "generated_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 41, - "name": "Text2MultiHopQAGenerator", - "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2MultiHopQAGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "seed", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_q", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "cleaned_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "QA_pairs", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_meta_key", - "default": "QA_metadata", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 42, - "name": "EmbeddingGenerator", - "description": "EmbeddingGenerator算子用于从输入文本生成向量表示(embedding),通常用于语义检索、聚类或下游模型输入等任务。\n\n输入参数:\n- embedding_serving:Embedding服务对象,需实现LLMServingABC接口,用于生成文本的向量表示\n- input_key:输入文本字段名,默认为'text'\n- output_key:输出向量字段名,默认为'embeddings'\n\n输出参数:\n- 包含文本向量的DataFrame,每行对应一个输入文本的embedding\n- 返回输出字段名(如'embeddings'),可供后续算子引用", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "embedding_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "embeddings", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 43, - "name": "RetrievalGenerator", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "core_text", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "json_schema", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 44, - "name": "BenchDatasetEvaluator", - "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估语义相似度,仅输入预测答案与标准答案\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "eval_result_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "compare_method", - "default": "match", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant specialized in evaluating answer correctness.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_test_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_answer_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 45, - "name": "BenchDatasetEvaluatorQuestion", - "description": "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- input_question_key:问题字段名(语义匹配模式下必需)\n- compare_method:比较方法(match/semantic)\n\n输出参数:\n- answer_match_result:匹配结果(True/False)\n- 统计结果将保存到指定的eval_result_path路径\n", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "eval_result_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "compare_method", - "default": "match", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant specialized in evaluating answer correctness.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_test_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_answer_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 46, - "name": "Text2QASampleEvaluator", - "description": "该算子用于为给的的文档片段生成种子QA对打分\n\n输入参数:\n- input_question_key: Field name containing the generated question\n- input_answer_key: Field name containing the generated answer\n- output_question_quality_key: Field name containing the question quality grade\n- output_question_quality_feedback_key: Field name containing the question quality feedback\n- output_answer_alignment_key: Field name containing the answer alignment grade\n- output_answer_alignment_feedback_key: Field name containing the answer alignment feedback\n- output_answer_verifiability_key: Field name containing the answer verifiability grade\n- output_downstream_value_key: Field name containing the downstream value grade\n- output_downstream_value_feedback_key: Field name containing the downstream value feedback\n", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [ - "Text2QAQuestionQualityPrompt", - "Text2QAAnswerAlignmentPrompt", - "Text2QAAnswerVerifiabilityPrompt", - "Text2QADownstreamValuePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "generated_question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "generated_answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_quality_key", - "default": "question_quality_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_quality_feedback_key", - "default": "question_quality_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_alignment_key", - "default": "answer_alignment_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_alignment_feedback_key", - "default": "answer_alignment_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_verifiability_key", - "default": "answer_verifiability_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_answer_verifiability_feedback_key", - "default": "answer_verifiability_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_downstream_value_key", - "default": "downstream_value_grades", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_downstream_value_feedback_key", - "default": "downstream_value_feedbacks", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 47, - "name": "PromptedEvaluator", - "description": "PromptedEvaluator:使用 LLM 根据系统提示词对数据质量进行评分,并将评分写回 DataFrame(同时通过 storage 持久化)。模型应只输出分数(整数)。\n功能:对每行输入文本生成一个评分。\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口。\n- system_prompt:系统提示词(默认:'Please evaluate the quality of this data on a scale from 1 to 5.')。\n- input_key:输入文本所在列名(默认:'raw_content')。\n- output_key:评分结果写入的列名(默认:'eval')。\n输出:\n- 返回输出列名(用于后续算子引用),评分结果已写回并保存。", - "type": { - "level_1": "core_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "Please evaluate the quality of this data on a scale from 1 to 5.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "eval", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 48, - "name": "PromptedFilter", - "description": "PromptedFilter 使用内置的 PromptedEvaluator 对输入数据进行数值化打分,并根据指定的分数区间(min_score 到 max_score,闭区间)筛选出符合条件的样本。默认情况下打分范围是 1–5,但用户可以通过 system_prompt 自定义其他评分规则。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,定义评估规范(可选,默认 'Please evaluate the quality of this data on a scale from 1 to 5.')\n- input_key:待评估文本所在列名(默认 'raw_content')\n- output_key:写回打分结果的列名(默认 'eval',若已存在将被覆盖)\n- min_score:筛选的最小分(默认 5)\n- max_score:筛选的最大分(默认 5)\n\n输出参数:\n- 过滤后的 DataFrame(仅保留分数位于 [min_score, max_score] 的行)\n- 返回 output_key 以供后续算子引用\n\n备注:\n- 默认打分区间是 1–5,但可根据实际 prompt 改变。", - "type": { - "level_1": "core_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "Please evaluate the quality of this data on a scale from 1 to 5.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_score", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "eval", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 49, - "name": "KCenterGreedyFilter", - "description": "该算子用于从大量的文档片段中选取部分文档片段,用于后续生成种子QA对\n\n输入参数:\n- input_key: 包含文档片段的字段名\n- embedding_model_path: 嵌入模型路径\n- num_samples: 选取的文档片段数量\n- method: 选择方法,随机或k-center-greedy\n\n", - "type": { - "level_1": "core_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "num_samples", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "embedding_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 50, - "name": "GeneralFilter", - "description": "该算子支持通过多个自定义函数对 DataFrame 进行灵活过滤。\n\n每条过滤规则是一个函数(例如 lambda 表达式),接受一个 DataFrame 并返回一个布尔类型的 Series,用于指定保留哪些行。\n\n输入参数:\n- filter_rules:一个函数列表,每个函数形式为 lambda df: ...,需返回一个与 df 长度一致的布尔 Series。所有规则之间采用与(AND)关系组合。\n\n示例:\n - lambda df: df['score'] > 0.5\n - lambda df: df['label'].isin(['A', 'B'])", - "type": { - "level_1": "core_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "filter_rules", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 51, - "name": "PromptedRefiner", - "description": "PromptedRefiner 根据给定的 system_prompt 对指定列的文本进行改写/润色/规范化,并将结果**就地写回**同一列(覆盖原内容)。其做法是对每一行拼接 `system_prompt + raw_content` 作为模型输入,批量生成改写结果。\n\n输入参数:\n- llm_serving:LLM 服务对象,需实现 LLMServingABC 接口\n- system_prompt:系统提示词,用于描述改写目标与风格(默认 'You are a helpful agent.')\n- input_key:要改写的文本列名(默认 'raw_content'),改写后会覆盖该列\n\n输出参数:\n- 覆盖后的 DataFrame(同名列被改写后的文本)\n- 无返回值(结果已通过 DataFlowStorage 写出)\n\n备注:\n- 该算子**覆盖** input_key 列;若需保留原文,建议先拷贝到新列。\n- 期望每行在 input_key 列提供可用文本;空值将不会生成对应输入,如与行数不匹配可能导致赋值报错。", - "type": { - "level_1": "core_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful agent.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 52, - "name": "PandasOperator", - "description": "该算子支持通过多个自定义函数对 DataFrame 进行任意操作(如添加列、重命名、排序等)。\n\n每个函数(通常为 lambda 表达式)接受一个 DataFrame 并返回一个修改后的 DataFrame。\n\n输入参数:\n- process_fn:一个函数列表,每个函数形式为 lambda df: ...,必须返回一个 DataFrame。\n\n示例:\n - lambda df: df.assign(score2=df['score'] * 2)\n - lambda df: df.sort_values('score', ascending=False)", - "type": { - "level_1": "core_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "process_fn", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 53, - "name": "PromptedVQAGenerator", - "description": "该算子用于视觉问答生成,接收包含图像和问题的输入内容,使用大语言模型生成回答,并将生成的回答保存到数据框中。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant.'\n- input_key:输入内容的字段名,默认为'raw_content'\n- output_key:输出生成内容的字段名,默认为'generated_content'\n输出参数:\n- 返回输出字段名,用于后续算子引用\n- 在数据框中添加包含生成回答的新列", - "type": { - "level_1": "core_vision", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "system_prompt", - "default": "You are a helpful assistant.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 54, - "name": "DBOperator", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "db", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "expr", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 55, - "name": "ColonEndFilter", - "description": "该算子用于检查文本是否以冒号结尾,常用于判断问题是否为不完整的提问。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'{类名小写}_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 56, - "name": "SentenceNumberFilter", - "description": "该算子用于检查文本中的句子数量是否在指定范围内,使用正则表达式匹配句子结束符号(。!?.!?)进行分割。\n初始化参数:\n- min_sentences:最小句子数量阈值,默认为3\n- max_sentences:最大句子数量阈值,默认为7500\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'sentence_number_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_sentences", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_sentences", - "default": 7500, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "sentence_number_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 57, - "name": "LineEndWithEllipsisFilter", - "description": "该算子用于检测并过滤以省略号(...)或(……)结尾的文本行,常用于识别不完整的表述。\n初始化参数:\n- threshold:以省略号结尾的行数比率阈值,默认为0.3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_end_with_ellipsis_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "line_end_with_ellipsis_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 58, - "name": "ContentNullFilter", - "description": "该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'content_null_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "content_null_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 59, - "name": "SymbolWordRatioFilter", - "description": "该算子用于检查文本中特定符号(#, ..., …)与单词数量的比率是否超过阈值,过滤符号使用过多的文本。\n初始化参数:\n- threshold:符号与单词比率阈值,默认为0.4\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'symbol_word_ratio_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.4, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "symbol_word_ratio_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 60, - "name": "AlphaWordsFilter", - "description": "该算子用于验证文本中字母单词的比率是否达到阈值,支持NLTK分词或简单空格分割两种模式。\n初始化参数:\n- threshold:字母单词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'alpha_words_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "alpha_words_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 61, - "name": "HtmlEntityFilter", - "description": "该算子用于检测并过滤包含HTML实体(如&、<、>等)的文本,确保内容不包含标记语言元素。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'html_entity_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "html_entity_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 62, - "name": "IDCardFilter", - "description": "该算子用于检测并过滤包含身份证相关术语的文本,使用正则表达式匹配身份证号码模式以保护敏感信息。\n初始化参数:\n- threshold:身份证相关词汇匹配次数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'id_card_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "id_card_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 63, - "name": "NoPuncFilter", - "description": "该算子用于确保文本包含足够的标点符号,通过统计句子间最大单词数量进行过滤。\n初始化参数:\n- threshold:句子间最大单词数量阈值,默认为112\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'no_punc_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 112, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "no_punc_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 64, - "name": "SpecialCharacterFilter", - "description": "该算子用于移除包含特殊/unicode字符的文本,使用预定义模式检测非标准字符以确保文本规范性。\n初始化参数:\n- 无\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'special_character_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "special_character_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 65, - "name": "WatermarkFilter", - "description": "该算子用于检测并移除包含版权/水印内容的文本,使用指定关键词列表识别受保护内容。\n初始化参数:\n- watermarks:水印关键词列表,默认为['Copyright', 'Watermark', 'Confidential']\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'watermark_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "watermarks", - "default": [ - "Copyright", - "Watermark", - "Confidential" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "watermark_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 66, - "name": "MeanWordLengthFilter", - "description": "该算子用于检查文本中单词的平均长度是否在指定范围内,通过字符总数除以单词数量计算平均值。\n初始化参数:\n- min_length:最小平均单词长度,默认为3\n- max_length:最大平均单词长度,默认为10\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'mean_word_length_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_length", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "mean_word_length_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 67, - "name": "StopWordFilter", - "description": "该算子用于验证文本中停用词的比率是否高于阈值,使用NLTK分词器进行单词分割和停用词识别。\n初始化参数:\n- threshold:停用词比率阈值(无默认值,必须提供)\n- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'stop_word_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "stop_word_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 68, - "name": "CurlyBracketFilter", - "description": "该算子用于检测文本中是否存在过多的花括号使用,通过花括号数量与文本长度的比率进行过滤。\n初始化参数:\n- threshold:花括号比率阈值,默认为0.025\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'curly_bracket_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.025, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "curly_bracket_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 69, - "name": "CapitalWordsFilter", - "description": "该算子用于检查文本中大写单词的比率是否超过阈值,支持可选的分词器进行单词识别。\n初始化参数:\n- threshold:大写单词比率阈值,默认为0.2\n- use_tokenizer:是否使用NLTK分词器,默认为False\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'capital_words_filter'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.2, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "capital_words_filter", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 70, - "name": "LoremIpsumFilter", - "description": "该算子用于检测并过滤包含占位文本(如'lorem ipsum')的文本,使用正则表达式模式匹配并结合阈值过滤。\n初始化参数:\n- threshold:'lorem ipsum'出现次数与文本长度的比率阈值,默认为3e-8\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'loremipsum_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 3e-08, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "loremipsum_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 71, - "name": "UniqueWordsFilter", - "description": "该算子用于检查文本中唯一单词的比率是否达到阈值,通过集合操作计算唯一单词数量与总单词数量的比率。\n初始化参数:\n- threshold:最小唯一单词比率阈值,默认为0.1\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'unique_words_filter'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.1, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "unique_words_filter", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 72, - "name": "CharNumberFilter", - "description": "该算子用于验证文本在去除空白字符后的字符数量是否达到最小阈值。\n初始化参数:\n- threshold:最小字符数量阈值,默认为100\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'char_number_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "char_number_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 73, - "name": "LineStartWithBulletpointFilter", - "description": "该算子用于检测并过滤以各种项目符号符号开头的文本行,使用Unicode字符匹配结合比率阈值进行过滤。\n初始化参数:\n- threshold:以项目符号开头的行数比率阈值,默认为0.9\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_start_with_bullet_point_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 0.9, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "line_start_with_bullet_point_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 74, - "name": "LineWithJavascriptFilter", - "description": "该算子用于识别并过滤包含'javascript'引用的文本,通过关键词匹配和阈值判断进行内容过滤。\n初始化参数:\n- threshold:不包含'javascript'的最小行数阈值,默认为3\n运行参数:\n- storage:DataFlowStorage对象\n- input_key:输入文本字段名\n- output_key:输出标签字段名,默认为'line_with_javascript_filter_label'\n返回值:\n- 包含output_key的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "threshold", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "line_with_javascript_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 75, - "name": "LangkitSampleEvaluator", - "description": "使用Langkit工具包计算文本统计信息,帮助评估文本结构复杂性和可读性。提取多种语言特征,包括句子长度、词汇多样性、情感倾向等。\n\n输出参数:\n- LangkitNumSentencesScore: 句子数量\n- LangkitNumWordsScore: 单词数量\n- LangkitAvgWordLengthScore: 平均单词长度\n- LangkitFleschReadingEaseScore: 可读性评分(Flesch公式)\n- LangkitSentimentScore: 情感倾向(-1到1之间)", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 76, - "name": "LangkitFilter", - "description": "基于LangkitScorer打分器的得分对数据进行过滤。使用Langkit工具包计算11种文本统计信息,帮助评估文本结构复杂性和可读性。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含11个语言统计指标\n- max_scores:各指标的最大阈值字典,包含11个语言统计指标\n- metrics_to_keep:需要保留的评估指标列表\n输出参数:\n- 过滤后的DataFrame,仅保留所有指标都在指定范围内的文本\n- 返回包含各指标标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_scores", - "default": { - "flesch_reading_ease": 0, - "automated_readability_index": 0, - "aggregate_reading_level": 0, - "syllable_count": 32.0, - "lexicon_count": 23.0, - "sentence_count": 1.0, - "character_count": 118.0, - "letter_count": 109.0, - "polysyllable_count": 0.0, - "monosyllable_count": 13.0, - "difficult_words": 4.0 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_scores", - "default": { - "flesch_reading_ease": 100, - "automated_readability_index": 100, - "aggregate_reading_level": 100, - "syllable_count": 2331.9, - "lexicon_count": 1554.0, - "sentence_count": 89.1, - "character_count": 7466.3, - "letter_count": 7193.0, - "polysyllable_count": 216.4, - "monosyllable_count": 1044.1, - "difficult_words": 213.4 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "metrics_to_keep", - "default": [ - "flesch_reading_ease", - "automated_readability_index", - "aggregate_reading_level", - "syllable_count", - "lexicon_count", - "sentence_count", - "character_count", - "letter_count", - "polysyllable_count", - "monosyllable_count", - "difficult_words" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_keys", - "default": [ - "flesch_reading_ease", - "automated_readability_index", - "aggregate_reading_level", - "syllable_count", - "lexicon_count", - "sentence_count", - "character_count", - "letter_count", - "polysyllable_count", - "monosyllable_count", - "difficult_words" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 77, - "name": "LexicalDiversitySampleEvaluator", - "description": "使用MTLD(词汇多样性测量)和HDD(移动平均类型-标记比)方法计算文本词汇多样性。\n\n功能说明:\n- MTLD(词汇多样性测量):通过计算维持特定TTR阈值所需的单词数量来评估词汇多样性\n- HDD(移动平均类型-标记比):基于样本的词汇丰富度估计\n\n输入要求:文本长度需大于50个单词\n\n输出参数:\n- LexicalDiversityMTLDScore: MTLD多样性得分(值越高表示多样性越好)\n- LexicalDiversityHD-DScore: HDD多样性得分(值越高表示多样性越好)", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 78, - "name": "LexicalDiversityFilter", - "description": "基于LexicalDiversityScorer打分器的得分对数据进行过滤。使用MTLD(移动平均类型-令牌比)和HDD(超几何分布多样性)两种方法计算词汇多样性,高分代表更丰富的词汇使用。\n输入参数:\n- min_scores:各指标的最小阈值字典,包含'mtld'和'hdd'\n- max_scores:各指标的最大阈值字典,包含'mtld'和'hdd'\n输出参数:\n- 过滤后的DataFrame,仅保留词汇多样性在指定范围内的文本\n- 返回包含各指标标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_scores", - "default": { - "mtld": 50, - "hdd": 0.8 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_scores", - "default": { - "mtld": 99999, - "hdd": 1.0 - }, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_keys", - "default": [ - "mtld", - "hdd" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 79, - "name": "NgramSampleEvaluator", - "description": "计算文本中n-gram的重复比例,评估文本冗余度。通过比较唯一n-gram数量与总n-gram数量的比值来衡量文本原创性。\n\n初始化参数:\n- ngrams: n-gram的长度,默认为5\n\n输出参数:\n- NgramScore: n-gram重复比例得分(0到1之间,得分越高表示重复比例越低)", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "ngrams", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "NgramScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 80, - "name": "NgramFilter", - "description": "基于NgramScorer打分器的得分对数据进行过滤。计算文本中n-gram的重复比例,得分越高表示重复比例越低,文本冗余度越小。\n输入参数:\n- min_score:最小n-gram得分阈值\n- max_score:最大n-gram得分阈值\n- ngrams:n-gram的n值\n输出参数:\n- 过滤后的DataFrame,仅保留n-gram得分在指定范围内的文本\n- 返回包含n-gram得分字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.8, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "ngrams", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "NgramScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 81, - "name": "PresidioSampleEvaluator", - "description": "使用Microsoft Presidio模型识别文本中的个人身份信息(PII),返回检测到的PII实体数量。支持多种实体类型如姓名、邮箱、电话号码等,基于dslim/bert-base-NER模型实现。适用于评估文本的隐私安全风险。\n输入参数:\n- text: 待检测的文本字符串\n- lang: 语言类型,默认为'en'\n输出参数:\n- int: 检测到的PII实体数量", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PresidioScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 82, - "name": "PresidioFilter", - "description": "基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\n支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\n输入参数:\n- min_score:保留样本的最小PII数量阈值,默认为0\n- max_score:保留样本的最大PII数量阈值,默认为5\n- lang:文本语言,默认为'en'\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\n- 返回包含输出字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PresidioScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 83, - "name": "BlocklistFilter", - "description": "该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- language:语言代码,默认为'zh'\n- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n- threshold:匹配次数阈值,默认为1\n- use_tokenizer:是否使用分词器,默认为True\n- tokenizer:分词器对象,默认为None\n输出参数:\n- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "language", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "threshold", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_tokenizer", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "blocklist_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 84, - "name": "HashDeduplicateFilter", - "description": "使用多种哈希函数对文本进行精确去重,支持md5、sha256或xxh3算法。通过计算文本的哈希值识别重复数据。\n\n初始化参数:\n- hash_func: 哈希函数名称,可选'md5'、'sha256'或'xxh3',默认为'md5'\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据\n算法特点:\n- md5: 128位哈希值,平衡速度和唯一性\n- sha256: 256位哈希值,更高安全性,速度较慢\n- xxh3: 128位哈希值,最快的哈希算法", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "hash_func", - "default": "md5", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 85, - "name": "LanguageFilter", - "description": "使用FastText语言识别模型过滤数据。下载并加载预训练的FastText语言识别模型,检查文本的语言是否在允许的语言列表中。\n输入参数:\n- allowed_languages:允许的语言标签列表\n- model_cache_dir:模型缓存目录路径\n输出参数:\n- 过滤后的DataFrame,仅保留语言在允许列表中的文本\n- 返回包含语言标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "allowed_languages", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "language_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 86, - "name": "LLMLanguageFilter", - "description": "使用大语言模型识别语言并过滤数据", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "allowed_languages", - "default": [ - "en" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "language_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 87, - "name": "MinHashDeduplicateFilter", - "description": "结合MinHash与LSH(局部敏感哈希)实现高效近似去重。将文本转换为MinHash签名,使用LSH快速查找相似文本,实现大规模数据集的近似去重。\n输入参数:\n- num_perm:生成MinHash签名的排列数\n- threshold:相似度阈值,超过此阈值判定为相似文本\n- use_n_gram:是否使用n-gram分词\n- ngram:n-gram的n值\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "num_perm", - "default": 128, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "threshold", - "default": 0.9, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_n_gram", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "ngram", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 88, - "name": "NgramHashDeduplicateFilter", - "description": "结合n-gram技术与哈希算法识别相似文本,实现近似去重。将文本分割为多个n-gram片段,计算每个片段的哈希值,通过比较哈希集合的相似度来判断文本相似性。\n输入参数:\n- n_gram:将文本分割的片段数量\n- hash_func:哈希函数类型,支持'md5'、'sha256'和'xxh3'\n- diff_size:哈希集合差异阈值,小于此值判定为相似文本\n输出参数:\n- 去重后的DataFrame,仅保留唯一文本\n- 返回包含去重标签字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "n_gram", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "hash_func", - "default": "md5", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "diff_size", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 89, - "name": "PerspectiveSampleEvaluator", - "description": "使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- serving:Perspective API服务对象\n- input_key:输入文本字段名\n- output_key:输出得分字段名,默认'PerspectiveScore'\n输出参数:\n- 包含毒性评估得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerspectiveScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 90, - "name": "PerspectiveFilter", - "description": "基于PerspectiveScorer打分器的得分对数据进行过滤使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n输入参数:\n- min_score:最小毒性得分阈值\n- max_score:最大毒性得分阈值\n输出参数:\n- 过滤后的DataFrame,仅保留毒性得分在指定范围内的文本\n- 返回包含毒性得分字段名的列表", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 0.5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerspectiveScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 91, - "name": "SemDeduplicateFilter", - "description": "基于BERT语义相似度识别语义重复文本,执行近似去重操作。通过计算文本嵌入向量间的余弦相似度,识别语义相似的文本并保留唯一样本。\n支持多字段组合作为去重依据,可有效去除内容相似但表述不同的重复数据,提高数据集多样性。\n输入参数:\n- eps:相似度阈值,值越小表示允许的相似度越低,默认为0.05(即余弦相似度大于0.95视为重复)\n- model_name:预训练模型名称,默认为'sentence-transformers/all-MiniLM-L6-v2'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:模型运行设备,默认为'cuda'\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留语义不重复的样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "eps", - "default": 0.05, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "sentence-transformers/all-MiniLM-L6-v2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 92, - "name": "SimHashDeduplicateFilter", - "description": "使用SimHash算法通过汉明距离识别相似文本,执行近似去重操作。将文本转换为固定长度的指纹,通过计算指纹间的汉明距离判断文本相似度。\n相比语义去重速度更快,适合大规模数据集的快速去重预处理,尤其适用于检测字符层面相似的文本。\n输入参数:\n- fingerprint_size:指纹长度,默认为64位\n- bound:相似度阈值,值越小表示允许的相似度越低,默认为0.1(即相似度大于0.9视为重复)\n- input_keys:多个输入字段名列表,与input_key二选一\n- input_key:单个输入字段名,与input_keys二选一\n- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n输出参数:\n- 过滤后的DataFrame,仅保留相似性低于阈值的唯一样本(标记为1的样本)\n- 返回包含去重结果字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "fingerprint_size", - "default": 64, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "bound", - "default": 0.1, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 93, - "name": "WordNumberFilter", - "description": "该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n输入参数:\n- input_key:输入文本字段名,默认为'text'\n- min_words:最小单词数量阈值,默认为5\n- max_words:最大单词数量阈值,默认为100\n输出参数:\n- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_words", - "default": 20, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_words", - "default": 100000, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "word_number_filter_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 94, - "name": "HtmlEntityRefiner", - "description": "去除文本中的HTML实体,包括标准实体(如 、<)和各种变体形式(全角符号、中文分号等)。支持自定义需要移除的HTML实体列表。输入参数:\n- html_entities:需要移除的HTML实体列表,默认为包含常见实体的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含移除HTML实体后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "html_entities", - "default": [ - "nbsp", - "lt", - "gt", - "amp", - "quot", - "apos", - "hellip", - "ndash", - "mdash", - "lsquo", - "rsquo", - "ldquo", - "rdquo" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 95, - "name": "HtmlUrlRemoverRefiner", - "description": "去除文本中的URL链接和HTML标签,净化文本内容。使用正则表达式匹配并移除各种形式的URL和HTML标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含净化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 96, - "name": "LowercaseRefiner", - "description": "将文本字段中的所有大写字符转换为小写,统一文本格式。对指定字段的文本内容进行全小写处理。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含小写转换后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 97, - "name": "NERRefiner", - "description": "使用命名实体识别(NER)技术识别并屏蔽文本中的特定实体。使用spaCy的'en_core_web_sm'模型识别实体,并将其替换为对应的实体类型标签。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含实体屏蔽后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 98, - "name": "PIIAnonymizeRefiner", - "description": "使用Presidio和BERT-NER模型识别并匿名化文本中的个人身份信息(PII)。支持多种PII类型的检测和匿名化处理。输入参数:\n- lang:语言代码,默认为'en'\n- device:运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- model_name:NER模型名称,默认为'dslim/bert-base-NER'\n- input_key:输入文本字段名\n输出参数:\n- 包含匿名化后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "dslim/bert-base-NER", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 99, - "name": "ReferenceRemoverRefiner", - "description": "删除文本中未闭合的引用标签和引用链接,包括标签和{{cite}}模板的各种完整和不完整形式。净化文本中的引用标记。输入参数:\n- input_key:输入文本字段名\n输出参数:\n- 包含移除引用标记后文本的DataFrame\n- 返回输入字段名,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 100, - "name": "RemoveContractionsRefiner", - "description": "该算子用于扩展文本中的英语缩写词,将缩写形式转换为完整形式(例如将\"can't\"扩展为\"cannot\")。\n使用contractions库进行缩写词扩展,提高文本标准化程度。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含扩展缩写词后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 101, - "name": "RemoveEmojiRefiner", - "description": "该算子用于去除文本中的Unicode图像表情符号,包括表情符号、杂项符号、交通符号、旗帜等各类图像符号。\n通过正则表达式匹配Unicode表情符号范围,实现高效过滤。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除表情符号的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 102, - "name": "RemoveEmoticonsRefiner", - "description": "该算子用于移除文本中的文本型表情符号,例如':-)'、':D'、':('等字符组合表情。\n基于预定义的表情符号字典进行匹配替换,支持多种常见文本表情模式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除文本表情的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 103, - "name": "RemoveExtraSpacesRefiner", - "description": "该算子用于移除文本中的多余空格,将连续的多个空格替换为单个空格,并去除文本前后的空白字符。\n通过字符串分割和连接实现空格标准化,提高文本格式一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化空格的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 104, - "name": "RemoveImageRefsRefiner", - "description": "该算子用于去除文本中的图片引用格式,包括Markdown图片链接、图片编号、特殊符号组合等图像引用模式。\n通过多模式正则表达式匹配,识别并移除多种图片引用格式。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除图片引用的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 105, - "name": "RemoveNumberRefiner", - "description": "该算子用于移除文本中的数字字符,包括0-9的阿拉伯数字。\n通过字符过滤实现数字移除,保留纯文本内容。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除数字的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 106, - "name": "RemovePunctuationRefiner", - "description": "该算子用于移除文本中的标点符号,包括英文标点符号集合中的所有符号。\n使用string.punctuation定义的标点集合进行过滤,实现文本去标点处理。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 107, - "name": "RemoveRepetitionsPunctuationRefiner", - "description": "该算子用于移除文本中重复的标点符号,例如将\"!!!\"变为\"!\",\",,\"变为\",\"。\n通过正则表达式匹配连续重复的标点符号,替换为单个符号。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含标准化标点的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 108, - "name": "RemoveStopwordsRefiner", - "description": "该算子用于移除文本中的英语停用词(如\"the\",\"is\",\"in\"等无实际意义的高频词汇)。\n使用NLTK库的stopwords语料库进行停用词过滤,提高文本特征密度。\n输入参数:\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含去除停用词的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 109, - "name": "SpellingCorrectionRefiner", - "description": "该算子用于通过SymSpell算法对文本中的拼写错误进行纠正,支持自定义编辑距离和词典路径。\n若本地词典不存在则自动下载,使用近似字符串匹配实现拼写纠错功能。\n输入参数:\n- max_edit_distance:最大编辑距离,默认为2\n- prefix_length:前缀长度,默认为7\n- dictionary_path:词典路径,默认为'frequency_dictionary_en_82_765.txt'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含纠正拼写错误的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "max_edit_distance", - "default": 2, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prefix_length", - "default": 7, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dictionary_path", - "default": "frequency_dictionary_en_82_765.txt", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 110, - "name": "StemmingLemmatizationRefiner", - "description": "该算子用于对文本进行词干提取或词形还原处理,将词语转换为其基本形式。\n支持两种处理方式:Porter词干提取(stemming)和WordNet词形还原(lemmatization),可通过参数选择。\n输入参数:\n- method:处理方法,可选'stemming'或'lemmatization',默认为'stemming'\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含词干/词形还原后的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "method", - "default": "stemming", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 111, - "name": "TextNormalizationRefiner", - "description": "该算子用于规范化文本中的日期格式和货币格式,统一为标准表示形式。\n日期格式统一转换为'YYYY-MM-DD'形式,货币格式转换为'金额 USD'形式,提高数据一致性。\n输入参数:\n- 无初始化参数\n运行参数:\n- input_key:输入文本字段名\n输出参数:\n- 处理后的DataFrame,包含格式规范化的文本\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "general_text", - "level_2": "refine" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 112, - "name": "BertSampleEvaluator", - "description": "使用BERTScore评估生成文本与参考文本的相似度,基于上下文嵌入计算P/R/F1分数。\n输入参数:\n- lang:语言类型,默认为'en'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BertScore'\n输出参数:\n- 包含F1相似度得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "BertScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 113, - "name": "BleuSampleEvaluator", - "description": "计算BLEU分数评估生成文本与参考文本的n-gram重叠度,支持1-4元语法分析。\n输入参数:\n- n:最大n-gram长度,默认为4\n- eff:参考长度计算方式,可选'shortest'/'average'/'longest',默认为'average'\n- special_reflen:特殊参考长度,默认为None\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'BleuScore'\n输出参数:\n- 包含BLEU得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "n", - "default": 4, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "eff", - "default": "average", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "special_reflen", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "BleuScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 114, - "name": "CiderSampleEvaluator", - "description": "使用CIDEr指标评估生成文本与参考文本的相似度,基于TF-IDF加权的n-gram重叠度。\n输入参数:\n- n:最大n-gram长度,默认为4\n- sigma:高斯惩罚参数,默认为6.0\n- df_mode:文档频率模式,默认为'coco-val-df'\n- idf_path:IDF文件路径,默认为预训练COCO数据集IDF\n- input_key:生成文本字段名\n- reference_key:参考文本字段名\n- output_key:输出得分字段名,默认为'CiderScore'\n输出参数:\n- 包含CIDEr得分的DataFrame", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "n", - "default": 4, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "sigma", - "default": 6.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "df_mode", - "default": "coco-val-df", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "idf_path", - "default": "./dataflow/operators/general_pt/eval/cider/coco-val-df.p", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "CiderScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 115, - "name": "Task2VecDatasetEvaluator", - "description": "使用Task2Vec方法评估数据集的多样性,通过计算样本嵌入的余弦距离矩阵来量化多样性。\n输入参数:\n- device:计算设备,默认为'cuda'\n- sample_nums:采样次数,默认为10\n- sample_size:每次采样样本数,默认为1\n- method:嵌入方法,可选'montecarlo'或'variational',默认为'montecarlo'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n输出参数:\n- Task2VecDiversityScore:多样性得分\n- ConfidenceInterval:置信区间", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "sample_nums", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "sample_size", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "method", - "default": "montecarlo", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 116, - "name": "VendiDatasetEvaluator", - "description": "通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。\n输入参数:\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n输出参数:\n- BERTVendiScore:基于BERT的多样性得分\n- SimCSEVendiScore:基于SimCSE的多样性得分", - "type": { - "level_1": "general_text", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 117, - "name": "KBCChunkGenerator", - "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "chunk_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "chunk_overlap", - "default": 50, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "split_method", - "default": "token", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_tokens_per_chunk", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "tokenizer_name", - "default": "bert-base-uncased", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "raw_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 118, - "name": "KBCChunkGeneratorBatch", - "description": "('CorpusTextSplitter是轻量级文本分割工具,', '支持词/句/语义/递归分块,', '可配置块大小、重叠和最小块长度')", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "chunk_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "chunk_overlap", - "default": 50, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "split_method", - "default": "token", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "min_tokens_per_chunk", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "tokenizer_name", - "default": "bert-base-uncased", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "text_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 119, - "name": "FileOrURLToMarkdownConverter", - "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "url", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "raw_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "intermediate_dir", - "default": "intermediate", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "mineru_backend", - "default": "vlm-sglang-engine", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 120, - "name": "FileOrURLToMarkdownConverterBatch", - "description": "知识提取算子:支持从多种文件格式中提取结构化内容并转换为标准Markdown\n核心功能:\n1. PDF文件:使用MinerU解析引擎提取文本/表格/公式,保留原始布局\n2. Office文档(DOC/PPT等):通过DocConverter转换为Markdown格式\n3. 网页内容(HTML/XML):使用trafilatura提取正文并转为Markdown\n4. 纯文本(TXT/MD):直接透传不做处理\n特殊处理:\n- 自动识别中英文文档(lang参数)\n- 支持本地文件路径和URL输入\n- 生成中间文件到指定目录(intermediate_dir)", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "intermediate_dir", - "default": "intermediate", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "mineru_backend", - "default": "vlm-sglang-engine", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "source", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "text_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 121, - "name": "KBCTextCleaner", - "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改\n\n输入格式示例:\n
\n

标题文本

\n

正文段落,包括特殊符号,例如“弯引号”、–破折号等

\n \"示意图\"\n 链接文本\n
代码片段
\n ...\n
\n\n输出格式示例:\n标题文本\n\n正文段落,包括特殊符号,例如\"直引号\"、-破折号等\n\n[Image: 示例图 example.jpg]\n\n链接文本\n\n代码片段\n\n[结构保持,语义保留,敏感信息脱敏处理(如手机号、保密标记等)]", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [ - "KnowledgeCleanerPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "cleaned_chunk", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 122, - "name": "KBCTextCleanerBatch", - "description": "知识清洗算子:对原始知识内容进行标准化处理,包括HTML标签清理、特殊字符规范化、链接处理和结构优化,提升RAG知识库的质量。主要功能:\n1. 移除冗余HTML标签但保留语义化标签\n2. 标准化引号/破折号等特殊字符\n3. 处理超链接同时保留文本\n4. 保持原始段落结构和代码缩进\n5. 确保事实性内容零修改", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [ - "KnowledgeCleanerPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "cleaned_chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 123, - "name": "KBCMultiHopQAGeneratorBatch", - "description": "('MultiHopQAGenerator 是多跳问答对生成处理器,支持从文本中自动生成需要多步推理的问题与答案。', '处理流程包括:文本预处理、信息抽取、问题生成与回答生成,支持自定义语言模型后端和参数。', '输出格式如下:', '输入:\\ntext: <原始上下文文本>', '输出:\\n{\\n \"text\": <处理后的文本字符串>,\\n \"qa_pairs\": [\\n {\\n \"question\": <字符串:生成的问题>,\\n \"reasoning_steps\": [\\n {\"step\": <推理过程的步骤 1>},\\n {\"step\": <步骤 2>} ...\\n ],\\n \"answer\": <字符串:最终答案>,\\n \"supporting_facts\": [<支持该答案的事实 1>, <事实 2>, ...],\\n \"type\": <可选:问题类型,如“生物学”、“历史”等>\\n },\\n ...\\n ],\\n \"metadata\": {\\n \"source\": <数据来源>,\\n \"timestamp\": <时间戳字符串>,\\n \"complexity\": <整数:问题复杂度标记>\\n }\\n}')", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2MultiHopQAGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "seed", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "enhanced_chunk_path", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 124, - "name": "QAExtractor", - "description": "QA对提取器 - 将嵌套的QA_pairs转换为Alpaca微调格式\n\n核心功能:\n从结构化的QA对数据中提取问答内容,自动整合推理步骤和支持事实,\n输出符合Stanford Alpaca标准的instruction-input-output格式。\n\n初始化参数:\n• qa_key: QA对的字段名 (默认: 'QA_pairs')\n• output_json_file: 输出JSON文件路径 (可选,不指定则只更新DataFrame)\n• instruction: 统一的指令前缀 (默认: 'Please answer the following question...')\n\n运行参数 (input_key):\n• None - 包含所有字段 (question + reasoning_steps + supporting_facts)\n• '' - 空字符串,不包含额外上下文\n• 'reasoning_steps' - 只包含推理步骤\n• 'question,reasoning_steps' - 逗号分隔多个字段\n• ['question', 'supporting_facts'] - 列表格式\n\n输出字段:\n• instruction: 问题指令\n• input: 上下文信息 (根据input_key动态拼接)\n• output: 答案\n\n适用场景: 知识库QA微调、领域问答模型训练", - "type": { - "level_1": "knowledge_cleaning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "qa_key", - "default": "QA_pairs", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_json_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "instruction", - "default": "Please answer the following question based on the provided information.", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 125, - "name": "ReasoningAnswerGenerator", - "description": "该算子用于为给定问题生成答案,调用大语言模型进行推理。\n输入参数:\n- llm_serving:LLM服务实例,用于生成答案\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- output_key:生成的答案字段,默认'generated_cot'", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathAnswerGeneratorPrompt", - "GeneralAnswerGeneratorPrompt", - "DiyAnswerGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 126, - "name": "ReasoningQuestionGenerator", - "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathQuestionSynthesisPrompt", - "GeneralQuestionSynthesisPrompt", - "DiyQuestionSynthesisPrompt" - ], - "parameter": { - "init": [ - { - "name": "num_prompts", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_synth_or_input_flag", - "default": "Synth_or_Input", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 127, - "name": "ReasoningAnswerExtractionQwenMathEvalGenerator", - "description": "该算子用于从数学问题回答中提取规范化答案表达式,进行字符串清洗、单位处理和格式标准化。\n\n输入参数:\n- input_key:输入数据字段名\n- answer_key:原始答案字段名\n- output_key:处理后的答案字段名\n- unit_texts:需要过滤的单位文本列表\n\n输出参数:\n- output_key:标准化后的数学表达式字段", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "dataset_name", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "pseudo_correct_solution_example", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "extraction", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 128, - "name": "ReasoningPseudoAnswerGenerator", - "description": "该算子生成多个候选答案并通过统计选择最优解,实现伪答案生成。\n\n输入参数:\n- input_file:输入文件路径\n- output_file:输出文件路径\n- max_times:最大生成次数\n- selection_mode:统计选择模式(frequency/consistency)\n\n输出参数:\n- final_answer:最终选择答案字段\n- candidate_answers:候选答案列表字段", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathAnswerGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_times", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_answer", - "default": "pseudo_answers", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_answer_value", - "default": "pseudo_answer_value", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_solutions", - "default": "pseudo_solutions", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key_correct_solution_example", - "default": "pseudo_correct_solution_example", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 129, - "name": "ReasoningPretrainFormatConvertGenerator", - "description": "该算子用于将SFT格式数据转换为预训练格式。\n\n输入参数:\n- read_key_question:问题字段名\n- read_key_answer:答案字段名\n- output_key:输出文本字段名\n\n输出参数:\n- output_key:输出文本字段名,包含问题和答案的拼接结果\n- 输出文件:转换后的预训练格式数据文件路径", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_read_key_question", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_read_key_answer", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "text", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 130, - "name": "ReasoningQuestionFusionGenerator", - "description": "该算子用于基于现有问题生成新问题。\n输入参数:\n- num_prompts:生成问题的数量,整数,范围1-5(含),默认1\n- llm_serving:LLM服务实例,用于生成问题\n- prompt_template:提示模板对象,用于构建生成提示词\n输出参数:\n- 原始输入列(由input_key指定):新增生成的问题\n- Synth_or_Input:标识问题来源,'input'表示原始问题,'synth'表示生成的新问题", - "type": { - "level_1": "reasoning", - "level_2": "generate" - }, - "allowed_prompts": [ - "MathQuestionParallelFusionGeneratorPrompt", - "MathQuestionSequentialFusionGeneratorPrompt", - "MathQuestionConditionFusionGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "num_prompts", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_problem_1", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_problem_2", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 131, - "name": "ReasoningCategoryDatasetEvaluator", - "description": "该算子用于统计数据集中的类别信息,包括主类别和次类别的分布情况。它计算每个类别的样本数量,并返回类别分布的统计结果。\n输入参数:\n- input_primary_category_key:主类别字段名,默认为'primary_category'\n- input_secondary_category_key:次类别字段名,默认为'secondary_category'\n输出参数:\n- 返回包含类别统计信息的字典,主类别作为键,值为包含该类别样本数量和次类别分布的字典", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_primary_category_key", - "default": "primary_category", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_secondary_category_key", - "default": "secondary_category", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 132, - "name": "ReasoningDifficultyDatasetEvaluator", - "description": "该算子用于统计数据集中的难度信息,计算不同难度级别的样本数量分布。它统计每个难度级别的样本数量,并返回难度分布的统计结果。\n输入参数:\n- input_diffulty_key:难度分数字段名,默认为'difficulty_score'\n输出参数:\n- 返回包含难度统计信息的字典,难度级别作为键,值为该难度级别的样本数量", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_diffulty_key", - "default": "difficulty_score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 133, - "name": "ReasoningTokenDatasetEvaluator", - "description": "该算子用于统计数据集中问题和回答的token信息,包括token数量的最小值、最大值、平均值和中位数等统计指标。它使用指定的tokenizer对文本进行编码,并计算token长度的分布情况。\n输入参数:\n- input_question_key:问题文本字段名\n- input_answer_key:回答文本字段名\n- model_name_or_path:tokenizer模型名称或路径\n输出参数:\n- 返回包含token统计信息的字典,包括问题和回答的token数量的零值计数、最小值、最大值、平均值和中位数", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_name_or_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 134, - "name": "ReasoningQuestionCategorySampleEvaluator", - "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [ - "MathQuestionCategoryPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "question_category", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 135, - "name": "ReasoningQuestionDifficultySampleEvaluator", - "description": "该算子用于评估问题的难度等级。通过大语言模型分析问题复杂度,输出1-10级的难度评分。\n\n输入参数:\n- eval_stage:评估阶段标识\n- read_min/max_score:分数过滤阈值\n- 其他参数同ReasoningCategoryDatasetEvaluator\n\n输出参数:\n- difficulty_score:数值型难度评分(1-10)", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [ - "MathQuestionDifficultyPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "difficulty_score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 136, - "name": "ReasoningQuestionSolvableSampleEvaluator", - "description": "该算子用于对用户问题进行多级分类(主分类和子分类)。通过大语言模型对输入问题进行语义分析,输出分类编码结果。\n\n输入参数:\n- db_port/db_name/table_name:数据库连接参数(存储模式)\n- input_file/output_file:文件路径(文件模式)\n- input_key:输入数据中问题字段的键名\n- generator_type:模型调用方式(aisuite/request)\n\n输出参数:\n- classification_result:包含主分类和子分类的编码结果", - "type": { - "level_1": "reasoning", - "level_2": "eval" - }, - "allowed_prompts": [ - "MathQuestionEvaluatorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 137, - "name": "ReasoningAnswerFormatterFilter", - "description": "该算子用于检查答案格式是否符合规范,主要验证数学答案是否包含正确的\\boxed{}标记。\n\n输入参数:\n- input_key:输入字段名\n- result_key:结果字段名\n\n输出参数:\n- 通过格式检查返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 138, - "name": "ReasoningAnswerGroundTruthFilter", - "description": "该算子用于对比预测答案与标准答案的匹配度,支持精确匹配和数学验证两种方式。\n\n输入参数:\n- input_test_answer_key:预测答案字段名\n- input_gt_answer_key:标准答案字段名\n- compare_method:比较方法(exact/math_verify)\n\n输出参数:\n- 匹配成功返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "compare_method", - "default": "math_verify", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_test_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_answer_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 139, - "name": "ReasoningAnswerNgramFilter", - "description": "该算子基于n-gram重复率过滤答案,检测回答中的重复模式。\n\n输入参数:\n- min_score:最小可接受分数\n- max_score:最大可接受分数\n- ngrams:n-gram大小\n\n输出参数:\n- 分数在范围内返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "ngrams", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 140, - "name": "ReasoningAnswerPipelineRootFilter", - "description": "答案处理流程根节点,负责将输入数据根据有无真实标签GT分发到不同处理分支。\n\n输入参数:\n- input_file:输入文件路径\n- output_dir:输出目录路径\n- branch_config:分支配置参数\n- parallel_workers:并行工作线程数\n\n输出参数:\n- 多个输出文件路径(根据分支配置生成)", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_gt_key", - "default": "golden_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 141, - "name": "ReasoningAnswerTokenLengthFilter", - "description": "该算子根据token数量过滤过长的答案。\n\n输入参数:\n- max_answer_token_length:最大token数\n- tokenizer_dir:分词器路径\n- read_min/max_score:分数范围\n\n输出参数:\n- 长度合规返回1,否则返回0", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "max_answer_token_length", - "default": 8192, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "tokenizer_dir", - "default": "Qwen/Qwen2.5-0.5B-Instruct", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "generated_cot", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 142, - "name": "ReasoningQuestionFilter", - "description": "该算子用于对问题进行正确性检查,包括格式是否规范、语义是否合理、条件是否矛盾以及是否具备充分信息可解。调用大语言模型依次执行四阶段判断,最终返回每个问题是否合格的二分类结果(保留合格样本)。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建检查提示词\n- input_key:输入问题字段名,默认为'math_problem'\n输出参数:\n- 过滤后的DataFrame,仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [ - "MathQuestionFilterPrompt", - "GeneralQuestionFilterPrompt", - "DiyQuestionFilterPrompt" - ], - "parameter": { - "init": [ - { - "name": "system_prompt", - "default": "You are a helpful assistant.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "math_problem", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 143, - "name": "ReasoningAnswerModelJudgeFilter", - "description": "该算子用于对答案进行正确性评判,通过比较当前答案与参考答案的语义一致性,判断答案是否正确。调用大语言模型进行语义理解和判断,最终返回每个答案是否正确的二分类结果。\n输入参数:\n- system_prompt:系统提示词,用于定义模型行为\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- prompt_template:提示模板对象,用于构建评判提示词\n- keep_all_samples:是否保留所有样本,默认为False(仅保留正确答案)\n- question_key:问题字段名,默认为'question'\n- answer_key:当前答案字段名,默认为'answer'\n- reference_key:参考答案字段名,默认为'reference_answer'\n输出参数:\n- DataFrame,包含原始数据和判断结果(answer_match_result字段)\n- 如果keep_all_samples为False,则仅保留判断结果为True的行\n- 返回包含输入字段名的列表,用于后续算子引用", - "type": { - "level_1": "reasoning", - "level_2": "filter" - }, - "allowed_prompts": [ - "AnswerJudgePromptQuestion", - "AnswerJudgePrompt" - ], - "parameter": { - "init": [ - { - "name": "system_prompt", - "default": "You are a helpful assistant specialized in evaluating answer correctness.", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": "", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "keep_all_samples", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_answer_key", - "default": "answer", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_reference_key", - "default": "reference_answer", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 144, - "name": "SQLConsistencyFilter", - "description": "对条目进行过滤,检测SQL和自然语言问题是否对应,即判断SQL是否能解决该问题。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n- input_question_key: 输入问题列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "filter" - }, - "allowed_prompts": [ - "SQLConsistencyFilterPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 145, - "name": "SQLExecutionFilter", - "description": "对条目进行过滤,在数据库中执行SQL,筛选掉不可执行的条目。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 输入数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 146, - "name": "SQLGenerator", - "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "SelectSQLGeneratorPrompt", - "SelectVecSQLGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "generate_num", - "default": 300, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 147, - "name": "SQLByColumnGenerator", - "description": "基于数据库信息,合成SQL,覆盖不同的难度、数据库Schema、函数和风格。\n\n输出参数:\n- output_sql_key: 输出SQL列名\n- output_db_id_key: 数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "SelectSQLGeneratorPrompt", - "SelectVecSQLGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "generate_num", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 148, - "name": "SQLVariationGenerator", - "description": "对于每个条目,基于已有的SQL,指导模型生成SQL的变种,即在原有SQL的基础上,进行数据替换、函数变换、难度变换等操作,生成更加丰富的SQL。\n\n输入参数:\n- input_sql_key: SQL列名\n- input_db_id_key: 数据库ID列名\n\n", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "SQLVariationGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_variations", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 149, - "name": "Text2SQLCoTGenerator", - "description": "对于每个条目,生成从自然语言问题和数据库Schema到SQL的CoT长链路推理过程。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_question_key: 输入问题列名\n- input_db_id_key: 输入数据库ID列名\n\n输出参数:\n- output_cot_key: 输出CoT列名", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2SQLCotGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_evidence_key", - "default": "evidence", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_cot_key", - "default": "cot_reasoning", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 150, - "name": "Text2SQLPromptGenerator", - "description": "从数据库提取Schema信息,结合自然语言问题生成提示词。其中提示词模版支持自定义。\n\n输入参数:\n- input_question_key: 问题列名\n- input_db_id_key: 数据库ID列名\n- output_prompt_key: 输出prompt列名\n\n输出参数:\n- output_prompt_key: 生成的prompt", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2SQLPromptGeneratorPrompt", - "Text2VecSQLPromptGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_evidence_key", - "default": "evidence", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_prompt_key", - "default": "prompt", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 151, - "name": "Text2SQLQuestionGenerator", - "description": "对于每个条目,如果自然语言问题为空,生成SQL对应的自然语言问题。为保证正确,生成多个候选问题,并选择最优的。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n- input_db_id_key: 数据库ID列名\n\n输出参数:\n- output_question_key: 输出问题列名", - "type": { - "level_1": "text2sql", - "level_2": "generate" - }, - "allowed_prompts": [ - "Text2SQLQuestionGeneratorPrompt", - "Text2VecSQLQuestionGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "embedding_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "question_candidates_num", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "prompt_template", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "sql", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_question_key", - "default": "question", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_evidence_key", - "default": "evidence", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 152, - "name": "SQLComponentClassifier", - "description": "根据SQL的组件数量和复杂度,评估SQL的难度。\n\n输入参数:\n- input_sql_key: 输入SQL列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", - "type": { - "level_1": "text2sql", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "difficulty_thresholds", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "difficulty_labels", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_difficulty_key", - "default": "sql_component_difficulty", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 153, - "name": "SQLExecutionClassifier", - "description": "让模型根据自然语言问题、数据库Schema和提示词,多次生成SQL,通过生成SQL的准确率,评估该问题对于模型的难度。\n\n输入参数:\n- input_db_id_key: 输入数据库ID列名\n- input_sql_key: 输入SQL列名\n- input_prompt_key: 输入prompt列名\n\n输出参数:\n- output_difficulty_key: 输出难度列名", - "type": { - "level_1": "text2sql", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "database_manager", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_generations", - "default": 10, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "difficulty_thresholds", - "default": [ - 2, - 5, - 9 - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "difficulty_labels", - "default": [ - "extra", - "hard", - "medium", - "easy" - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_db_id_key", - "default": "db_id", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_sql_key", - "default": "SQL", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_prompt_key", - "default": "rl_prompt", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_difficulty_key", - "default": "sql_execution_difficulty", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 154, - "name": "CCNetDeduplicateFilter", - "description": "CCNet去重方法,基于SHA-1哈希算法的前N位进行重复识别,实现精确去重。\n\n初始化参数:\n- bit_length: 哈希值的位数,默认为64位\n\n运行参数:\n- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "bit_length", - "default": 64, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_keys", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "minhash_deduplicated_label", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 155, - "name": "DebertaV3SampleEvaluator", - "description": "基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量并返回分类结果。\n输入参数:\n- model_name:预训练模型名称\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- device:计算设备,默认为'cuda'\n- input_key:输入文本字段名\n- output_key:输出分类结果字段名,默认为'Debertav3Score'\n输出参数:\n- 包含文本质量分类结果的DataFrame", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_name", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "Debertav3Score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 156, - "name": "DebertaV3Filter", - "description": "基于DebertaV3Scorer打分器的得分对数据进行过滤。使用Nvidia Deberta V3模型的质量分类器评估文本质量。\n\n初始化参数:\n- allowed_scores: 允许通过的分数列表,默认为['Medium', 'High']\n- model_name: 模型名称,默认为'nvidia/quality-classifier-deberta'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n- batch_size: 批处理大小,默认为16\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'Debertav3Score'\n\n过滤逻辑:保留分类结果在allowed_scores列表中的数据", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "allowed_scores", - "default": [ - "Medium", - "High" - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "nvidia/quality-classifier-deberta", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "batch_size", - "default": 16, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "Debertav3Score", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 157, - "name": "FineWebEduSampleEvaluator", - "description": "基于Fineweb-Edu分类器评估文本的教育价值。该分类器使用预训练的序列分类模型对文本进行评估,返回0-1之间的分数,分数越高表示文本的教育价值越高。适用于筛选具有教育意义的文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 0-1之间的教育价值分数,越高表示教育价值越大", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "FinewebEduScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 158, - "name": "FineWebEduFilter", - "description": "基于FineWebEduScorer打分器的得分对数据进行过滤。Fineweb-Edu是一个用于评估文本教育价值的分类器。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- device: 运行设备,默认为'cuda'\n\n运行参数:\n- input_key: 输入文本字段名\n- output_key: 输出分数字段名,默认为'FinewebEduScore'\n\n评分标准:0-5分,分数越高表示文本具有越高的教育价值\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 2.5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10000, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "FinewebEduScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 159, - "name": "PairQualSampleEvaluator", - "description": "基于BGE模型和GPT成对比较数据训练的文本质量评分器,支持中英文输入。通过对文本进行单样本评估,返回0-1之间的质量分数,分数越高表示文本质量越好。模型分为英文版本(zks2856/PairQual-Scorer-en)和中文版本(zks2856/PairQual-Scorer-zh)。\n输入参数:\n- text: 待评估的文本字符串\n- lang: 语言类型,可选'en'或'zh'\n输出参数:\n- float: 0-1之间的质量分数,越高表示质量越好", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PairQualScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 160, - "name": "PairQualFilter", - "description": "基于PairQualScorer打分器的得分对数据进行过滤。基于BGE模型,使用GPT对文本成对比较打分后训练而成的双语文本质量评分器,得分越高表示质量越高。\n输入参数:\n- min_score:最小质量得分阈值\n- max_score:最大质量得分阈值\n- model_cache_dir:模型缓存目录路径\n- lang:文本语言类型\n输出参数:\n- 过滤后的DataFrame,仅保留质量得分在指定范围内的文本\n- 返回包含质量得分字段名的列表", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10000, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "lang", - "default": "en", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PairQualScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 161, - "name": "PerplexitySampleEvaluator", - "description": "基于Huggingface语言模型计算文本的困惑度(Perplexity),困惑度越低表示文本的流畅性和可理解性越高。输入参数:\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- float: 困惑度值,越低表示文本流畅性越好", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_name", - "default": "gpt2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 162, - "name": "PerplexityFilter", - "description": "基于PerplexityScorer打分器的得分对数据进行过滤。基于Huggingface模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。\n输入参数:\n- min_score:最小困惑度阈值\n- max_score:最大困惑度阈值\n- model_name:Huggingface模型路径或名称\n- device:模型运行设备\n输出参数:\n- 过滤后的DataFrame,仅保留困惑度在指定范围内的文本\n- 返回包含困惑度得分字段名的列表", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 10.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 500.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_name", - "default": "gpt2", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "PerplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 163, - "name": "QuratingSampleEvaluator", - "description": "通过Qurating模型(princeton-nlp/QuRater-1.3B)从四个维度评估文本质量:写作风格(writing_style)、所需专业程度(required_expertise)、事实与趣闻(facts_and_trivia)和教育价值(educational_value)。每个维度返回0-1之间的分数,综合评估文本的整体质量。\n输入参数:\n- text: 待评估的文本字符串\n- labels: 评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n输出参数:\n- dict: 包含各维度分数的字典,键为维度名称,值为0-1之间的分数", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "map_batch_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_workers", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device_batch_size", - "default": 16, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "labels", - "default": [ - "writing_style", - "required_expertise", - "facts_and_trivia", - "educational_value" - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 164, - "name": "QuratingFilter", - "description": "基于QuratingScorer打分器的得分对数据进行过滤。通过Qurating模型从四个维度评估文本质量:写作风格、所需专业知识、事实与 trivia 内容、教育价值。\n每个维度评分范围为0-9分,综合判断文本质量,可用于筛选高质量教育类或知识类内容。\n输入参数:\n- min_scores:各维度保留样本的最小分数阈值,默认为{'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n- max_scores:各维度保留样本的最大分数阈值,默认为{'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n- map_batch_size:映射批次大小,默认为512\n- num_workers:数据加载工作进程数,默认为1\n- device_batch_size:设备批次大小,默认为16\n- device:模型运行设备,默认为'cuda'\n- labels:评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n输出参数:\n- 过滤后的DataFrame,仅保留所有维度分数均在对应阈值范围内的样本\n- 返回包含各维度过滤结果字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_scores", - "default": { - "writing_style": 0, - "required_expertise": 0, - "facts_and_trivia": 0, - "educational_value": 0 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_scores", - "default": { - "writing_style": 9, - "required_expertise": 9, - "facts_and_trivia": 9, - "educational_value": 9 - }, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "map_batch_size", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_workers", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device_batch_size", - "default": 16, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "labels", - "default": [ - "writing_style", - "required_expertise", - "facts_and_trivia", - "educational_value" - ], - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 165, - "name": "TextbookSampleEvaluator", - "description": "基于FastText分类器(kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2)评估文本的教育价值,将文本分为低(Low)、中(Mid)、高(High)三个等级,并映射为1.0、3.0、5.0的分数。适用于筛选适合作为教材的高质量文本内容。\n输入参数:\n- text: 待评估的文本字符串\n输出参数:\n- float: 教育价值分数,可能值为1.0(低)、3.0(中)、5.0(高)", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TextbookScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 166, - "name": "TextbookFilter", - "description": "基于TextbookScorer打分器的得分对数据进行过滤。使用FastText分类器评估文本的教育价值,判断文本是否适合作为教材内容。\n分类器经过训练可识别具有教育意义、结构清晰、知识准确的文本,适用于构建教育类数据集。\n输入参数:\n- min_score:保留样本的最小教育价值分数阈值,默认为0.99\n- max_score:保留样本的最大教育价值分数阈值,默认为1.0\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_key:输入文本字段名\n- output_key:教育价值分数字段名,默认为'TextbookScore'\n输出参数:\n- 过滤后的DataFrame,仅保留教育价值分数在[min_score, max_score]范围内的样本\n- 返回包含教育价值分数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_pt", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.99, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TextbookScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 167, - "name": "Phi4QAGenerator", - "description": "基于给定文档内容,生成预训练格式的多轮对话问答数据。将原始文档内容转换为适合语言模型预训练的对话格式数据。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入文档内容字段名,默认为'raw_content'\n- output_key:输出生成内容字段名,默认为'generated_content'\n输出参数:\n- 包含原始内容和生成内容的DataFrame\n- 返回输出字段名,用于后续算子引用", - "type": { - "level_1": "text_pt", - "level_2": "generate" - }, - "allowed_prompts": [ - "Phi4QAGeneratorPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "generated_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 168, - "name": "MetaSampleEvaluator", - "description": "通过LLM评估文本的多个元属性,包括文本结构、多样性与复杂性、流畅性与可理解性、安全性、教育价值以及内容准确性与有效性。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimensions:评估维度列表,每个维度对应的字典中包含dimension_name,description,和示例字段:\n * dimension_name:维度名称\n * description:维度的描述\n * example_list:包含示例文本和得分的列表\n- input_key:输入文本字段名\n输出参数:\n- 包含6个评估维度得分的DataFrame,列名为:Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness", - "type": { - "level_1": "text_pt", - "level_2": "eval" - }, - "allowed_prompts": [ - "MetaPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dimensions", - "default": [ - { - "dimension_name": "Text Structure", - "description": "Evaluate the surface-level quality of the text, including spelling accuracy, grammar, vocabulary richness, and sentence structure.", - "example_list": [ - { - "text": "The experimental procedure was meticulously documented, with each variable clearly defined.", - "score": "5" - }, - { - "text": "teh data was wrong and we dont no why it happen like that", - "score": "2" - } - ] - }, - { - "dimension_name": "Diversity and Complexity", - "description": "Assess how rich and conceptually varied the content is, and whether it requires expert or deep reasoning to understand.", - "example_list": [ - { - "text": "This article compares Bayesian inference and frequentist approaches in statistical modeling, highlighting theoretical and practical trade-offs.", - "score": "5" - }, - { - "text": "Dogs are pets. They bark. They are friendly.", - "score": "2" - } - ] - }, - { - "dimension_name": "Fluency and Understandability", - "description": "Evaluate whether the text flows naturally, is easy to follow, and avoids awkward or disjointed phrasing.", - "example_list": [ - { - "text": "Despite initial challenges, the team successfully completed the deployment by adhering to a revised strategy.", - "score": "5" - }, - { - "text": "The problem was and then fixed by something happens deployment successful maybe.", - "score": "2" - } - ] - }, - { - "dimension_name": "Safety", - "description": "Identify whether the text contains profanities, hate speech, or excessive personally identifiable information (PII).", - "example_list": [ - { - "text": "The software collects anonymous usage data to improve performance.", - "score": "5" - }, - { - "text": "You idiot, your address 123 Main St will be posted online.", - "score": "1" - } - ] - }, - { - "dimension_name": "Educational Value", - "description": "Determine whether the text provides insight, stimulates thinking, or offers meaningful learning potential.", - "example_list": [ - { - "text": "Understanding the principles of thermodynamics allows engineers to design more efficient engines.", - "score": "5" - }, - { - "text": "The sky is blue. Water is wet. This is how it is.", - "score": "2" - } - ] - }, - { - "dimension_name": "Content Accuracy and Effectiveness", - "description": "Assess the truthfulness, relevance, and practical usefulness of the content.", - "example_list": [ - { - "text": "Newton's second law states that F = ma, which explains the relationship between force, mass, and acceleration.", - "score": "5" - }, - { - "text": "The Earth is flat and doesn't rotate around the Sun.", - "score": "1" - } - ] - } - ], - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 169, - "name": "AlpagasusSampleEvaluator", - "description": "通过调用GPT评估指令的质量,返回一个质量得分,得分越高表明指令的质量越高。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- dimension:评估维度,默认为'quality'\n- input_instruction_key:指令字段名\n- input_input_key:输入文本字段名\n- input_output_key:输出文本字段名\n- output_key:输出得分字段名,默认'AlpagasusScore'\n输出参数:\n- 包含评估得分的DataFrame", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [ - "AlpagasusPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dimension", - "default": "quality", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "AlpagasusScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 170, - "name": "DeitaQualitySampleEvaluator", - "description": "基于Llama模型的Deita指令质量评估器,通过生成1-6分的质量评分评估指令质量。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaQualityScore'\n输出参数:\n- 包含指令质量评分的DataFrame(1-6分)", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaQualityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 171, - "name": "DeitaComplexitySampleEvaluator", - "description": "基于Llama模型的Deita指令复杂性评估器,通过生成1-6分的复杂性评分评估指令难度。\n输入参数:\n- device:计算设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:最大序列长度,默认为512\n- input_instruction_key:指令文本字段名,默认为'instruction'\n- input_output_key:输出文本字段名,默认为'output'\n- output_key:输出得分字段名,默认为'DeitaComplexityScore'\n输出参数:\n- 包含指令复杂性评分的DataFrame(1-6分)", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaComplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 172, - "name": "InstagSampleEvaluator", - "description": "使用Instag评分器评估指令的内容多样性和意图标签。通过分析指令文本生成相关标签,标签数量越多表示内容多样性越大,同时返回标签的详细解释。基于OFA-Sys/InsTagger模型实现。\n输入参数:\n- query: 待评估的指令文本\n输出参数:\n- int: 标签数量(内容多样性指标)\n- list: 包含标签和解释的字典列表", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_new_tokens", - "default": 1024, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "temperature", - "default": 0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "do_sample", - "default": false, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_return_sequences", - "default": 1, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "return_dict_in_generate", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "InstagScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 173, - "name": "RMSampleEvaluator", - "description": "基于人类偏好数据训练的奖励模型(OpenAssistant/reward-model-deberta-v3-large-v2)对文本质量进行打分,高分代表质量较高。模型输入为指令和响应文本对,输出0-1之间的奖励分数,反映人类对文本质量的偏好判断。\n输入参数:\n- instruction: 指令文本字符串\n- output: 响应文本字符串\n输出参数:\n- float: 0-1之间的奖励分数,越高表示质量越好", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "RMScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 174, - "name": "SuperfilteringSampleEvaluator", - "description": "使用Superfiltering方法评估指令的跟随难度,基于GPT-2模型计算条件困惑度与独立困惑度的比值,得分越高表示指令越难跟随。该方法通过比较指令条件下的响应困惑度与独立响应困惑度,评估指令的清晰度和跟随难度。\n输入参数:\n- instruction: 指令文本\n- input_text: 输入文本(可选)\n- output: 响应文本\n输出参数:\n- float: 困惑度比值,越高表示指令跟随难度越大", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "SuperfilteringScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 175, - "name": "TreeinstructSampleEvaluator", - "description": "通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:指令字段名\n- output_key:输出得分字段名,默认'TreeinstructScore'\n输出参数:\n- 包含指令复杂性得分的DataFrame", - "type": { - "level_1": "text_sft", - "level_2": "eval" - }, - "allowed_prompts": [ - "TreeinstructPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TreeinstructScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 176, - "name": "AlpagasusFilter", - "description": "基于AlpagasusScorer打分器的得分对数据进行过滤。通过调用GPT模型评估指令的质量,返回一个质量得分。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3\n- max_score: 最高分数阈值,默认为5\n- llm_serving: LLM服务实例\n- dimension: 评估维度,默认为'quality'(质量)\n\n运行参数:\n- input_instruction_key: 输入指令字段名\n- input_input_key: 输入内容字段名\n- input_output_key: 输出内容字段名\n- output_key: 输出分数字段名,默认为'AlpagasusScore'\n\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 3, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "dimension", - "default": "quality", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "AlpagasusScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 177, - "name": "DeitaQualityFilter", - "description": "基于DeitaQualityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令质量评估器,评估指令的质量高低。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为2.5\n- max_score: 最高分数阈值,默认为10000.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaQualityScore'\n\n评分标准:1-6分,分数越高表示指令质量越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 2.5, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 10000.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaQualityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 178, - "name": "DeitaComplexityFilter", - "description": "基于DeitaComplexityScorer打分器的得分对数据进行过滤。使用基于Llama模型的Deita指令复杂性评估器,评估指令的复杂程度。\n\n初始化参数:\n- min_score: 最低分数阈值,默认为3.0\n- max_score: 最高分数阈值,默认为5.0\n- device: 运行设备,默认为'cuda'\n- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n- max_length: 最大序列长度,默认为512\n\n运行参数:\n- input_instruction_key: 输入指令字段名,默认为'instruction'\n- input_output_key: 输入输出字段名,默认为'output'\n- output_key: 输出分数字段名,默认为'DeitaComplexityScore'\n\n评分标准:1-6分,分数越高表示指令复杂性越高\n过滤逻辑:保留分数在[min_score, max_score]范围内的数据", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 3.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 5.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "DeitaComplexityScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 179, - "name": "InstagFilter", - "description": "基于InstagScorer打分器的过滤算子。使用预训练的Instag模型对指令进行分析,返回标签的数量来评估指令的内容多样性。参数包括模型缓存目录(model_cache_dir)、计算设备(device)和最大新生成标记数(max_new_tokens)。过滤范围由min_score和max_score参数控制,标签越多表示内容多样性越大。", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_new_tokens", - "default": 1024, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "InstagScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 180, - "name": "RMFilter", - "description": "基于RMScorer打分器的得分对数据进行过滤。使用基于人类偏好数据训练的奖励模型对文本质量进行评分,高分代表质量较高。\n奖励模型能够评估文本的相关性、有用性、无害性等人类偏好指标,可用于筛选符合人类价值观的高质量文本。\n输入参数:\n- min_score:保留样本的最小奖励分数阈值,默认为0.2\n- max_score:保留样本的最大奖励分数阈值,默认为0.8\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_output_key:输出字段名,默认为'output'\n输出参数:\n- 过滤后的DataFrame,仅保留奖励分数在[min_score, max_score]范围内的样本\n- 返回包含奖励分数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.2, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 0.8, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "RMScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 181, - "name": "SuperfilteringFilter", - "description": "使用Superfiltering评分器过滤掉低质量数据。基于GPT-2模型计算困惑度比值来评估指令跟随难度,比值越低表示指令越容易被模型理解和执行。\n适用于筛选适合特定模型能力的指令数据,提高模型训练效率和效果。\n输入参数:\n- min_score:保留样本的最小分数阈值,默认为0.0\n- max_score:保留样本的最大分数阈值,默认为1.0\n- device:模型运行设备,默认为'cuda'\n- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n- max_length:文本最大长度,默认为512\n- input_instruction_key:指令字段名,默认为'instruction'\n- input_input_key:输入字段名,默认为'input'\n- input_output_key:输出字段名,默认为'output'\n- output_key:过滤结果分数字段名,默认为'SuperfilteringScore'\n输出参数:\n- 过滤后的DataFrame,仅保留分数在[min_score, max_score]范围内的样本\n- 返回包含过滤结果分数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 0.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 1.0, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "device", - "default": "cuda", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "model_cache_dir", - "default": "./dataflow_cache", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_length", - "default": 512, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_input_key", - "default": "input", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "SuperfilteringScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 182, - "name": "TreeinstructFilter", - "description": "基于TreeinstructScore打分器的得分对数据进行过滤。通过生成语法树的节点数来衡量指令复杂性,节点越多表示指令越复杂。\n适用于筛选特定复杂度范围内的指令数据,平衡数据集难度分布,优化模型训练效果。\n输入参数:\n- min_score:保留样本的最小语法树节点数阈值,默认为7\n- max_score:保留样本的最大语法树节点数阈值,默认为100\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_key:输入指令字段名\n- output_key:语法树节点数字段名,默认为'TreeinstructScore'\n输出参数:\n- 过滤后的DataFrame,仅保留语法树节点数在[min_score, max_score]范围内的样本\n- 返回包含语法树节点数字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "filter" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "min_score", - "default": 7, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "max_score", - "default": 100, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_key", - "default": "TreeinstructScore", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 183, - "name": "CondorGenerator", - "description": "基于预置知识树标签,两阶段从0合成SFT格式数据(合成数量大于5000时建议增加标签数量)。第一阶段生成不同难度级别的问题,第二阶段为每个问题生成对应的答案。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- num_samples:生成样本总数,建议小于5000,默认值为15\n输出参数:\n- 包含'difficulty'、'instruction'和'output'字段的DataFrame\n- 返回生成的DataFrame用于后续处理", - "type": { - "level_1": "text_sft", - "level_2": "generate" - }, - "allowed_prompts": [ - "CondorQuestionPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "num_samples", - "default": 15, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "use_task_diversity", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 184, - "name": "SFTGeneratorSeed", - "description": "基于给定文档内容,生成监督微调格式的问答数据。并支持用户自定义生成内容要求。从原始文档中提取信息,生成符合SFT格式的指令-响应对。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- custom_prompt:用户自定义提示词\n- input_key:输入文档内容字段名,默认为'raw_content'\n- max_tokens:生成文本的最大token数,默认为4096\n输出参数:\n- 包含'instruction'、'output'和'raw_content'字段的DataFrame\n- 返回包含'instruction'和'output'字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "custom_prompt", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_key", - "default": "raw_content", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 185, - "name": "CondorRefiner", - "description": "两阶段优化指令回复质量:第一阶段调用API生成对回复的评论,第二阶段利用评论调用API改写回复,提升指令对质量。通过迭代优化提高问答对的整体质量。输入参数:\n- llm_serving:LLM服务对象,需实现LLMServingABC接口\n- input_instruction_key:输入指令字段名,默认为'instruction'\n- input_output_key:输入回复字段名,默认为'output'\n输出参数:\n- 包含优化后回复的DataFrame\n- 返回包含优化后回复字段名的列表,用于后续算子引用", - "type": { - "level_1": "text_sft", - "level_2": "refine" - }, - "allowed_prompts": [ - "CondorCritiquePrompt", - "CondorRefinePrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_instruction_key", - "default": "instruction", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_output_key", - "default": "output", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 186, - "name": "VQAExtractPdf2Img", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "dpi", - "default": 300, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_pdf_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 187, - "name": "VQAExtractDocLayoutMinerU", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "mineru_backend", - "default": "vlm-transformers", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_pdf_file_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 188, - "name": "VQAExtractPicExtractor", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [ - "VQAExtractPrompt" - ], - "parameter": { - "init": [ - { - "name": "llm_serving", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "interleaved", - "default": true, - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_subject", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 189, - "name": "VQAExtractQAPairExtractor", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_vqa_extract_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_qa_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 190, - "name": "VQAExtractTag2Img", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [ - { - "name": "layout_prefix", - "default": "doclay_page_", - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "image_prefix", - "default": "page_", - "kind": "POSITIONAL_OR_KEYWORD" - } - ], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_json", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_pdf_image_dir", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_dir", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_qa_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_qa_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_md_file", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 191, - "name": "VQAClipHeader", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_image_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_path", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_layout_prefix", - "default": "doclay", - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - }, - { - "node": 192, - "name": "VQAConcatenateImages", - "description": "N/A (非 staticmethod)", - "type": { - "level_1": "vqa", - "level_2": "generate" - }, - "allowed_prompts": [], - "parameter": { - "init": [], - "run": [ - { - "name": "storage", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "input_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - }, - { - "name": "output_image_folder", - "default": null, - "kind": "POSITIONAL_OR_KEYWORD" - } - ] - }, - "required": "", - "depends_on": [], - "mode": "" - } - ] -} \ No newline at end of file From 1aa8ff32ae2fee9fffb9a22a40f49d62ca72c999 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 1 Dec 2025 19:12:53 +0800 Subject: [PATCH 10/10] name & value pattern --- backend/app/api/v1/endpoints/pipelines.py | 14 ++++++++++++++ backend/app/schemas/pipelines.py | 2 +- backend/app/services/pipeline_registry.py | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/backend/app/api/v1/endpoints/pipelines.py b/backend/app/api/v1/endpoints/pipelines.py index 633cc80..4256c1e 100644 --- a/backend/app/api/v1/endpoints/pipelines.py +++ b/backend/app/api/v1/endpoints/pipelines.py @@ -34,6 +34,11 @@ def create_pipeline(request: Request, payload: PipelineIn): try: logger.info(f"Request: {request.method} {request.url.path}, Pipeline name: {payload.name}") pipeline_in_data = payload.model_dump() + + operators = pipeline_in_data.get("config", {}).get("operators", []) + for op in operators: + op["params"] = _PIPELINE_REGISTRY.parse_frontend_params(op.get("params", [])) + pipeline = _PIPELINE_REGISTRY.create_pipeline(pipeline_in_data) return created(pipeline) except ValueError as e: @@ -54,6 +59,11 @@ def get_pipeline(pipeline_id: str): def update_pipeline(pipeline_id: str, payload: PipelineIn): try: pipeline_in_data = payload.model_dump() + + operators = pipeline_in_data.get("config", {}).get("operators", []) + for op in operators: + op["params"] = _PIPELINE_REGISTRY.parse_frontend_params(op.get("params", [])) + updated_pipeline = _PIPELINE_REGISTRY.update_pipeline(pipeline_id, pipeline_in_data) return ok(updated_pipeline) except ValueError as e: @@ -82,6 +92,10 @@ async def execute_pipeline(request: Request, payload: PipelineExecutionRequest, try: logger.info(f"Request: {request.method} {request.url.path}") + if payload.config: + for op in payload.config.operators: + op.params = _PIPELINE_REGISTRY.parse_frontend_params(op.params) + # 调用服务层开始执行 execution_id, pipeline_config, initial_result = _PIPELINE_REGISTRY.start_execution( pipeline_id=payload.pipeline_id, diff --git a/backend/app/schemas/pipelines.py b/backend/app/schemas/pipelines.py index e508354..12c30e4 100644 --- a/backend/app/schemas/pipelines.py +++ b/backend/app/schemas/pipelines.py @@ -33,7 +33,7 @@ class ExecutionStatus(str, Enum): failed = "failed" -class PipelineOperator(OperatorDetailSchema): # 画布上的pipeline类 +class PipelineOperator(BaseModel): # 画布上的pipeline类 """Pipeline算子模型""" name: str = Field(..., description="算子名称") params: Dict[str, Any] = Field(default_factory=dict, description="算子参数配置") diff --git a/backend/app/services/pipeline_registry.py b/backend/app/services/pipeline_registry.py index aa08aa3..f5b8ae7 100644 --- a/backend/app/services/pipeline_registry.py +++ b/backend/app/services/pipeline_registry.py @@ -83,6 +83,21 @@ def _write(self, data: Dict): with open(self.path, "w", encoding="utf-8") as f: yaml.safe_dump(data, f, allow_unicode=True, sort_keys=False) + def parse_frontend_params(self, params_list): + """ + 将前端 [{name: xxx, value: yyy}] 解析成字典 {xxx: yyy} + """ + if not params_list: + return {} + + parsed = {} + for item in params_list: + # item = {"name": "...", "value": ...} + key = item.get("name") + value = item.get("value") + if key is not None: + parsed[key] = value + return parsed def get_current_time(self): """获取当前时间的ISO格式字符串"""