From 9ed2d8670e123a4247a42909ffbe2a596e0b2b65 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Mon, 9 Jun 2025 14:42:16 +0200 Subject: [PATCH 1/7] Feature/dry run (#68) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Highlight - Structured input specs as dicts: required_variable → concept_code - Static validation of inputs for inference pipes: PipeLLM, PipeOcr and PipeImgGen: you can now catch mistakes in your pipelines without running any operation - `static_validation_config` to decide how to react to errors, kind of like pyright settings: for each type of error you can choose `raise`, `log` or `ignore` , with a general default of course - Dry run validation catches errors in pipelines by running them without any inference, so it’s fast, costs nothing and works without internet, which makes it suitable for validating pipelines (like a linter) and this can even be used by agents like Cursor and Codex (more on that soon) - Dry run pipes without inference - `dry_run_config` to enable/disable jinja2 rendering during dry run - Added dependency to polyfactory for mock pydantic model generation - Improved Error handling for bad inputs during `run_pipe`, which works in dry run too, that's the purpose ### Tests - Added pytest option `-pipe-run-mode` to `shared_pytest_plugins.py`, values `live` or `dry`, **default = `dry`** - Made all the test_pipe apply pipe_run_mode - Introduced a new pytest marker `pipelex_api`, applied to client methods instead of inference, llm etc. 
- Added target `make test-pipelex-api` aka `make ta` to run just the client methods ### Refactor - New way (simpler and way more elegant) to wrap inference worker methods (llm, image gen, ocr): instead of the `job_func` decorators, it uses a base method that calls a private abstract method implemented by each worker, and does stuff before and after ### Fixes - Fixed logged error regarding pipe stack and PipeParallel - Restored pipe tracker functionality, which no longer crashes when using stuff attributes as inputs --- Makefile | 17 +- pipelex/cli/_cli.py | 33 +- .../content_generator_dry.py | 249 +++++++++++++++ pipelex/cogt/image/prompt_image.py | 26 +- pipelex/cogt/image/prompt_image_factory.py | 10 +- pipelex/cogt/imgg/imgg_worker_abstract.py | 99 +++--- pipelex/cogt/llm/llm_job_func.py | 75 ----- pipelex/cogt/llm/llm_models/llm_deck.py | 6 +- pipelex/cogt/llm/llm_models/llm_setting.py | 12 +- pipelex/cogt/llm/llm_prompt.py | 45 ++- pipelex/cogt/llm/llm_worker_abstract.py | 113 +++++-- pipelex/cogt/llm/llm_worker_factory.py | 16 +- pipelex/cogt/ocr/ocr_worker_abstract.py | 69 ++-- pipelex/config.py | 36 ++- pipelex/core/concept.py | 36 ++- pipelex/core/concept_library.py | 21 +- pipelex/core/concept_native.py | 7 + pipelex/core/pipe_abstract.py | 16 +- pipelex/core/pipe_blueprint.py | 16 +- pipelex/core/pipe_input_details.py | 80 +++++ pipelex/core/pipe_input_spec.py | 88 ++++++ pipelex/core/pipe_library.py | 3 +- pipelex/core/pipe_run_params.py | 12 + pipelex/core/pipe_run_params_factory.py | 4 +- pipelex/core/working_memory.py | 60 ++-- pipelex/core/working_memory_factory.py | 6 +- pipelex/exceptions.py | 118 +++++-- pipelex/libraries/library_manager.py | 90 ++++-- pipelex/libraries/llm_deck/base_llm_deck.toml | 3 +- .../libraries/llm_integrations/vertexai.toml | 18 +- pipelex/libraries/pipelines/documents.toml | 10 +- .../libraries/pipelines/image_generation.toml | 21 -- pipelex/libraries/pipelines/images.toml | 44 ++- 
pipelex/libraries/pipelines/questions.py | 183 ----------- pipelex/libraries/pipelines/questions.toml | 294 ------------------ pipelex/libraries/pipelines/retrieve.py | 12 - pipelex/libraries/pipelines/retrieve.toml | 25 -- pipelex/pipe_controllers/pipe_batch.py | 121 +++---- .../pipe_controllers/pipe_batch_factory.py | 3 +- pipelex/pipe_controllers/pipe_condition.py | 6 +- .../pipe_condition_factory.py | 3 +- pipelex/pipe_controllers/pipe_parallel.py | 7 +- .../pipe_controllers/pipe_parallel_factory.py | 3 +- pipelex/pipe_controllers/pipe_sequence.py | 26 +- .../pipe_controllers/pipe_sequence_factory.py | 5 +- pipelex/pipe_controllers/sub_pipe.py | 23 +- pipelex/pipe_operators/pipe_func.py | 3 - pipelex/pipe_operators/pipe_func_factory.py | 3 +- pipelex/pipe_operators/pipe_img_gen.py | 137 +++++++- .../pipe_operators/pipe_img_gen_factory.py | 10 +- pipelex/pipe_operators/pipe_jinja2.py | 36 ++- pipelex/pipe_operators/pipe_jinja2_factory.py | 3 +- pipelex/pipe_operators/pipe_llm.py | 152 +++++++-- pipelex/pipe_operators/pipe_llm_factory.py | 5 +- pipelex/pipe_operators/pipe_llm_prompt.py | 34 +- pipelex/pipe_operators/pipe_ocr.py | 137 +++++++- pipelex/pipe_operators/pipe_ocr_factory.py | 17 +- pipelex/pipe_operators/pipe_operator.py | 41 ++- pipelex/pipelex.toml | 21 ++ pipelex/pipeline/execute.py | 6 +- pipelex/pipeline/start.py | 7 +- .../plugins/anthropic/anthropic_llm_worker.py | 7 +- pipelex/plugins/bedrock/bedrock_llm_worker.py | 7 +- pipelex/plugins/fal/fal_imgg_worker.py | 8 +- pipelex/plugins/mistral/mistral_factory.py | 2 +- pipelex/plugins/mistral/mistral_llm_worker.py | 7 +- pipelex/plugins/mistral/mistral_ocr_worker.py | 5 +- pipelex/plugins/openai/openai_factory.py | 4 +- pipelex/plugins/openai/openai_imgg_worker.py | 8 +- pipelex/plugins/openai/openai_llm_worker.py | 7 +- pipelex/plugins/openai/vertexai_config.py | 8 +- pipelex/test_extras/shared_pytest_plugins.py | 18 ++ pipelex/tools/misc/attribute_utils.py | 34 ++ 
pipelex/tools/misc/markdown_utils.py | 12 +- pipelex/tools/typing/pydantic_utils.py | 21 +- pyproject.toml | 4 +- tests/pipelex/cogt/conftest.py | 183 ----------- .../{cogt => }/cogt_asynch/__init__.py | 0 .../cogt_asynch/test_content_generator.py | 0 .../{cogt => }/cogt_asynch/test_image_gen.py | 0 .../cogt_asynch/test_llm_engines.py | 0 .../cogt_asynch/test_llm_gen_object.py | 2 +- .../cogt_asynch/test_llm_gen_text.py | 2 +- .../cogt_asynch/test_llm_job_async_factory.py | 0 .../{cogt => }/cogt_asynch/test_llm_report.py | 0 .../{cogt => }/cogt_asynch/test_llm_vision.py | 2 +- .../{cogt => }/cogt_asynch/test_ocr.py | 0 tests/pipelex/conftest.py | 190 ++++++++++- tests/pipelex/pipelex_asynch/test_client.py | 3 +- .../pipelex/pipelex_asynch/test_pipe_batch.py | 8 +- .../pipelex/pipelex_asynch/test_pipe_imgg.py | 4 + .../pipelex_asynch/test_pipe_jinja2.py | 55 ++++ tests/pipelex/pipelex_asynch/test_pipe_llm.py | 15 +- tests/pipelex/pipelex_asynch/test_pipe_ocr.py | 19 +- .../test_pipe_running_variants.py | 21 +- tests/pipelex/test_data.py | 10 +- tests/pipelex/test_libraries.py | 2 +- tests/test_pipelines/answer.toml | 251 --------------- tests/test_pipelines/basic.toml | 171 ---------- tests/test_pipelines/contracts.py | 75 ----- tests/test_pipelines/contracts.toml | 13 - tests/test_pipelines/failure_modes.toml | 2 - tests/test_pipelines/misc_tests/flows.toml | 13 +- .../misc_tests/multiplicity.toml | 8 +- .../test_pipelines/misc_tests/pipe_batch.toml | 9 +- .../misc_tests/subfolder_1/cars.toml | 2 +- .../misc_tests/subfolder_2/animals.toml | 2 +- .../subfolder_2/subfolder/flowers.toml | 2 +- .../misc_tests/test_errors.toml | 10 +- .../misc_tests/test_jinja2.toml | 2 +- tests/test_pipelines/misc_tests/tests.toml | 46 +-- tests/test_pipelines/test_images.toml | 22 ++ tests/test_pipelines/tricky_questions.py | 16 + tests/test_pipelines/tricky_questions.toml | 77 +++++ tests/tools/typing/test_pydantic_utils.py | 4 +- uv.lock | 29 +- 116 files changed, 2272 
insertions(+), 1932 deletions(-) create mode 100644 pipelex/cogt/content_generation/content_generator_dry.py delete mode 100644 pipelex/cogt/llm/llm_job_func.py create mode 100644 pipelex/core/pipe_input_details.py create mode 100644 pipelex/core/pipe_input_spec.py delete mode 100644 pipelex/libraries/pipelines/image_generation.toml delete mode 100644 pipelex/libraries/pipelines/questions.py delete mode 100644 pipelex/libraries/pipelines/questions.toml delete mode 100644 pipelex/libraries/pipelines/retrieve.py delete mode 100644 pipelex/libraries/pipelines/retrieve.toml create mode 100644 pipelex/tools/misc/attribute_utils.py delete mode 100644 tests/pipelex/cogt/conftest.py rename tests/pipelex/{cogt => }/cogt_asynch/__init__.py (100%) rename tests/pipelex/{cogt => }/cogt_asynch/test_content_generator.py (100%) rename tests/pipelex/{cogt => }/cogt_asynch/test_image_gen.py (100%) rename tests/pipelex/{cogt => }/cogt_asynch/test_llm_engines.py (100%) rename tests/pipelex/{cogt => }/cogt_asynch/test_llm_gen_object.py (99%) rename tests/pipelex/{cogt => }/cogt_asynch/test_llm_gen_text.py (99%) rename tests/pipelex/{cogt => }/cogt_asynch/test_llm_job_async_factory.py (100%) rename tests/pipelex/{cogt => }/cogt_asynch/test_llm_report.py (100%) rename tests/pipelex/{cogt => }/cogt_asynch/test_llm_vision.py (98%) rename tests/pipelex/{cogt => }/cogt_asynch/test_ocr.py (100%) create mode 100644 tests/pipelex/pipelex_asynch/test_pipe_jinja2.py delete mode 100644 tests/test_pipelines/answer.toml delete mode 100644 tests/test_pipelines/basic.toml delete mode 100644 tests/test_pipelines/contracts.py delete mode 100644 tests/test_pipelines/contracts.toml create mode 100644 tests/test_pipelines/test_images.toml create mode 100644 tests/test_pipelines/tricky_questions.py create mode 100644 tests/test_pipelines/tricky_questions.toml diff --git a/Makefile b/Makefile index 23aa02439..fc6c08af4 100644 --- a/Makefile +++ b/Makefile @@ -203,12 +203,12 @@ cleanall: cleanderived cleanenv 
cleanlibraries codex-tests: env $(call PRINT_TITLE,"Unit testing for Codex") @echo "• Running unit tests for Codex (excluding inference and codex_disabled)" - $(VENV_PYTEST) --exitfirst --quiet -m "not inference and not codex_disabled" || [ $$? = 5 ] + $(VENV_PYTEST) --exitfirst --quiet -m "not (inference or codex_disabled or pipelex_api)" || [ $$? = 5 ] gha-tests: env $(call PRINT_TITLE,"Unit testing for github actions") @echo "• Running unit tests for github actions (excluding inference and gha_disabled)" - $(VENV_PYTEST) --exitfirst --quiet -m "not inference and not gha_disabled" || [ $$? = 5 ] + $(VENV_PYTEST) --exitfirst --quiet -m "not (inference or gha_disabled or pipelex_api)" || [ $$? = 5 ] run-all-tests: env $(call PRINT_TITLE,"Running all unit tests") @@ -218,7 +218,7 @@ run-all-tests: env run-manual-trigger-gha-tests: env $(call PRINT_TITLE,"Running GHA tests") @echo "• Running GHA unit tests for inference, llm, and not gha_disabled" - $(VENV_PYTEST) --exitfirst --quiet -m "not gha_disabled and (inference or llm)" || [ $$? = 5 ] + $(VENV_PYTEST) --exitfirst --quiet -m "not (gha_disabled or pipelex_api) and (inference or llm)" || [ $$? 
= 5 ] run-gha_disabled-tests: env $(call PRINT_TITLE,"Running GHA disabled tests") @@ -303,6 +303,17 @@ test-imgg: env tg: test-imgg @echo "> done: tg = test-imgg" +test-pipelex-api: env + $(call PRINT_TITLE,"Unit testing") + @if [ -n "$(TEST)" ]; then \ + $(VENV_PYTEST) --exitfirst -m "pipelex_api" -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + else \ + $(VENV_PYTEST) --exitfirst -m "pipelex_api" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + fi + +ta: test-pipelex-api + @echo "> done: ta = test-pipelex-api" + ############################################################################################ ############################ Linting ############################ ############################################################################################ diff --git a/pipelex/cli/_cli.py b/pipelex/cli/_cli.py index 867360c6b..a57b68b39 100644 --- a/pipelex/cli/_cli.py +++ b/pipelex/cli/_cli.py @@ -137,22 +137,23 @@ def _format_concept_code(concept_code: Optional[str], current_domain: str) -> st pipes_dict[domain] = {} for pipe in domain_pipes: - if pipe.code: - input_code = _format_concept_code(pipe.input_concept_code, domain) - output_code = _format_concept_code(pipe.output_concept_code, domain) - - table.add_row( - pipe.code, - pipe.definition or "", - input_code, - output_code, - ) - - pipes_dict[domain][pipe.code] = { - "definition": pipe.definition or "", - "input": pipe.input_concept_code or "", - "output": pipe.output_concept_code or "", - } + inputs = pipe.inputs + formatted_inputs = [f"{name}: {_format_concept_code(concept_code, domain)}" for name, concept_code in inputs.items()] + formatted_inputs_str = ", ".join(formatted_inputs) + output_code = _format_concept_code(pipe.output_concept_code, domain) + + table.add_row( + pipe.code, + pipe.definition or "", + formatted_inputs_str, + output_code, + ) + + 
pipes_dict[domain][pipe.code] = { + "definition": pipe.definition or "", + "inputs": formatted_inputs_str, + "output": pipe.output_concept_code, + } pretty_print(table) diff --git a/pipelex/cogt/content_generation/content_generator_dry.py b/pipelex/cogt/content_generation/content_generator_dry.py new file mode 100644 index 000000000..e295359f3 --- /dev/null +++ b/pipelex/cogt/content_generation/content_generator_dry.py @@ -0,0 +1,249 @@ +from typing import Any, Dict, List, Optional, Type + +from polyfactory.factories.pydantic_factory import ModelFactory +from typing_extensions import override + +from pipelex import log +from pipelex.cogt.content_generation.content_generator_protocol import ContentGeneratorProtocol, update_job_metadata +from pipelex.cogt.image.generated_image import GeneratedImage +from pipelex.cogt.imgg.imgg_handle import ImggHandle +from pipelex.cogt.imgg.imgg_job_components import ImggJobConfig, ImggJobParams +from pipelex.cogt.imgg.imgg_prompt import ImggPrompt +from pipelex.cogt.llm.llm_models.llm_setting import LLMSetting +from pipelex.cogt.llm.llm_prompt import LLMPrompt +from pipelex.cogt.llm.llm_prompt_factory_abstract import LLMPromptFactoryAbstract +from pipelex.cogt.ocr.ocr_handle import OcrHandle +from pipelex.cogt.ocr.ocr_input import OcrInput +from pipelex.cogt.ocr.ocr_job_components import OcrJobConfig, OcrJobParams +from pipelex.cogt.ocr.ocr_output import ExtractedImageFromPage, OcrOutput, Page +from pipelex.config import get_config +from pipelex.pipeline.job_metadata import JobMetadata +from pipelex.tools.templating.jinja2_environment import Jinja2TemplateCategory +from pipelex.tools.templating.templating_models import PromptingStyle +from pipelex.tools.typing.pydantic_utils import BaseModelTypeVar + + +class ContentGeneratorDry(ContentGeneratorProtocol): + """ + This class is used to generate mock content for testing purposes. + It does not use any inference. 
+ """ + + @property + def _text_gen_truncate_length(self) -> int: + return get_config().pipelex.dry_run_config.text_gen_truncate_length + + @override + @update_job_metadata + async def make_llm_text( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + llm_setting_main: LLMSetting, + llm_prompt_for_text: LLMPrompt, + wfid: Optional[str] = None, + ) -> str: + func_name = "make_llm_text" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + prompt_truncated = llm_prompt_for_text.desc(truncate_text_length=self._text_gen_truncate_length) + generated_text = f"DRY RUN: {func_name} • llm_setting={llm_setting_main.desc()} • prompt={prompt_truncated}" + return generated_text + + @override + @update_job_metadata + async def make_object_direct( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + object_class: Type[BaseModelTypeVar], + llm_setting_for_object: LLMSetting, + llm_prompt_for_object: LLMPrompt, + wfid: Optional[str] = None, + ) -> BaseModelTypeVar: + func_name = "make_object_direct" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + + class ObjectFactory(ModelFactory[object_class]): # type: ignore + __model__ = object_class + + obj = ObjectFactory.build() + return obj + + @override + @update_job_metadata + async def make_text_then_object( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + object_class: Type[BaseModelTypeVar], + llm_setting_main: LLMSetting, + llm_setting_for_object: LLMSetting, + llm_prompt_for_text: LLMPrompt, + llm_prompt_factory_for_object: Optional[LLMPromptFactoryAbstract] = None, + wfid: Optional[str] = None, + ) -> BaseModelTypeVar: + func_name = "make_text_then_object" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + return await self.make_object_direct( + job_metadata=job_metadata, + object_class=object_class, + llm_setting_for_object=llm_setting_for_object, + 
llm_prompt_for_object=llm_prompt_for_text, + ) + + @override + @update_job_metadata + async def make_object_list_direct( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + object_class: Type[BaseModelTypeVar], + llm_setting_for_object_list: LLMSetting, + llm_prompt_for_object_list: LLMPrompt, + wfid: Optional[str] = None, + ) -> List[BaseModelTypeVar]: + func_name = "make_object_list_direct" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + object_1 = await self.make_object_direct( + job_metadata=job_metadata, + object_class=object_class, + llm_setting_for_object=llm_setting_for_object_list, + llm_prompt_for_object=llm_prompt_for_object_list, + ) + object_2 = await self.make_object_direct( + job_metadata=job_metadata, + object_class=object_class, + llm_setting_for_object=llm_setting_for_object_list, + llm_prompt_for_object=llm_prompt_for_object_list, + ) + two_objects = [object_1, object_2] + return two_objects + + @override + @update_job_metadata + async def make_text_then_object_list( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + object_class: Type[BaseModelTypeVar], + llm_setting_main: LLMSetting, + llm_setting_for_object_list: LLMSetting, + llm_prompt_for_text: LLMPrompt, + llm_prompt_factory_for_object_list: Optional[LLMPromptFactoryAbstract] = None, + wfid: Optional[str] = None, + ) -> List[BaseModelTypeVar]: + func_name = "make_text_then_object_list" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + return await self.make_object_list_direct( + job_metadata=job_metadata, + object_class=object_class, + llm_setting_for_object_list=llm_setting_for_object_list, + llm_prompt_for_object_list=llm_prompt_for_text, + ) + + @override + @update_job_metadata + async def make_single_image( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + imgg_handle: ImggHandle, + imgg_prompt: ImggPrompt, + imgg_job_params: 
Optional[ImggJobParams] = None, + imgg_job_config: Optional[ImggJobConfig] = None, + wfid: Optional[str] = None, + ) -> GeneratedImage: + func_name = "make_single_image" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + generated_image = GeneratedImage( + url="https://storage.googleapis.com/public_test_files_7fa6_4277_9ab/fashion/fashion_photo_1.jpg", + width=1536, + height=2752, + ) + return generated_image + + @override + @update_job_metadata + async def make_image_list( # pyright: ignore[reportIncompatibleMethodOverride] + self, + job_metadata: JobMetadata, + imgg_handle: ImggHandle, + imgg_prompt: ImggPrompt, + nb_images: int, + imgg_job_params: Optional[ImggJobParams] = None, + imgg_job_config: Optional[ImggJobConfig] = None, + wfid: Optional[str] = None, + ) -> List[GeneratedImage]: + func_name = "make_image_list" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + generated_image_list = [ + GeneratedImage( + url="https://storage.googleapis.com/public_test_files_7fa6_4277_9ab/fashion/fashion_photo_1.jpg", + width=1536, + height=2752, + ), + GeneratedImage( + url="https://storage.googleapis.com/public_test_files_7fa6_4277_9ab/fashion/fashion_photo_2.png", + width=1024, + height=1536, + ), + ] + return generated_image_list + + @override + async def make_jinja2_text( + self, + context: Dict[str, Any], + jinja2_name: Optional[str] = None, + jinja2: Optional[str] = None, + prompting_style: Optional[PromptingStyle] = None, + template_category: Jinja2TemplateCategory = Jinja2TemplateCategory.LLM_PROMPT, + wfid: Optional[str] = None, + ) -> str: + func_name = "make_jinja2_text" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + jinja2_truncated = jinja2[: self._text_gen_truncate_length] if jinja2 else None + jinja2_text = ( + f"DRY RUN: {func_name} • context={context} • jinja2_name={jinja2_name} • " + f"jinja2={jinja2_truncated} • prompting_style={prompting_style} • template_category={template_category}" + ) + return 
jinja2_text + + @override + async def make_ocr_extract_pages( + self, + job_metadata: JobMetadata, + ocr_input: OcrInput, + ocr_handle: OcrHandle, + ocr_job_params: Optional[OcrJobParams] = None, + ocr_job_config: Optional[OcrJobConfig] = None, + wfid: Optional[str] = None, + ) -> OcrOutput: + func_name = "make_ocr_extract_pages" + log.dev(f"🤡 DRY RUN: {self.__class__.__name__}.{func_name}") + if ocr_input.image_uri: + ocr_image_as_page = Page( + text="DRY RUN: OCR text", + extracted_images=[], + page_view=None, + ) + ocr_output = OcrOutput( + pages={1: ocr_image_as_page}, + ) + else: + ocr_page_1 = Page( + text="DRY RUN: OCR text", + extracted_images=[], + page_view=ExtractedImageFromPage( + image_id="page_view_1", + base_64="", + caption="DRY RUN: OCR text", + ), + ) + ocr_page_2 = Page( + text="DRY RUN: OCR text", + extracted_images=[], + page_view=ExtractedImageFromPage( + image_id="page_view_2", + base_64="", + caption="DRY RUN: OCR text", + ), + ) + ocr_output = OcrOutput( + pages={1: ocr_page_1, 2: ocr_page_2}, + ) + return ocr_output diff --git a/pipelex/cogt/image/prompt_image.py b/pipelex/cogt/image/prompt_image.py index 92fe8aa68..aafa752c0 100644 --- a/pipelex/cogt/image/prompt_image.py +++ b/pipelex/cogt/image/prompt_image.py @@ -4,6 +4,7 @@ from pydantic import BaseModel from typing_extensions import override +from pipelex.tools.misc.attribute_utils import AttributePolisher from pipelex.tools.misc.filetype_utils import FileType, detect_file_type_from_base64, detect_file_type_from_path @@ -35,26 +36,33 @@ class PromptImageUrl(PromptImage): @override def __str__(self) -> str: - return f"PromptImageUrl(url='{self.url}')" + truncated_url = AttributePolisher.get_truncated_value(name="url", value=self.url) + return f"PromptImageUrl(url='{truncated_url}')" + + @override + def __format__(self, format_spec: str) -> str: + return self.__str__() class PromptImageBytes(PromptImage): - b64_image_bytes: bytes + base_64: bytes def get_file_type(self) -> FileType: - 
return detect_file_type_from_base64(self.b64_image_bytes) + return detect_file_type_from_base64(self.base_64) @override def __str__(self) -> str: - bytes_sample: str = str(self.b64_image_bytes[:20]) - if len(self.b64_image_bytes) > 20: - bytes_sample += "..." - bytes_preview = f"{len(self.b64_image_bytes)} bytes: {bytes_sample}" - return f"PromptImageBytes(image_bytes={bytes_preview})" + base_64_str = str(self.base_64) + truncated_base_64 = AttributePolisher.get_truncated_value(name="base_64", value=base_64_str) + return f"PromptImageBytes(image_bytes={truncated_base_64})" @override def __repr__(self) -> str: return self.__str__() + @override + def __format__(self, format_spec: str) -> str: + return self.__str__() + def make_prompt_image_typed_bytes(self) -> PromptImageTypedBytes: - return PromptImageTypedBytes(image_bytes=self.b64_image_bytes, file_type=self.get_file_type()) + return PromptImageTypedBytes(image_bytes=self.base_64, file_type=self.get_file_type()) diff --git a/pipelex/cogt/image/prompt_image_factory.py b/pipelex/cogt/image/prompt_image_factory.py index b7f875fee..17f99fd06 100644 --- a/pipelex/cogt/image/prompt_image_factory.py +++ b/pipelex/cogt/image/prompt_image_factory.py @@ -23,7 +23,7 @@ def make_prompt_image( elif url: return PromptImageUrl(url=url) elif base_64: - return PromptImageBytes(b64_image_bytes=base_64) + return PromptImageBytes(base_64=base_64) else: raise PromptImageFactoryError("PromptImageFactory requires one of file_path, url, or image_bytes") @@ -44,17 +44,17 @@ async def make_promptimagebytes_from_url_async( prompt_image_url: PromptImageUrl, ) -> PromptImageBytes: raw_image_bytes = await fetch_file_from_url_httpx_async(prompt_image_url.url) - b64_image_bytes = await encode_to_base64_async(raw_image_bytes) - return PromptImageBytes(b64_image_bytes=b64_image_bytes) + base_64 = await encode_to_base64_async(raw_image_bytes) + return PromptImageBytes(base_64=base_64) @classmethod async def promptimage_to_b64_async(cls, 
image_prompt: PromptImage) -> bytes: if isinstance(image_prompt, PromptImagePath): return await load_binary_as_base64_async(image_prompt.file_path) elif isinstance(image_prompt, PromptImageBytes): - return image_prompt.b64_image_bytes + return image_prompt.base_64 elif isinstance(image_prompt, PromptImageUrl): image_bytes = await cls.make_promptimagebytes_from_url_async(image_prompt) - return image_bytes.b64_image_bytes + return image_bytes.base_64 else: raise PromptImageFactoryError(f"Unknown PromptImage type: {image_prompt}") diff --git a/pipelex/cogt/imgg/imgg_worker_abstract.py b/pipelex/cogt/imgg/imgg_worker_abstract.py index ce915d880..c8df26736 100644 --- a/pipelex/cogt/imgg/imgg_worker_abstract.py +++ b/pipelex/cogt/imgg/imgg_worker_abstract.py @@ -1,8 +1,7 @@ from abc import abstractmethod -from functools import wraps -from typing import Any, Callable, List, Optional, TypeVar, cast +from typing import List, Optional -from typing_extensions import Awaitable, override +from typing_extensions import override from pipelex import log from pipelex.cogt.image.generated_image import GeneratedImage @@ -12,25 +11,40 @@ from pipelex.pipeline.job_metadata import UnitJobId from pipelex.reporting.reporting_protocol import ReportingProtocol -F = TypeVar("F", bound=Callable[..., Awaitable[Any]]) +class ImggWorkerAbstract(InferenceWorkerAbstract): + def __init__( + self, + imgg_engine: ImggEngine, + reporting_delegate: Optional[ReportingProtocol] = None, + ): + InferenceWorkerAbstract.__init__(self, reporting_delegate=reporting_delegate) + self.imgg_engine = imgg_engine + + ######################################################### + # Instance methods + ######################################################### -def imgg_job_func(func: F) -> F: - @wraps(func) - async def wrapper( - self: Any, + @property + @override + def desc(self) -> str: + return f"Img Worker using:\n{self.imgg_engine.desc}" + + def _check_can_perform_job(self, imgg_job: ImggJob): + # This can be 
overridden by subclasses for specific checks + pass + + async def gen_image( + self, imgg_job: ImggJob, - *args: Any, - **kwargs: Any, - ) -> Any: - log.debug(f"Working — {func.__name__} using:\n{self.imgg_engine.desc}") + ) -> GeneratedImage: + log.debug(f"Image gen worker gen_image:\n{self.imgg_engine.desc}") # Verify that the job is valid imgg_job.validate_before_execution() # Verify feasibility - self.check_can_perform_job(imgg_job=imgg_job) - # TODO: check can generate object (where it will be appropriate) + self._check_can_perform_job(imgg_job=imgg_job) # metadata imgg_job.job_metadata.unit_job_id = UnitJobId.IMGG_TEXT_TO_IMAGE @@ -39,7 +53,7 @@ async def wrapper( imgg_job.imgg_job_before_start(imgg_engine=self.imgg_engine) # Execute job - result = await func(self, imgg_job, *args, **kwargs) + result = await self._gen_image(imgg_job=imgg_job) # Report job imgg_job.imgg_job_after_complete() @@ -48,39 +62,44 @@ async def wrapper( return result - return cast(F, wrapper) - + @abstractmethod + async def _gen_image( + self, + imgg_job: ImggJob, + ) -> GeneratedImage: + pass -class ImggWorkerAbstract(InferenceWorkerAbstract): - def __init__( + async def gen_image_list( self, - imgg_engine: ImggEngine, - reporting_delegate: Optional[ReportingProtocol] = None, - ): - InferenceWorkerAbstract.__init__(self, reporting_delegate=reporting_delegate) - self.imgg_engine = imgg_engine + imgg_job: ImggJob, + nb_images: int, + ) -> List[GeneratedImage]: + log.debug(f"Image gen worker gen_image_list:\n{self.imgg_engine.desc}") - ######################################################### - # Instance methods - ######################################################### + # Verify that the job is valid + imgg_job.validate_before_execution() - @property - @override - def desc(self) -> str: - return f"Img Worker using:\n{self.imgg_engine.desc}" + # Verify feasibility + self._check_can_perform_job(imgg_job=imgg_job) - def check_can_perform_job(self, imgg_job: ImggJob): - pass + # 
metadata + imgg_job.job_metadata.unit_job_id = UnitJobId.IMGG_TEXT_TO_IMAGE - @abstractmethod - async def gen_image( - self, - imgg_job: ImggJob, - ) -> GeneratedImage: - pass + # Prepare job + imgg_job.imgg_job_before_start(imgg_engine=self.imgg_engine) + + # Execute job + result = await self._gen_image_list(imgg_job=imgg_job, nb_images=nb_images) + + # Report job + imgg_job.imgg_job_after_complete() + if self.reporting_delegate: + self.reporting_delegate.report_inference_job(inference_job=imgg_job) + + return result @abstractmethod - async def gen_image_list( + async def _gen_image_list( self, imgg_job: ImggJob, nb_images: int, diff --git a/pipelex/cogt/llm/llm_job_func.py b/pipelex/cogt/llm/llm_job_func.py deleted file mode 100644 index 4da6d83a8..000000000 --- a/pipelex/cogt/llm/llm_job_func.py +++ /dev/null @@ -1,75 +0,0 @@ -from functools import wraps -from typing import Any, Awaitable, Callable, TypeVar, cast - -from instructor.exceptions import InstructorRetryException - -from pipelex import log -from pipelex.cogt.exceptions import LLMWorkerError -from pipelex.cogt.llm.llm_job import LLMJob -from pipelex.cogt.llm.llm_worker_abstract import LLMWorkerAbstract, LLMWorkerJobFuncName - -F = TypeVar("F", bound=Callable[..., Awaitable[Any]]) - - -def llm_job_func(func: F) -> F: - """ - A decorator for asynchronous LLM job functions. - - This decorator wraps an asynchronous function that performs an LLM job, - adding logging, integrity checks, feasibility checks, job preparation, - execution timing, and reporting. - - Args: - func (F): The asynchronous function to be decorated. - - Returns: - F: The wrapped asynchronous function. 
- """ - - @wraps(func) - async def wrapper( - self: LLMWorkerAbstract, - llm_job: LLMJob, - *args: Any, - **kwargs: Any, - ) -> Any: - func_name = LLMWorkerJobFuncName(func.__name__) - log.debug(f"LLM Working async job function: '{func_name}'") - log.verbose(f"\n{self.llm_engine.desc}") - log.verbose(llm_job.params_desc) - - # Verify that the job is valid - llm_job.validate_before_execution() - - # Verify feasibility - self.check_can_perform_job(llm_job=llm_job, func_name=func_name) - - # TODO: Fix printing prompts that contain image bytes - # log.verbose(llm_job.llm_prompt.desc, title="llm_prompt") - - # metadata - llm_job.job_metadata.unit_job_id = self.unit_job_id(func_name=func_name) - - # Prepare job - llm_job.llm_job_before_start(llm_engine=self.llm_engine) - - # Execute job - try: - result = await func(self, llm_job, *args, **kwargs) - except InstructorRetryException as exc: - raise LLMWorkerError( - f"LLM Worker error: Instructor failed after retry with llm '{self.llm_engine.tag}': {exc}\nLLMPrompt: {llm_job.llm_prompt.desc}" - ) from exc - - # Cleanup result - if hasattr(result, "_raw_response"): - delattr(result, "_raw_response") - - # Report job - llm_job.llm_job_after_complete() - if self.reporting_delegate: - self.reporting_delegate.report_inference_job(inference_job=llm_job) - - return result - - return cast(F, wrapper) diff --git a/pipelex/cogt/llm/llm_models/llm_deck.py b/pipelex/cogt/llm/llm_models/llm_deck.py index 42db1021c..dfe9c3265 100644 --- a/pipelex/cogt/llm/llm_models/llm_deck.py +++ b/pipelex/cogt/llm/llm_models/llm_deck.py @@ -158,8 +158,10 @@ def _validate_llm_setting(cls, llm_setting: LLMSetting, llm_model: LLMModel): if llm_model.max_tokens is not None and (llm_setting_max_tokens := llm_setting.max_tokens): if llm_setting_max_tokens > llm_model.max_tokens: raise LLMSettingsValidationError( - f"LLM setting '{llm_setting.llm_handle}' has a max_tokens of {llm_setting_max_tokens}, \ - which is greater than the model's max_tokens of 
{llm_model.max_tokens}" + ( + f"LLM setting '{llm_setting.llm_handle}' has a max_tokens of {llm_setting_max_tokens}, " + f"which is greater than the model's max_tokens of {llm_model.max_tokens}" + ) ) match llm_model.llm_family: case LLMFamily.O_SERIES: diff --git a/pipelex/cogt/llm/llm_models/llm_setting.py b/pipelex/cogt/llm/llm_models/llm_setting.py index 5e55d1dfe..ebb263f96 100644 --- a/pipelex/cogt/llm/llm_models/llm_setting.py +++ b/pipelex/cogt/llm/llm_models/llm_setting.py @@ -31,8 +31,10 @@ def validate_max_tokens(cls, value: Union[int, Literal["auto"], None]) -> Option @model_validator(mode="after") def validate_temperature(self) -> Self: if self.llm_handle.startswith("gemini") and self.temperature > 1: - error_msg = f"Gemini LLMs such as '{self.llm_handle}' support temperatures up to 2 but we normalize between 0 and 1, \ - so you can't set a temperature of {self.temperature}" + error_msg = ( + f"Gemini LLMs such as '{self.llm_handle}' support temperatures up to 2 but we normalize between 0 and 1, " + f"so you can't set a temperature of {self.temperature}" + ) raise LLMSettingsValidationError(error_msg) return self @@ -43,6 +45,12 @@ def make_llm_job_params(self) -> LLMJobParams: seed=None, ) + def desc(self) -> str: + return ( + f"LLMSetting(llm_handle={self.llm_handle}, temperature={self.temperature}, " + f"max_tokens={self.max_tokens}, prompting_target={self.prompting_target})" + ) + LLMSettingOrPresetId = Union[LLMSetting, str] diff --git a/pipelex/cogt/llm/llm_prompt.py b/pipelex/cogt/llm/llm_prompt.py index 3cca009ad..38aa757af 100644 --- a/pipelex/cogt/llm/llm_prompt.py +++ b/pipelex/cogt/llm/llm_prompt.py @@ -40,22 +40,41 @@ def validate_before_execution(self): @override def __str__(self) -> str: # return json_str(self, title="llm_prompt", is_spaced=True) - return self.desc + return self.desc() # return "test" - @property - def desc(self) -> str: + @override + def __repr__(self) -> str: + return self.desc() + + @override + def __format__(self, 
format_spec: str) -> str: + return self.desc() + + def desc(self, truncate_text_length: Optional[int] = None) -> str: description = "LLM Prompt:" - if self.system_text: - description += f""" -system_text: -{self.system_text} -""" - if self.user_text: - description += f""" -user_text: -{self.user_text} -""" + if truncate_text_length: + if self.system_text: + description += f""" + system_text: + {self.system_text[:truncate_text_length]} + """ + if self.user_text: + description += f""" + user_text: + {self.user_text[:truncate_text_length]} + """ + else: + if self.system_text: + description += f""" + system_text: + {self.system_text} + """ + if self.user_text: + description += f""" + user_text: + {self.user_text} + """ if self.user_images: user_images_desc: str = "\n".join([f" {image}" for image in self.user_images]) diff --git a/pipelex/cogt/llm/llm_worker_abstract.py b/pipelex/cogt/llm/llm_worker_abstract.py index 6fa4e56d4..23b097aba 100644 --- a/pipelex/cogt/llm/llm_worker_abstract.py +++ b/pipelex/cogt/llm/llm_worker_abstract.py @@ -1,9 +1,11 @@ from abc import ABC, abstractmethod from typing import Optional, Type +from instructor.exceptions import InstructorRetryException from typing_extensions import override -from pipelex.cogt.exceptions import LLMCapabilityError +from pipelex import log +from pipelex.cogt.exceptions import LLMCapabilityError, LLMCompletionError from pipelex.cogt.inference.inference_worker_abstract import InferenceWorkerAbstract from pipelex.cogt.llm.llm_job import LLMJob from pipelex.cogt.llm.llm_models.llm_engine import LLMEngine @@ -11,12 +13,6 @@ from pipelex.pipeline.job_metadata import UnitJobId from pipelex.reporting.reporting_protocol import ReportingProtocol from pipelex.tools.typing.pydantic_utils import BaseModelTypeVar -from pipelex.types import StrEnum - - -class LLMWorkerJobFuncName(StrEnum): - GEN_TEXT = "gen_text" - GEN_OBJECT = "gen_object" class LLMWorkerAbstract(InferenceWorkerAbstract, ABC): @@ -47,21 +43,11 @@ def __init__( 
def desc(self) -> str: return f"LLM Worker using:\n{self.llm_engine.desc}" - def unit_job_id(self, func_name: LLMWorkerJobFuncName) -> UnitJobId: - match func_name: - case LLMWorkerJobFuncName.GEN_TEXT: - return UnitJobId.LLM_GEN_TEXT - case LLMWorkerJobFuncName.GEN_OBJECT: - return UnitJobId.LLM_GEN_OBJECT - - def check_can_perform_job(self, llm_job: LLMJob, func_name: LLMWorkerJobFuncName): - match func_name: - case LLMWorkerJobFuncName.GEN_TEXT: - pass - case LLMWorkerJobFuncName.GEN_OBJECT: - if not self.llm_engine.is_gen_object_supported: - raise LLMCapabilityError(f"LLM Engine '{self.llm_engine.tag}' does not support object generation.") + def _check_can_perform_job(self, llm_job: LLMJob): + # This can be overridden by subclasses for specific checks + self._check_vision_support(llm_job=llm_job) + def _check_vision_support(self, llm_job: LLMJob): if llm_job.llm_prompt.user_images: if not self.llm_engine.llm_model.is_vision_supported: raise LLMCapabilityError(f"LLM Engine '{self.llm_engine.tag}' does not support vision.") @@ -71,17 +57,98 @@ def check_can_perform_job(self, llm_job: LLMJob, func_name: LLMWorkerJobFuncName if nb_images > max_prompt_images: raise LLMCapabilityError(f"LLM Engine '{self.llm_engine.tag}' does not accept that many images: {nb_images}.") - @abstractmethod async def gen_text( self, llm_job: LLMJob, ) -> str: - pass + log.debug("LLM Worker gen_text") + log.verbose(f"\n{self.llm_engine.desc}") + log.verbose(llm_job.params_desc) + + # Verify that the job is valid + llm_job.validate_before_execution() + + # Verify feasibility + self._check_can_perform_job(llm_job=llm_job) + + # TODO: Fix printing prompts that contain image bytes + # log.verbose(llm_job.llm_prompt.desc, title="llm_prompt") + + # metadata + llm_job.job_metadata.unit_job_id = UnitJobId.LLM_GEN_TEXT + + # Prepare job + llm_job.llm_job_before_start(llm_engine=self.llm_engine) + + result = await self._gen_text(llm_job=llm_job) + + # Cleanup result (Instructor adds the client's 
response as a _raw_response attribute, we don't want to pass it along) + if hasattr(result, "_raw_response"): + delattr(result, "_raw_response") + + # Report job + llm_job.llm_job_after_complete() + if self.reporting_delegate: + self.reporting_delegate.report_inference_job(inference_job=llm_job) + + return result @abstractmethod + async def _gen_text( + self, + llm_job: LLMJob, + ) -> str: + pass + async def gen_object( self, llm_job: LLMJob, schema: Type[BaseModelTypeVar], + ) -> BaseModelTypeVar: + log.debug("LLM Worker gen_object") + log.verbose(f"\n{self.llm_engine.desc}") + log.verbose(llm_job.params_desc) + + # Verify that the job is valid + llm_job.validate_before_execution() + + # Verify feasibility + if not self.llm_engine.is_gen_object_supported: + raise LLMCapabilityError(f"LLM Engine '{self.llm_engine.tag}' does not support object generation.") + self._check_can_perform_job(llm_job=llm_job) + + # TODO: Fix printing prompts that contain image bytes + # log.verbose(llm_job.llm_prompt.desc, title="llm_prompt") + + # metadata + llm_job.job_metadata.unit_job_id = UnitJobId.LLM_GEN_OBJECT + + # Prepare job + llm_job.llm_job_before_start(llm_engine=self.llm_engine) + + # Execute job + try: + result = await self._gen_object(llm_job=llm_job, schema=schema) + except InstructorRetryException as exc: + raise LLMCompletionError( + f"LLM Worker error: Instructor failed after retry with llm '{self.llm_engine.tag}': {exc}\nLLMPrompt: {llm_job.llm_prompt.desc}" + ) from exc + + # Cleanup result + if hasattr(result, "_raw_response"): + delattr(result, "_raw_response") + + # Report job + llm_job.llm_job_after_complete() + if self.reporting_delegate: + self.reporting_delegate.report_inference_job(inference_job=llm_job) + + return result + + @abstractmethod + async def _gen_object( + self, + llm_job: LLMJob, + schema: Type[BaseModelTypeVar], ) -> BaseModelTypeVar: pass diff --git a/pipelex/cogt/llm/llm_worker_factory.py b/pipelex/cogt/llm/llm_worker_factory.py index 
d21cf0f95..21ccb3edd 100644 --- a/pipelex/cogt/llm/llm_worker_factory.py +++ b/pipelex/cogt/llm/llm_worker_factory.py @@ -83,9 +83,11 @@ def make_llm_worker( raise MissingDependencyError( "anthropic", "anthropic", - "The anthropic SDK is required to use Anthropic models via the anthropic client. \ - However, you can use Anthropic models through bedrock directly by using the 'bedrock-anthropic-claude' llm family.\ - (eg: bedrock-anthropic-claude)", + ( + "The anthropic SDK is required to use Anthropic models via the anthropic client. " + "However, you can use Anthropic models through bedrock directly " + "by using the 'bedrock-anthropic-claude' llm family. (eg: bedrock-anthropic-claude)" + ), ) from exc from pipelex.plugins.anthropic.anthropic_factory import AnthropicFactory @@ -109,9 +111,11 @@ def make_llm_worker( raise MissingDependencyError( "mistralai", "mistral", - "The mistralai SDK is required to use Mistral models through the mistralai client. \ - However, you can use Mistral models through bedrock directly by using the 'bedrock-mistral' llm family. \ - (eg: bedrock-mistral-large)", + ( + "The mistralai SDK is required to use Mistral models through the mistralai client. " + "However, you can use Mistral models through bedrock directly " + "by using the 'bedrock-mistral' llm family. 
(eg: bedrock-mistral-large)" + ), ) from exc from pipelex.plugins.mistral.mistral_factory import MistralFactory diff --git a/pipelex/cogt/ocr/ocr_worker_abstract.py b/pipelex/cogt/ocr/ocr_worker_abstract.py index 1746af50d..d7211b4e0 100644 --- a/pipelex/cogt/ocr/ocr_worker_abstract.py +++ b/pipelex/cogt/ocr/ocr_worker_abstract.py @@ -1,8 +1,7 @@ from abc import abstractmethod -from functools import wraps -from typing import Any, Callable, Optional, TypeVar, cast +from typing import Optional -from typing_extensions import Awaitable, override +from typing_extensions import override from pipelex import log from pipelex.cogt.inference.inference_worker_abstract import InferenceWorkerAbstract @@ -12,24 +11,40 @@ from pipelex.pipeline.job_metadata import UnitJobId from pipelex.reporting.reporting_protocol import ReportingProtocol -F = TypeVar("F", bound=Callable[..., Awaitable[Any]]) +class OcrWorkerAbstract(InferenceWorkerAbstract): + def __init__( + self, + ocr_engine: OcrEngine, + reporting_delegate: Optional[ReportingProtocol] = None, + ): + InferenceWorkerAbstract.__init__(self, reporting_delegate=reporting_delegate) + self.ocr_engine = ocr_engine + + ######################################################### + # Instance methods + ######################################################### -def ocr_job_func(func: F) -> F: - @wraps(func) - async def wrapper( - self: Any, + @property + @override + def desc(self) -> str: + return f"Ocr Worker using:\n{self.ocr_engine.desc}" + + def _check_can_perform_job(self, ocr_job: OcrJob): + # This can be overridden by subclasses for specific checks + pass + + async def ocr_extract_pages( + self, ocr_job: OcrJob, - *args: Any, - **kwargs: Any, - ) -> Any: - log.debug(f"Working — {func.__name__} using:\n{self.ocr_engine.desc}") + ) -> OcrOutput: + log.debug(f"OCR Worker ocr_extract_pages:\n{self.ocr_engine.desc}") # Verify that the job is valid ocr_job.validate_before_execution() # Verify feasibility - 
self.check_can_perform_job(ocr_job=ocr_job) + self._check_can_perform_job(ocr_job=ocr_job) # TODO: check can generate object (where it will be appropriate) # metadata @@ -39,7 +54,7 @@ async def wrapper( ocr_job.ocr_job_before_start(ocr_engine=self.ocr_engine) # Execute job - result = await func(self, ocr_job, *args, **kwargs) + result = await self._ocr_extract_pages(ocr_job=ocr_job) # Report job ocr_job.ocr_job_after_complete() @@ -48,32 +63,8 @@ async def wrapper( return result - return cast(F, wrapper) - - -class OcrWorkerAbstract(InferenceWorkerAbstract): - def __init__( - self, - ocr_engine: OcrEngine, - reporting_delegate: Optional[ReportingProtocol] = None, - ): - InferenceWorkerAbstract.__init__(self, reporting_delegate=reporting_delegate) - self.ocr_engine = ocr_engine - - ######################################################### - # Instance methods - ######################################################### - - @property - @override - def desc(self) -> str: - return f"Ocr Worker using:\n{self.ocr_engine.desc}" - - def check_can_perform_job(self, ocr_job: OcrJob): - pass - @abstractmethod - async def ocr_extract_pages( + async def _ocr_extract_pages( self, ocr_job: OcrJob, ) -> OcrOutput: diff --git a/pipelex/config.py b/pipelex/config.py index 1870cf754..b2373df4a 100644 --- a/pipelex/config.py +++ b/pipelex/config.py @@ -1,10 +1,11 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, cast import shortuuid +from pydantic import Field, field_validator from pipelex.cogt.config_cogt import Cogt from pipelex.cogt.llm.llm_models.llm_prompting_target import LLMPromptingTarget -from pipelex.exceptions import PipelexError +from pipelex.exceptions import PipelexError, StaticValidationErrorType from pipelex.hub import get_required_config from pipelex.libraries.library_config import LibraryConfig from pipelex.pipeline.track.tracker_config import TrackerConfig @@ -13,6 +14,30 @@ from pipelex.tools.config.models import ConfigModel, 
ConfigRoot from pipelex.tools.log.log_config import LogConfig from pipelex.tools.templating.templating_models import PromptingStyle +from pipelex.types import StrEnum + + +class StaticValidationReaction(StrEnum): + RAISE = "raise" + LOG = "log" + IGNORE = "ignore" + + +class StaticValidationConfig(ConfigModel): + default_reaction: StaticValidationReaction = Field(strict=False) + reactions: Dict[StaticValidationErrorType, StaticValidationReaction] + + @field_validator("reactions", mode="before") + def validate_reactions(cls, value: Dict[str, str]) -> Dict[StaticValidationErrorType, StaticValidationReaction]: + the_dict = cast( + Dict[StaticValidationErrorType, StaticValidationReaction], + ConfigModel.transform_dict_str_to_enum( + input_dict=value, + key_enum_cls=StaticValidationErrorType, + value_enum_cls=StaticValidationReaction, + ), + ) + return the_dict class PipelexConfigError(PipelexError): @@ -23,6 +48,11 @@ class PipeRunConfig(ConfigModel): pipe_stack_limit: int +class DryRunConfig(ConfigModel): + apply_to_jinja2_rendering: bool + text_gen_truncate_length: int + + class GenericTemplateNames(ConfigModel): structure_from_preliminary_text_user: str structure_from_preliminary_text_system: str @@ -65,11 +95,13 @@ class Pipelex(ConfigModel): aws_config: AwsConfig library_config: LibraryConfig + static_validation_config: StaticValidationConfig generic_template_names: GenericTemplateNames tracker_config: TrackerConfig structure_config: StructureConfig prompting_config: PromptingConfig + dry_run_config: DryRunConfig pipe_run_config: PipeRunConfig reporting_config: ReportingConfig diff --git a/pipelex/core/concept.py b/pipelex/core/concept.py index 45dc4eba3..e5ba8f457 100644 --- a/pipelex/core/concept.py +++ b/pipelex/core/concept.py @@ -25,11 +25,11 @@ def validate_code_domain(self) -> Self: if not Concept.concept_str_contains_domain(self.code): raise ConceptCodeError(f"Code must contain a dot (.) 
for concept with code '{self.code}' and domain '{self.domain}'") - domain, code = Concept.extract_domain_and_concept_from_definition(concept_code=self.code) + domain, code = Concept.extract_domain_and_concept_from_str(concept_str=self.code) if domain != self.domain: raise ConceptDomainError( - f"Left part of code must match the domain field for concept with \ - code '{self.code}' and domain '{self.domain}': {domain} != {self.domain}" + f"Left part of code must match the domain field for concept with " + f"code '{self.code}' and domain '{self.domain}': {domain} != {self.domain}" ) self.validate_domain_syntax(domain, self.code, self.domain) @@ -41,16 +41,16 @@ def validate_code_domain(self) -> Self: def validate_domain_syntax(cls, domain: str, code: str, domain_field: str) -> None: if not re.match(r"^[a-z][a-z0-9_]*$", domain): raise ConceptDomainError( - f"Domain must be snake_case (lowercase letters, numbers, and underscores only) \ - for concept with code '{code}' and domain '{domain_field}': {domain}" + f"Domain must be snake_case (lowercase letters, numbers, and underscores only) " + f"for concept with code '{code}' and domain '{domain_field}': {domain}" ) @classmethod def validate_concept_code_syntax(cls, code: str, concept_code: str, domain_field: str) -> None: if not re.match(r"^[A-Z][a-zA-Z0-9]*$", code): raise ConceptCodeError( - f"Code must be PascalCase (letters and numbers only, starting with uppercase) \ - for concept with code '{concept_code}' and domain '{domain_field}': {code}" + f"Code must be PascalCase (letters and numbers only, starting with uppercase) " + f"for concept with code '{concept_code}' and domain '{domain_field}': {code}" ) @field_validator("refines") @@ -60,7 +60,7 @@ def validate_refines(cls, value: List[str]) -> List[str]: if not cls.concept_str_contains_domain(code): raise ConceptCodeError(f"Each inherited code must contain a dot (.), got: {code}") - domain, code = 
Concept.extract_domain_and_concept_from_definition(concept_code=code) + domain, code = Concept.extract_domain_and_concept_from_str(concept_str=code) cls.validate_concept_code_syntax(code=code, concept_code=code, domain_field=code) cls.validate_domain_syntax(domain=domain, code=code, domain_field=code) return value @@ -82,20 +82,26 @@ def is_valid_structure_class(cls, structure_class_name: str) -> bool: return False @classmethod - def extract_domain_and_concept_from_definition(cls, concept_code: str) -> Tuple[str, str]: - if "." in concept_code: - domain, concept = concept_code.split(".") - return domain, concept - raise ConceptError(f"No extraction of domain and concept from concept code '{concept_code}'") + def check_possible_concept_from_str(cls, concept_str: str): + parts = concept_str.split(".") + if len(parts) > 2: + raise ConceptError(f"Concept code '{concept_str}' contains more than one dot") + + @classmethod + def extract_domain_and_concept_from_str(cls, concept_str: str) -> Tuple[str, str]: + if "." 
in concept_str: + domain_code, concept_code = concept_str.split(".") + return domain_code, concept_code + raise ConceptError(f"No extraction of domain and concept from concept code '{concept_str}'") @classmethod def extract_concept_name_from_str(cls, concept_str: str) -> str: - _, concept = cls.extract_domain_and_concept_from_definition(concept_code=concept_str) + _, concept = cls.extract_domain_and_concept_from_str(concept_str=concept_str) return concept @classmethod def extract_domain_from_str(cls, concept_str: str) -> str: - domain, _ = cls.extract_domain_and_concept_from_definition(concept_code=concept_str) + domain, _ = cls.extract_domain_and_concept_from_str(concept_str=concept_str) return domain @classmethod diff --git a/pipelex/core/concept_library.py b/pipelex/core/concept_library.py index e4a69ef14..2005a8347 100644 --- a/pipelex/core/concept_library.py +++ b/pipelex/core/concept_library.py @@ -6,6 +6,7 @@ from pipelex import log from pipelex.core.concept import Concept from pipelex.core.concept_factory import ConceptFactory +from pipelex.core.concept_native import NativeConcept from pipelex.core.concept_provider_abstract import ConceptProviderAbstract from pipelex.exceptions import ConceptLibraryConceptNotFoundError, ConceptLibraryError @@ -19,26 +20,26 @@ def validate_with_libraries(self): for concept in self.root.values(): for domain_concept_code in concept.refines: if "." 
in domain_concept_code: - domain, concept_code = Concept.extract_domain_and_concept_from_definition(concept_code=domain_concept_code) + domain, concept_code = Concept.extract_domain_and_concept_from_str(concept_str=domain_concept_code) found_concept = self.root.get(f"{domain}.{concept_code}", None) if not found_concept: raise ConceptLibraryError( - f"Concept '{concept.code}' refines '{domain_concept_code}' but no concept \ - with the code '{concept_code}' and domain '{domain}' exists" + f"Concept '{concept.code}' refines '{domain_concept_code}' but no concept " + f"with the code '{concept_code}' and domain '{domain}' exists" ) else: current_domain = concept.domain found_concept = self.root.get(f"{current_domain}.{domain_concept_code}", None) if not found_concept: raise ConceptLibraryError( - f"Concept '{concept.code}' refines '{domain_concept_code}' but no concept \ - with the code '{domain_concept_code}' and domain '{current_domain}' exists" + f"Concept '{concept.code}' refines '{domain_concept_code}' but no concept " + f"with the code '{domain_concept_code}' and domain '{current_domain}' exists" ) if found_concept.domain != current_domain: raise ConceptLibraryError( - f"Concept '{concept.code}' refines '{domain_concept_code}' but the concept \ - exists in domain '{found_concept.domain}' and not in the same domain '{current_domain}'" + f"Concept '{concept.code}' refines '{domain_concept_code}' but the concept " + f"exists in domain '{found_concept.domain}' and not in the same domain '{current_domain}'" ) self.get_required_concept(concept_code=domain_concept_code) @@ -59,7 +60,7 @@ def list_concepts(self) -> List[Concept]: return list(self.root.values()) def _list_concept_names(self) -> List[str]: - return [Concept.extract_domain_and_concept_from_definition(c.code)[1] for c in self.list_concepts()] + return [Concept.extract_domain_and_concept_from_str(c.code)[1] for c in self.list_concepts()] @override def list_concepts_by_domain(self, domain: str) -> 
List[Concept]: @@ -87,6 +88,8 @@ def is_compatible(self, tested_concept: Concept, wanted_concept: Concept) -> boo @override def is_compatible_by_concept_code(self, tested_concept_code: str, wanted_concept_code: str) -> bool: + if wanted_concept_code == NativeConcept.ANYTHING.code: + return True tested_concept = self.get_required_concept(concept_code=tested_concept_code) wanted_concept = self.get_required_concept(concept_code=wanted_concept_code) if tested_concept.code == wanted_concept.code: @@ -109,7 +112,7 @@ def get_required_concept(self, concept_code: str) -> Concept: # TODO: replace this with a concept factory method make_implicit_concept return ConceptFactory.make_concept_from_definition( domain_code="implicit", - code=Concept.extract_domain_and_concept_from_definition(concept_code=concept_code)[1], + code=Concept.extract_domain_and_concept_from_str(concept_str=concept_code)[1], definition=concept_code, ) else: diff --git a/pipelex/core/concept_native.py b/pipelex/core/concept_native.py index d3cb46bba..0a1dab12d 100644 --- a/pipelex/core/concept_native.py +++ b/pipelex/core/concept_native.py @@ -21,6 +21,7 @@ class NativeConceptClass(StrEnum): # Exceptionally, we use an Enum here (and not our usual StrEnum) to avoid confusion with # the concept_code with must have the form "native.ConceptName" class NativeConcept(Enum): + ANYTHING = "Anything" DYNAMIC = "Dynamic" TEXT = "Text" IMAGE = "Image" @@ -57,6 +58,8 @@ def content_class_name(self) -> NativeConceptClass: return NativeConceptClass.DYNAMIC case NativeConcept.PAGE: return NativeConceptClass.PAGE + case NativeConcept.ANYTHING: + raise RuntimeError("NativeConcept.ANYTHING cannot be used as a content class name") def make_concept(self) -> Concept: definition: str @@ -77,6 +80,8 @@ def make_concept(self) -> Concept: definition = "A dynamic concept" case NativeConcept.PAGE: definition = "The content of a page of a document, comprising text and linked images as well as an optional page view image" + case 
NativeConcept.ANYTHING: + raise RuntimeError("NativeConcept.ANYTHING cannot be used as a concept") return Concept( code=self.code, @@ -89,5 +94,7 @@ def make_concept(self) -> Concept: def all_concepts(cls) -> List[Concept]: concepts: List[Concept] = [] for code in cls: + if code == cls.ANYTHING: + continue concepts.append(code.make_concept()) return concepts diff --git a/pipelex/core/pipe_abstract.py b/pipelex/core/pipe_abstract.py index df3d60069..a80be5d07 100644 --- a/pipelex/core/pipe_abstract.py +++ b/pipelex/core/pipe_abstract.py @@ -1,8 +1,9 @@ from abc import ABC, abstractmethod from typing import Optional, Set, Type -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.core.pipe_output import PipeOutput from pipelex.core.pipe_run_params import PipeRunParams from pipelex.core.working_memory import WorkingMemory @@ -17,7 +18,8 @@ class PipeAbstract(ABC, BaseModel): domain: str definition: Optional[str] = None - input_concept_code: Optional[str] = None + # TODO: support auto (implicit) input, it makes sense for pipe controllers + inputs: PipeInputSpec = Field(default_factory=PipeInputSpec) output_concept_code: str @property @@ -34,18 +36,10 @@ def pipe_dependencies(self) -> Set[str]: def concept_dependencies(self) -> Set[str]: required_concepts = set([self.output_concept_code]) - if self.input_concept_code: - required_concepts.add(self.input_concept_code) + required_concepts.update(self.inputs.concepts) return required_concepts # Required variables - - @property - def required_input_concept_code(self) -> str: - if self.input_concept_code is None: - raise RuntimeError("input_concept_code is required") - return self.input_concept_code - def required_variables(self) -> Set[str]: return set() diff --git a/pipelex/core/pipe_blueprint.py b/pipelex/core/pipe_blueprint.py index 3e000242a..5992279c3 100644 --- a/pipelex/core/pipe_blueprint.py +++ 
b/pipelex/core/pipe_blueprint.py @@ -12,14 +12,18 @@ class PipeBlueprint(StructuredContent): model_config = ConfigDict(extra="forbid") definition: Optional[str] = None - input: Optional[str] = None + inputs: Optional[Dict[str, str]] = None output: str domain: str @model_validator(mode="after") def add_domain_prefix(self) -> Self: - if self.input and "." not in self.input: - self.input = f"{self.domain}.{self.input}" + # if self.input and "." not in self.input: + # self.input = f"{self.domain}.{self.input}" + if self.inputs: + for input_name, input_concept_code in self.inputs.items(): + if "." not in input_concept_code: + self.inputs[input_name] = f"{self.domain}.{input_concept_code}" if self.output and "." not in self.output: self.output = f"{self.domain}.{self.output}" return self @@ -30,11 +34,11 @@ def _add_native_prefix_if_needed(cls, value: str) -> str: return f"native.{value}" return value - @field_validator("input") + @field_validator("inputs") @classmethod - def validate_input(cls, value: Optional[str]) -> Optional[str]: + def validate_inputs(cls, value: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: if value: - return cls._add_native_prefix_if_needed(value) + return {name: cls._add_native_prefix_if_needed(concept_code) for name, concept_code in value.items()} return value @field_validator("output") diff --git a/pipelex/core/pipe_input_details.py b/pipelex/core/pipe_input_details.py new file mode 100644 index 000000000..d3814d9e9 --- /dev/null +++ b/pipelex/core/pipe_input_details.py @@ -0,0 +1,80 @@ +from typing import Callable, Dict, List, Set, Tuple + +from pydantic import Field, RootModel, field_validator + +from pipelex import log +from pipelex.core.concept import Concept +from pipelex.exceptions import ConceptError, PipeInputSpecError + +PipeInputDetailsRoot = Dict[str, str] + + +class PipeInputDetails(RootModel[PipeInputDetailsRoot]): + root: PipeInputDetailsRoot = Field(default_factory=dict) + + @field_validator("root", mode="wrap") + 
@classmethod + def validate_concept_codes(cls, input_value: Dict[str, str], handler: Callable[[Dict[str, str]], Dict[str, str]]) -> Dict[str, str]: + # First let Pydantic handle the basic type validation + validated_dict: Dict[str, str] = handler(input_value) + + # Now we can transform and validate the keys and values + transformed_dict: Dict[str, str] = {} + for required_input, concept_str in validated_dict.items(): + # in case of sub-attribute, the variable name is the object name, before the 1st dot + transformed_key: str = required_input.split(".", 1)[0] + if transformed_key != required_input: + log.warning(f"Sub-attribute {required_input} detected, using {transformed_key} as variable name") + + # Validate concept_code + try: + Concept.check_possible_concept_from_str(concept_str=concept_str) + except ConceptError as exc: + raise PipeInputSpecError(f"Invalid concept code: {concept_str}") from exc + + if transformed_key in transformed_dict and transformed_dict[transformed_key] != concept_str: + log.warning( + f"Variable {transformed_key} already exists with a different concept code: {transformed_dict[transformed_key]} -> {concept_str}" + ) + transformed_dict[transformed_key] = concept_str + + return transformed_dict + + def set_default_domain(self, domain: str): + for input_name, input_concept_code in self.root.items(): + if "." 
not in input_concept_code: + self.root[input_name] = f"{domain}.{input_concept_code}" + + def get(self, variable_name: str) -> str: + return self.root[variable_name] + + def add_requirement(self, variable_name: str, concept_code: str): + self.root[variable_name] = concept_code + + @property + def items(self) -> List[Tuple[str, str]]: + return list(self.root.items()) + + @property + def concepts(self) -> Set[str]: + return set(self.root.values()) + + # @property + # def variables(self) -> List[str]: + # return list(self.root.keys()) + + @property + def required_names(self) -> List[str]: + the_required_names: List[str] = [] + for requirement_expression in self.root.keys(): + required_variable_name = requirement_expression.split(".", 1)[0] + the_required_names.append(required_variable_name) + return the_required_names + + @property + def detailed_requirements(self) -> List[Tuple[str, str, str]]: + the_requirements: List[Tuple[str, str, str]] = [] + for requirement_expression, concept_code in self.root.items(): + required_variable_name = requirement_expression.split(".", 1)[0] + the_requirements.append((required_variable_name, requirement_expression, concept_code)) + return the_requirements diff --git a/pipelex/core/pipe_input_spec.py b/pipelex/core/pipe_input_spec.py new file mode 100644 index 000000000..e41a68625 --- /dev/null +++ b/pipelex/core/pipe_input_spec.py @@ -0,0 +1,88 @@ +from typing import Callable, Dict, List, Set, Tuple + +from pydantic import Field, RootModel, field_validator + +from pipelex import log +from pipelex.core.concept import Concept +from pipelex.exceptions import ConceptError, PipeInputNotFoundError, PipeInputSpecError + +PipeInputSpecRoot = Dict[str, str] + + +class PipeInputSpec(RootModel[PipeInputSpecRoot]): + """ + A PipeInputSpec is a dictionary of variable names and their corresponding concept codes. + It's meant to hold the required input variables declared by a pipe. 
+ """ + + root: PipeInputSpecRoot = Field(default_factory=dict) + + @field_validator("root", mode="wrap") + @classmethod + def validate_concept_codes(cls, input_value: Dict[str, str], handler: Callable[[Dict[str, str]], Dict[str, str]]) -> Dict[str, str]: + # First let Pydantic handle the basic type validation + validated_dict: Dict[str, str] = handler(input_value) + + # Now we can transform and validate the keys and values + transformed_dict: Dict[str, str] = {} + for required_input, concept_str in validated_dict.items(): + # in case of sub-attribute, the variable name is the object name, before the 1st dot + transformed_key: str = required_input.split(".", 1)[0] + if transformed_key != required_input: + log.warning(f"Sub-attribute {required_input} detected, using {transformed_key} as variable name") + + # Validate concept_code + try: + Concept.check_possible_concept_from_str(concept_str=concept_str) + except ConceptError as exc: + raise PipeInputSpecError(f"Invalid concept code: {concept_str}") from exc + + if transformed_key in transformed_dict and transformed_dict[transformed_key] != concept_str: + log.warning( + f"Variable {transformed_key} already exists with a different concept code: {transformed_dict[transformed_key]} -> {concept_str}" + ) + transformed_dict[transformed_key] = concept_str + + return transformed_dict + + def set_default_domain(self, domain: str): + for input_name, input_concept_code in self.root.items(): + if "." 
not in input_concept_code: + self.root[input_name] = f"{domain}.{input_concept_code}" + + def get_required_concept_code(self, variable_name: str) -> str: + concept_code = self.root.get(variable_name) + if not concept_code: + raise PipeInputNotFoundError(f"Variable '{variable_name}' not found in input spec") + return concept_code + + def add_requirement(self, variable_name: str, concept_code: str): + self.root[variable_name] = concept_code + + @property + def items(self) -> List[Tuple[str, str]]: + return list(self.root.items()) + + @property + def concepts(self) -> Set[str]: + return set(self.root.values()) + + @property + def variables(self) -> List[str]: + return list(self.root.keys()) + + @property + def required_names(self) -> List[str]: + the_required_names: List[str] = [] + for requirement_expression in self.root.keys(): + required_variable_name = requirement_expression.split(".", 1)[0] + the_required_names.append(required_variable_name) + return the_required_names + + @property + def detailed_requirements(self) -> List[Tuple[str, str, str]]: + the_requirements: List[Tuple[str, str, str]] = [] + for requirement_expression, concept_code in self.root.items(): + required_variable_name = requirement_expression.split(".", 1)[0] + the_requirements.append((required_variable_name, requirement_expression, concept_code)) + return the_requirements diff --git a/pipelex/core/pipe_library.py b/pipelex/core/pipe_library.py index 63c2cc31a..13820fcfc 100644 --- a/pipelex/core/pipe_library.py +++ b/pipelex/core/pipe_library.py @@ -33,8 +33,7 @@ def validate_with_libraries(self): def add_new_pipe(self, pipe: PipeAbstract): name = pipe.code - if pipe.input_concept_code and "." not in pipe.input_concept_code: - pipe.input_concept_code = f"{pipe.domain}.{pipe.input_concept_code}" + pipe.inputs.set_default_domain(domain=pipe.domain) if pipe.output_concept_code and "." 
not in pipe.output_concept_code: pipe.output_concept_code = f"{pipe.domain}.{pipe.output_concept_code}" if name in self.root: diff --git a/pipelex/core/pipe_run_params.py b/pipelex/core/pipe_run_params.py index 64e70db4e..20df46d41 100644 --- a/pipelex/core/pipe_run_params.py +++ b/pipelex/core/pipe_run_params.py @@ -13,6 +13,11 @@ class PipeRunParamKey(StrEnum): NB_OUTPUT = "_nb_output" +class PipeRunMode(StrEnum): + LIVE = "live" + DRY = "dry" + + PipeOutputMultiplicity = Union[bool, int] @@ -102,6 +107,7 @@ def make_default(cls) -> "BatchParams": class PipeRunParams(BaseModel): + run_mode: PipeRunMode = PipeRunMode.LIVE final_stuff_code: Optional[str] = None output_multiplicity: Optional[PipeOutputMultiplicity] = None dynamic_output_concept_code: Optional[str] = None @@ -120,6 +126,12 @@ def validate_param_keys(cls, v: Dict[str, Any]) -> Dict[str, Any]: raise ValueError(f"Parameter key '{key}' must start with an underscore '_'") return v + def make_deep_copy(self) -> Self: + return self.model_copy(deep=True) + + def deep_copy_with_final_stuff_code(self, final_stuff_code: str) -> Self: + return self.model_copy(deep=True, update={"final_stuff_code": final_stuff_code}) + @classmethod def copy_by_injecting_multiplicity( cls, diff --git a/pipelex/core/pipe_run_params_factory.py b/pipelex/core/pipe_run_params_factory.py index 351976bca..79abb8f3e 100644 --- a/pipelex/core/pipe_run_params_factory.py +++ b/pipelex/core/pipe_run_params_factory.py @@ -1,13 +1,14 @@ from typing import Any, Dict, Optional from pipelex.config import get_config -from pipelex.core.pipe_run_params import BatchParams, PipeOutputMultiplicity, PipeRunParams +from pipelex.core.pipe_run_params import BatchParams, PipeOutputMultiplicity, PipeRunMode, PipeRunParams class PipeRunParamsFactory: @classmethod def make_run_params( cls, + pipe_run_mode: PipeRunMode = PipeRunMode.LIVE, pipe_stack_limit: Optional[int] = None, output_multiplicity: Optional[PipeOutputMultiplicity] = None, 
dynamic_output_concept_code: Optional[str] = None, @@ -16,6 +17,7 @@ def make_run_params( ) -> PipeRunParams: pipe_stack_limit = pipe_stack_limit or get_config().pipelex.pipe_run_config.pipe_stack_limit return PipeRunParams( + run_mode=pipe_run_mode, pipe_stack_limit=pipe_stack_limit, output_multiplicity=output_multiplicity, dynamic_output_concept_code=dynamic_output_concept_code, diff --git a/pipelex/core/working_memory.py b/pipelex/core/working_memory.py index 4d1da8a97..ad2393aec 100644 --- a/pipelex/core/working_memory.py +++ b/pipelex/core/working_memory.py @@ -20,11 +20,16 @@ TextAndImagesContent, TextContent, ) -from pipelex.exceptions import WorkingMemoryError, WorkingMemoryNotFoundError, WorkingMemoryStuffNotFoundError, WorkingMemoryTypeError +from pipelex.exceptions import ( + WorkingMemoryConsistencyError, + WorkingMemoryStuffAttributeNotFoundError, + WorkingMemoryStuffNotFoundError, + WorkingMemoryTypeError, +) from pipelex.tools.misc.json_utils import save_as_json_to_path MAIN_STUFF_NAME = "main_stuff" -BATCH_ITEM_STUFF_NAME = "_batch_item" +BATCH_ITEM_STUFF_NAME = "BATCH_ITEM" StuffDict = Dict[str, Stuff] StuffArtefactDict = Dict[str, StuffArtefact] @@ -92,11 +97,17 @@ def get_stuff(self, name: str) -> Stuff: if alias := self.aliases.get(name): stuff = self.root.get(alias) if stuff is None: - raise WorkingMemoryStuffNotFoundError(f"Alias '{alias}' points to a non-existent stuff '{name}'") + raise WorkingMemoryStuffNotFoundError( + variable_name=alias, + message=f"Alias '{alias}' points to a non-existent stuff '{name}'", + ) return stuff - raise WorkingMemoryStuffNotFoundError(f"Stuff '{name}' not found in working memory, valid keys are: {self.list_keys()}") + raise WorkingMemoryStuffNotFoundError( + variable_name=name, + message=f"Stuff '{name}' not found in working memory, valid keys are: {self.list_keys()}", + ) - def get_stuff_attribute(self, name: str, wanted_type: Optional[Type[Any]] = None) -> Any: + def get_stuff_or_attribute(self, name: str, 
wanted_type: Optional[Type[Any]] = None) -> Any: if "." in name: parts = name.split(".", 1) # Split only at the first dot base_name = parts[0] @@ -107,17 +118,26 @@ def get_stuff_attribute(self, name: str, wanted_type: Optional[Type[Any]] = None try: stuff_content = attrgetter(attr_path_str)(base_stuff.content) except AttributeError as exc: - raise WorkingMemoryNotFoundError(f"Stuff attribute not found in attribute path '{name}': {exc}") from exc + raise WorkingMemoryStuffAttributeNotFoundError( + variable_name=name, + message=f"Stuff attribute not found in attribute path '{name}': {exc}", + ) from exc if wanted_type is not None and not isinstance(stuff_content, wanted_type): - raise WorkingMemoryTypeError(f"Content at '{name}' is of type {type(stuff_content).__name__}, it should be {wanted_type.__name__}") + raise WorkingMemoryTypeError( + variable_name=name, + message=f"Content at '{name}' is of type {type(stuff_content).__name__}, it should be {wanted_type.__name__}", + ) return stuff_content else: content = self.get_stuff(name).content if wanted_type is not None and not isinstance(content, wanted_type): - raise WorkingMemoryTypeError(f"Content of '{name}' is of type {type(content).__name__}, it should be {wanted_type.__name__}") + raise WorkingMemoryTypeError( + variable_name=name, + message=f"Content of '{name}' is of type {type(content).__name__}, it should be {wanted_type.__name__}", + ) return content @@ -127,16 +147,12 @@ def get_stuffs(self, names: Set[str]) -> List[Stuff]: the_stuffs.append(self.get_stuff(name=name)) return the_stuffs - def get_stuff_by_stuff_code(self, stuff_code: str) -> Stuff: - matching_stuffs: List[Stuff] = [] - for stuff in self.root.values(): - if stuff.concept_code == stuff_code: - matching_stuffs.append(stuff) - if len(matching_stuffs) == 0: - raise WorkingMemoryError(f"Stuff code '{stuff_code}' not found in working memory") - elif len(matching_stuffs) > 1: - raise WorkingMemoryError(f"Stuff code '{stuff_code}' is used by 
multiple stuffs: {matching_stuffs}") - return matching_stuffs[0] + def get_existing_stuffs(self, names: Set[str]) -> List[Stuff]: + the_stuffs: List[Stuff] = [] + for name in names: + if stuff := self.get_optional_stuff(name=name): + the_stuffs.append(stuff) + return the_stuffs def is_stuff_code_used(self, stuff_code: str) -> bool: for stuff in self.root.values(): @@ -160,7 +176,7 @@ def set_stuff(self, name: str, stuff: Stuff): def add_new_stuff(self, name: str, stuff: Stuff, aliases: Optional[List[str]] = None): log.debug(f"Adding new stuff '{name}' to WorkingMemory with aliases: {aliases}") if self.is_stuff_code_used(stuff_code=stuff.stuff_code): - raise WorkingMemoryError(f"Stuff code '{stuff.stuff_code}' is already used by another stuff") + raise WorkingMemoryConsistencyError(f"Stuff code '{stuff.stuff_code}' is already used by another stuff") if name in self.root or name in self.aliases: existing_stuff = self.get_stuff(name=name) if existing_stuff == stuff: @@ -191,16 +207,16 @@ def set_new_main_stuff(self, stuff: Stuff, name: Optional[str] = None): def set_alias(self, alias: str, target: str) -> None: """Add an alias pointing to a target name.""" if alias == target: - raise WorkingMemoryError(f"Cannot create alias '{alias}' pointing to itself") + raise WorkingMemoryConsistencyError(f"Cannot create alias '{alias}' pointing to itself") if target not in self.root: - raise WorkingMemoryError(f"Cannot create alias to non-existent target '{target}'") + raise WorkingMemoryConsistencyError(f"Cannot create alias to non-existent target '{target}'") log.debug(f"Setting alias '{alias}' pointing to target '{target}'") self.aliases[alias] = target def add_alias(self, alias: str, target: str) -> None: """Add an alias pointing to a target name.""" if alias in self.root: - raise WorkingMemoryError(f"Cannot add alias '{alias}' as it already exists") + raise WorkingMemoryConsistencyError(f"Cannot add alias '{alias}' as it already exists") self.set_alias(alias=alias, 
target=target) log.debug(f"Added alias '{alias}' pointing to target '{target}'") diff --git a/pipelex/core/working_memory_factory.py b/pipelex/core/working_memory_factory.py index e3b8d87e3..a95765ca4 100644 --- a/pipelex/core/working_memory_factory.py +++ b/pipelex/core/working_memory_factory.py @@ -7,7 +7,7 @@ from pipelex.core.stuff_content import ImageContent, PDFContent, TextContent from pipelex.core.stuff_factory import StuffBlueprint, StuffFactory from pipelex.core.working_memory import MAIN_STUFF_NAME, StuffDict, WorkingMemory -from pipelex.exceptions import WorkingMemoryError +from pipelex.exceptions import WorkingMemoryFactoryError from pipelex.tools.misc.json_utils import load_json_dict_from_path @@ -69,7 +69,7 @@ def make_from_single_blueprint(cls, blueprint: StuffBlueprint) -> WorkingMemory: def make_from_single_stuff(cls, stuff: Stuff) -> WorkingMemory: name = stuff.stuff_name if not name: - raise WorkingMemoryError(f"Cannot make_from_single_stuff because stuff has no name: {stuff}") + raise WorkingMemoryFactoryError(f"Cannot make_from_single_stuff because stuff has no name: {stuff}") return cls.make_from_stuff_and_name(stuff=stuff, name=name) @classmethod @@ -86,7 +86,7 @@ def make_from_multiple_stuffs( if is_ignore_unnamed: continue else: - raise WorkingMemoryError(f"Stuff {stuff} has no name") + raise WorkingMemoryFactoryError(f"Stuff {stuff} has no name") stuff_dict[name] = stuff aliases: Dict[str, str] = {} if stuff_dict: diff --git a/pipelex/exceptions.py b/pipelex/exceptions.py index 715f3b6a1..613ad32d4 100644 --- a/pipelex/exceptions.py +++ b/pipelex/exceptions.py @@ -1,59 +1,119 @@ +from typing import List, Optional + from click import ClickException +from typing_extensions import override from pipelex.tools.exceptions import RootException +from pipelex.types import StrEnum class PipelexError(RootException): pass -class PipelexCLIError(PipelexError, ClickException): - """Raised when there's an error in CLI usage or operation.""" +class 
StaticValidationErrorType(StrEnum): + MISSING_INPUT_VARIABLE = "missing_input_variable" + EXTRANEOUS_INPUT_VARIABLE = "extraneous_input_variable" + INADEQUATE_INPUT_CONCEPT = "inadequate_input_concept" + TOO_MANY_CANDIDATE_INPUTS = "too_many_candidate_inputs" + + +class StaticValidationError(Exception): + def __init__( + self, + error_type: StaticValidationErrorType, + domain_code: str, + pipe_code: Optional[str] = None, + variable_names: Optional[List[str]] = None, + provided_concept_code: Optional[str] = None, + file_path: Optional[str] = None, + explanation: Optional[str] = None, + ): + self.error_type = error_type + self.domain_code = domain_code + self.pipe_code = pipe_code + self.variable_names = variable_names + self.provided_concept_code = provided_concept_code + self.file_path = file_path + self.explanation = explanation + super().__init__() + + def desc(self) -> str: + msg = f"{self.error_type} • domain='{self.domain_code}'" + if self.pipe_code: + msg += f" • pipe='{self.pipe_code}'" + if self.variable_names: + msg += f" • variable='{self.variable_names}'" + if self.provided_concept_code: + msg += f" • provided_concept_code='{self.provided_concept_code}'" + if self.file_path: + msg += f" • file='{self.file_path}'" + if self.explanation: + msg += f" • explanation='{self.explanation}'" + return msg + + @override + def __str__(self) -> str: + return self.desc() + + +class WorkingMemoryFactoryError(PipelexError): + pass + +class WorkingMemoryError(PipelexError): pass -class PipelexConfigError(PipelexError): +class WorkingMemoryConsistencyError(WorkingMemoryError): pass -class PipelexSetupError(PipelexError): +class WorkingMemoryVariableError(WorkingMemoryError): + def __init__(self, variable_name: str, message: str, *args: object, **kwargs: object) -> None: + self.variable_name = variable_name + super().__init__(message, *args, **kwargs) + + +class WorkingMemoryTypeError(WorkingMemoryVariableError): pass -class ClientAuthenticationError(PipelexError): +class 
WorkingMemoryStuffAttributeNotFoundError(WorkingMemoryVariableError): pass -class DomainDefinitionError(PipelexError): +class WorkingMemoryStuffNotFoundError(WorkingMemoryVariableError): pass -class DomainLibraryError(PipelexError): +class PipelexCLIError(PipelexError, ClickException): + """Raised when there's an error in CLI usage or operation.""" + pass -class ConceptLibraryError(PipelexError): +class PipelexConfigError(PipelexError): pass -class ConceptLibraryConceptNotFoundError(PipelexError): +class PipelexSetupError(PipelexError): pass -class ConceptFactoryError(PipelexError): +class ClientAuthenticationError(PipelexError): pass -class PipeLibraryError(PipelexError): +class DomainDefinitionError(PipelexError): pass -class PipeLibraryPipeNotFoundError(PipelexError): +class ConceptLibraryConceptNotFoundError(PipelexError): pass -class PipeFactoryError(PipelexError): +class ConceptFactoryError(PipelexError): pass @@ -61,27 +121,35 @@ class LibraryError(PipelexError): pass -class LibraryParsingError(PipelexError): +class DomainLibraryError(LibraryError): pass -class PipeDefinitionError(PipelexError): +class ConceptLibraryError(LibraryError): pass -class WorkingMemoryError(PipelexError): +class PipeLibraryError(LibraryError): pass -class WorkingMemoryTypeError(WorkingMemoryError): +class PipeLibraryPipeNotFoundError(PipeLibraryError): pass -class WorkingMemoryNotFoundError(WorkingMemoryError): +class PipeFactoryError(PipelexError): pass -class WorkingMemoryStuffNotFoundError(WorkingMemoryNotFoundError): +class LibraryParsingError(LibraryError): + pass + + +class PipeDefinitionError(PipelexError): + pass + + +class UnexpectedPipeDefinitionError(PipeDefinitionError): pass @@ -169,3 +237,15 @@ class ConceptDomainError(ConceptError): class PipelineManagerNotFoundError(PipelexError): pass + + +class PipeInputSpecError(PipelexError): + pass + + +class PipeInputNotFoundError(PipelexError): + pass + + +class PipeInputDetailsError(PipelexError): + pass diff --git 
a/pipelex/libraries/library_manager.py b/pipelex/libraries/library_manager.py index 25afeea45..1871e70ff 100644 --- a/pipelex/libraries/library_manager.py +++ b/pipelex/libraries/library_manager.py @@ -16,7 +16,14 @@ from pipelex.core.pipe_abstract import PipeAbstract from pipelex.core.pipe_blueprint import PipeSpecificFactoryProtocol from pipelex.core.pipe_library import PipeLibrary -from pipelex.exceptions import ConceptLibraryError, LibraryError, LibraryParsingError, PipeFactoryError, PipeLibraryError +from pipelex.exceptions import ( + ConceptLibraryError, + LibraryError, + LibraryParsingError, + PipeFactoryError, + PipeLibraryError, + StaticValidationError, +) from pipelex.libraries.library_config import LibraryConfig from pipelex.tools.misc.file_utils import find_files_in_dir from pipelex.tools.misc.json_utils import deep_update @@ -34,6 +41,14 @@ class LibraryComponent(StrEnum): CONCEPT = "concept" PIPE = "pipe" + @property + def error_class(self) -> Type[LibraryError]: + match self: + case LibraryComponent.CONCEPT: + return ConceptLibraryError + case LibraryComponent.PIPE: + return PipeLibraryError + class LibraryManager: allowed_root_attributes: ClassVar[List[str]] = [ @@ -87,7 +102,6 @@ def load_deck(self) -> LLMDeck: raise LLMDeckNotFoundError(f"LLM deck path `{llm_deck_path}` not found. 
Please run `pipelex init-libraries` to create it.") llm_deck_dict = load_toml_from_path(path=llm_deck_path) log.debug(f"Loaded LLM deck from {llm_deck_path}") - log.verbose(llm_deck_dict) deep_update(full_llm_deck_dict, llm_deck_dict) self.llm_deck = LLMDeck.model_validate(full_llm_deck_dict) @@ -114,7 +128,7 @@ def _load_combo_libraries(self, library_paths: List[str]): library_name = toml_path.stem domain_code = library_dict.get("domain") if domain_code is None: - raise LibraryParsingError(f"Library '{library_name}' has no domain set") + raise LibraryParsingError(f"Error loading library '{library_name}' which has no domain set at '{toml_path}'") domain_definition = library_dict.get("definition") if domain_definition is None: # we skip the domain without definition, it must be defined one and only one time in the domain library @@ -138,8 +152,8 @@ def _load_combo_libraries(self, library_paths: List[str]): library_name = toml_path.stem try: self._load_library_dict(library_name=library_name, library_dict=library_dict, component_type=LibraryComponent.CONCEPT) - except LibraryParsingError as exc: - raise LibraryError(f"Error parsing library '{library_name}' at '{toml_path}': {exc}") from exc + except ConceptLibraryError as exc: - raise LibraryError(f"Error loading concepts from library '{library_name}' at '{toml_path}': {exc}") from exc nb_concepts_loaded = len(self.concept_library.root) - nb_concepts_before log.verbose(f"Loaded {nb_concepts_loaded} concepts from '{toml_path.name}'") @@ -148,22 +162,40 @@ def _load_combo_libraries(self, library_paths: List[str]): nb_pipes_before = len(self.pipe_library.root) library_dict = load_toml_from_path(path=str(toml_path)) library_name = toml_path.stem - self._load_library_dict(library_name=library_name, library_dict=library_dict, component_type=LibraryComponent.PIPE) + try: + self._load_library_dict(library_name=library_name, library_dict=library_dict, component_type=LibraryComponent.PIPE) + except StaticValidationError as
static_validation_error: + static_validation_error.file_path = str(toml_path) + log.error(static_validation_error.desc()) + raise static_validation_error + except PipeLibraryError as pipe_library_error: + raise LibraryError( + f"Error loading pipes from library '{library_name}' at '{toml_path}': {pipe_library_error}" + ) from pipe_library_error nb_pipes_loaded = len(self.pipe_library.root) - nb_pipes_before log.verbose(f"Loaded {nb_pipes_loaded} pipes from '{toml_path.name}'") - def _load_library_dict(self, library_name: str, library_dict: Dict[str, Any], component_type: str): + def _load_library_dict(self, library_name: str, library_dict: Dict[str, Any], component_type: LibraryComponent): if domain_code := library_dict.pop("domain", None): if not self.domain_library.get_domain(domain_code=domain_code): raise LibraryParsingError( f"Domain '{domain_code}' is has not been defined in the domain libraryn make sure it has exactlyone definition" ) # domain is set at the root of the library - self._load_library_components_from_recursive_dict(domain_code=domain_code, recursive_dict=library_dict, component_type=component_type) + self._load_library_components_from_recursive_dict( + domain_code=domain_code, + recursive_dict=library_dict, + component_type=component_type, + ) else: raise LibraryParsingError(f"Library '{library_name}' has no domain set") - def _load_library_components_from_recursive_dict(self, domain_code: str, recursive_dict: Dict[str, Any], component_type: str): + def _load_library_components_from_recursive_dict( + self, + domain_code: str, + recursive_dict: Dict[str, Any], + component_type: LibraryComponent, + ): for key, obj in recursive_dict.items(): # root of domain if not isinstance(obj, dict): @@ -176,17 +208,12 @@ def _load_library_components_from_recursive_dict(self, domain_code: str, recursi # definitions within the domain obj_dict: Dict[str, Any] = obj if key == component_type: - try: - if key == LibraryComponent.CONCEPT: - 
self._load_concepts(domain_code=domain_code, obj_dict=obj_dict) - elif key == LibraryComponent.PIPE: - self._load_pipes(domain_code=domain_code, obj_dict=obj_dict) - else: - continue - except ValidationError as exc: - error_msg = format_pydantic_validation_error(exc) - error_class = ConceptLibraryError if component_type == LibraryComponent.CONCEPT else PipeLibraryError - raise error_class(f"Error loading a {component_type} from domain '{domain_code}' because of: {error_msg}") from exc + if key == LibraryComponent.CONCEPT: + self._load_concepts(domain_code=domain_code, obj_dict=obj_dict) + elif key == LibraryComponent.PIPE: + self._load_pipes(domain_code=domain_code, obj_dict=obj_dict) + else: + continue elif key not in [LibraryComponent.CONCEPT, LibraryComponent.PIPE]: # Not a concept but a subdomain self._load_library_components_from_recursive_dict(domain_code=domain_code, recursive_dict=obj_dict, component_type=component_type) @@ -203,7 +230,13 @@ def _load_concepts(self, domain_code: str, obj_dict: Dict[str, Any]): elif isinstance(concept_obj, dict): # blueprint dict definition concept_obj_dict: Dict[str, Any] = concept_obj - concept_from_dict = ConceptFactory.make_from_details_dict(domain_code=domain_code, code=concept_code, details_dict=concept_obj_dict) + try: + concept_from_dict = ConceptFactory.make_from_details_dict( + domain_code=domain_code, code=concept_code, details_dict=concept_obj_dict + ) + except ValidationError as exc: + error_msg = format_pydantic_validation_error(exc) + raise ConceptLibraryError(f"Error loading concept '{concept_code}' because of: {error_msg}") from exc self.concept_library.add_new_concept(concept=concept_from_dict) else: raise ConceptLibraryError(f"Unexpected type for concept_code '{concept_code}' in domain '{domain_code}': {type(concept_obj)}") @@ -215,11 +248,15 @@ def _load_pipes(self, domain_code: str, obj_dict: Dict[str, Any]): pass elif isinstance(pipe_obj, dict): pipe_obj_dict: Dict[str, Any] = pipe_obj.copy() - pipe = 
LibraryManager.make_pipe_from_details_dict( - domain_code=domain_code, - pipe_code=pipe_code, - details_dict=pipe_obj_dict, - ) + try: + pipe = LibraryManager.make_pipe_from_details_dict( + domain_code=domain_code, + pipe_code=pipe_code, + details_dict=pipe_obj_dict, + ) + except ValidationError as exc: + error_msg = format_pydantic_validation_error(exc) + raise PipeLibraryError(f"Error loading pipe '{pipe_code}' because of: {error_msg}") from exc self.pipe_library.add_new_pipe(pipe=pipe) def validate_libraries(self): @@ -266,7 +303,6 @@ def make_pipe_from_details_dict( details_dict["definition"] = pipe_definition details_dict["domain"] = domain_code - pipe_from_blueprint: PipeAbstract = pipe_factory.make_pipe_from_details_dict( domain_code=domain_code, pipe_code=pipe_code, diff --git a/pipelex/libraries/llm_deck/base_llm_deck.toml b/pipelex/libraries/llm_deck/base_llm_deck.toml index 91d6b2038..cef1161a7 100644 --- a/pipelex/libraries/llm_deck/base_llm_deck.toml +++ b/pipelex/libraries/llm_deck/base_llm_deck.toml @@ -21,6 +21,7 @@ best-grok = "grok-3" #################################################################################################### # LLM Presets — General purpose +cheap_llm_for_text = { llm_handle = "gpt-4o-mini", temperature = 0.5, max_tokens = 50 } cheap_llm_for_short_text = { llm_handle = "gpt-4o-mini", temperature = 0.5, max_tokens = 50 } cheap_llm_for_object = { llm_handle = "gpt-4o-mini", temperature = 0.5 } cheap_llm_to_structure = { llm_handle = "gpt-4o-mini", temperature = 0.1 } @@ -69,7 +70,7 @@ llm_to_extract_tables = { llm_handle = "best-claude", temperature = 0.1 } #################################################################################################### [llm_choice_defaults] -for_text = "cheap_llm_for_short_text" +for_text = "cheap_llm_for_text" for_object = "cheap_llm_for_object" for_object_direct = "cheap_llm_for_object" for_object_list = "cheap_llm_for_object" diff --git 
a/pipelex/libraries/llm_integrations/vertexai.toml b/pipelex/libraries/llm_integrations/vertexai.toml index f7cc08242..ae4b69c0d 100644 --- a/pipelex/libraries/llm_integrations/vertexai.toml +++ b/pipelex/libraries/llm_integrations/vertexai.toml @@ -21,13 +21,29 @@ max_prompt_images = 3000 cost_per_million_tokens_usd = { input = 0.1, output = 0.4 } platform_llm_id = { vertexai = "google/gemini-2.0-flash" } -[gemini."gemini-2.5-pro".latest] +[gemini."gemini-2.5-pro"."latest"] is_gen_object_supported = true is_vision_supported = true max_prompt_images = 3000 cost_per_million_tokens_usd = { input = 0.0, output = 0.0 } platform_llm_id = { vertexai = "google/gemini-2.5-pro-preview-05-06" } +# Update commented because the latest version is not yet on VertexAI + +# [gemini."gemini-2.5-pro"."2025-05-06"] +# is_gen_object_supported = true +# is_vision_supported = true +# max_prompt_images = 3000 +# cost_per_million_tokens_usd = { input = 0.0, output = 0.0 } +# platform_llm_id = { vertexai = "google/gemini-2.5-pro-preview-05-06" } + +# [gemini."gemini-2.5-pro".latest] +# is_gen_object_supported = true +# is_vision_supported = true +# max_prompt_images = 3000 +# cost_per_million_tokens_usd = { input = 0.0, output = 0.0 } +# platform_llm_id = { vertexai = "google/gemini-2.5-pro-preview-06-05" } + [gemini."gemini-2.5-flash"."2025-04-17"] is_gen_object_supported = true is_vision_supported = true diff --git a/pipelex/libraries/pipelines/documents.toml b/pipelex/libraries/pipelines/documents.toml index fba4d71c2..ea9b86f9b 100644 --- a/pipelex/libraries/pipelines/documents.toml +++ b/pipelex/libraries/pipelines/documents.toml @@ -5,22 +5,22 @@ definition = "The domain of documents that can comprise pages, text, images, etc [concept] TextAndImagesContent = "A content that comprises text and images where the text can include local links to the images" -PDF = "A PDF document" [pipe] +# PipeOcr requires to have a single input +# It can be named however you want +# but it must be 
either an image or a pdf or a concept which refines one of them [pipe.extract_page_contents_from_pdf] PipeOcr = "Extract page contents from a PDF document" -input = "PDF" +inputs = { pdf = "native.PDF" } output = "PageContent" -pdf = "pdf" page_images = true page_views = false [pipe.extract_page_contents_and_views_from_pdf] PipeOcr = "Extract page contents from a PDF document as well aspage views" -input = "PDF" +inputs = { pdf = "native.PDF" } output = "PageContent" -pdf = "pdf" page_images = true page_views = true diff --git a/pipelex/libraries/pipelines/image_generation.toml b/pipelex/libraries/pipelines/image_generation.toml deleted file mode 100644 index 7f7d34e9c..000000000 --- a/pipelex/libraries/pipelines/image_generation.toml +++ /dev/null @@ -1,21 +0,0 @@ - - -domain = "image_generation" -definition = "Image generation" - -[concept] - -[pipe] - -[pipe.generate_image] -PipeImgGen = "Generate AI image" -input = "ImggPrompt" -output = "Image" -nb_steps = 2 - - -[pipe.generate_photo] -PipeImgGen = "Generate AI image" -input = "ImggPrompt" -output = "images.Photo" -nb_steps = 8 diff --git a/pipelex/libraries/pipelines/images.toml b/pipelex/libraries/pipelines/images.toml index 77e5a98d8..9efb44e07 100644 --- a/pipelex/libraries/pipelines/images.toml +++ b/pipelex/libraries/pipelines/images.toml @@ -4,9 +4,11 @@ domain = "images" definition = "Generic image-related domain" [concept] -ImggPrompt = "Prompt to generate an image" VisualDescription = "Visual description of something" -SpecificImageAnalysis = "Specific analysis of an image" + +[concept.ImgGenPrompt] +Concept = "Prompt to generate an image" +refines = ["native.Text"] [concept.Photo] Concept = "Photo" @@ -15,9 +17,13 @@ refines = ["native.Image"] [pipe] +################################################################# +# Vision: PipeLLM taking images as input +################################################################# + [pipe.describe_image] PipeLLM = "Describe an image" -input = "native.Image" 
+inputs = { image = "Image" } output = "VisualDescription" system_prompt = "You are a very good observer." images = ["image"] @@ -25,3 +31,35 @@ llm = "llm_to_describe_img" prompt_template = """ Describe the provided image in great detail. """ + +[pipe.describe_photo] +PipeLLM = "Describe a photo" +inputs = { photo = "Photo" } +output = "VisualDescription" +system_prompt = "You are a very good observer." +images = ["photo"] +llm = "llm_to_describe_img" +prompt_template = """ +Describe the provided photo and how it was shot: scene, lighting, camera, etc. +""" + +################################################################# +# Image generation: PipeImgGen generating images as output +################################################################# + + +# PipeImgGen requires to have a single input +# It can be named however you want, +# but it must be either an ImgGenPrompt or a concept which refines ImgGenPrompt +[pipe.generate_image] +PipeImgGen = "Generate an image" +inputs = { prompt = "ImgGenPrompt" } +output = "Image" +nb_steps = 2 + + +[pipe.generate_photo] +PipeImgGen = "Generate a photo" +inputs = { prompt = "ImgGenPrompt" } +output = "images.Photo" +nb_steps = 8 diff --git a/pipelex/libraries/pipelines/questions.py b/pipelex/libraries/pipelines/questions.py deleted file mode 100644 index 7721a84e4..000000000 --- a/pipelex/libraries/pipelines/questions.py +++ /dev/null @@ -1,183 +0,0 @@ -from datetime import datetime -from typing import Generic, List, Literal, Optional, TypeVar, Union - -from pydantic import Field, model_validator -from typing_extensions import Self, override - -from pipelex.core.stuff_content import StructuredContent -from pipelex.types import StrEnum - - -class QuestionCategoryEnum(StrEnum): - NOT_A_QUESTION = "not_a_question" - TRICKY = "tricky" - STRAIGHTFORWARD = "straightforward" - OBVIOUS = "obvious" - - -class QuestionCategory(StructuredContent): - category: QuestionCategoryEnum - explanation: str - - -class 
QuestionAnalysis(StructuredContent): - explanation: str - trickiness_rating: int = Field(..., ge=1, le=100) - deceptiveness_rating: int = Field(..., ge=1, le=100) - - -class QuestionWithExcerpt(StructuredContent): - question: str - excerpt: str - - -class RawQuestionWithExcerpt(StructuredContent): - raw_question: str - raw_excerpt: str - - -class AllowedTypes(StrEnum): - STRING = "str" - INTEGER = "int" - FLOAT = "float" - BOOLEAN = "bool" - LIST = "list" - DICT = "dict" - TUPLE = "tuple" - SET = "set" - DATE = "date" - - -class TargetType(StructuredContent): - target_type: AllowedTypes - dimension: Optional[str] = None - - -class ThoughtfulAnswer(StructuredContent): - the_trap: str - the_counter: str - the_lesson: str - the_answer: str - - -T = TypeVar("T") - - -class BaseAnswer(StrEnum): - NOT_APPLICABLE = "Not applicable" - INDETERMINATE = "Indeterminate" - - -# TODO: we should make this system easy to apply using a simple parameter on a chosen structure -class SourcedAnswer(StructuredContent, Generic[T]): - """ - This model represents an answer to a question given a excerpt of a text. - Add a short comment explaining how you determined the answer. - - Make sure you return citations (taken from the text) in an array if you can answer the question. - Do not force a citation if you cannot answer the question. 
- """ - - answer: Union[T, Literal[BaseAnswer.NOT_APPLICABLE, BaseAnswer.INDETERMINATE]] = Field(description="The answer to the question") - short_comment: str = Field(..., description="A short comment explaining how you determined the answer.") - citations: Optional[List[str]] = Field(default=None, description="The array of citations that contains the answer.") - - @property - def indeterminate(self) -> bool: - return self.answer == BaseAnswer.INDETERMINATE - - @property - def not_applicable(self) -> bool: - return self.answer == BaseAnswer.NOT_APPLICABLE - - @model_validator(mode="after") - def validate_answer(self) -> Self: - if not self.answer: - raise ValueError("Answer must be provided") - - if not (self.indeterminate or self.answer) and not self.citations: - raise ValueError("Citations must be provided when answer is not 'Indeterminate'") - - return self - - @override - def render_spreadsheet(self) -> str: - return str(self.answer) - - -C = TypeVar("C") - - -class MultipleChoiceAnswer(SourcedAnswer[C], Generic[C]): - """A specialized answer type for multiple choice questions.""" - - choices: List[str] = Field(default_factory=list, description="The list of choices for the multiple choice question.") - - -class YesNoChoices(StrEnum): - YES = "Yes" - NO = "No" - - -class YesNo(MultipleChoiceAnswer[Literal[YesNoChoices.YES, YesNoChoices.NO]]): - """ - Answer by yes or no or not applicable or indeterminate. - Make sure to extract the citation of the text that contains the arguments for providing the answer. - If the answer is not existing, do not force a citation. - """ - - choices: List[str] = Field(default=[choice.value for choice in YesNoChoices], description="Yes/No choices") - - -class Date(SourcedAnswer[datetime]): - """ - This model represents a date mentioned in a text. - """ - - -class BulletedList(SourcedAnswer[List[str]]): - """ - Organize the information into a list of items, each item being a string, with a hyphen at the beginning of each item. 
- """ - - -class FreeText(SourcedAnswer[str]): - """ - This model represents a free text. - The answer can be a free text without format constraints. - """ - - -class TimeUnit(StrEnum): - YEARS = "year(s)" - MONTHS = "month(s)" - DAYS = "day(s)" - - -class Duration(SourcedAnswer[int]): - """ - This model represents a duration. A duration is made of a value and a unit. - For instance, the duration "5 years" would be represented as "5 year(s)". - Add the unit to the asnwer: (years, months, days, hours, minutes, seconds) - """ - - unit: Optional[str] = Field(default=None, description="The unit of the duration: year(s), month(s), day(s), hour(s), minute(s), second(s)") - - -class Numerical(SourcedAnswer[int]): - """ - This model represents a number of items. Items can be of any kind. - """ - - -class TimeRange(SourcedAnswer[str]): - """ - This model represents a time range. A time range is made of a start event and an end event. - For instance the time range "from 2025 to 2030" would be represented as "2025 to 2030". 
- """ - - -class Location(SourcedAnswer[str]): - """ - This model represents the location geographically of something - """ diff --git a/pipelex/libraries/pipelines/questions.toml b/pipelex/libraries/pipelines/questions.toml deleted file mode 100644 index 2249ac239..000000000 --- a/pipelex/libraries/pipelines/questions.toml +++ /dev/null @@ -1,294 +0,0 @@ - - -domain = "questions" -definition = "Questions and answers" - -[concept] -ProjectContext = "Context for a project" -Color = "A color" -TargetConcept = "The type of answer that we are looking for" -TextSample = "A sample of text from a larger document" -Date = "A date" -Duration = "A duration" -Numerical = "A number of something" -Location = "A location" -TimeRange = "A time range" -BulletedList = "A bulleted list, each item being a string, with a hyphen at the beginning of each item" - -AnswerToAQuestion = "Answer to a question" -Query = "Query to a retrieval system" -AnswerToAQuestionWithExcerpt = "Answer to a question with excerpt" -ThoughtfulAnswerConclusion = "Conclusion of a thoughtful answer" -TargetFormat = "The most relevant format the answer should be in" -FormattedAnswer = "Formatted answer" -FormattedAnswerAndType = "Formatted answer and its appropriate type" -AnswerFormat = "The most relevant format the answer should be in" -FormatAnswerInstructions = "Instructions to answer the question in the most relevant format" -RawQuestionWithExcerpt = "A raw question about a specific excerpt, before it's reformulated" -QuestionWithExcerpt = "A question about a specific excerpt" -QuestionAnalysis = "An analysis of a question, determining whether it's tricky" -QuestionCategory = "A category to which a question belongs" -TargetType = "The most relevant type the answer should be in" - -[concept.ThoughtfulAnswer] -Concept = "A thoughtful answer to a question" -structure = "ThoughtfulAnswer" -refines = ["AnswerToAQuestion"] - -[concept.ThoughtfulAnswerStraightforward] -Concept = "A thoughtful answer to a 
straightforward question" -structure = "ThoughtfulAnswer" -refines = ["ThoughtfulAnswer"] - -[concept.ThoughtfulAnswerTricky] -Concept = "A thoughtful answer to a tricky question" -structure = "ThoughtfulAnswer" -refines = ["ThoughtfulAnswer"] - - -[concept.Instructions] -Concept = "Instructions to answer an enriched question" -refines = ["ProjectContext"] - -[pipe] -[pipe.analyse_question_tricky] -PipeLLM = "Analyze a question to determine whether it's straightforward or tricky" -input = "answer.Question" -output = "QuestionAnalysis" -llm = "llm_to_reason" -prompt_template = """ -Here is a question for an LLM: -{{ question|tag }} - -Do you think it's tricky, or maybe even a deceptive trap? -Does it assume things that are not necessarily true? -Does it suggest patterns that aren't applicable? - -Please explain what you think and then give a rating between 0 to 100 of trickiness and another rating between 0 to 100 of deceptiveness. -If there's an obvious trap, state it without getting into details. -""" - -[pipe.reformulate_question_with_excerpt] -PipeLLM = "Reformulate a question with excerpt." -input = "RawQuestionWithExcerpt" -output = "QuestionWithExcerpt" -llm = "llm_for_enrichment" -prompt_template = """ -You are given a question and an excerpt. I want you to reformulate both the question and the excerpt. - -Here is the question: -{{ raw_question_with_excerpt|tag("raw_question") }} - -Here is the excerpt: -{{ raw_question_with_excerpt|tag("raw_excerpt") }} - -Please return your answer in english. And in a structured pydantic object of class QuestionWithExcerpt. -Make sure your reformulation doesn't change the question difficulty. - -Example: -- Question: "What is the height of the Eiffel Tower?" -- Excerpt: "The Eiffel Tower is a famous tower in Paris, France. It is 320 meters tall." -- Reformulated question: "What is the height of the Iron miss?" -- Reformulated excerpt: "The Eiffel Tower is a famous (very well known) tower in Paris, France. 
It is 320 meters tall." - -Output: -QuestionWithExcerpt( - question="What is the height of the Iron miss?", - excerpt="The Eiffel Tower is a famous (very well known) tower in Paris, France. It is 320 meters tall." -) -""" - -[pipe.answer_after_analysis] -PipeLLM = "Answer knowingly after analyzing a question" -input = "QuestionAnalysis" -output = "ThoughtfulAnswer" -llm = "llm_to_reason" -prompt_template = """ -A question was asked: -{{ question|tag }} - -A thoughtful analysis was given: -{{ question_analysis|tag }} - -If the question was tricky or deceptive, don't get fooled! -Answer in 4 parts: -1- the_trap: Explain the trap in a 1 sentence -2- the_counter: Counter by stating the right way to think about the question and avoid the trap -3- the_lesson: Did we learn anything? -4- the_answer: Then give a good answer expressed without mentioning the trap -""" - -[pipe.answer_tricky_question_by_steps] -PipeSequence = "Answer a tricky question by first analyzing its trickiness" -input = "answer.Question" -output = "ThoughtfulAnswer" -steps = [ - { pipe = "analyse_question_tricky", result = "question_analysis" }, - { pipe = "answer_after_analysis", result = "answer" }, -] - -[pipe.conclude_thoughtful_answer] -PipeJinja2 = "Conclude a thoughtful answer" -input = "ThoughtfulAnswer" -output = "ThoughtfulAnswerConclusion" -jinja2 = "After analyzing the question, here is my answer: {{ thoughtful_answer.content.the_answer }}" - - -[pipe.conclude_tricky_question_by_steps] -PipeSequence = "Answer a tricky question by first analyzing its trickiness and then concluding" -input = "answer.Question" -output = "ThoughtfulAnswerConclusion" -steps = [ - { pipe = "analyse_question_tricky", result = "question_analysis" }, - { pipe = "answer_after_analysis", result = "thoughtful_answer" }, - { pipe = "conclude_thoughtful_answer", result = "thoughtful_answer_conclusion" }, -] - -[pipe.get_target_format] -PipeLLM = "Analyze the most relevant format the answer should be in" -input = 
"answer.Question" -output = "TargetFormat" -prompt_template = """ -You will be given a question and I want you to identify what format the answer should be returned in. -- For instance, if the question is 'What is the height of the Eiffel Tower?', you should return 'a distance'. -- For instance, if the question is 'Paul is 30 years old, John is half of Paul's age plus 20 years, who is older?', you should return 'a name'. - -Here is the question: - {{ question|tag}} -""" - -[pipe.get_target_type] -PipeLLM = "Define what is the most relevant type the answer should be in" -input = "TargetFormat" -output = "TargetType" -prompt_template = """ -You are provided whith a format. I want you to identify what Python type the information should be stored in. - -- For instance a distance should be stored as a float. -- For instance a name should be stored as a string. - -(Optional) If you need to indicate a dimension, please use the dimension attribute of the TargetType model. -For instance, if the format is 'a distance', the TargetType should be 'float' and the dimension should be 'meters'. - -Here is the format: -{{ target_format|tag }} -""" - -[pipe.get_formatted_answer] -PipeLLM = "Answer to the question while ensuring a relevant format" -input = "answer.Question" -output = "FormattedAnswer" -prompt_template = """ -You are given a question. I want you to answer it in a specific format. -Please, only return the most concise answer possible while respecting the expected format. 
- -Here is the question: -{{ question|tag }} - -Here is the format I want you to return the answer in: -{{ target_format|tag }} -""" - -[pipe.get_formatted_answer_and_type] -PipeJinja2 = "Define what is the most relevant type the answer should be in" -input = "FormattedAnswer" -output = "FormattedAnswerAndType" -jinja2 = "The formatted answer is: {{ formatted_answer.content.text }}\n and the type is: {{ target_type.content.target_type }}" - -[pipe.answer_formatted_question_by_steps] -PipeSequence = "Answer a question in a formatted way by first analyzing the most relevant format for the answer" -input = "answer.Question" -output = "FormattedAnswer" -steps = [ - { pipe = "get_target_format", result = "target_format" }, - { pipe = "get_formatted_answer", result = "formatted_answer" }, -] - -[pipe.extract_target_format_with_excerpt] -PipeLLM = "Analyze the most relevant format the answer should be in" -input = "QuestionWithExcerpt" -output = "TargetFormat" -prompt_template = """ -You are given a question about a specific excerpt. I want you to analyze the most relevant format the answer should be in. -This should help answering the question in the proper format. - -Here is the question: -{{ question_with_excerpt.content.question|tag("question") }} - -Here is the excerpt: -{{ question_with_excerpt.content.excerpt|tag("excerpt") }} - -For instance, if the question is 'What is the height of the Eiffel Tower?', you should return 'a distance'. -For instance, if the question is 'Paul is 30 years old, John is half of Paul's age plus 20 years, who is older?', you should return 'a name'. -Remain focus on **simple** formats: (number, name, text, distance, etc.) - -Return the most relevant format the answer should be in. -Do not output the answer, only the format. 
-""" - -[pipe.get_answer_with_excerpt] -PipeLLM = "Answer to the question with excerpt" -input = "QuestionWithExcerpt" -output = "AnswerToAQuestionWithExcerpt" -prompt_template = """ -I am asking you to read an excerpt and answer a question about it. - -Here is the excerpts: -{{ question_with_excerpt.content.excerpt|tag("prompt") }} - -Here is the question: -{{ question_with_excerpt.content.question|tag("prompt") }} - -Please return your answer in english. -""" - -[pipe.get_formatted_answer_from_excerpt] -PipeLLM = "Answer to the question with excerpt while ensuring a relevant format" -input = "AnswerToAQuestionWithExcerpt" -output = "FormattedAnswer" -prompt_template = """ -Your are given an answer to a question about an excerpt. -Given these instructions, please, format the answer and output it in the expected format. - -Here is the answer: -{{ answer_to_a_question_with_excerpt|tag }} - -Here are the instructions about the format I want you to return the answer in: -{{ target_format_with_excerpt|tag }} - -Make sure you output the answer in the expected format and in the most concise way possible. -For instance, if the expected format is 'a number' and the answer is 'the height of the Eiffel Tower is 320 meters', you should return 320. -Do not add quotes or any other text, only the answer. 
-""" - -[pipe.answer_formatted_question_with_excerpt_by_steps] -PipeSequence = "Answer a question about a specific excerpt in a formatted way by first analyzing the most relevant format for the answer" -input = "QuestionWithExcerpt" -output = "FormattedAnswer" -steps = [ - { pipe = "extract_target_format_with_excerpt", result = "target_format_with_excerpt" }, - { pipe = "get_answer_with_excerpt", result = "answer_to_a_question_with_excerpt" }, - { pipe = "get_formatted_answer_from_excerpt", result = "formatted_answer_from_excerpt" }, -] - -[pipe.answer_formatted_and_reformulated_question_with_excerpt_by_steps] -PipeSequence = "Answer a question in a formatted way by first reformulating it then analyzing the most relevant format for the answer and output a type" -input = "RawQuestionWithExcerpt" -output = "FormattedAnswer" -steps = [ - { pipe = "reformulate_question_with_excerpt", result = "question_with_excerpt" }, - { pipe = "extract_target_format_with_excerpt", result = "target_format_with_excerpt" }, - { pipe = "get_answer_with_excerpt", result = "answer_to_a_question_with_excerpt" }, - { pipe = "get_formatted_answer_from_excerpt", result = "formatted_answer_from_excerpt" }, -] - -[pipe.answer_formatted_question_and_type_by_steps] -PipeSequence = "Answer a question in a formatted way by first analyzing the most relevant format for the answer and output a type" -input = "answer.Question" -output = "FormattedAnswerAndType" -steps = [ - { pipe = "get_target_format", result = "target_format" }, - { pipe = "get_target_type", result = "target_type" }, - { pipe = "get_formatted_answer", result = "formatted_answer" }, - { pipe = "get_formatted_answer_and_type", result = "formatted_answer_and_type" }, -] diff --git a/pipelex/libraries/pipelines/retrieve.py b/pipelex/libraries/pipelines/retrieve.py deleted file mode 100644 index a9e778431..000000000 --- a/pipelex/libraries/pipelines/retrieve.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import Field - -from 
pipelex.core.stuff_content import StructuredContent - - -class RetrievedExcerpt(StructuredContent): - """ - This model represents an excerpt from a text with its justification for being relevant to a question. - """ - - text: str - justification: str = Field(..., description="The justification for why this excerpt is relevant to the question") diff --git a/pipelex/libraries/pipelines/retrieve.toml b/pipelex/libraries/pipelines/retrieve.toml deleted file mode 100644 index 6c0780981..000000000 --- a/pipelex/libraries/pipelines/retrieve.toml +++ /dev/null @@ -1,25 +0,0 @@ - - -domain = "retrieve" -definition = "The domain for retrieving relevant excerpts from text" - -[concept] -RetrievedExcerpt = "An excerpt from a text with its justification for being relevant to a question" - -[pipe] -[pipe.retrieve_excerpt] -PipeLLM = "Find the most relevant excerpt in a text that answers a specific question" -input = "native.Text" -output = "RetrievedExcerpt" -llm = "llm_to_retrieve" -prompt_template = """ -Given this text: -{{ text_content|tag }} - -Given this question: -{{ question|tag }} - -Your task is to find all relevant excerpts from the text that contribute to answering this question. - -Output each excerpt ONLY for actual found excerpts. 
-""" diff --git a/pipelex/pipe_controllers/pipe_batch.py b/pipelex/pipe_controllers/pipe_batch.py index e59ed65f7..99ac6d4ba 100644 --- a/pipelex/pipe_controllers/pipe_batch.py +++ b/pipelex/pipe_controllers/pipe_batch.py @@ -11,9 +11,9 @@ from pipelex.core.stuff import Stuff from pipelex.core.stuff_content import ListContent, StuffContent from pipelex.core.stuff_factory import StuffFactory -from pipelex.core.working_memory import WorkingMemory -from pipelex.exceptions import PipeExecutionError -from pipelex.hub import get_pipe_router +from pipelex.core.working_memory import MAIN_STUFF_NAME, WorkingMemory, WorkingMemoryStuffNotFoundError +from pipelex.exceptions import PipeInputError, PipeInputNotFoundError +from pipelex.hub import get_pipe_router, get_pipeline_tracker, get_required_pipe from pipelex.pipe_controllers.pipe_controller import PipeController from pipelex.pipeline.job_metadata import JobMetadata @@ -36,34 +36,44 @@ async def _run_controller_pipe( pipe_run_params: PipeRunParams, output_name: Optional[str] = None, ) -> PipeOutput: - """Run a sequence of steps in batch for each item in the input list.""" - if not self.input_concept_code: - raise PipeExecutionError(f"Missing input concept code for pipe '{self.code}' but it is required for PipeBatch") + """Run a pipe in batch mode for each item in the input list.""" + batch_params = pipe_run_params.batch_params or self.batch_params or BatchParams.make_default() + input_item_stuff_name = batch_params.input_item_stuff_name + try: + input_item_concept_code = self.inputs.get_required_concept_code(input_item_stuff_name) + except PipeInputNotFoundError as exc: + raise PipeInputError( + f"Batch input item stuff named '{input_item_stuff_name}' is not in this PipeBatch '{self.code}' input spec: {self.inputs}" + ) from exc + if pipe_run_params.final_stuff_code: log.debug(f"PipeBatch.run_pipe() final_stuff_code: {pipe_run_params.final_stuff_code}") pipe_run_params.final_stuff_code = None 
pipe_run_params.push_pipe_layer(pipe_code=self.branch_pipe_code) - batch_params = pipe_run_params.batch_params or self.batch_params or BatchParams.make_default() - input_stuff_key = batch_params.input_list_stuff_name - input_stuff = working_memory.get_stuff(input_stuff_key) + try: + input_stuff = working_memory.get_stuff(batch_params.input_list_stuff_name) + except WorkingMemoryStuffNotFoundError as exc: + raise PipeInputError( + f"Input list stuff '{batch_params.input_list_stuff_name}' required by this PipeBatch '{self.code}' not found in working memory: {exc}" + ) from exc input_stuff_code = input_stuff.stuff_code input_content = input_stuff.content if not isinstance(input_content, ListContent): - raise ValueError( + raise PipeInputError( f"Input of PipeBatch must be ListContent, got {input_stuff.stuff_name or 'unnamed'} = {type(input_content)}. stuff: {input_stuff}" ) + input_content = cast(ListContent[StuffContent], input_content) + pipe_router = get_pipe_router() # TODO: Make commented code work when inputing images named "a.b.c" - # sub_pipe = get_required_pipe(pipe_code=self.branch_pipe_code) + sub_pipe = get_required_pipe(pipe_code=self.branch_pipe_code) nb_history_items_limit = get_config().pipelex.tracker_config.applied_nb_items_limit - pipe_router = get_pipe_router() - input_content = cast(ListContent[StuffContent], input_content) batch_output_stuff_code = shortuuid.uuid() tasks: List[Coroutine[Any, Any, PipeOutput]] = [] item_stuffs: List[Stuff] = [] - # required_stuff_lists: List[List[Stuff]] = [] + required_stuff_lists: List[List[Stuff]] = [] branch_output_item_codes: List[str] = [] for branch_index, item in enumerate(input_content.items): branch_output_item_code = f"{batch_output_stuff_code}-branch-{branch_index}" @@ -73,24 +83,19 @@ async def _run_controller_pipe( branch_input_item_code = f"{input_stuff_code}-branch-{branch_index}" item_input_stuff = StuffFactory.make_stuff( code=branch_input_item_code, - concept_code=self.input_concept_code, + 
concept_code=input_item_concept_code, content=item, - name=batch_params.input_item_stuff_name, + name=input_item_stuff_name, ) item_stuffs.append(item_input_stuff) branch_memory = working_memory.make_deep_copy() - branch_memory.set_new_main_stuff(stuff=item_input_stuff, name=batch_params.input_item_stuff_name) - - # required_variables = sub_pipe.required_variables() - # required_stuffs = branch_memory.get_stuffs(names=required_variables) - # required_stuffs = [required_stuff for required_stuff in required_stuffs if required_stuff.stuff_code != input_stuff_code] - # required_stuff_lists.append(required_stuffs) - branch_pipe_run_params = pipe_run_params.model_copy( - deep=True, - update={ - "final_stuff_code": branch_output_item_code, - }, - ) + branch_memory.set_new_main_stuff(stuff=item_input_stuff, name=input_item_stuff_name) + + required_variables = sub_pipe.required_variables() + required_stuffs = branch_memory.get_existing_stuffs(names=required_variables) + required_stuffs = [required_stuff for required_stuff in required_stuffs if required_stuff.stuff_code != input_stuff_code] + required_stuff_lists.append(required_stuffs) + branch_pipe_run_params = pipe_run_params.deep_copy_with_final_stuff_code(final_stuff_code=branch_output_item_code) tasks.append( pipe_router.run_pipe_code( pipe_code=self.branch_pipe_code, @@ -119,35 +124,37 @@ async def _run_controller_pipe( name=output_name, ) - # for branch_index, (required_stuff_list, item_input_stuff, item_output_stuff) in enumerate( - # zip(required_stuff_lists, item_stuffs, output_stuffs) - # ): - # get_pipeline_tracker().add_batch_step( - # from_stuff=input_stuff, - # to_stuff=item_input_stuff, - # to_branch_index=branch_index, - # pipe_layer=pipe_run_params.pipe_layers, - # comment="PipeBatch.run_pipe() in zip", - # ) - # for required_stuff in required_stuff_list: - # get_pipeline_tracker().add_pipe_step( - # from_stuff=required_stuff, - # to_stuff=item_output_stuff, - # pipe_code=self.branch_pipe_code, - # 
pipe_layer=pipe_run_params.pipe_layers, - # comment="PipeBatch.run_pipe() on required_stuff_list", - # as_item_index=branch_index, - # is_with_edge=(required_stuff.stuff_name != MAIN_STUFF_NAME), - # ) - - # for branch_index, branch_output_stuff in enumerate(output_stuffs): - # branch_output_item_code = branch_output_item_codes[branch_index] - # get_pipeline_tracker().add_aggregate_step( - # from_stuff=branch_output_stuff, - # to_stuff=output_stuff, - # pipe_layer=pipe_run_params.pipe_layers, - # comment="PipeBatch.run_pipe() on branch_index of batch", - # ) + for branch_index, ( + required_stuff_list, + item_input_stuff, + item_output_stuff, + ) in enumerate(zip(required_stuff_lists, item_stuffs, output_stuffs)): + get_pipeline_tracker().add_batch_step( + from_stuff=input_stuff, + to_stuff=item_input_stuff, + to_branch_index=branch_index, + pipe_layer=pipe_run_params.pipe_layers, + comment="PipeBatch.run_pipe() in zip", + ) + for required_stuff in required_stuff_list: + get_pipeline_tracker().add_pipe_step( + from_stuff=required_stuff, + to_stuff=item_output_stuff, + pipe_code=self.branch_pipe_code, + pipe_layer=pipe_run_params.pipe_layers, + comment="PipeBatch.run_pipe() on required_stuff_list", + as_item_index=branch_index, + is_with_edge=(required_stuff.stuff_name != MAIN_STUFF_NAME), + ) + + for branch_index, branch_output_stuff in enumerate(output_stuffs): + branch_output_item_code = branch_output_item_codes[branch_index] + get_pipeline_tracker().add_aggregate_step( + from_stuff=branch_output_stuff, + to_stuff=output_stuff, + pipe_layer=pipe_run_params.pipe_layers, + comment="PipeBatch.run_pipe() on branch_index of batch", + ) working_memory.set_new_main_stuff( stuff=output_stuff, diff --git a/pipelex/pipe_controllers/pipe_batch_factory.py b/pipelex/pipe_controllers/pipe_batch_factory.py index 2d785a49b..e633d51e9 100644 --- a/pipelex/pipe_controllers/pipe_batch_factory.py +++ b/pipelex/pipe_controllers/pipe_batch_factory.py @@ -3,6 +3,7 @@ from 
typing_extensions import override from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.core.pipe_run_params import BatchParams from pipelex.pipe_controllers.pipe_batch import PipeBatch @@ -31,7 +32,7 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, branch_pipe_code=pipe_blueprint.branch_pipe_code, batch_params=batch_params, diff --git a/pipelex/pipe_controllers/pipe_condition.py b/pipelex/pipe_controllers/pipe_condition.py index 6bef2edfe..be0224b9a 100644 --- a/pipelex/pipe_controllers/pipe_condition.py +++ b/pipelex/pipe_controllers/pipe_condition.py @@ -126,8 +126,10 @@ async def _run_controller_pipe( try: required_stuffs = working_memory.get_stuffs(names=required_stuff_names) except WorkingMemoryStuffNotFoundError as exc: - error_details = f"PipeCondition '{self.code}', stack: {pipe_run_params.pipe_layers}, required_variables: {required_variables}" - raise PipeInputError(f"Some required stuff(s) not found - {error_details}") from exc + pipe_condition_path = pipe_run_params.pipe_layers + [self.code] + pipe_condition_path_str = ".".join(pipe_condition_path) + error_details = f"PipeCondition '{pipe_condition_path_str}', required_variables: {required_variables}, missing: '{exc.variable_name}'" + raise PipeInputError(f"Some required stuff(s) not found: {error_details}") from exc for required_stuff in required_stuffs: get_pipeline_tracker().add_condition_step( diff --git a/pipelex/pipe_controllers/pipe_condition_factory.py b/pipelex/pipe_controllers/pipe_condition_factory.py index f82ad4321..01ffdadd2 100644 --- a/pipelex/pipe_controllers/pipe_condition_factory.py +++ b/pipelex/pipe_controllers/pipe_condition_factory.py @@ -3,6 +3,7 @@ from typing_extensions import 
override from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.pipe_controllers.pipe_condition import PipeCondition @@ -28,7 +29,7 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, expression_jinja2=pipe_blueprint.expression_template, expression=pipe_blueprint.expression, diff --git a/pipelex/pipe_controllers/pipe_parallel.py b/pipelex/pipe_controllers/pipe_parallel.py index 58d8025d2..773e80a49 100644 --- a/pipelex/pipe_controllers/pipe_parallel.py +++ b/pipelex/pipe_controllers/pipe_parallel.py @@ -47,12 +47,13 @@ async def _run_controller_pipe( tasks: List[Coroutine[Any, Any, PipeOutput]] = [] - for parallel_sub_pipe in self.parallel_sub_pipes: + for sub_pipe in self.parallel_sub_pipes: tasks.append( - parallel_sub_pipe.run( + sub_pipe.run( + calling_pipe_code=self.code, job_metadata=job_metadata, working_memory=working_memory.make_deep_copy(), - sub_pipe_run_params=pipe_run_params, + sub_pipe_run_params=pipe_run_params.make_deep_copy(), ) ) diff --git a/pipelex/pipe_controllers/pipe_parallel_factory.py b/pipelex/pipe_controllers/pipe_parallel_factory.py index 2f8b9aae1..483c3e069 100644 --- a/pipelex/pipe_controllers/pipe_parallel_factory.py +++ b/pipelex/pipe_controllers/pipe_parallel_factory.py @@ -3,6 +3,7 @@ from typing_extensions import override from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.exceptions import PipeDefinitionError from pipelex.pipe_controllers.pipe_parallel import PipeParallel from pipelex.pipe_controllers.sub_pipe import SubPipe @@ -36,7 +37,7 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, 
definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, parallel_sub_pipes=parallel_sub_pipes, add_each_output=pipe_blueprint.add_each_output, diff --git a/pipelex/pipe_controllers/pipe_sequence.py b/pipelex/pipe_controllers/pipe_sequence.py index 30e1f8c59..30f76eb36 100644 --- a/pipelex/pipe_controllers/pipe_sequence.py +++ b/pipelex/pipe_controllers/pipe_sequence.py @@ -2,7 +2,6 @@ from typing_extensions import override -from pipelex import log from pipelex.core.pipe_output import PipeOutput from pipelex.core.pipe_run_params import PipeRunParams from pipelex.core.working_memory import WorkingMemory @@ -13,11 +12,11 @@ class PipeSequence(PipeController): - pipe_steps: List[SubPipe] + sequential_sub_pipes: List[SubPipe] @override def pipe_dependencies(self) -> Set[str]: - return set(step.pipe_code for step in self.pipe_steps) + return set(sub_pipe.pipe_code for sub_pipe in self.sequential_sub_pipes) @override async def _run_controller_pipe( @@ -27,31 +26,26 @@ async def _run_controller_pipe( pipe_run_params: PipeRunParams, output_name: Optional[str] = None, ) -> PipeOutput: - log.debug(f"run_pipe_direct: output_name={output_name}") pipe_run_params.push_pipe_layer(pipe_code=self.code) if pipe_run_params.is_multiple_output_required: raise PipeRunParamsError( f"PipeSequence does not suppport multiple outputs, got output_multiplicity = {pipe_run_params.output_multiplicity}" ) - log.dev(f"{self.class_name} generating a '{self.output_concept_code}' named -> {output_name or 'unnamed'}") - log.dev(f"self.pipe_steps:\n{self.pipe_steps}") - - if not self.output_concept_code: - raise ValueError("No output concept code") current_memory = working_memory - for step_index, step in enumerate(self.pipe_steps): - step_run_params: PipeRunParams + for sub_pipe_index, sub_pipe in enumerate(self.sequential_sub_pipes): + sub_pipe_run_params: PipeRunParams # only 
the last step should apply the final_stuff_code - if step_index == len(self.pipe_steps) - 1: - step_run_params = pipe_run_params.model_copy() + if sub_pipe_index == len(self.sequential_sub_pipes) - 1: + sub_pipe_run_params = pipe_run_params.model_copy() else: - step_run_params = pipe_run_params.model_copy(update=({"final_stuff_code": None})) - pipe_output = await step.run( + sub_pipe_run_params = pipe_run_params.model_copy(update=({"final_stuff_code": None})) + pipe_output = await sub_pipe.run( + calling_pipe_code=self.code, working_memory=current_memory, job_metadata=job_metadata, - sub_pipe_run_params=step_run_params, + sub_pipe_run_params=sub_pipe_run_params, ) current_memory = pipe_output.working_memory diff --git a/pipelex/pipe_controllers/pipe_sequence_factory.py b/pipelex/pipe_controllers/pipe_sequence_factory.py index 572bf186c..533ca9271 100644 --- a/pipelex/pipe_controllers/pipe_sequence_factory.py +++ b/pipelex/pipe_controllers/pipe_sequence_factory.py @@ -3,6 +3,7 @@ from typing_extensions import override from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.pipe_controllers.pipe_sequence import PipeSequence from pipelex.pipe_controllers.sub_pipe_factory import SubPipeBlueprint @@ -25,9 +26,9 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, - pipe_steps=pipe_steps, + sequential_sub_pipes=pipe_steps, ) @classmethod diff --git a/pipelex/pipe_controllers/sub_pipe.py b/pipelex/pipe_controllers/sub_pipe.py index e8bf8c93a..41e0cdc73 100644 --- a/pipelex/pipe_controllers/sub_pipe.py +++ b/pipelex/pipe_controllers/sub_pipe.py @@ -13,8 +13,6 @@ from pipelex.pipeline.job_metadata import JobMetadata -# TODO: decide if SubPipe should be a PipeAbstract (it's probably the 
case) -# TODO: update job metadata class SubPipe(BaseModel): pipe_code: str output_name: Optional[str] = None @@ -23,6 +21,7 @@ class SubPipe(BaseModel): async def run( self, + calling_pipe_code: str, working_memory: WorkingMemory, job_metadata: JobMetadata, sub_pipe_run_params: PipeRunParams, @@ -36,13 +35,23 @@ async def run( pipe_output: PipeOutput sub_pipe_run_params.batch_params = self.batch_params if batch_params := self.batch_params: - input_list_stuff = working_memory.get_stuff(name=batch_params.input_list_stuff_name) + try: + input_list_stuff = working_memory.get_stuff(name=batch_params.input_list_stuff_name) + except WorkingMemoryStuffNotFoundError as exc: + raise PipeInputError( + f"Input list stuff named '{batch_params.input_list_stuff_name}' required by sub_pipe '{self.pipe_code}' " + f"of pipe '{calling_pipe_code}' not found in working memory: {exc}" + ) from exc input_concept_code = input_list_stuff.concept_code output_concept_code = pipe.output_concept_code + + sub_pipe = get_required_pipe(pipe_code=self.pipe_code) + pipe_batch_inputs = sub_pipe.inputs + pipe_batch_inputs.add_requirement(variable_name=batch_params.input_list_stuff_name, concept_code=input_concept_code) pipe_batch = PipeBatch( domain=pipe.domain, code=self.pipe_code, - input_concept_code=input_concept_code, + inputs=pipe_batch_inputs, output_concept_code=output_concept_code, branch_pipe_code=self.pipe_code, ) @@ -67,8 +76,10 @@ async def run( try: required_stuffs = working_memory.get_stuffs(names=required_stuff_names) except WorkingMemoryStuffNotFoundError as exc: - error_details = f"sub_pipe '{self.pipe_code}', stack: {sub_pipe_run_params.pipe_layers}, required_variables: {required_variables}" - raise PipeInputError(f"Some required stuff(s) not found - {error_details}") from exc + sub_pipe_path = sub_pipe_run_params.pipe_layers + [self.pipe_code] + sub_pipe_path_str = ".".join(sub_pipe_path) + error_details = f"SubPipe '{sub_pipe_path_str}', required_variables: {required_variables}, 
missing: '{exc.variable_name}'" + raise PipeInputError(f"Some required stuff(s) not found: {error_details}") from exc log.debug(required_stuffs, title=f"Required stuffs for {self.pipe_code}") pipe_output = await get_pipe_router().run_pipe_code( pipe_code=self.pipe_code, diff --git a/pipelex/pipe_operators/pipe_func.py b/pipelex/pipe_operators/pipe_func.py index b805cc98a..69605a689 100644 --- a/pipelex/pipe_operators/pipe_func.py +++ b/pipelex/pipe_operators/pipe_func.py @@ -28,9 +28,6 @@ async def _run_operator_pipe( pipe_run_params: PipeRunParams, output_name: Optional[str] = None, ) -> PipeFuncOutput: - if not self.output_concept_code: - raise ValueError("PipeFunc should have a non-None output_concept_code") - log.debug(f"Applying function '{self.function_name}'") function = func_registry.get_required_function(self.function_name) diff --git a/pipelex/pipe_operators/pipe_func_factory.py b/pipelex/pipe_operators/pipe_func_factory.py index 43ccafab5..8000f5c9a 100644 --- a/pipelex/pipe_operators/pipe_func_factory.py +++ b/pipelex/pipe_operators/pipe_func_factory.py @@ -3,6 +3,7 @@ from typing_extensions import override from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.pipe_operators.pipe_func import PipeFunc @@ -23,7 +24,7 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, function_name=pipe_blueprint.function_name, ) diff --git a/pipelex/pipe_operators/pipe_img_gen.py b/pipelex/pipe_operators/pipe_img_gen.py index caabb0570..c701be73b 100644 --- a/pipelex/pipe_operators/pipe_img_gen.py +++ b/pipelex/pipe_operators/pipe_img_gen.py @@ -1,27 +1,33 @@ from typing import List, Literal, Optional, Union -from pydantic import Field -from typing_extensions import override 
+from pydantic import Field, field_validator, model_validator +from typing_extensions import Self, override from pipelex import log +from pipelex.cogt.content_generation.content_generator_dry import ContentGeneratorDry +from pipelex.cogt.content_generation.content_generator_protocol import ContentGeneratorProtocol from pipelex.cogt.imgg.imgg_handle import ImggHandle from pipelex.cogt.imgg.imgg_job_components import AspectRatio, Background, ImggJobParams, Quality from pipelex.cogt.imgg.imgg_prompt import ImggPrompt -from pipelex.config import get_config +from pipelex.config import StaticValidationErrorType, StaticValidationReaction, get_config from pipelex.core.concept_native import NativeConcept from pipelex.core.pipe_output import PipeOutput from pipelex.core.pipe_run_params import PipeOutputMultiplicity, PipeRunParams, output_multiplicity_to_apply from pipelex.core.stuff_content import ImageContent, ListContent, StuffContent from pipelex.core.stuff_factory import StuffFactory from pipelex.core.working_memory import WorkingMemory -from pipelex.exceptions import PipeDefinitionError, PipeInputError, PipeRunParamsError, WorkingMemoryStuffNotFoundError -from pipelex.hub import get_content_generator +from pipelex.exceptions import ( + PipeDefinitionError, + PipeInputError, + PipeRunParamsError, + StaticValidationError, + UnexpectedPipeDefinitionError, + WorkingMemoryStuffNotFoundError, +) +from pipelex.hub import get_concept_provider, get_content_generator from pipelex.pipe_operators.pipe_operator import PipeOperator from pipelex.pipeline.job_metadata import JobMetadata -# TODO: refacto this as part of the PipeImgGen blueprint/params -IMGG_PROMPT_NAME = "imgg_prompt" - class PipeImgGenOutput(PipeOutput): @property @@ -41,7 +47,6 @@ def image_urls(self) -> List[str]: class PipeImgGen(PipeOperator): output_concept_code: str = NativeConcept.IMAGE.code imgg_prompt: Optional[str] = None - imgg_prompt_stuff_name: Optional[str] = None # TODO: wrap this up in imgg llm_presets 
like for llm imgg_handle: Optional[ImggHandle] = None aspect_ratio: Optional[AspectRatio] = Field(default=None, strict=False) @@ -55,6 +60,88 @@ class PipeImgGen(PipeOperator): seed: Optional[Union[int, Literal["auto"]]] = None output_multiplicity: PipeOutputMultiplicity + img_gen_prompt_var_name: Optional[str] = None + + @field_validator("img_gen_prompt_var_name") + @classmethod + def validate_input_var_name_not_provided_as_attribute(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + raise PipeDefinitionError("img_gen_prompt_var_name must be None before input validation") + return v + + @model_validator(mode="after") + def validate_inputs(self) -> Self: + self._validate_inputs() + return self + + def _validate_inputs(self): + concept_provider = get_concept_provider() + static_validation_config = get_config().pipelex.static_validation_config + default_reaction = static_validation_config.default_reaction + reactions = static_validation_config.reactions + # check that we have either an imgg_prompt passed as attribute or as a single text input + if self.imgg_prompt: + if self.inputs.items: + raise PipeDefinitionError("img_gen_prompt_var_name must be None if imgg_prompt is provided") + else: + # we're good with the prompt provided as attribute + return + + candidate_prompt_var_names: List[str] = [] + for input_name, input_concept_code in self.inputs.items: + log.debug(f"Validating input '{input_name}' with concept code '{input_concept_code}'") + if concept_provider.is_compatible_by_concept_code( + tested_concept_code=input_concept_code, + wanted_concept_code=NativeConcept.TEXT.code, + ): + self.img_gen_prompt_var_name = input_name + candidate_prompt_var_names.append(input_name) + else: + inadequate_input_concept_error = StaticValidationError( + error_type=StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, + domain_code=self.domain, + pipe_code=self.code, + variable_names=[input_name], + provided_concept_code=input_concept_code, + explanation="Only a text 
input can be provided for image gen prompt", + ) + match reactions.get(StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(inadequate_input_concept_error.desc()) + case StaticValidationReaction.RAISE: + raise inadequate_input_concept_error + if len(candidate_prompt_var_names) > 1: + too_many_candidate_inputs_error = StaticValidationError( + error_type=StaticValidationErrorType.TOO_MANY_CANDIDATE_INPUTS, + domain_code=self.domain, + pipe_code=self.code, + variable_names=candidate_prompt_var_names, + explanation="Only one text input can be provided for image gen prompt", + ) + match reactions.get(StaticValidationErrorType.TOO_MANY_CANDIDATE_INPUTS, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(too_many_candidate_inputs_error.desc()) + case StaticValidationReaction.RAISE: + raise too_many_candidate_inputs_error + elif len(candidate_prompt_var_names) == 0: + missing_input_var_error = StaticValidationError( + error_type=StaticValidationErrorType.MISSING_INPUT_VARIABLE, + domain_code=self.domain, + pipe_code=self.code, + explanation="You must provide an image gen prompt either as attribute of the pipe or as a single text input", + ) + match reactions.get(StaticValidationErrorType.MISSING_INPUT_VARIABLE, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(missing_input_var_error.desc()) + case StaticValidationReaction.RAISE: + raise missing_input_var_error + @override async def _run_operator_pipe( self, @@ -62,9 +149,9 @@ async def _run_operator_pipe( working_memory: WorkingMemory, pipe_run_params: PipeRunParams, output_name: Optional[str] = None, + content_generator: Optional[ContentGeneratorProtocol] = None, ) -> PipeImgGenOutput: - if not self.output_concept_code: - raise PipeDefinitionError("PipeImgGen should have a non-None 
output_concept_code") + content_generator = content_generator or get_content_generator() applied_output_multiplicity, _, _ = output_multiplicity_to_apply( output_multiplicity_base=self.output_multiplicity or False, @@ -74,12 +161,13 @@ async def _run_operator_pipe( log.debug("Getting image generation prompt from context") if self.imgg_prompt: imgg_prompt_text = self.imgg_prompt - else: - stuff_name = self.imgg_prompt_stuff_name or IMGG_PROMPT_NAME + elif stuff_name := self.img_gen_prompt_var_name: try: imgg_prompt_text = working_memory.get_stuff_as_str(stuff_name) except WorkingMemoryStuffNotFoundError as exc: raise PipeInputError(f"Could not find a valid user image named '{stuff_name}' in the working_memory: {exc}") from exc + else: + raise UnexpectedPipeDefinitionError("You must provide an image gen prompt either as attribute of the pipe or as a single text input") imgg_config = get_config().cogt.imgg_config imgg_param_defaults = imgg_config.imgg_param_defaults @@ -125,7 +213,7 @@ async def _run_operator_pipe( nb_images = 1 if nb_images > 1: - generated_image_list = await get_content_generator().make_image_list( + generated_image_list = await content_generator.make_image_list( job_metadata=job_metadata, imgg_handle=imgg_handle, imgg_prompt=ImggPrompt( @@ -150,7 +238,7 @@ async def _run_operator_pipe( ) log.verbose(the_content, title="List of image contents") else: - generated_image = await get_content_generator().make_single_image( + generated_image = await content_generator.make_single_image( job_metadata=job_metadata, imgg_handle=imgg_handle, imgg_prompt=ImggPrompt( @@ -185,3 +273,22 @@ async def _run_operator_pipe( working_memory=working_memory, ) return pipe_output + + @override + async def _dry_run_operator_pipe( + self, + job_metadata: JobMetadata, + working_memory: WorkingMemory, + pipe_run_params: PipeRunParams, + output_name: Optional[str] = None, + ) -> PipeOutput: + log.warning(f"PipeImgGen: dry run operator pipe: {self.code}") + content_generator_dry 
= ContentGeneratorDry() + pipe_output = await self._run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + content_generator=content_generator_dry, + ) + return pipe_output diff --git a/pipelex/pipe_operators/pipe_img_gen_factory.py b/pipelex/pipe_operators/pipe_img_gen_factory.py index dc31859c9..4475ade36 100644 --- a/pipelex/pipe_operators/pipe_img_gen_factory.py +++ b/pipelex/pipe_operators/pipe_img_gen_factory.py @@ -6,14 +6,14 @@ from pipelex.cogt.imgg.imgg_handle import ImggHandle from pipelex.cogt.imgg.imgg_job_components import AspectRatio, Quality from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.exceptions import PipeDefinitionError from pipelex.pipe_operators.pipe_img_gen import PipeImgGen from pipelex.tools.typing.validation_utils import has_more_than_one_among_attributes_from_lists class PipeImgGenBlueprint(PipeBlueprint): - imgg_prompt: Optional[str] = None - imgg_prompt_stuff_name: Optional[str] = None + img_gen_prompt: Optional[str] = None imgg_handle: Optional[ImggHandle] = None aspect_ratio: Optional[AspectRatio] = Field(default=None, strict=False) quality: Optional[Quality] = Field(default=None, strict=False) @@ -30,7 +30,6 @@ def validate_imgg_prompt_and_imgg_prompt_stuff_name(self) -> Self: if excess_attributes_list := has_more_than_one_among_attributes_from_lists( self, [ - ["imgg_prompt", "imgg_prompt_stuff_name"], ["quality", "nb_steps"], ], ): @@ -52,11 +51,10 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, output_multiplicity=output_multiplicity, - imgg_prompt=pipe_blueprint.imgg_prompt, - imgg_prompt_stuff_name=pipe_blueprint.imgg_prompt_stuff_name, 
+ imgg_prompt=pipe_blueprint.img_gen_prompt, imgg_handle=pipe_blueprint.imgg_handle, aspect_ratio=pipe_blueprint.aspect_ratio, nb_steps=pipe_blueprint.nb_steps, diff --git a/pipelex/pipe_operators/pipe_jinja2.py b/pipelex/pipe_operators/pipe_jinja2.py index 11d6ea9ba..43fa4444e 100644 --- a/pipelex/pipe_operators/pipe_jinja2.py +++ b/pipelex/pipe_operators/pipe_jinja2.py @@ -6,6 +6,9 @@ from typing_extensions import Self, override from pipelex import log +from pipelex.cogt.content_generation.content_generator_dry import ContentGeneratorDry +from pipelex.cogt.content_generation.content_generator_protocol import ContentGeneratorProtocol +from pipelex.config import get_config from pipelex.core.concept_native import NativeConcept from pipelex.core.pipe_output import PipeOutput from pipelex.core.pipe_run_params import PipeRunParams @@ -40,6 +43,7 @@ class PipeJinja2(PipeOperator): jinja2: Optional[str] = None prompting_style: Optional[PromptingStyle] = None template_category: Jinja2TemplateCategory = Jinja2TemplateCategory.LLM_PROMPT + extra_context: Optional[Dict[str, Any]] = None @model_validator(mode="after") def validate_jinja2(self) -> Self: @@ -84,19 +88,21 @@ async def _run_operator_pipe( working_memory: WorkingMemory, pipe_run_params: PipeRunParams, output_name: Optional[str] = None, + content_generator: Optional[ContentGeneratorProtocol] = None, ) -> PipeJinja2Output: + content_generator = content_generator or get_content_generator() if pipe_run_params.is_multiple_output_required: raise PipeRunParamsError( f"PipeJinja2 does not suppport multiple outputs, got output_multiplicity = {pipe_run_params.output_multiplicity}" ) - if not self.output_concept_code: - raise PipeRunParamsError("PipeJinja2 must have an output_concept_code") context: Dict[str, Any] = working_memory.generate_stuff_artefact_dict() if pipe_run_params: context.update(**pipe_run_params.params) + if self.extra_context: + context.update(**self.extra_context) - jinja2_text = await 
get_content_generator().make_jinja2_text( + jinja2_text = await content_generator.make_jinja2_text( context=context, jinja2_name=self.jinja2_name, jinja2=self.jinja2, @@ -124,3 +130,27 @@ async def _run_operator_pipe( ) return pipe_output + + @override + async def _dry_run_operator_pipe( + self, + job_metadata: JobMetadata, + working_memory: WorkingMemory, + pipe_run_params: PipeRunParams, + output_name: Optional[str] = None, + ) -> PipeOutput: + content_generator_used: ContentGeneratorProtocol + if get_config().pipelex.dry_run_config.apply_to_jinja2_rendering: + log.warning(f"PipeJinja2: using dry run operator pipe for jinja2 rendering: {self.code}") + content_generator_used = ContentGeneratorDry() + else: + log.warning(f"PipeJinja2: using regular operator pipe for jinja2 rendering (dry run not applied to jinja2): {self.code}") + content_generator_used = get_content_generator() + pipe_output = await self._run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + content_generator=content_generator_used, + ) + return pipe_output diff --git a/pipelex/pipe_operators/pipe_jinja2_factory.py b/pipelex/pipe_operators/pipe_jinja2_factory.py index ab03b5e5a..b0dff7f05 100644 --- a/pipelex/pipe_operators/pipe_jinja2_factory.py +++ b/pipelex/pipe_operators/pipe_jinja2_factory.py @@ -4,6 +4,7 @@ from pipelex.config import get_config from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.exceptions import PipeDefinitionError from pipelex.pipe_operators.pipe_jinja2 import PipeJinja2 from pipelex.tools.templating.jinja2_environment import Jinja2TemplateCategory @@ -41,7 +42,7 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), 
output_concept_code=pipe_blueprint.output, jinja2_name=pipe_blueprint.jinja2_name, jinja2=preprocessed_template, diff --git a/pipelex/pipe_operators/pipe_llm.py b/pipelex/pipe_operators/pipe_llm.py index 951529fd3..272ac266e 100644 --- a/pipelex/pipe_operators/pipe_llm.py +++ b/pipelex/pipe_operators/pipe_llm.py @@ -5,15 +5,18 @@ from typing_extensions import Self, override from pipelex import log +from pipelex.cogt.content_generation.content_generator_dry import ContentGeneratorDry +from pipelex.cogt.content_generation.content_generator_protocol import ContentGeneratorProtocol from pipelex.cogt.llm.llm_models.llm_deck import LLMSettingChoices from pipelex.cogt.llm.llm_models.llm_deck_check import check_llm_setting_with_deck from pipelex.cogt.llm.llm_models.llm_setting import LLMSetting from pipelex.cogt.llm.llm_prompt import LLMPrompt from pipelex.cogt.llm.llm_prompt_factory_abstract import LLMPromptFactoryAbstract -from pipelex.config import get_config +from pipelex.config import StaticValidationReaction, get_config from pipelex.core.concept_factory import ConceptFactory from pipelex.core.concept_native import NativeConcept, NativeConceptClass from pipelex.core.domain import Domain, SpecialDomain +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.core.pipe_output import PipeOutput from pipelex.core.pipe_run_params import ( PipeOutputMultiplicity, @@ -21,10 +24,16 @@ PipeRunParams, output_multiplicity_to_apply, ) -from pipelex.core.stuff_content import ListContent, StuffContent, TextContent +from pipelex.core.stuff_content import ListContent, StructuredContent, StuffContent, TextContent from pipelex.core.stuff_factory import StuffFactory from pipelex.core.working_memory import WorkingMemory -from pipelex.exceptions import PipeDefinitionError, PipeExecutionError +from pipelex.exceptions import ( + PipeDefinitionError, + PipeInputError, + PipeInputNotFoundError, + StaticValidationError, + StaticValidationErrorType, +) from pipelex.hub import ( 
get_concept_provider, get_content_generator, @@ -60,6 +69,91 @@ class PipeLLM(PipeOperator): system_prompt_to_structure: Optional[str] = None output_multiplicity: Optional[PipeOutputMultiplicity] = None + def needed_inputs(self) -> PipeInputSpec: + return self.pipe_llm_prompt.needed_inputs() + + @model_validator(mode="after") + def validate_inputs(self) -> Self: + self._validate_inputs() + return self + + def _validate_inputs(self): + concept_provider = get_concept_provider() + static_validation_config = get_config().pipelex.static_validation_config + default_reaction = static_validation_config.default_reaction + reactions = static_validation_config.reactions + + the_needed_inputs = self.needed_inputs() + # check all required variables are in the inputs + for required_variable_name, requirement_expression, concept_code in the_needed_inputs.detailed_requirements: + if required_variable_name not in self.inputs.variables: + missing_input_var_error = StaticValidationError( + error_type=StaticValidationErrorType.MISSING_INPUT_VARIABLE, + domain_code=self.domain, + pipe_code=self.code, + variable_names=[required_variable_name], + ) + match reactions.get(StaticValidationErrorType.MISSING_INPUT_VARIABLE, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(missing_input_var_error.desc()) + case StaticValidationReaction.RAISE: + raise missing_input_var_error + + # there is one case where the needed input is of specific concept: the user_images + if concept_code == NativeConcept.IMAGE.code: + try: + concept_code_of_declared_input = self.inputs.get_required_concept_code(variable_name=required_variable_name) + except PipeInputNotFoundError as exc: + raise PipeInputError( + f"Input variable '{required_variable_name}' is not in this PipeLLM '{self.code}' input spec: {self.inputs}" + ) from exc + if not concept_provider.is_compatible_by_concept_code( + tested_concept_code=concept_code_of_declared_input, + 
wanted_concept_code=concept_code, + ): + if required_variable_name != requirement_expression: + # the required_input is a sub-attribute of the required variable + # TODO: check that the sub-attribute is compatible with the concept code + # let's check at least that the input is a structured concept + input_concept = concept_provider.get_required_concept(concept_code=concept_code_of_declared_input) + input_concept_class_name = input_concept.structure_class_name + input_concept_class = class_registry.get_required_subclass(name=input_concept_class_name, base_class=StuffContent) + if issubclass(input_concept_class, StructuredContent): + continue + inadequate_input_concept_error = StaticValidationError( + error_type=StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, + domain_code=self.domain, + pipe_code=self.code, + variable_names=[required_variable_name], + provided_concept_code=concept_code_of_declared_input, + explanation="The input provided for LLM Vision must be an image or a concept that refines image", + ) + match reactions.get(StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(inadequate_input_concept_error.desc()) + case StaticValidationReaction.RAISE: + raise inadequate_input_concept_error + # check that all inputs are in the required variables + for input_name in self.inputs.variables: + if input_name not in the_needed_inputs.required_names: + extraneous_input_var_error = StaticValidationError( + error_type=StaticValidationErrorType.EXTRANEOUS_INPUT_VARIABLE, + domain_code=self.domain, + pipe_code=self.code, + variable_names=[input_name], + ) + match reactions.get(StaticValidationErrorType.EXTRANEOUS_INPUT_VARIABLE, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(extraneous_input_var_error.desc()) + case StaticValidationReaction.RAISE: + raise extraneous_input_var_error + 
@model_validator(mode="after") def validate_output_concept_consistency(self) -> Self: if self.structuring_method is not None: @@ -70,12 +164,7 @@ def validate_output_concept_consistency(self) -> Self: @override def validate_with_libraries(self): - if self.input_concept_code and get_concept_provider().is_compatible_by_concept_code( - tested_concept_code=self.input_concept_code, - wanted_concept_code=NativeConcept.IMAGE.code, - ): - if not self.pipe_llm_prompt.user_images: - raise PipeDefinitionError(f"No user images provided for concept '{self.input_concept_code}' but it's required") + self._validate_inputs() self.pipe_llm_prompt.validate_with_libraries() if self.prompt_template_to_structure: get_template(template_name=self.prompt_template_to_structure) @@ -118,7 +207,9 @@ async def _run_operator_pipe( working_memory: WorkingMemory, pipe_run_params: PipeRunParams, output_name: Optional[str] = None, + content_generator: Optional[ContentGeneratorProtocol] = None, ) -> PipeLLMOutput: + content_generator = content_generator or get_content_generator() # interpret / unwrap the arguments log.debug(f"PipeLLM pipe_code = {self.code}") if self.output_concept_code == ConceptFactory.make_concept_code( @@ -190,22 +281,10 @@ async def _run_operator_pipe( ) llm_prompt_1 = cast(PipeLLMPromptOutput, pipe_output).llm_prompt - if input_concept_code := self.input_concept_code: - if ( - get_concept_provider().is_compatible_by_concept_code( - tested_concept_code=input_concept_code, - wanted_concept_code=NativeConcept.IMAGE.code, - ) - and not llm_prompt_1.user_images - ): - raise PipeExecutionError( - f"No user images provided in the prompt with input concept '{input_concept_code}' but it's required for pipe '{self.code}'" - ) - the_content: StuffContent if output_concept.structure_class_name == NativeConceptClass.TEXT and not is_multiple_output: log.debug(f"PipeLLM generating a single text output: {self.class_name}_gen_text") - generated_text: str = await 
get_content_generator().make_llm_text( + generated_text: str = await content_generator.make_llm_text( job_metadata=job_metadata, llm_prompt_for_text=llm_prompt_1, llm_setting_main=self.llm_setting_main, @@ -275,6 +354,7 @@ async def _run_operator_pipe( output_class_name=output_concept.structure_class_name, llm_prompt_1=llm_prompt_1, llm_prompt_2_factory=llm_prompt_2_factory, + content_generator=content_generator, ) output_stuff = StuffFactory.make_stuff_using_concept( @@ -301,6 +381,7 @@ async def _llm_gen_object_stuff_content( output_class_name: str, llm_prompt_1: LLMPrompt, llm_prompt_2_factory: Optional[LLMPromptFactoryAbstract], + content_generator: ContentGeneratorProtocol, ) -> StuffContent: content_class: Type[StuffContent] = class_registry.get_required_subclass(name=output_class_name, base_class=StuffContent) task_desc: str @@ -319,7 +400,7 @@ async def _llm_gen_object_stuff_content( method_desc = "text_then_object" log.dev(f"{task_desc} by {method_desc}") - generated_objects = await get_content_generator().make_text_then_object_list( + generated_objects = await content_generator.make_text_then_object_list( job_metadata=job_metadata, object_class=content_class, llm_prompt_for_text=llm_prompt_1, @@ -332,7 +413,7 @@ async def _llm_gen_object_stuff_content( # We're generating a list of objects directly method_desc = "object_direct" log.dev(f"{task_desc} by {method_desc}, content_class={content_class.__name__}") - generated_objects = await get_content_generator().make_object_list_direct( + generated_objects = await content_generator.make_object_list_direct( job_metadata=job_metadata, object_class=content_class, llm_prompt_for_object_list=llm_prompt_1, @@ -349,7 +430,7 @@ async def _llm_gen_object_stuff_content( # We're generating a single object using preliminary text method_desc = "text_then_object" log.verbose(f"{task_desc} by {method_desc}") - generated_object = await get_content_generator().make_text_then_object( + generated_object = await 
content_generator.make_text_then_object( job_metadata=job_metadata, object_class=content_class, llm_prompt_for_text=llm_prompt_1, @@ -362,7 +443,7 @@ async def _llm_gen_object_stuff_content( # We're generating a single object directly method_desc = "object_direct" log.verbose(f"{task_desc} by {method_desc}, content_class={content_class.__name__}") - generated_object = await get_content_generator().make_object_direct( + generated_object = await content_generator.make_object_direct( job_metadata=job_metadata, object_class=content_class, llm_prompt_for_object=llm_prompt_1, @@ -372,3 +453,22 @@ async def _llm_gen_object_stuff_content( the_content = generated_object return the_content + + @override + async def _dry_run_operator_pipe( + self, + job_metadata: JobMetadata, + working_memory: WorkingMemory, + pipe_run_params: PipeRunParams, + output_name: Optional[str] = None, + ) -> PipeOutput: + log.warning(f"PipeLLM: dry run operator pipe: {self.code}") + content_generator_dry = ContentGeneratorDry() + pipe_output = await self._run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + content_generator=content_generator_dry, + ) + return pipe_output diff --git a/pipelex/pipe_operators/pipe_llm_factory.py b/pipelex/pipe_operators/pipe_llm_factory.py index c8e883bf7..f873ed575 100644 --- a/pipelex/pipe_operators/pipe_llm_factory.py +++ b/pipelex/pipe_operators/pipe_llm_factory.py @@ -6,6 +6,7 @@ from pipelex.cogt.llm.llm_models.llm_deck import LLMSettingChoices from pipelex.cogt.llm.llm_models.llm_setting import LLMSettingOrPresetId from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.core.pipe_run_params import make_output_multiplicity from pipelex.exceptions import PipeDefinitionError from pipelex.hub import get_optional_domain @@ -43,6 +44,7 @@ class PipeLLMBlueprint(PipeBlueprint): 
nb_output: Optional[int] = None multiple_output: Optional[bool] = None + # TODO: chack that the listed images are listed in the inputs @model_validator(mode="after") def validate_multiple_output(self) -> Self: if excess_attributes_list := has_more_than_one_among_attributes_from_lists( @@ -114,6 +116,7 @@ def make_pipe_from_blueprint( pipe_llm_prompt = PipeLLMPrompt( code="adhoc_for_pipe_llm_prompt", domain=domain_code, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), system_prompt_pipe_jinja2=system_prompt_pipe_jinja2, system_prompt_verbatim_name=pipe_blueprint.system_prompt_name, system_prompt=pipe_blueprint.system_prompt or system_prompt, @@ -141,7 +144,7 @@ def make_pipe_from_blueprint( domain=domain_code, code=pipe_code, definition=pipe_blueprint.definition, - input_concept_code=pipe_blueprint.input, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), output_concept_code=pipe_blueprint.output, pipe_llm_prompt=pipe_llm_prompt, llm_choices=llm_settings, diff --git a/pipelex/pipe_operators/pipe_llm_prompt.py b/pipelex/pipe_operators/pipe_llm_prompt.py index 87e6d0b4d..34cd6cd5f 100644 --- a/pipelex/pipe_operators/pipe_llm_prompt.py +++ b/pipelex/pipe_operators/pipe_llm_prompt.py @@ -10,6 +10,7 @@ from pipelex.cogt.llm.llm_prompt import LLMPrompt from pipelex.core.concept import Concept from pipelex.core.concept_native import NativeConcept +from pipelex.core.pipe_input_spec import PipeInputSpec from pipelex.core.pipe_output import PipeOutput from pipelex.core.pipe_run_params import PipeRunParams from pipelex.core.stuff_content import ImageContent, LLMPromptContent, StuffContent @@ -19,9 +20,7 @@ PipeDefinitionError, PipeInputError, PipeRunParamsError, - WorkingMemoryNotFoundError, - WorkingMemoryStuffNotFoundError, - WorkingMemoryTypeError, + WorkingMemoryVariableError, ) from pipelex.hub import get_template from pipelex.pipe_operators.pipe_jinja2 import PipeJinja2, PipeJinja2Output @@ -94,6 +93,26 @@ def validate_with_libraries(self): if 
self.system_prompt_pipe_jinja2: self.system_prompt_pipe_jinja2.validate_with_libraries() + def needed_inputs(self) -> PipeInputSpec: + conceptless_required_variables: Set[str] = set() + if self.user_pipe_jinja2: + conceptless_required_variables.update(self.user_pipe_jinja2.required_variables()) + if self.system_prompt_pipe_jinja2: + conceptless_required_variables.update(self.system_prompt_pipe_jinja2.required_variables()) + + pipe_input_spec = PipeInputSpec(root={}) + for conceptless_required_variable in conceptless_required_variables: + if conceptless_required_variable.startswith("_"): + # variables starting with _ are run parameters, not inputs + continue + pipe_input_spec.add_requirement(variable_name=conceptless_required_variable, concept_code=NativeConcept.ANYTHING.code) + + if self.user_images: + for user_image in self.user_images: + pipe_input_spec.add_requirement(variable_name=user_image, concept_code=NativeConcept.IMAGE.code) + + return pipe_input_spec + @override def required_variables(self) -> Set[str]: required_variables: Set[str] = set() @@ -102,7 +121,8 @@ def required_variables(self) -> Set[str]: if self.system_prompt_pipe_jinja2: required_variables.update(self.system_prompt_pipe_jinja2.required_variables()) if self.user_images: - required_variables.update(self.user_images) + user_images_top_object_name = [user_image.split(".", 1)[0] for user_image in self.user_images] + required_variables.update(user_images_top_object_name) return required_variables @override @@ -117,8 +137,6 @@ async def _run_operator_pipe( raise PipeRunParamsError( f"PipeLLMPrompt does not suppport multiple outputs, got output_multiplicity = {pipe_run_params.output_multiplicity}" ) - if not self.output_concept_code: - raise PipeRunParamsError("PipeLLMPrompt must have a fixed non-None output_concept_code") ############################################################ # User images @@ -128,8 +146,8 @@ async def _run_operator_pipe( for user_image_name in self.user_images: 
log.debug(f"Getting user image '{user_image_name}' from context") try: - prompt_image_content = working_memory.get_stuff_attribute(name=user_image_name, wanted_type=ImageContent) - except (WorkingMemoryNotFoundError, WorkingMemoryStuffNotFoundError, WorkingMemoryTypeError) as exc: + prompt_image_content = working_memory.get_stuff_or_attribute(name=user_image_name, wanted_type=ImageContent) + except WorkingMemoryVariableError as exc: raise PipeInputError(f"Could not find a valid user image named '{user_image_name}' in the working_memory: {exc}") from exc if base_64 := prompt_image_content.base_64: diff --git a/pipelex/pipe_operators/pipe_ocr.py b/pipelex/pipe_operators/pipe_ocr.py index 721b68f8f..7f0b20fac 100644 --- a/pipelex/pipe_operators/pipe_ocr.py +++ b/pipelex/pipe_operators/pipe_ocr.py @@ -1,24 +1,36 @@ from typing import List, Optional -from pydantic import model_validator +from pydantic import field_validator, model_validator from typing_extensions import Self, override from pipelex import log +from pipelex.cogt.content_generation.content_generator_dry import ContentGeneratorDry +from pipelex.cogt.content_generation.content_generator_protocol import ContentGeneratorProtocol from pipelex.cogt.ocr.ocr_engine import OcrEngine from pipelex.cogt.ocr.ocr_handle import OcrHandle from pipelex.cogt.ocr.ocr_input import OcrInput from pipelex.cogt.ocr.ocr_job_components import OcrJobConfig, OcrJobParams +from pipelex.config import StaticValidationReaction, get_config +from pipelex.core.concept_native import NativeConcept from pipelex.core.pipe_output import PipeOutput -from pipelex.core.pipe_run_params import PipeRunParams +from pipelex.core.pipe_run_params import ( + PipeRunParams, +) from pipelex.core.stuff_content import ImageContent, ListContent, PageContent, TextAndImagesContent, TextContent from pipelex.core.stuff_factory import StuffFactory from pipelex.core.working_memory import WorkingMemory -from pipelex.exceptions import PipeDefinitionError -from 
pipelex.hub import get_content_generator +from pipelex.exceptions import ( + PipeDefinitionError, + StaticValidationError, + StaticValidationErrorType, +) +from pipelex.hub import ( + get_concept_provider, + get_content_generator, +) from pipelex.pipe_operators.pipe_operator import PipeOperator from pipelex.pipeline.job_metadata import JobMetadata from pipelex.tools.pdf.pypdfium2_renderer import pypdfium2_renderer -from pipelex.tools.typing.validation_utils import has_exactly_one_among_attributes_from_list class PipeOcrOutput(PipeOutput): @@ -27,19 +39,94 @@ class PipeOcrOutput(PipeOutput): class PipeOcr(PipeOperator): ocr_engine: Optional[OcrEngine] = None - image_stuff_name: Optional[str] = None - pdf_stuff_name: Optional[str] = None should_caption_images: bool should_include_images: bool should_include_page_views: bool page_views_dpi: int + image_stuff_name: Optional[str] = None + pdf_stuff_name: Optional[str] = None + + @field_validator("image_stuff_name", "pdf_stuff_name") + @classmethod + def validate_input_stuff_name_not_provided_as_attribute(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + raise PipeDefinitionError("image_stuff_name and pdf_stuff_name must be None before input validation") + return v + @model_validator(mode="after") - def validate_exactly_one_input_stuff_name(self) -> Self: - if not has_exactly_one_among_attributes_from_list(self, attributes_list=["image_stuff_name", "pdf_stuff_name"]): - raise PipeDefinitionError("Exactly one of 'image_stuff_name' or 'pdf_stuff_name' must be provided") + def validate_inputs(self) -> Self: + self._validate_inputs() return self + def _validate_inputs(self): + concept_provider = get_concept_provider() + static_validation_config = get_config().pipelex.static_validation_config + default_reaction = static_validation_config.default_reaction + reactions = static_validation_config.reactions + + # check that we have either an image or a pdf in inputs, at most one of them and nothing else + 
candidate_prompt_var_names: List[str] = [] + for input_name, input_concept_code in self.inputs.items: + log.debug(f"Validating input '{input_name}' with concept code '{input_concept_code}'") + if concept_provider.is_compatible_by_concept_code( + tested_concept_code=input_concept_code, + wanted_concept_code=NativeConcept.IMAGE.code, + ): + self.image_stuff_name = input_name + candidate_prompt_var_names.append(input_name) + elif concept_provider.is_compatible_by_concept_code( + tested_concept_code=input_concept_code, + wanted_concept_code=NativeConcept.PDF.code, + ): + self.pdf_stuff_name = input_name + candidate_prompt_var_names.append(input_name) + else: + inadequate_input_concept_error = StaticValidationError( + error_type=StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, + domain_code=self.domain, + pipe_code=self.code, + variable_names=[input_name], + provided_concept_code=input_concept_code, + explanation="For OCR you must provide either a pdf or an image or a concept that refines them", + ) + match reactions.get(StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(inadequate_input_concept_error.desc()) + case StaticValidationReaction.RAISE: + raise inadequate_input_concept_error + if len(candidate_prompt_var_names) > 1: + too_many_candidate_inputs_error = StaticValidationError( + error_type=StaticValidationErrorType.TOO_MANY_CANDIDATE_INPUTS, + domain_code=self.domain, + pipe_code=self.code, + variable_names=candidate_prompt_var_names, + explanation="Only one image or pdf can be provided for OCR", + ) + match reactions.get(StaticValidationErrorType.TOO_MANY_CANDIDATE_INPUTS, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(too_many_candidate_inputs_error.desc()) + case StaticValidationReaction.RAISE: + raise too_many_candidate_inputs_error + elif len(candidate_prompt_var_names) == 
0: + missing_input_var_error = StaticValidationError( + error_type=StaticValidationErrorType.MISSING_INPUT_VARIABLE, + domain_code=self.domain, + pipe_code=self.code, + explanation="For OCR you must provide either a pdf or an image or a concept that refines them", + ) + match reactions.get(StaticValidationErrorType.MISSING_INPUT_VARIABLE, default_reaction): + case StaticValidationReaction.IGNORE: + pass + case StaticValidationReaction.LOG: + log.error(missing_input_var_error.desc()) + case StaticValidationReaction.RAISE: + raise missing_input_var_error + @override async def _run_operator_pipe( self, @@ -47,9 +134,9 @@ async def _run_operator_pipe( working_memory: WorkingMemory, pipe_run_params: PipeRunParams, output_name: Optional[str] = None, + content_generator: Optional[ContentGeneratorProtocol] = None, ) -> PipeOcrOutput: - if not self.output_concept_code: - raise PipeDefinitionError("PipeOcr should have a non-None output_concept_code") + content_generator = content_generator or get_content_generator() image_uri: Optional[str] = None pdf_uri: Optional[str] = None @@ -73,7 +160,7 @@ async def _run_operator_pipe( image_uri=image_uri, pdf_uri=pdf_uri, ) - ocr_output = await get_content_generator().make_ocr_extract_pages( + ocr_output = await content_generator.make_ocr_extract_pages( ocr_input=ocr_input, ocr_handle=ocr_handle, job_metadata=job_metadata, @@ -84,10 +171,12 @@ async def _run_operator_pipe( # Build the output stuff, which is a list of page contents page_view_contents: List[ImageContent] = [] if self.should_include_page_views: + log.debug(f"should_include_page_views: {self.should_include_page_views}, pdf_uri: {pdf_uri}, image_uri: {image_uri}") if pdf_uri: for page in ocr_output.pages.values(): if page.page_view: page_view_contents.append(ImageContent.make_from_extracted_image(extracted_image=page.page_view)) + log.debug(f"page_view_contents: {page_view_contents}") needs_to_generate_page_views: bool if len(page_view_contents) == 0: log.debug("No page 
views found in the OCR output") @@ -108,7 +197,8 @@ async def _run_operator_pipe( page_contents: List[PageContent] = [] for page_index, page in ocr_output.pages.items(): images = [ImageContent.make_from_extracted_image(extracted_image=img) for img in page.extracted_images] - page_view = page_view_contents[page_index] if self.should_include_page_views else None + log.debug(f"images: {images}, page_view_contents: {page_view_contents}, index: {page_index}") + page_view = page_view_contents[page_index - 1] if self.should_include_page_views else None page_contents.append( PageContent( text_and_images=TextAndImagesContent( @@ -136,3 +226,22 @@ async def _run_operator_pipe( working_memory=working_memory, ) return pipe_output + + @override + async def _dry_run_operator_pipe( + self, + job_metadata: JobMetadata, + working_memory: WorkingMemory, + pipe_run_params: PipeRunParams, + output_name: Optional[str] = None, + ) -> PipeOutput: + log.warning(f"PipeOcr: dry run operator pipe: {self.code}") + content_generator_dry = ContentGeneratorDry() + pipe_output = await self._run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + content_generator=content_generator_dry, + ) + return pipe_output diff --git a/pipelex/pipe_operators/pipe_ocr_factory.py b/pipelex/pipe_operators/pipe_ocr_factory.py index b973b09bc..6d4e7290c 100644 --- a/pipelex/pipe_operators/pipe_ocr_factory.py +++ b/pipelex/pipe_operators/pipe_ocr_factory.py @@ -1,32 +1,22 @@ from typing import Any, Dict, Optional -from pydantic import model_validator -from typing_extensions import Self, override +from typing_extensions import override from pipelex.cogt.ocr.ocr_engine_factory import OcrEngineFactory, OcrPlatform from pipelex.cogt.ocr.ocr_handle import OcrHandle from pipelex.core.pipe_blueprint import PipeBlueprint, PipeSpecificFactoryProtocol -from pipelex.exceptions import PipeDefinitionError +from pipelex.core.pipe_input_spec 
import PipeInputSpec from pipelex.pipe_operators.pipe_ocr import PipeOcr -from pipelex.tools.typing.validation_utils import has_exactly_one_among_attributes_from_list class PipeOcrBlueprint(PipeBlueprint): definition: Optional[str] = None - image: Optional[str] = None - pdf: Optional[str] = None ocr_platform: Optional[OcrPlatform] = None page_images: bool = False page_image_captions: bool = False page_views: bool = False page_views_dpi: int = 300 - @model_validator(mode="after") - def validate_input_source(self) -> Self: - if not has_exactly_one_among_attributes_from_list(self, attributes_list=["image", "pdf"]): - raise PipeDefinitionError("Either 'image' or 'pdf' must be provided") - return self - class PipeOcrFactory(PipeSpecificFactoryProtocol[PipeOcrBlueprint, PipeOcr]): @classmethod @@ -48,8 +38,7 @@ def make_pipe_from_blueprint( definition=pipe_blueprint.definition, ocr_engine=ocr_engine, output_concept_code=pipe_blueprint.output, - image_stuff_name=pipe_blueprint.image, - pdf_stuff_name=pipe_blueprint.pdf, + inputs=PipeInputSpec(root=pipe_blueprint.inputs or {}), should_include_images=pipe_blueprint.page_images, should_caption_images=pipe_blueprint.page_image_captions, should_include_page_views=pipe_blueprint.page_views, diff --git a/pipelex/pipe_operators/pipe_operator.py b/pipelex/pipe_operators/pipe_operator.py index 68ac38318..cc4a8eb65 100644 --- a/pipelex/pipe_operators/pipe_operator.py +++ b/pipelex/pipe_operators/pipe_operator.py @@ -3,9 +3,10 @@ from typing_extensions import override +from pipelex import log from pipelex.core.pipe_abstract import PipeAbstract from pipelex.core.pipe_output import PipeOutput -from pipelex.core.pipe_run_params import PipeRunParams +from pipelex.core.pipe_run_params import PipeRunMode, PipeRunParams from pipelex.core.working_memory import WorkingMemory from pipelex.hub import get_activity_manager from pipelex.pipeline.activity.activity_models import ActivityReport @@ -29,12 +30,21 @@ async def run_pipe( ) 
job_metadata.update(updated_metadata=updated_metadata) - pipe_output = await self._run_operator_pipe( - job_metadata=job_metadata, - working_memory=working_memory, - pipe_run_params=pipe_run_params, - output_name=output_name, - ) + match pipe_run_params.run_mode: + case PipeRunMode.LIVE: + pipe_output = await self._run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + ) + case PipeRunMode.DRY: + pipe_output = await self._dry_run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + ) get_activity_manager().dispatch_activity( activity_report=ActivityReport( job_metadata=job_metadata, @@ -55,3 +65,20 @@ async def _run_operator_pipe( output_name: Optional[str] = None, ) -> PipeOutput: pass + + async def _dry_run_operator_pipe( + self, + job_metadata: JobMetadata, + working_memory: WorkingMemory, + pipe_run_params: PipeRunParams, + output_name: Optional[str] = None, + ) -> PipeOutput: + log.warning( + f"PipeOperator: dry run method called for operator pipe: {self.code}, but no dry run method is implemented for {self.__class__.__name__}" + ) + return await self._run_operator_pipe( + job_metadata=job_metadata, + working_memory=working_memory, + pipe_run_params=pipe_run_params, + output_name=output_name, + ) diff --git a/pipelex/pipelex.toml b/pipelex/pipelex.toml index 114b49eb4..d7a47ff48 100644 --- a/pipelex/pipelex.toml +++ b/pipelex/pipelex.toml @@ -197,6 +197,19 @@ is_default_text_then_structure = false # turn this to true to get better result structure_from_preliminary_text_system = "structure_from_preliminary_text_system" structure_from_preliminary_text_user = "structure_from_preliminary_text_user" +#################################################################################################### +# Static validation config 
+#################################################################################################### + +[pipelex.static_validation_config] +default_reaction = "raise" + +[pipelex.static_validation_config.reactions] +# enable one of these to tolerate some static validation errors, like you would for pyright or some linters +# missing_input_variable = "log" +# extraneous_input_variable = "log" +# inadequate_input_concept = "log" + #################################################################################################### # History graph config #################################################################################################### @@ -224,3 +237,11 @@ choice_edge_style = "-----" [pipelex.pipe_run_config] pipe_stack_limit = 20 + +#################################################################################################### +# Dry run config +#################################################################################################### + +[pipelex.dry_run_config] +apply_to_jinja2_rendering = false +text_gen_truncate_length = 256 \ No newline at end of file diff --git a/pipelex/pipeline/execute.py b/pipelex/pipeline/execute.py index 74a7ace42..06db5bded 100644 --- a/pipelex/pipeline/execute.py +++ b/pipelex/pipeline/execute.py @@ -2,7 +2,7 @@ from pipelex import pretty_print from pipelex.core.pipe_output import PipeOutput -from pipelex.core.pipe_run_params import PipeOutputMultiplicity +from pipelex.core.pipe_run_params import PipeOutputMultiplicity, PipeRunMode from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.core.working_memory import WorkingMemory from pipelex.hub import get_pipe_router, get_pipeline_manager, get_report_delegate, get_required_pipe @@ -16,6 +16,7 @@ async def execute_pipeline( output_name: Optional[str] = None, output_multiplicity: Optional[PipeOutputMultiplicity] = None, dynamic_output_concept_code: Optional[str] = None, + pipe_run_mode: PipeRunMode = PipeRunMode.LIVE, ) -> 
Tuple[PipeOutput, str]: """Execute a pipeline and wait for its completion. @@ -35,6 +36,8 @@ async def execute_pipeline( Output multiplicity. dynamic_output_concept_code: Override the dynamic output concept code. + pipe_run_mode: + Pipe run mode: ``PipeRunMode.LIVE`` or ``PipeRunMode.DRY``. Returns ------- @@ -53,6 +56,7 @@ async def execute_pipeline( pipe_run_params = PipeRunParamsFactory.make_run_params( output_multiplicity=output_multiplicity, dynamic_output_concept_code=dynamic_output_concept_code, + pipe_run_mode=pipe_run_mode, ) pretty_print(pipe, title=f"Running pipe '{pipe_code}'") diff --git a/pipelex/pipeline/start.py b/pipelex/pipeline/start.py index 022539847..ac4b72d87 100644 --- a/pipelex/pipeline/start.py +++ b/pipelex/pipeline/start.py @@ -3,7 +3,7 @@ from pipelex import pretty_print from pipelex.core.pipe_output import PipeOutput -from pipelex.core.pipe_run_params import PipeOutputMultiplicity +from pipelex.core.pipe_run_params import PipeOutputMultiplicity, PipeRunMode from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.core.working_memory import WorkingMemory from pipelex.hub import get_pipe_router, get_pipeline_manager, get_report_delegate, get_required_pipe @@ -17,6 +17,7 @@ async def start_pipeline( output_name: Optional[str] = None, output_multiplicity: Optional[PipeOutputMultiplicity] = None, dynamic_output_concept_code: Optional[str] = None, + pipe_run_mode: PipeRunMode = PipeRunMode.LIVE, ) -> Tuple[str, asyncio.Task[PipeOutput]]: """Start a pipeline in the background. @@ -37,7 +38,8 @@ async def start_pipeline( Output multiplicity. dynamic_output_concept_code: Override the dynamic output concept code. - + pipe_run_mode: + Pipe run mode: ``PipeRunMode.LIVE`` or ``PipeRunMode.DRY``. 
Returns ------- Tuple[str, asyncio.Task[PipeOutput]] @@ -57,6 +59,7 @@ async def start_pipeline( pipe_run_params = PipeRunParamsFactory.make_run_params( output_multiplicity=output_multiplicity, dynamic_output_concept_code=dynamic_output_concept_code, + pipe_run_mode=pipe_run_mode, ) pretty_print(pipe, title=f"Starting pipe '{pipe_code}' (background)") diff --git a/pipelex/plugins/anthropic/anthropic_llm_worker.py b/pipelex/plugins/anthropic/anthropic_llm_worker.py index 40ae24dd6..85aa8b63a 100644 --- a/pipelex/plugins/anthropic/anthropic_llm_worker.py +++ b/pipelex/plugins/anthropic/anthropic_llm_worker.py @@ -7,7 +7,6 @@ from pipelex import log from pipelex.cogt.exceptions import LLMCompletionError, LLMEngineParameterError, SdkTypeError from pipelex.cogt.llm.llm_job import LLMJob -from pipelex.cogt.llm.llm_job_func import llm_job_func from pipelex.cogt.llm.llm_models.llm_engine import LLMEngine from pipelex.cogt.llm.llm_models.llm_platform import LLMPlatform from pipelex.cogt.llm.llm_worker_abstract import LLMWorkerAbstract @@ -71,8 +70,7 @@ def _adapt_max_tokens(self, max_tokens: Optional[int]) -> int: return max_tokens @override - @llm_job_func - async def gen_text( + async def _gen_text( self, llm_job: LLMJob, ) -> str: @@ -102,8 +100,7 @@ async def gen_text( return full_reply_content @override - @llm_job_func - async def gen_object( + async def _gen_object( self, llm_job: LLMJob, schema: Type[BaseModelTypeVar], diff --git a/pipelex/plugins/bedrock/bedrock_llm_worker.py b/pipelex/plugins/bedrock/bedrock_llm_worker.py index 853fbba0f..95f4ec264 100644 --- a/pipelex/plugins/bedrock/bedrock_llm_worker.py +++ b/pipelex/plugins/bedrock/bedrock_llm_worker.py @@ -5,7 +5,6 @@ from pipelex import log from pipelex.cogt.exceptions import LLMCapabilityError, LLMEngineParameterError, SdkTypeError from pipelex.cogt.llm.llm_job import LLMJob -from pipelex.cogt.llm.llm_job_func import llm_job_func from pipelex.cogt.llm.llm_models.llm_engine import LLMEngine from 
pipelex.cogt.llm.llm_worker_abstract import LLMWorkerAbstract from pipelex.cogt.llm.structured_output import StructureMethod @@ -42,8 +41,7 @@ def __init__( self.bedrock_client_for_text = sdk_instance @override - @llm_job_func - async def gen_text( + async def _gen_text( self, llm_job: LLMJob, ) -> str: @@ -63,8 +61,7 @@ async def gen_text( return bedrock_response_text @override - @llm_job_func - async def gen_object( + async def _gen_object( self, llm_job: LLMJob, schema: Type[BaseModelTypeVar], diff --git a/pipelex/plugins/fal/fal_imgg_worker.py b/pipelex/plugins/fal/fal_imgg_worker.py index 31aa11973..f3884a5ee 100644 --- a/pipelex/plugins/fal/fal_imgg_worker.py +++ b/pipelex/plugins/fal/fal_imgg_worker.py @@ -8,7 +8,7 @@ from pipelex.cogt.image.generated_image import GeneratedImage from pipelex.cogt.imgg.imgg_engine import ImggEngine from pipelex.cogt.imgg.imgg_job import ImggJob -from pipelex.cogt.imgg.imgg_worker_abstract import ImggWorkerAbstract, imgg_job_func +from pipelex.cogt.imgg.imgg_worker_abstract import ImggWorkerAbstract from pipelex.plugins.fal.fal_factory import FalFactory from pipelex.reporting.reporting_protocol import ReportingProtocol @@ -28,8 +28,7 @@ def __init__( self.fal_async_client = sdk_instance @override - @imgg_job_func - async def gen_image( + async def _gen_image( self, imgg_job: ImggJob, ) -> GeneratedImage: @@ -61,8 +60,7 @@ async def gen_image( return generated_image @override - @imgg_job_func - async def gen_image_list( + async def _gen_image_list( self, imgg_job: ImggJob, nb_images: int, diff --git a/pipelex/plugins/mistral/mistral_factory.py b/pipelex/plugins/mistral/mistral_factory.py index a0658a5c1..9115bb15e 100644 --- a/pipelex/plugins/mistral/mistral_factory.py +++ b/pipelex/plugins/mistral/mistral_factory.py @@ -74,7 +74,7 @@ def make_mistral_image_url(cls, prompt_image: PromptImage) -> ImageURLChunk: # TODO: use actual image type return ImageURLChunk(image_url=f"data:image/png;base64,{image_bytes}") elif 
isinstance(prompt_image, PromptImageBytes): - image_bytes = encode_to_base64(prompt_image.b64_image_bytes).decode("utf-8") + image_bytes = encode_to_base64(prompt_image.base_64).decode("utf-8") # TODO: use actual image type return ImageURLChunk(image_url=f"data:image/png;base64,{image_bytes}") else: diff --git a/pipelex/plugins/mistral/mistral_llm_worker.py b/pipelex/plugins/mistral/mistral_llm_worker.py index 7441a0263..433b531fc 100644 --- a/pipelex/plugins/mistral/mistral_llm_worker.py +++ b/pipelex/plugins/mistral/mistral_llm_worker.py @@ -8,7 +8,6 @@ from pipelex import log from pipelex.cogt.exceptions import LLMCompletionError, LLMEngineParameterError, SdkTypeError from pipelex.cogt.llm.llm_job import LLMJob -from pipelex.cogt.llm.llm_job_func import llm_job_func from pipelex.cogt.llm.llm_models.llm_engine import LLMEngine from pipelex.cogt.llm.llm_worker_abstract import LLMWorkerAbstract from pipelex.cogt.llm.structured_output import StructureMethod @@ -44,8 +43,7 @@ def __init__( self.instructor_for_objects = instructor.from_mistral(client=sdk_instance, use_async=True) @override - @llm_job_func - async def gen_text( + async def _gen_text( self, llm_job: LLMJob, ) -> str: @@ -70,8 +68,7 @@ async def gen_text( return mistral_response_content @override - @llm_job_func - async def gen_object( + async def _gen_object( self, llm_job: LLMJob, schema: Type[BaseModelTypeVar], diff --git a/pipelex/plugins/mistral/mistral_ocr_worker.py b/pipelex/plugins/mistral/mistral_ocr_worker.py index 36f941d4e..9c0b74192 100644 --- a/pipelex/plugins/mistral/mistral_ocr_worker.py +++ b/pipelex/plugins/mistral/mistral_ocr_worker.py @@ -9,7 +9,7 @@ from pipelex.cogt.ocr.ocr_input import OcrInputError from pipelex.cogt.ocr.ocr_job import OcrJob from pipelex.cogt.ocr.ocr_output import OcrOutput -from pipelex.cogt.ocr.ocr_worker_abstract import OcrWorkerAbstract, ocr_job_func +from pipelex.cogt.ocr.ocr_worker_abstract import OcrWorkerAbstract from 
pipelex.plugins.mistral.mistral_factory import MistralFactory from pipelex.plugins.mistral.mistral_utils import upload_file_for_ocr from pipelex.reporting.reporting_protocol import ReportingProtocol @@ -32,8 +32,7 @@ def __init__( self.mistral_client: Mistral = sdk_instance @override - @ocr_job_func - async def ocr_extract_pages( + async def _ocr_extract_pages( self, ocr_job: OcrJob, ) -> OcrOutput: diff --git a/pipelex/plugins/openai/openai_factory.py b/pipelex/plugins/openai/openai_factory.py index 7cc9b6554..35c653f9a 100644 --- a/pipelex/plugins/openai/openai_factory.py +++ b/pipelex/plugins/openai/openai_factory.py @@ -120,11 +120,11 @@ def make_openai_image_url(cls, prompt_image: PromptImage) -> ImageURL: openai_image_url = ImageURL(url=url, detail="high") elif isinstance(prompt_image, PromptImageBytes): # TODO: manage image type - url_with_bytes: str = f"data:image/jpeg;base64,{prompt_image.b64_image_bytes.decode('utf-8')}" + url_with_bytes: str = f"data:image/jpeg;base64,{prompt_image.base_64.decode('utf-8')}" openai_image_url = ImageURL(url=url_with_bytes, detail="high") elif isinstance(prompt_image, PromptImagePath): image_bytes = load_binary_as_base64(path=prompt_image.file_path) - return cls.make_openai_image_url(PromptImageBytes(b64_image_bytes=image_bytes)) + return cls.make_openai_image_url(PromptImageBytes(base_64=image_bytes)) else: raise LLMPromptParameterError(f"prompt_image of type {type(prompt_image)} is not supported") return openai_image_url diff --git a/pipelex/plugins/openai/openai_imgg_worker.py b/pipelex/plugins/openai/openai_imgg_worker.py index 8baca86c3..b68aa7c82 100644 --- a/pipelex/plugins/openai/openai_imgg_worker.py +++ b/pipelex/plugins/openai/openai_imgg_worker.py @@ -10,7 +10,7 @@ from pipelex.cogt.imgg.imgg_engine import ImggEngine from pipelex.cogt.imgg.imgg_job import ImggJob from pipelex.cogt.imgg.imgg_job_components import Quality -from pipelex.cogt.imgg.imgg_worker_abstract import ImggWorkerAbstract, imgg_job_func +from 
pipelex.cogt.imgg.imgg_worker_abstract import ImggWorkerAbstract from pipelex.config import get_config from pipelex.plugins.openai.openai_imgg_factory import OpenAIImggFactory from pipelex.reporting.reporting_protocol import ReportingProtocol @@ -35,8 +35,7 @@ def __init__( self.openai_client = sdk_instance @override - @imgg_job_func - async def gen_image( + async def _gen_image( self, imgg_job: ImggJob, ) -> GeneratedImage: @@ -45,8 +44,7 @@ async def gen_image( return generated_image @override - @imgg_job_func - async def gen_image_list( + async def _gen_image_list( self, imgg_job: ImggJob, nb_images: int, diff --git a/pipelex/plugins/openai/openai_llm_worker.py b/pipelex/plugins/openai/openai_llm_worker.py index 74eaeb968..c0305ebbf 100644 --- a/pipelex/plugins/openai/openai_llm_worker.py +++ b/pipelex/plugins/openai/openai_llm_worker.py @@ -9,7 +9,6 @@ from pipelex import log from pipelex.cogt.exceptions import LLMCompletionError, LLMEngineParameterError, LLMModelNotFoundError, SdkTypeError from pipelex.cogt.llm.llm_job import LLMJob -from pipelex.cogt.llm.llm_job_func import llm_job_func from pipelex.cogt.llm.llm_models.llm_engine import LLMEngine from pipelex.cogt.llm.llm_models.llm_family import LLMFamily from pipelex.cogt.llm.llm_worker_abstract import LLMWorkerAbstract @@ -45,8 +44,7 @@ def __init__( ######################################################### @override - @llm_job_func - async def gen_text( + async def _gen_text( self, llm_job: LLMJob, ) -> str: @@ -142,8 +140,7 @@ async def gen_text( return response_text @override - @llm_job_func - async def gen_object( + async def _gen_object( self, llm_job: LLMJob, schema: Type[BaseModelTypeVar], diff --git a/pipelex/plugins/openai/vertexai_config.py b/pipelex/plugins/openai/vertexai_config.py index d0977ddaf..861fa8bd1 100644 --- a/pipelex/plugins/openai/vertexai_config.py +++ b/pipelex/plugins/openai/vertexai_config.py @@ -65,9 +65,11 @@ def _get_credentials(self) -> str: raise MissingDependencyError( 
"google-auth-oauthlib", "google", - "The google-auth-oauthlib SDK is required to use Google connection. \ - You can install it with 'pip install pipelex[google]', or use this model via another provider \ - (such as Azure OpenAI, OpenAI, anthropic or bedrock).", + ( + "The google-auth-oauthlib SDK is required to use Google connection. " + "You can install it with 'pip install pipelex[google]', or use this model via another provider " + "(such as Azure OpenAI, OpenAI, anthropic or bedrock)." + ), ) from exc match self.api_key_method: diff --git a/pipelex/test_extras/shared_pytest_plugins.py b/pipelex/test_extras/shared_pytest_plugins.py index 94faa7f80..db30a233d 100644 --- a/pipelex/test_extras/shared_pytest_plugins.py +++ b/pipelex/test_extras/shared_pytest_plugins.py @@ -1,5 +1,7 @@ import pytest +from pytest import FixtureRequest, Parser +from pipelex.core.pipe_run_params import PipeRunMode from pipelex.libraries.library_config import LibraryConfig from pipelex.tools.runtime_manager import RunMode, runtime_manager @@ -23,3 +25,19 @@ def manage_pipelex_libraries_with_overwrite(): yield # TODO: make it safe to erase/replace standard libraries in client projects without touching custom stuff # LibraryConfig.remove_libraries() + + +def pytest_addoption(parser: Parser): + parser.addoption( + "--pipe-run-mode", + action="store", + default="dry", + help="Pipe run mode: 'live' or 'dry'", + choices=("live", "dry"), + ) + + +@pytest.fixture +def pipe_run_mode(request: FixtureRequest) -> PipeRunMode: + mode_str = request.config.getoption("--pipe-run-mode") + return PipeRunMode(mode_str) diff --git a/pipelex/tools/misc/attribute_utils.py b/pipelex/tools/misc/attribute_utils.py new file mode 100644 index 000000000..eec35c15a --- /dev/null +++ b/pipelex/tools/misc/attribute_utils.py @@ -0,0 +1,34 @@ +from typing import Any, ClassVar + + +class AttributePolisher: + base_64_truncate_length: ClassVar[int] = 64 + url_truncate_length: ClassVar[int] = 128 + truncate_suffix: 
ClassVar[str] = "…" + + @classmethod + def _truncate_string(cls, value: str, max_length: int) -> str: + """Truncate a string to the specified maximum length and append the truncate suffix.""" + if len(value) > max_length: + return value[:max_length] + cls.truncate_suffix + return value + + @classmethod + def should_truncate(cls, name: str, value: Any) -> bool: + if not isinstance(value, str): + return False + + if name == "base_64" and len(value) > cls.base_64_truncate_length: + return True + elif name == "url" and value.startswith("data:image/") and len(value) > cls.url_truncate_length: + return True + return False + + @classmethod + def get_truncated_value(cls, name: str, value: str) -> str: + """Get the truncated value based on the field name and value type.""" + if name == "base_64": + return cls._truncate_string(value, cls.base_64_truncate_length) + elif name == "url" and value.startswith("data:image/"): + return cls._truncate_string(value, cls.url_truncate_length) + return value diff --git a/pipelex/tools/misc/markdown_utils.py b/pipelex/tools/misc/markdown_utils.py index 0eed92341..f55a2afd0 100644 --- a/pipelex/tools/misc/markdown_utils.py +++ b/pipelex/tools/misc/markdown_utils.py @@ -1,10 +1,11 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional +from pipelex.tools.misc.attribute_utils import AttributePolisher from pipelex.tools.misc.json_utils import purify_json_dict from pipelex.tools.misc.string_utils import snake_to_capitalize_first_letter -def convert_to_markdown(data: Any, level: int = 1, is_pretty: bool = False) -> str: +def convert_to_markdown(data: Any, level: int = 1, is_pretty: bool = False, key: Optional[str] = None) -> str: """ Convert arbitrary JSON-compatible Python data to a Markdown string without needing to specify the markdown type explicitly. 
@@ -22,7 +23,7 @@ def convert_to_markdown(data: Any, level: int = 1, is_pretty: bool = False) -> s converted_line = f"{heading_prefix} {key}" # Convert the value recursively, increasing the heading level # dict_result_lines.append(convert_to_markdown(data=value, level=level + 1)) - converted_value = convert_to_markdown(data=value, level=level + 1) + converted_value = convert_to_markdown(data=value, level=level + 1, key=key) converted_value_nb_lines = len(converted_value.split("\n")) if converted_value_nb_lines > 1: dict_result_lines.append(converted_line) @@ -54,7 +55,10 @@ def convert_to_markdown(data: Any, level: int = 1, is_pretty: bool = False) -> s elif isinstance(data, (str, int, float, bool)): # Simple scalar types become paragraphs (strings) or inline text # If it's a string with multiple lines, just output them as-is. - return str(data) + str_value = str(data) + if key and AttributePolisher.should_truncate(name=key, value=str_value): + return AttributePolisher.get_truncated_value(name=key, value=str_value) + return str_value elif data is None: # No value diff --git a/pipelex/tools/typing/pydantic_utils.py b/pipelex/tools/typing/pydantic_utils.py index ccdba1674..499c3d4f1 100644 --- a/pipelex/tools/typing/pydantic_utils.py +++ b/pipelex/tools/typing/pydantic_utils.py @@ -1,9 +1,10 @@ -from typing import Any, ClassVar, Dict, List, Optional, Sequence, Set, TypeVar, Union +from typing import Any, Dict, List, Optional, Sequence, Set, TypeVar, Union from pydantic import BaseModel, ValidationError from rich.repr import Result as RichReprResult from typing_extensions import override +from pipelex.tools.misc.attribute_utils import AttributePolisher from pipelex.types import StrEnum BaseModelTypeVar = TypeVar("BaseModelTypeVar", bound=BaseModel) @@ -175,9 +176,6 @@ def serialize_model( class CustomBaseModel(BaseModel): - truncate_length: ClassVar[int] = 50 - truncate_suffix: ClassVar[str] = "…" - @override def __rich_repr__(self) -> RichReprResult: # type: ignore 
for item in super().__rich_repr__(): # type: ignore @@ -186,11 +184,8 @@ def __rich_repr__(self) -> RichReprResult: # type: ignore if len(tuple_item) >= 2: name = tuple_item[0] value = tuple_item[1] - should_truncate = (name == "base_64" and isinstance(value, str) and len(value) > self.truncate_length) or ( - name == "url" and isinstance(value, str) and value.startswith("data:image/") and len(value) > self.truncate_length - ) - if should_truncate: - truncated_value = value[: self.truncate_length] + self.truncate_suffix + if AttributePolisher.should_truncate(name=name, value=value): + truncated_value = AttributePolisher.get_truncated_value(name, value) if len(tuple_item) == 3: yield name, truncated_value, tuple_item[2] else: @@ -204,11 +199,9 @@ def __rich_repr__(self) -> RichReprResult: # type: ignore def __repr_args__(self) -> Sequence[tuple[Optional[str], Any]]: processed_args: list[tuple[Optional[str], Any]] = [] for name, value in super().__repr_args__(): - should_truncate = (name == "base_64" and isinstance(value, str) and len(value) > self.truncate_length) or ( - name == "url" and isinstance(value, str) and value.startswith("data:image/") and len(value) > self.truncate_length - ) - if should_truncate: - processed_args.append((name, value[: self.truncate_length] + self.truncate_suffix)) + if name and AttributePolisher.should_truncate(name=name, value=value): + truncated_value = AttributePolisher.get_truncated_value(name, value) + processed_args.append((name, truncated_value)) else: processed_args.append((name, value)) return processed_args diff --git a/pyproject.toml b/pyproject.toml index f8861dac3..8a254f67d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "openpyxl>=3.1.5", "pandas>=2.2.3", "pillow>=11.2.1", + "polyfactory>=2.21.0", "pypdfium2>=4.30.1", "pydantic==2.10.6", "python-dotenv>=1.0.1", @@ -196,10 +197,11 @@ strictSetInference = true typeCheckingMode = "strict" [tool.pytest.ini_options] -addopts = 
"--import-mode=importlib -ra -m 'not inference and not llm and not imgg and not ocr and not needs_output'" +addopts = "--import-mode=importlib -ra -m 'not (inference or llm or imgg or ocr or needs_output or pipelex_api)'" asyncio_default_fixture_loop_scope = "session" markers = [ "needs_output: tests that need output to be displayed", + "pipelex_api: tests that require access to the Pipelex API", "inference: slow and costly due to inference calls", "llm: slow and costly due to llm inference calls", "imgg: slow and costly due to imgg inference calls", diff --git a/tests/pipelex/cogt/conftest.py b/tests/pipelex/cogt/conftest.py deleted file mode 100644 index f5f68ca3c..000000000 --- a/tests/pipelex/cogt/conftest.py +++ /dev/null @@ -1,183 +0,0 @@ -import pytest - -from pipelex.cogt.llm.llm_job_components import LLMJobParams -from pipelex.cogt.llm.llm_models.llm_family import LLMCreator, LLMFamily -from pipelex.cogt.llm.llm_models.llm_platform import LLMPlatform - - -@pytest.fixture( - params=[ - "llm_for_testing_gen_text", - "llm_for_testing_gen_object", - "llm_for_creative_writing", - ] -) -def llm_preset_id(request: pytest.FixtureRequest) -> str: - assert isinstance(request.param, str) - return request.param - - -# TODO: make it efficient to also test multiple platforms like openai/azure and mistral/anthropic/bedrock -@pytest.fixture( - params=[ - # "o1", - # "gpt-4o", - "gpt-4o-mini", - # "gpt-4-5-preview", - # "o1-mini", - # "o3-mini", - # "claude-3-haiku", - # "claude-3-5-sonnet", - # "claude-3-7-sonnet", - # "mistral-large", - # "ministral-3b", - # "ministral-8b", - # "pixtral-12b", - # "pixtral-large", - # "gemini-1-5-pro", - # "gemini-1-5-flash", - # "gemini-2-flash", - # "gemini-2-pro", - # "gemini-2-5-flash", - # "gemini-2-5-pro", - # "bedrock-mistral-large", - # "bedrock-claude-3-7-sonnet", - # "bedrock-meta-llama-3-3-70b-instruct", - # "bedrock-nova-pro", - # "sonar", - ] -) -def llm_handle(request: pytest.FixtureRequest) -> str: - assert 
isinstance(request.param, str) - return request.param - - -@pytest.fixture( - params=[ - # "o1", - # "o3-mini", - # "gpt-4o", - "gpt-4o-mini", - # "gpt-4-5-preview", - # "claude-3-haiku", - # "claude-3-5-sonnet", - # "claude-3-7-sonnet", - # "pixtral-12b", - # "pixtral-large", - # "gemini-1-5-pro", - # "gemini-1-5-flash", - # "gemini-2-flash", - # "gemini-2-pro", - # "gemini-2-5-pro", - # "gemini-2-5-flash", - # "mistral-small3.1", - # "qwen3:8b", - ] -) -def llm_handle_for_vision(request: pytest.FixtureRequest) -> str: - assert isinstance(request.param, str) - return request.param - - -@pytest.fixture( - params=[ - # LLMFamily.GPT_4, - LLMFamily.GPT_4O, - # LLMFamily.GPT_4_5, - # LLMFamily.GPT_4_1, - # LLMFamily.O_SERIES, - # LLMFamily.CLAUDE_3_7, - # LLMFamily.CLAUDE_4, - # LLMFamily.PERPLEXITY_SEARCH, - # LLMFamily.PERPLEXITY_REASONING, - # LLMFamily.PERPLEXITY_RESEARCH, - # LLMFamily.PERPLEXITY_DEEPSEEK, - # LLMFamily.GEMINI, - # LLMFamily.GEMMA, - ] -) -def llm_family(request: pytest.FixtureRequest) -> LLMFamily: - assert isinstance(request.param, LLMFamily) - return request.param - - -@pytest.fixture( - params=[ - # LLMCreator.ALIBABA, - # LLMCreator.AMAZON, - # LLMCreator.ANTHROPIC, - # LLMCreator.DEEPSEEK, - # LLMCreator.GOOGLE, - LLMCreator.OPENAI, - # LLMCreator.META, - # LLMCreator.MISTRAL, - # LLMCreator.PERPLEXITY, - ] -) -def llm_creator(request: pytest.FixtureRequest) -> LLMCreator: - assert isinstance(request.param, LLMCreator) - return request.param - - -# TODO: build llm_id/platform combos dynalically from config data -@pytest.fixture( - params=[ - # LLMPlatform.ANTHROPIC, - # LLMPlatform.AZURE_OPENAI, - # LLMPlatform.BEDROCK, - # LLMPlatform.BEDROCK_ANTHROPIC, - # LLMPlatform.MISTRAL, - LLMPlatform.OPENAI, - # LLMPlatform.PERPLEXITY, - # LLMPlatform.VERTEXAI_OPENAI, - # LLMPlatform.CUSTOM_LLM, - # LLMPlatform.XAI, - ] -) -def llm_platform(request: pytest.FixtureRequest) -> LLMPlatform: - assert isinstance(request.param, LLMPlatform) - return 
request.param - - -@pytest.fixture( - params=[ - "gpt-4o-mini", - # "open-mixtral-8x7b", - # "google/gemini-2.0-flash", - # "google/gemini-2.5-pro-preview-05-06", - # "google/gemini-2.5-flash-preview-04-17", - # "google/gemini-2.5-flash-preview-05-20", - # "o1", - # "o4-mini", - # "bedrock-mistral-large", - # "sonar", - # "claude-3-7-sonnet", - # "claude-4-sonnet", - # "claude-4-opus", - # "us.anthropic.claude-sonnet-4-20250514-v1:0", - # "us.anthropic.claude-opus-4-20250514-v1:0", - # "sonar", - # "sonar-pro", - # "gemma3:4b", - # "llama4:scout", - # "mistral-small3.1:24b", - # "qwen3:8b", - ] -) -def llm_id(request: pytest.FixtureRequest) -> str: - assert isinstance(request.param, str) - return request.param - - -@pytest.fixture( - params=[ - LLMJobParams( - temperature=0.5, - max_tokens=None, - seed=None, - ), - ] -) -def llm_job_params(request: pytest.FixtureRequest) -> LLMJobParams: - assert isinstance(request.param, LLMJobParams) - return request.param diff --git a/tests/pipelex/cogt/cogt_asynch/__init__.py b/tests/pipelex/cogt_asynch/__init__.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/__init__.py rename to tests/pipelex/cogt_asynch/__init__.py diff --git a/tests/pipelex/cogt/cogt_asynch/test_content_generator.py b/tests/pipelex/cogt_asynch/test_content_generator.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/test_content_generator.py rename to tests/pipelex/cogt_asynch/test_content_generator.py diff --git a/tests/pipelex/cogt/cogt_asynch/test_image_gen.py b/tests/pipelex/cogt_asynch/test_image_gen.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/test_image_gen.py rename to tests/pipelex/cogt_asynch/test_image_gen.py diff --git a/tests/pipelex/cogt/cogt_asynch/test_llm_engines.py b/tests/pipelex/cogt_asynch/test_llm_engines.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/test_llm_engines.py rename to tests/pipelex/cogt_asynch/test_llm_engines.py diff --git 
a/tests/pipelex/cogt/cogt_asynch/test_llm_gen_object.py b/tests/pipelex/cogt_asynch/test_llm_gen_object.py similarity index 99% rename from tests/pipelex/cogt/cogt_asynch/test_llm_gen_object.py rename to tests/pipelex/cogt_asynch/test_llm_gen_object.py index e96061e9e..2b15c7065 100644 --- a/tests/pipelex/cogt/cogt_asynch/test_llm_gen_object.py +++ b/tests/pipelex/cogt_asynch/test_llm_gen_object.py @@ -27,7 +27,7 @@ def get_async_worker_and_job(llm_preset_id: str, user_text: str): @pytest.mark.llm @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") -class TestAsyncCogtLLMGenObject: +class TestLLMGenObject: @pytest.mark.parametrize("user_text, expected_instance", LLMTestCases.SINGLE_OBJECT) async def test_gen_object_async_using_handle(self, llm_job_params: LLMJobParams, llm_handle: str, user_text: str, expected_instance: BaseModel): llm_worker = get_llm_worker(llm_handle=llm_handle) diff --git a/tests/pipelex/cogt/cogt_asynch/test_llm_gen_text.py b/tests/pipelex/cogt_asynch/test_llm_gen_text.py similarity index 99% rename from tests/pipelex/cogt/cogt_asynch/test_llm_gen_text.py rename to tests/pipelex/cogt_asynch/test_llm_gen_text.py index 3bb51da5e..0f077a3a2 100644 --- a/tests/pipelex/cogt/cogt_asynch/test_llm_gen_text.py +++ b/tests/pipelex/cogt_asynch/test_llm_gen_text.py @@ -27,7 +27,7 @@ def get_async_worker_and_job(llm_preset_id: str, user_text: str): @pytest.mark.llm @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") -class TestAsyncCogtLLMGenText: +class TestLLMGenText: @pytest.mark.parametrize("topic, prompt_text", LLMTestCases.SINGLE_TEXT) async def test_gen_text_async_using_handle(self, llm_job_params: LLMJobParams, llm_handle: str, topic: str, prompt_text: str): pretty_print(prompt_text, title=topic) diff --git a/tests/pipelex/cogt/cogt_asynch/test_llm_job_async_factory.py b/tests/pipelex/cogt_asynch/test_llm_job_async_factory.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/test_llm_job_async_factory.py 
rename to tests/pipelex/cogt_asynch/test_llm_job_async_factory.py diff --git a/tests/pipelex/cogt/cogt_asynch/test_llm_report.py b/tests/pipelex/cogt_asynch/test_llm_report.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/test_llm_report.py rename to tests/pipelex/cogt_asynch/test_llm_report.py diff --git a/tests/pipelex/cogt/cogt_asynch/test_llm_vision.py b/tests/pipelex/cogt_asynch/test_llm_vision.py similarity index 98% rename from tests/pipelex/cogt/cogt_asynch/test_llm_vision.py rename to tests/pipelex/cogt_asynch/test_llm_vision.py index 785423f4f..09f437db7 100644 --- a/tests/pipelex/cogt/cogt_asynch/test_llm_vision.py +++ b/tests/pipelex/cogt_asynch/test_llm_vision.py @@ -38,7 +38,7 @@ async def test_gen_text_from_vision_by_url(self, llm_handle_for_vision: str, top @pytest.mark.parametrize("topic, image_path", LLMVisionTestCases.IMAGE_PATHS) async def test_gen_text_from_vision_by_bytes(self, llm_handle_for_vision: str, topic: str, image_path: str): image_bytes = load_binary_as_base64(path=image_path) - prompt_image = PromptImageBytes(b64_image_bytes=image_bytes) + prompt_image = PromptImageBytes(base_64=image_bytes) llm_worker = get_llm_worker(llm_handle=llm_handle_for_vision) llm_job = LLMJobFactory.make_llm_job_from_prompt_contents( user_text=LLMVisionTestCases.VISION_USER_TEXT_2, diff --git a/tests/pipelex/cogt/cogt_asynch/test_ocr.py b/tests/pipelex/cogt_asynch/test_ocr.py similarity index 100% rename from tests/pipelex/cogt/cogt_asynch/test_ocr.py rename to tests/pipelex/cogt_asynch/test_ocr.py diff --git a/tests/pipelex/conftest.py b/tests/pipelex/conftest.py index 7ebc84104..e541ec21d 100644 --- a/tests/pipelex/conftest.py +++ b/tests/pipelex/conftest.py @@ -1,15 +1,197 @@ import pytest from pipelex.cogt.imgg.imgg_handle import ImggHandle +from pipelex.cogt.llm.llm_job_components import LLMJobParams +from pipelex.cogt.llm.llm_models.llm_family import LLMCreator, LLMFamily +from pipelex.cogt.llm.llm_models.llm_platform import 
LLMPlatform @pytest.fixture( params=[ - ImggHandle.FLUX_1_PRO_LEGACY, - ImggHandle.FLUX_1_1_PRO, - ImggHandle.FLUX_1_1_ULTRA, + "llm_for_testing_gen_text", + "llm_for_testing_gen_object", + "llm_for_creative_writing", + ] +) +def llm_preset_id(request: pytest.FixtureRequest) -> str: + assert isinstance(request.param, str) + return request.param + + +# TODO: make it efficient to also test multiple platforms like openai/azure and mistral/anthropic/bedrock +@pytest.fixture( + params=[ + # "o1", + # "gpt-4o", + "gpt-4o-mini", + # "gpt-4-5-preview", + # "o1-mini", + # "o3-mini", + # "claude-3-haiku", + # "claude-3-5-sonnet", + # "claude-3-7-sonnet", + # "mistral-large", + # "ministral-3b", + # "ministral-8b", + # "pixtral-12b", + # "pixtral-large", + # "gemini-1-5-pro", + # "gemini-1-5-flash", + # "gemini-2-flash", + # "gemini-2-pro", + # "gemini-2-5-flash", + # "gemini-2-5-pro", + # "bedrock-mistral-large", + # "bedrock-claude-3-7-sonnet", + # "bedrock-meta-llama-3-3-70b-instruct", + # "bedrock-nova-pro", + # "sonar", + ] +) +def llm_handle(request: pytest.FixtureRequest) -> str: + assert isinstance(request.param, str) + return request.param + + +@pytest.fixture( + params=[ + # "o1", + # "o3-mini", + # "gpt-4o", + "gpt-4o-mini", + # "gpt-4-5-preview", + # "claude-3-haiku", + # "claude-3-5-sonnet", + # "claude-3-7-sonnet", + # "pixtral-12b", + # "pixtral-large", + # "gemini-1-5-pro", + # "gemini-1-5-flash", + # "gemini-2-flash", + # "gemini-2-pro", + # "gemini-2-5-pro", + # "gemini-2-5-flash", + # "mistral-small3.1", + # "qwen3:8b", + ] +) +def llm_handle_for_vision(request: pytest.FixtureRequest) -> str: + assert isinstance(request.param, str) + return request.param + + +@pytest.fixture( + params=[ + # LLMFamily.GPT_4, + LLMFamily.GPT_4O, + # LLMFamily.GPT_4_5, + # LLMFamily.GPT_4_1, + # LLMFamily.O_SERIES, + # LLMFamily.CLAUDE_3_7, + # LLMFamily.CLAUDE_4, + # LLMFamily.PERPLEXITY_SEARCH, + # LLMFamily.PERPLEXITY_REASONING, + # LLMFamily.PERPLEXITY_RESEARCH, + # 
LLMFamily.PERPLEXITY_DEEPSEEK, + # LLMFamily.GEMINI, + # LLMFamily.GEMMA, + ] +) +def llm_family(request: pytest.FixtureRequest) -> LLMFamily: + assert isinstance(request.param, LLMFamily) + return request.param + + +@pytest.fixture( + params=[ + # LLMCreator.ALIBABA, + # LLMCreator.AMAZON, + # LLMCreator.ANTHROPIC, + # LLMCreator.DEEPSEEK, + # LLMCreator.GOOGLE, + LLMCreator.OPENAI, + # LLMCreator.META, + # LLMCreator.MISTRAL, + # LLMCreator.PERPLEXITY, + ] +) +def llm_creator(request: pytest.FixtureRequest) -> LLMCreator: + assert isinstance(request.param, LLMCreator) + return request.param + + +# TODO: build llm_id/platform combos dynalically from config data +@pytest.fixture( + params=[ + # LLMPlatform.ANTHROPIC, + # LLMPlatform.AZURE_OPENAI, + # LLMPlatform.BEDROCK, + # LLMPlatform.BEDROCK_ANTHROPIC, + # LLMPlatform.MISTRAL, + LLMPlatform.OPENAI, + # LLMPlatform.PERPLEXITY, + # LLMPlatform.VERTEXAI, + # LLMPlatform.CUSTOM_LLM, + # LLMPlatform.XAI, + ] +) +def llm_platform(request: pytest.FixtureRequest) -> LLMPlatform: + assert isinstance(request.param, LLMPlatform) + return request.param + + +@pytest.fixture( + params=[ + "gpt-4o-mini", + # "open-mixtral-8x7b", + # "google/gemini-2.0-flash", + # "google/gemini-2.5-pro-preview-05-06", + # "google/gemini-2.5-pro-preview-06-05", # not yet on VertexAI + # "google/gemini-2.5-flash-preview-04-17", + # "google/gemini-2.5-flash-preview-05-20", + # "o1", + # "o4-mini", + # "bedrock-mistral-large", + # "sonar", + # "claude-3-7-sonnet", + # "claude-4-sonnet", + # "claude-4-opus", + # "us.anthropic.claude-sonnet-4-20250514-v1:0", + # "us.anthropic.claude-opus-4-20250514-v1:0", + # "sonar", + # "sonar-pro", + # "gemma3:4b", + # "llama4:scout", + # "mistral-small3.1:24b", + # "qwen3:8b", + ] +) +def llm_id(request: pytest.FixtureRequest) -> str: + assert isinstance(request.param, str) + return request.param + + +@pytest.fixture( + params=[ + LLMJobParams( + temperature=0.5, + max_tokens=None, + seed=None, + ), + ] +) +def 
llm_job_params(request: pytest.FixtureRequest) -> LLMJobParams: + assert isinstance(request.param, LLMJobParams) + return request.param + + +@pytest.fixture( + params=[ + # ImggHandle.FLUX_1_PRO_LEGACY, + # ImggHandle.FLUX_1_1_PRO, + # ImggHandle.FLUX_1_1_ULTRA, ImggHandle.SDXL_LIGHTNING, - ImggHandle.OPENAI_GPT_IMAGE_1, + # ImggHandle.OPENAI_GPT_IMAGE_1, ] ) def imgg_handle(request: pytest.FixtureRequest) -> ImggHandle: diff --git a/tests/pipelex/pipelex_asynch/test_client.py b/tests/pipelex/pipelex_asynch/test_client.py index 404ffded8..d6a29aa15 100644 --- a/tests/pipelex/pipelex_asynch/test_client.py +++ b/tests/pipelex/pipelex_asynch/test_client.py @@ -17,8 +17,7 @@ class Example(BaseModel): memory: List[Stuff] -@pytest.mark.llm -@pytest.mark.inference +@pytest.mark.pipelex_api @pytest.mark.asyncio(loop_scope="class") class TestPipelexApiClient: @pytest.fixture diff --git a/tests/pipelex/pipelex_asynch/test_pipe_batch.py b/tests/pipelex/pipelex_asynch/test_pipe_batch.py index 10fae5f2f..d2b9eb4a3 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_batch.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_batch.py @@ -2,6 +2,7 @@ from pipelex import pretty_print from pipelex.core.pipe_output import PipeOutput +from pipelex.core.pipe_run_params import PipeRunMode from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.core.stuff_content import ListContent, TextContent from pipelex.core.stuff_factory import StuffFactory @@ -13,7 +14,10 @@ @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") class TestPipeBatch: - async def test_pipe_batch_basic(self): + async def test_pipe_batch_basic( + self, + pipe_run_mode: PipeRunMode, + ): # Create Stuff objects invoice_list_stuff = StuffFactory.make_stuff( concept_code="test_pipe_batch.TestPipeBatchItem", @@ -32,7 +36,7 @@ async def test_pipe_batch_basic(self): # Run the pipe pipe_output: PipeOutput = await get_pipe_router().run_pipe_code( pipe_code="test_pipe_batch", - 
pipe_run_params=PipeRunParamsFactory.make_run_params(), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), working_memory=working_memory, ) diff --git a/tests/pipelex/pipelex_asynch/test_pipe_imgg.py b/tests/pipelex/pipelex_asynch/test_pipe_imgg.py index 053660213..ad501bcfb 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_imgg.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_imgg.py @@ -3,6 +3,8 @@ from pipelex import pretty_print from pipelex.cogt.imgg.imgg_handle import ImggHandle from pipelex.core.concept_native import NativeConcept +from pipelex.core.pipe_run_params import PipeRunMode +from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.hub import get_pipe_router from pipelex.pipe_operators.pipe_img_gen import PipeImgGen, PipeImgGenOutput from pipelex.pipe_works.pipe_job_factory import PipeJobFactory @@ -16,6 +18,7 @@ class TestPipeImgg: @pytest.mark.parametrize("topic, image_desc", IMGGTestCases.IMAGE_DESC) async def test_pipe_img_gen( self, + pipe_run_mode: PipeRunMode, imgg_handle: ImggHandle, topic: str, image_desc: str, @@ -29,6 +32,7 @@ async def test_pipe_img_gen( output_concept_code=NativeConcept.IMAGE.code, output_multiplicity=False, ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), ) pipe_imgg_output: PipeImgGenOutput = await get_pipe_router().run_pipe_job( pipe_job=pipe_job, diff --git a/tests/pipelex/pipelex_asynch/test_pipe_jinja2.py b/tests/pipelex/pipelex_asynch/test_pipe_jinja2.py new file mode 100644 index 000000000..3a175d071 --- /dev/null +++ b/tests/pipelex/pipelex_asynch/test_pipe_jinja2.py @@ -0,0 +1,55 @@ +import pytest + +from pipelex import pretty_print +from pipelex.core.pipe_run_params import PipeRunMode +from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory +from pipelex.core.working_memory_factory import WorkingMemoryFactory +from pipelex.hub import get_pipe_router +from pipelex.pipe_operators.pipe_jinja2 import 
PipeJinja2, PipeJinja2Output +from pipelex.pipe_works.pipe_job_factory import PipeJobFactory +from pipelex.tools.templating.templating_models import PromptingStyle, TagStyle, TextFormat +from tests.pipelex.test_data import JINJA2TestCases + + +@pytest.mark.asyncio(loop_scope="class") +class TestPipeJinja2: + @pytest.mark.parametrize("jinja2", JINJA2TestCases.JINJA2_FOR_ANY) + async def test_pipe_jinja2_for_any( + self, + pipe_run_mode: PipeRunMode, + jinja2: str, + ): + pipe_job = PipeJobFactory.make_pipe_job( + pipe=PipeJinja2( + code="adhoc_for_test_pipe_jinja2_for_any", + domain="generic", + jinja2=jinja2, + extra_context={"place_holder": "[some text from test_pipe_jinja2_for_any]"}, + ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), + ) + pipe_jinja2_output: PipeJinja2Output = await get_pipe_router().run_pipe_job(pipe_job=pipe_job) + rendered_text = pipe_jinja2_output.rendered_text + pretty_print(rendered_text) + + @pytest.mark.parametrize("jinja2", JINJA2TestCases.JINJA2_FOR_STUFF) + async def test_pipe_jinja2_for_stuff( + self, + pipe_run_mode: PipeRunMode, + jinja2: str, + ): + working_memory = WorkingMemoryFactory.make_from_text(text="[some text from test_pipe_jinja2_for_stuff]", name="place_holder") + + pipe_job = PipeJobFactory.make_pipe_job( + pipe=PipeJinja2( + code="adhoc_for_test_pipe_jinja2", + domain="generic", + jinja2=jinja2, + prompting_style=PromptingStyle(tag_style=TagStyle.TICKS, text_format=TextFormat.MARKDOWN), + ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), + working_memory=working_memory, + ) + pipe_jinja2_output: PipeJinja2Output = await get_pipe_router().run_pipe_job(pipe_job=pipe_job) + rendered_text = pipe_jinja2_output.rendered_text + pretty_print(rendered_text) diff --git a/tests/pipelex/pipelex_asynch/test_pipe_llm.py b/tests/pipelex/pipelex_asynch/test_pipe_llm.py index bf13ef7d8..6b895a69d 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_llm.py +++ 
b/tests/pipelex/pipelex_asynch/test_pipe_llm.py @@ -4,6 +4,9 @@ from pipelex import log, pretty_print from pipelex.core.concept_native import NativeConcept +from pipelex.core.pipe_input_spec import PipeInputSpec +from pipelex.core.pipe_run_params import PipeRunMode +from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.core.stuff import Stuff from pipelex.core.working_memory_factory import WorkingMemoryFactory from pipelex.hub import get_pipe_router, get_report_delegate @@ -17,7 +20,10 @@ @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") class TestPipeLLM: - async def test_pipe_llm(self): + async def test_pipe_llm( + self, + pipe_run_mode: PipeRunMode, + ): pipe_job = PipeJobFactory.make_pipe_job( pipe=PipeLLM( code="adhoc_for_test_pipe_llm", @@ -30,6 +36,7 @@ async def test_pipe_llm(self): user_text=PipeTestCases.USER_PROMPT, ), ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), ) pipe_llm_output: PipeLLMOutput = await get_pipe_router().run_pipe_job( pipe_job=pipe_job, @@ -48,7 +55,11 @@ async def test_pipe_llm_attribute_image( self, stuff: Stuff, attribute_paths: List[str], + pipe_run_mode: PipeRunMode, ): + stuff_name = stuff.stuff_name + if not stuff_name: + pytest.fail(f"Cannot use nameless stuff in this test: {stuff}") working_memory = WorkingMemoryFactory.make_from_single_stuff(stuff=stuff) pipe_job = PipeJobFactory.make_pipe_job( @@ -56,6 +67,7 @@ async def test_pipe_llm_attribute_image( pipe=PipeLLM( code="adhoc_for_test_pipe_llm_image", domain="generic", + inputs=PipeInputSpec(root={stuff_name: stuff.concept_code}), output_concept_code=NativeConcept.TEXT.code, pipe_llm_prompt=PipeLLMPrompt( code="adhoc_for_test_pipe_llm_image", @@ -65,6 +77,7 @@ async def test_pipe_llm_attribute_image( user_images=attribute_paths, ), ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), ) pipe_llm_output: PipeLLMOutput = await get_pipe_router().run_pipe_job( 
pipe_job=pipe_job, diff --git a/tests/pipelex/pipelex_asynch/test_pipe_ocr.py b/tests/pipelex/pipelex_asynch/test_pipe_ocr.py index 1120d1db5..9ebe82096 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_ocr.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_ocr.py @@ -2,6 +2,9 @@ from pipelex import pretty_print from pipelex.core.concept_native import NativeConcept +from pipelex.core.pipe_input_spec import PipeInputSpec +from pipelex.core.pipe_run_params import PipeRunMode +from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.core.stuff_content import PageContent from pipelex.core.working_memory_factory import WorkingMemoryFactory from pipelex.hub import get_pipe_router @@ -17,19 +20,25 @@ class TestPipeOCR: @pytest.mark.parametrize("image_url", PipeOcrTestCases.PIPE_OCR_IMAGE_TEST_CASES) async def test_pipe_ocr_image( self, + pipe_run_mode: PipeRunMode, image_url: str, ): pipe_job = PipeJobFactory.make_pipe_job( pipe=PipeOcr( code="adhoc_for_test_pipe_ocr_image", domain="generic", - image_stuff_name="page_scan", + inputs=PipeInputSpec( + root={ + "page_scan": "native.Image", + } + ), should_include_images=True, should_caption_images=False, should_include_page_views=True, page_views_dpi=300, output_concept_code=NativeConcept.TEXT_AND_IMAGES.code, ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), working_memory=WorkingMemoryFactory.make_from_image( image_url=image_url, concept_code="ocr.PageScan", @@ -45,19 +54,25 @@ async def test_pipe_ocr_image( @pytest.mark.parametrize("pdf_url", PipeOcrTestCases.PIPE_OCR_PDF_TEST_CASES) async def test_pipe_ocr_pdf( self, + pipe_run_mode: PipeRunMode, pdf_url: str, ): pipe_job = PipeJobFactory.make_pipe_job( pipe=PipeOcr( code="adhoc_for_test_pipe_ocr_pdf", domain="generic", - pdf_stuff_name="pdf", + inputs=PipeInputSpec( + root={ + "pdf": "native.PDF", + } + ), should_include_images=True, should_caption_images=False, should_include_page_views=True, 
page_views_dpi=300, output_concept_code=NativeConcept.TEXT_AND_IMAGES.code, ), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), working_memory=WorkingMemoryFactory.make_from_pdf( pdf_url=pdf_url, concept_code=NativeConcept.PDF.code, diff --git a/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py b/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py index 2bdcf796d..ba7ab0c20 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py @@ -5,7 +5,7 @@ from pipelex import log, pretty_print from pipelex.core.pipe_output import PipeOutput -from pipelex.core.pipe_run_params import BatchParams, PipeOutputMultiplicity +from pipelex.core.pipe_run_params import BatchParams, PipeOutputMultiplicity, PipeRunMode from pipelex.core.pipe_run_params_factory import PipeRunParamsFactory from pipelex.core.stuff import Stuff from pipelex.core.stuff_factory import StuffBlueprint @@ -21,10 +21,11 @@ @pytest.mark.ocr @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") -class TestPipeRouter: +class TestPipeRunningVariants: @pytest.mark.parametrize("topic, blueprint, pipe_code", PipeTestCases.BLUEPRINT_AND_PIPE) async def test_pipe_from_blueprint( self, + pipe_run_mode: PipeRunMode, request: FixtureRequest, pipe_result_handler: Tuple[str, ActivityHandlerForResultFiles], save_working_memory: Any, @@ -36,7 +37,7 @@ async def test_pipe_from_blueprint( working_memory = WorkingMemoryFactory.make_from_single_blueprint(blueprint=blueprint) pipe_output: PipeOutput = await get_pipe_router().run_pipe_code( pipe_code=pipe_code, - pipe_run_params=PipeRunParamsFactory.make_run_params(), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), working_memory=working_memory, job_metadata=JobMetadata( top_job_id=cast(str, request.node.originalname), # type: ignore @@ -54,6 +55,7 @@ async def test_pipe_from_blueprint( 
@pytest.mark.parametrize("topic, stuff, pipe_code", PipeTestCases.STUFF_AND_PIPE) async def test_pipe_from_stuff( self, + pipe_run_mode: PipeRunMode, request: FixtureRequest, pipe_result_handler: Tuple[str, ActivityHandlerForResultFiles], save_working_memory: Any, @@ -65,7 +67,7 @@ async def test_pipe_from_stuff( working_memory = WorkingMemoryFactory.make_from_single_stuff(stuff=stuff) pipe_output: PipeOutput = await get_pipe_router().run_pipe_code( pipe_code=pipe_code, - pipe_run_params=PipeRunParamsFactory.make_run_params(), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), working_memory=working_memory, job_metadata=JobMetadata( top_job_id=cast(str, request.node.originalname), # type: ignore @@ -80,6 +82,7 @@ async def test_pipe_from_stuff( @pytest.mark.parametrize("topic, pipe_code", PipeTestCases.NO_INPUT) async def test_pipe_no_input( self, + pipe_run_mode: PipeRunMode, request: FixtureRequest, pipe_result_handler: Tuple[str, ActivityHandlerForResultFiles], save_working_memory: Any, @@ -89,7 +92,7 @@ async def test_pipe_no_input( log.verbose(f"{topic}: just run pipe '{pipe_code}'") pipe_output: PipeOutput = await get_pipe_router().run_pipe_code( pipe_code=pipe_code, - pipe_run_params=PipeRunParamsFactory.make_run_params(), + pipe_run_params=PipeRunParamsFactory.make_run_params(pipe_run_mode=pipe_run_mode), working_memory=WorkingMemory(), job_metadata=JobMetadata( top_job_id=cast(str, request.node.originalname), # type: ignore @@ -109,6 +112,7 @@ async def test_pipe_no_input( @pytest.mark.parametrize("topic, pipe_code, output_multiplicity", PipeTestCases.NO_INPUT_PARALLEL1) async def test_pipe_batch_no_input( self, + pipe_run_mode: PipeRunMode, request: FixtureRequest, pipe_result_handler: Tuple[str, ActivityHandlerForResultFiles], save_working_memory: Any, @@ -120,6 +124,7 @@ async def test_pipe_batch_no_input( pipe_output: PipeOutput = await get_pipe_router().run_pipe_code( pipe_code=pipe_code, 
pipe_run_params=PipeRunParamsFactory.make_run_params( + pipe_run_mode=pipe_run_mode, output_multiplicity=output_multiplicity, ), working_memory=WorkingMemory(), @@ -141,6 +146,7 @@ async def test_pipe_batch_no_input( @pytest.mark.parametrize("pipe_code, stuff, input_list_stuff_name, input_item_stuff_name", PipeTestCases.BATCH_TEST) async def test_pipe_batch_with_list_content( self, + pipe_run_mode: PipeRunMode, request: FixtureRequest, pipe_result_handler: Tuple[str, ActivityHandlerForResultFiles], save_working_memory: Any, @@ -156,7 +162,8 @@ async def test_pipe_batch_with_list_content( batch_params=BatchParams( input_list_stuff_name=input_list_stuff_name, input_item_stuff_name=input_item_stuff_name, - ) + ), + pipe_run_mode=pipe_run_mode, ), working_memory=working_memory, job_metadata=JobMetadata( @@ -171,6 +178,7 @@ async def test_pipe_batch_with_list_content( @pytest.mark.parametrize("pipe_code, exception, expected_error_message", PipeTestCases.FAILURE_PIPES) async def test_pipe_infinite_loop( self, + pipe_run_mode: PipeRunMode, request: FixtureRequest, pipe_code: str, exception: Type[Exception], @@ -183,6 +191,7 @@ async def test_pipe_infinite_loop( pipe_code=pipe_code, pipe_run_params=PipeRunParamsFactory.make_run_params( pipe_stack_limit=6, + pipe_run_mode=pipe_run_mode, ), job_metadata=JobMetadata( top_job_id=cast(str, request.node.originalname), # type: ignore diff --git a/tests/pipelex/test_data.py b/tests/pipelex/test_data.py index 8f49e2982..0aca7eff2 100644 --- a/tests/pipelex/test_data.py +++ b/tests/pipelex/test_data.py @@ -159,11 +159,11 @@ class PipeTestCases: ), ] STUFF_AND_PIPE: ClassVar[List[Tuple[str, Stuff, str]]] = [ # topic, stuff, pipe_code - # ( - # "Process Simple Image", - # SIMPLE_STUFF_IMAGE, - # "simple_llm_test_from_image", - # ), + ( + "Process Simple Image", + SIMPLE_STUFF_IMAGE, + "simple_llm_test_from_image", + ), ( "Extract page contents from PDF", SIMPLE_STUFF_PDF, diff --git a/tests/pipelex/test_libraries.py 
b/tests/pipelex/test_libraries.py index 87873dfc4..3c8cfb78a 100644 --- a/tests/pipelex/test_libraries.py +++ b/tests/pipelex/test_libraries.py @@ -42,7 +42,7 @@ def pretty_print_all_pipes( pipe.code, pipe.definition, pipe.__class__.__name__, - pipe.input_concept_code, + ", ".join([f"{name}: {concept_code}" for name, concept_code in pipe.inputs.items]), pipe.output_concept_code, ) diff --git a/tests/test_pipelines/answer.toml b/tests/test_pipelines/answer.toml deleted file mode 100644 index 4c3183f80..000000000 --- a/tests/test_pipelines/answer.toml +++ /dev/null @@ -1,251 +0,0 @@ - - -domain = "answer" -definition = "The domain for questions and answers" - -[concept] -Answer = "An answer to a question" -Question = "A question to a problem" -EnrichedQuestion = "An enriched question" -GroundTruth = "The ground truth answer" -Correctness = "Correctness of an answer" -YesNo = "Yes/No answer format" - -[pipe] -[pipe.write_context_of_text] -PipeLLM = "Write the context of a sample of text" -input = "native.Text" -output = "native.Text" -llm = "llm_to_retrieve" -prompt_template = """ -You will receive a text. -Your task is to write the context of the text. -This context should be maximum of 30 words. The goal is to quickly understand the type of ducument by just reding this context. - -@text -""" - -[pipe.retrieve_excerpts] -PipeLLM = "Find the most relevant excerpts in a text that answers a specific question" -input = "native.Text" -output = "retrieve.RetrievedExcerpt" -llm = "llm_to_retrieve" -multiple_output = true -prompt_template = """ -@text - -@question - -Your task is to find all relevant excerpts from the text that contribute to answering this question. -It might not contain the exact answer, but it should be relevant to the question. - -Output each excerpt and its justification in the specified structured format, but ONLY for actual found excerpts. 
-""" - -[pipe.enrich_question] -PipeLLM = "Get an enriched question" -input = "answer.Question" -output = "EnrichedQuestion" -llm = "llm_to_enrich" -prompt_template = """ -Your task is to reformulate a form field into a question for a LLM. - -@context - -Here is the name of one of the fields in a form I have to fill in: '{{ question|format }}'. -Each word is important therefore do not extrapolate or create information. - -{% if client_instructions %} - Here are important instructions from the customer to take into account in order to enrich the question. - The client instructions are important and you absolutely must follow them. However, it DOES NOT contain the answer. - @client_instructions -{% endif %} - -{% if target_concept.content.choices %} - To help you understand the question and potential ambiguity, here are the ONLY possible values for the answer: - {% for value in target_concept.content.choices %} - - {{ value }} - {% endfor %} -{% endif %} -You can always choose to ouput this base fallback answer: -{% for value in target_concept.content.BaseAnswer %} -- {{ value }} -{% endfor %} - -{% if target_concept.content.model_fields %} - Here are the fields that you have to fill in: - {% for field_name, field in target_concept.content.model_fields.items() %} - - {{ field_name }}: {{ field.annotation }} - {% endfor %} -{% endif %} - -{% if target_concept.content.__doc__ %} -Here are some guidelines about the desired output of the answer. -{{ target_concept.content.__doc__ }} -{% endif %} - -Here is the main task: If I were to prompt an LLM to extract this information from a specific section of the contract, what should I ask? - -Here are some rules that you absolutely must follow: -- No need to add instructions like "based on the provided contract", just write the question in English, no need for code. -- No need for intros like "Here is a reformulated question", just write the question. 
-- It is important that you specify that the question is a Yes/No question if it is the case. -""" - -[pipe.clean_answer] -PipeLLM = "Clean the answer" -input = "native.Dynamic" -output = "native.Dynamic" -prompt_template = """ -You are helping to clean answers that were generated from analyzing document excerpts to answer specific questions. - -Here is the answer to clean: -@answer - -Your task is to clean the answer by handling cases where no clear answer could be found in the document excerpts. - -ONLY output the cleaned answer - do not add any explanation or commentary. - -If the answer contains any of these patterns, output "Indeterminate": -- Empty or blank answers (including empty JSON objects) -- Statements indicating no relevant information was found -- Phrases like: - * "The excerpts are not relevant to the question" - * "There is nothing relevant in the document to answer" - * "Based on the document, there is nothing..." - * "No information found in the document" - * "Cannot determine from the provided excerpts" - * "No relevant excerpts were found" - -Important rules: -- Keep "NO" answers unchanged -- Keep "not_applicable" or "indeterminate" answers unchanged -- Preserve all other valid answers exactly as they are -- DO NOT add any explanation or commentary to your output - -{% if target_concept.content.model_fields %} - Here are the fields that you have to fill in: - {% for field_name, field in target_concept.content.model_fields.items() %} - - {{ field_name }}: {{ field.annotation }} - {% endfor %} -{% endif %} - -{% if target_concept.content.__doc__ %} -Here are some guidelines about the desired output of the answer. -{{ target_concept.content.__doc__ }} -{% endif %} -""" - -[pipe.pre_answer_question] -PipeLLM = "Answer the question in a dynamically specified format" -input = "EnrichedQuestion" -output = "native.Dynamic" -llm = "llm_to_answer" -prompt_template = """ -Your task is to extract information in a contract. 
- -Here is some context about the contract: -@context - -To help you, your assistant has already enriched the question and extracted the most relevant excerpts{% if client_instructions %}, -and provided you with some hints (can be considered as client instructions){% endif %}. - -Here is the question: -@enriched_question - -Here are the relevant excerpts: -Not all of them are necessarily relevant to the question, but all of them are relevant to the contract. -@excerpts - -{% if client_instructions %} -Here are important instructions from the customer to take into account in order to enrich the question. -The client instructions are important and you absolutely must follow them. However, it DOES NOT contain the answer. -@client_instructions -{% endif %} - -{% if target_concept.content.choices %} -To help you understand the question and potential ambiguity, here are the ONLY possible values for the answer: -{% for value in target_concept.content.choices %} -- {{ value }} -{% endfor %} -{% endif %} -You can always choose to ouput this base fallback answer: -{% for value in target_concept.content.BaseAnswer %} -- {{ value }} -{% endfor %} - -{% if target_concept.content.model_fields %} -Here are the fields that you have to fill in: -{% for field_name, field in target_concept.content.model_fields.items() %} -- {{ field_name }}: {{ field.annotation }} -{% endfor %} -{% endif %} - -{% if target_concept.content.__doc__ %} -Here are some guidelines about the desired output of the answer. -{{ target_concept.content.__doc__ }} -{% endif %} - -Important rules for answering: -- For Yes/No questions: Answer "NO" if no excerpts or inconclusive evidence (with explanation) are provided. -- For multiple choice questions: Mark as "indeterminate" if no excerpts or inconclusive evidence (with explanation) are provided. 
-- Always cite the answer with citations EXCEPT when the answer is "indeterminate" -- When evidence is clear: Provide answer with citations -- When no answer is applicable, or the answer says that its not applicable, mark as "not_applicable" with explanation. -- If the target_format is FreeText, it must be a text. -- [IMPORTANT] DO NOT add commentaries like "Based on.. According to...", just output the answer. -- [IMPORTANT] DO NOT extrapolate or create information. Base your answer solely on the provided excerpts. -- Please, cite the exact sentences that you used to answer the question in a "citation" paragraph. -- Make sure that you also cite the clause number if provided (20.1 for instance). - -Here is the fields format of the answer you must output: -answer, citations, short_comment -""" - -[pipe.answer_question] -PipeSequence = "Answer the question in a dynamically specified format" -input = "EnrichedQuestion" -output = "native.Dynamic" -steps = [ - { pipe = "pre_answer_question", result = "answer" }, - { pipe = "clean_answer", result = "cleaned_answer" }, -] - -[pipe.retrieve_then_answer] -PipeSequence = "Answer a question, given the target type and the excerpts neeeded to answer it" -input = "answer.Question" -output = "native.Dynamic" -steps = [ - { pipe = "write_context_of_text", result = "context" }, - { pipe = "retrieve_excerpts", result = "excerpts" }, - { pipe = "enrich_question", result = "enriched_question" }, - { pipe = "answer_question", result = "answer" }, -] - -[pipe.verify_correctness] -PipeLLM = "Verify the correctness of the answer" -input = "Answer" -output = "Correctness" -prompt_template = """ -You are a helpful assistant that verifies the correctness of an answer. - -Your task is to verify if the answer is correct or not compared to a ground truth. -Sometimes the answer is correct, sometimes it is not. 
- -Here is the question it was asked: -@question - -Here is the answer to verify: -@answer - -Here is the expected answer: -@ground_truth - -Consider that the ground_truth is always correct. -If the ground_truth is empty, consider that the answer should be "not_applicable" or "indeterminate", or that no excerpts was found. -If the grund_truth is an extract of a clause of a contract, or multiple clauses, try to find out if the answer is globally the same clause that was extracted. -If the answer miss a lot of information, it is wrong. If it extracted more, it is correct. - -Here is the format of the answer you must output: -correctness as a boolean, explanation as a string -""" diff --git a/tests/test_pipelines/basic.toml b/tests/test_pipelines/basic.toml deleted file mode 100644 index c275875c2..000000000 --- a/tests/test_pipelines/basic.toml +++ /dev/null @@ -1,171 +0,0 @@ - - -domain = "basic" -definition = "The domain for basic concepts" - -[concept] -Excerpt = "A text excerpt from a document" -BunchOfExcerpts = "A text gathering a bunch of excerpts from a document" -ClientInstructions = "Expert human in the loop" -FreeText = "A free text" - -[pipe] -[pipe.write_query] -PipeLLM = "Write a query to a retrieval system" -input = "answer.Question" -output = "questions.Query" -prompt_template = """ -You are studying a contract. As usual, you are given a form to fill in about this contract. -Fortunately, you have a natural language based retrieval system that can help you recover useful, required, information. - -Your task is to formulate a search query for the retrieval system based on the provided field and contextual information. - -And here is the context about the contract: -{{ project_context|tag("context") }} - -So here is the field to fill in: -{{ question|tag("field") }} - -Write the query in a concise way, ensuring that you include the context for the retrieval system. -Reformulate the field if needed. 
-Your answer will be directly used as the query to the retrieval system, so make sure it is usable as a query (do not add quotes, introduction text, etc.). -""" - -[pipe.get_answer_format] -PipeLLM = "Analyze the most relevant format the answer should be in" -input = "answer.Question" -output = "questions.AnswerFormat" -prompt_template = """ -You are studying a contract. As usual, you are given a form to fill in about this contract. -I want you to identify what format the answer should be returned in. -Examples: -- if the question is 'What is the height of the Eiffel Tower?', you should return 'a distance'. -- if the question is 'Paul is 30 years old, John is half of Paul's age plus 20 years, who is older?', you should return 'a name'. - -But the output format is quite simple (a percentage, a date, a duration, a company name, etc.). - -{{ question|tag("question")}} -""" - - -[pipe.get_instructions] -PipeLLM = "Get instructions to answer an enriched question" -input = "answer.Question" -output = "questions.Instructions" -prompt_template = """ -Your task is to help answer a question. -More specifically, you have to tell what to look for in the document, how to proceed etc. - -Here is the question: -{{ question|tag("question") }} - -Give your hints in a concise way. -""" - -[pipe.answer_enriched_question_with_blobs] -PipeLLM = "Answer an enriched question with blobs" -input = "answer.Question" -output = "answer.Answer" -prompt_template = """ -Your task is to answer a question about a contract. - -You will be given relevant excerpts from the contract. - -You will also be given some additional hints to help you answer the question. They come from a senior lawyer. - -{{ excerpts|tag("question") }} - -{{ excerpts|tag("excerpts") }} - -{{ instructions|tag("instructions") }} - -Answer the question **without giving too much details about what you have done to find the answer**. Just answer and give some factual explanations (like citing the excerpts). 
-""" - -[pipe.get_format_answer_instructions] -PipeLLM = "Get instructions to format the answer" -input = "answer.Answer" -output = "questions.FormatAnswerInstructions" -prompt_template = """ -I have an LLM answer to a question that is too verbose. I need to format it in a specific format. -Your role is to tell me how to do it. - -Here is the question I asked: -{{ question|tag("question") }} - -Here is the LLM answer I got: -{{ answer|tag("answer") }} - -Here is the expected format type I am asked to answer in: -{{ answer_format|tag("answer_format") }} - - -For instance, if the question is "What is the height of the Eiffel Tower?" and the answer is "The Eiffel Tower is very high, it is about three hundred meters high" while the expected format is "a distance in meters". -You should answer something like: -"Pay attention to the numeric value and the unit. Here, the numeric value is three hundred and the unit is meters. -You then need to convert it to a number with its unit." -""" - -[pipe.format_answer] -PipeLLM = "Format the answer" -input = "answer.Answer" -output = "questions.FormattedAnswer" -prompt_template = """ -Your role is to format an LLM answer to a question. -You will be given an answer, a specific target format and some instructions. - -Here is the answer: -{{ answer|tag("answer") }} - -Here is the target format: -{{ answer_format|tag("answer_format") }} - -Here are the instructions: -{{ format_answer_instructions|tag("format_answer_instructions") }} - -For instance, if the answer is "The Eiffel Tower is very high, it is about three hundred meters high", the format is "a distance in meters", while instructions are "Pay attention to the numeric value and the unit. 
Here, the numeric value is three hundred and the unit is meters": -You should answer "320 meters" -""" - - -[pipe.answer_question_with_blobs] -PipeSequence = "Answer a question in a formatted way by first analyzing the most relevant format the answer should be in and output a type" -input = "answer.Question" -output = "answer.Answer" -steps = [ - { pipe = "get_answer_format", result = "answer_format" }, - { pipe = "get_instructions", result = "instructions" }, - { pipe = "answer_enriched_question_with_blobs", result = "answer" }, - { pipe = "get_format_answer_instructions", result = "format_answer_instructions" }, - { pipe = "format_answer", result = "formatted_answer" }, -] - -[pipe.answer_question_with_excerpts] -PipeLLM = "Answer the question in a dynamically specified format" -input = "answer.Question" -output = "answer.Answer" -prompt_template = """ -Your task is to answer a question about a contract. - -Here is some context about the contract: -{{ project_context|format }} - -To help you, your assistant has already enriched the question and extracted the most relevant excerpts{% if client_instructions %}, and provided you with some customer instructions{% endif %}. -Please, cite the exact sentences/clauses that you used to answer the question in a "citations" paragraph. -Make sure that you also cite the clauses number if provided (20.1 for instance). -You can use multiple parts of the text to answer the question, and provide multiple citations as an array. - -Here is the question: -{{ enriched_question|tag("enriched_question") }} - -Here are the relevant excerpts: -{{ excerpts|tag("excerpts") }} - -{% if client_instructions %} -Here are important instructions from the customer that have to be taken into account in order to answer the question: -{{ client_instructions|tag("client_instructions") }} -{% endif %} - -Be aware that you have the possibility to return "Indeterminate" if the answer is not found in the text, or if no excerpts are provided. 
-DO NOT extrapolate, DO NOT create information that is not provided. Base your answer on what you are given in this prompt. -""" diff --git a/tests/test_pipelines/contracts.py b/tests/test_pipelines/contracts.py deleted file mode 100644 index a96d05698..000000000 --- a/tests/test_pipelines/contracts.py +++ /dev/null @@ -1,75 +0,0 @@ -from typing import Any, List, Literal, Optional, Union - -from pydantic import Field, model_validator -from typing_extensions import Self, override - -from pipelex.types import StrEnum -from pipelex_libraries.pipelines.base_library.questions import BaseAnswer, MultipleChoiceAnswer, SourcedAnswer - - -class Fees(SourcedAnswer[Any]): - class Value(StrEnum): - PERCENTAGE = "Percentage" - AMOUNT = "Amount" - INDETERMINATE = "Indeterminate" - - class Currency(StrEnum): - USD = "USD" - EUR = "EUR" - GBP = "GBP" - AUD = "AUD" - CAD = "CAD" - UNKNOWN = "Unknown currency" - - answer: Union[float, BaseAnswer] = Field( - default=BaseAnswer.INDETERMINATE, - description="The fee value - for percentages use decimal (e.g. 2.5 for 2.5%), for amounts use the absolute value", - ) - fee_type: Value = Field(default=Value.PERCENTAGE, description="The type of fee (percentage or amount)") - fee_currency: Optional[Currency] = Field( - default=None, description="The currency of the fee amount. 
Required when fee_type is AMOUNT, should be None for PERCENTAGE" - ) - - @model_validator(mode="after") - def validate_fee(self) -> Self: - if isinstance(self.answer, float): - if self.answer < 0 or self.answer > 100: - raise ValueError("Fee value must be between 0 and 100") - if self.fee_type == self.Value.AMOUNT and not self.fee_currency: - raise ValueError("Currency is required when fee type is AMOUNT") - if self.fee_type == self.Value.PERCENTAGE and self.fee_currency: - raise ValueError("Currency should not be set when fee type is PERCENTAGE") - return self - - @override - def render_spreadsheet(self) -> str: - if self.not_applicable: - return BaseAnswer.NOT_APPLICABLE.value - elif self.indeterminate: - return BaseAnswer.INDETERMINATE.value - if self.fee_type == self.Value.PERCENTAGE: - return f"{self.answer}" - else: - return f"{self.answer} {self.fee_currency.value if self.fee_currency else 'Unknown'}" - - -class GoverningLaw(SourcedAnswer[Any]): - """ - The Governing Law can be a country, a state, a city, a law code, etc. - It should be the name used in the contract. - """ - - answer: Union[str, BaseAnswer] = Field(description="The governing law") - - -class ContractTypeChoices(StrEnum): - CONTRACT = "Contract" - AMENDMENT = "Amendment" - - -class ContractType(MultipleChoiceAnswer[Literal[ContractTypeChoices.CONTRACT, ContractTypeChoices.AMENDMENT]]): - """The type of the contract - either a main contract or an amendment.""" - - choices: List[str] = Field( - default=[choice.value for choice in ContractTypeChoices], description="The list of choices for the multiple choice question." - ) diff --git a/tests/test_pipelines/contracts.toml b/tests/test_pipelines/contracts.toml deleted file mode 100644 index 58bd26f48..000000000 --- a/tests/test_pipelines/contracts.toml +++ /dev/null @@ -1,13 +0,0 @@ - - -domain = "contracts" -definition = "The domain for analyzing contracts" -system_prompt = "You are an expert in contract analysis." 
- -[concept] -Contract = "A document or a set of documents that legally binds two or more parties to an agreement" -Amendment = "A change or addition to a legal document" -ClmFieldName = "Field Name from a contract lifecycle management system" -GoverningLaw = "Governing law of a contract" -Fees = "Fees provided by a contract" -ContractType = "Type of contract" diff --git a/tests/test_pipelines/failure_modes.toml b/tests/test_pipelines/failure_modes.toml index a0f0581c8..6abb942ea 100644 --- a/tests/test_pipelines/failure_modes.toml +++ b/tests/test_pipelines/failure_modes.toml @@ -7,7 +7,6 @@ definition = "This domain is for testing failure modes" [pipe.dummy] PipeLLM = "This pipe is a dummy pipe" -input = "native.Text" output = "native.Text" llm = { llm_handle = "gpt-4o-mini", temperature = 1, max_tokens = 50 } prompt_template = """ @@ -16,7 +15,6 @@ This is a dummy prompt, do whatever you want. Something funny but not a joke, ok [pipe.infinite_loop_1] PipeSequence = "This pipe will cause an infinite loop" -input = "native.Text" output = "native.Text" steps = [ { pipe = "dummy", result = "dummy_result" }, diff --git a/tests/test_pipelines/misc_tests/flows.toml b/tests/test_pipelines/misc_tests/flows.toml index 7a1011625..dde622039 100644 --- a/tests/test_pipelines/misc_tests/flows.toml +++ b/tests/test_pipelines/misc_tests/flows.toml @@ -8,11 +8,12 @@ Color = "A color" [pipe.extract_colors] PipeLLM = "Extract Colors" +inputs = { text = "Text" } output = "Color" prompt_template = """ Extract colors from the following text: -{{ text|tag }} +@text """ multiple_output = true @@ -27,7 +28,7 @@ llm = "llm_for_creative_writing" [pipe.sequence_for_batch_test] PipeSequence = "Sequence for parallel test" -input = "Color" +inputs = { color = "Color" } output = "Color" steps = [ { pipe = "capitalize_color", result = "capitalized_color" }, @@ -37,13 +38,13 @@ steps = [ [pipe.batch_test] PipeBatch = "Batch Test" -input = "Color" +inputs = { color = "Color" } output = "Color" 
branch_pipe_code = "sequence_for_batch_test" [pipe.capitalize_color] PipeLLM = "Capitalize Colors" -input = "Color" +inputs = { color = "Color" } output = "Color" prompt_template = """ Put the first letter of a word that represents a color as a capital letter. @@ -56,7 +57,7 @@ Output only the word, nothing else. [pipe.capitalize_last_letter] PipeLLM = "Capitalize Last Letter" -input = "Color" +inputs = { capitalized_color = "Color" } output = "Color" prompt_template = """ Put the last letter of a word that represents a color as a capital letter. @@ -69,7 +70,7 @@ Output only the word, nothing else. [pipe.reverse_letters] PipeLLM = "Reverse Letters" -input = "Color" +inputs = { color = "Color" } output = "Color" prompt_template = """ Reverse the letters of this word: diff --git a/tests/test_pipelines/misc_tests/multiplicity.toml b/tests/test_pipelines/misc_tests/multiplicity.toml index c009a192a..2b4246bc4 100644 --- a/tests/test_pipelines/misc_tests/multiplicity.toml +++ b/tests/test_pipelines/misc_tests/multiplicity.toml @@ -1,6 +1,6 @@ -domain = "multiplicity" +domain = "test_multiplicity" definition = "Test library about multiplicity" [concept] @@ -28,7 +28,7 @@ llm = "llm_for_creative_writing" [pipe.imagine_nature_product] PipeLLM = "Imagine a product of nature" -input = "Color" +inputs = { color = "Color" } output = "ProductOfNature" prompt_template = """ Propose a product of nature of this color: {{ color|format("plain") }}. @@ -37,7 +37,7 @@ Just state what it is in a single sentence. 
[pipe.imagine_fantasy_scene_including_products_of_nature] PipeLLM = "Imagine a fantasy scene including products of nature" -input = "ProductOfNature" +inputs = { product_of_nature = "ProductOfNature" } output = "FantasyScene" prompt_template = """ Imagine a fantasy scene including the following products of nature: @@ -54,7 +54,7 @@ steps = [ { pipe = "choose_colors", result = "color", nb_output = 3 }, { pipe = "imagine_nature_product", result = "product_of_nature" }, { pipe = "imagine_fantasy_scene_including_products_of_nature", result = "imgg_prompt" }, - { pipe = "generate_image", result = "image" }, + # { pipe = "generate_image", result = "image" }, ] [pipe.imagine_nature_scene_of_original_power_rangers_colors] diff --git a/tests/test_pipelines/misc_tests/pipe_batch.toml b/tests/test_pipelines/misc_tests/pipe_batch.toml index 23b20bd66..96523a430 100644 --- a/tests/test_pipelines/misc_tests/pipe_batch.toml +++ b/tests/test_pipelines/misc_tests/pipe_batch.toml @@ -4,20 +4,19 @@ domain = "test_pipe_batch" definition = "Pipelines to test Pipe Batch" [concept] -[concept.Random] -Concept = "TestPipeBatchItem" +TestPipeBatchItem = "Could be anything" [pipe.test_pipe_batch_item] PipeLLM = "Test Pipe Batch" -input = "TestPipeBatchItem" +inputs = { BATCH_ITEM = "TestPipeBatchItem" } output = "TestPipeBatchItem" prompt_template = """ Output this "item" and add "Hello" at the beginning of the output. 
-{{ _batch_item|tag("item") }} +{{ BATCH_ITEM|tag("item") }} """ [pipe.test_pipe_batch] PipeBatch = "Test Pipe Batch 2" -input = "TestPipeBatchItem" +inputs = { BATCH_ITEM = "TestPipeBatchItem" } output = "TestPipeBatchItem" branch_pipe_code = "test_pipe_batch_item" diff --git a/tests/test_pipelines/misc_tests/subfolder_1/cars.toml b/tests/test_pipelines/misc_tests/subfolder_1/cars.toml index f1e161258..b452dbb1b 100644 --- a/tests/test_pipelines/misc_tests/subfolder_1/cars.toml +++ b/tests/test_pipelines/misc_tests/subfolder_1/cars.toml @@ -11,7 +11,7 @@ CarDescription = "A detailed description of a car model" [pipe] [pipe.generate_car_description] PipeLLM = "Generate a description of a car" -input = "Car" +inputs = { car = "Car" } output = "CarDescription" prompt_template = """ Given the reference to a car, generate a description of the car. diff --git a/tests/test_pipelines/misc_tests/subfolder_2/animals.toml b/tests/test_pipelines/misc_tests/subfolder_2/animals.toml index 138cd25ee..6dbf6f5bf 100644 --- a/tests/test_pipelines/misc_tests/subfolder_2/animals.toml +++ b/tests/test_pipelines/misc_tests/subfolder_2/animals.toml @@ -11,7 +11,7 @@ AnimalDescription = "A detailed description of an animal" [pipe] [pipe.generate_animal_description] PipeLLM = "Generate a description of an animal" -input = "Animal" +inputs = { animal = "Animal" } output = "AnimalDescription" prompt_template = """ Given the reference to an animal, generate a description of the animal. 
diff --git a/tests/test_pipelines/misc_tests/subfolder_2/subfolder/flowers.toml b/tests/test_pipelines/misc_tests/subfolder_2/subfolder/flowers.toml index 394464d88..90b2df67d 100644 --- a/tests/test_pipelines/misc_tests/subfolder_2/subfolder/flowers.toml +++ b/tests/test_pipelines/misc_tests/subfolder_2/subfolder/flowers.toml @@ -9,7 +9,7 @@ FlowerDescription = "A detailed description of a flower" [pipe] [pipe.generate_flower_description] PipeLLM = "Generate a description of a flower" -input = "Flower" +inputs = { flower = "Flower" } output = "FlowerDescription" prompt_template = """ Given the reference to a flower, generate a description of the flower. diff --git a/tests/test_pipelines/misc_tests/test_errors.toml b/tests/test_pipelines/misc_tests/test_errors.toml index f68ee0588..1bb013af2 100644 --- a/tests/test_pipelines/misc_tests/test_errors.toml +++ b/tests/test_pipelines/misc_tests/test_errors.toml @@ -5,7 +5,7 @@ definition = "This library is intended for testing errors" # [pipe.jinja2_syntax_error] # PipeLLM = "Jinja2 syntax error (on purpose, for testing): bad curly braces" -# input = "native.Text" +# inputs = { text = "native.Text" } # output = "native.Text" # prompt_template = """ # {{ a123|tag } @@ -13,7 +13,7 @@ definition = "This library is intended for testing errors" # [pipe.jinja2_dummy_filter1] # PipeLLM = "Jinja2 undefined filter (on purpose, for testing)" -# input = "native.Text" +# inputs = { text = "native.Text" } # output = "native.Text" # prompt_template = """ # {{ foobar|dummy_filter1 }} @@ -21,7 +21,7 @@ definition = "This library is intended for testing errors" # [pipe.jinja2_dummy_filter2] # PipeLLM = "Jinja2 undefined filter (on purpose, for testing)" -# input = "native.Text" +# inputs = { text = "native.Text" } # output = "native.Text" # prompt_template = """ # {{ foobar|dummy_filter2() }} @@ -29,7 +29,7 @@ definition = "This library is intended for testing errors" # [pipe.jinja2_dummy_filter3] # PipeLLM = "Jinja2 undefined filter 
(on purpose, for testing)" -# input = "native.Text" +# inputs = { text = "native.Text" } # output = "native.Text" # prompt_template = """ # {{ foobar|dummy_filter3("param") }} @@ -38,7 +38,7 @@ definition = "This library is intended for testing errors" # [pipe.jinja2_dummy_filter4] # PipeLLM = "Jinja2 undefined filter (on purpose, for testing)" -# input = "native.Text" +# inputs = { text = "native.Text" } # output = "native.Text" # prompt_template = """ # {{ foobar|dummy_filter4(param1, param2) }} diff --git a/tests/test_pipelines/misc_tests/test_jinja2.toml b/tests/test_pipelines/misc_tests/test_jinja2.toml index 65dccb9fa..358d348db 100644 --- a/tests/test_pipelines/misc_tests/test_jinja2.toml +++ b/tests/test_pipelines/misc_tests/test_jinja2.toml @@ -6,7 +6,7 @@ definition = "This library is intended for testing Jinja2" [pipe.jinja2_test_1] PipeJinja2 = "Jinja2 test 1" -input = "native.Text" +inputs = { text = "native.Text" } output = "native.Text" jinja2 = """ This is a simple test prompt: diff --git a/tests/test_pipelines/misc_tests/tests.toml b/tests/test_pipelines/misc_tests/tests.toml index a244d6bf1..f4096f04e 100644 --- a/tests/test_pipelines/misc_tests/tests.toml +++ b/tests/test_pipelines/misc_tests/tests.toml @@ -10,7 +10,7 @@ Complex = "A complex object" [pipe.simple_llm_test_from_text] PipeLLM = "Simple LLM test from text" -input = "native.Text" +inputs = { text = "native.Text" } output = "native.Text" prompt_template = """ This is a simple test prompt: @@ -18,7 +18,7 @@ This is a simple test prompt: """ [pipe.simple_llm_test_from_image] PipeLLM = "Simple LLM test from image" -input = "native.Image" +inputs = { image = "native.Image" } output = "native.Text" prompt_template = """ Describe the using rap lyrics, including puns and references to the image. 
@@ -37,7 +37,7 @@ multiple_output = true [pipe.create_synopsis] PipeLLM = "Create Synopsis" -input = "native.Text" +inputs = { text = "native.Text" } output = "native.Text" llm = "llm_for_creative_writing" prompt_template = """ @@ -48,42 +48,6 @@ Create a short synopsis for a story about: Be concise: 3 sentences max. """ - -# TODO: fix testing implict concept -[pipe.test_implicit_concept] -PipeLLM = "Implicit concept" -output = "RandomConceptCodeThatDoesNotExist" -prompt_template = """ -This is is a test prompt. - -Output what is here: - -{{ a123|tag }} -""" - -[pipe.is_article_critic_reasonable] -PipeLLM = "Article Critic Evaluator" -input = "ArticleAndCritic" -output = "native.Text" -prompt_template = """ -You are an expert at evaluating article critiques. Your task is to determine if the given critique of an article is reasonable and valuable. -Article to evaluate: - -{{ my_article|tag }} - -Critique to evaluate: - -{{ critic|tag }} - -Evaluate if this critique is reasonable and valuable. Consider the following aspects: -1. Is the critique specific and detailed? -2. Does it provide constructive feedback? -3. Is it supported by examples from the article? -4. Is the tone professional? -5. Does it focus on the content rather than personal attacks? -Output only 'true' if the critique is reasonable and valuable, or 'false' if it is not. -""" - [pipe.test_no_input] PipeLLM = "No Input" output = "native.Text" @@ -101,7 +65,7 @@ Tell me a short story about a red baloon. 
""" [pipe.generate_3_images] -PipeImgGen = "Generate 3 AI images" -input = "images.ImggPrompt" +PipeImgGen = "Generate 3 images" +inputs = { prompt = "images.ImgGenPrompt" } output = "native.Image" nb_output = 3 diff --git a/tests/test_pipelines/test_images.toml b/tests/test_pipelines/test_images.toml new file mode 100644 index 000000000..bbaaf9bce --- /dev/null +++ b/tests/test_pipelines/test_images.toml @@ -0,0 +1,22 @@ + + +domain = "test_images" +definition = "Test domain for images" + +[concept] + +[pipe] + + +[pipe.generate_cutie_image] +PipeImgGen = "Generate an image of a cutie" +output = "Image" +nb_steps = 2 +img_gen_prompt = "Cartoon image of a cutie that likes to play with pipes" + + +[pipe.generate_cutie_photo] +PipeImgGen = "Generate a photo of a cutie" +output = "images.Photo" +nb_steps = 8 +img_gen_prompt = "Realistic photo of a cutie that likes to play with pipes" diff --git a/tests/test_pipelines/tricky_questions.py b/tests/test_pipelines/tricky_questions.py new file mode 100644 index 000000000..935017e9b --- /dev/null +++ b/tests/test_pipelines/tricky_questions.py @@ -0,0 +1,16 @@ +from pydantic import Field + +from pipelex.core.stuff_content import StructuredContent + + +class QuestionAnalysis(StructuredContent): + explanation: str + trickiness_rating: int = Field(..., ge=1, le=100) + deceptiveness_rating: int = Field(..., ge=1, le=100) + + +class ThoughtfulAnswer(StructuredContent): + the_trap: str + the_counter: str + the_lesson: str + the_answer: str diff --git a/tests/test_pipelines/tricky_questions.toml b/tests/test_pipelines/tricky_questions.toml new file mode 100644 index 000000000..a44246dfc --- /dev/null +++ b/tests/test_pipelines/tricky_questions.toml @@ -0,0 +1,77 @@ +domain = "test_tricky_questions" +definition = "Domain for testing tricky questions" + +[concept] +QuestionAnalysis = "An analysis of a question, determining whether it's tricky" +AnswerToAQuestion = "Answer to a question" +ThoughtfulAnswerConclusion = "Conclusion of a 
thoughtful answer" + +[concept.ThoughtfulAnswer] +Concept = "A thoughtful answer to a question" +structure = "ThoughtfulAnswer" +refines = ["AnswerToAQuestion"] + +[pipe] + +[pipe.analyse_question_tricky] +PipeLLM = "Analyze a question to determine whether it's straightforward or tricky" +inputs = { question = "answer.Question" } +output = "QuestionAnalysis" +llm = "llm_to_reason" +prompt_template = """ +Here is a question for an LLM: +@question + +Do you think it's tricky, or maybe even a deceptive trap? +Does it assume things that are not necessarily true? +Does it suggest patterns that aren't applicable? + +Please explain what you think and then give a rating between 0 to 100 of trickiness and another rating between 0 to 100 of deceptiveness. +If there's an obvious trap, state it without getting into details. +""" + +[pipe.answer_after_analysis] +PipeLLM = "Answer knowingly after analyzing a question" +inputs = { question = "answer.Question", question_analysis = "QuestionAnalysis" } +output = "ThoughtfulAnswer" +llm = "llm_to_reason" +prompt_template = """ +A question was asked: +@question + +A thoughtful analysis was given: +@question_analysis + +If the question was tricky or deceptive, don't get fooled! +Answer in 4 parts: +1- the_trap: Explain the trap in a 1 sentence +2- the_counter: Counter by stating the right way to think about the question and avoid the trap +3- the_lesson: Did we learn anything? 
+4- the_answer: Then give a good answer expressed without mentioning the trap +""" + +[pipe.answer_tricky_question_by_steps] +PipeSequence = "Answer a tricky question by first analyzing its trickiness" +inputs = { question = "answer.Question" } +output = "ThoughtfulAnswer" +steps = [ + { pipe = "analyse_question_tricky", result = "question_analysis" }, + { pipe = "answer_after_analysis", result = "answer" }, +] + +[pipe.conclude_thoughtful_answer] +PipeJinja2 = "Conclude a thoughtful answer" +inputs = { thoughtful_answer = "ThoughtfulAnswer" } +output = "ThoughtfulAnswerConclusion" +jinja2 = "After analyzing the question, here is my answer: $thoughtful_answer.the_answer" + + +[pipe.conclude_tricky_question_by_steps] +PipeSequence = "Answer a tricky question by first analyzing its trickiness and then concluding" +inputs = { question = "answer.Question" } +output = "ThoughtfulAnswerConclusion" +steps = [ + { pipe = "analyse_question_tricky", result = "question_analysis" }, + { pipe = "answer_after_analysis", result = "thoughtful_answer" }, + { pipe = "conclude_thoughtful_answer", result = "thoughtful_answer_conclusion" }, +] diff --git a/tests/tools/typing/test_pydantic_utils.py b/tests/tools/typing/test_pydantic_utils.py index 309b31ca4..42920d767 100644 --- a/tests/tools/typing/test_pydantic_utils.py +++ b/tests/tools/typing/test_pydantic_utils.py @@ -3,6 +3,7 @@ import pytest from pydantic import BaseModel, ConfigDict, Field, ValidationError +from pipelex.tools.misc.attribute_utils import AttributePolisher from pipelex.tools.typing.pydantic_utils import ( CustomBaseModel, ExtraFieldAttribute, @@ -100,7 +101,8 @@ class TestModel(CustomBaseModel): url: str other: str - TestModel.truncate_length = 10 + AttributePolisher.base_64_truncate_length = 10 + AttributePolisher.url_truncate_length = 10 model = TestModel( base_64="b" * 20, url="data:image/png;base64," + "x" * 20, diff --git a/uv.lock b/uv.lock index e86f1e461..f1cbb9bd6 100644 --- a/uv.lock +++ b/uv.lock @@ 
-517,6 +517,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/09/2aea36ff60d16dd8879bdb2f5b3ee0ba8d08cbbdcdfe870e695ce3784385/execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc", size = 40612, upload-time = "2024-04-08T09:04:17.414Z" }, ] +[[package]] +name = "faker" +version = "37.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/4b/5354912eaff922876323f2d07e21408b10867f3295d5f917748341cb6f53/faker-37.3.0.tar.gz", hash = "sha256:77b79e7a2228d57175133af0bbcdd26dc623df81db390ee52f5104d46c010f2f", size = 1901376, upload-time = "2025-05-14T15:24:18.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/99/045b2dae19a01b9fbb23b9971bc04f4ef808e7f3a213d08c81067304a210/faker-37.3.0-py3-none-any.whl", hash = "sha256:48c94daa16a432f2d2bc803c7ff602509699fca228d13e97e379cd860a7e216e", size = 1942203, upload-time = "2025-05-14T15:24:16.159Z" }, +] + [[package]] name = "fal-client" version = "0.7.0" @@ -1397,7 +1409,7 @@ wheels = [ [[package]] name = "pipelex" -version = "0.2.12" +version = "0.2.14" source = { editable = "." 
} dependencies = [ { name = "aiofiles" }, @@ -1414,6 +1426,7 @@ dependencies = [ { name = "openpyxl" }, { name = "pandas" }, { name = "pillow" }, + { name = "polyfactory" }, { name = "pydantic" }, { name = "pypdfium2" }, { name = "python-dotenv" }, @@ -1493,6 +1506,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.2.3" }, { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.2.3.241126" }, { name = "pillow", specifier = ">=11.2.1" }, + { name = "polyfactory", specifier = ">=2.21.0" }, { name = "pydantic", specifier = "==2.10.6" }, { name = "pypdfium2", specifier = ">=4.30.1" }, { name = "pyright", marker = "extra == 'dev'", specifier = "==1.1.398" }, @@ -1531,6 +1545,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "polyfactory" +version = "2.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "faker" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bd/d0/8ce6a9912a6f1077710ebc46a6aa9a79a64a06b69d2d6b4ccefc9765ce8f/polyfactory-2.21.0.tar.gz", hash = "sha256:a6d8dba91b2515d744cc014b5be48835633f7ccb72519a68f8801759e5b1737a", size = 246314, upload-time = "2025-04-18T10:19:33.852Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/ba/c148fba517a0aaccfc4fca5e61bf2a051e084a417403e930dc615886d4e6/polyfactory-2.21.0-py3-none-any.whl", hash = "sha256:9483b764756c8622313d99f375889b1c0d92f09affb05742d7bcfa2b5198d8c5", size = 60875, upload-time = "2025-04-18T10:19:31.881Z" }, +] + [[package]] name = "propcache" version = "0.3.1" From 0b43596ee4cb07170301a368dac926521eacf027 Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Mon, 9 Jun 2025 15:59:32 +0200 Subject: [PATCH 2/7] bump 
to 0.3.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8a254f67d..252d88222 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pipelex" -version = "0.2.14" +version = "0.3.0" description = "Pipelex is an open-source dev tool based on a simple declarative language that lets you define replicable, structured, composable LLM pipelines." authors = [{ name = "Evotis S.A.S.", email = "evotis@pipelex.com" }] maintainers = [{ name = "Pipelex staff", email = "oss@pipelex.com" }] From 32001f0852a515471341fe1fa37f24e58d2bd4ad Mon Sep 17 00:00:00 2001 From: thomashebrard Date: Mon, 9 Jun 2025 15:59:47 +0200 Subject: [PATCH 3/7] make li --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index f1cbb9bd6..c748492cd 100644 --- a/uv.lock +++ b/uv.lock @@ -1409,7 +1409,7 @@ wheels = [ [[package]] name = "pipelex" -version = "0.2.14" +version = "0.3.0" source = { editable = "." 
} dependencies = [ { name = "aiofiles" }, From 649bc2a4fab2edeee6585b24df4f6f402c986684 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Mon, 9 Jun 2025 16:59:39 +0200 Subject: [PATCH 4/7] Fix/validation of pipe ocr fields (#69) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 📝 Description - Remove smelly validation of None PipeOcr fields ### 🔄 Type of Change - [X] 🐛 Bug fix - [ ] ✨ New feature - [ ] 💥 Breaking change - [ ] 📚 Documentation update - [ ] 🧹 Code refactor - [ ] ⚡ Performance improvement - [ ] ✅ Test update --- pipelex/pipe_operators/pipe_ocr.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pipelex/pipe_operators/pipe_ocr.py b/pipelex/pipe_operators/pipe_ocr.py index 7f0b20fac..14ce618b5 100644 --- a/pipelex/pipe_operators/pipe_ocr.py +++ b/pipelex/pipe_operators/pipe_ocr.py @@ -1,6 +1,6 @@ from typing import List, Optional -from pydantic import field_validator, model_validator +from pydantic import model_validator from typing_extensions import Self, override from pipelex import log @@ -47,13 +47,6 @@ class PipeOcr(PipeOperator): image_stuff_name: Optional[str] = None pdf_stuff_name: Optional[str] = None - @field_validator("image_stuff_name", "pdf_stuff_name") - @classmethod - def validate_input_stuff_name_not_provided_as_attribute(cls, v: Optional[str]) -> Optional[str]: - if v is not None: - raise PipeDefinitionError("image_stuff_name and pdf_stuff_name must be None before input validation") - return v - @model_validator(mode="after") def validate_inputs(self) -> Self: self._validate_inputs() @@ -88,7 +81,7 @@ def _validate_inputs(self): pipe_code=self.code, variable_names=[input_name], provided_concept_code=input_concept_code, - explanation="For OCR you must provide either a pdf or an image or a concept that refines them", + explanation="For OCR you must provide either a pdf or an image or a concept that refines one of them", ) match 
reactions.get(StaticValidationErrorType.INADEQUATE_INPUT_CONCEPT, default_reaction): case StaticValidationReaction.IGNORE: @@ -117,7 +110,7 @@ def _validate_inputs(self): error_type=StaticValidationErrorType.MISSING_INPUT_VARIABLE, domain_code=self.domain, pipe_code=self.code, - explanation="For OCR you must provide either a pdf or an image or a concept that refines them", + explanation="For OCR you must provide either a pdf or an image or a concept that refines one of them", ) match reactions.get(StaticValidationErrorType.MISSING_INPUT_VARIABLE, default_reaction): case StaticValidationReaction.IGNORE: From a3d930b99a3bf115092855ae401e7db46aa50504 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Tue, 10 Jun 2025 11:11:06 +0200 Subject: [PATCH 5/7] Feature/pytest marker and make targets for dry run (#72) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 🔗 Related Issues ### 📝 Description - Added pytest marker `dry_runable` - Added `make` targets more powerful thanks to dry run: more test coverage by dry running pipes without inference - Added "-n auto" to gha and codex test targets in order to use pytest-xdist and be faster by using multiple CPUs ### 🔄 Type of Change - [ ] 🐛 Bug fix - [X] ✨ New feature - [ ] 💥 Breaking change - [ ] 📚 Documentation update - [ ] 🧹 Code refactor - [ ] ⚡ Performance improvement - [X] ✅ Test update --- Makefile | 36 ++++++++++--------- pipelex/pipelex.toml | 2 +- pyproject.toml | 1 + .../pipelex/pipelex_asynch/test_pipe_batch.py | 1 + .../pipelex/pipelex_asynch/test_pipe_imgg.py | 1 + tests/pipelex/pipelex_asynch/test_pipe_llm.py | 1 + tests/pipelex/pipelex_asynch/test_pipe_ocr.py | 1 + .../test_pipe_running_variants.py | 1 + 8 files changed, 26 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index fc6c08af4..12cd5a8d4 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,8 @@ VENV_PIPELEX := $(VIRTUAL_ENV)/bin/pipelex UV_MIN_VERSION = $(shell grep -m1 'required-version' 
pyproject.toml | sed -E 's/.*= *"([^<>=, ]+).*/\1/') +USUAL_PYTEST_MARKERS := "(dry_runable or not (inference or llm or imgg or ocr)) and not (needs_output or pipelex_api)" + define PRINT_TITLE $(eval PROJECT_PART := [$(PROJECT_NAME)]) $(eval TARGET_PART := ($@)) @@ -203,17 +205,17 @@ cleanall: cleanderived cleanenv cleanlibraries codex-tests: env $(call PRINT_TITLE,"Unit testing for Codex") @echo "• Running unit tests for Codex (excluding inference and codex_disabled)" - $(VENV_PYTEST) --exitfirst --quiet -m "not (inference or codex_disabled or pipelex_api)" || [ $$? = 5 ] + $(VENV_PYTEST) -n auto --exitfirst --quiet -m "(dry_runable or not inference) and not (needs_output or pipelex_api)" || [ $$? = 5 ] gha-tests: env $(call PRINT_TITLE,"Unit testing for github actions") @echo "• Running unit tests for github actions (excluding inference and gha_disabled)" - $(VENV_PYTEST) --exitfirst --quiet -m "not (inference or gha_disabled or pipelex_api)" || [ $$? = 5 ] + $(VENV_PYTEST) -n auto --exitfirst --quiet -m "(dry_runable or not inference) and not (gha_disabled or pipelex_api)" || [ $$? 
= 5 ] run-all-tests: env $(call PRINT_TITLE,"Running all unit tests") @echo "• Running all unit tests" - $(VENV_PYTEST) --exitfirst --quiet + $(VENV_PYTEST) -n auto --exitfirst --quiet run-manual-trigger-gha-tests: env $(call PRINT_TITLE,"Running GHA tests") @@ -229,18 +231,18 @@ test: env $(call PRINT_TITLE,"Unit testing without prints but displaying logs via pytest for WARNING level and above") @echo "• Running unit tests" @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) -s -o log_cli=true -o log_level=WARNING -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -s -m $(USUAL_PYTEST_MARKERS) -o log_cli=true -o log_level=WARNING -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) -s -o log_cli=true -o log_level=WARNING $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -s -m $(USUAL_PYTEST_MARKERS) -o log_cli=true -o log_level=WARNING $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi test-xdist: env $(call PRINT_TITLE,"Unit testing without prints but displaying logs via pytest for WARNING level and above") @echo "• Running unit tests" @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) -n auto -o log_level=WARNING -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -n auto -m $(USUAL_PYTEST_MARKERS) -o log_level=WARNING -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) -n auto -o log_level=WARNING $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -n auto -m $(USUAL_PYTEST_MARKERS) -o log_level=WARNING $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 
2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi t: test-xdist @@ -250,9 +252,9 @@ test-quiet: env $(call PRINT_TITLE,"Unit testing without prints but displaying logs via pytest for WARNING level and above") @echo "• Running unit tests" @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) -o log_cli=true -o log_level=WARNING -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -m $(USUAL_PYTEST_MARKERS) -o log_cli=true -o log_level=WARNING -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) -o log_cli=true -o log_level=WARNING $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -m $(USUAL_PYTEST_MARKERS) -o log_cli=true -o log_level=WARNING $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi tq: test-quiet @@ -262,9 +264,9 @@ test-with-prints: env $(call PRINT_TITLE,"Unit testing with prints and our rich logs") @echo "• Running unit tests" @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -s -m $(USUAL_PYTEST_MARKERS) -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) -s -m $(USUAL_PYTEST_MARKERS) $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi tp: test-with-prints @@ -273,9 +275,9 @@ tp: test-with-prints test-inference: env $(call PRINT_TITLE,"Unit testing") @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) --exitfirst -m "inference and not imgg" -s -k "$(TEST)" $(if $(filter 
1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) --pipe-run-mode live --exitfirst -m "inference and not imgg" -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) --exitfirst -m "inference and not imgg" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) --pipe-run-mode live --exitfirst -m "inference and not imgg" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi ti: test-inference @@ -284,9 +286,9 @@ ti: test-inference test-ocr: env $(call PRINT_TITLE,"Unit testing ocr") @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) --exitfirst -m "ocr" -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) --pipe-run-mode live --exitfirst -m "ocr" -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) --exitfirst -m "ocr" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) --pipe-run-mode live --exitfirst -m "ocr" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi to: test-ocr @@ -295,9 +297,9 @@ to: test-ocr test-imgg: env $(call PRINT_TITLE,"Unit testing") @if [ -n "$(TEST)" ]; then \ - $(VENV_PYTEST) --exitfirst -m "imgg" -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) --pipe-run-mode live --exitfirst -m "imgg" -s -k "$(TEST)" $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ else \ - $(VENV_PYTEST) --exitfirst -m "imgg" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 
2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ + $(VENV_PYTEST) --pipe-run-mode live --exitfirst -m "imgg" -s $(if $(filter 1,$(VERBOSE)),-v,$(if $(filter 2,$(VERBOSE)),-vv,$(if $(filter 3,$(VERBOSE)),-vvv,))); \ fi tg: test-imgg diff --git a/pipelex/pipelex.toml b/pipelex/pipelex.toml index d7a47ff48..79431a752 100644 --- a/pipelex/pipelex.toml +++ b/pipelex/pipelex.toml @@ -244,4 +244,4 @@ pipe_stack_limit = 20 [pipelex.dry_run_config] apply_to_jinja2_rendering = false -text_gen_truncate_length = 256 \ No newline at end of file +text_gen_truncate_length = 256 diff --git a/pyproject.toml b/pyproject.toml index 252d88222..d4f764213 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -208,6 +208,7 @@ markers = [ "ocr: slow and costly due to ocr inference calls", "gha_disabled: tests that should not run in GitHub Actions", "codex_disabled: tests that should not run in Codex", + "dry_runable: tests that can be run in dry-run mode", ] minversion = "8.0" diff --git a/tests/pipelex/pipelex_asynch/test_pipe_batch.py b/tests/pipelex/pipelex_asynch/test_pipe_batch.py index d2b9eb4a3..5c1d72307 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_batch.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_batch.py @@ -10,6 +10,7 @@ from pipelex.hub import get_pipe_router, get_pipeline_tracker, get_report_delegate +@pytest.mark.dry_runable @pytest.mark.llm @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_imgg.py b/tests/pipelex/pipelex_asynch/test_pipe_imgg.py index ad501bcfb..56539cff4 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_imgg.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_imgg.py @@ -11,6 +11,7 @@ from tests.pipelex.test_data import IMGGTestCases +@pytest.mark.dry_runable @pytest.mark.imgg @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_llm.py b/tests/pipelex/pipelex_asynch/test_pipe_llm.py index 6b895a69d..1332fd153 
100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_llm.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_llm.py @@ -16,6 +16,7 @@ from tests.pipelex.test_data import PipeTestCases +@pytest.mark.dry_runable @pytest.mark.llm @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_ocr.py b/tests/pipelex/pipelex_asynch/test_pipe_ocr.py index 9ebe82096..f0915ede5 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_ocr.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_ocr.py @@ -13,6 +13,7 @@ from tests.pipelex.test_data import PipeOcrTestCases +@pytest.mark.dry_runable @pytest.mark.ocr @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py b/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py index ba7ab0c20..e7df2f1a3 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py @@ -17,6 +17,7 @@ from tests.pipelex.test_data import PipeTestCases +@pytest.mark.dry_runable @pytest.mark.llm @pytest.mark.ocr @pytest.mark.inference From bd9d6b38fa3ccfcb9a0f7ebeffd70292fdc33e11 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Tue, 10 Jun 2025 15:41:48 +0200 Subject: [PATCH 6/7] Be more lax on having domain definition: it's now Optional (#73) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### 📝 Description * Be more lax on having domain definition: it's now Optional ### 🔄 Type of Change - [ ] 🐛 Bug fix - [X] ✨ New feature - [ ] 💥 Breaking change - [ ] 📚 Documentation update - [ ] 🧹 Code refactor - [ ] ⚡ Performance improvement - [ ] ✅ Test update ### 🧪 Tests * tests/test_pipelines/failure_modes.toml has no domain definition set, and it's OK --- CHANGELOG.md | 16 +++++++++++++++- Makefile | 6 +++--- pipelex/core/domain.py | 4 ++-- pipelex/libraries/library_manager.py | 9 ++++----- pyproject.toml | 2 +- 
tests/pipelex/pipelex_asynch/test_pipe_batch.py | 2 +- tests/pipelex/pipelex_asynch/test_pipe_imgg.py | 2 +- tests/pipelex/pipelex_asynch/test_pipe_llm.py | 2 +- tests/pipelex/pipelex_asynch/test_pipe_ocr.py | 2 +- .../pipelex_asynch/test_pipe_running_variants.py | 2 +- tests/test_pipelines/failure_modes.toml | 1 - 11 files changed, 30 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ce0a74ac..541d2e980 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,20 @@ # Changelog -## [v0.3.0] - 2025-06-10 +## [v0.3.1] - 2025-06-10 + +### Added +- New pytest marker `dry_runnable` for tests that can run without inference. +- Enhanced `make` targets with dry-run capabilities for improved test coverage: + - `make test-xdist` (or `make t`): Runs all non-inference tests **plus inference tests** that support dry-runs - fast and resource-efficient + - `make test-inference` (or `make ti`): Runs tests requiring actual inference, with actual inference (slow and costly) +- Parallel test execution using `pytest-xdist` (`-n auto`) enabled for: + - GitHub Actions workflows + - Codex test targets + +### Changed +- Domain validation is now less restrictive in pipeline TOML: the `definition` attribute is now `Optional` + +## [v0.3.0] - 2025-06-09 ### Highlights diff --git a/Makefile b/Makefile index 12cd5a8d4..4366989f3 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ VENV_PIPELEX := $(VIRTUAL_ENV)/bin/pipelex UV_MIN_VERSION = $(shell grep -m1 'required-version' pyproject.toml | sed -E 's/.*= *"([^<>=, ]+).*/\1/') -USUAL_PYTEST_MARKERS := "(dry_runable or not (inference or llm or imgg or ocr)) and not (needs_output or pipelex_api)" +USUAL_PYTEST_MARKERS := "(dry_runnable or not (inference or llm or imgg or ocr)) and not (needs_output or pipelex_api)" define PRINT_TITLE $(eval PROJECT_PART := [$(PROJECT_NAME)]) @@ -205,12 +205,12 @@ cleanall: cleanderived cleanenv cleanlibraries codex-tests: env $(call PRINT_TITLE,"Unit testing for Codex") @echo "• 
Running unit tests for Codex (excluding inference and codex_disabled)" - $(VENV_PYTEST) -n auto --exitfirst --quiet -m "(dry_runable or not inference) and not (needs_output or pipelex_api)" || [ $$? = 5 ] + $(VENV_PYTEST) -n auto --exitfirst --quiet -m "(dry_runnable or not inference) and not (needs_output or pipelex_api)" || [ $$? = 5 ] gha-tests: env $(call PRINT_TITLE,"Unit testing for github actions") @echo "• Running unit tests for github actions (excluding inference and gha_disabled)" - $(VENV_PYTEST) -n auto --exitfirst --quiet -m "(dry_runable or not inference) and not (gha_disabled or pipelex_api)" || [ $$? = 5 ] + $(VENV_PYTEST) -n auto --exitfirst --quiet -m "(dry_runnable or not inference) and not (gha_disabled or pipelex_api)" || [ $$? = 5 ] run-all-tests: env $(call PRINT_TITLE,"Running all unit tests") diff --git a/pipelex/core/domain.py b/pipelex/core/domain.py index 3d2081c3c..04cfbd35d 100644 --- a/pipelex/core/domain.py +++ b/pipelex/core/domain.py @@ -13,7 +13,7 @@ class SpecialDomain(StrEnum): class Domain(BaseModel): code: str - definition: str + definition: Optional[str] = None system_prompt: Optional[str] = None system_prompt_to_structure: Optional[str] = None prompt_template_to_structure: Optional[str] = None @@ -24,4 +24,4 @@ def __str__(self): @classmethod def make_default(cls) -> Self: - return cls(code=SpecialDomain.NATIVE, definition="") + return cls(code=SpecialDomain.NATIVE) diff --git a/pipelex/libraries/library_manager.py b/pipelex/libraries/library_manager.py index 1871e70ff..f527f9f75 100644 --- a/pipelex/libraries/library_manager.py +++ b/pipelex/libraries/library_manager.py @@ -128,7 +128,10 @@ def _load_combo_libraries(self, library_paths: List[str]): library_name = toml_path.stem domain_code = library_dict.get("domain") if domain_code is None: - raise LibraryParsingError(f"Error loafing library '{library_name}' which has no domain set at '{toml_path}'") + raise LibraryParsingError( + f"Error loading library '{library_name}' 
which has no domain set at '{toml_path}'. " + "Just write 'domain = \"my_domain\"' at the top of the file." + ) domain_definition = library_dict.get("definition") if domain_definition is None: # we skip the domain without definition, it must be defined one and only one time in the domain library @@ -177,10 +180,6 @@ def _load_combo_libraries(self, library_paths: List[str]): def _load_library_dict(self, library_name: str, library_dict: Dict[str, Any], component_type: LibraryComponent): if domain_code := library_dict.pop("domain", None): - if not self.domain_library.get_domain(domain_code=domain_code): - raise LibraryParsingError( - f"Domain '{domain_code}' is has not been defined in the domain libraryn make sure it has exactlyone definition" - ) # domain is set at the root of the library self._load_library_components_from_recursive_dict( domain_code=domain_code, diff --git a/pyproject.toml b/pyproject.toml index d4f764213..52fc50e16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -208,7 +208,7 @@ markers = [ "ocr: slow and costly due to ocr inference calls", "gha_disabled: tests that should not run in GitHub Actions", "codex_disabled: tests that should not run in Codex", - "dry_runable: tests that can be run in dry-run mode", + "dry_runnable: tests that can be run in dry-run mode", ] minversion = "8.0" diff --git a/tests/pipelex/pipelex_asynch/test_pipe_batch.py b/tests/pipelex/pipelex_asynch/test_pipe_batch.py index 5c1d72307..be772c723 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_batch.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_batch.py @@ -10,7 +10,7 @@ from pipelex.hub import get_pipe_router, get_pipeline_tracker, get_report_delegate -@pytest.mark.dry_runable +@pytest.mark.dry_runnable @pytest.mark.llm @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_imgg.py b/tests/pipelex/pipelex_asynch/test_pipe_imgg.py index 56539cff4..3ef3e5f71 100644 --- 
a/tests/pipelex/pipelex_asynch/test_pipe_imgg.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_imgg.py @@ -11,7 +11,7 @@ from tests.pipelex.test_data import IMGGTestCases -@pytest.mark.dry_runable +@pytest.mark.dry_runnable @pytest.mark.imgg @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_llm.py b/tests/pipelex/pipelex_asynch/test_pipe_llm.py index 1332fd153..86535a7ca 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_llm.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_llm.py @@ -16,7 +16,7 @@ from tests.pipelex.test_data import PipeTestCases -@pytest.mark.dry_runable +@pytest.mark.dry_runnable @pytest.mark.llm @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_ocr.py b/tests/pipelex/pipelex_asynch/test_pipe_ocr.py index f0915ede5..349da9321 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_ocr.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_ocr.py @@ -13,7 +13,7 @@ from tests.pipelex.test_data import PipeOcrTestCases -@pytest.mark.dry_runable +@pytest.mark.dry_runnable @pytest.mark.ocr @pytest.mark.inference @pytest.mark.asyncio(loop_scope="class") diff --git a/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py b/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py index e7df2f1a3..023a733e7 100644 --- a/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py +++ b/tests/pipelex/pipelex_asynch/test_pipe_running_variants.py @@ -17,7 +17,7 @@ from tests.pipelex.test_data import PipeTestCases -@pytest.mark.dry_runable +@pytest.mark.dry_runnable @pytest.mark.llm @pytest.mark.ocr @pytest.mark.inference diff --git a/tests/test_pipelines/failure_modes.toml b/tests/test_pipelines/failure_modes.toml index 6abb942ea..a46283c99 100644 --- a/tests/test_pipelines/failure_modes.toml +++ b/tests/test_pipelines/failure_modes.toml @@ -1,7 +1,6 @@ domain = "failure_modes" -definition = "This domain is for testing failure modes" 
[concept] From 67729ea654d9958782740c1b0305683f3899ec43 Mon Sep 17 00:00:00 2001 From: Louis Choquel Date: Tue, 10 Jun 2025 15:58:01 +0200 Subject: [PATCH 7/7] Bump v0.3.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 52fc50e16..1130c1845 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pipelex" -version = "0.3.0" +version = "0.3.1" description = "Pipelex is an open-source dev tool based on a simple declarative language that lets you define replicable, structured, composable LLM pipelines." authors = [{ name = "Evotis S.A.S.", email = "evotis@pipelex.com" }] maintainers = [{ name = "Pipelex staff", email = "oss@pipelex.com" }]