diff --git a/.github/workflows/update-docs.yml b/.github/workflows/update-docs.yml new file mode 100644 index 0000000..a4b223a --- /dev/null +++ b/.github/workflows/update-docs.yml @@ -0,0 +1,38 @@ +name: Update reference docs + +on: + pull_request: + paths: + - "src/parxy_cli/commands/**" + - "src/parxy_cli/cli.py" + - "src/parxy_core/models/config.py" + - "scripts/generate_docs.py" + +jobs: + update-docs: + name: Regenerate reference docs + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 1 + + - name: Install uv + uses: astral-sh/setup-uv@v7.3.1 + with: + enable-cache: true + + - name: Install dependencies + run: uv sync + + - name: Generate reference docs + run: uv run python scripts/generate_docs.py + + - name: Commit if changed + uses: stefanzweifel/git-auto-commit-action@v7.1.0 + with: + commit_message: "docs: sync CLI and configuration reference" + file_pattern: "docs/reference/*.md" diff --git a/docs/howto/add_new_parser.md b/docs/howto/add_new_parser.md index f4d644f..e04f704 100644 --- a/docs/howto/add_new_parser.md +++ b/docs/howto/add_new_parser.md @@ -1,3 +1,8 @@ +--- +title: Add a new parser +description: How to implement a custom driver, register it with Parxy at runtime, and make it available alongside the built-in parsers. +--- + # How to Add a New Parser to Parxy Parxy is designed to be **extensible** — you can integrate new parsing backends (drivers) or create custom variants of existing ones directly from your Python code, without modifying the core library. diff --git a/docs/howto/batch_processing.md b/docs/howto/batch_processing.md index a3d314b..18ab8ac 100644 --- a/docs/howto/batch_processing.md +++ b/docs/howto/batch_processing.md @@ -1,3 +1,8 @@ +--- +title: Process multiple documents in parallel +description: How to use Parxy's batch API to parse many documents concurrently, control worker count, handle per-file errors, and collect structured results. 
+--- + # How to Process Multiple Documents in Parallel Parxy provides a `batch` method for processing multiple documents in parallel, with support for per-file configuration. This is useful when you need to parse many documents efficiently or when different documents require different parsing strategies. diff --git a/docs/howto/configure_landingai.md b/docs/howto/configure_landingai.md index 42d0ab5..5de245b 100644 --- a/docs/howto/configure_landingai.md +++ b/docs/howto/configure_landingai.md @@ -1,3 +1,8 @@ +--- +title: Configure LandingAI ADE +description: How to set up the LandingAI Agentic Document Extraction driver, configure the API key and environment, and override parsing options per document. +--- + # How to Configure LandingAI ADE This guide shows you how to configure the LandingAI ADE (Agentic Document Extraction) driver for document processing, including setting default options and overriding them on a per-document basis. diff --git a/docs/howto/configure_llamaparse.md b/docs/howto/configure_llamaparse.md index cb613a3..51c87d2 100644 --- a/docs/howto/configure_llamaparse.md +++ b/docs/howto/configure_llamaparse.md @@ -1,3 +1,8 @@ +--- +title: Configure LlamaParse +description: How to set up the LlamaParse driver, configure the API key and parsing mode, and override options on a per-document basis for better extraction results. +--- + # How to Configure LlamaParse This guide shows you how to configure the LlamaParse driver for document processing, including setting default options and overriding them on a per-document basis. diff --git a/docs/howto/configure_llmwhisperer.md b/docs/howto/configure_llmwhisperer.md index 6b40b03..b28547a 100644 --- a/docs/howto/configure_llmwhisperer.md +++ b/docs/howto/configure_llmwhisperer.md @@ -1,3 +1,8 @@ +--- +title: Configure LLMWhisperer +description: How to set up the LLMWhisperer driver, configure the API key and parsing mode, and override options on a per-document basis for better extraction results. 
+--- + # How to Configure LLMWhisperer This guide shows you how to configure the LLMWhisperer driver for document processing, including setting default options and overriding them on a per-document basis. diff --git a/docs/howto/configure_observability.md b/docs/howto/configure_observability.md index 9e0739a..7a55142 100644 --- a/docs/howto/configure_observability.md +++ b/docs/howto/configure_observability.md @@ -1,3 +1,8 @@ +--- +title: Configure observability +description: How to enable OpenTelemetry tracing and metrics in Parxy, connect to an OTLP collector, and monitor document processing operations in your observability stack. +--- + # How to Configure Observability This guide shows you how to enable and configure OpenTelemetry-based observability in Parxy to monitor document processing operations. diff --git a/docs/howto/configure_pdfact.md b/docs/howto/configure_pdfact.md index 8330c15..be62c44 100644 --- a/docs/howto/configure_pdfact.md +++ b/docs/howto/configure_pdfact.md @@ -1,3 +1,8 @@ +--- +title: Configure PdfAct +description: How to set up the PdfAct driver against a self-hosted or remote service instance, configure the base URL and API key, and run PdfAct locally with Docker. +--- + # How to Configure PdfAct This guide shows you how to configure the PdfAct driver for document processing using a self-hosted or remote PdfAct service. diff --git a/docs/howto/configure_pymupdf.md b/docs/howto/configure_pymupdf.md index 1a86ff1..11db1b1 100644 --- a/docs/howto/configure_pymupdf.md +++ b/docs/howto/configure_pymupdf.md @@ -1,3 +1,8 @@ +--- +title: Configure PyMuPDF +description: How to use Parxy's default PyMuPDF driver, choose the right extraction level for your use case, and adjust the output when working with local PDF files. +--- + # How to Configure PyMuPDF This guide shows you how to use the PyMuPDF driver for document processing. PyMuPDF is the default driver in Parxy and requires no external services or API keys. 
diff --git a/docs/howto/configure_unstructured_local.md b/docs/howto/configure_unstructured_local.md index bc50a19..4a454c3 100644 --- a/docs/howto/configure_unstructured_local.md +++ b/docs/howto/configure_unstructured_local.md @@ -1,3 +1,8 @@ +--- +title: Configure Unstructured library +description: How to install and configure the Unstructured local driver for offline PDF parsing without external APIs, including extraction levels and output options. +--- + # How to Configure Unstructured Local This guide shows you how to configure the Unstructured Local driver for document processing. This driver uses the open-source `unstructured` library for local document parsing without requiring external services. diff --git a/docs/howto/pdf_manipulation.md b/docs/howto/merge_and_split_pdfs.md similarity index 97% rename from docs/howto/pdf_manipulation.md rename to docs/howto/merge_and_split_pdfs.md index ba26d45..88599a8 100644 --- a/docs/howto/pdf_manipulation.md +++ b/docs/howto/merge_and_split_pdfs.md @@ -1,3 +1,8 @@ +--- +title: Merge and split PDFs +description: How to merge multiple PDFs and split a single PDF into pages or ranges from the command line using parxy pdf:merge and parxy pdf:split. +--- + # How to Manipulate PDFs with Parxy Parxy provides powerful **PDF manipulation commands** that allow you to merge multiple PDF files into one or split a single PDF into multiple files — all from the command line. diff --git a/docs/howto/pdf_attachments.md b/docs/howto/pdf_attachments.md index e0a62e7..5e2a785 100644 --- a/docs/howto/pdf_attachments.md +++ b/docs/howto/pdf_attachments.md @@ -1,3 +1,8 @@ +--- +title: Work with PDF attachments +description: How to add, list, extract, and remove file attachments embedded in a PDF using Parxy's CLI commands, with examples for common attachment workflows. 
+--- + # How to Work with PDF Attachments Parxy provides comprehensive **PDF attachment commands** that allow you to add, list, extract, and remove file attachments in PDF documents — all from the command line. @@ -473,6 +478,6 @@ parxy attach:remove --help ## Related Documentation -- [PDF Manipulation](./pdf_manipulation.md) - Learn about merging and splitting PDFs +- [Merge and split PDFs](./merge_and_split_pdfs.md) - [Getting Started Tutorial](../tutorials/getting_started.md) - General introduction to Parxy CLI - [Using the CLI](../tutorials/using_cli.md) - Basic CLI usage patterns diff --git a/docs/installation_and_setup.md b/docs/installation_and_setup.md new file mode 100644 index 0000000..4d243e3 --- /dev/null +++ b/docs/installation_and_setup.md @@ -0,0 +1,94 @@ +--- +title: Installation and setup +description: Quick instructions to install Parxy via pip, uv, or uvx and to configure it via environment variables. +weight: 3 +--- + +# Installation and Setup + +## Requirements + +- Python **3.12** or **3.13** + +## Installation + +Parxy can be installed via pip or uv, or run without installation using uvx. 
+ +### Via pip + +```bash +pip install parxy # Basic installation (PyMuPDF and PdfAct drivers) +pip install parxy[all] # All drivers included +``` + +### Via uv + +```bash +uv add parxy # Basic installation +uv add parxy --extra all # All drivers included +``` + +### Without installation (uvx) + +[`uvx`](https://docs.astral.sh/uv/guides/tools/) runs Parxy in an isolated environment without a permanent install: + +```bash +# Basic drivers only +uvx parxy --help +``` + +```bash +# All drivers included +uvx --from 'parxy[all]' parxy --help +``` + +### Installing specific drivers + +If you only need a particular driver, install its extra instead of `all`: + +```bash +pip install parxy[llama] # LlamaParse +pip install parxy[llmwhisperer] # LLMWhisperer +pip install parxy[landingai] # Landing AI +pip install parxy[unstructured_local] # Unstructured library +``` + +See [Supported Services](./supported_services.md) for the full list of drivers and their extras. + +## Environment variables and API keys + +Some drivers require an API key. Parxy reads these from environment variables, which can be set in a `.env` file in your project root. + +To generate a template `.env` file: + +```bash +parxy env +``` + +Then fill in the keys for the services you use: + +```bash +# LlamaParse +PARXY_LLAMAPARSE_API_KEY= + +# Unstract LLMWhisperer +PARXY_LLMWHISPERER_API_KEY= +``` + +### Core environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_DEFAULT_DRIVER` | `pymupdf` | Driver used when none is specified | +| `PARXY_LOGGING_LEVEL` | `INFO` | Logging verbosity | +| `PARXY_LOGGING_FILE` | *(none)* | Path to write log output | + +### Self-hosted services + +Some drivers (such as PdfAct) can be run locally via Docker. To generate a Docker Compose configuration: + +```bash +parxy docker +``` + +This produces a `compose.yaml` you can start with `docker compose up`. 
diff --git a/docs/introduction.md b/docs/introduction.md new file mode 100644 index 0000000..0eb7520 --- /dev/null +++ b/docs/introduction.md @@ -0,0 +1,86 @@ +--- +title: Introduction +description: What Parxy is, how it works, and a quick look at the CLI commands and Python library API before you dive in. +weight: 1 +--- + +# Introduction + +Parxy is a document processing gateway with a unified interface for multiple document parsing services. Its common document model lets you swap providers without rewriting your application. + +- Single API across different providers (local libraries and remote APIs) +- Supports PyMuPDF, Unstructured, LlamaParse, LLMWhisperer, PdfAct, and more +- Custom drivers can be registered directly in your application code +- Execution tracing to help debug parsing issues + +## Available as CLI and library + +Parxy works as a command line tool or as a Python library. + +The quickest way to try it out is via [`uvx`](https://docs.astral.sh/uv/concepts/tools/#execution-vs-installation): + +```bash +uvx parxy --help +``` + +To include all supported drivers: + +```bash +uvx --from 'parxy[all]' parxy --help +``` + +See [Installation and Setup](./installation_and_setup.md) for the full installation options. 
+ +## CLI overview + +Once installed, `parxy` provides the following commands: + +| Command | Description | +|---------|-------------| +| `parxy parse` | Extract text content from documents with customizable granularity levels and output formats | +| `parxy markdown` | Convert documents into Markdown format, with optional combining of multiple documents | +| `parxy drivers` | List available document processing drivers | +| `parxy env` | Create a configuration file with default settings | +| `parxy docker` | Generate a Docker Compose configuration for self-hosted services | +| `parxy pdf:merge` | Merge multiple PDF files into one, with support for selecting specific page ranges | +| `parxy pdf:split` | Split a PDF file into individual pages | + +```bash +# Parse a PDF to markdown +parxy parse --mode markdown document.pdf + +# Launch interactive TUI for parser comparison +parxy tui ./documents + +# Merge multiple PDFs with page ranges +parxy pdf:merge cover.pdf doc1.pdf[1:10] doc2.pdf -o merged.pdf +``` + +Run `parxy --help` for the full list of options. + +## Library overview + +Parxy can also be used directly in Python. After installation, import the `Parxy` facade: + +```python +from parxy_core.facade import Parxy + +# Parse a document using the default driver +doc = Parxy.parse('path/to/document.pdf') + +print(f"Pages: {len(doc.pages)}") +print(f"Title: {doc.metadata.title}") + +# Use a specific driver +doc = Parxy.driver(Parxy.LLAMAPARSE).parse('path/to/document.pdf') +``` + +Every driver returns the same `Document` structure, so you can switch providers without changing how you process the output. + +For a step-by-step walkthrough, see the [Getting Started tutorial](./tutorials/getting_started.md). 
+ +## Next steps + +- [Installation and first run](./installation_and_setup.md) +- [Available drivers](./supported_services.md) and their installation +- [Parse your first document](./tutorials/getting_started.md) diff --git a/docs/reference/cli.md b/docs/reference/cli.md new file mode 100644 index 0000000..59922bc --- /dev/null +++ b/docs/reference/cli.md @@ -0,0 +1,269 @@ +--- +title: CLI reference +description: Command line reference with all parxy commands, including arguments, options, types, and defaults. Prefer to run parxy --help and parxy <command> --help if you have access to the terminal. +--- + + + + +# CLI reference + +## `parxy agents` + +Set up AI agent configuration files for Parxy projects. + +Creates or updates an AGENTS.md file with Parxy usage documentation. +If AGENTS.md exists, the Parxy section (marked with tags) is +added or updated while preserving other content. + +Optionally creates Claude Code skill files for common operations. + +``` +parxy agents [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `path` | - | Output directory for agent files. Defaults to current directory. | +| `--overwrite` | `-f` | `flag` | `false` | Overwrite existing Parxy section without prompting. | + +## `parxy attach` + +Extract an attached file from a PDF + +``` +parxy attach [OPTIONS] INPUT_FILE NAME +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file containing the attachment | +| `NAME` | Yes | Name of attached file to extract | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, saves to current directory with original name. 
| +| `--stdout` | - | `flag` | `false` | Output content to stdout (text files only) | + +## `parxy attach:add` + +Add files as attachments to a PDF + +``` +parxy attach:add [OPTIONS] INPUT_FILE FILES... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to add attachments to | +| `FILES` | Yes | One or more files to attach | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_with_attachments.pdf | +| `--description` | `-d` | `text` | - | Description for attached file(s). Matched by position to files. | +| `--name` | `-n` | `text` | - | Custom name(s) for attached file(s). Matched by position to files. | +| `--overwrite` | - | `flag` | `false` | Overwrite existing attachments with same name | + +## `parxy attach:list` + +List attached files in a PDF + +``` +parxy attach:list [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--verbose` | `-v` | `flag` | `false` | Show detailed information | + +## `parxy attach:remove` + +Remove attached files from a PDF + +``` +parxy attach:remove [OPTIONS] INPUT_FILE NAMES... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to process | +| `NAMES` | No | Names of attachments to remove | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. 
If not specified, creates {input}_no_attachments.pdf | +| `--all` | - | `flag` | `false` | Remove all attached files | + +## `parxy docker` + +Create a Docker Compose file to run self-hostable parsers (experimental). + +``` +parxy docker +``` + +## `parxy drivers` + +List supported drivers. + +``` +parxy drivers +``` + +## `parxy env` + +Create an environment file with Parxy configuration. + +``` +parxy env +``` + +## `parxy markdown` + +Parse documents to Markdown. + +Accepts PDF files (parsed on-the-fly) or pre-parsed JSON result files +(loaded directly from the Document model without re-parsing). + +``` +parxy markdown [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | +| `--level` | `-l` | `page` | `block` | `line` | `span` | `character` | `block` | Extraction level | +| `--output` | `-o` | `text` | - | Directory to save markdown files. If not specified, files are saved next to the source files. | +| `--inline` | `-i` | `flag` | `false` | Output markdown to stdout with file name as YAML frontmatter. Only valid with a single file. | +| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | +| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | +| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | +| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. 
| +| `--page-separators` | - | `flag` | `false` | Insert HTML comments before each page's content. | + +## `parxy parse` + +Parse documents using one or more drivers. + +This command processes PDF documents and extracts their content in various formats. +You can specify individual files or entire folders to process. + +``` +parxy parse [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | +| `--level` | `-l` | `page` | `block` | `line` | `span` | `character` | `block` | Extraction level | +| `--mode` | `-m` | `json` | `plain` | `markdown` | `json` | Output mode: json (JSON serialization), plain (plain text), or markdown (markdown format) | +| `--output` | `-o` | `text` | - | Directory to save output files. If not specified, files will be saved in the same directory as the source files. | +| `--show` | `-s` | `flag` | `false` | Show document content in console in addition to saving to files | +| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | +| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | +| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | +| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | + +## `parxy pdf:merge` + +Merge multiple PDF files into a single PDF + +``` +parxy pdf:merge [OPTIONS] INPUTS... 
+``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more PDF files or folders to merge. Files support page ranges in square brackets (e.g., file.pdf[1:3]). Folders are processed non-recursively. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path for the merged PDF. If not specified, you will be prompted. | + +## `parxy pdf:split` + +Split a PDF file into individual pages + +``` +parxy pdf:split [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to split | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output path. Without --combine: output directory for split files (default: folder next to input). With --combine: output file path (default: {stem}_pages_{from}-{to}.pdf next to input). | +| `--prefix` | `-p` | `text` | - | Prefix for output filenames. If not specified, uses the input filename. | +| `--pages` | - | `text` | - | Page range to extract (1-based). Examples: "1" (single page), "1:3" (pages 1-3), ":3" (up to page 3), "3:" (from page 3). If not specified, all pages are extracted. | +| `--combine` | - | `flag` | `false` | Combine extracted pages into a single PDF instead of one file per page. | + +## `parxy tui` + +Launch the Parxy TUI for interactive parser comparison + +``` +parxy tui WORKSPACE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `WORKSPACE` | No | Path to the workspace folder (optional — can be selected inside the TUI) | + +## `parxy version` + +Print Parxy version information. 
+ +``` +parxy version +``` diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md new file mode 100644 index 0000000..2c92d7d --- /dev/null +++ b/docs/reference/configuration.md @@ -0,0 +1,109 @@ +--- +title: Configuration reference +description: Configuration options for Parxy and the drivers. Settings are read from the environment or a .env file. Run parxy env to generate a starter .env with some defaults. +--- + + + + +# Configuration reference + +All settings are read from environment variables or a `.env` file in your project root. + +Run `parxy env` to generate a template `.env` with the usual configuration options. + +## Core settings + +Prefix: `PARXY_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_DEFAULT_DRIVER` | `pymupdf` | The default driver to use in case nothing is specified. | +| `PARXY_LOGGING_LEVEL` | `INFO` | The logging level. | +| `PARXY_LOGGING_FILE` | - | The log file path. | +| `PARXY_THEME` | - | The console theme to use. One of: `light`, `dark`. | + +## Observability / tracing + +Prefix: `PARXY_TRACING_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_TRACING_ENABLE` | `false` | Enable sending traces to the observability service. | +| `PARXY_TRACING_API_KEY` | *(secret)* | The authentication key (used for both traces and metrics unless overridden). | +| `PARXY_TRACING_ENDPOINT` | `http://localhost:4318/` | The base URL of the OpenTelemetry collector endpoint. | +| `PARXY_TRACING_ENABLE_METRICS` | `false` | Enable sending metrics to the telemetry service. | +| `PARXY_TRACING_TRACES_ENDPOINT` | *(computed)* | The endpoint for the traces exporter. | +| `PARXY_TRACING_METRICS_ENDPOINT` | *(computed)* | The endpoint for the metrics exporter. | +| `PARXY_TRACING_TIMEOUT_SECONDS` | `10` | The client timeout when sending traces. | +| `PARXY_TRACING_USE_COMPRESSION` | `true` | The client should compress traces before sending. 
| +| `PARXY_TRACING_VERBOSE` | `true` | Log when traces are sent. | +| `PARXY_TRACING_AUTHENTICATION_HEADER` | `Authorization` | The header in which the api key needs to be included for authentication purposes. | + +## PdfAct + +Prefix: `PARXY_PDFACT_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_PDFACT_BASE_URL` | `http://localhost:4567/` | The base URL of the PdfAct API. | +| `PARXY_PDFACT_API_KEY` | *(secret)* | The authentication key. | + +## LlamaParse + +Prefix: `PARXY_LLAMAPARSE_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LLAMAPARSE_BASE_URL` | `https://api.cloud.eu.llamaindex.ai` | The base URL of the Llama Parsing API. | +| `PARXY_LLAMAPARSE_API_KEY` | *(secret)* | The authentication key | +| `PARXY_LLAMAPARSE_ORGANIZATION_ID` | - | The organization ID for the LlamaParse API. | +| `PARXY_LLAMAPARSE_PROJECT_ID` | - | The project ID for the LlamaParse API. | +| `PARXY_LLAMAPARSE_NUM_WORKERS` | `4` | The number of workers to use sending API requests for parsing. | +| `PARXY_LLAMAPARSE_SHOW_PROGRESS` | `false` | Show progress when parsing multiple files. | +| `PARXY_LLAMAPARSE_VERBOSE` | `false` | Whether to print the progress of the parsing. | +| `PARXY_LLAMAPARSE_PARSE_MODE` | `parse_page_with_llm` | Parsing mode to use. | +| `PARXY_LLAMAPARSE_PRESET` | - | Parser preset. | +| `PARXY_LLAMAPARSE_MODEL` | - | Document model name for parse_with_agent mode. | +| `PARXY_LLAMAPARSE_PREMIUM_MODE` | `false` | Use best parser mode if set to True. | +| `PARXY_LLAMAPARSE_FAST_MODE` | `false` | Use faster mode that skips OCR of images and table/heading reconstruction. | +| `PARXY_LLAMAPARSE_DISABLE_OCR` | `false` | Disable the OCR on the document. | +| `PARXY_LLAMAPARSE_DISABLE_IMAGE_EXTRACTION` | `false` | If set to true, the parser will not extract images from the document. | +| `PARXY_LLAMAPARSE_HIGH_RES_OCR` | `false` | Use high resolution OCR to extract text from images. 
| +| `PARXY_LLAMAPARSE_EXTRACT_LAYOUT` | `false` | Extract layout information from the document. | +| `PARXY_LLAMAPARSE_SKIP_DIAGONAL_TEXT` | `false` | Skip diagonal text (when text rotation in degrees modulo 90 is not 0). | +| `PARXY_LLAMAPARSE_LANGUAGE` | `en` | Language of the text to parse. | +| `PARXY_LLAMAPARSE_DO_NOT_UNROLL_COLUMNS` | `false` | Keep columns in text according to document layout. | +| `PARXY_LLAMAPARSE_TARGET_PAGES` | - | Target pages to extract. | +| `PARXY_LLAMAPARSE_MAX_PAGES` | - | Maximum number of pages to extract. | +| `PARXY_LLAMAPARSE_CONTINUOUS_MODE` | `false` | Parse documents continuously for better results on tables spanning multiple pages. | +| `PARXY_LLAMAPARSE_AUTO_MODE` | `false` | Automatically select best mode based on page content. | +| `PARXY_LLAMAPARSE_DO_NOT_CACHE` | `true` | If set to true, the document will not be cached. | + +## LLMWhisperer + +Prefix: `PARXY_LLMWHISPERER_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LLMWHISPERER_BASE_URL` | `https://llmwhisperer-api.eu-west.unstract.com/api/v2` | The base URL of the LlmWhisperer API v2. | +| `PARXY_LLMWHISPERER_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LLMWHISPERER_LOGGING_LEVEL` | `INFO` | The logging level for the client. | +| `PARXY_LLMWHISPERER_MODE` | `form` | Default parsing mode. | + +## Landing AI + +Prefix: `PARXY_LANDINGAI_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LANDINGAI_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LANDINGAI_ENVIRONMENT` | `eu` | The environment to use. One of: `production`, `eu`. | +| `PARXY_LANDINGAI_BASE_URL` | - | The base URL of the Landing AI ADE API. 
| + +## Unstructured library + +Prefix: `PARXY_UNSTRUCTURED_LOCAL_` + +| Variable | Default | Description | +|----------|---------|-------------| diff --git a/docs/supported_services.md b/docs/supported_services.md new file mode 100644 index 0000000..6bc6944 --- /dev/null +++ b/docs/supported_services.md @@ -0,0 +1,54 @@ +--- +title: Supported services +description: All document processing services and libraries supported by Parxy, their stability status, required extras, and how to register a custom driver. +weight: 2 +--- + +# Supported Services + +Parxy supports the following document processing services and libraries. The **Extra** column shows the optional dependency group to install for each driver. + +| Service or Library | Support status | Extra | Local file | Remote file | +|--------------------|----------------|-------|------------|-------------| +| [**PyMuPDF**](https://pymupdf.readthedocs.io/en/latest/) | Live | *(included)* | ✅ | ✅ | +| [**PdfAct**](https://github.com/data-house/pdfact) | Live | *(included)* | ✅ | ✅ | +| [**Unstructured** library](https://docs.unstructured.io/open-source/introduction/overview) | Preview | `unstructured_local` | ✅ | ✅ | +| [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | Preview | `landingai` | ✅ | ✅ | +| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ | +| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ | + + +Status meanings: **Live** = stable; **Preview** = functional but the API may change. + +## Adding a custom driver (Live Extension) + +You can register a new driver directly in your application code — no fork required. + +**1. 
Create a class that inherits from `Driver`** + +```python +from parxy_core.drivers import Driver +from parxy_core.models import Document + +class CustomDriverExample(Driver): + """Example custom driver.""" + + def _handle(self, file, level="page") -> Document: + return Document(pages=[]) +``` + +**2. Register it with `Parxy.extend()`** + +```python +from parxy_core.facade import Parxy + +Parxy.extend(name='my_parser', callback=lambda: CustomDriverExample()) +``` + +**3. Use it** + +```python +Parxy.driver('my_parser').parse('path/to/document.pdf') +``` + +For a full guide on building and publishing a driver, see [How to Add a New Parser to Parxy](./howto/add_new_parser.md). diff --git a/docs/tutorials/agentic_usage.md b/docs/tutorials/agentic_usage.md index 65a9659..830dd83 100644 --- a/docs/tutorials/agentic_usage.md +++ b/docs/tutorials/agentic_usage.md @@ -1,3 +1,8 @@ +--- +title: Agentic usage +description: How to use Parxy alongside AI coding assistants like Claude Code, GitHub Copilot, and Cursor, including setup and common prompting patterns. +--- + # Agentic Usage You can use Parxy with AI coding assistants such as Claude Code, GitHub Copilot, Cursor, and other AI-powered development tools. diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md index d63342b..6d48a19 100644 --- a/docs/tutorials/getting_started.md +++ b/docs/tutorials/getting_started.md @@ -1,6 +1,11 @@ +--- +title: Getting started +description: Parse your first PDF with Parxy, explore the unified Document model, and learn how to extract plain text and Markdown from any supported driver. +--- + # Getting Started with Parxy -Welcome to your first experience with **Parxy** — a unified Python interface for document parsing. +**Parxy** is a unified Python interface for document parsing. 
This tutorial will guide you step-by-step through: @@ -151,11 +156,11 @@ In this tutorial you: You're now ready to try more advanced use cases, such as: -* [Using Parxy from the command line](using_cli.md) +* [Using Parxy from the command line](./using_cli.md) * [Processing multiple documents in parallel](../howto/batch_processing.md) -* Comparing different parsers on the same document * [Extending Parxy with a custom driver](../howto/add_new_parser.md) * [Monitoring document processing with OpenTelemetry](../howto/configure_observability.md) +* Comparing different parsers on the same document > [!TIP] diff --git a/docs/tutorials/merge_pdf.md b/docs/tutorials/merge_pdf.md new file mode 100644 index 0000000..03fffa1 --- /dev/null +++ b/docs/tutorials/merge_pdf.md @@ -0,0 +1,121 @@ +--- +title: Merge PDFs +description: How to combine multiple PDF files into one using Parxy, with optional page ranges and post-merge optimization. +--- + +# Merge PDFs + +This tutorial covers how to combine multiple PDF files into a single document using Parxy's Python API, including page selection, file size optimization, and embedding attachments. + +## What You'll Learn + +By the end of this tutorial, you'll be able to: + +- Merge multiple PDFs into a single file +- Select specific page ranges from each input file +- Reduce the file size of the merged output +- Embed data files as attachments in the final PDF + +## Merging PDFs + +The `Parxy.pdf.merge()` method combines multiple PDF files in the order they are provided. + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy + +# Merge two complete PDFs +Parxy.pdf.merge( + inputs=[ + (Path("chapter1.pdf"), None, None), # All pages + (Path("chapter2.pdf"), None, None), # All pages + ], + output=Path("book.pdf") +) +``` + +Each entry in `inputs` is a tuple of `(path, from_page, to_page)`. Pass `None` as `from_page` to start at the first page, or as `to_page` to continue through the last page. 
+ +### Selecting page ranges + +Pages use 0-based indexing. To include only a subset of pages from a file: + +```python +Parxy.pdf.merge( + inputs=[ + (Path("intro.pdf"), 0, 0), # Only the first page + (Path("content.pdf"), 0, 9), # Pages 1–10 + (Path("appendix.pdf"), 4, None), # From page 5 to the end + ], + output=Path("selected.pdf") +) +``` + +## Optimizing the merged output + +After merging, you can reduce file size with `Parxy.pdf.optimize()`: + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy + +result = Parxy.pdf.optimize( + input_path=Path("merged.pdf"), + output_path=Path("merged_optimized.pdf") +) + +print(f"Original: {result['original_size']:,} bytes") +print(f"Optimized: {result['optimized_size']:,} bytes") +print(f"Reduction: {result['reduction_percent']:.1f}%") +``` + +Fine-tune the compression settings as needed: + +```python +result = Parxy.pdf.optimize( + input_path=Path("merged.pdf"), + output_path=Path("merged_web.pdf"), + scrub_metadata=True, # Remove metadata and existing attachments + subset_fonts=True, # Keep only the font glyphs actually used + compress_images=True, # Enable image compression + dpi_threshold=150, # Only process images above 150 DPI + dpi_target=72, # Downsample to 72 DPI + image_quality=60, # JPEG quality (0–100) + convert_to_grayscale=True # Convert to grayscale +) +``` + +> Set `scrub_metadata=False` if the PDF already contains attachments you want to keep, as `scrub_metadata=True` removes any existing embedded files. 
+ +## Error handling + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy +from parxy_core.services.pdf_service import PdfService + +# Missing input file +try: + Parxy.pdf.merge([(Path("missing.pdf"), None, None)], Path("out.pdf")) +except FileNotFoundError as e: + print(f"File not found: {e}") + +# Invalid optimization parameter +try: + Parxy.pdf.optimize(Path("doc.pdf"), Path("out.pdf"), image_quality=150) +except ValueError as e: + print(f"Invalid parameter: {e}") + +# Adding an attachment that already exists +with PdfService(Path("document.pdf")) as pdf: + pdf.add_attachment(Path("data.csv")) + try: + pdf.add_attachment(Path("data.csv")) # Duplicate name + except ValueError as e: + print(f"Attachment conflict: {e}") +``` + +## Next steps + +- [Split PDFs](./split_pdf.md) — split a document into pages or page ranges +- [CLI reference: pdf:merge](../reference/cli.md#parxy-pdfmerge) — the same operations from the command line diff --git a/docs/tutorials/pdf_manipulation.md b/docs/tutorials/pdf_manipulation.md deleted file mode 100644 index afa8604..0000000 --- a/docs/tutorials/pdf_manipulation.md +++ /dev/null @@ -1,398 +0,0 @@ -# PDF Manipulation with Parxy - -This tutorial covers how to manipulate PDF files programmatically using Parxy's Python API. You'll learn to merge, split, optimize PDFs, and manage file attachments. 
- -## What You'll Learn - -By the end of this tutorial, you'll be able to: - -- Merge multiple PDFs into a single file -- Split a PDF into individual pages -- Optimize PDF file size with compression -- Add, list, extract, and remove PDF attachments -- Choose between the facade API and context manager patterns - -## Two Ways to Manipulate PDFs - -Parxy provides two complementary approaches for PDF manipulation: - -| Approach | Best For | Pattern | -|----------|----------|---------| -| `Parxy.pdf` facade | Quick, one-off operations (merge, split, optimize) | Static methods | -| `PdfService` context manager | Working with a single PDF (attachments, modifications) | `with` statement | - -## Part 1: Using the Parxy.pdf Facade - -The `Parxy.pdf` namespace provides static methods for common PDF operations that don't require keeping a file open. - -### Merging PDFs - -Combine multiple PDF files into one: - -```python -from pathlib import Path -from parxy_core.facade.parxy import Parxy - -# Merge two complete PDFs -Parxy.pdf.merge( - inputs=[ - (Path("chapter1.pdf"), None, None), # All pages - (Path("chapter2.pdf"), None, None), # All pages - ], - output=Path("book.pdf") -) -``` - -You can also select specific page ranges (0-based indexing): - -```python -# Merge specific pages from different PDFs -Parxy.pdf.merge( - inputs=[ - (Path("intro.pdf"), 0, 0), # Only first page - (Path("content.pdf"), 0, 9), # Pages 1-10 - (Path("appendix.pdf"), 4, None), # From page 5 to end - ], - output=Path("selected.pdf") -) -``` - -### Splitting PDFs - -Split a PDF into individual page files: - -```python -from pathlib import Path -from parxy_core.facade.parxy import Parxy - -# Split into individual pages -pages = Parxy.pdf.split( - input_path=Path("document.pdf"), - output_dir=Path("./pages"), - prefix="doc" -) - -# Returns list of created files -for page_path in pages: - print(f"Created: {page_path}") -# Output: -# Created: pages/doc_page_1.pdf -# Created: pages/doc_page_2.pdf -# ... 
-``` - -You can limit splitting to a page range using 0-based `from_page` / `to_page` indices: - -```python -# Split only pages 2–5 (0-based: indices 1–4) -pages = Parxy.pdf.split( - input_path=Path("document.pdf"), - output_dir=Path("./pages"), - prefix="doc", - from_page=1, - to_page=4, -) -# Creates: doc_page_2.pdf, doc_page_3.pdf, doc_page_4.pdf, doc_page_5.pdf -``` - -### Extracting Pages into a Single PDF - -Use `extract_pages` to pull a page range from a PDF into a new single-file PDF without splitting each page individually: - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -# Extract pages 3–7 (0-based: indices 2–6) -PdfService.extract_pages( - input_path=Path("report.pdf"), - output_path=Path("summary.pdf"), - from_page=2, - to_page=6, -) -``` - -Omit `from_page` / `to_page` to copy all pages: - -```python -# Equivalent to a copy -PdfService.extract_pages(Path("original.pdf"), Path("copy.pdf")) -``` - -### Optimizing PDFs - -Reduce PDF file size using compression techniques: - -```python -from pathlib import Path -from parxy_core.facade.parxy import Parxy - -# Basic optimization with defaults -result = Parxy.pdf.optimize( - input_path=Path("large_scan.pdf"), - output_path=Path("optimized.pdf") -) - -print(f"Original: {result['original_size']:,} bytes") -print(f"Optimized: {result['optimized_size']:,} bytes") -print(f"Reduction: {result['reduction_percent']:.1f}%") -``` - -Fine-tune optimization settings: - -```python -# Aggressive optimization for web delivery -result = Parxy.pdf.optimize( - input_path=Path("presentation.pdf"), - output_path=Path("web_ready.pdf"), - scrub_metadata=True, # Remove metadata and attachments - subset_fonts=True, # Keep only used font glyphs - compress_images=True, # Compress images - dpi_threshold=150, # Process images above 150 DPI - dpi_target=72, # Downsample to 72 DPI - image_quality=60, # JPEG quality (0-100) - convert_to_grayscale=True # Convert to grayscale -) -``` - -## 
Part 2: Using PdfService with Context Manager - -For operations that require working with a PDF document (especially attachments), use the `PdfService` class with Python's context manager pattern. - -### Opening a PDF - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -# Open PDF within context manager -with PdfService(Path("document.pdf")) as pdf: - # Work with the PDF here - attachments = pdf.list_attachments() - print(f"Found {len(attachments)} attachments") - -# PDF is automatically closed when exiting the block -``` - -> **Important**: Always use `PdfService` within a `with` statement. Operations outside the context manager will raise `RuntimeError`. - -### Listing Attachments - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -with PdfService(Path("report.pdf")) as pdf: - attachments = pdf.list_attachments() - - if not attachments: - print("No attachments found") - else: - for name in attachments: - info = pdf.get_attachment_info(name) - print(f"- {name}") - print(f" Size: {info['size']:,} bytes") - print(f" Description: {info.get('description', 'N/A')}") -``` - -### Adding Attachments - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -with PdfService(Path("report.pdf")) as pdf: - # Add a file with default name (uses filename) - pdf.add_attachment(Path("data.csv")) - - # Add with custom name and description - pdf.add_attachment( - file_path=Path("analysis.xlsx"), - name="quarterly_analysis.xlsx", - desc="Q4 2024 Financial Analysis" - ) - - # Save the modified PDF - pdf.save(Path("report_with_attachments.pdf")) -``` - -### Extracting Attachments - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -with PdfService(Path("package.pdf")) as pdf: - # Extract a specific attachment - content = pdf.extract_attachment("data.json") - - # Save to file - output_path = 
Path("extracted_data.json") - output_path.write_bytes(content) - print(f"Extracted to {output_path}") -``` - -Extract all attachments: - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -output_dir = Path("./extracted") -output_dir.mkdir(exist_ok=True) - -with PdfService(Path("archive.pdf")) as pdf: - for name in pdf.list_attachments(): - content = pdf.extract_attachment(name) - (output_dir / name).write_bytes(content) - print(f"Extracted: {name}") -``` - -### Removing Attachments - -```python -from pathlib import Path -from parxy_core.services.pdf_service import PdfService - -with PdfService(Path("document.pdf")) as pdf: - # Remove a specific attachment - pdf.remove_attachment("old_data.csv") - - # Save changes - pdf.save(Path("document_cleaned.pdf")) -``` - -## Complete Example: Document Processing Pipeline - -Here's a practical example combining multiple operations: - -```python -from pathlib import Path -from parxy_core.facade.parxy import Parxy -from parxy_core.services.pdf_service import PdfService - - -def process_report(input_dir: Path, output_path: Path): - """Merge PDFs, attach source data, and optimize.""" - - # Step 1: Find all PDFs to merge - pdf_files = sorted(input_dir.glob("*.pdf")) - if not pdf_files: - raise ValueError(f"No PDFs found in {input_dir}") - - # Step 2: Merge all PDFs - temp_merged = output_path.parent / "temp_merged.pdf" - Parxy.pdf.merge( - inputs=[(pdf, None, None) for pdf in pdf_files], - output=temp_merged - ) - print(f"Merged {len(pdf_files)} files") - - # Step 3: Add attachments with context manager - with PdfService(temp_merged) as pdf: - # Attach any CSV files from the input directory - for csv_file in input_dir.glob("*.csv"): - pdf.add_attachment( - file_path=csv_file, - desc=f"Source data: {csv_file.name}" - ) - print(f"Attached: {csv_file.name}") - - # Save with attachments - temp_with_attachments = output_path.parent / "temp_attached.pdf" - pdf.save(temp_with_attachments) - - # 
Step 4: Optimize the final output - result = Parxy.pdf.optimize( - input_path=temp_with_attachments, - output_path=output_path, - scrub_metadata=False, # Keep our attachments! - compress_images=True - ) - - print(f"Final size: {result['optimized_size']:,} bytes") - print(f"Saved to: {output_path}") - - # Cleanup temp files - temp_merged.unlink() - temp_with_attachments.unlink() - - -# Usage -process_report( - input_dir=Path("./quarterly_reports"), - output_path=Path("./output/Q4_2024_combined.pdf") -) -``` - -## Error Handling - -Both APIs raise standard Python exceptions: - -```python -from pathlib import Path -from parxy_core.facade.parxy import Parxy -from parxy_core.services.pdf_service import PdfService - -# FileNotFoundError for missing files -try: - Parxy.pdf.split(Path("missing.pdf"), Path("./out"), "doc") -except FileNotFoundError as e: - print(f"File not found: {e}") - -# ValueError for invalid page ranges -try: - Parxy.pdf.split(Path("doc.pdf"), Path("./out"), "doc", from_page=100) -except ValueError as e: - print(f"Invalid page range: {e}") - -# ValueError for invalid parameters -try: - Parxy.pdf.optimize( - Path("doc.pdf"), - Path("out.pdf"), - image_quality=150 # Must be 0-100 - ) -except ValueError as e: - print(f"Invalid parameter: {e}") - -# KeyError for missing attachments -with PdfService(Path("document.pdf")) as pdf: - try: - pdf.extract_attachment("nonexistent.txt") - except KeyError as e: - print(f"Attachment not found: {e}") - -# RuntimeError for operations outside context manager -pdf = PdfService(Path("document.pdf")) -try: - pdf.list_attachments() # Not inside 'with' block! 
-except RuntimeError as e: - print(f"Context error: {e}") -``` - -## Summary - -In this tutorial you learned: - -- **`Parxy.pdf.merge()`** - Combine multiple PDFs with optional page ranges -- **`Parxy.pdf.split()`** - Split a PDF into individual page files, with optional page range -- **`PdfService.extract_pages()`** - Extract a page range into a single output PDF -- **`Parxy.pdf.optimize()`** - Reduce file size with compression options -- **`PdfService` context manager** - Work with attachments (add, list, extract, remove) - -### When to Use Each Approach - -| Use `Parxy.pdf` when... | Use `PdfService` when... | -|-------------------------|--------------------------| -| Merging multiple files | Adding/removing attachments | -| Splitting into pages | Extracting attachment content | -| Optimizing file size | Multiple operations on one file | -| One-shot operations | Need fine-grained control | -| Splitting a page range | Extracting a page range into one PDF (`extract_pages`) | - -## Next Steps - -- [PDF Manipulation from CLI](../howto/pdf_manipulation.md) - Command-line usage -- [Working with Attachments](working_with_attachments.md) - CLI attachment commands -- [Batch Processing](../howto/batch_processing.md) - Process multiple documents diff --git a/docs/tutorials/split_pdf.md b/docs/tutorials/split_pdf.md new file mode 100644 index 0000000..e596496 --- /dev/null +++ b/docs/tutorials/split_pdf.md @@ -0,0 +1,126 @@ +--- +title: Split PDFs +description: How to split a PDF into individual pages or page ranges, or extract a subset into a new file. +--- + +# Split PDFs + +This tutorial covers how to break a PDF apart using Parxy's Python API: splitting into individual pages, extracting a page range, splitting by chunks, and working with file attachments embedded in a PDF. 
+ +## What You'll Learn + +By the end of this tutorial, you'll be able to: + +- Split a PDF into one file per page +- Extract a contiguous page range into a single new PDF +- Split a PDF into fixed-size chunks + +## Splitting into individual pages + +`Parxy.pdf.split()` writes each page as a separate file inside an output directory: + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy + +pages = Parxy.pdf.split( + input_path=Path("document.pdf"), + output_dir=Path("./pages"), + prefix="doc" +) + +for page_path in pages: + print(f"Created: {page_path}") +# Created: pages/doc_page_1.pdf +# Created: pages/doc_page_2.pdf +# ... +``` + +### Splitting a specific page range + +Page indices are 0-based. To split only a subset of the document: + +```python +# Split pages 2–5 (0-based indices 1–4) +pages = Parxy.pdf.split( + input_path=Path("document.pdf"), + output_dir=Path("./pages"), + prefix="doc", + from_page=1, + to_page=4, +) +# Creates: doc_page_2.pdf, doc_page_3.pdf, doc_page_4.pdf, doc_page_5.pdf +``` + +### Splitting into fixed-size chunks + +To group pages into chunks of N rather than one file per page, use `PdfService.split_pdf_by_chunk()` directly: + +```python +from pathlib import Path +from parxy_core.services.pdf_service import PdfService + +output_dir = Path("./chunks") +output_dir.mkdir(exist_ok=True) + +chunks = PdfService.split_pdf_by_chunk( + input_path=Path("document.pdf"), + output_dir=output_dir, + prefix="chunk", + chunk_size=10, # 10 pages per file +) + +for chunk_path in chunks: + print(f"Created: {chunk_path}") +``` + +## Extracting pages into a single PDF + +When you want a contiguous page range as one file rather than individual pages, use `PdfService.extract_pages()`: + +```python +from pathlib import Path +from parxy_core.services.pdf_service import PdfService + +# Extract pages 3–7 (0-based indices 2–6) +PdfService.extract_pages( + input_path=Path("report.pdf"), + output_path=Path("summary.pdf"), + from_page=2, + 
to_page=6, +) +``` + +Omit both page arguments to copy the whole document: + +```python +PdfService.extract_pages(Path("original.pdf"), Path("copy.pdf")) +``` + +## Error handling + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy +from parxy_core.services.pdf_service import PdfService + +# Invalid page range +try: + Parxy.pdf.split(Path("doc.pdf"), Path("./out"), "doc", from_page=100) +except ValueError as e: + print(f"Invalid page range: {e}") + +# Missing attachment +with PdfService(Path("document.pdf")) as pdf: + try: + pdf.extract_attachment("nonexistent.txt") + except KeyError as e: + print(f"Attachment not found: {e}") + +# PdfService used outside a context manager +pdf = PdfService(Path("document.pdf")) +try: + pdf.list_attachments() +except RuntimeError as e: + print(f"Context error: {e}") +``` diff --git a/docs/tutorials/using_cli.md b/docs/tutorials/using_cli.md index 91acee8..aa14f57 100644 --- a/docs/tutorials/using_cli.md +++ b/docs/tutorials/using_cli.md @@ -300,7 +300,7 @@ parxy pdf:split document.pdf --pages 2:5 --combine -o extracted.pdf Page range formats (1-based): `3` · `2:5` · `:5` · `3:` -For more detailed examples and use cases, see the [PDF Manipulation How-to Guide](../howto/pdf_manipulation.md). +For more detailed examples and use cases, see the [Merge and split PDFs](../howto/merge_and_split_pdfs.md) guide. ## Managing Drivers diff --git a/docs/tutorials/working_with_attachments.md b/docs/tutorials/working_with_attachments.md index e88285e..6850d4f 100644 --- a/docs/tutorials/working_with_attachments.md +++ b/docs/tutorials/working_with_attachments.md @@ -1,3 +1,8 @@ +--- +title: Working with PDF attachments +description: Step-by-step walkthrough for adding, listing, extracting, and removing file attachments embedded in a PDF using Parxy's CLI attachment commands. 
+--- + # Tutorial: Working with PDF Attachments In this tutorial, you'll learn how to incorporate file attachments into PDF documents using Parxy's attachment commands. We'll walk through a real-world scenario where you need to bundle data files with a report. @@ -413,7 +418,7 @@ parxy attach:list document.pdf Now that you understand PDF attachments, you can: 1. **Explore other Parxy features:** - - [PDF Manipulation](../howto/pdf_manipulation.md) - Merge and split PDFs + - [Merge and split PDFs](../howto/merge_and_split_pdfs.md) - [Using the CLI](./using_cli.md) - Learn more CLI patterns 2. **Apply to your work:** diff --git a/scripts/generate_docs.py b/scripts/generate_docs.py new file mode 100644 index 0000000..9ff653e --- /dev/null +++ b/scripts/generate_docs.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +""" +Generate CLI and configuration reference documentation for Parxy. + +Run this script whenever CLI commands or configuration options change: + + python scripts/generate_docs.py + +Writes: + docs/reference/cli.md + docs/reference/configuration.md +""" + +import ast +import inspect +import logging +import sys +import textwrap +import types +import typing +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT / 'src')) + +GENERATED_NOTICE = """\ + + +""" + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _literal_values(annotation) -> list | None: + """ + Return the allowed literal values if *annotation* is (or wraps) a Literal type. 
+ + Handles all three common forms: + - ``Literal['a', 'b']`` + - ``Optional[Literal['a', 'b']]`` (typing.Union with None) + - ``Literal['a', 'b'] | None`` (Python 3.10+ union syntax) + """ + if annotation is None: + return None + + origin = getattr(annotation, '__origin__', None) + + # Direct: Literal['a', 'b'] + if origin is typing.Literal: + return list(annotation.__args__) + + # Union: Optional[Literal[...]] or Literal[...] | None + is_union = origin is typing.Union or isinstance(annotation, types.UnionType) + if is_union: + for arg in getattr(annotation, '__args__', ()): + if arg is type(None): + continue + if getattr(arg, '__origin__', None) is typing.Literal: + return list(arg.__args__) + + return None + + +def _type_label(click_type) -> str: + """Human-readable label for a Click parameter type.""" + import click + + if isinstance(click_type, click.Choice): + return ' | '.join(f'`{c}`' for c in click_type.choices) + name = getattr(click_type, 'name', None) or type(click_type).__name__ + return f'`{name.lower()}`' + + +def _default_label(value, is_flag: bool = False) -> str: + """Human-readable default value for a CLI option.""" + import enum + + if value is None: + return '-' + if is_flag: + return '`false`' if value is False else '`true`' + if isinstance(value, bool): + return f'`{str(value).lower()}`' + if isinstance(value, list) and not value: + return '-' + # Enum instances from typer — use .value + if isinstance(value, enum.Enum): + return f'`{value.value}`' + return f'`{value}`' + + +# --------------------------------------------------------------------------- +# CLI reference +# --------------------------------------------------------------------------- + + +def _iter_leaf_commands(group): + """ + Yield (display_name, click_command) for every non-hidden leaf command. + + Handles both flat commands and single-level groups (e.g. the `tui` + sub-app which has a callback but no sub-commands). 
+ """ + import click + + for name, cmd in sorted(group.commands.items()): + if getattr(cmd, 'hidden', False): + continue + if isinstance(cmd, click.Group) and cmd.commands: + # Proper sub-command group — recurse one level + for sub_name, sub_cmd in sorted(cmd.commands.items()): + if not getattr(sub_cmd, 'hidden', False): + yield f'{name} {sub_name}', sub_cmd + else: + yield name, cmd + + +def _format_command(full_name: str, cmd) -> str: + """Format one command as a markdown section.""" + import click + + lines = [] + lines.append(f'## `parxy {full_name}`\n') + + # Help text — strip the Examples block, which is too verbose for a reference page + help_text = (cmd.help or '').strip() + if '\n\nExamples:' in help_text: + help_text = help_text[: help_text.index('\n\nExamples:')].strip() + if help_text: + lines.append(help_text + '\n') + + # Usage line + has_opts = any(isinstance(p, click.Option) for p in cmd.params) + args_str = '' + for p in cmd.params: + if isinstance(p, click.Argument): + mv = p.human_readable_name.upper() + if p.nargs == -1: + mv += '...' 
+ args_str += f' {mv}' + + lines.append( + f'```\nparxy {full_name}{" [OPTIONS]" if has_opts else ""}{args_str}\n```\n' + ) + + # Arguments table + args = [p for p in cmd.params if isinstance(p, click.Argument)] + if args: + lines.append('**Arguments:**\n') + lines.append('| Argument | Required | Description |') + lines.append('|----------|----------|-------------|') + for arg in args: + req = 'Yes' if arg.required else 'No' + desc = (arg.help or '').replace('\n', ' ').strip() + lines.append(f'| `{arg.human_readable_name.upper()}` | {req} | {desc} |') + lines.append('') + + # Options table (skip the --help flag) + opts = [ + p + for p in cmd.params + if isinstance(p, click.Option) and not p.hidden and p.name != 'help' + ] + if opts: + lines.append('**Options:**\n') + lines.append('| Option | Short | Type | Default | Description |') + lines.append('|--------|-------|------|---------|-------------|') + for opt in opts: + long_opts = [o for o in opt.opts if o.startswith('--')] + short_opts = [o for o in opt.opts if not o.startswith('--')] + long = ', '.join(f'`{o}`' for o in long_opts) + short = ', '.join(f'`{o}`' for o in short_opts) if short_opts else '-' + type_str = '`flag`' if opt.is_flag else _type_label(opt.type) + default_str = _default_label(opt.default, is_flag=opt.is_flag) + desc = (opt.help or '').replace('\n', ' ').strip() + lines.append(f'| {long} | {short} | {type_str} | {default_str} | {desc} |') + lines.append('') + + return '\n'.join(lines) + + +def generate_cli_reference() -> str: + import typer + from parxy_cli.cli import app + + cli = typer.main.get_command(app) + + frontmatter = ( + '---\n' + 'title: CLI reference\n' + 'description: Command line reference with all parxy commands, including arguments, options, types, and defaults. 
Prefer to run parxy --help and parxy --help if you have access to the terminal.\n' + '---\n' + ) + + parts = [ + frontmatter, + GENERATED_NOTICE, + '# CLI reference\n', + ] + + for name, cmd in _iter_leaf_commands(cli): + parts.append(_format_command(name, cmd)) + + return '\n'.join(parts) + + +# --------------------------------------------------------------------------- +# Configuration reference +# --------------------------------------------------------------------------- + + +def _field_docstrings(cls) -> dict[str, str]: + """ + Extract per-field docstrings from a Pydantic settings class. + + Pydantic does not capture attribute docstrings automatically, so we + parse the class source with the AST and look for string literals that + immediately follow an annotated assignment. + """ + try: + source = textwrap.dedent(inspect.getsource(cls)) + tree = ast.parse(source) + except Exception: + return {} + + descriptions: dict[str, str] = {} + for node in ast.walk(tree): + if not isinstance(node, ast.ClassDef): + continue + body = node.body + for i, child in enumerate(body): + if not ( + isinstance(child, ast.AnnAssign) and isinstance(child.target, ast.Name) + ): + continue + field_name = child.target.id + if i + 1 >= len(body): + continue + nxt = body[i + 1] + if ( + isinstance(nxt, ast.Expr) + and isinstance(nxt.value, ast.Constant) + and isinstance(nxt.value.value, str) + ): + descriptions[field_name] = nxt.value.value.strip() + + return descriptions + + +def _env_prefix(cls) -> str: + try: + return cls.model_config.get('env_prefix', '').upper() + except Exception: + return '' + + +def _config_default_label(field_name: str, field_info) -> str: + """Human-readable default for a Pydantic field.""" + from pydantic_core import PydanticUndefinedType + + default = field_info.default + + # default_factory means the value is computed at runtime + if field_info.default_factory is not None: + return '*(computed)*' + + if isinstance(default, PydanticUndefinedType): + return 
'*(required)*' + if default is None: + return '-' + if isinstance(default, bool): + return f'`{str(default).lower()}`' + # Only map int → logging level name for fields explicitly named *logging_level* + if isinstance(default, int) and 'logging_level' in field_name: + return f'`{logging.getLevelName(default)}`' + return f'`{default}`' + + +def _format_config_section(cls, heading: str) -> str: + """Format one config class as a markdown section.""" + from pydantic import BaseModel + + prefix = _env_prefix(cls) + docstrings = _field_docstrings(cls) + + lines = [f'## {heading}\n'] + + class_doc = inspect.getdoc(cls) + if class_doc: + # Strip the "All env variables must start with..." boilerplate + first_line = class_doc.splitlines()[0] + if 'All env variables' not in first_line: + lines.append(first_line + '\n') + + lines.append(f'Prefix: `{prefix}`\n') + lines.append('| Variable | Default | Description |') + lines.append('|----------|---------|-------------|') + + for field_name, field_info in cls.model_fields.items(): + # Skip nested model fields — they have their own section + annotation = field_info.annotation + try: + origin = getattr(annotation, '__origin__', None) + if isinstance(annotation, type) and issubclass(annotation, BaseModel): + continue + except TypeError: + pass + + env_var = f'`{prefix}{field_name.upper()}`' + default_str = _config_default_label(field_name, field_info) + + desc = docstrings.get(field_name, '').replace('\n', ' ').strip() + # Trim to first sentence for table readability + if '. ' in desc: + desc = desc[: desc.index('. ') + 1] + + # Append allowed values for Literal-typed fields + lit_vals = _literal_values(field_info.annotation) + if lit_vals: + vals_str = ', '.join(f'`{v}`' for v in lit_vals) + desc = (desc.rstrip('.') + '. ' if desc else '') + f'One of: {vals_str}.' 
+ + # Flag secret fields + annotation_str = str(field_info.annotation) + if 'SecretStr' in annotation_str: + default_str = '*(secret)*' + + lines.append(f'| {env_var} | {default_str} | {desc} |') + + lines.append('') + return '\n'.join(lines) + + +def generate_config_reference() -> str: + from parxy_core.models.config import ( + LandingAIConfig, + LlamaParseConfig, + LlmWhispererConfig, + ParxyConfig, + ParxyTracingConfig, + PdfActConfig, + UnstructuredLocalConfig, + ) + + frontmatter = ( + '---\n' + 'title: Configuration reference\n' + 'description: Configuration options for Parxy and the drivers. Settings are read from the environment or a .env file. Run parxy env to generate a starter .env with some default.\n' + '---\n' + ) + + parts = [ + frontmatter, + GENERATED_NOTICE, + '# Configuration reference\n', + 'All settings are read from environment variables or a `.env` file in your project root.\n', + 'Run `parxy env` to generate a template `.env` with usual configuration options.\n', + ] + + sections = [ + (ParxyConfig, 'Core settings'), + (ParxyTracingConfig, 'Observability / tracing'), + (PdfActConfig, 'PdfAct'), + (LlamaParseConfig, 'LlamaParse'), + (LlmWhispererConfig, 'LLMWhisperer'), + (LandingAIConfig, 'Landing AI'), + (UnstructuredLocalConfig, 'Unstructured library'), + ] + + for cls, title in sections: + parts.append(_format_config_section(cls, title)) + + return '\n'.join(parts) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> None: + out_dir = ROOT / 'docs' / 'reference' + out_dir.mkdir(parents=True, exist_ok=True) + + cli_path = out_dir / 'cli.md' + cli_path.write_text(generate_cli_reference(), encoding='utf-8') + print(f' docs/reference/cli.md') + + config_path = out_dir / 'configuration.md' + config_path.write_text(generate_config_reference(), encoding='utf-8') + print(f' 
docs/reference/configuration.md') + + +if __name__ == '__main__': + main()