From b9338f6fa4d89656c93caf45033b0c892a8ca6d5 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Thu, 14 May 2026 15:17:06 +0200 Subject: [PATCH 1/2] Support PDFMiner --- README.md | 1 + docs/supported_services.md | 3 +- pyproject.toml | 4 + src/parxy_core/drivers/__init__.py | 1 + src/parxy_core/drivers/factory.py | 5 + src/parxy_core/drivers/pdfminer.py | 128 +++++++++++++++++++++++ tests/drivers/test_pdfminer.py | 156 +++++++++++++++++++++++++++++ tests/test_factory.py | 6 ++ uv.lock | 8 +- 9 files changed, 310 insertions(+), 2 deletions(-) create mode 100644 src/parxy_core/drivers/pdfminer.py create mode 100644 tests/drivers/test_pdfminer.py diff --git a/README.md b/README.md index 734b04a..d5035b2 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,7 @@ For more information take a look at our [Getting Started with Parxy tutorial](./ | [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | `llmwhisperer` | ✅ | ✅ | Preview | | [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | `pypdfium2` | ✅ | ✅ | Preview | | [**pdfplumber**](https://github.com/jsvine/pdfplumber) | `pdfplumber` | ✅ | ✅ | Preview | +| [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | `pdfminer` | ✅ | ✅ | Preview | | [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | | | | Planned | | [**Chunkr**](https://www.chunkr.ai/) | | | | Planned | | [**Docling**](https://docling-project.github.io/docling/) | | | | Planned | diff --git a/docs/supported_services.md b/docs/supported_services.md index 19ce7a8..202b302 100644 --- a/docs/supported_services.md +++ b/docs/supported_services.md @@ -17,7 +17,8 @@ Parxy supports the following document processing services and libraries. The **E | [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ | | [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ | | [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | Preview | `pypdfium2` | ✅ | ✅ | -| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ | +| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ | +| [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | Preview | `pdfminer` | ✅ | ✅ | Status meanings: **Live** = stable; **Preview** = functional but the API may change. diff --git a/pyproject.toml b/pyproject.toml index 3d2fc48..7f7f004 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,9 @@ pypdfium2 = [ pdfplumber = [ "pdfplumber>=0.11.0", ] +pdfminer = [ + "pdfminer.six>=20251230", +] all = [ "llama-cloud>=2.0.0", "llmwhisperer-client>=2.4.2", @@ -60,6 +63,7 @@ all = [ "textual>=0.89.0", "pypdfium2>=5.7.1", "pdfplumber>=0.11.0", + "pdfminer.six>=20251230", ] diff --git a/src/parxy_core/drivers/__init__.py b/src/parxy_core/drivers/__init__.py index 0f17d34..812cf15 100644 --- a/src/parxy_core/drivers/__init__.py +++ b/src/parxy_core/drivers/__init__.py @@ -12,3 +12,4 @@ PyPDFium2Driver as PyPDFium2Driver, ) from parxy_core.drivers.pdfplumber import PDFPlumberDriver as PDFPlumberDriver +from parxy_core.drivers.pdfminer import PDFMinerDriver as PDFMinerDriver diff --git a/src/parxy_core/drivers/factory.py b/src/parxy_core/drivers/factory.py index 9717099..1e48bc5 100644 --- a/src/parxy_core/drivers/factory.py +++ b/src/parxy_core/drivers/factory.py @@ -11,6 +11,7 @@ from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver from parxy_core.drivers.pypdfium2 import PyPDFium2Driver from parxy_core.drivers.pdfplumber import PDFPlumberDriver +from parxy_core.drivers.pdfminer import PDFMinerDriver from parxy_core.models import ( PdfActConfig, LandingAIConfig, @@ -217,6 +218,9 @@ def _create_pypdfium_driver(self) -> PyPDFium2Driver: def _create_pdfplumber_driver(self) -> PDFPlumberDriver: return PDFPlumberDriver(logger=self._logger) + def _create_pdfminer_driver(self) -> PDFMinerDriver: + return PDFMinerDriver(logger=self._logger) + def _create_landingai_driver(self) -> LandingAIADEDriver: """Create a LandingAI ADE Driver instance. @@ -293,6 +297,7 @@ def get_supported_drivers(self) -> List[str]: 'unstructured_local', 'pypdfium', 'pdfplumber', + 'pdfminer', ] return supported_drivers diff --git a/src/parxy_core/drivers/pdfminer.py b/src/parxy_core/drivers/pdfminer.py new file mode 100644 index 0000000..442d909 --- /dev/null +++ b/src/parxy_core/drivers/pdfminer.py @@ -0,0 +1,128 @@ +"""PDFMiner driver for parxy.""" + +import io +from typing import Any, Optional + +from parxy_core.drivers import Driver +from parxy_core.models import Document, Page, TocEntry + + +class PDFMinerDriver(Driver): + """PDF parser using PDFMiner - mature text extraction. + + PDFMiner is a mature, pure-Python PDF text extraction library. + Good for text-heavy documents, handles various encodings well. + """ + + supported_levels = ["page", "block"] + + def _initialize_driver(self): + """Initialize PDFMiner driver by checking if the library is available.""" + try: + from pdfminer.high_level import extract_pages # noqa: F401 + except ImportError as e: + raise ImportError( + "pdfminer.six is required. Install with: pip install parxy[pdfminer]" + ) from e + return self + + def _handle( + self, file: str | io.BytesIO | bytes, level: str = "page", **kwargs + ) -> Document: + """Parse PDF to Document object. + + Parameters + ---------- + file : str | io.BytesIO | bytes + Path, URL or stream of the file to parse. + level : str, optional + Desired extraction level. Default is "page". + **kwargs : dict + Additional keyword arguments. + + Returns + ------- + Document + A parsed Document in unified format. + """ + from pdfminer.high_level import extract_pages + from pdfminer.layout import LAParams, LTTextContainer + from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines + from pdfminer.pdfpage import PDFPage, LITERAL_PAGE + from pdfminer.pdfparser import PDFParser + from pdfminer.pdftypes import PDFObjRef + + if level == 'block': + level = 'page' + + filename, stream = self.handle_file_input(file) + + with self._trace_parse(filename, stream, **kwargs) as span: + pages = [] + for page_num, page_layout in enumerate( + extract_pages(io.BytesIO(stream), laparams=LAParams()), start=1 + ): + text = ''.join( + element.get_text() + for element in page_layout + if isinstance(element, LTTextContainer) + ).strip() + pages.append(Page(number=page_num, text=text, blocks=None)) + + span.set_attribute("output.pages", len(pages)) + + outline = None + parser = PDFParser(io.BytesIO(stream)) + try: + doc = PDFDocument(parser) + resolver = _PageNumberResolver(doc, PDFPage, LITERAL_PAGE, PDFObjRef) + entries = [] + for bm_level, title, dest, a, se in doc.get_outlines(): + ref = dest if dest is not None else (a if a is not None else se) + page_num = resolver.resolve(ref) if ref is not None else None + entries.append(TocEntry( + title=title, + page=page_num, + level=bm_level, + )) + if entries: + outline = entries + except PDFNoOutlines: + pass + finally: + parser.flush() + + return Document( + filename=filename, + pages=pages, + outline=outline, + ) + + +class _PageNumberResolver: + """Resolves PDF destination references to 1-based page numbers.""" + + def __init__(self, document: Any, PDFPage: Any, LITERAL_PAGE: Any, PDFObjRef: Any): + self._document = document + self._LITERAL_PAGE = LITERAL_PAGE + self._PDFObjRef = PDFObjRef + self._objid_to_pagenum: dict[int, int] = { + page.pageid: page_num + for page_num, page in enumerate(PDFPage.create_pages(document), 1) + } + + def resolve(self, ref: Any) -> Optional[int]: + if isinstance(ref, self._PDFObjRef): + resolved = ref.resolve() + if isinstance(resolved, dict) and resolved.get('Type') is self._LITERAL_PAGE: + return self._objid_to_pagenum.get(ref.objid) + return self.resolve(resolved) + if isinstance(ref, dict) and 'D' in ref: + return self.resolve(ref['D']) + if isinstance(ref, list): + first_ref = next((e for e in ref if isinstance(e, self._PDFObjRef)), None) + if first_ref is not None: + return self.resolve(first_ref) + if isinstance(ref, bytes): + return self.resolve(self._document.get_dest(ref)) + return None diff --git a/tests/drivers/test_pdfminer.py b/tests/drivers/test_pdfminer.py new file mode 100644 index 0000000..1c219de --- /dev/null +++ b/tests/drivers/test_pdfminer.py @@ -0,0 +1,156 @@ +import os +import pytest +from unittest.mock import Mock, patch, MagicMock + +from parxy_core.models import Page, TocEntry + +from parxy_core.drivers import PDFMinerDriver +from parxy_core.exceptions import FileNotFoundException + + +class TestPDFMinerDriver: + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def test_pdfminer_driver_can_be_created(self): + driver = PDFMinerDriver() + + assert driver.supported_levels == ['page', 'block'] + + def test_pdfminer_driver_unrecognized_level_handled(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('empty-doc.pdf') + + with pytest.raises(ValueError) as excinfo: + driver.parse(path, level='custom') + + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + + def test_pdfminer_driver_handle_not_existing_file(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level='page') + + def test_pdfminer_driver_read_empty_document_page_level(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.outline is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].blocks is None + assert document.pages[0].number == 1 + assert document.pages[0].text == '1' + + def test_pdfminer_driver_read_document(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].blocks is None + assert document.pages[0].number == 1 + assert ( + document.pages[0].text + == 'This is the header \nThis is a test PDF to be used as input in unit \ntests \nThis is a heading 1 \nThis is a paragraph below heading 1 \n1' + ) + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_pdfminer_driver_tracing_span_created(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + driver = PDFMinerDriver() + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + mock_tracer.span.assert_called() + + span_calls = mock_tracer.span.call_args_list + doc_processing_call = [ + c for c in span_calls if c[0][0] == 'document-processing' + ][0] + + assert doc_processing_call[1]['driver'] == 'PDFMinerDriver' + assert doc_processing_call[1]['level'] == 'page' + + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.processed' + assert count_call[1]['driver'] == 'PDFMinerDriver' + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_pdfminer_driver_tracing_exception_recorded(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = PDFMinerDriver() + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level="page") + + mock_tracer.error.assert_called_once() + error_call = mock_tracer.error.call_args + assert error_call[0][0] == 'Parsing failed' + + mock_tracer.count.assert_called_once() + + def test_pdfminer_driver_reads_outline(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document.outline is not None + assert len(document.outline) == 1 + entry = document.outline[0] + assert isinstance(entry, TocEntry) + assert entry.title == 'This is a heading 1' + assert entry.page == 1 + assert entry.level == 1 + assert entry.bbox is None + + def test_pdfminer_driver_empty_document_has_no_outline(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + assert document.outline is None + + def test_pdfminer_driver_records_elapsed_time(self): + driver = PDFMinerDriver() + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document.parsing_metadata is not None + assert 'driver_elapsed_time' in document.parsing_metadata + assert isinstance(document.parsing_metadata['driver_elapsed_time'], float) + assert document.parsing_metadata['driver_elapsed_time'] > 0 diff --git a/tests/test_factory.py b/tests/test_factory.py index d243ae0..b8c5e1d 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -11,6 +11,7 @@ from parxy_core.drivers.landingai import LandingAIADEDriver from parxy_core.drivers import PyPDFium2Driver from parxy_core.drivers import PDFPlumberDriver +from parxy_core.drivers import PDFMinerDriver from parxy_core.models import Document from parxy_core.models import ParxyConfig @@ -128,3 +129,8 @@ def test_pdfplumber_driver_instantiated(self): DriverFactory.reset() driver = DriverFactory.build().driver('pdfplumber') assert isinstance(driver, PDFPlumberDriver) + + def test_pdfminer_driver_instantiated(self): + DriverFactory.reset() + driver = DriverFactory.build().driver('pdfminer') + assert isinstance(driver, PDFMinerDriver) diff --git a/uv.lock b/uv.lock index 2881f7f..3d9d41a 100644 --- a/uv.lock +++ b/uv.lock @@ -1921,6 +1921,7 @@ all = [ { name = "landingai-ade" }, { name = "llama-cloud" }, { name = "llmwhisperer-client" }, + { name = "pdfminer-six" }, { name = "pdfplumber" }, { name = "pypdfium2" }, { name = "textual" }, @@ -1935,6 +1936,9 @@ llama = [ llmwhisperer = [ { name = "llmwhisperer-client" }, ] +pdfminer = [ + { name = "pdfminer-six" }, +] pdfplumber = [ { name = "pdfplumber" }, ] @@ -1967,6 +1971,8 @@ requires-dist = [ { name = "opentelemetry-exporter-otlp", specifier = ">=1.37.0" }, { name = "opentelemetry-proto", specifier = ">=1.37.0" }, { name = "opentelemetry-sdk", specifier = ">=1.37.0" }, + { name = "pdfminer-six", marker = "extra == 'all'", specifier = ">=20251230" }, + { name = "pdfminer-six", marker = "extra == 'pdfminer'", specifier = ">=20251230" }, { name = "pdfplumber", marker = "extra == 'all'", specifier = ">=0.11.0" }, { name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.0" }, { name = "pydantic", specifier = ">=2.11.7" }, @@ -1984,7 +1990,7 @@ requires-dist = [ { name = "unstructured", extras = ["pdf"], marker = "extra == 'unstructured-local'", specifier = ">=0.18.13" }, { name = "validators", specifier = ">=0.35.0" }, ] -provides-extras = ["llama", "llmwhisperer", "unstructured-local", "landingai", "tui", "pypdfium2", "pdfplumber", "all"] +provides-extras = ["llama", "llmwhisperer", "unstructured-local", "landingai", "tui", "pypdfium2", "pdfplumber", "pdfminer", "all"] [package.metadata.requires-dev] dev = [ From 2d7af283d5665907267685f4604598b352470164 Mon Sep 17 00:00:00 2001 From: avvertix <5672748+avvertix@users.noreply.github.com> Date: Thu, 14 May 2026 13:17:43 +0000 Subject: [PATCH 2/2] Fix styling --- src/parxy_core/drivers/pdfminer.py | 25 +++++++++++++++---------- tests/drivers/test_pdfminer.py | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/parxy_core/drivers/pdfminer.py b/src/parxy_core/drivers/pdfminer.py index 442d909..aa519eb 100644 --- a/src/parxy_core/drivers/pdfminer.py +++ b/src/parxy_core/drivers/pdfminer.py @@ -14,7 +14,7 @@ class PDFMinerDriver(Driver): Good for text-heavy documents, handles various encodings well. """ - supported_levels = ["page", "block"] + supported_levels = ['page', 'block'] def _initialize_driver(self): """Initialize PDFMiner driver by checking if the library is available.""" @@ -22,12 +22,12 @@ def _initialize_driver(self): from pdfminer.high_level import extract_pages # noqa: F401 except ImportError as e: raise ImportError( - "pdfminer.six is required. Install with: pip install parxy[pdfminer]" + 'pdfminer.six is required. Install with: pip install parxy[pdfminer]' ) from e return self def _handle( - self, file: str | io.BytesIO | bytes, level: str = "page", **kwargs + self, file: str | io.BytesIO | bytes, level: str = 'page', **kwargs ) -> Document: """Parse PDF to Document object. @@ -69,7 +69,7 @@ def _handle( ).strip() pages.append(Page(number=page_num, text=text, blocks=None)) - span.set_attribute("output.pages", len(pages)) + span.set_attribute('output.pages', len(pages)) outline = None parser = PDFParser(io.BytesIO(stream)) @@ -80,11 +80,13 @@ def _handle( for bm_level, title, dest, a, se in doc.get_outlines(): ref = dest if dest is not None else (a if a is not None else se) page_num = resolver.resolve(ref) if ref is not None else None - entries.append(TocEntry( - title=title, - page=page_num, - level=bm_level, - )) + entries.append( + TocEntry( + title=title, + page=page_num, + level=bm_level, + ) + ) if entries: outline = entries except PDFNoOutlines: @@ -114,7 +116,10 @@ def __init__(self, document: Any, PDFPage: Any, LITERAL_PAGE: Any, PDFObjRef: An def resolve(self, ref: Any) -> Optional[int]: if isinstance(ref, self._PDFObjRef): resolved = ref.resolve() - if isinstance(resolved, dict) and resolved.get('Type') is self._LITERAL_PAGE: + if ( + isinstance(resolved, dict) + and resolved.get('Type') is self._LITERAL_PAGE + ): return self._objid_to_pagenum.get(ref.objid) return self.resolve(resolved) if isinstance(ref, dict) and 'D' in ref: diff --git a/tests/drivers/test_pdfminer.py b/tests/drivers/test_pdfminer.py index 1c219de..ea662e1 100644 --- a/tests/drivers/test_pdfminer.py +++ b/tests/drivers/test_pdfminer.py @@ -113,7 +113,7 @@ def test_pdfminer_driver_tracing_exception_recorded(self, mock_tracer): path = self.__fixture_path('non-existing-file.pdf') with pytest.raises(FileNotFoundException): - driver.parse(path, level="page") + driver.parse(path, level='page') mock_tracer.error.assert_called_once() error_call = mock_tracer.error.call_args