Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ For more information take a look at our [Getting Started with Parxy tutorial](./
| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | `llmwhisperer` | ✅ | ✅ | Preview |
| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | `pypdfium2` | ✅ | ✅ | Preview |
| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | `pdfplumber` | ✅ | ✅ | Preview |
| [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | `pdfminer` | ✅ | ✅ | Preview |
| [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | | | | Planned |
| [**Chunkr**](https://www.chunkr.ai/) | | | | Planned |
| [**Docling**](https://docling-project.github.io/docling/) | | | | Planned |
Expand Down
3 changes: 2 additions & 1 deletion docs/supported_services.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ Parxy supports the following document processing services and libraries. The **E
| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ |
| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ |
| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | Preview | `pypdfium2` | ✅ | ✅ |
| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ |
| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ |
| [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | Preview | `pdfminer` | ✅ | ✅ |

Status meanings: **Live** = stable; **Preview** = functional but the API may change.

Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ pypdfium2 = [
pdfplumber = [
"pdfplumber>=0.11.0",
]
pdfminer = [
"pdfminer.six>=20251230",
]
all = [
"llama-cloud>=2.0.0",
"llmwhisperer-client>=2.4.2",
Expand All @@ -60,6 +63,7 @@ all = [
"textual>=0.89.0",
"pypdfium2>=5.7.1",
"pdfplumber>=0.11.0",
"pdfminer.six>=20251230",
]


Expand Down
1 change: 1 addition & 0 deletions src/parxy_core/drivers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
PyPDFium2Driver as PyPDFium2Driver,
)
from parxy_core.drivers.pdfplumber import PDFPlumberDriver as PDFPlumberDriver
from parxy_core.drivers.pdfminer import PDFMinerDriver as PDFMinerDriver
5 changes: 5 additions & 0 deletions src/parxy_core/drivers/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver
from parxy_core.drivers.pypdfium2 import PyPDFium2Driver
from parxy_core.drivers.pdfplumber import PDFPlumberDriver
from parxy_core.drivers.pdfminer import PDFMinerDriver
from parxy_core.models import (
PdfActConfig,
LandingAIConfig,
Expand Down Expand Up @@ -217,6 +218,9 @@ def _create_pypdfium_driver(self) -> PyPDFium2Driver:
def _create_pdfplumber_driver(self) -> PDFPlumberDriver:
    """Create a pdfplumber Driver instance.

    Returns
    -------
    PDFPlumberDriver
        A driver constructed with this factory's logger.
    """
    driver = PDFPlumberDriver(logger=self._logger)
    return driver

def _create_pdfminer_driver(self) -> PDFMinerDriver:
    """Create a PDFMiner Driver instance.

    Returns
    -------
    PDFMinerDriver
        A driver constructed with this factory's logger.
    """
    driver = PDFMinerDriver(logger=self._logger)
    return driver

def _create_landingai_driver(self) -> LandingAIADEDriver:
"""Create a LandingAI ADE Driver instance.

Expand Down Expand Up @@ -293,6 +297,7 @@ def get_supported_drivers(self) -> List[str]:
'unstructured_local',
'pypdfium',
'pdfplumber',
'pdfminer',
]

return supported_drivers
Expand Down
133 changes: 133 additions & 0 deletions src/parxy_core/drivers/pdfminer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""PDFMiner driver for parxy."""

import io
from typing import Any, Optional

from parxy_core.drivers import Driver
from parxy_core.models import Document, Page, TocEntry


class PDFMinerDriver(Driver):
    """PDF parser using PDFMiner - mature text extraction.

    PDFMiner is a mature, pure-Python PDF text extraction library.
    Good for text-heavy documents, handles various encodings well.

    Both supported levels yield the same result: each page carries its
    plain text and ``blocks`` is always ``None``.
    """

    supported_levels = ['page', 'block']

    def _initialize_driver(self):
        """Initialize PDFMiner driver by checking if the library is available.

        Returns
        -------
        PDFMinerDriver
            This driver instance.

        Raises
        ------
        ImportError
            If ``pdfminer.six`` cannot be imported.
        """
        try:
            from pdfminer.high_level import extract_pages  # noqa: F401
        except ImportError as e:
            raise ImportError(
                'pdfminer.six is required. Install with: pip install parxy[pdfminer]'
            ) from e
        return self

    def _handle(
        self, file: str | io.BytesIO | bytes, level: str = 'page', **kwargs
    ) -> Document:
        """Parse PDF to Document object.

        Parameters
        ----------
        file : str | io.BytesIO | bytes
            Path, URL or stream of the file to parse.
        level : str, optional
            Desired extraction level. Default is "page". The value does not
            change the output of this driver: only page text is extracted.
        **kwargs : dict
            Additional keyword arguments, forwarded to the tracing span.

        Returns
        -------
        Document
            A parsed Document in unified format. ``outline`` is ``None`` when
            the PDF has no bookmarks; an outline entry whose destination
            cannot be resolved gets ``page=None``.
        """
        from pdfminer.high_level import extract_pages
        from pdfminer.layout import LAParams, LTTextContainer
        from pdfminer.pdfdocument import (
            PDFDestinationNotFound,
            PDFDocument,
            PDFNoOutlines,
        )
        from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
        from pdfminer.pdfparser import PDFParser
        from pdfminer.pdftypes import PDFObjRef

        filename, stream = self.handle_file_input(file)

        with self._trace_parse(filename, stream, **kwargs) as span:
            pages = []
            for page_num, page_layout in enumerate(
                extract_pages(io.BytesIO(stream), laparams=LAParams()), start=1
            ):
                # Only text containers contribute; figures, lines and other
                # layout objects are skipped.
                text = ''.join(
                    element.get_text()
                    for element in page_layout
                    if isinstance(element, LTTextContainer)
                ).strip()
                pages.append(Page(number=page_num, text=text, blocks=None))

            span.set_attribute('output.pages', len(pages))

        # NOTE(review): the outline pass is placed after the tracing span;
        # confirm this matches the intended span scope.
        outline = None
        parser = PDFParser(io.BytesIO(stream))
        try:
            doc = PDFDocument(parser)
            resolver = _PageNumberResolver(doc, PDFPage, LITERAL_PAGE, PDFObjRef)
            entries = []
            for bm_level, title, dest, action, se in doc.get_outlines():
                # Prefer the explicit destination, then the action, then the
                # structure element.
                ref = next(
                    (c for c in (dest, action, se) if c is not None), None
                )
                try:
                    page_num = resolver.resolve(ref) if ref is not None else None
                except PDFDestinationNotFound:
                    # A bookmark pointing at a missing named destination must
                    # not discard the text that was already extracted.
                    page_num = None
                entries.append(
                    TocEntry(
                        title=title,
                        page=page_num,
                        level=bm_level,
                    )
                )
            if entries:
                outline = entries
        except PDFNoOutlines:
            # The document simply has no bookmarks.
            pass
        finally:
            parser.flush()

        return Document(
            filename=filename,
            pages=pages,
            outline=outline,
        )


class _PageNumberResolver:
    """Resolves PDF destination references to 1-based page numbers.

    The pdfminer classes and constants are passed in by the caller rather
    than imported here.

    Parameters
    ----------
    document : Any
        Object providing ``get_dest(name)`` for named destinations.
    PDFPage : Any
        Object providing ``create_pages(document)``; each yielded page must
        expose a ``pageid`` attribute.
    LITERAL_PAGE : Any
        Sentinel compared by identity against a dictionary's ``'Type'`` value
        to recognise page objects.
    PDFObjRef : Any
        Class of indirect references; instances expose ``objid`` and
        ``resolve()``.
    """

    def __init__(self, document: Any, PDFPage: Any, LITERAL_PAGE: Any, PDFObjRef: Any):
        self._document = document
        self._LITERAL_PAGE = LITERAL_PAGE
        self._PDFObjRef = PDFObjRef
        # Page object id -> 1-based position in document order.
        self._objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    def resolve(self, ref: Any) -> Optional[int]:
        """Follow ``ref`` until it lands on a page and return its number.

        Handled shapes, checked in this order: an indirect reference, a
        dictionary with a ``'D'`` key, a list (its first indirect reference is
        followed), and a ``bytes`` named destination looked up through the
        document.

        Parameters
        ----------
        ref : Any
            Destination value taken from an outline entry.

        Returns
        -------
        Optional[int]
            The 1-based page number, or ``None`` when the destination has an
            unsupported shape, points at an unknown page, or forms a
            reference cycle.
        """
        # Resolution is a deterministic chain, so meeting the same reference
        # or name twice means the chain loops and can never reach a page.
        seen_objids = set()
        seen_names = set()
        current = ref

        while True:
            if isinstance(current, self._PDFObjRef):
                if current.objid in seen_objids:
                    return None
                seen_objids.add(current.objid)
                resolved = current.resolve()
                if (
                    isinstance(resolved, dict)
                    and resolved.get('Type') is self._LITERAL_PAGE
                ):
                    return self._objid_to_pagenum.get(current.objid)
                current = resolved
            elif isinstance(current, dict) and 'D' in current:
                current = current['D']
            elif isinstance(current, list):
                current = next(
                    (e for e in current if isinstance(e, self._PDFObjRef)), None
                )
                if current is None:
                    return None
            elif isinstance(current, bytes):
                if current in seen_names:
                    return None
                seen_names.add(current)
                current = self._document.get_dest(current)
            else:
                return None
156 changes: 156 additions & 0 deletions tests/drivers/test_pdfminer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import os
import pytest
from unittest.mock import Mock, patch, MagicMock

from parxy_core.models import Page, TocEntry

from parxy_core.drivers import PDFMinerDriver
from parxy_core.exceptions import FileNotFoundException


class TestPDFMinerDriver:
    """Tests for the PDFMiner driver: parsing, outline and tracing."""

    def __fixture_path(self, file: str) -> str:
        """Return the path of ``file`` in the ``fixtures`` directory located
        one level above this test module's directory."""
        tests_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        return os.path.join(tests_root, 'fixtures', file)

    def test_pdfminer_driver_can_be_created(self):
        assert PDFMinerDriver().supported_levels == ['page', 'block']

    def test_pdfminer_driver_unrecognized_level_handled(self):
        pdf_driver = PDFMinerDriver()
        empty_pdf = self.__fixture_path('empty-doc.pdf')

        with pytest.raises(ValueError) as excinfo:
            pdf_driver.parse(empty_pdf, level='custom')

        message = str(excinfo.value)
        assert 'not supported' in message
        assert '[custom]' in message

    def test_pdfminer_driver_handle_not_existing_file(self):
        pdf_driver = PDFMinerDriver()
        missing_pdf = self.__fixture_path('non-existing-file.pdf')

        with pytest.raises(FileNotFoundException):
            pdf_driver.parse(missing_pdf, level='page')

    def test_pdfminer_driver_read_empty_document_page_level(self):
        pdf_driver = PDFMinerDriver()
        parsed = pdf_driver.parse(self.__fixture_path('empty-doc.pdf'), level='page')

        assert parsed is not None
        assert parsed.language is None
        assert parsed.outline is None
        assert parsed.metadata is None
        assert len(parsed.pages) == 1

        only_page = parsed.pages[0]
        assert isinstance(only_page, Page)
        assert only_page.blocks is None
        assert only_page.number == 1
        assert only_page.text == '1'

    def test_pdfminer_driver_read_document(self):
        pdf_driver = PDFMinerDriver()
        parsed = pdf_driver.parse(self.__fixture_path('test-doc.pdf'), level='page')

        expected_text = 'This is the header \nThis is a test PDF to be used as input in unit \ntests \nThis is a heading 1 \nThis is a paragraph below heading 1 \n1'

        assert parsed is not None
        assert parsed.language is None
        assert parsed.metadata is None
        assert len(parsed.pages) == 1

        only_page = parsed.pages[0]
        assert isinstance(only_page, Page)
        assert only_page.blocks is None
        assert only_page.number == 1
        assert only_page.text == expected_text

    @patch('parxy_core.drivers.abstract_driver.tracer')
    def test_pdfminer_driver_tracing_span_created(self, mock_tracer):
        # Make the mocked span usable as a context manager that yields itself.
        span_mock = MagicMock()
        span_mock.__enter__ = Mock(return_value=span_mock)
        span_mock.__exit__ = Mock(return_value=False)
        mock_tracer.span = Mock(return_value=span_mock)
        mock_tracer.count = Mock()
        mock_tracer.info = Mock()

        pdf_driver = PDFMinerDriver()
        pdf_driver.parse(self.__fixture_path('empty-doc.pdf'), level='page')

        mock_tracer.span.assert_called()

        processing_calls = [
            recorded
            for recorded in mock_tracer.span.call_args_list
            if recorded.args[0] == 'document-processing'
        ]
        processing_call = processing_calls[0]

        assert processing_call.kwargs['driver'] == 'PDFMinerDriver'
        assert processing_call.kwargs['level'] == 'page'

        mock_tracer.count.assert_called_once()
        counted = mock_tracer.count.call_args
        assert counted.args[0] == 'documents.processed'
        assert counted.kwargs['driver'] == 'PDFMinerDriver'

    @patch('parxy_core.drivers.abstract_driver.tracer')
    def test_pdfminer_driver_tracing_exception_recorded(self, mock_tracer):
        # Make the mocked span usable as a context manager that yields itself.
        span_mock = MagicMock()
        span_mock.__enter__ = Mock(return_value=span_mock)
        span_mock.__exit__ = Mock(return_value=False)
        mock_tracer.span = Mock(return_value=span_mock)
        mock_tracer.count = Mock()
        mock_tracer.error = Mock()

        pdf_driver = PDFMinerDriver()
        missing_pdf = self.__fixture_path('non-existing-file.pdf')

        with pytest.raises(FileNotFoundException):
            pdf_driver.parse(missing_pdf, level='page')

        mock_tracer.error.assert_called_once()
        reported = mock_tracer.error.call_args
        assert reported.args[0] == 'Parsing failed'

        mock_tracer.count.assert_called_once()

    def test_pdfminer_driver_reads_outline(self):
        pdf_driver = PDFMinerDriver()
        parsed = pdf_driver.parse(self.__fixture_path('test-doc.pdf'), level='page')

        assert parsed.outline is not None
        assert len(parsed.outline) == 1

        bookmark = parsed.outline[0]
        assert isinstance(bookmark, TocEntry)
        assert bookmark.title == 'This is a heading 1'
        assert bookmark.page == 1
        assert bookmark.level == 1
        assert bookmark.bbox is None

    def test_pdfminer_driver_empty_document_has_no_outline(self):
        pdf_driver = PDFMinerDriver()
        parsed = pdf_driver.parse(self.__fixture_path('empty-doc.pdf'), level='page')

        assert parsed.outline is None

    def test_pdfminer_driver_records_elapsed_time(self):
        pdf_driver = PDFMinerDriver()
        parsed = pdf_driver.parse(self.__fixture_path('test-doc.pdf'), level='page')

        timing = parsed.parsing_metadata
        assert timing is not None
        assert 'driver_elapsed_time' in timing
        assert isinstance(timing['driver_elapsed_time'], float)
        assert timing['driver_elapsed_time'] > 0
6 changes: 6 additions & 0 deletions tests/test_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from parxy_core.drivers.landingai import LandingAIADEDriver
from parxy_core.drivers import PyPDFium2Driver
from parxy_core.drivers import PDFPlumberDriver
from parxy_core.drivers import PDFMinerDriver
from parxy_core.models import Document
from parxy_core.models import ParxyConfig

Expand Down Expand Up @@ -128,3 +129,8 @@ def test_pdfplumber_driver_instantiated(self):
DriverFactory.reset()
driver = DriverFactory.build().driver('pdfplumber')
assert isinstance(driver, PDFPlumberDriver)

def test_pdfminer_driver_instantiated(self):
    """The 'pdfminer' key must yield a PDFMinerDriver from a fresh factory."""
    DriverFactory.reset()
    factory = DriverFactory.build()
    built = factory.driver('pdfminer')
    assert isinstance(built, PDFMinerDriver)
Loading