Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 113 additions & 5 deletions src/parxy_core/drivers/llamaparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,15 @@
LlamaPage = object

from parxy_core.drivers import Driver
from parxy_core.models import Document, Page, BoundingBox, TextBlock, HierarchyLevel
from parxy_core.models import (
Document,
Page,
BoundingBox,
TextBlock,
TableBlock,
ImageBlock,
HierarchyLevel,
)
from parxy_core.utils import safe_json_dumps
from parxy_core.exceptions import (
ParsingException,
Expand Down Expand Up @@ -502,6 +510,96 @@ def _convert_text_block(text_block: PageItem, page_number: int) -> TextBlock:
)


def _convert_table_block(text_block: PageItem, page_number: int) -> TableBlock:
    """Build a `TableBlock` from a LlamaParse `PageItem` that holds a table.

    The item's markdown rendering (`md`) becomes the block text, so the table
    can be emitted verbatim in markdown output.

    Parameters
    ----------
    text_block : PageItem
        The LlamaParse page item containing table data.
    page_number : int
        The page number to record on the block. It is stored unchanged;
        `_convert_page` passes LlamaParse's `page.page` value as-is.

    Returns
    -------
    TableBlock
        The converted `TableBlock` object with markdown table content.
    """
    # LlamaParse describes the box as origin + size; BoundingBox wants corners.
    box = text_block.bBox
    left, top = box.x, box.y
    bbox = BoundingBox(
        x0=left,
        y0=top,
        x1=left + box.w,
        y1=top + box.h,
    )

    # A missing or empty `md` attribute collapses to an empty string.
    markdown = getattr(text_block, 'md', None)
    text_value = markdown if markdown else ''

    # Fall back to the generic 'table' role when the item type is empty or
    # has no entry in the role mapping.
    category = text_block.type
    role = 'table'
    if category:
        role = LLAMAPARSE_TO_ROLE.get(category, 'table')

    # Keep the remaining raw fields for traceability, minus those already
    # mapped onto dedicated attributes.
    remaining = text_block.model_dump(exclude={'bBox', 'value', 'type', 'lvl'})

    return TableBlock(
        type='table',
        role=role,
        category=category,
        text=text_value,
        bbox=bbox,
        page=page_number,
        source_data=remaining,
    )


def _convert_image_block(image_data, page_number: int) -> ImageBlock:
    """Convert a LlamaParse image entry to an `ImageBlock`.

    Parameters
    ----------
    image_data
        Image data from the LlamaParse page. May be a plain dict, an object
        exposing `model_dump()` (e.g. a Pydantic model), or any object whose
        fields are readable through `vars()`.
    page_number : int
        The page number to record on the block. It is stored unchanged;
        `_convert_page` passes LlamaParse's `page.page` value as-is.

    Returns
    -------
    ImageBlock
        The converted `ImageBlock` object. `alt_text` is the space-joined OCR
        text found in the image, or `None` when there is none.
    """
    # Normalise to dict so we can handle both Pydantic models and plain dicts
    if isinstance(image_data, dict):
        img = image_data
    elif hasattr(image_data, 'model_dump'):
        img = image_data.model_dump()
    else:
        img = vars(image_data)

    def _number(key: str):
        # `dict.get(key, 0)` only covers a *missing* key. Optional fields
        # dumped from a model (or JSON nulls) are present with value None,
        # which would make the additions below raise TypeError.
        return img.get(key) or 0

    x, y = _number('x'), _number('y')
    bbox = BoundingBox(
        x0=x,
        y0=y,
        x1=x + _number('width'),
        y1=y + _number('height'),
    )

    # Build alt_text from OCR entries when available. Entries may be dicts or
    # objects; empty/None texts are skipped so they neither break the join
    # nor introduce doubled spaces.
    ocr_entries = img.get('ocr') or []
    ocr_texts = (
        entry.get('text') if isinstance(entry, dict) else getattr(entry, 'text', None)
        for entry in ocr_entries
    )
    alt_text = ' '.join(text for text in ocr_texts if text).strip() or None

    return ImageBlock(
        type='image',
        role='figure',
        category='figure',
        name=img.get('name'),
        alt_text=alt_text,
        bbox=bbox,
        page=page_number,
        source_data=img,
    )


def _convert_page(
page: LlamaPage,
level: str,
Expand All @@ -520,15 +618,25 @@ def _convert_page(
Page
The converted `Page` object.
"""
text_blocks = None
blocks = None
if HierarchyLevel[level] >= HierarchyLevel.BLOCK:
text_blocks = [_convert_text_block(item, page.page - 1) for item in page.items]
blocks = []
for item in page.items:
if item.type in ('table', 'tables'):
blocks.append(_convert_table_block(item, page.page))
else:
blocks.append(_convert_text_block(item, page.page))

# Process page-level images into ImageBlocks
images = getattr(page, 'images', None) or []
for image_data in images:
blocks.append(_convert_image_block(image_data, page.page))
return Page(
number=page.page - 1,
number=page.page,
width=page.width,
height=page.height,
text=page.text if page.text != 'NO_CONTENT_HERE' else '',
blocks=text_blocks,
blocks=blocks,
source_data=page.model_dump(
exclude={'page', 'text', 'items', 'width', 'height'}
),
Expand Down
2 changes: 1 addition & 1 deletion src/parxy_core/drivers/llmwhisperer.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def llmwhisperer_to_parxy(
):
pages.append(
Page(
number=page_number,
number=page_number + 1,
text=page_text,
source_data=doc['extraction']['metadata'].get(str(page_number), None),
)
Expand Down
24 changes: 18 additions & 6 deletions src/parxy_core/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,16 @@ def isEmpty(self) -> bool:
return not self.text or self.text.strip() == ''


class ImageBlock(Block): ...
class ImageBlock(Block):
name: Optional[str] = None
alt_text: Optional[str] = None


class TableBlock(Block): ...
class TableBlock(Block):
text: str

def isEmpty(self) -> bool:
return not self.text or self.text.strip() == ''


class Page(BaseModel):
Expand Down Expand Up @@ -197,12 +203,18 @@ def markdown(self) -> str:
page_parts.append(block.text.strip())

elif isinstance(block, ImageBlock):
# Placeholder for images - could be enhanced with actual image data
page_parts.append('![Image]')
ext = (
block.name.rsplit('.', 1)[-1]
if block.name and '.' in block.name
else ''
)
lang = f'image:{ext}' if ext else 'image'
alt = block.alt_text or ''
page_parts.append(f'```{lang}\n{alt}\n```')

elif isinstance(block, TableBlock):
# Placeholder for tables - could be enhanced with actual table data
page_parts.append('| Table content |')
if block.text.strip():
page_parts.append(block.text.strip())

if page_parts:
markdown_parts.append('\n\n'.join(page_parts))
Expand Down
1 change: 1 addition & 0 deletions tests/drivers/test_llamaparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def test_llamaparse_driver_read_document(self):
assert document.metadata is None
assert len(document.pages) == 1
assert isinstance(document.pages[0], Page)
assert document.pages[0].number == 1
assert (
document.pages[0].text
== 'This is the header\n\nThis is a test PDF to be used as input in unit\ntests\n\nThis is a heading 1\nThis is a paragraph below heading 1\n\n1'
Expand Down
1 change: 1 addition & 0 deletions tests/drivers/test_llmwhisperer.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def test_llmwhisperer_driver_read_document(self):
assert document.metadata is None
assert len(document.pages) == 1
assert isinstance(document.pages[0], Page)
assert document.pages[0].number == 1
assert (
document.pages[0].text
== '\n\nThis is the header \n\nThis is a test PDF to be used as input in unit \n\ntests \n\nThis is a heading 1 \nThis is a paragraph below heading 1 \n\n 1 \n'
Expand Down
Binary file added tests/fixtures/pdf-headings-images-tables.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions tests/fixtures/sources/empty-document.typ
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// This document results in a blank PDF on purpose
Binary file added tests/fixtures/sources/generated-image.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
57 changes: 57 additions & 0 deletions tests/fixtures/sources/headings-images-tables.typ
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
= Introduction

In this report, we will write some _lorem ipsum_.

#lorem(90)


= Section heading

#lorem(15)

+ The climate
- Temperature
- Precipitation
+ The topography
+ The geology



== Subsection with image and figure

#image("generated-image.png", width: 50%)

#lorem(15)


#figure(
image("generated-image.png", width: 50%),
caption: [
A generated image using _Google Nano Banana_ model of a winter landscape.
],
)



== Subsection with table

#lorem(15)

#table(
columns: (1fr, auto, auto),
inset: 10pt,
align: horizon,
table.header(
[*Shape*], [*Volume*], [*Parameters*],
),
"cylinder",
$ pi h (D^2 - d^2) / 4 $,
[
$h$: height \
$D$: outer radius \
$d$: inner radius
],
"tetrahedron",
$ sqrt(2) / 12 a^3 $,
[$a$: edge length]
)
Loading