Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 57 additions & 16 deletions src/nutrient_dws/api/direct.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,8 @@ def optimize_pdf(
grayscale_annotations: Convert annotations to grayscale (default: False).
disable_images: Remove all images from the PDF (default: False).
mrc_compression: Enable MRC compression (default: False).
image_optimization_quality: Image optimization quality from 1 (least optimized) to 4 (most optimized) (default: 2).
image_optimization_quality: Image optimization quality from 1 (least optimized)
to 4 (most optimized) (default: 2).
linearize: Linearize (optimize for web viewing) the PDF (default: False).

Returns:
Expand All @@ -487,7 +488,8 @@ def optimize_pdf(
Raises:
AuthenticationError: If API key is missing or invalid.
APIError: For other API errors.
ValueError: If image_optimization_quality is not between 1-4 or no optimization is enabled
ValueError: If image_optimization_quality is not between 1-4
or no optimization is enabled

Example:
# Aggressive optimization for minimum file size
Expand Down Expand Up @@ -709,7 +711,11 @@ def split_pdf(
output_paths=["part1.pdf", "part2.pdf"]
)
"""
from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count
from nutrient_dws.file_handler import (
get_pdf_page_count,
prepare_file_for_upload,
save_file_output,
)

# Validate inputs
if not page_ranges:
Expand All @@ -731,15 +737,21 @@ def split_pdf(

# Validate start is within document bounds
if start < 0 or start >= num_of_pages:
raise ValueError(f"Page range {i}: start index {start} is out of bounds (0-{num_of_pages-1})")
raise ValueError(
f"Page range {i}: start index {start} is out of bounds (0-{num_of_pages - 1})"
)

# If end is specified, validate it's within document bounds
if "end" in page_range:
end = page_range["end"]
if end < 0 or end >= num_of_pages:
raise ValueError(f"Page range {i}: end index {end} is out of bounds (0-{num_of_pages-1})")
raise ValueError(
f"Page range {i}: end index {end} is out of bounds (0-{num_of_pages - 1})"
)
if end < start:
raise ValueError(f"Page range {i}: end index {end} cannot be less than start index {start}")
raise ValueError(
f"Page range {i}: end index {end} cannot be less than start index {start}"
)

results = []

Expand Down Expand Up @@ -814,7 +826,11 @@ def duplicate_pdf_pages(
output_path="reordered.pdf"
)
"""
from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count
from nutrient_dws.file_handler import (
get_pdf_page_count,
prepare_file_for_upload,
save_file_output,
)

# Validate inputs
if not page_indexes:
Expand All @@ -837,7 +853,9 @@ def duplicate_pdf_pages(
else:
# Validate positive indexes are within bounds
if page_index >= num_of_pages:
raise ValueError(f"Page index {page_index} is out of bounds (0-{num_of_pages-1})")
raise ValueError(
f"Page index {page_index} is out of bounds (0-{num_of_pages - 1})"
)
# For positive indexes, create single-page range
parts.append({"file": "file", "pages": {"start": page_index, "end": page_index}})

Expand Down Expand Up @@ -905,7 +923,11 @@ def delete_pdf_pages(
output_path="pages_deleted.pdf"
)
"""
from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count
from nutrient_dws.file_handler import (
get_pdf_page_count,
prepare_file_for_upload,
save_file_output,
)

# Validate inputs
if not page_indexes:
Expand All @@ -924,7 +946,7 @@ def delete_pdf_pages(
# Validate page indexes are within bounds
for idx in page_indexes:
if idx >= num_of_pages:
raise ValueError(f"Page index {idx} is out of bounds (0-{num_of_pages-1})")
raise ValueError(f"Page index {idx} is out of bounds (0-{num_of_pages - 1})")

# Prepare file for upload
file_field, file_data = prepare_file_for_upload(input_file, "file")
Expand Down Expand Up @@ -952,7 +974,9 @@ def delete_pdf_pages(

# Add remaining pages after the last deleted page
num_of_pages = get_pdf_page_count(input_file)
if (current_page > 0 or (current_page == 0 and len(sorted_indexes) == 0)) and current_page < num_of_pages:
if (
current_page > 0 or (current_page == 0 and len(sorted_indexes) == 0)
) and current_page < num_of_pages:
# Add all remaining pages from current_page onwards
parts.append({"file": "file", "pages": {"start": current_page}})

Expand Down Expand Up @@ -1098,7 +1122,11 @@ def add_page(
output_path="with_blank_pages.pdf"
)
"""
from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count
from nutrient_dws.file_handler import (
get_pdf_page_count,
prepare_file_for_upload,
save_file_output,
)

# Validate inputs
if page_count < 1:
Expand Down Expand Up @@ -1394,7 +1422,11 @@ def set_page_label(
labels=[{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}]
)
"""
from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count
from nutrient_dws.file_handler import (
get_pdf_page_count,
prepare_file_for_upload,
save_file_output,
)

# Validate inputs
if not labels:
Expand Down Expand Up @@ -1422,18 +1454,27 @@ def set_page_label(
# Validate start is within document bounds
start = pages["start"]
if start < 0 or start >= num_of_pages:
raise ValueError(f"Label configuration {i}: start index {start} is out of bounds (0-{num_of_pages-1})")
raise ValueError(
f"Label configuration {i}: start index {start}"
f" is out of bounds (0-{num_of_pages - 1})"
)

# Normalize pages - only include 'end' if explicitly provided
normalized_pages = {"start": start}
if "end" in pages:
end = pages["end"]
# Validate end is within document bounds
if end < 0 or end >= num_of_pages:
raise ValueError(f"Label configuration {i}: end index {end} is out of bounds (0-{num_of_pages-1})")
raise ValueError(
f"Label configuration {i}: end index {end}"
f" is out of bounds (0-{num_of_pages - 1})"
)
# Validate end is not less than start
if end < start:
raise ValueError(f"Label configuration {i}: end index {end} cannot be less than start index {start}")
raise ValueError(
f"Label configuration {i}: end index {end}"
f" cannot be less than start index {start}"
)
normalized_pages["end"] = end
# If no end is specified, leave it out (meaning "to end of document")

Expand Down
21 changes: 11 additions & 10 deletions src/nutrient_dws/file_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,21 +205,22 @@ def get_file_size(file_input: FileInput) -> int | None:

return None


def get_pdf_page_count(pdf_input: FileInput) -> int:
"""Zero dependency way to get the number of pages in a PDF.

Args:
file_input: File path, bytes, or file-like object. Has to be of a PDF file
pdf_input: File path, bytes, or file-like object. Must contain a PDF document.

Returns:
Number of pages in a PDF.
"""
if isinstance(pdf_input, (str, Path)):
with open(pdf_input, 'rb') as f:
with open(pdf_input, "rb") as f:
pdf_bytes = f.read()
elif isinstance(pdf_input, bytes):
pdf_bytes = pdf_input
elif hasattr(pdf_input, 'read') and hasattr(pdf_input, 'seek') and hasattr(pdf_input, 'tell'):
elif hasattr(pdf_input, "read") and hasattr(pdf_input, "seek") and hasattr(pdf_input, "tell"):
pos = pdf_input.tell()
pdf_input.seek(0)
pdf_bytes = pdf_input.read()
Expand All @@ -228,35 +229,35 @@ def get_pdf_page_count(pdf_input: FileInput) -> int:
raise TypeError("Unsupported input type. Expected str, Path, bytes, or seekable BinaryIO.")

# Find all PDF objects
objects = re.findall(rb'(\d+)\s+(\d+)\s+obj(.*?)endobj', pdf_bytes, re.DOTALL)
objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL)

# Get the Catalog Object
catalog_obj = None
for obj_num, gen_num, obj_data in objects:
if b'/Type' in obj_data and b'/Catalog' in obj_data:
for _obj_num, _gen_num, obj_data in objects:
if b"/Type" in obj_data and b"/Catalog" in obj_data:
catalog_obj = obj_data
break

if not catalog_obj:
raise ValueError("Could not find /Catalog object in PDF.")

# Extract /Pages reference (e.g. 3 0 R)
pages_ref_match = re.search(rb'/Pages\s+(\d+)\s+(\d+)\s+R', catalog_obj)
pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj)
if not pages_ref_match:
raise ValueError("Could not find /Pages reference in /Catalog.")
pages_obj_num = pages_ref_match.group(1).decode()
pages_obj_gen = pages_ref_match.group(2).decode()

# Step 3: Find the referenced /Pages object
pages_obj_pattern = fr'{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj'.encode()
pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode()
pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL)
if not pages_obj_match:
raise ValueError("Could not find root /Pages object.")
pages_obj_data = pages_obj_match.group(1)

# Step 4: Extract /Count
count_match = re.search(rb'/Count\s+(\d+)', pages_obj_data)
count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data)
if not count_match:
raise ValueError("Could not find /Count in root /Pages object.")

return int(count_match.group(1))
return int(count_match.group(1))
28 changes: 21 additions & 7 deletions tests/integration/test_direct_api_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,9 @@ def test_split_pdf_integration(self, client, sample_multipage_pdf_path, tmp_path
# Verify the number of pages in each output PDF
total_page_count = get_pdf_page_count(sample_multipage_pdf_path)
assert get_pdf_page_count(result[0]) == 1 # First PDF should have 1 page
assert get_pdf_page_count(result[1]) == total_page_count - 1 # Second PDF should have the remaining pages
assert (
get_pdf_page_count(result[1]) == total_page_count - 1
) # Second PDF should have the remaining pages

def test_split_pdf_with_output_files(self, client, sample_multipage_pdf_path, tmp_path):
"""Test split_pdf method saving to output files."""
Expand Down Expand Up @@ -307,7 +309,9 @@ def test_split_pdf_with_output_files(self, client, sample_multipage_pdf_path, tm

# Verify the number of pages in the second output PDF
total_page_count = get_pdf_page_count(sample_multipage_pdf_path)
assert get_pdf_page_count(str(tmp_path / "remaining.pdf")) == total_page_count - 1 # Second PDF should have remaining pages
assert (
get_pdf_page_count(str(tmp_path / "remaining.pdf")) == total_page_count - 1
) # Second PDF should have remaining pages

def test_split_pdf_no_ranges_error(self, client, sample_pdf_path):
"""Test split_pdf with no ranges returns first page by default."""
Expand Down Expand Up @@ -396,7 +400,9 @@ def test_duplicate_pdf_pages_negative_indexes(self, client, sample_pdf_path):
assert_is_pdf(result)

# Verify the number of pages in the output PDF
assert get_pdf_page_count(result) == 3 # Should have 3 pages (last page, first page, last page)
assert (
get_pdf_page_count(result) == 3
) # Should have 3 pages (last page, first page, last page)

def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path):
"""Test duplicate_pdf_pages method with empty page_indexes raises error."""
Expand All @@ -415,7 +421,9 @@ def test_delete_pdf_pages_basic(self, client, sample_multipage_pdf_path):

# Verify the number of pages in the output PDF
total_page_count = get_pdf_page_count(sample_multipage_pdf_path)
assert get_pdf_page_count(result) == total_page_count - 1 # Should have 2 pages (deleted first page from 3-page PDF)
assert (
get_pdf_page_count(result) == total_page_count - 1
) # Should have 2 pages (deleted first page from 3-page PDF)

def test_delete_pdf_pages_multiple(self, client, sample_multipage_pdf_path):
"""Test delete_pdf_pages method with multiple page deletion."""
Expand All @@ -428,7 +436,9 @@ def test_delete_pdf_pages_multiple(self, client, sample_multipage_pdf_path):

# Verify the number of pages in the output PDF
total_page_count = get_pdf_page_count(sample_multipage_pdf_path)
assert get_pdf_page_count(result) == total_page_count - 2 # Should have 1 page (deleted pages 1 and 3 from 3-page PDF)
assert (
get_pdf_page_count(result) == total_page_count - 2
) # Should have 1 page (deleted pages 1 and 3 from 3-page PDF)

def test_delete_pdf_pages_with_output_file(self, client, sample_multipage_pdf_path, tmp_path):
"""Test delete_pdf_pages method saving to output file."""
Expand All @@ -449,7 +459,9 @@ def test_delete_pdf_pages_with_output_file(self, client, sample_multipage_pdf_pa

# Verify the number of pages in the output PDF
total_page_count = get_pdf_page_count(sample_multipage_pdf_path)
assert get_pdf_page_count(output_path) == total_page_count - 1 # Should have 2 pages (deleted page 2 from 3-page PDF)
assert (
get_pdf_page_count(output_path) == total_page_count - 1
) # Should have 2 pages (deleted page 2 from 3-page PDF)

def test_delete_pdf_pages_negative_indexes_error(self, client, sample_pdf_path):
"""Test delete_pdf_pages method with negative indexes raises error."""
Expand All @@ -473,7 +485,9 @@ def test_delete_pdf_pages_duplicate_indexes(self, client, sample_multipage_pdf_p

# Verify the number of pages in the output PDF
total_page_count = get_pdf_page_count(sample_multipage_pdf_path)
assert get_pdf_page_count(result) == total_page_count - 2 # Should have 1 page (deleted pages 1 and 2 from 3-page PDF)
assert (
get_pdf_page_count(result) == total_page_count - 2
) # Should have 1 page (deleted pages 1 and 2 from 3-page PDF)

# Tests for add_page
def test_add_page_at_beginning(self, client, sample_pdf_path):
Expand Down
8 changes: 6 additions & 2 deletions tests/integration/test_live_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,9 @@ def test_duplicate_pdf_pages_negative_indexes(self, client, sample_pdf_path):
assert_is_pdf(result)

# Verify the number of pages in the output PDF
assert get_pdf_page_count(result) == 3 # Should have 3 pages (last page, first page, last page)
assert (
get_pdf_page_count(result) == 3
) # Should have 3 pages (last page, first page, last page)

def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path):
"""Test duplicate_pdf_pages method with empty page_indexes raises error."""
Expand All @@ -326,7 +328,9 @@ def test_delete_pdf_pages_basic(self, client, sample_pdf_path):

# Verify the number of pages in the output PDF
total_pages = get_pdf_page_count(sample_pdf_path)
assert get_pdf_page_count(result) == total_pages - 1 # Should have one less page than original
assert (
get_pdf_page_count(result) == total_pages - 1
) # Should have one less page than original

def test_delete_pdf_pages_multiple(self, client, sample_pdf_path):
"""Test delete_pdf_pages method with multiple page deletion."""
Expand Down
8 changes: 6 additions & 2 deletions tests/integration/test_new_tools_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ def test_create_redactions_preset_with_output_file(
"""Test creating redactions with preset and saving to file."""
output_path = tmp_path / "redacted_preset.pdf"
result = client.create_redactions_preset(
sample_pdf_with_sensitive_data, preset="international-phone-number", output_path=str(output_path)
sample_pdf_with_sensitive_data,
preset="international-phone-number",
output_path=str(output_path),
)
assert result is None
assert output_path.exists()
Expand All @@ -94,7 +96,9 @@ def test_create_redactions_text(self, client, sample_pdf_with_sensitive_data):
"""Test creating redactions for exact text matches."""
# Use a very common letter that should exist
result = client.create_redactions_text(
sample_pdf_with_sensitive_data, text="a", case_sensitive=False,
sample_pdf_with_sensitive_data,
text="a",
case_sensitive=False,
)
assert_is_pdf(result)
assert len(result) > 0
Expand Down