diff --git a/src/nutrient_dws/api/direct.py b/src/nutrient_dws/api/direct.py index 5f3b06b..690289c 100644 --- a/src/nutrient_dws/api/direct.py +++ b/src/nutrient_dws/api/direct.py @@ -478,7 +478,8 @@ def optimize_pdf( grayscale_annotations: Convert annotations to grayscale (default: False). disable_images: Remove all images from the PDF (default: False). mrc_compression: MCR compression (default: False). - image_optimization_quality: Image optimization quality from 1 (least optimized) to 4 (most optimized) (default: 2). + image_optimization_quality: Image optimization quality from 1 (least optimized) + to 4 (most optimized) (default: 2). linearize: Linearize (optimize for web viewing) the PDF (default: False). Returns: @@ -487,7 +488,8 @@ def optimize_pdf( Raises: AuthenticationError: If API key is missing or invalid. APIError: For other API errors. - ValueError: If image_optimization_quality is not between 1-4 or no optimization is enabled + ValueError: If image_optimization_quality is not between 1-4 + or no optimization is enabled Example: # Aggressive optimization for minimum file size @@ -709,7 +711,11 @@ def split_pdf( output_paths=["part1.pdf", "part2.pdf"] ) """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count + from nutrient_dws.file_handler import ( + get_pdf_page_count, + prepare_file_for_upload, + save_file_output, + ) # Validate inputs if not page_ranges: @@ -731,15 +737,21 @@ def split_pdf( # Validate start is within document bounds if start < 0 or start >= num_of_pages: - raise ValueError(f"Page range {i}: start index {start} is out of bounds (0-{num_of_pages-1})") + raise ValueError( + f"Page range {i}: start index {start} is out of bounds (0-{num_of_pages - 1})" + ) # If end is specified, validate it's within document bounds if "end" in page_range: end = page_range["end"] if end < 0 or end >= num_of_pages: - raise ValueError(f"Page range {i}: end index {end} is out of bounds (0-{num_of_pages-1})") + raise ValueError( + f"Page range {i}: end index {end} is out of bounds (0-{num_of_pages - 1})" + ) if end < start: - raise ValueError(f"Page range {i}: end index {end} cannot be less than start index {start}") + raise ValueError( + f"Page range {i}: end index {end} cannot be less than start index {start}" + ) results = [] @@ -814,7 +826,11 @@ def duplicate_pdf_pages( output_path="reordered.pdf" ) """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count + from nutrient_dws.file_handler import ( + get_pdf_page_count, + prepare_file_for_upload, + save_file_output, + ) # Validate inputs if not page_indexes: @@ -837,7 +853,9 @@ def duplicate_pdf_pages( else: # Validate positive indexes are within bounds if page_index >= num_of_pages: - raise ValueError(f"Page index {page_index} is out of bounds (0-{num_of_pages-1})") + raise ValueError( + f"Page index {page_index} is out of bounds (0-{num_of_pages - 1})" + ) # For positive indexes, create single-page range parts.append({"file": "file", "pages": {"start": page_index, "end": page_index}}) @@ -905,7 +923,11 @@ def delete_pdf_pages( output_path="pages_deleted.pdf" ) """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count + from nutrient_dws.file_handler import ( + get_pdf_page_count, + prepare_file_for_upload, + save_file_output, + ) # Validate inputs if not page_indexes: @@ -924,7 +946,7 @@ def delete_pdf_pages( # Validate page indexes are within bounds for idx in page_indexes: if idx >= num_of_pages: - raise ValueError(f"Page index {idx} is out of bounds (0-{num_of_pages-1})") + raise ValueError(f"Page index {idx} is out of bounds (0-{num_of_pages - 1})") # Prepare file for upload file_field, file_data = prepare_file_for_upload(input_file, "file") @@ -952,7 +974,9 @@ def delete_pdf_pages( # Add remaining pages after the last deleted page num_of_pages = get_pdf_page_count(input_file) - if (current_page > 0 or (current_page == 0 and len(sorted_indexes) == 0)) and current_page < num_of_pages: + if ( + current_page > 0 or (current_page == 0 and len(sorted_indexes) == 0) + ) and current_page < num_of_pages: # Add all remaining pages from current_page onwards parts.append({"file": "file", "pages": {"start": current_page}}) @@ -1098,7 +1122,11 @@ def add_page( output_path="with_blank_pages.pdf" ) """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count + from nutrient_dws.file_handler import ( + get_pdf_page_count, + prepare_file_for_upload, + save_file_output, + ) # Validate inputs if page_count < 1: @@ -1394,7 +1422,11 @@ def set_page_label( labels=[{"pages": {"start": 0, "end": 0}, "label": "Cover Page"}] ) """ - from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output, get_pdf_page_count + from nutrient_dws.file_handler import ( + get_pdf_page_count, + prepare_file_for_upload, + save_file_output, + ) # Validate inputs if not labels: @@ -1422,7 +1454,10 @@ def set_page_label( # Validate start is within document bounds start = pages["start"] if start < 0 or start >= num_of_pages: - raise ValueError(f"Label configuration {i}: start index {start} is out of bounds (0-{num_of_pages-1})") + raise ValueError( + f"Label configuration {i}: start index {start}" + f" is out of bounds (0-{num_of_pages - 1})" + ) # Normalize pages - only include 'end' if explicitly provided normalized_pages = {"start": start} @@ -1430,10 +1465,16 @@ def set_page_label( end = pages["end"] # Validate end is within document bounds if end < 0 or end >= num_of_pages: - raise ValueError(f"Label configuration {i}: end index {end} is out of bounds (0-{num_of_pages-1})") + raise ValueError( + f"Label configuration {i}: end index {end}" + f" is out of bounds (0-{num_of_pages - 1})" + ) # Validate end is not less than start if end < start: - raise ValueError(f"Label configuration {i}: end index {end} cannot be less than start index {start}") + raise ValueError( + f"Label configuration {i}: end index {end}" + f" cannot be less than start index {start}" + ) normalized_pages["end"] = end # If no end is specified, leave it out (meaning "to end of document") diff --git a/src/nutrient_dws/file_handler.py b/src/nutrient_dws/file_handler.py index a896cb6..f79cfde 100644 --- a/src/nutrient_dws/file_handler.py +++ b/src/nutrient_dws/file_handler.py @@ -205,21 +205,22 @@ def get_file_size(file_input: FileInput) -> int | None: return None + def get_pdf_page_count(pdf_input: FileInput) -> int: """Zero dependency way to get the number of pages in a PDF. Args: - file_input: File path, bytes, or file-like object. Has to be of a PDF file + pdf_input: File path, bytes, or file-like object. Has to be of a PDF file Returns: Number of pages in a PDF. """ if isinstance(pdf_input, (str, Path)): - with open(pdf_input, 'rb') as f: + with open(pdf_input, "rb") as f: pdf_bytes = f.read() elif isinstance(pdf_input, bytes): pdf_bytes = pdf_input - elif hasattr(pdf_input, 'read') and hasattr(pdf_input, 'seek') and hasattr(pdf_input, 'tell'): + elif hasattr(pdf_input, "read") and hasattr(pdf_input, "seek") and hasattr(pdf_input, "tell"): pos = pdf_input.tell() pdf_input.seek(0) pdf_bytes = pdf_input.read() @@ -228,12 +229,12 @@ def get_pdf_page_count(pdf_input: FileInput) -> int: raise TypeError("Unsupported input type. Expected str, Path, bytes, or seekable BinaryIO.") # Find all PDF objects - objects = re.findall(rb'(\d+)\s+(\d+)\s+obj(.*?)endobj', pdf_bytes, re.DOTALL) + objects = re.findall(rb"(\d+)\s+(\d+)\s+obj(.*?)endobj", pdf_bytes, re.DOTALL) # Get the Catalog Object catalog_obj = None - for obj_num, gen_num, obj_data in objects: - if b'/Type' in obj_data and b'/Catalog' in obj_data: + for _obj_num, _gen_num, obj_data in objects: + if b"/Type" in obj_data and b"/Catalog" in obj_data: catalog_obj = obj_data break @@ -241,22 +242,22 @@ def get_pdf_page_count(pdf_input: FileInput) -> int: raise ValueError("Could not find /Catalog object in PDF.") # Extract /Pages reference (e.g. 3 0 R) - pages_ref_match = re.search(rb'/Pages\s+(\d+)\s+(\d+)\s+R', catalog_obj) + pages_ref_match = re.search(rb"/Pages\s+(\d+)\s+(\d+)\s+R", catalog_obj) if not pages_ref_match: raise ValueError("Could not find /Pages reference in /Catalog.") pages_obj_num = pages_ref_match.group(1).decode() pages_obj_gen = pages_ref_match.group(2).decode() # Step 3: Find the referenced /Pages object - pages_obj_pattern = fr'{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj'.encode() + pages_obj_pattern = rf"{pages_obj_num}\s+{pages_obj_gen}\s+obj(.*?)endobj".encode() pages_obj_match = re.search(pages_obj_pattern, pdf_bytes, re.DOTALL) if not pages_obj_match: raise ValueError("Could not find root /Pages object.") pages_obj_data = pages_obj_match.group(1) # Step 4: Extract /Count - count_match = re.search(rb'/Count\s+(\d+)', pages_obj_data) + count_match = re.search(rb"/Count\s+(\d+)", pages_obj_data) if not count_match: raise ValueError("Could not find /Count in root /Pages object.") - return int(count_match.group(1)) \ No newline at end of file + return int(count_match.group(1)) diff --git a/tests/integration/test_direct_api_integration.py b/tests/integration/test_direct_api_integration.py index 4ee08df..a36b1c9 100644 --- a/tests/integration/test_direct_api_integration.py +++ b/tests/integration/test_direct_api_integration.py @@ -273,7 +273,9 @@ def test_split_pdf_integration(self, client, sample_multipage_pdf_path, tmp_path # Verify the number of pages in each output PDF total_page_count = get_pdf_page_count(sample_multipage_pdf_path) assert get_pdf_page_count(result[0]) == 1 # First PDF should have 1 page - assert get_pdf_page_count(result[1]) == total_page_count - 1 # Second PDF should have the remaining pages + assert ( + get_pdf_page_count(result[1]) == total_page_count - 1 + ) # Second PDF should have the remaining pages def test_split_pdf_with_output_files(self, client, sample_multipage_pdf_path, tmp_path): """Test split_pdf method saving to output files.""" @@ -307,7 +309,9 @@ def test_split_pdf_with_output_files(self, client, sample_multipage_pdf_path, tm # Verify the number of pages in the second output PDF total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(str(tmp_path / "remaining.pdf")) == total_page_count - 1 # Second PDF should have remaining pages + assert ( + get_pdf_page_count(str(tmp_path / "remaining.pdf")) == total_page_count - 1 + ) # Second PDF should have remaining pages def test_split_pdf_no_ranges_error(self, client, sample_pdf_path): """Test split_pdf with no ranges returns first page by default.""" @@ -396,7 +400,9 @@ def test_duplicate_pdf_pages_negative_indexes(self, client, sample_pdf_path): assert_is_pdf(result) # Verify the number of pages in the output PDF - assert get_pdf_page_count(result) == 3 # Should have 3 pages (last page, first page, last page) + assert ( + get_pdf_page_count(result) == 3 + ) # Should have 3 pages (last page, first page, last page) def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): """Test duplicate_pdf_pages method with empty page_indexes raises error.""" @@ -415,7 +421,9 @@ def test_delete_pdf_pages_basic(self, client, sample_multipage_pdf_path): # Verify the number of pages in the output PDF total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(result) == total_page_count - 1 # Should have 2 pages (deleted first page from 3-page PDF) + assert ( + get_pdf_page_count(result) == total_page_count - 1 + ) # Should have 2 pages (deleted first page from 3-page PDF) def test_delete_pdf_pages_multiple(self, client, sample_multipage_pdf_path): """Test delete_pdf_pages method with multiple page deletion.""" @@ -428,7 +436,9 @@ def test_delete_pdf_pages_multiple(self, client, sample_multipage_pdf_path): # Verify the number of pages in the output PDF total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(result) == total_page_count - 2 # Should have 1 page (deleted pages 1 and 3 from 3-page PDF) + assert ( + get_pdf_page_count(result) == total_page_count - 2 + ) # Should have 1 page (deleted pages 1 and 3 from 3-page PDF) def test_delete_pdf_pages_with_output_file(self, client, sample_multipage_pdf_path, tmp_path): """Test delete_pdf_pages method saving to output file.""" @@ -449,7 +459,9 @@ def test_delete_pdf_pages_with_output_file(self, client, sample_multipage_pdf_pa # Verify the number of pages in the output PDF total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(output_path) == total_page_count - 1 # Should have 2 pages (deleted page 2 from 3-page PDF) + assert ( + get_pdf_page_count(output_path) == total_page_count - 1 + ) # Should have 2 pages (deleted page 2 from 3-page PDF) def test_delete_pdf_pages_negative_indexes_error(self, client, sample_pdf_path): """Test delete_pdf_pages method with negative indexes raises error.""" @@ -473,7 +485,9 @@ def test_delete_pdf_pages_duplicate_indexes(self, client, sample_multipage_pdf_p # Verify the number of pages in the output PDF total_page_count = get_pdf_page_count(sample_multipage_pdf_path) - assert get_pdf_page_count(result) == total_page_count - 2 # Should have 1 page (deleted pages 1 and 2 from 3-page PDF) + assert ( + get_pdf_page_count(result) == total_page_count - 2 + ) # Should have 1 page (deleted pages 1 and 2 from 3-page PDF) # Tests for add_page def test_add_page_at_beginning(self, client, sample_pdf_path): diff --git a/tests/integration/test_live_api.py b/tests/integration/test_live_api.py index 2243407..4591f42 100644 --- a/tests/integration/test_live_api.py +++ b/tests/integration/test_live_api.py @@ -306,7 +306,9 @@ def test_duplicate_pdf_pages_negative_indexes(self, client, sample_pdf_path): assert_is_pdf(result) # Verify the number of pages in the output PDF - assert get_pdf_page_count(result) == 3 # Should have 3 pages (last page, first page, last page) + assert ( + get_pdf_page_count(result) == 3 + ) # Should have 3 pages (last page, first page, last page) def test_duplicate_pdf_pages_empty_indexes_error(self, client, sample_pdf_path): """Test duplicate_pdf_pages method with empty page_indexes raises error.""" @@ -326,7 +328,9 @@ def test_delete_pdf_pages_basic(self, client, sample_pdf_path): # Verify the number of pages in the output PDF total_pages = get_pdf_page_count(sample_pdf_path) - assert get_pdf_page_count(result) == total_pages - 1 # Should have one less page than original + assert ( + get_pdf_page_count(result) == total_pages - 1 + ) # Should have one less page than original def test_delete_pdf_pages_multiple(self, client, sample_pdf_path): """Test delete_pdf_pages method with multiple page deletion.""" diff --git a/tests/integration/test_new_tools_integration.py b/tests/integration/test_new_tools_integration.py index 47e5edf..7dd70e2 100644 --- a/tests/integration/test_new_tools_integration.py +++ b/tests/integration/test_new_tools_integration.py @@ -75,7 +75,9 @@ def test_create_redactions_preset_with_output_file( """Test creating redactions with preset and saving to file.""" output_path = tmp_path / "redacted_preset.pdf" result = client.create_redactions_preset( - sample_pdf_with_sensitive_data, preset="international-phone-number", output_path=str(output_path) + sample_pdf_with_sensitive_data, + preset="international-phone-number", + output_path=str(output_path), ) assert result is None assert output_path.exists() @@ -94,7 +96,9 @@ def test_create_redactions_text(self, client, sample_pdf_with_sensitive_data): """Test creating redactions for exact text matches.""" # Use a very common letter that should exist result = client.create_redactions_text( - sample_pdf_with_sensitive_data, text="a", case_sensitive=False, + sample_pdf_with_sensitive_data, + text="a", + case_sensitive=False, ) assert_is_pdf(result) assert len(result) > 0