diff --git a/CHANGELOG.md b/CHANGELOG.md index 8045627acd..8d12f2338a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.6-dev5 +## 0.12.6-dev6 ### Enhancements diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index 5e9652ff55..f57da48750 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -5,7 +5,8 @@ from PIL import Image from unstructured_inference.inference import layout from unstructured_inference.inference.elements import TextRegion -from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout +from unstructured_inference.inference.layout import DocumentLayout, PageLayout +from unstructured_inference.inference.layoutelement import LayoutElement from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import ( @@ -28,7 +29,7 @@ class MockPageLayout(layout.PageLayout): - def __init__(self, number: int, image: Image): + def __init__(self, number: int, image: Image.Image): self.number = number self.image = image diff --git a/typings/pptx/table.pyi b/typings/pptx/table.pyi index edfa21b534..a1b3976ab3 100644 --- a/typings/pptx/table.pyi +++ b/typings/pptx/table.pyi @@ -1 +1,11 @@ -class Table: ... +class Table: + @property + def rows(self) -> tuple[_Row]: ... + +class _Row: + @property + def cells(self) -> tuple[_Cell]: ... + +class _Cell: + @property + def text(self) -> str: ... 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cdc18c18da..d28790e97d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.6-dev5" # pragma: no cover +__version__ = "0.12.6-dev6" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 60bcf3e248..654dcc4bdc 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -899,7 +899,7 @@ class Footer(Text): category = "Footer" -TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = { +TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, type[Text]] = { ElementType.TITLE: Title, ElementType.SECTION_HEADER: Title, ElementType.HEADLINE: Title, diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index b3dcb01d2c..244323d35e 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -15,9 +15,9 @@ from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( - _add_element_metadata, - _remove_element_metadata, + add_element_metadata, exactly_one, + remove_element_metadata, set_element_hierarchy, ) @@ -602,16 +602,11 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: # NOTE(robinson) - Attached files have already run through this logic # in their own partitioning function if element.metadata.attached_to_filename is None: - _add_element_metadata( - element, - **metadata_kwargs, # type: ignore - ) + add_element_metadata(element, **metadata_kwargs) return elements else: - return _remove_element_metadata( - elements, - ) + return remove_element_metadata(elements) return wrapper @@ -639,16 +634,11 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: # NOTE(robinson) - Attached files have already run through this logic # in their own partitioning function if 
element.metadata.attached_to_filename is None: - _add_element_metadata( - element, - filetype=FILETYPE_TO_MIMETYPE[filetype], - ) + add_element_metadata(element, filetype=FILETYPE_TO_MIMETYPE[filetype]) return elements else: - return _remove_element_metadata( - elements, - ) + return remove_element_metadata(elements) return wrapper diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index e4e16be9d3..225d779f73 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -6,17 +6,7 @@ from datetime import datetime from io import BufferedReader, BytesIO, TextIOWrapper from tempfile import SpooledTemporaryFile -from typing import ( - IO, - TYPE_CHECKING, - Any, - BinaryIO, - Dict, - List, - Optional, - Tuple, - Union, -) +from typing import IO, TYPE_CHECKING, Any, BinaryIO, List, Optional import emoji from tabulate import tabulate @@ -39,11 +29,8 @@ from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT from unstructured.utils import dependency_exists, first -if dependency_exists("docx") and dependency_exists("docx.table"): - from docx.table import Table as docxtable - if dependency_exists("pptx") and dependency_exists("pptx.table"): - from pptx.table import Table as pptxtable + from pptx.table import Table as PptxTable if dependency_exists("numpy") and dependency_exists("cv2"): from unstructured.partition.utils.sorting import sort_page_elements @@ -80,14 +67,20 @@ } -def get_last_modified_date(filename: str) -> Union[str, None]: +def get_last_modified_date(filename: str) -> Optional[str]: + """Modification time of file at path `filename`, if it exists. + + Raises `OSError` when `filename` does not exist on the local filesystem or is inaccessible. + + Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like + "2024-03-05T17:02:53". 
+ """ modify_date = datetime.fromtimestamp(os.path.getmtime(filename)) return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z") -def get_last_modified_date_from_file( - file: Union[IO[bytes], SpooledTemporaryFile[bytes], BinaryIO, bytes], -) -> Union[str, None]: +def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]: + """Modified timestamp of `file` if it corresponds to a file on the local filesystem.""" filename = None if hasattr(file, "name"): filename = file.name @@ -100,15 +93,11 @@ def get_last_modified_date_from_file( def normalize_layout_element( - layout_element: Union[ - "LayoutElement", - Element, - Dict[str, Any], - ], + layout_element: LayoutElement | Element | dict[str, Any], coordinate_system: Optional[CoordinateSystem] = None, infer_list_items: bool = True, source_format: Optional[str] = "html", -) -> Union[Element, List[Element]]: +) -> Element | list[Element]: """Converts an unstructured_inference LayoutElement object to an unstructured Element.""" if isinstance(layout_element, Element) and source_format == "html": @@ -123,7 +112,7 @@ def normalize_layout_element( else: layout_dict = layout_element - text = layout_dict.get("text") + text = layout_dict.get("text", "") # Both `coordinates` and `coordinate_system` must be present # in order to add coordinates metadata to the element. 
coordinates = layout_dict.get("coordinates") @@ -148,7 +137,7 @@ def normalize_layout_element( ) else: return ListItem( - text=text if text else "", + text=text, coordinates=coordinates, coordinate_system=coordinate_system, metadata=class_prob_metadata, @@ -156,6 +145,7 @@ def normalize_layout_element( ) elif element_type in TYPE_TO_TEXT_ELEMENT_MAP: + assert isinstance(element_type, str) _element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type] _element_class = _element_class( text=text, @@ -187,7 +177,7 @@ def normalize_layout_element( ) else: return Text( - text=text if text else "", + text=text, coordinates=coordinates, coordinate_system=coordinate_system, metadata=class_prob_metadata, @@ -197,10 +187,10 @@ def normalize_layout_element( def layout_list_to_list_items( text: Optional[str], - coordinates: Optional[Tuple[Tuple[float, float], ...]], + coordinates: Optional[tuple[tuple[float, float], ...]], coordinate_system: Optional[CoordinateSystem], - metadata=Optional[ElementMetadata], - detection_origin=Optional[str], + metadata: Optional[ElementMetadata], + detection_origin: Optional[str], ) -> List[Element]: """Converts a list LayoutElement to a list of ListItem elements.""" split_items = ENUMERATED_BULLETS_RE.split(text) if text else [] @@ -226,9 +216,8 @@ def layout_list_to_list_items( def set_element_hierarchy( - elements: List[Element], - ruleset: Dict[str, List[str]] = HIERARCHY_RULE_SET, -) -> List[Element]: + elements: List[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET +) -> list[Element]: """Sets the parent_id for each element in the list of elements based on the element's category, depth and a ruleset @@ -274,23 +263,25 @@ def set_element_hierarchy( return elements -def _add_element_metadata( +def add_element_metadata( element: Element, filename: Optional[str] = None, filetype: Optional[str] = None, page_number: Optional[int] = None, url: Optional[str] = None, text_as_html: Optional[str] = None, - coordinates: Optional[Tuple[Tuple[float, 
float], ...]] = None, + coordinates: Optional[tuple[tuple[float, float], ...]] = None, coordinate_system: Optional[CoordinateSystem] = None, section: Optional[str] = None, image_path: Optional[str] = None, detection_origin: Optional[str] = None, languages: Optional[List[str]] = None, - **kwargs, + **kwargs: Any, ) -> Element: - """Adds document metadata to the document element. Document metadata includes information - like the filename, source url, and page number.""" + """Adds document metadata to the document element. + + Document metadata includes information like the filename, source url, and page number. + """ coordinates_metadata = ( CoordinatesMetadata( @@ -342,12 +333,11 @@ def _add_element_metadata( return element -def _remove_element_metadata( - layout_elements, -) -> List[Element]: - """Removes document metadata from the document element. Document metadata includes information - like the filename, source url, and page number.""" - # Init an empty list of elements to write to +def remove_element_metadata(layout_elements) -> list[Element]: + """Removes document metadata from the document element. + + Document metadata includes information like the filename, source url, and page number. 
+ """ elements: List[Element] = [] metadata = ElementMetadata() for layout_element in layout_elements: @@ -442,8 +432,8 @@ def exactly_one(**kwargs: Any) -> None: def spooled_to_bytes_io_if_needed( - file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile[bytes]]], -) -> Optional[Union[bytes, BinaryIO]]: + file_obj: bytes | BinaryIO | SpooledTemporaryFile[bytes] | None, +) -> bytes | BinaryIO | None: if isinstance(file_obj, SpooledTemporaryFile): file_obj.seek(0) contents = file_obj.read() @@ -453,35 +443,35 @@ def spooled_to_bytes_io_if_needed( return file_obj -def convert_to_bytes( - file: Optional[Union[bytes, SpooledTemporaryFile, IO[bytes]]] = None, -) -> bytes: +def convert_to_bytes(file: bytes | IO[bytes]) -> bytes: + """Extract the bytes from `file` without preventing it from being read again later. + + As a convenience to simplify client code, also returns `file` unchanged if it is already bytes. + """ if isinstance(file, bytes): - f_bytes = file - elif isinstance(file, SpooledTemporaryFile): + return file + + if isinstance(file, SpooledTemporaryFile): file.seek(0) f_bytes = file.read() file.seek(0) - elif isinstance(file, BytesIO): - f_bytes = file.getvalue() - elif isinstance(file, (TextIOWrapper, BufferedReader)): + return f_bytes + + if isinstance(file, BytesIO): + return file.getvalue() + + if isinstance(file, (TextIOWrapper, BufferedReader)): with open(file.name, "rb") as f: - f_bytes = f.read() - else: - raise ValueError("Invalid file-like object type") + return f.read() - return f_bytes + raise ValueError("Invalid file-like object type") -def convert_ms_office_table_to_text( - table: Union["docxtable", "pptxtable"], - as_html: bool = True, -) -> str: - """ - Convert a table object from a Word document to an HTML table string using the tabulate library. +def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> str: + """Convert a PPTX table object to an HTML table string using the tabulate library. 
Args: - table (Table): A docx.table.Table object. + table (Table): A pptx.table.Table object. as_html (bool): Whether to return the table as an HTML string (True) or a plain text string (False) @@ -513,9 +503,7 @@ def contains_emoji(s: str) -> bool: return bool(emoji.emoji_count(s)) -def _get_page_image_metadata( - page: PageLayout, -) -> dict: +def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: """Retrieve image metadata and coordinate system from a page.""" image = getattr(page, "image", None) @@ -551,7 +539,7 @@ def document_to_element_list( detection_origin: Optional[str] = None, sort_mode: str = SORT_MODE_XY_CUT, languages: Optional[List[str]] = None, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Converts a DocumentLayout object to a list of unstructured elements.""" elements: List[Element] = [] @@ -565,7 +553,7 @@ def document_to_element_list( image_width = page_image_metadata.get("width") image_height = page_image_metadata.get("height") - translation_mapping: List[Tuple["LayoutElement", Element]] = [] + translation_mapping: list[tuple["LayoutElement", Element]] = [] for layout_element in page.elements: if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): coordinate_system = PixelSpace(width=image_width, height=image_height) @@ -610,7 +598,7 @@ def document_to_element_list( layout_element.image_path if hasattr(layout_element, "image_path") else None ) - _add_element_metadata( + add_element_metadata( element, page_number=i + 1, filetype=image_format, @@ -642,16 +630,16 @@ def document_to_element_list( def ocr_data_to_elements( ocr_data: List["LayoutElement"], - image_size: Tuple[Union[int, float], Union[int, float]], + image_size: tuple[int | float, int | float], common_metadata: Optional[ElementMetadata] = None, infer_list_items: bool = True, source_format: Optional[str] = None, -) -> List[Element]: +) -> list[Element]: """Convert OCR layout data into `unstructured` elements with associated metadata.""" 
image_width, image_height = image_size coordinate_system = PixelSpace(width=image_width, height=image_height) - elements = [] + elements: list[Element] = [] for layout_element in ocr_data: element = normalize_layout_element( layout_element, diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 4be6c1c9d2..0ac897d544 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -135,7 +135,7 @@ def default_hi_res_model() -> str: @add_chunking_strategy def partition_pdf( filename: str = "", - file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None, + file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None, include_page_breaks: bool = False, strategy: str = PartitionStrategy.AUTO, infer_table_structure: bool = False, @@ -151,7 +151,7 @@ def partition_pdf( extract_image_block_types: Optional[List[str]] = None, extract_image_block_output_dir: Optional[str] = None, extract_image_block_to_payload: bool = False, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Parses a pdf document into a list of interpreted elements. Parameters diff --git a/unstructured/utils.py b/unstructured/utils.py index 40d648aecb..9a50204b05 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -263,7 +263,7 @@ def validate_date_args(date: Optional[str] = None) -> bool: ) -def _first_and_remaining_iterator(it: Iterable[Any]) -> tuple[Any, Iterator[Any]]: +def _first_and_remaining_iterator(it: Iterable[_T]) -> Tuple[_T, Iterator[_T]]: iterator = iter(it) try: out = next(iterator) @@ -275,7 +275,7 @@ def _first_and_remaining_iterator(it: Iterable[Any]) -> tuple[Any, Iterator[Any] return out, iterator -def first(it: Iterable[Any]) -> Any: +def first(it: Iterable[_T]) -> _T: """Returns the first item from an iterable. Raises an error if the iterable is empty.""" out, _ = _first_and_remaining_iterator(it) return out