diff --git a/CHANGELOG.md b/CHANGELOG.md index d3d2718e48..8f71ead139 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.8-dev2 +## 0.15.8-dev3 ### Enhancements @@ -8,6 +8,7 @@ * **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data. * **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text. +* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as an MSG file. 
## 0.15.6 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index deed06dff1..8c363838c3 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -14,6 +14,7 @@ LogCaptureFixture, Mock, example_doc_path, + function_mock, patch, property_mock, ) @@ -52,7 +53,6 @@ (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), - (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), @@ -111,7 +111,6 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), - (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), @@ -193,7 +192,6 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c (FileType.DOC, "simple.doc"), (FileType.PPT, "fake-power-point.ppt"), (FileType.XLS, "tests-example.xls"), - (FileType.MSG, "fake-email-multiple-attachments.msg"), ], ) @pytest.mark.parametrize( @@ -212,7 +210,7 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ """Fixes wrong XLS asserted as DOC, PPT, etc. Asserted content-type can be anything except `None` and differentiator will fix it if the file - is DOC, PPT, XLS, or MSG type. + is DOC, PPT, or XLS type. 
""" # -- disable strategies 2 & 3, content-type strategy should get this on its own -- ctx_mime_type_.return_value = None @@ -254,7 +252,6 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), (FileType.MD, "README.md", "text/x-markdown"), - (FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"), (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), @@ -314,7 +311,6 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec (FileType.HTML, "ideas-page.html"), (FileType.JPG, "img/example.jpg"), (FileType.JSON, "spring-weather.html.json"), - (FileType.MSG, "fake-email.msg"), (FileType.ODT, "simple.odt"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), @@ -357,7 +353,6 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_ (FileType.DOC, "simple.doc"), (FileType.PPT, "fake-power-point.ppt"), (FileType.XLS, "tests-example.xls"), - (FileType.MSG, "fake-email-multiple-attachments.msg"), ], ) @pytest.mark.parametrize( @@ -535,6 +530,21 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( # ================================================================================================ +@pytest.mark.parametrize( + ("metadata_file_path", "expected_value"), + [ + ("fake-email.msg", FileType.MSG), + ("fake-email.msg.outlook", FileType.UNK), + ], +) +def test_it_can_only_detect_MSG_format_by_extension( + metadata_file_path: str, expected_value: FileType +): + with open(example_doc_path("fake-email.msg"), "rb") as f: + file = io.BytesIO(f.read()) + assert detect_filetype(file=file, metadata_file_path=metadata_file_path) == expected_value + + @pytest.mark.parametrize("mime_type", 
["application/xml", "text/xml"]) @pytest.mark.parametrize("extension", [".html", ".htm"]) def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( @@ -1018,11 +1028,11 @@ def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_t ("simple.doc", FileType.DOC), ("fake-power-point.ppt", FileType.PPT), ("tests-example.xls", FileType.XLS), - ("fake-email.msg", FileType.MSG), + ("fake-email.msg", None), ("README.org", None), ], ) - def it_distinguishes_the_file_type_of_applicable_zip_files( + def it_distinguishes_the_file_type_of_applicable_OLE_files( self, file_name: str, expected_value: FileType | None ): # -- no file-name available, just to make sure we're not relying on an extension -- @@ -1033,6 +1043,27 @@ def it_distinguishes_the_file_type_of_applicable_zip_files( assert differentiator.file_type is expected_value + def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime( + self, guess_mime_: Mock + ): + guess_mime_.return_value = None + # -- no file-name available, just to make sure we're not relying on an extension -- + with open(example_doc_path("fake-email.msg"), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file) + differentiator = _OleFileDifferentiator(ctx) + + file_type = differentiator.file_type + + guess_mime_.assert_called_once_with(file) + assert file_type is None + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def guess_mime_(self, request: FixtureRequest): + return function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime") + class Describe_TextFileDifferentiator: """Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`.""" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 0ca79ff041..7b909373c9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.8-dev2" # 
pragma: no cover +__version__ = "0.15.8-dev3" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index c5bd11509e..58f8bdbfa2 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -476,13 +476,14 @@ def file_type(self) -> FileType | None: if not self._is_ole_file(self._ctx): return None - # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so rely on it to - # -- differentiate those. Note it doesn't detect MSG type though, so we assume any OLE file - # -- that is not a legacy MS-Office type to be a MSG file. + # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it + # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always + # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we + # -- rely on filename-extension to identify those. with self._ctx.open() as file: mime_type = ft.guess_mime(file) - return FileType.from_mime_type(mime_type or "application/vnd.ms-outlook") + return FileType.from_mime_type(mime_type) if mime_type else None @staticmethod def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool: