Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.15.8-dev2
## 0.15.8-dev3

### Enhancements

Expand All @@ -8,6 +8,7 @@

* **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data.
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.
* **Fall back to filename-extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that `filetype` could not detect as such was incorrectly identified as an MSG file.

## 0.15.6

Expand Down
49 changes: 40 additions & 9 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
LogCaptureFixture,
Mock,
example_doc_path,
function_mock,
patch,
property_mock,
)
Expand Down Expand Up @@ -52,7 +53,6 @@
(FileType.JPG, "img/example.jpg", "image/jpeg"),
(FileType.JSON, "spring-weather.html.json", "application/json"),
(FileType.MD, "README.md", "text/markdown"),
(FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"),
(FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
(FileType.ORG, "README.org", "text/org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
Expand Down Expand Up @@ -111,7 +111,6 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
(FileType.JPG, "img/example.jpg", "image/jpeg"),
(FileType.JSON, "spring-weather.html.json", "application/json"),
(FileType.MD, "README.md", "text/markdown"),
(FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"),
(FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
(FileType.ORG, "README.org", "text/org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
Expand Down Expand Up @@ -193,7 +192,6 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
(FileType.MSG, "fake-email-multiple-attachments.msg"),
],
)
@pytest.mark.parametrize(
Expand All @@ -212,7 +210,7 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
"""Fixes wrong XLS asserted as DOC, PPT, etc.

Asserted content-type can be anything except `None` and differentiator will fix it if the file
is DOC, PPT, XLS, or MSG type.
is DOC, PPT, or XLS type.
"""
# -- disable strategies 2 & 3, content-type strategy should get this on its own --
ctx_mime_type_.return_value = None
Expand Down Expand Up @@ -254,7 +252,6 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
(FileType.JSON, "spring-weather.html.json", "application/json"),
(FileType.MD, "README.md", "text/markdown"),
(FileType.MD, "README.md", "text/x-markdown"),
(FileType.MSG, "fake-email.msg", "application/vnd.ms-outlook"),
(FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
(FileType.ORG, "README.org", "text/org"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
Expand Down Expand Up @@ -314,7 +311,6 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
(FileType.HTML, "ideas-page.html"),
(FileType.JPG, "img/example.jpg"),
(FileType.JSON, "spring-weather.html.json"),
(FileType.MSG, "fake-email.msg"),
(FileType.ODT, "simple.odt"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
(FileType.PNG, "img/DA-1p.png"),
Expand Down Expand Up @@ -357,7 +353,6 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
(FileType.MSG, "fake-email-multiple-attachments.msg"),
],
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -535,6 +530,21 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
# ================================================================================================


@pytest.mark.parametrize(
    ("metadata_file_path", "expected_value"),
    [
        ("fake-email.msg", FileType.MSG),
        ("fake-email.msg.outlook", FileType.UNK),
    ],
)
def test_it_can_only_detect_MSG_format_by_extension(
    metadata_file_path: str, expected_value: FileType
):
    """The MSG example is reported as MSG only when the asserted file-name ends in `.msg`.

    Both cases feed the same `fake-email.msg` bytes to `detect_filetype()`; the only thing that
    varies is the `metadata_file_path` argument.
    """
    # -- read the payload into memory so the file-like object carries no name of its own --
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        msg_bytes = msg_file.read()

    detected = detect_filetype(
        file=io.BytesIO(msg_bytes), metadata_file_path=metadata_file_path
    )

    assert detected == expected_value


@pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"])
@pytest.mark.parametrize("extension", [".html", ".htm"])
def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension(
Expand Down Expand Up @@ -1018,11 +1028,11 @@ def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_t
("simple.doc", FileType.DOC),
("fake-power-point.ppt", FileType.PPT),
("tests-example.xls", FileType.XLS),
("fake-email.msg", FileType.MSG),
("fake-email.msg", None),
("README.org", None),
],
)
def it_distinguishes_the_file_type_of_applicable_zip_files(
def it_distinguishes_the_file_type_of_applicable_OLE_files(
self, file_name: str, expected_value: FileType | None
):
# -- no file-name available, just to make sure we're not relying on an extension --
Expand All @@ -1033,6 +1043,27 @@ def it_distinguishes_the_file_type_of_applicable_zip_files(

assert differentiator.file_type is expected_value

def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime(
    self, guess_mime_: Mock
):
    """`.file_type` is `None` for the example MSG file when `ft.guess_mime()` returns `None`."""
    # -- simulate `filetype` being unable to produce a MIME-type for the payload --
    guess_mime_.return_value = None
    # -- an in-memory stream has no file-name, so no extension can influence the result --
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        stream = io.BytesIO(msg_file.read())

    detected = _OleFileDifferentiator(_FileTypeDetectionContext(file=stream)).file_type

    guess_mime_.assert_called_once_with(stream)
    assert detected is None

# -- fixtures --------------------------------------------------------------------------------

@pytest.fixture
def guess_mime_(self, request: FixtureRequest):
    """Result of `function_mock()` targeting `ft.guess_mime` as referenced by `filetype`."""
    guess_mime_mock = function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime")
    return guess_mime_mock


class Describe_TextFileDifferentiator:
"""Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`."""
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.8-dev2" # pragma: no cover
__version__ = "0.15.8-dev3" # pragma: no cover
9 changes: 5 additions & 4 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,13 +476,14 @@ def file_type(self) -> FileType | None:
if not self._is_ole_file(self._ctx):
return None

# -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so rely on it to
# -- differentiate those. Note it doesn't detect MSG type though, so we assume any OLE file
# -- that is not a legacy MS-Office type to be a MSG file.
# -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it
# -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always
# -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we
# -- rely on filename-extension to identify those.
with self._ctx.open() as file:
mime_type = ft.guess_mime(file)

return FileType.from_mime_type(mime_type or "application/vnd.ms-outlook")
return FileType.from_mime_type(mime_type) if mime_type else None

@staticmethod
def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
Expand Down