From 4da323a2b52fb6e146c62c31bed191eece779f5c Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 20 Aug 2025 08:59:13 -0500 Subject: [PATCH 1/2] change warning to debug for short text default to english log --- unstructured/partition/common/lang.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/common/lang.py b/unstructured/partition/common/lang.py index 2f966725ed..31fee82877 100644 --- a/unstructured/partition/common/lang.py +++ b/unstructured/partition/common/lang.py @@ -403,7 +403,7 @@ def detect_languages( # If text contains special characters (like ñ, å, or Korean/Mandarin/etc.) it will NOT default # to English. It will default to English if text is only ascii characters and is short. if re.match(r"^[\x00-\x7F]+$", text) and len(text.split()) < 5: - logger.warning(f'short text: "{text}". Defaulting to English.') + logger.debug(f'short text: "{text}". Defaulting to English.') return ["eng"] # set seed for deterministic langdetect outputs From 5ac8fd8066c254422c2300aded671e09b26197b2 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 20 Aug 2025 09:04:31 -0500 Subject: [PATCH 2/2] update changelog and bump version --- CHANGELOG.md | 12 +++++++++++- unstructured/__version__.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37a1172ae9..03080c415f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.18.14-dev0 + +### Enhancements + +### Features + +### Fixes + +- **Change short text language detection log to debug** The "short text ... Defaulting to English" message emitted by `detect_languages` is now logged at debug level instead of warning level, which reduces log spam when partitioning many short ASCII-only texts. + ## 0.18.13 ### Enhancements @@ -6,7 +16,7 @@ ### Fixes -- **Parse a wider variety of date formats in email headers** The `partition_email` function is now more robust to non-standard date formats, including ISO-8601 dates with "Z" suffixes. This prevents `ValueError` exceptions when partitioning emails with these date formats. 
+- **Parse a wider variety of date formats in email headers** The `partition_email` function is now more robust to non-standard date formats, including ISO-8601 dates with "Z" suffixes. This prevents `ValueError` exceptions when partitioning emails with these date formats. ## 0.18.12 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7774420d99..4df44ced66 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.13" # pragma: no cover +__version__ = "0.18.14-dev0" # pragma: no cover