Unstructured-IO · ds-filipknefel · Feb 14, 2024 · Feb 5, 2024 · Feb 6, 2024 · Feb 7, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@
 * **Add .heic file partitioning** .heic image files were previously unsupported and are now supported through partition_image()
 * **Add the ability to specify an alternate OCR** implementation by implementing an `OCRAgent` interface and specify it using `OCR_AGENT` environment variable.
 * **Add Vectara destination connector** Adds support for writing partitioned documents into a Vectara index.
+* **Add ability to detect text in .docx inline shapes** Extends docx partitioning to extract text from inline shapes and include it in the containing paragraph's text.
 
 ### Fixes
 
@@ -37,6 +38,7 @@
 * **Add title to Vectara upload - was not separated out from initial connector**
 * **Fix change OpenSearch port to fix potential conflict with Elasticsearch in ingest test**
 
+
 ## 0.12.3
 
 ### Enhancements

diff --git a/example-docs/docx-shapes.docx b/example-docs/docx-shapes.docx
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -764,6 +764,20 @@ def test_partition_docx_includes_hyperlink_metadata():
     assert metadata.link_urls is None
 
 
+# -- shape behaviors -----------------------------------------------------------------------------
+
+
+def test_it_considers_text_inside_shapes():
+    """Inline-shape text appears in its paragraph's element text; floating-shape text does not."""
+    # -- <bracketed> text is written inside inline shapes --
+    partitioned_doc = partition_docx(example_doc_path("docx-shapes.docx"))
+    assert [element.text for element in partitioned_doc] == [
+        "Paragraph with single <inline-image> within.",
+        "Paragraph with <inline-image1> and <inline-image2> within.",
+        # -- text "<floating-shape>" in floating shape is ignored --
+        "Paragraph with floating shape attached.",
+    ]
+
+
 # -- module-level fixtures -----------------------------------------------------------------------
 
 

diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -330,7 +330,12 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
         does not contribute to the document-element stream and will not cause an element to be
         emitted.
         """
-        text = paragraph.text
+        text = "".join(
+            e.text
+            for e in paragraph._p.xpath(
+                "w:r | w:hyperlink | w:r/descendant::wp:inline[ancestor::w:drawing][1]//w:r"
+            )
+        )
 
         # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and
         # do not contribute to the document-element stream.