In [None]:
pip install -q azure-ai-documentintelligence

In [None]:
import io, os, json
 
from azure.ai.documentintelligence import DocumentIntelligenceClient
 
from azure.core.credentials import AzureKeyCredential
 
from azure.core.serialization import AzureJSONEncoder


In [None]:
 
# ─── 1. Client + paths ──────────────────────────────────────────
# Settings to fill in before running this cell:
#   endpoint / key  – passed to the Document Intelligence client below
#   input_folder    – folder Spark scans for *.docx files
#   output_folder   – folder the per-document JSON files are written to
endpoint = ""
key = ""
input_folder = ""
output_folder = ""

client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# ─── 2. Read DOCX files with Spark binaryFile ──────────────────
# The reader's format and options are given as keyword arguments to load();
# only files matching *.docx under input_folder are picked up.
docx_files = spark.read.load(
    input_folder,
    format="binaryFile",
    pathGlobFilter="*.docx",
)
 
# ─── 3. Process each DOCX ──────────────────────────────────────

def _table_to_dict(tbl_idx, tbl):
    """Convert one analysed table into a JSON-friendly dict.

    Row 0 supplies the column names; every later row becomes a
    ``{column name: cell text}`` dict, emitted in ascending row order.

    Args:
        tbl_idx: Position of the table in the analysis result.
        tbl: Table object exposing ``cells``, ``row_count`` and
            ``column_count``.

    Returns:
        Dict with ``table_index``, ``row_count``, ``column_count`` and ``rows``.
    """
    cells = tbl.cells or []

    # Column names must be non-empty and unique; otherwise values from
    # different columns would overwrite each other inside a row dict.
    headers = {}
    taken = set()
    for cell in cells:
        if cell.row_index != 0:
            continue
        name = cell.content
        if not name or not name.strip():
            name = f"col_{cell.column_index}"
        while name in taken:
            name = f"{name}_{cell.column_index}"
        taken.add(name)
        headers[cell.column_index] = name

    rows = {}
    for cell in cells:
        if cell.row_index == 0:
            continue
        # Columns without a header cell fall back to a positional name.
        col_name = headers.get(cell.column_index, f"col_{cell.column_index}")
        rows.setdefault(cell.row_index, {})[col_name] = cell.content

    return {
        "table_index": tbl_idx,
        "row_count": tbl.row_count,
        "column_count": tbl.column_count,
        "rows": [rows[r] for r in sorted(rows)],
    }


def _build_output(doc_path, result):
    """Build the JSON subset (styles, page lines, tables) for one document.

    ``styles``, ``tables``, ``lines`` and ``bounding_regions`` are treated as
    optional: a missing collection is handled as empty instead of raising.

    Args:
        doc_path: Source path of the document; only its base name is stored.
        result: Analysis result returned by the layout model.

    Returns:
        Dict with ``file``, ``styles`` and ``pages`` keys, plus
        ``orphan_tables`` when a table cannot be matched to a page.
    """
    out = {
        "file": os.path.basename(doc_path),
        "styles": [
            "handwritten" if s.is_handwritten else "no handwritten"
            for s in (result.styles or [])
        ],
        "pages": [],          # page dicts, each with its tables attached
    }

    # A) Collect every page (lines only for now)
    page_index_map = {}
    for page in result.pages or []:
        page_index_map[page.page_number] = len(out["pages"])
        out["pages"].append({
            "page_number": page.page_number,
            "lines": [ln.content for ln in (page.lines or [])],
            "tables": [],     # filled in step B
        })

    # B) Convert tables ➜ attach to the page where each table starts
    for tbl_idx, tbl in enumerate(result.tables or []):
        regions = tbl.bounding_regions
        # No region info means no known page: the table becomes an orphan.
        start_page = regions[0].page_number if regions else None
        table_dict = _table_to_dict(tbl_idx, tbl)

        if start_page in page_index_map:
            out["pages"][page_index_map[start_page]]["tables"].append(table_dict)
        else:
            out.setdefault("orphan_tables", []).append(table_dict)

    return out


# Stream rows to the driver one partition at a time instead of collect()-ing
# the binary content of every file into driver memory at once.
for row in docx_files.toLocalIterator():
    doc_path = row.path
    doc_bytes = row.content

    print(f"\nProcessing: {doc_path}")

    # Analyse layout (Word is natively supported by prebuilt-layout)
    poller = client.begin_analyze_document(
        model_id="prebuilt-layout",
        body=io.BytesIO(doc_bytes),
    )
    result = poller.result()

    out = _build_output(doc_path, result)

    # ─── 4. Write JSON back to ADLS ─────────────────────────────
    json_str = json.dumps(out, indent=4, cls=AzureJSONEncoder)
    json_name = os.path.splitext(os.path.basename(doc_path))[0] + ".json"
    output_path = os.path.join(output_folder, json_name)   # abfss:// …

    mssparkutils.fs.put(output_path, json_str, overwrite=True)

    print(f"✓ Saved subset JSON → {output_path}")

print("\nAll DOCX files processed and subsets stored.")
 
 
 