In [None]:
pip install azure-ai-documentintelligence

In [None]:
import io, os, json, tempfile, subprocess
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.core.serialization import AzureJSONEncoder

In [None]:
# ── 1. Client + paths ────────────────────────────────────────────
# Connection settings for Azure AI Document Intelligence (fill in before running).
endpoint = ""
key      = ""

# Folder the .doc files are read from, and folder the JSON results are written to.
input_folder  = ""
output_folder = ""

client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# ── 2. Spark-read the DOCs ───────────────────────────────────────
# Load every "*.doc" under input_folder in binaryFile format; the rows are
# consumed later through their `path` and `content` fields.
doc_files = spark.read.load(
    input_folder,
    format="binaryFile",
    pathGlobFilter="*.doc",
)
 
# ── Helper: convert DOC → PDF bytes ──────────────────────────────
 
def doc_to_pdf_bytes(doc_bytes: bytes) -> bytes:
    """Convert a legacy Word .doc binary to PDF bytes using LibreOffice.

    The bytes are written to a scratch directory, converted by running
    ``soffice --headless --convert-to pdf`` on that file, and the generated
    PDF is read back. The scratch directory is deleted on return.

    Args:
        doc_bytes: Raw contents of the .doc file.

    Returns:
        The contents of the PDF file produced by ``soffice``.

    Raises:
        subprocess.CalledProcessError: ``soffice`` exited with a non-zero
            status.
        FileNotFoundError: the ``soffice`` executable could not be found, or
            it exited successfully without leaving the expected PDF behind.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # The scratch directory is unique to this call, so fixed file names
        # cannot clash. Building both paths with os.path.join (instead of a
        # string replace of ".doc" on the whole path) keeps the PDF path
        # correct even when the temp directory itself contains ".doc".
        src_path = os.path.join(tmpdir, "input.doc")
        # The converted file is expected under --outdir with the source's
        # base name and a .pdf extension.
        pdf_path = os.path.join(tmpdir, "input.pdf")

        # Write the incoming bytes where soffice can read them.
        with open(src_path, "wb") as f:
            f.write(doc_bytes)

        # Run the headless LibreOffice converter; check=True turns a non-zero
        # exit status into CalledProcessError. Converter output is discarded.
        subprocess.run(
            [
                "soffice", "--headless",
                "--convert-to", "pdf",
                "--outdir", tmpdir,
                src_path,
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Read the resulting PDF back before the scratch directory is removed.
        with open(pdf_path, "rb") as f:
            return f.read()
 
# ── 3. Process each DOC (converted to PDF) ───────────────────────
 
# Iterate with toLocalIterator() rather than collect(): rows are fetched a
# partition at a time, so the driver never has to hold the binary content of
# every document simultaneously.
for row in doc_files.toLocalIterator():
    doc_path  = row.path
    doc_bytes = row.content
    print(f"\nProcessing: {doc_path}")

    # Convert the .doc to PDF. A document whose conversion fails is reported
    # and skipped so the remaining documents are still processed.
    try:
        pdf_bytes = doc_to_pdf_bytes(doc_bytes)
    except subprocess.CalledProcessError as e:
        print(f"✗ Conversion failed for {doc_path}: {e}")
        continue

    # Analyse the converted PDF with the prebuilt layout model and wait for
    # the long-running operation to finish.
    poller = client.begin_analyze_document(
        model_id="prebuilt-layout",
        body=io.BytesIO(pdf_bytes),
    )
    result = poller.result()

    # ── Build output dict ────────────────────────────────────────
    # `styles`, `tables`, `lines` and `bounding_regions` are optional on the
    # SDK result models and may be None; `or []` treats a missing collection
    # as empty instead of raising TypeError.
    out = {
        "file"  : os.path.basename(doc_path),
        "styles": [
            "handwritten" if s.is_handwritten else "no handwritten"
            for s in (result.styles or [])
        ],
        "pages" : [],
    }

    # Maps a page number to that page's position in out["pages"] so tables
    # can be attached to the page they start on.
    page_index_map = {}
    for page in result.pages:
        page_index_map[page.page_number] = len(out["pages"])
        out["pages"].append({
            "page_number": page.page_number,
            "lines"      : [ln.content for ln in (page.lines or [])],
            "tables"     : [],
        })

    for tbl_idx, tbl in enumerate(result.tables or []):
        # A table without any bounding region has no known start page and is
        # filed under "orphan_tables" below.
        regions = tbl.bounding_regions or []
        start_page = regions[0].page_number if regions else None

        # Row 0 supplies the column names. If a header text repeats, suffix
        # it with the column index: otherwise both columns would share one
        # dict key and the later cell would silently overwrite the earlier.
        headers = {}
        used_names = set()
        for c in tbl.cells:
            if c.row_index != 0:
                continue
            name = c.content
            while name in used_names:
                name = f"{name}_{c.column_index}"
            used_names.add(name)
            headers[c.column_index] = name

        # Every other row becomes a {column name: cell content} dict; columns
        # without a header cell fall back to "col_<index>".
        row_dicts = {}
        for c in tbl.cells:
            if c.row_index == 0:
                continue
            col_name = headers.get(c.column_index, f"col_{c.column_index}")
            row_dicts.setdefault(c.row_index, {})[col_name] = c.content

        table_dict = {
            "table_index" : tbl_idx,
            "row_count"   : tbl.row_count,
            "column_count": tbl.column_count,
            "rows"        : [row_dicts[r] for r in sorted(row_dicts)],
        }

        # Attach the table to its start page, or to a top-level
        # "orphan_tables" list when that page is not in the result.
        if start_page in page_index_map:
            target = out["pages"][page_index_map[start_page]]["tables"]
        else:
            target = out.setdefault("orphan_tables", [])
        target.append(table_dict)

    # ── 4. Serialise & save to ADLS ──────────────────────────────
    json_str = json.dumps(out, indent=4, cls=AzureJSONEncoder)

    # Output file keeps the document's base name with a .json extension.
    json_name = os.path.splitext(os.path.basename(doc_path))[0] + ".json"
    output_path = os.path.join(output_folder, json_name)

    mssparkutils.fs.put(output_path, json_str, overwrite=True)
    print(f"✓ Saved JSON → {output_path}")

print("\nAll DOCs processed and subsets stored.")
 
 