In [None]:
pip install -q azure-ai-documentintelligence
 

In [None]:
import io, os, json
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.core.serialization import AzureJSONEncoder
 
# ── 1. Paths + client ────────────────────────────────────────────
# Fill these in before running: the folder holding the source PDFs and
# the folder that receives one JSON file per PDF.
input_folder, output_folder = "", ""

# Document Intelligence resource endpoint and its API key.
endpoint, key = "", ""

client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

# ── 2. Read PDFs via Spark binaryFile ────────────────────────────
# Only files whose name matches *.pdf under input_folder are loaded;
# format and reader options are passed straight to load().
pdf_files = spark.read.load(
    input_folder,
    format="binaryFile",
    pathGlobFilter="*.pdf",
)
 
# ── 3. Process each PDF ──────────────────────────────────────────
def _table_to_dict(tbl_idx, tbl):
    """Convert one analysed table into a JSON-friendly dict.

    Row 0 is treated as the header row; every later row becomes a
    ``{header: cell content}`` dict, ordered by row index.

    Columns without a usable header (no header cell at that column index,
    or a header cell whose content is empty) are keyed
    ``col_<column_index>`` so that several blank headers cannot overwrite
    each other inside a row.
    """
    # ─ headers ─
    headers = {
        c.column_index: c.content
        for c in tbl.cells
        if c.row_index == 0
    }

    # ─ rows as key/value dicts ─
    row_dicts = {}
    for c in tbl.cells:
        if c.row_index == 0:
            continue
        column_name = headers.get(c.column_index) or f"col_{c.column_index}"
        row_dicts.setdefault(c.row_index, {})[column_name] = c.content

    return {
        "table_index" : tbl_idx,
        "row_count"   : tbl.row_count,
        "column_count": tbl.column_count,
        "rows"        : [row_dicts[r] for r in sorted(row_dicts)]
    }


# toLocalIterator() yields the same rows in the same order as collect(),
# but fetches them partition by partition instead of holding the binary
# content of every PDF in driver memory at once.
for row in pdf_files.toLocalIterator():
    pdf_path  = row.path
    pdf_bytes = row.content
    print(f"\nProcessing: {pdf_path}")

    # Analyse with prebuilt‑layout (blocks until the service finishes)
    poller = client.begin_analyze_document(
        model_id="prebuilt-layout",
        body=io.BytesIO(pdf_bytes)
    )
    result = poller.result()

    # ── Build the output dict ────────────────────────────────────
    # NOTE: styles / tables / lines / bounding_regions are treated as
    # possibly None below (the SDK models declare them optional), so a
    # document without them yields empty lists instead of a TypeError.
    out = {
        "file"  : os.path.basename(pdf_path),
        "styles": [
            "handwritten" if s.is_handwritten else "no handwritten"
            for s in (result.styles or [])
        ],
        "pages" : []          # will hold page dicts (with tables attached)
    }

    # ------------------------------------------------------------
    # A)  First create every page dict (lines + empty tables list)
    # ------------------------------------------------------------
    page_index_map = {}       # page_number → index in out["pages"]

    for page in (result.pages or []):
        page_index_map[page.page_number] = len(out["pages"])
        out["pages"].append({
            "page_number": page.page_number,
            "lines"      : [ln.content for ln in (page.lines or [])],
            "tables"     : []      # filled in step B
        })

    # ------------------------------------------------------------
    # B)  Walk through tables, convert to header→value rows,
    #     then attach each table to the correct page
    # ------------------------------------------------------------
    for tbl_idx, tbl in enumerate(result.tables or []):
        table_dict = _table_to_dict(tbl_idx, tbl)

        # The first bounding region tells us the page the table starts
        # on; a table without any region cannot be placed on a page.
        regions    = tbl.bounding_regions or []
        start_page = regions[0].page_number if regions else None

        # ─ attach to page ─
        if start_page in page_index_map:
            out["pages"][page_index_map[start_page]]["tables"].append(table_dict)
        else:
            # Unknown or missing page: keep the table rather than drop it
            out.setdefault("orphan_tables", []).append(table_dict)

    # ── 4. Serialise & save to ADLS ─────────────────────────────
    json_str    = json.dumps(out, indent=4, cls=AzureJSONEncoder)
    json_name   = os.path.splitext(os.path.basename(pdf_path))[0] + ".json"
    output_path = os.path.join(output_folder, json_name)   # abfss:// …

    # One JSON per PDF, named after the PDF; an existing file is replaced
    mssparkutils.fs.put(output_path, json_str, overwrite=True)
    print(f"✓ Saved subset JSON → {output_path}")

print("\nAll PDFs processed and subsets stored.")
 