In [1]:
# Get test document paths
from pathlib import Path
# All sample inputs live in one fixtures directory; build each path from it.
test_data_dir = Path('../tests/data')
pdf_input_path = test_data_dir / 'wiring_bonding.pdf'
doc_input_path = test_data_dir / 'maintenance_procedure_template.docx'
ppt_input_path = test_data_dir / 'functional_flight_checks.pptx'
xls_input_path = test_data_dir / 'equipment_maintenance_schedule.xlsx'
# Directory passed to the converter as its output location.
output_dir = Path('..') / 'data' / 'output'

In [2]:
# Get LLM Client
from openai import OpenAI
from databricks.sdk import WorkspaceClient
# Create ONE Databricks workspace client; its resolved config supplies the
# workspace host and token used to build the OpenAI client below.
workspace_client = WorkspaceClient()
w = workspace_client  # short alias kept so any code referring to `w` still works

workspace_url = workspace_client.config.host
# NOTE(review): presumably `config.token` is only populated for token-based
# auth — confirm it is not None under the auth method in use.
token = workspace_client.config.token

# Model identifier for LLM requests.
llm_model = "shm_gpt_4o_mini"
# OpenAI client pointed at the workspace's serving-endpoints URL.
llm_client = OpenAI(
    api_key=token,
    base_url=f"{workspace_url}/serving-endpoints",
)

In [9]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption
from maud.document.converters import MAUDPipelineOptions, MaudConverter, MAUDPipeline
import pandas as pd

# Pipeline options shared by the converter and the PDF pipeline. The model name
# is taken from `llm_model` (defined alongside the LLM client) rather than
# repeating the string literal, so the model is configured in one place.
maud_pipeline_options = MAUDPipelineOptions(
    llm_client=llm_client,
    llm_model=llm_model,
    max_tokens=200,
    clf_client=llm_client,
    clf_model='dummy_clf',
)

# NOTE(review): the input is the .docx sample, but only a PDF format option is
# registered below — confirm this combination is intended.
converter = MaudConverter(
    input_path=doc_input_path,
    output_dir=output_dir,
    # Reuse the LLM settings from the pipeline options so both stay in sync.
    llm_client=maud_pipeline_options.llm_client,
    llm_model=maud_pipeline_options.llm_model,
    max_tokens=maud_pipeline_options.max_tokens,
    overwrite=False,
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=MAUDPipeline,
            pipeline_options=maud_pipeline_options,
        )
    }
)

# Run the conversion and keep the result for later cells.
document = converter.convert()

2025-02-24 15:26:53,793 - MaudConverter - INFO - Converting document


In [5]:
# Chunk the converted document, then show the chunks as a table.
chunks = converter.chunk()
chunks_table = pd.DataFrame(data=chunks)
chunks_table

Unnamed: 0,filename,input_hash,pages,doc_refs,has_table,has_picture,tables,pictures,headings,captions,chunk_type,text,enriched_text
0,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],"[#/texts/0, #/texts/1, #/texts/2, #/texts/3, #...",False,False,[],[],,,text,Single Shield Cable with Solder Sleeve P...,\n Headings:\n\n Caption...
1,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],"[#/texts/13, #/texts/15, #/texts/16, #/texts/1...",False,False,[],[],[WARNING],,text,Use only hot air gun M83521/5-01 or equivalent...,\n Headings:WARNING\n\n ...
2,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],[#/tables/0],True,False,[0],[],[WARNING],[The table lists several tools and equipment a...,text,"Crimp Tool, Name Part Number = AD-1377 or ...",\n Headings:WARNING\n\n ...
3,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],"[#/texts/21, #/texts/22]",False,False,[],[],[WARNING],,text,11\nThe table lists several tools and equipmen...,\n Headings:WARNING\n\n ...
4,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],"[#/pictures/0, #/texts/19, #/texts/20]",False,True,[],[0],[],[Table 5 -4 Tooling],picture,The image depicts a technical diagram highligh...,\n Captions:Table 5 -4 Toolin...
5,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],[#/tables/0],True,False,[0],[],[],[The table lists several tools and equipment a...,table,| Name Part Number | Name Part Number ...,\n Captions:The table lists sev...
6,wiring_bonding.pdf,2e760c98582dd50f45f97cb7cd8f7c4d,[1],"[#/texts/0, #/texts/1, #/texts/2, #/texts/3, #...",True,True,[0],[0],[WARNING],[The table lists several tools and equipment a...,page,The page outlines the procedure for preparing ...,Headings:WARNING\nPage Description:The page ou...
