In [0]:
%pip install pymupdf openai
%restart_python

## Ingest Files
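Download process and instrumentation diagrams. We take the MD5 hash of each document's source URL or file name and use the hex digest as its stored name, which gives every document a unique, fixed-length, filesystem-safe identifier.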

In [0]:
# import requests
# import hashlib

# url = "https://open.alberta.ca/dataset/46ddba1a-7b86-4d7c-b8b6-8fe33a60fada/resource/a82b9bc3-37a9-4447-8d2f-f5b55a5c3353/download/facilitydrawings.pdf"
# hashed_url = hashlib.md5(url.encode()).hexdigest()
# raw_vol_path = "/Volumes/shm/pid/raw_pdfs/"
# image_vol_path = "/Volumes/shm/pid/pdf_images/"

# raw_pdf_file_path = raw_vol_path + hashed_url + '.pdf'

# response = requests.get(url)
# with open(raw_pdf_file_path, 'wb') as file:
#     file.write(response.content)

In [0]:
# import requests
# import hashlib

# pdf_name = "ALB-701-PID-PR-005046-003_3DD0.1.pdf"
# hashed_name = hashlib.md5(pdf_name.encode()).hexdigest()
# raw_vol_path = "/Volumes/shm/pid/raw_pdfs/ALB/with_load_sheet/"
# image_vol_path = "/Volumes/shm/pid/pdf_images/ALB/with_load_sheet/"

# raw_pdf_file_path = raw_vol_path + pdf_name
# hashed_pdf_file_path = raw_vol_path + 'hashed/' + hashed_name + '.pdf'

# with open(raw_pdf_file_path, "rb") as src, open(hashed_pdf_file_path, "wb") as dst:
#     dst.write(src.read())

We currently use PyMuPDF to split each PDF into one PNG image per page. Its licensing (AGPL or a commercial license) is not ideal, and there are many alternative libraries, but it is the simplest option for now.

In [0]:
# import fitz  # PyMuPDF
# from pathlib import Path

# doc_dir = Path(image_vol_path) / hashed_url
# doc_dir.mkdir(exist_ok=True)

# doc = fitz.open(raw_pdf_file_path)
# for page_num in range(len(doc)):
#     page = doc.load_page(page_num)
#     pix = page.get_pixmap(dpi=200)
#     pix.save(doc_dir / f"page_{page_num+1}.png")

In [0]:
# import fitz  # PyMuPDF
# from pathlib import Path

# doc_dir = Path(image_vol_path) / hashed_name
# doc_dir.mkdir(exist_ok=True)

# doc = fitz.open(raw_pdf_file_path)
# for page_num in range(len(doc)):
#     page = doc.load_page(page_num)
#     pix = page.get_pixmap(dpi=200)
#     pix.save(doc_dir / f"page_{page_num+1}.png")

In [0]:
import hashlib
import shutil
from pathlib import Path
import fitz  # PyMuPDF

# Source PDFs, their hash-named copies, and the root folder for rendered page images.
raw_vol_path   = Path("/Volumes/shm/pid/raw_pdfs/ALB/with_load_sheet/")
hashed_vol_path= raw_vol_path / "hashed"
image_vol_path = Path("/Volumes/shm/pid/pdf_images/ALB/with_load_sheet/")

# Create the output folders up front; both calls are no-ops when they already exist.
hashed_vol_path.mkdir(parents=True, exist_ok=True)
image_vol_path.mkdir(parents=True, exist_ok=True)

# Process every PDF directly inside raw_vol_path. The glob is not recursive,
# so the copies written to the "hashed" subfolder are not picked up again.
for pdf_path in raw_vol_path.glob("*.pdf"):
    pdf_name    = pdf_path.name
    # The MD5 hex digest of the file name is used as the document's identifier.
    hashed_name = hashlib.md5(pdf_name.encode()).hexdigest()

    # Copy the original PDF into the "hashed" subfolder under its MD5 name
    target_pdf = hashed_vol_path / f"{hashed_name}.pdf"
    shutil.copy2(pdf_path, target_pdf)

    # One image folder per document, named by the same digest
    doc_img_dir = image_vol_path / hashed_name
    doc_img_dir.mkdir(exist_ok=True)

    # Render each page to a PNG at 200 DPI. The context manager closes the
    # document once its pages are written, so handles are not leaked as the
    # loop moves on to the next PDF.
    with fitz.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=200)
            out_path = doc_img_dir / f"page_{page_num}.png"
            pix.save(out_path)

## Classify Pages
Now that we have individual page images parsed out, we can begin scaled classification and parsing. First, we want to do some basic classification of what type of document each page is. This could later be scaled up to drive dynamic prompting and, possibly, few-shot prompting.

In [0]:
# `doc_dir` is only assigned inside commented-out cells, so on a fresh kernel
# it would be undefined here. Select one rendered document folder explicitly
# from the image root instead of relying on leftover kernel state.
doc_dirs = sorted(p for p in image_vol_path.iterdir() if p.is_dir())
if not doc_dirs:
    raise FileNotFoundError(f"No rendered page folders found under {image_vol_path}")
doc_dir = doc_dirs[0]

# List the page images in a deterministic order. The loop variable keeps the
# name `doc` because later code reads it after the loop finishes.
for doc in sorted(doc_dir.glob("*.png")):
    print(doc.name)

In [0]:
import base64
# Read the file that `doc` refers to and base64-encode its bytes; the encoded
# bytes are converted to a str so they can be embedded in a text payload.
image_data = str(base64.b64encode(doc.read_bytes()), "utf-8")

In [0]:
from openai import OpenAI

# Use the notebook's own API token to authenticate against the serving endpoint.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# OpenAI-compatible client pointed at the workspace's serving endpoints.
client = OpenAI(
  api_key=DATABRICKS_TOKEN,
  base_url="https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints"
)

# Send one page image together with the extraction instructions. The prompt
# asks for a JSON-formatted answer covering title block, comments, revisions,
# equipment/line tags, a page classification, and anything left uncaptured.
chat_completion = client.chat.completions.create(
  messages=[
    {
      "role": "system",
      "content": "You are an AI assistant that can extract and analyze text from images."
    },
    {
      "role": "user",
      "content": [
        {"type": "text", "text": """
          Extract the following information from this image: 
          - all title block information
          - all comments
          - all revisions with revision, the date in YYYYMMDD format, and description from the revision table
          - all equipment tags (e.g. US-05-5)
          - all line tags (e.g 2"-SL-242XX6-SAB). 
          - image classification, one of process diagram, text or electrical diagram
          
          Provide the response in a json format. Do not interpret, translate, or clarify any aspects of the image, only parse the text and symbols that are there.
          
          First, extract title block and comments. Before providing the response, reflect on your response and make sure it is correct and complete. If there is any text, symbology, or diagram that hasn't been captured, add it to the uncaptured text. 

          Example:
          {
            title_block:
            comments:
            revision_history:
            equipment_tags:
            line_tags:
            classification:
            uncaptured:
          }
         
        """},
        # The page images are written and listed as .png files, so the data URL
        # must declare image/png; a media type that does not match the bytes
        # can be rejected or mis-decoded by the model endpoint.
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}}
      ]
    }
  ],
  model="databricks-claude-sonnet-4"
)

# The model's reply is plain text (expected to contain the JSON described above).
parsed_text = chat_completion.choices[0].message.content
print(parsed_text)