In [0]:
%pip install OpenAI

Most companies just want an easy way to analyze scanned documents. Multimodal models unlock this: sure, they are expensive to run, but they are really efficient to 'program'.

In [0]:
import base64
import io
import mimetypes
import os
import warnings

from openai import OpenAI
from PIL import Image

# Send one scanned document image to a multimodal model served behind the
# Databricks OpenAI-compatible endpoint and keep the model's text answer in
# `parsed_text` for the cells that follow.

# API token read from the notebook context, so no credential is written into
# the notebook itself.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# NOTE(review): the workspace URL is hard-coded, so this cell only works
# against this one workspace -- confirm whether it should be configurable.
SERVING_ENDPOINTS_URL = "https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints"
VISION_MODEL = "databricks-claude-3-7-sonnet"
MAX_OUTPUT_TOKENS = 512

# Path to image file in Databricks volume
image_path = "/Volumes/shm/multimodal/docs_bronze/land_title.png"

# Declare the MIME type that matches the file extension (the file is a PNG,
# so labelling it image/jpeg was wrong). If the extension is unknown, fall
# back to the previously hard-coded image/jpeg.
image_mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

# Read and encode the image
with open(image_path, "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode("utf-8")

client = OpenAI(
    api_key=DATABRICKS_TOKEN,
    base_url=SERVING_ENDPOINTS_URL,
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant that can extract and analyze text from images."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract all text from this image and summarize what it contains."},
                # The image travels inline as a base64 data URL.
                {"type": "image_url", "image_url": {"url": f"data:{image_mime_type};base64,{image_data}"}}
            ]
        }
    ],
    model=VISION_MODEL,
    max_tokens=MAX_OUTPUT_TOKENS,
)

first_choice = chat_completion.choices[0]

# A finish_reason of "length" means the answer was cut off at the token cap.
# Surface that, because the text is stored and analysed further downstream.
if first_choice.finish_reason == "length":
    warnings.warn(
        f"Model output for {image_path} was truncated at {MAX_OUTPUT_TOKENS} tokens; "
        "parsed_text is incomplete."
    )

parsed_text = first_choice.message.content
print(parsed_text)

The real win comes when people expand this to large tables and get data moving, so let's save the parsed text to a Delta table.

In [0]:
# Wrap the single (path, parsed text) pair in a DataFrame and persist it as
# a Delta table, replacing whatever the table held before.
parsed_rows = [(image_path, parsed_text)]
table = spark.createDataFrame(parsed_rows, schema=['path', 'parsed_text'])

(
    table.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('shm.default.parsed_text')
)

Now let's use a cheaper model to extract key information from the parsed text.

In [0]:
%sql
-- Run two extraction prompts over every row of the parsed-text table.
-- Each prompt ends with ": " so that the instruction does not run straight
-- into the first word of the document text.
SELECT
  *,
  -- Free-form answer: the landowner names as returned by the model.
  ai_query(
    'databricks-meta-llama-3-3-70b-instruct',
    CONCAT('Identify all landowners in this document. Return only the names in a comma separated python list, without any preamble: ', parsed_text)
  ) AS landowners,
  -- Structured answer: the response is constrained to the JSON schema below.
  ai_query(
    'databricks-meta-llama-3-3-70b-instruct',
    CONCAT('Identify gas and oil royalties in the document: ', parsed_text),
    responseFormat => '{
      "type": "json_schema",
      "json_schema": {
        "name": "royalties",
        "schema": {
          "type": "object",
          "properties": {
            "gas_royalty": {"type": "number"},
            "oil_royalty": {"type": "number"}
          }
        }
      }
    }'
  ) AS royalties
FROM shm.default.parsed_text