# Parsing
This notebook does the bulk of the work. It takes each image and runs a Claude query with structured outputs and few-shot prompting to extract information from the documents.

In [0]:
%pip install mlflow openai
%restart_python

In [0]:
from mlflow.models import ModelConfig
# Load the notebook settings from config.yaml and convert them to a dict;
# later cells look values up by key (e.g. config['image_vol_path']).
config = ModelConfig(development_config="config.yaml").to_dict()

Use `pathlib` to get a generator over the page images so we can loop through them for testing.

In [0]:
from pathlib import Path
# Recursively match every *.png below config['image_vol_path'].
# rglob returns a lazy generator, so it can only be iterated once.
image_paths = Path(config['image_vol_path']).rglob('*.png')

In [0]:
# base64 is needed in this cell; importing it here keeps the cell runnable on a
# fresh kernel (it was previously imported only in a later cell).
import base64

# Materialise the path generator once. A bare `list(image_paths)` in the middle
# of a cell shows nothing and leaves the generator exhausted for later use.
image_paths = sorted(image_paths)
print(f"Found {len(image_paths)} PNG page images")

# All hand-picked pages below come from the same document directory.
EXAMPLE_DOC_DIR = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee')

# Two example pages, kept both as paths and as base64 text so they can be
# embedded in a prompt as data URLs.
ex1_path = EXAMPLE_DOC_DIR / 'page_10.png'
ex1_data = base64.b64encode(ex1_path.read_bytes()).decode("utf-8")
ex2_path = EXAMPLE_DOC_DIR / 'page_16.png'
ex2_data = base64.b64encode(ex2_path.read_bytes()).decode("utf-8")

# Page used as the inference target in the cells that follow.
inf_path = EXAMPLE_DOC_DIR / 'page_29.png'

Example Inference

In [0]:
import base64

from openai import OpenAI

# Read the API token of the running notebook session from the Databricks
# notebook context, so no credential is typed into the notebook itself.
DATABRICKS_TOKEN = (
  dbutils.notebook.entry_point.getDbutils()
  .notebook()
  .getContext()
  .apiToken()
  .get()
)

# OpenAI client pointed at the workspace's /serving-endpoints URL and
# authenticated with the session token obtained above.
client = OpenAI(
  base_url="https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints",
  api_key=DATABRICKS_TOKEN,
)

def zero_shot_parse(image_path: Path):
  """Send one page image to the configured model endpoint and return its reply.

  The request consists of the system prompt from ``config['system_prompt']``
  and a single user message that contains only the image, embedded as a
  base64 data URL. Model name and sampling settings are read from ``config``.

  Args:
    image_path: Path of the image file to parse.

  Returns:
    The ``content`` of the first choice in the chat completion response.
  """
  # Function-scope import so this cell does not rely on another cell's imports.
  import mimetypes

  image_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")

  # Label the payload with the media type implied by the file extension. The
  # URL used to claim image/jpeg for every file, although the notebook's pages
  # are PNGs. Unrecognised extensions keep the former image/jpeg label.
  media_type = mimetypes.guess_type(image_path.name)[0] or "image/jpeg"

  chat_completion = client.chat.completions.create(
    messages=[
      {
        "role": "system",
        "content": config['system_prompt']
      },
      {
        "role": "user",
        "content": [
          {"type": "image_url", "image_url":
            {"url": f"data:{media_type};base64,{image_data}"}
          }
        ]
      }
    ],
    model=config['fm_endpoint'],
    temperature=config['temperature'],
    top_p=config['top_p']
  )

  parsed_text = chat_completion.choices[0].message.content
  return parsed_text

In [0]:
# Run the zero-shot parser on the two example pages and on the inference page.
# ex1_output / ex2_output are later embedded by few_shot_parse as the example
# answers. NOTE(review): these are unreviewed model outputs — confirm they are
# correct before relying on them as few-shot examples.
ex1_output = zero_shot_parse(ex1_path)
ex2_output = zero_shot_parse(ex2_path)
inf_zero_shot = zero_shot_parse(inf_path)

In [0]:
# print() shows the response with its line breaks rather than the escaped repr.
print(inf_zero_shot)

In [0]:
def few_shot_parse(image_path: Path):
  """Parse a page image with two worked examples included in the prompt.

  Sends the system prompt from ``config['system_prompt']`` followed by one
  user message that interleaves: example image 1, its text output, example
  image 2, its text output, and finally the image to parse. The examples are
  read from the notebook-level names ``ex1_path``/``ex1_data``/``ex1_output``
  and ``ex2_path``/``ex2_data``/``ex2_output``, which must be defined before
  this function is called.

  Args:
    image_path: Path of the image file to parse.

  Returns:
    The ``content`` of the first choice in the chat completion response.
  """
  # Function-scope import so this cell does not rely on another cell's imports.
  import mimetypes

  def _image_part(source_path: Path, b64_payload: str) -> dict:
    # Label the payload with the media type implied by the file extension. The
    # URLs used to claim image/jpeg for every file, although the notebook's
    # pages are PNGs. Unrecognised extensions keep the former image/jpeg label.
    media_type = mimetypes.guess_type(source_path.name)[0] or "image/jpeg"
    return {
      "type": "image_url",
      "image_url": {"url": f"data:{media_type};base64,{b64_payload}"},
    }

  image_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")

  # NOTE(review): the examples and the target share a single user turn; whether
  # alternating user/assistant turns would work better is untested here.
  chat_completion = client.chat.completions.create(
    messages=[
      {
        "role": "system",
        "content": config['system_prompt']
      },
      {
        "role": "user",
        "content": [
          _image_part(ex1_path, ex1_data),
          {"type": "text", "text": ex1_output},
          _image_part(ex2_path, ex2_data),
          {"type": "text", "text": ex2_output},
          _image_part(image_path, image_data),
        ]
      }
    ],
    model=config['fm_endpoint'],
    temperature=config['temperature'],
    top_p=config['top_p']
  )

  parsed_text = chat_completion.choices[0].message.content
  return parsed_text

In [0]:
# Parse the same inference page again, this time with the two examples in the prompt.
inf_few_shot = few_shot_parse(inf_path)

In [0]:
# Echo the zero-shot response; as the cell's last expression it is shown as a string repr.
inf_zero_shot

In [0]:
# Echo the few-shot response; as the cell's last expression it is shown as a string repr.
inf_few_shot

JSON Parsing

In [0]:
import re

# Work on the few-shot response text.
json_str = inf_few_shot

# Repair values where a quote was closed too early: a quoted run of digits
# followed by '-', then uppercase letters / digits / hyphens and a closing
# quote (e.g. "12"-AB-3") is rewritten as one quoted string ("12-AB-3").
fixed_json_str = re.compile(r'"(\d+)"-([A-Z\-0-9]+)"').sub(r'"\1-\2"', json_str)

In [0]:
import json
# Decode the repaired response text; json.loads raises json.JSONDecodeError if
# the text is still not valid JSON.
parsed_dict = json.loads(fixed_json_str)

In [0]:
import pandas as pd
# Display the 'title_block' entry as a single-row table labelled 0.
# Presumably title_block is a flat dict of scalar values — TODO confirm.
pd.DataFrame(parsed_dict['title_block'], index=[0])

In [0]:
import yaml

# Serialise the parsed result to YAML text, then write it to output.yaml in
# the current working directory.
yaml_text = yaml.dump(parsed_dict)
with open('output.yaml', 'w') as yaml_file:
    yaml_file.write(yaml_text)

In [0]:
# Read output.yaml back in and parse it with the safe loader.
with open('output.yaml', 'r') as yaml_in:
    yaml_text = yaml_in.read()
reloaded = yaml.safe_load(yaml_text)

# Show the reloaded data as a JSON string.
json.dumps(reloaded)