# Parsing
This notebook does the bulk of the work. For each image, it runs a Claude query with structured outputs and few-shot prompting to extract information from the document.

In [0]:
%pip install -U --quiet mlflow openai
%restart_python

In [0]:
from pathlib import Path
from openai import OpenAI
import base64
from PIL import Image
import IPython.display as display

In [0]:
from mlflow.models import ModelConfig
# Load the development configuration file and expose it as a plain dict.
model_config = ModelConfig(development_config="config.yaml")
config = model_config.to_dict()

Prepare Pool of Examples 

In [0]:
# Load every row of the load-sheet table into a Spark DataFrame.
load_sheet_df = spark.sql("SELECT * FROM shm.pid.load_sheet_alb")
# The imports cell rebinds `display` to the IPython.display *module*, so calling
# display(df) raises "TypeError: 'module' object is not callable". Use the
# DataFrame's own display method, which does not depend on that name.
load_sheet_df.display()

In [0]:
def _first_page_image_path(filename):
  """Build the path to the page-1 JPEG rendered for a given PDF filename."""
  stem = filename.replace(".pdf", "")
  return Path(config["processed_path"] + stem + "/" + stem + "_page_1.jpeg")


# Keep only the rows flagged as few-shot examples and pull them into pandas.
is_example = load_sheet_df.for_examples == True
examples_df = load_sheet_df.filter(is_example).toPandas()

# Get the image path
# TODO: Abstract a bit more
examples_df["image_path"] = examples_df["closest_filename"].apply(_first_page_image_path)

In [0]:
# Peek at the expected JSON output stored for the example at row label 7.
examples_df.loc[7, "json_output"]

In [0]:
# Preview the example page stored at row label 1.
ex1_path = examples_df.at[1, "image_path"]
ex1_image = Image.open(ex1_path)
display.display(ex1_image)

In [0]:
# Preview the example page stored at row label 4.
ex2_path = examples_df.at[4, "image_path"]
ex2_image = Image.open(ex2_path)
display.display(ex2_image)

In [0]:
# Preview the example page stored at row label 7.
ex3_path = examples_df.at[7, "image_path"]
ex3_image = Image.open(ex3_path)
display.display(ex3_image)

In [0]:
# The page at row label 13 serves as the full-page test input.
test_page_path = examples_df.at[13, "image_path"]
test_image = Image.open(test_page_path)
display.display(test_image)

In [0]:
# A single WebP tile from the tiled-PDF volume, used as the tile-level test input.
test_tile_path = "/Volumes/shm/pid/tiled_pdfs/with_load_sheet/MRP-520-PID-PR-000351_F267/MRP-520-PID-PR-000351_F267.1_page_1_tile_4.webp"
test_tile = Image.open(test_tile_path)
display.display(test_tile)

## Zero Shot Inference
Our first experiment tests zero-shot inference, which performs poorly in terms of tag counts.

In [0]:
# Read the API token for the current notebook session from the notebook context.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
DATABRICKS_TOKEN = notebook_context.apiToken().get()

# OpenAI-compatible client pointed at the workspace's serving endpoints.
client = OpenAI(
  base_url="https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints",
  api_key=DATABRICKS_TOKEN,
)

def zero_shot_parse(image_path: str):
  """Parse a single image with the configured chat endpoint, using no examples.

  The image is base64-encoded into a data URL and sent as the only user
  content, preceded by the system prompt from ``config``.

  Args:
    image_path: Path to the image file to parse. The data-URL MIME type is
      derived from the file extension (.jpg/.jpeg/.png/.webp). Unrecognised
      extensions fall back to ``image/webp``, the value that used to be
      hard-coded.

  Returns:
    The message content of the first choice in the chat completion.
  """
  image_file = Path(image_path)

  # The MIME type was previously hard-coded to webp, which mislabelled the
  # JPEG page images this function is also called with.
  mime_by_suffix = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
  }
  mime_type = mime_by_suffix.get(image_file.suffix.lower(), "image/webp")
  image_data = base64.b64encode(image_file.read_bytes()).decode("utf-8")

  chat_completion = client.chat.completions.create(
    messages=[
      {
        "role": "system",
        "content": config['system_prompt']
      },
      {
        "role": "user",
        "content": [
          {"type": "image_url", "image_url":
            {"url": f"data:{mime_type};base64,{image_data}"}
          }
        ]
      }
    ],
    model=config['fm_endpoint'],
    temperature=config['temperature'],
    top_p=config['top_p']
  )

  parsed_text = chat_completion.choices[0].message.content
  return parsed_text

In [0]:
# Zero-shot run on the full first-page image.
zero_shot_parse(test_page_path)

In [0]:
# Zero-shot run on the single tile image.
zero_shot_parse(test_tile_path)

## Few Shot Parsing
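Having run zero-shot inference on both a full page and a single tile, we now prepend few-shot examples — each an example image followed by its expected JSON output — ahead of the target image.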

In [0]:
import pandas as pd
def few_shot_parse(image_path: str, examples: pd.DataFrame):
  """Parse an image with few-shot examples prepended to the request.

  The user message contains, for each row of ``examples``, the example image
  followed by its expected output text, and finally the target image.

  Args:
    image_path: Path to the target image to parse.
    examples: DataFrame with an ``image_path`` column (example image file)
      and a ``json_output`` column (the text sent after that image).

  Returns:
    The message content of the first choice in the chat completion.

  Note:
    Data-URL MIME types are derived from each file's extension
    (.jpg/.jpeg/.png/.webp). Unrecognised extensions fall back to
    ``image/jpeg``, the value that used to be hard-coded for every image.
  """
  mime_by_suffix = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
  }

  def to_data_url(path) -> str:
    # Encode one image file as a base64 data URL labelled with its real type.
    file_path = Path(path)
    mime_type = mime_by_suffix.get(file_path.suffix.lower(), "image/jpeg")
    encoded = base64.b64encode(file_path.read_bytes()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"

  # Read the target first so a bad target path fails before any example is read.
  target_url = to_data_url(image_path)

  content = []
  for _, example in examples.iterrows():
    # A separate name is used here; the loop used to rebind the `image_path` parameter.
    content.append({
      "type": "image_url",
      "image_url": {"url": to_data_url(example['image_path'])},
    })
    content.append({
      "type": "text",
      "text": example['json_output']
    })

  # The target image goes last, after all example image/answer pairs.
  content.append({
    "type": "image_url",
    "image_url": {"url": target_url},
  })

  chat_completion = client.chat.completions.create(
    messages=[
      {
        "role": "system",
        "content": config['system_prompt']
      },
      {
        "role": "user",
        "content": content
      }
    ],
    model=config['fm_endpoint'],
    temperature=config['temperature'],
    top_p=config['top_p']
  )

  parsed_text = chat_completion.choices[0].message.content
  return parsed_text

In [0]:
# Use only the row at position 7 as the example pool; the double brackets keep it a DataFrame.
examples = examples_df.iloc[[7]]
# Parse the test tile with that single example prepended.
inf_few_shot = few_shot_parse(test_tile_path, examples)

In [0]:
# Inspect the raw text returned by the few-shot call.
inf_few_shot

## JSON Parsing
One of the key things we need is the ability to convert the LLM's text output into structured data.

In [0]:
import re
json_str = inf_few_shot

# Repair values where a stray quote follows the leading digits, e.g.
# "12"-AB-3" becomes "12-AB-3", so the text can be decoded as JSON.
stray_quote_pattern = re.compile(r'"(\d+)"-([A-Z\-0-9]+)"')
fixed_json_str = stray_quote_pattern.sub(r'"\1-\2"', json_str)

In [0]:
import json
# Decode the repaired response text; json.loads raises json.JSONDecodeError if it is still not valid JSON.
parsed_dict = json.loads(fixed_json_str)

In [0]:
# Inspect the decoded result.
parsed_dict

In [0]:
import pandas as pd
# Tabulate the decoded result with an explicit one-element index (label 0).
pd.DataFrame(parsed_dict, index=[0])