# Parsing
This notebook does the bulk of the work. It takes each image and runs a Claude query with structured outputs and few-shot prompting to extract information from the documents.

In [0]:
%pip install mlflow openai
%restart_python

In [0]:
from mlflow.models import ModelConfig
# Load the notebook settings from config.yaml as a plain dict. Later cells read the
# keys 'system_prompt', 'fm_endpoint', 'temperature' and 'top_p' from it.
config = ModelConfig(development_config="config.yaml").to_dict()

In [0]:
from pathlib import Path
import base64

Use `pathlib` to get a generator of image paths and loop through the images for testing.

In [0]:
# from pathlib import Path
# image_paths = Path(config['image_vol_path']).rglob('*.png')

In [0]:


#list(image_paths)
# Few-shot example pages: each file is read and base64-encoded so it can be embedded
# in a data: URL. Both examples are .webp files, so any data: URL built from
# ex1_data / ex2_data should declare image/webp.
# NOTE(review): the paths are absolute and hard-coded; the commented-out cell above
# suggests config['image_vol_path'] as the intended base directory — confirm.
ex1_path = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee/page_10.webp')
ex1_data = base64.b64encode(ex1_path.read_bytes()).decode("utf-8")
ex2_path = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee/page_15.webp')
ex2_data = base64.b64encode(ex2_path.read_bytes()).decode("utf-8")
# Image to run inference on. The commented-out assignments are alternative inputs
# kept for switching by hand; only the active line takes effect.
#inf_path = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee/page_29.png')
#inf_path = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee/page_13.webp')
inf_path = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee_tiled/tile_2.png')
#inf_path = Path('/Volumes/shm/pid/pdf_images/5a82c87214d47c8af93fb443908548ee_tiled/tile_8.png')

In [0]:
# import yaml
# import json

# def _load_yaml_as_json(path: Path) -> str:
#     """
#     Read a YAML file from `path`, parse it into a Python object,
#     then dump that object as a JSON string.
#     """
#     text = path.read_text()               # raises if file doesn’t exist
#     data = yaml.safe_load(text)           # e.g. dict, list, etc.
#     return json.dumps(data, ensure_ascii=False)

In [0]:
# 1. If you haven’t already installed PyYAML, run:
#    pip install pyyaml

import yaml
from pathlib import Path

def load_yaml_for_prompt(file_path: Path) -> str:
    """
    Load a YAML document and return it wrapped in a Markdown ``yaml`` code fence.

    The file is parsed and then serialised again, so the returned text is the
    re-dumped form of the data (key order preserved), not the raw file contents.
    The result is meant to be pasted into an LLM prompt as an example of the
    expected output.

    Args:
        file_path: location of the .yaml file to load.

    Returns:
        The fenced block as one string: an opening ```yaml line, the dumped
        YAML, and a closing ``` fence.
    """
    with file_path.open("r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)

    # sort_keys=False keeps the mapping keys in the order they appear in the file.
    dumped = yaml.dump(parsed, sort_keys=False)

    return "```yaml\n" + dumped + "```"


Example Inference

In [0]:
from openai import OpenAI
import base64

# Read the API token of the current notebook session from the notebook context.
# `dbutils` is not imported in this notebook; presumably it is injected by the
# Databricks runtime — this cell will not run outside of it.
DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()

# OpenAI-compatible client pointed at the workspace's serving endpoints; used by the
# *_parse functions below.
# NOTE(review): the workspace URL is hard-coded — consider moving it into config.yaml.
client = OpenAI(
  api_key=DATABRICKS_TOKEN,
  base_url="https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints"
)

def zero_shot_parse(image_path: Path):
    """Send a single image to the configured chat endpoint and return the reply.

    The request consists of the system prompt from ``config['system_prompt']``
    and one user message that carries the image as a base64 ``data:`` URL.
    No examples are included (zero-shot).

    Relies on the notebook globals ``client`` and ``config``.

    Args:
        image_path: Path of the image file to parse (a ``str`` is accepted too).

    Returns:
        The ``content`` of the first choice of the chat completion.
    """
    image_path = Path(image_path)

    # Declare the media type that matches the file. A fixed image/jpeg label is
    # wrong for the .png and .webp inputs used in this notebook. Unknown suffixes
    # keep the image/jpeg label this function used before.
    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }
    mime_type = mime_by_suffix.get(image_path.suffix.lower(), "image/jpeg")

    image_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": config['system_prompt']
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_data}"}
                    }
                ]
            }
        ],
        model=config['fm_endpoint'],
        temperature=config['temperature'],
        top_p=config['top_p']
    )

    parsed_text = chat_completion.choices[0].message.content
    return parsed_text

In [0]:
# Locations of the two hand-written example outputs.
yaml_example_1_path, yaml_example_2_path = (
    Path(name) for name in ('example_output_1.yaml', 'example_output_2.yaml')
)

# Fenced YAML text for each example, ready to be placed in a prompt.
ex1_output, ex2_output = map(
    load_yaml_for_prompt, (yaml_example_1_path, yaml_example_2_path)
)

In [0]:
# Zero-shot run on the selected inference image (calls the serving endpoint).
inf_zero_shot = zero_shot_parse(inf_path)

In [0]:
# print() so that newlines in the reply are rendered rather than shown as escapes.
print(inf_zero_shot)

In [0]:
# def few_shot_parse(image_path: Path):
#   image_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")

#   chat_completion = client.chat.completions.create(
#     messages=[
#       {
#         "role": "system",
#         "content": config['system_prompt']
#       },
#       {
#         "role": "user",
#         "content": [
#           {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{ex1_data}"}},
#           {"type": "text", "text": ex1_output},
#           {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{ex2_data}"}},
#           {"type": "text", "text": ex2_output},
#           {"type": "image_url", "image_url": 
#             {"url": f"data:image/jpeg;base64,{image_data}"}
#           }
#         ]
#       }
#     ],
#     model=config['fm_endpoint'],
#     temperature=config['temperature'],
#     top_p=config['top_p']
#   )

#   parsed_text = chat_completion.choices[0].message.content
#   return parsed_text

In [0]:
import base64
from pathlib import Path

def few_shot_parse(image_path: Path):
    """Parse one image with two worked examples (few-shot) and return the reply.

    The single user message is built from, in order: example image 1, its
    expected YAML (``ex1_output``), example image 2, its expected YAML
    (``ex2_output``), an instruction to answer with YAML only, and finally the
    image to parse.

    Relies on the notebook globals ``client``, ``config``, ``ex1_path`` /
    ``ex1_data``, ``ex2_path`` / ``ex2_data``, ``ex1_output`` and ``ex2_output``.

    Args:
        image_path: Path of the image to parse (a ``str`` is accepted too).

    Returns:
        The ``content`` of the first choice of the chat completion.
    """
    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }

    def as_data_url(source_path, b64_payload):
        # Label each payload with the media type of the file it came from. The
        # example pages are .webp, so a fixed image/png label misdescribes them.
        # Unknown suffixes keep the image/png label this function used before.
        mime_type = mime_by_suffix.get(Path(source_path).suffix.lower(), "image/png")
        return f"data:{mime_type};base64,{b64_payload}"

    image_path = Path(image_path)
    image_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")

    # Create an explicit "instructions" text block that tells the model:
    #    "Your answer should be ONLY the YAML mapping, matching the examples exactly—no extra text."
    instruction_text = (
        "Now, based on the image below, please output ONLY the YAML mapping "
        "in the same structure as the two examples above. "
        "Do not include any additional commentary, explanation, or markup—just the YAML."
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": config["system_prompt"]
            },
            {
                "role": "user",
                "content": [
                    # Example 1: image followed by its expected (fenced) YAML
                    {
                        "type": "image_url",
                        "image_url": {"url": as_data_url(ex1_path, ex1_data)}
                    },
                    {
                        "type": "text",
                        "text": ex1_output
                    },

                    # Example 2: image followed by its expected (fenced) YAML
                    {
                        "type": "image_url",
                        "image_url": {"url": as_data_url(ex2_path, ex2_data)}
                    },
                    {
                        "type": "text",
                        "text": ex2_output
                    },

                    # Instruction to output only YAML
                    {
                        "type": "text",
                        "text": instruction_text
                    },

                    # Finally: the "unknown" image we want the model to parse
                    {
                        "type": "image_url",
                        "image_url": {"url": as_data_url(image_path, image_data)}
                    }
                ]
            }
        ],
        model=config["fm_endpoint"],
        temperature=config["temperature"],
        top_p=config["top_p"]
    )

    parsed_text = chat_completion.choices[0].message.content
    return parsed_text


In [0]:
import base64
from pathlib import Path

def few_shot_parse(image_path: Path):
    """Parse one image with an instruction-only prompt and return the reply.

    This variant sends the system prompt, an instruction text (YAML only, at
    most 10 equipment tags and 10 line tags) and the image to parse. The
    example image/YAML pair is commented out, so no worked example is sent.

    NOTE(review): the instruction text still refers to "the two examples
    above" although none are included in the request — confirm whether the
    wording or the commented-out example should change.

    Relies on the notebook globals ``client`` and ``config``.

    Args:
        image_path: Path of the image to parse (a ``str`` is accepted too).

    Returns:
        The ``content`` of the first choice of the chat completion.
    """
    image_path = Path(image_path)

    # Declare the media type that matches the file instead of always claiming
    # image/png; the page images in this notebook are .webp. Unknown suffixes
    # keep the image/png label this function used before.
    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }
    mime_type = mime_by_suffix.get(image_path.suffix.lower(), "image/png")

    image_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")

    # Create an explicit "instructions" text block that tells the model:
    #    "Your answer should be ONLY the YAML mapping, matching the examples exactly—no extra text."
    instruction_text = (
        "Now, based on the image below, please output ONLY the YAML mapping "
        "in the same structure as the two examples above. "
        "Do not include any additional commentary, explanation, or markup—just the YAML. Only include 10 equipment tags and 10 linetags"
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": config["system_prompt"]
            },
            {
                "role": "user",
                "content": [
                    # # Example‐2 image
                    # {
                    #     "type": "image_url",
                    #     "image_url": {"url": f"data:image/jpeg;base64,{ex2_data}"}
                    # },
                    # # Example‐2 YAML (fenced)
                    # {
                    #     "type": "text",
                    #     "text": ex2_output
                    # },

                    # Instruction to output only YAML
                    {
                        "type": "text",
                        "text": instruction_text
                    },

                    # Finally: the "unknown" image we want the model to parse
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_data}"}
                    }
                ]
            }
        ],
        model=config["fm_endpoint"],
        temperature=config["temperature"],
        top_p=config["top_p"]
    )

    parsed_text = chat_completion.choices[0].message.content
    return parsed_text


In [0]:
# Run whichever few_shot_parse definition was executed last on the inference image.
inf_few_shot = few_shot_parse(inf_path)

In [0]:
def few_shot_parse(image_path):
    """Ask the configured chat endpoint to parse one image; return its replies as a list.

    The image is sent whole, as a base64 ``data:`` URL in an image part of the
    user message. Slicing the file into byte chunks and decoding them as UTF-8
    text (dropping undecodable bytes) gives the model meaningless text rather
    than an image, and a slice of an image file is not a decodable image
    either, so no chunking is done.

    NOTE(review): the instruction text refers to "the two examples above", but
    this variant sends no examples — confirm the intended prompt.

    Relies on the notebook globals ``client`` and ``config``.

    Args:
        image_path: Path (or path string) of the image to parse.

    Returns:
        A list of reply strings: one element for the image, or an empty list
        when the file is empty.
    """
    image_path = Path(image_path)
    image_bytes = image_path.read_bytes()
    if not image_bytes:
        # Nothing to send; an empty file has always produced an empty result list.
        return []

    # Declare the media type that matches the file; default to image/png for
    # suffixes that are not listed.
    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }
    mime_type = mime_by_suffix.get(image_path.suffix.lower(), "image/png")
    image_data = base64.b64encode(image_bytes).decode("utf-8")

    instruction_text = (
        "Now, based on the image below, please output ONLY the YAML mapping "
        "in the same structure as the two examples above. "
        "Do not include any additional commentary, explanation, or markup—just the YAML."
    )

    chat_completion = client.chat.completions.create(
        model=config["fm_endpoint"],
        messages=[
            {
                "role": "system",
                "content": config["system_prompt"]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": instruction_text
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_data}"}
                    }
                ]
            }
        ]
    )

    # The client returns response objects, not dicts: read the reply through the
    # .content attribute (subscripting the message raises TypeError).
    return [chat_completion.choices[0].message.content]

# Run the few_shot_parse defined in this cell on the inference image; the result is a list.
inf_few_shot = few_shot_parse(inf_path)

In [0]:
# Display the value returned by few_shot_parse.
inf_few_shot

In [0]:
import json

def json_string_to_yaml_file(raw_text: str, output_path: str) -> None:
    """
    Parse a JSON string and save the resulting data as a YAML file.

    Args:
        raw_text: JSON text, for example the raw reply received from the model.
        output_path: Destination of the YAML file, e.g. "parsed_output.yaml".

    Raises:
        json.JSONDecodeError: if ``raw_text`` is not valid JSON.
    """
    # sort_keys=False keeps the keys in the order they appear in the JSON text.
    yaml_document = yaml.dump(json.loads(raw_text), sort_keys=False)

    destination = Path(output_path)
    destination.write_text(yaml_document, encoding="utf-8")


In [0]:
#json_string_to_yaml_file(inf_few_shot, "parsed_output.yaml")

In [0]:
# Display the raw model output before it is repaired and parsed below.
inf_few_shot

JSON Parsing

In [0]:
import re
# The most recent few_shot_parse definition returns a list of replies, while
# re.sub needs a single string: use the first reply when a list/tuple came back,
# and the value itself when it already is a string.
if isinstance(inf_few_shot, (list, tuple)):
    if not inf_few_shot:
        raise ValueError("few_shot_parse returned no replies to parse")
    json_str = inf_few_shot[0]
else:
    json_str = inf_few_shot

# Repair strings that were split by a stray double quote after leading digits:
# "<digits>"-<TAG>" becomes "<digits>-<TAG>" (presumably an inch mark in a line
# tag — confirm against real model output).
fixed_json_str = re.sub(
    r'"(\d+)"-([A-Z\-0-9]+)"',
    r'"\1-\2"',
    json_str
)

In [0]:
import json
# Parse the repaired reply; raises json.JSONDecodeError if it is still not valid JSON.
parsed_dict = json.loads(fixed_json_str)

In [0]:
# Inspect the parsed structure.
parsed_dict

In [0]:
import pandas as pd

data = parsed_dict

# Lift every title_block entry to the top level under a "title_block_" prefix.
flat_data = {
    f"title_block_{field}": entry
    for field, entry in data['title_block'].items()
}

# Collapse each list-valued field into a single comma-separated string.
flat_data.update(
    (name, ", ".join(data[name]))
    for name in ['comments', 'revision_history', 'equipment_tags', 'line_tags', 'uncaptured']
)

# The only remaining scalar field is copied over unchanged.
flat_data['document_type'] = data['document_type']

# One dict -> one-row DataFrame.
df = pd.DataFrame([flat_data])

# Display the DataFrame
print(df)

In [0]:
data = parsed_dict

# Flatten nested mappings into dotted column names (title_block.<field>); one row.
df = pd.json_normalize(data)

# Collapse each list-valued field into a single comma-separated string.
for col in ['comments', 'revision_history', 'equipment_tags', 'line_tags', 'uncaptured']:
    if col not in df.columns:
        # The model may leave a field out entirely; treat it as empty instead of
        # failing with a KeyError.
        df[col] = ""
        continue
    # str() every item so that non-string entries (numbers, nested mappings)
    # cannot make the join fail.
    df[col] = df[col].apply(lambda lst: ", ".join(map(str, lst)) if isinstance(lst, list) else "")


# Put the title block columns first, everything else after them.
title_cols = [c for c in df.columns if c.startswith("title_block.")]
other_cols = [c for c in df.columns if not c.startswith("title_block.")]
results = df[title_cols + other_cols]

print(results)

In [0]:
# Rich display of the flattened one-row frame.
df

In [0]:
import yaml

# Persist the parsed model output as YAML. sort_keys=False keeps the keys in their
# original order, matching the other yaml.dump calls in this notebook, and the file
# is written as UTF-8 like the other YAML files handled here.
with open('output.yaml', 'w', encoding='utf-8') as yaml_file:
    yaml.dump(parsed_dict, yaml_file, sort_keys=False)

In [0]:
# Round-trip check: read the YAML file back and show it as a JSON string.
reloaded = yaml.safe_load(Path('output.yaml').read_text())

json.dumps(reloaded)

Compare results with expected results

In [0]:
import yaml
import pandas as pd
from pathlib import Path

def expected_yaml_to_dataframe(yaml_path: str) -> pd.DataFrame:
    """
    Load an expected-results YAML file into a flat, fixed-layout DataFrame.

    Nested mappings are flattened into dotted column names (for example
    ``title_block.drawing_number``). Columns missing from the file are added
    (empty string for scalar fields, empty list for list fields), list fields
    are collapsed into comma-separated strings, and the columns are returned
    in a fixed order.

    Args:
        yaml_path: Path of the YAML file to load.

    Returns:
        A DataFrame containing exactly the expected columns, in order.
    """

    # 1. Read & parse the YAML file
    path = Path(yaml_path)
    with path.open("r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # 2. Use pandas.json_normalize to flatten nested structures
    df = pd.json_normalize(data)

    # 3. Ensure all expected columns exist; fill missing ones
    expected_cols = [
        "title_block.drawing_number",
        "title_block.title",
        "title_block.revision_status",
        "title_block.primary_location",
        "title_block.other_locations",
        "title_block.plant",
        "title_block.area",
        "title_block.type",
        "title_block.discipline",
        "title_block.sequence",
        "comments",
        "revision_history",
        "equipment_tags",
        "line_tags",
        "document_type",
        "uncaptured",
    ]

    # A list rather than a set, so the columns are processed in a fixed order.
    list_cols = ["comments", "revision_history", "equipment_tags", "line_tags", "uncaptured"]

    for col in expected_cols:
        if col not in df.columns:
            # List-type fields default to an empty list, all others to an empty string.
            if col in list_cols:
                df[col] = [[] for _ in range(len(df))]
            else:
                df[col] = ''

    # 4. Convert each list column into a comma-separated string. Items are passed
    #    through str() because YAML may parse entries as numbers or mappings,
    #    which str.join rejects.
    for col in list_cols:
        df[col] = df[col].apply(
            lambda val: ", ".join(map(str, val)) if isinstance(val, list) else ""
        )

    # 5. Reorder columns
    df = df[expected_cols]
    return df


In [0]:
# Hand-written expected values, loaded into a one-row frame for comparison.
expected_results = expected_yaml_to_dataframe("pg_29_expected_results.yaml")

In [0]:
# Inspect the expected values.
expected_results

In [0]:
# Stack the model output (row 0) on top of the expected values (row 1);
# ignore_index renumbers the rows 0..n-1.
df_compare = pd.concat([results, expected_results], ignore_index=True)


In [0]:
# Inspect the stacked frame.
df_compare

In [0]:
import pandas as pd

# 1. Helper to count items in a comma‐separated string
def count_items(cell):
    """Count the non-blank, comma-separated entries of a cell.

    Missing values (as judged by ``pd.isna``) and the empty string count as 0.
    Any other value is converted with ``str()`` before it is split on commas.
    """
    if pd.isna(cell) or cell == "":
        return 0
    tokens = str(cell).split(",")
    # Whitespace-only tokens (e.g. from a trailing comma) are not counted.
    return sum(1 for token in tokens if token.strip())

# 2. Derive a count column from each of the two tag columns
for count_col, tag_col in (("equipment_count", "equipment_tags"), ("line_count", "line_tags")):
    df_compare[count_col] = df_compare[tag_col].apply(count_items)

# 3. For every column, record whether row 0 and row 1 hold exactly equal values
comparison = {
    col: (df_compare.at[0, col] == df_compare.at[1, col])
    for col in df_compare.columns
}

# 4. Append the comparison as one more row.
#    Note: running this cell again appends yet another comparison row.
df_compare = pd.concat([df_compare, pd.DataFrame([comparison])], ignore_index=True)


In [0]:
# Inspect the frame including the appended comparison row.
df_compare

In [0]:
# Equipment tags extracted by the model (row 0), shown in full.
df_compare.at[0, 'equipment_tags']

In [0]:
import pandas as pd

# Assume df_compare already exists with exactly two rows,
# and has columns 'equipment_tags' and 'line_tags'.

# 1. Helper to count items in a comma‐separated string
def count_items(cell):
    """Count the non-blank, comma-separated entries of a string cell.

    Anything that is not a string (NaN, None, booleans, numbers, lists, ...)
    counts as 0. The type check comes first on purpose: calling ``pd.isna`` on
    a list-like cell returns an array, and using that array in a boolean
    expression raises ValueError.

    Args:
        cell: The cell value to inspect.

    Returns:
        Number of comma-separated tokens that are not empty or whitespace-only.
    """
    if not isinstance(cell, str):
        return 0
    # An empty string splits into one empty token, which the filter drops.
    return sum(1 for item in cell.split(",") if item.strip())

# 2. Add the two new count columns
df_compare["equipment_count"] = df_compare["equipment_tags"].apply(count_items)
df_compare["line_count"] = df_compare["line_tags"].apply(count_items)


def _tag_set(value):
    """Turn a comma-separated cell into a set of stripped, non-empty tags.

    Non-string cells (e.g. NaN) give an empty set. The type check has to
    happen before ``.split`` is called; a check inside the comprehension's
    filter runs too late to prevent an AttributeError.
    """
    if not isinstance(value, str):
        return set()
    return {item.strip() for item in value.split(",") if item.strip()}


# 3. Build a comparison dict between row 0 and row 1,
#    treating equipment_tags and line_tags as order-insensitive sets.
#    Only rows 0 and 1 are compared; any further rows already present in
#    df_compare are left in place.
comparison = {}
for col in df_compare.columns:
    v0 = df_compare.at[0, col]
    v1 = df_compare.at[1, col]

    if col in ["equipment_tags", "line_tags"]:
        comparison[col] = (_tag_set(v0) == _tag_set(v1))
    else:
        # Exact equality for all other columns
        comparison[col] = (v0 == v1)

# 4. Append the comparison row to df_compare
#    (running this cell again appends another comparison row).
df_compare = pd.concat([df_compare, pd.DataFrame([comparison])], ignore_index=True)

print(df_compare)

In [0]:
# Rich display of the final comparison frame.
df_compare

In [0]:
# Save the comparison next to the notebook; index=False leaves out the row numbers.
df_compare.to_csv("df_compare.csv", index=False)