In [None]:
# %pip install marker-pdf   # the `marker` modules imported below are provided by the `marker-pdf` package

```
Notes for Mahdi: 

1. This notebook shows how I extract text from the .jpeg/.pdf photographs of the Wagner text
2. Python version used: 3.10.13
3. Transcriptions are done in 3 parts:
    A. Extract text from a PDF containing a batch of Wagner passages using Marker
    B. Extract text individually from the .jpeg version of each Wagner passage by sending the base64-encoded image to an LLM
    C. Use LLM to synthesize both transcriptions

```

In [25]:
# General packages
import glob
import tqdm
import os
import pickle as pkl

## Marker (OCR library) Packages
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
import cgi

## OCR on base64 packages
import base64
from bs4 import BeautifulSoup
from typing import get_args, Literal, Union

# LLM/Agent packages
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from openai.types.chat.chat_completion_content_part_param import (
    ChatCompletionContentPartTextParam,
    ChatCompletionContentPartImageParam
)
from openai.types.chat.chat_completion_content_part_image_param import (
    ImageURL
)
import pydantic_ai.models as ai_models

# Directory to photo training set (relative to the notebook's working directory).
# The trailing "/" is part of the value; later cells build paths as f'{photo_dir}/...'.
photo_dir = "../../training_set/"

## To import API keys
# NOTE(review): this is an absolute, machine-specific path — anyone else running
# the notebook has to point it at the folder that holds their own api_keys.py.
import sys
sys.path.append('/Users/williamharrigan/Desktop/')

# Script containing API keys
# NOTE(review): presumably importing this module exports the keys as environment
# variables (the Marker config cell reads os.environ["GEMINI_API_KEY"]) — confirm.
import api_keys



In [2]:
## Path to .jpeg photos of Wagner passages
jpeg_files = glob.glob(f'{photo_dir}/*.jpeg')

## Path to .pdf version of batch of Wagner photos
pdf_pattern = f'{photo_dir}/test*.pdf'  # test batch contains 2 photos -> complete batch contains 50 photos
# pdf_pattern = f'{photo_dir}/complete*.pdf'

# glob returns matches in arbitrary order, so sort before taking the first one:
# the selected PDF is then reproducible even if several files match the pattern.
pdf_matches = sorted(glob.glob(pdf_pattern))
if not pdf_matches:
    # Fail with an explicit message instead of a bare IndexError on [0]
    raise FileNotFoundError(f"No batch PDF found matching {pdf_pattern!r}")
wagner_pdf_batch = pdf_matches[0]
print('Marker photo: ', wagner_pdf_batch)

# Sort files to process alphabetically
sorted_files = sorted(jpeg_files)
if not sorted_files:
    # Later cells index into sorted_files, so stop early when nothing was found
    raise FileNotFoundError(f"No .jpeg files found in {photo_dir!r}")
print('Base64/OCR photo: ', sorted_files[0])


Marker photo:  ../../training_set/test_passage_wagner_batch.pdf
Base64/OCR photo:  ../../training_set/Acanthaceae_Dicliptera_chinensis.jpeg


## Marker OCR

In [3]:
## Set configuration for Marker text extraction

# Marker settings. The LLM component points at Gemini here, but any available LLM can be used.
config = dict(
    output_format="json",
    use_llm=True,
    gemini_api_key=os.environ["GEMINI_API_KEY"],  # API key, read from the environment
    gemini_model="gemini-2.5-pro-preview-03-25",  # LLM
    llm_service="marker.services.gemini.GoogleGeminiService",  # Reused below when building the converter
)

# Image parser
config_parser = ConfigParser(config)

# Arguments for the Marker PDF converter, derived from the parser defaults.
# Built in the same order the converter receives them.
converter_kwargs = {
    "config": config_parser.generate_config_dict(),
    "artifact_dict": create_model_dict(),
    "processor_list": config_parser.get_processors(),
    "renderer": config_parser.get_renderer(),
    "llm_service": config["llm_service"],  # Set in config (above)
}

# Default parameters for Marker OCR
converter = PdfConverter(**converter_kwargs)

Loaded layout model s3://layout/2025_02_18 on device mps with dtype torch.float16
Loaded texify model s3://texify/2025_02_18 on device mps with dtype torch.float16
Loaded recognition model s3://text_recognition/2025_02_18 on device mps with dtype torch.float16
Loaded table recognition model s3://table_recognition/2025_02_18 on device mps with dtype torch.float16
Loaded detection model s3://text_detection/2025_02_28 on device mps with dtype torch.float16
Loaded detection model s3://inline_math_detection/2025_02_24 on device mps with dtype torch.float16


In [4]:
## Run Marker OCR on batch -> output in .json format
## (the converter was configured with "output_format": "json")

# Converts the whole batch PDF in one call. The processing cell further down
# walks `rendered.children` and each child's `.children` text blocks.
rendered = converter(wagner_pdf_batch)

Recognizing layout: 100%|██████████| 2/2 [00:03<00:00,  1.60s/it]
LLM layout relabelling: 4it [00:03,  1.26it/s]
Running OCR Error Detection: 100%|██████████| 1/1 [00:00<00:00,  9.20it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]
Recognizing Text: 100%|██████████| 2/2 [00:12<00:00,  6.14s/it]
Detecting bboxes: 0it [00:00, ?it/s]
LLMTableMergeProcessor running: 0it [00:00, ?it/s]
LLM processors running: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]


In [18]:
## Processing Marker Text Extractions

# Store marker text extractions, keyed by plant id (image file name without extension)
marker_ocr_outputs = {}

# Iterate through rendered passages.
# NOTE(review): entry i of the rendered PDF is paired with the i-th alphabetically
# sorted .jpeg — this assumes the batch PDF was assembled in that same order; confirm.
for i, passage in enumerate(rendered.children):

    # Remove only the file extension. The previous `.strip('.jpeg')` stripped any
    # of the characters '.', 'j', 'p', 'e', 'g' from BOTH ends of the name, which
    # silently truncates ids ending in those letters (e.g. "..._koae" -> "..._koa").
    plant_species = os.path.splitext(os.path.basename(sorted_files[i]))[0]
    print(f"Plant: {plant_species}")

    # Extract all text blocks from rendered
    blocks = passage.children

    # Reorder extracted Wagner text using BeautifulSoup
    block_list = []
    for block in blocks:
        # Strip the HTML markup and keep only the visible text
        text = BeautifulSoup(block.html, "html.parser").get_text()
        block_list.append({
            "text": text.strip(),
            # NOTE(review): if bbox is laid out as [x0, y0, x1, y1], indices 2 and 3
            # are the RIGHT and BOTTOM edges rather than left/top — confirm which
            # corner the reading-order sort below is meant to use.
            "x": block.bbox[2],
            "y": block.bbox[3]
        })

    # Order blocks by x first, then y
    block_list.sort(key=lambda b: (b["x"], b["y"]))

    print('Marker Extracted Text:')

    # Collect the text of every block in sorted order
    total = [block["text"] for block in block_list]

    # Save to dictionary
    marker_ocr_outputs[plant_species] = total
    print(f"{total}\n")


Plant: Acanthaceae_Dicliptera_chinensis
Marker Extracted Text:
['(nat)', '[Justicia chinensis L.]', '1. Dicliptera chinensis (L.) Juss.', 'Sprawling or decumbent perennial herbs; stems 2-7 dm long. Leaves green, lower surface slightly paler, ovate, 2.5-13.5 cm long, sparsely strigillose, especially on the veins, cystoliths prominent on upper surface as white raised streaks the size of the hairs, petioles 1-3.5 cm long. Flowers in axillary cymes, each one subtended by 2 green, ovate bracts of unequal size, the', 'larger one ca. 12-14 mm long, the smaller\nPlate 1. one ca. 8-9 mm long, all bracts short-vil-', '', "lous especially along the margins, the veins inconspicuous, pedicels 0-1 mm long; calyx lobes of unequal size, 5-7 mm long; corolla rose to purple, the throat with purple spots, 5-13 mm long. Capsules ovoid, 6-7 mm long, short-villous. Seeds 4, discoid. Native to tropical areas worldwide; in Hawai'i naturalized primarily in or near urban areas, at least on Kaua'i and O'ahu, but

## base64 OCR

In [19]:
# Async transcription function
async def transcribe_image_with_openai(image_path: str, detail: str = "low") -> str:
    """Transcribe the text in one image file with an OpenAI vision model.

    The image is read from disk, base64-encoded, embedded in a data URL and sent
    to a `gpt-4o` agent together with an OCR instruction.

    Args:
        image_path: Path to the image file to transcribe.
        detail: Value forwarded as the image `detail` setting. Defaults to
            "low", which is what this notebook has used so far.
            NOTE(review): OpenAI's vision docs describe "low" as a
            reduced-resolution mode; "high" may transcribe dense text more
            accurately at higher cost — worth testing.

    Returns:
        The text returned by the model (`response.data`).

    Raises:
        OSError: If `image_path` cannot be opened or read.
    """
    import mimetypes  # local import so this cell does not depend on a new top-level import

    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("utf-8")

    # Declare the real media type in the data URL. It used to be hard-coded to
    # image/png even though the inputs are .jpeg files. Fall back to the old
    # value when the extension is unknown or not an image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    agent = Agent(
        model="openai:gpt-4o",
        result_type=str,
        system_prompt="You are a vision model capable of accurately performing OCR on an image",
    )

    image_param = ChatCompletionContentPartImageParam(
        type='image_url',
        image_url=ImageURL(url=f"data:{mime_type};base64,{base64_image}", detail=detail)
    )

    # User message: the instruction text followed by the image itself
    message = [
        ChatCompletionContentPartTextParam(
            type="text",
            text="Convert the image to text. Don't miss any text and DO NOT ADD ANY TEXT that is not present in the image."
        ),
        image_param
    ]

    response = await agent.run(message)
    return response.data

In [20]:
## Run base64 OCR on .jpeg images of Wagner Passages

# Storage for Extracted Text
openai_transcriptions = {}

# Plant ID = format files are saved in = family_genus_species.jpeg
plant_id = 'Acanthaceae_Dicliptera_chinensis'

# Set image path
image_path = f"{photo_dir}/{plant_id}.jpeg"

# Run transcription agent
openai_transcriptions[plant_id] = await transcribe_image_with_openai(image_path)

## Loop to process all Wagner .jpeg images

# for image_path in tqdm.tqdm(jpeg_files):
#     plant_id = image_path.split('/')[-1].strip('.jpeg') 
#     openai_transcriptions[plant_id] = await transcribe_image_with_openai(image_path)



In [21]:
## Create Agent to Synthesize base64 and Marker text extractions

# Standing instructions for the merging model
synthesis_system_prompt = (
    "You are a botanical plant expert. Given two OCR outputs of the same passage on Hawaiian plants, merge them into one accurate, complete transcription. Choose the most reliable text from each version."
)

# gpt-4o agent that returns the merged transcription as a plain string
create_final_transcription = Agent(
    model="openai:gpt-4o",
    result_type=str,
    system_prompt=synthesis_system_prompt,
)

In [27]:
# Dictionary for final transcriptions
synthesized_transcriptions = {}

# Iterate through available transcriptions (jpeg_files, openai_transcriptions or marker_ocr_outputs)
for plant_id in tqdm.tqdm(openai_transcriptions.keys()): 

    # Get transcriptions from dictionary
    ocr_1 = openai_transcriptions[plant_id]
    ocr_2 = marker_ocr_outputs[plant_id]

    # Input to LLM
    og_transcriptions = f""" Here are the following two passages. It is very important to include page number and any other relevant information. Do not add any information that is not present in the transcriptions.
    OCR text 1:
    {ocr_1}

    OCR text 2:
    {ocr_2}
    """

    # Run LLM-agent to get synthesized Wagner passage text
    r = await create_final_transcription.run(og_transcriptions)
    
    # Save to dictionary
    synthesized_transcriptions[plant_id] = r.data
    
    
# Save file to pkl
# with open('wagner_transcriptions.pkl', 'wb') as f:
#     pkl.dump(synthesized_transcriptions, f)

100%|██████████| 1/1 [00:05<00:00,  5.01s/it]


In [24]:
# Show every plant id followed by its synthesized transcription, one per line
for plant, transcription in synthesized_transcriptions.items():
    print(plant, transcription, sep="\n")

Acanthaceae_Dicliptera_chinensis
---

171

1. Dicliptera chinensis (L.) Juss. [Justicia chinensis L.] (nat)

Sprawling or decumbent perennial herbs; stems 2-7 dm long. Leaves green, lower surface slightly paler, ovate, 2.5-13.5 cm long, sparsely strigillose, especially on the veins, cystoliths prominent on upper surface; white raised streaks the size of the larger petals, 1-3.5 cm long. Flowers in axillary cymes, each one subtended by 2 green, ovate bracts of unequal size, the larger one ca. 12-14 mm long, the smaller one ca. 8-9 mm long, all bracts short-villous especially along the margins, the veins inconspicuous, pedicels 0.7-1 mm long; calyx lobes of unequal size, 5-17 mm long; corolla conspicuous, rose to purple, the throat with purple spot, 5-13 mm long. Capsules ovoid, 6-7 mm long, short-villous. Seeds 4, discoid. Native to tropical areas worldwide; in Hawai'i naturalized on O'ahu; in moist disturbed areas, at least on Kaua'i and O'ahu, but perhaps more widespread. Popenoe (192