```
Notes for Mahdi: 

1. This notebook extracts plant trait data in a JSON format from Wagner passage transcriptions
2. The version of python I am using = Python 3.10.13
3. The goal of this notebook is to extract data using 3 LLMs (gpt-4o, o3-mini and claude-sonnet-3.5)
4. The end of the notebook includes the validation agent, which compares automated extractions to manual extractions

```

In [19]:
## General Packages
import pickle as pkl
from collections import defaultdict
import pandas as pd
import numpy as np
import glob
import tqdm
import ast
import re

## Data Extraction Related Packages
from openai import OpenAI
from enum import Enum
from typing import Optional, List
import json
import asyncio

## Pydantic Packages
from pydantic import BaseModel, Field
from openai.lib._pydantic import to_strict_json_schema
from pydantic_ai import Agent
from openai.types.chat.chat_completion_content_part_param import (
    ChatCompletionContentPartTextParam,
    ChatCompletionContentPartImageParam
)
from openai.types.chat.chat_completion_content_part_image_param import (
    ImageURL
)

## To import API keys
import sys
import os
# NOTE(review): absolute, user-specific path. The notebook only runs on a machine
# where this directory exists and contains api_keys.py; consider making it configurable.
sys.path.append('/Users/williamharrigan/Desktop/')

# Script containing API keys
# The module is never referenced by name in the cells visible here; presumably it is
# imported for its side effects (e.g. exporting keys to the environment) — TODO confirm.
import api_keys


## Set Directories 
# Input and output currently point at the same folder: transcriptions are read from
# data_dir and the per-model output paths are built from output_dir.
data_dir = './data_files'
output_dir = './data_files'

In [20]:
## Load Transcriptions
# Reads the pickled dict that maps a species id to its transcription text.
# NOTE(review): unpickling can execute arbitrary code; only load a .pkl from a trusted source.
with open(f'{data_dir}/wagner_transcriptions.pkl', 'rb') as file: # This pkl only contains 1 transcription
    transcription_dict = pkl.load(file)

# Dumps the entire dict to the cell output; with the full transcription set this
# output would become very large.
print(transcription_dict)


## Load transcriptions from complete transcription
# file_path = f'{data_dir}/transcriptions_for_wagner.txt'

# transcription_dict = {}
# species_id = None
# collecting = False
# lines_buffer = []

# with open(file_path, 'r') as file:
#     for line in file:
#         line = line.strip()
        
#         if len(line.split('_')) == 3:
#             species_id = line
#             lines_buffer = []
#             collecting = False

#         elif line.startswith('Transcription:'):
#             collecting = True
#             lines_buffer = [line]

#         elif line.startswith('Confidence:'):
#             collecting = False
#             confidence = int(line.split(':')[1].strip('\t'))
#             # print(confidence)
#             if species_id and lines_buffer:
#                 # Join lines into a single string with newlines preserved
#                 transcription_dict[species_id] = (''.join(lines_buffer), confidence)

#         elif collecting:
#             lines_buffer.append(line)


{'Acanthaceae_Dicliptera_chinensis': "---\n\n171\n\n1. Dicliptera chinensis (L.) Juss.  \n   [Justicia chinensis L.]  \n   (nat)\n\n   Sprawling or decumbent perennial herbs; stems 2-7 dm long. Leaves green, lower surface slightly paler, ovate, 2.5-13.5 cm long, sparsely strigillose, especially on the veins, cystoliths prominent on upper surface as white raised streaks the size of the larger petals, petioles 1-3.5 cm long. Flowers in axillary cymes, each one subtended by 2 green, ovate bracts of unequal size, the larger one ca. 12-14 mm long, the smaller one ca. 8-9 mm long, all bracts short-villous especially along the margins, the veins inconspicuous, pedicels 0.7-1 mm long; calyx lobes of unequal size, 5-17 mm long, coriaceous in texture; corolla rose to purple, the throat with purple spots, 5-13 mm long. Capsules ovoid, short-villous. Seeds 4, discoid. Native to tropical areas worldwide; in Hawai'i naturalized on O'ahu; in moist disturbed areas, at least on Kaua'i and O'ahu, but pe

In [3]:
## Defining Classes for Data Extraction

# Broad plant group; this is the type of the HawaiianPlant.description field.
# NOTE(review): the values here are mixed-case ("Dicots"), whereas nearly every other
# enum in this file uses upper-case values, and the HawaiianPlant.description field
# text names them in upper case.
class Description(str, Enum):
    DICOTS = "Dicots"
    MONOCOTS = "Monocots"
    CONIFERS = "Conifers"
    FERNS = "Ferns and fern allies"

# Allowed values for HawaiianPlant.stem_hair_type.
# The member list is identical to LeafHairType; the two are kept as separate classes.
class StemHairType(str, Enum):
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"
    HIRSUTE = "HIRSUTE"
    HISPID = "HISPID"
    LEPIDOTE = "LEPIDOTE"
    PILOSE = "PILOSE"
    PUBERULENT = "PUBERULENT"
    STRIGOSE = "STRIGOSE"
    STELLATE = "STELLATE"
    TOMENTOSE = "TOMENTOSE"
    VILLOUS = "VILLOUS"
    GLAUCOUS = "GLAUCOUS"
    
# Allowed values for HawaiianPlant.fruit_type.
class FruitType(str, Enum):
    ACHENE = "ACHENE"
    AGGREGATE = "AGGREGATE"
    ARTICLE = "ARTICLE"
    BERRY = "BERRY"
    CAPSULE = "CAPSULE"
    CARYOPSIS = "CARYOPSIS"
    DRUPE = "DRUPE"
    FOLLICLE = "FOLLICLE"
    LEGUME = "LEGUME"
    MERICARP = "MERICARP"
    MULTIPLE = "MULTIPLE"
    NUT = "NUT"
    PEPO = "PEPO"
    POME = "POME"
    SCHIZOCARP = "SCHIZOCARP"
    SILICLE = "SILICLE"
    SILIQUE = "SILIQUE"
    SYCONIUM = "SYCONIUM"
    
# Allowed leaf-shape values; used by HawaiianPlant.leaf_shape_type,
# juvenile_leaf_shape_type and leaflets_leaf_shape_type.
# Members are not alphabetised; their order is left as written.
class LeafShapeType(str, Enum):
    ACEROSE = "ACEROSE"
    AWL_SHAPED = "AWL_SHAPED"
    GLADIATE = "GLADIATE"
    HASTATE = "HASTATE"
    CORDATE = "CORDATE"
    DELTOID = "DELTOID"
    LANCEOLATE = "LANCEOLATE"
    LINEAR = "LINEAR"
    ELLIPTIC = "ELLIPTIC"
    ENSIFORM = "ENSIFORM"
    LYRATE = "LYRATE"
    OBCORDATE = "OBCORDATE"
    FALCATE = "FALCATE"
    FLABELLATE = "FLABELLATE"
    OBDELTOID = "OBDELTOID"
    OBELLIPTIC = "OBELLIPTIC"
    OBLANCEOLATE = "OBLANCEOLATE"
    OBLONG = "OBLONG"
    PERFOLIATE = "PERFOLIATE"
    QUADRATE = "QUADRATE"
    OBOVATE = "OBOVATE"
    ORBICULAR = "ORBICULAR"
    RENIFORM = "RENIFORM"
    RHOMBIC = "RHOMBIC"
    OVAL = "OVAL"
    OVATE = "OVATE"
    ROTUND = "ROTUND"
    SAGITTATE = "SAGITTATE"
    PANDURATE = "PANDURATE"
    PELTATE = "PELTATE"
    SPATULATE = "SPATULATE"
    SUBULATE = "SUBULATE"

# Two-member subset of the values in StemHairType.
# NOTE(review): not referenced by any HawaiianPlant field in the visible cells
# (stem_hair_type uses StemHairType) — possibly a leftover; confirm before removing.
class StemHairs(str, Enum):
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"

# Allowed values for HawaiianPlant.phyllotaxy_type (arrangement of leaves around the stem).
class PhyllotaxyType(str, Enum):
    ALTERNATE = "ALTERNATE"
    OPPOSITE = "OPPOSITE"
    WHORLED = "WHORLED"
    DECUSSATE = "DECUSSATE"
    DISTICHOUS = "DISTICHOUS"
    EQUITANT = "EQUITANT"
    TERNATE = "TERNATE"
    CAULINE = "CAULINE"

# Allowed values for HawaiianPlant.inflorescence_type.
# NOTE(review): VERTISCILLATE looks like a misspelling of "VERTICILLATE", and
# LANCEOLATE / GLOBOSE read as shape terms rather than inflorescence types.
# Left unchanged because these strings are emitted in the extracted output — confirm
# against the manual-extraction vocabulary before editing.
class InflorescenceType(str, Enum):
    CATKIN = "CATKIN"
    CYME = "CYME"
    HEAD = "HEAD"
    PANICLE = "PANICLE"
    RACEME = "RACEME"
    SPATHE_SPADIX = "SPATHE_SPADIX"
    THYRSE = "THYRSE"
    UMBEL = "UMBEL"
    VERTISCILLATE = "VERTISCILLATE"
    SOLITARY = "SOLITARY"
    SPIKE = "SPIKE"
    LANCEOLATE = "LANCEOLATE"
    GLOBOSE = "GLOBOSE"
    INVOLUCRE = "INVOLUCRE"
    CORYMBOSE = "CORYMBOSE"
    STROBILOID = "STROBILOID"

# Allowed leaf-hair values; used by HawaiianPlant.leaf_hair_type, leaf_hair_upper_type,
# leaf_hair_lower_type and juvenile_leaf_hair_type.
# The member list is identical to StemHairType; the two are kept as separate classes.
class LeafHairType(str, Enum):
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"
    HIRSUTE = "HIRSUTE"
    HISPID = "HISPID"
    LEPIDOTE = "LEPIDOTE"
    PILOSE = "PILOSE"
    PUBERULENT = "PUBERULENT"
    STRIGOSE = "STRIGOSE"
    STELLATE = "STELLATE"
    TOMENTOSE = "TOMENTOSE"
    VILLOUS = "VILLOUS"
    GLAUCOUS = "GLAUCOUS"

# Simple vs. compound leaf; used by HawaiianPlant.leaf_type, juvenile_leaf_type
# and leaflets_leaf_type.
class LeafType(str, Enum):
    SIMPLE = "SIMPLE"
    COMPOUND = "COMPOUND"

# Allowed values for HawaiianPlant.breeding_type (plant reproductive class).
# Member names use underscores while the POLYGAMO_* values use hyphens; the value
# string is what appears in the extracted output.
class BreedingType(str, Enum):
    MONOECIOUS = "MONOECIOUS"  # Male and female flowers on the same plant
    ANDROMONOECIOUS = "ANDROMONOECIOUS"
    CHASMOGAMOUS = "CHASMOGAMOUS"
    DIOECIOUS = "DIOECIOUS"  # Separate male and female plants
    GYNODIOECIOUS = "GYNODIOECIOUS"
    POLYGAMO_MONOECIOUS = "POLYGAMO-MONOECIOUS"
    POLYGAMOUS = "POLYGAMOUS"
    POLYGAMO_DIOECIOUS = "POLYGAMO-DIOECIOUS"
    GYNOMONOECIOUS = "GYNOMONOECIOUS"
    STERILE = "STERILE"

# Allowed leaf-margin values; used by HawaiianPlant.leaf_margin_type,
# juvenile_leaf_margin_type and leaflets_leaf_margin_type.
# NOTE(review): NOTEETH has the value "NO TEETH" (with a space), unlike its member name;
# ENTIRE and NOTEETH may overlap in meaning — confirm the intended distinction.
class LeafMarginType(str, Enum):
    TEETH = "TEETH"
    LOBED = "LOBED"
    ENTIRE = "ENTIRE"
    NOTEETH = "NO TEETH"


# Allowed values for HawaiianPlant.life_form_type (growth habit or life form).
class LifeFormType(str, Enum):
    ANNUAL_HERB = "ANNUAL_HERB"
    PERENNIAL_HERB = "PERENNIAL_HERB"
    EPIPHYTE = "EPIPHYTE"
    VINE = "VINE"
    SHRUB = "SHRUB"
    TREE = "TREE"

# Allowed corolla descriptors; used by HawaiianPlant.corolla_type,
# staminate_corolla_type and pistillate_corolla_type.
# NOTE(review): LOBBED and FASICLE look like misspellings of "LOBED" and "FASCICLE".
# They are left unchanged because the value strings are emitted in the extracted
# output — confirm against the manual-extraction vocabulary before renaming.
class CorollaType(str, Enum):
    ADNATE = "ADNATE"
    BILABIATE = "BILABIATE"
    CAMPANULATE = "CAMPANULATE"
    CORYMBOSE = "CORYMBOSE"
    CONVOLUTE = "CONVOLUTE"
    CORONA = "CORONA"
    CUNEATE = "CUNEATE"
    CYLINDRICAL = "CYLINDRICAL"
    DISK = "DISK"
    DELTATE = "DELTATE"
    ELLIPTIC = "ELLIPTIC"
    FUNNELFORM = "FUNNELFORM"
    FLABELLATE = "FLABELLATE"
    FILIFORM = "FILIFORM"
    HOOD = "HOOD"
    IRREGULAR = "IRREGULAR"
    KEEL = "KEEL"
    LABELLUM = "LABELLUM"
    LANCEOLATE = "LANCEOLATE"
    LINEAR = "LINEAR"
    LIPPED = "LIPPED"
    LOBBED = "LOBBED"
    OVATE = "OVATE"
    OBLONG = "OBLONG"
    OBCORDATE = "OBCORDATE"
    OBOVATE = "OBOVATE"
    OBLANCEOLATE = "OBLANCEOLATE"
    ORBICULAR = "ORBICULAR"
    PALATE = "PALATE"
    PSEUDORACEMES = "PSEUDORACEMES"
    ROTATE = "ROTATE"
    RAY = "RAY"
    REFLEXED = "REFLEXED"
    RHOMBIC = "RHOMBIC"
    RENIFORM = "RENIFORM"
    SALVERFORM = "SALVERFORM"
    SUBORBICULAR = "SUBORBICULAR"
    SUBRHOMBIC = "SUBRHOMBIC"
    SPUR = "SPUR"
    SPATULATE = "SPATULATE"
    SPICATE = "SPICATE"
    SUBROTATE = "SUBROTATE"
    STANDARD = "STANDARD"
    TUBULAR = "TUBULAR"
    TRIANGULAR = "TRIANGULAR"
    URCEOLATE = "URCEOLATE"
    UNILABIATE = "UNILABIATE"
    VALVATE = "VALVATE"
    VERTICIL = "VERTICIL"
    ZYGOMORPHIC = "ZYGOMORPHIC"
    CUP = "CUP"
    UNGUICULATE = "UNGUICULATE"
    CLAW = "CLAW"
    FASICLE = "FASICLE"
    STELLATE = "STELLATE"
    SUBPANICULATE = "SUBPANICULATE"
    PENTAGONAL = "PENTAGONAL"

# Allowed values for HawaiianPlant.origin.
# POLYNESIAN_INTRODUCTION's value contains a space rather than an underscore.
class OriginType(str, Enum):
    NATURALIZED = "NATURALIZED"
    INDIGENOUS = "INDIGENOUS"
    ENDEMIC = "ENDEMIC"
    POLYNESIAN_INTRODUCTION = "POLYNESIAN INTRODUCTION"

# Allowed values for HawaiianPlant.island_type (islands where the plant is found).
# Island names are spelled without the ʻokina; ALL_ISLANDS' value contains a space.
class Location(str, Enum):
    HAWAII = "HAWAII"
    MAUI = "MAUI"
    KAHOOLAWE = "KAHOOLAWE"
    MOLOKAI = "MOLOKAI"
    LANAI = "LANAI"
    OAHU = "OAHU"
    KAUAI = "KAUAI"
    NIIHAU = "NIIHAU"
    ALL_ISLANDS = "ALL ISLANDS"

# Federal conservation status values.
# Currently only referenced by the commented-out HawaiianPlant.fed_status field,
# so it does not take part in extraction in the visible cells.
class FederalStatusType(str, Enum):
    SPECIES_OF_CONCERN = "SPECIES_OF_CONCERN"
    ENDANGERED = "ENDANGERED"
    THREATENED = "THREATENED"
    WITHDRAWN = "WITHDRAWN" 
    
# Allowed values for HawaiianPlant.status (general status).
# NOTE(review): SOC's value is the mixed-case phrase "Species of concern", unlike the
# upper-case values of its siblings (and unlike FederalStatusType.SPECIES_OF_CONCERN).
# NATURALIZED and ENDEMIC also appear in OriginType.
class StatusType(str, Enum):
    NATURALIZED = "NATURALIZED"
    ENDEMIC = "ENDEMIC"
    RARE = "RARE"
    SECURE = "SECURE"
    VULNERABLE = "VULNERABLE"
    SOC = "Species of concern"

# Numeric range container shared by every length/width/diameter/count field of
# HawaiianPlant. All four values are optional and default to None; the unit is not
# stored here but stated in the description of each HawaiianPlant field.
class Measurements(BaseModel):
    # Lower and upper bound of the reported range.
    min: Optional[float] = None
    max: Optional[float] = None
    # Presumably the exceptional values a description gives outside the typical
    # range (often printed in parentheses) — TODO confirm with the extraction guidelines.
    extreme_min: Optional[float] = None
    extreme_max: Optional[float] = None

class HawaiianPlant(BaseModel):
    # Structured-output schema handed to every extraction agent (as result_type).
    # Each Field description is part of the schema the models see, so its wording and
    # units directly steer what gets extracted. This summary is a comment rather than
    # a docstring on purpose, so that the generated schema itself is left unchanged.
    #
    # Each attribute is declared exactly once. Earlier revisions declared the fruit_*
    # and corolla_length/width fields twice; in a class body the later declaration
    # silently wins, so the surviving (later) descriptions are the ones kept here.
    #
    # NOTE(review): a few attribute names contain typos (pistilate_involucre_length,
    # male_corrola_lobes_length, male_corrola_lobes_width). They are kept as-is
    # because they become keys/columns of the extracted output.

    # Basic Information
    family: str = Field(..., description="Plant family name (should only be 1 object)")
    genus: str = Field(..., description="Plant genus name (should only be 1 object)")
    species: str = Field(..., description="Plant species name (should only be 1 object)")
    common_name: Optional[str] = Field(None, description="Common name of the plant")
    wagner_pg_number: Optional[str] = Field(None, description="Wagner book reference number")
    description: Optional[Description] = Field(None, description="Take knowledge from outside the passage to infer whether the plant is DICOTS, MONOCOTS, CONIFERS or FERNS")
    # NOTE(review): this field is required, so a value must be produced even for taxa
    # that have no infraspecific rank — confirm whether it should be Optional.
    infraspecific_epithet: str = Field(..., description="The third word in the scientific name of an infraspecific taxon, following the name of the species. This applies only to formal names of plants and fungi, and not to the formal names of bacteria or animals. In the name Cannabis sativa subsp. indica, the word indica is the infraspecific epithet.")

    hawaiian_name: Optional[List[str]] = Field(None, description="List of Hawaiian names")

    stem_hair_type: Optional[List[StemHairType]] = Field(None, description="Type of hair on stem")

    phyllotaxy_type: Optional[List[PhyllotaxyType]] = Field(None, description="The arrangement of leaves around the stem.")

    leaf_hair_description: Optional[str] = Field(None, description="Description of leaf hair.")
    leaf_hair_upper_description: Optional[str] = Field(None, description="Description of Upper leaf hairs.")
    leaf_hair_lower_description: Optional[str] = Field(None, description="Description of Lower leaf hairs.")

    breeding_type: Optional[List[BreedingType]] = Field(None, description="Plant reproductive class.")

    inflorescence_type: Optional[List[InflorescenceType]] = Field(None, description="In a flowering plant, a cluster of flowers on a branch or a system of branches")

    ray_color: Optional[str] = Field(None, description="Color of ray")
    floret_color: Optional[str] = Field(None, description="Color of florets")
    spathe_color: Optional[str] = Field(None, description="Color of spathe")
    perianth_outer_color: Optional[str] = Field(None, description="Color of perianth outer flower")
    perianth_inner_color: Optional[str] = Field(None, description="Color of perianth inner flower")
    perianth_color: Optional[str] = Field(None, description="Color of perianth")
    labellum_color: Optional[str] = Field(None, description="Color of labellum")

    corolla_type: Optional[List[CorollaType]] = Field(None, description="Type of corolla")
    corolla_color: Optional[str] = Field(None, description="Color of corolla")
    staminate_corolla_type: Optional[List[CorollaType]] = Field(None, description="Type of staminate corolla")
    pistillate_corolla_type: Optional[List[CorollaType]] = Field(None, description="Type of pistillate corolla")

    fruit_type: Optional[List[FruitType]] = Field(None, description="Type of fruit")
    # These three were previously declared twice (millimeters here, centimeters near
    # the end of the class). The later "centimeters" declaration was the effective
    # one, so it is the one kept. NOTE(review): confirm centimeters is the intended unit.
    fruit_length: Optional[Measurements] = Field(None, description="Fruit length in centimeters")
    fruit_width: Optional[Measurements] = Field(None, description="Fruit width in centimeters")
    fruit_diameter: Optional[Measurements] = Field(None, description="Fruit diameter in centimeters")

    # NOTE(review): bare `list` leaves the element type unconstrained for these two fields.
    ploidy: Optional[list] = Field(None, description="Ploidy level expressed as a function of n (e.g., 1n, 2n or 3n, etc..)")
    chromosome_number: Optional[list] = Field(None, description="The integer Number of chromosomes")
    average_chromosome_number: Optional[float] = Field(None, description="Average chromosome number")

    origin: Optional[List[OriginType]] = Field(None, description="Origin type of the plant")
    # fed_status: Optional[FederalStatusType] = Field(None, description="Federal conservation status")
    status: Optional[List[StatusType]] = Field(None, description="General status")

    life_form_type: Optional[List[LifeFormType]] = Field(None, description="Growth habit or life form")
    leaf_type: Optional[List[LeafType]] = Field(None, description="Simple or compound leaf type")
    leaf_shape_type: Optional[List[LeafShapeType]] = Field(None, description="Shape of leaves")
    leaf_margin_type: Optional[List[LeafMarginType]] = Field(None, description="Type of leaf margin")

    juvenile_leaf_type: Optional[List[LeafType]] = Field(None, description="Simple or compound juvenile leaf type")
    juvenile_leaf_shape_type: Optional[List[LeafShapeType]] = Field(None, description="Shape of juvenile leaves")
    juvenile_leaf_margin_type: Optional[List[LeafMarginType]] = Field(None, description="Type of leaf margin on juvenile plants")

    # Descriptions below used to be copies of the juvenile_* ones; they now describe leaflets.
    leaflets_leaf_type: Optional[List[LeafType]] = Field(None, description="Simple or compound leaf type of the leaflets")
    leaflets_leaf_shape_type: Optional[List[LeafShapeType]] = Field(None, description="Shape of leaflets")
    leaflets_leaf_margin_type: Optional[List[LeafMarginType]] = Field(None, description="Type of leaf margin on leaflets")

    leaf_hair_upper_type: Optional[List[LeafHairType]] = Field(None, description="Type of upper leaf hairs")
    leaf_hair_lower_type: Optional[List[LeafHairType]] = Field(None, description="Type of lower leaf hairs")
    leaf_hair_type: Optional[List[LeafHairType]] = Field(None, description="Type of leaf hairs")
    juvenile_leaf_hair_type: Optional[List[LeafHairType]] = Field(None, description="Type of juvenile leaf hairs")

    island_type: Optional[List[Location]] = Field(None, description="Islands where the plant is found.")

    stem_height: Optional[Measurements] = Field(None, description="Stem or general plant height measurements in meters")
    leaf_length: Optional[Measurements] = Field(None, description="Length of leaves in millimeters")
    leaf_width: Optional[Measurements] = Field(None, description="Width of leaves in millimeters")
    juvenile_leaf_length: Optional[Measurements] = Field(None, description="Juvenile length of leaves in millimeters")
    juvenile_leaf_width: Optional[Measurements] = Field(None, description="Juvenile width of leaves in millimeters")
    leaflets_leaf_length: Optional[Measurements] = Field(None, description="Leaflets length of leaves in millimeters")
    leaflets_leaf_width: Optional[Measurements] = Field(None, description="Leaflets width of leaves in millimeters")

    petioles: Optional[Measurements] = Field(None, description="Length of petiole stalk in centimeters")
    # The pistillate descriptions previously omitted the unit that their staminate
    # counterparts state; they now match.
    staminate_inflorescence_length: Optional[Measurements] = Field(None, description="The measured length in millimeters of the male (pollen-producing) flower cluster. This specifically refers to catkins or other inflorescences containing only staminate (male) flowers.")
    pistillate_inflorescence_length: Optional[Measurements] = Field(None, description="The measured length in millimeters of the female (seed-producing) flower cluster. This specifically refers to inflorescences containing only pistillate (female) flowers.")
    staminate_inflorescence_width: Optional[Measurements] = Field(None, description="The measured width in millimeters of the male (pollen-producing) flower cluster. This specifically refers to catkins or other inflorescences containing only staminate (male) flowers.")
    pistillate_inflorescence_width: Optional[Measurements] = Field(None, description="The measured width in millimeters of the female (seed-producing) flower cluster. This specifically refers to inflorescences containing only pistillate (female) flowers.")

    inflorescence_flower_length: Optional[Measurements] = Field(None, description="The length of an inflorescence flower in millimeters.")
    inflorescence_flower_width: Optional[Measurements] = Field(None, description="The width of an inflorescence flower in millimeters.")

    flower_length: Optional[Measurements] = Field(None, description="Flower length in centimeters")
    flower_width: Optional[Measurements] = Field(None, description="Flower width in centimeters")

    rachis_length: Optional[Measurements] = Field(None, description="Rachis length in millimeters")
    rachis_diameter: Optional[Measurements] = Field(None, description="Rachis diameter in millimeters")

    head_length: Optional[Measurements] = Field(None, description="The measured length of the capitulum (flower head) in millimeters.")
    head_diameter: Optional[Measurements] = Field(None, description="The measured diameters of the capitulum (flower head) in millimeters.")

    bur_length: Optional[Measurements] = Field(None, description="The measured length of the bur in millimeters.")
    tepal_length: Optional[Measurements] = Field(None, description="The measured length of the tepal in millimeters.")
    staminate_tepal_length: Optional[Measurements] = Field(None, description="The measured length of the staminate tepal in millimeters.")
    pistillate_tepal_length: Optional[Measurements] = Field(None, description="The measured length of the pistillate tepal in millimeters.")

    ray_length: Optional[Measurements] = Field(None, description="The measured length of the ray in millimeters.")
    ray_width: Optional[Measurements] = Field(None, description="The measured width of the ray in millimeters.")

    florets_length: Optional[Measurements] = Field(None, description="The measured length of the florets in millimeters.")

    involucre_length: Optional[Measurements] = Field(None, description="Involucre length in millimeters")
    involucre_width: Optional[Measurements] = Field(None, description="Involucre width in millimeters")
    staminate_involucre_length: Optional[Measurements] = Field(None, description="Staminate involucre length in millimeters")
    pistilate_involucre_length: Optional[Measurements] = Field(None, description="Pistillate involucre length in millimeters")

    bract_length: Optional[Measurements] = Field(None, description="Bract length in millimeters")
    bract_width: Optional[Measurements] = Field(None, description="Bract width in millimeters")
    bract_lower_length: Optional[Measurements] = Field(None, description="Lower bract length in millimeters")
    bract_outer_length: Optional[Measurements] = Field(None, description="Outer bract length in millimeters")

    bracteoles_length: Optional[Measurements] = Field(None, description="Bracteoles length in millimeters")
    bracteoles_width: Optional[Measurements] = Field(None, description="Bracteole width in millimeters")

    pedicel_length: Optional[Measurements] = Field(None, description="Pedicel length in millimeters")
    pedicel_width: Optional[Measurements] = Field(None, description="Pedicel width in millimeters")
    staminate_pedicel_length: Optional[Measurements] = Field(None, description="Staminate pedicel length in millimeters")
    pistillate_pedicel_length: Optional[Measurements] = Field(None, description="Pistillate pedicel length in millimeters")
    staminate_pedicel_width: Optional[Measurements] = Field(None, description="Staminate pedicel width in millimeters")
    pistillate_pedicel_width: Optional[Measurements] = Field(None, description="Pistillate pedicel width in millimeters")

    hypanthium_length: Optional[Measurements] = Field(None, description="Hypanthium length in millimeters")
    hypanthium_width: Optional[Measurements] = Field(None, description="Hypanthium width in millimeters")

    peduncle_length: Optional[Measurements] = Field(None, description="Peduncle length in millimeters")
    peduncle_width: Optional[Measurements] = Field(None, description="Peduncle width in millimeters")
    staminate_peduncle_length: Optional[Measurements] = Field(None, description="Staminate peduncle length in millimeters")
    staminate_peduncle_width: Optional[Measurements] = Field(None, description="Staminate peduncle width in millimeters")
    pistillate_peduncle_length: Optional[Measurements] = Field(None, description="Pistillate peduncle length in millimeters")
    pistillate_peduncle_width: Optional[Measurements] = Field(None, description="Pistillate peduncle width in millimeters")

    spathe_width: Optional[Measurements] = Field(None, description="Spathe width dimensions in millimeters")
    spathe_length: Optional[Measurements] = Field(None, description="Spathe length dimensions in millimeters")
    spadix_length: Optional[Measurements] = Field(None, description="Spadix length dimensions in millimeters")

    perianth_width: Optional[Measurements] = Field(None, description="Perianth width dimensions in millimeters")
    perianth_length: Optional[Measurements] = Field(None, description="Perianth length dimensions in millimeters")
    perianth_outer_width: Optional[Measurements] = Field(None, description="Outer perianth width dimensions in millimeters")
    perianth_outer_length: Optional[Measurements] = Field(None, description="Outer perianth length dimensions in millimeters")
    perianth_inner_width: Optional[Measurements] = Field(None, description="Inner perianth width dimensions in millimeters")
    perianth_inner_length: Optional[Measurements] = Field(None, description="Inner perianth length dimensions in millimeters")

    perianth_lobes_width: Optional[Measurements] = Field(None, description="Perianth lobes width dimensions in millimeters")
    perianth_lobes_length: Optional[Measurements] = Field(None, description="Perianth lobes length dimensions in millimeters")
    perianth_tube_length: Optional[Measurements] = Field(None, description="Perianth tube length dimensions in millimeters")
    pistillate_perianth_tube_length: Optional[Measurements] = Field(None, description="Pistillate perianth tube length dimensions in millimeters")
    staminate_perianth_tube_length: Optional[Measurements] = Field(None, description="Staminate perianth tube length dimensions in millimeters")

    pappus_length: Optional[Measurements] = Field(None, description="Pappus length in millimeters")
    umbellet_length: Optional[Measurements] = Field(None, description="Umbellet length in millimeters")
    labellum_width: Optional[Measurements] = Field(None, description="Labellum width dimensions in millimeters")
    labellum_length: Optional[Measurements] = Field(None, description="Labellum length dimensions in millimeters")

    calyx_length: Optional[Measurements] = Field(None, description="Calyx length in millimeters")
    calyx_width: Optional[Measurements] = Field(None, description="Calyx width in millimeters")
    calyx_teeth_length: Optional[Measurements] = Field(None, description="Calyx teeth length in millimeters")
    calyx_teeth_width: Optional[Measurements] = Field(None, description="Calyx teeth width in millimeters")
    calyx_lobes_length: Optional[Measurements] = Field(None, description="Calyx lobe length in millimeters")
    calyx_lobes_width: Optional[Measurements] = Field(None, description="Calyx lobe width in millimeters")

    upper_calyx_length: Optional[Measurements] = Field(None, description="Upper calyx length in millimeters")
    lower_calyx_length: Optional[Measurements] = Field(None, description="Lower calyx length in millimeters")

    inner_calyx_lobes_length: Optional[Measurements] = Field(None, description="Inner calyx lobes length in millimeters")
    inner_calyx_lobes_width: Optional[Measurements] = Field(None, description="Inner calyx lobes width in millimeters")
    outer_calyx_lobes_length: Optional[Measurements] = Field(None, description="Outer calyx lobes length in millimeters")
    outer_calyx_lobes_width: Optional[Measurements] = Field(None, description="Outer calyx lobes width in millimeters")

    calyx_tube_length: Optional[Measurements] = Field(None, description="Calyx tube length in millimeters")
    calyx_tube_width: Optional[Measurements] = Field(None, description="Calyx tube width in millimeters")

    male_calyx_length: Optional[Measurements] = Field(None, description="Male calyx length in millimeters")
    male_calyx_width: Optional[Measurements] = Field(None, description="Male calyx width in millimeters")

    # The *_calyx_lobes_* descriptions previously repeated the whole-calyx wording,
    # making them indistinguishable from the *_calyx_length/width fields above.
    male_calyx_lobes_length: Optional[Measurements] = Field(None, description="Male calyx lobes length in millimeters")
    male_calyx_lobes_width: Optional[Measurements] = Field(None, description="Male calyx lobes width in millimeters")

    female_calyx_length: Optional[Measurements] = Field(None, description="female calyx length in millimeters")
    female_calyx_width: Optional[Measurements] = Field(None, description="female calyx width in millimeters")

    female_calyx_lobes_length: Optional[Measurements] = Field(None, description="Female calyx lobes length in millimeters")
    female_calyx_lobes_width: Optional[Measurements] = Field(None, description="Female calyx lobes width in millimeters")

    male_calyx_lobes_length_inner: Optional[Measurements] = Field(None, description="Male calyx inner lobe length in millimeters")
    male_calyx_lobes_length_outer: Optional[Measurements] = Field(None, description="Male calyx outer lobe length in millimeters")
    female_calyx_lobes_length_inner: Optional[Measurements] = Field(None, description="Female calyx inner lobe length in millimeters")
    female_calyx_lobes_length_outer: Optional[Measurements] = Field(None, description="Female calyx outer lobe length in millimeters")

    male_calyx_lobes_width_outer: Optional[Measurements] = Field(None, description="Male calyx outer lobe width in millimeters")
    male_calyx_tube_length: Optional[Measurements] = Field(None, description="Male calyx tube length in millimeters")

    female_calyx_lobes_width_inner: Optional[Measurements] = Field(None, description="Female calyx inner lobe width in millimeters")
    female_calyx_lobes_width_outer: Optional[Measurements] = Field(None, description="Female calyx outer lobe width in millimeters")
    female_calyx_tube_length: Optional[Measurements] = Field(None, description="Female calyx tube length in millimeters")

    inner_calyx_length: Optional[Measurements] = Field(None, description="Inner calyx length in millimeters")
    outer_calyx_length: Optional[Measurements] = Field(None, description="Outer calyx length in millimeters")

    # corolla_length / corolla_width were declared twice with identical text; one copy kept.
    corolla_length: Optional[Measurements] = Field(None, description="Corolla length in millimeters")
    corolla_width: Optional[Measurements] = Field(None, description="Corolla width in millimeters")
    corolla_tube_length: Optional[Measurements] = Field(None, description="Corolla tube length in millimeters")
    corolla_tube_width: Optional[Measurements] = Field(None, description="Corolla tube width in millimeters")
    corolla_lobes_length: Optional[Measurements] = Field(None, description="Corolla lobes length in millimeters")
    corolla_lobes_width: Optional[Measurements] = Field(None, description="Corolla lobes width in millimeters")

    upper_corolla: Optional[Measurements] = Field(None, description="Upper corolla length in millimeters")
    lower_corolla: Optional[Measurements] = Field(None, description="Lower corolla length in millimeters")
    upper_corolla_lobes_length: Optional[Measurements] = Field(None, description="Upper corolla lobes length in millimeters")
    lower_corolla_lobes_length: Optional[Measurements] = Field(None, description="Lower corolla lobes length in millimeters")

    corolla_lip: Optional[Measurements] = Field(None, description="Corolla lip length in millimeters")

    staminate_corolla_length: Optional[Measurements] = Field(None, description="Staminate corolla length in millimeters")
    pistillate_corolla_length: Optional[Measurements] = Field(None, description="Pistillate corolla length in millimeters")

    staminate_corolla_tube_length: Optional[Measurements] = Field(None, description="Staminate corolla tube length in millimeters")
    pistillate_corolla_tube_length: Optional[Measurements] = Field(None, description="Pistillate corolla tube length in millimeters")

    # Width fields: descriptions previously said "length" (copy-paste from the pair above).
    staminate_corolla_tube_width: Optional[Measurements] = Field(None, description="Staminate corolla tube width in millimeters")
    pistillate_corolla_tube_width: Optional[Measurements] = Field(None, description="Pistillate corolla tube width in millimeters")

    female_corolla_lobes_length: Optional[Measurements] = Field(None, description="Female corolla lobes length in millimeters")
    female_corolla_lobes_width: Optional[Measurements] = Field(None, description="Female corolla lobes width in millimeters")
    male_corrola_lobes_length: Optional[Measurements] = Field(None, description="Male corolla lobes length in millimeters")
    male_corrola_lobes_width: Optional[Measurements] = Field(None, description="Male corolla lobes width in millimeters")

    # A count rather than a length; it reuses Measurements for its min/max range.
    seeds_perfruit: Optional[Measurements] = Field(None, description="Number of seeds per fruit")

    seed_length: Optional[Measurements] = Field(None, description="Seed length in centimeters")
    seed_width: Optional[Measurements] = Field(None, description="Seed width in centimeters")
    seed_diameter: Optional[Measurements] = Field(None, description="Seed diameter in centimeters")


In [10]:
## Set up parameters and prompts for LLM data extraction using gpt-4o, o3-mini and claude-sonnet

# Prompt
# A single system prompt shared by all three agents built in process_with_models, so
# every model receives identical instructions. The string starts with a newline and
# ends with a trailing space; both are part of what is sent.
system_prompt = """
You are an expert plant taxonomist. Please analyze the following text and extract any plant data that is present.
You are exhaustive; you include ALL the details mentioned in the HawaiianPlant schema. When extracting information, ensure that you return a structured response in JSON format matching the `HawaiianPlant` schema. """

# Agent parameters/configurations
async def process_with_models(extracted_text):
    """Extract plant traits from one transcription with all three LLMs.

    One ``Agent`` is built per model (gpt-4o, claude-3-5-sonnet, o3-mini), each
    using the shared ``system_prompt`` and constrained to return a
    ``HawaiianPlant`` object. The three runs are independent, so they are
    awaited concurrently instead of one after another.

    Args:
        extracted_text: Transcription text sent to each agent as the user prompt.

    Returns:
        dict with the keys "gpt_4o_output", "sonnet_output" and
        "gpt_mini_output", each holding the ``.data`` of that agent's result.

    Raises:
        The first exception raised by any of the agent runs; no partial
        result is returned in that case.
    """
    gpt4o_agent = Agent(
        model="openai:gpt-4o",
        result_type=HawaiianPlant,  # HawaiianPlant is the JSON schema
        system_prompt=system_prompt,
    )

    # Only the Claude agent overrides the sampling temperature.
    claude_agent = Agent(
        model="anthropic:claude-3-5-sonnet-latest",
        result_type=HawaiianPlant,  # HawaiianPlant is the JSON schema
        system_prompt=system_prompt,
        model_settings={'temperature': 0.2}
    )

    mini_agent = Agent(
        model="openai:o3-mini",
        result_type=HawaiianPlant,  # HawaiianPlant is the JSON schema
        system_prompt=system_prompt,
    )

    # Extract data: gather() returns the results in the order the awaitables were passed.
    gpt4o_result, claude_result, o3_mini_result = await asyncio.gather(
        gpt4o_agent.run(extracted_text),
        claude_agent.run(extracted_text),
        mini_agent.run(extracted_text),
    )

    return {
        "gpt_4o_output": gpt4o_result.data,
        "sonnet_output": claude_result.data,
        "gpt_mini_output": o3_mini_result.data
    }

async def extract_plant_data(ocr_transcription):
    """Run every extraction model on one transcription.

    Thin wrapper that forwards ``ocr_transcription`` to ``process_with_models``
    and returns its result dict unchanged.
    """
    return await process_with_models(ocr_transcription)


In [12]:
gpt_4o_rows = []
sonnet_rows = []
gpt_mini_rows = []

gpt4o_path = f"{output_dir}/gpt_4o_output_man_transcriptions.csv"
sonnet_path = f"{output_dir}/sonnet_output_man_transcriptions.csv"
gptmini_path = f"{output_dir}/gpt_mini_output_man_transcriptions.csv"

## Iterate through transcriptions dictionary and extract plant data 
for species_id, transcription in tqdm.tqdm(transcription_dict.items()):
    try:
        results = await extract_plant_data(transcription)
    
    except:
        print('Crashed: ', species_id)
        continue
    
    gpt_4o_output = dict(results['gpt_4o_output'])
    sonnet_output = dict(results['sonnet_output'])
    gpt_mini_output = dict(results['gpt_mini_output'])
    
    gpt_4o_output["species_id"] = species_id
    sonnet_output["species_id"] = species_id
    gpt_mini_output["species_id"] = species_id

    # Append to CSV immediately
    pd.DataFrame([gpt_4o_output]).to_csv(gpt4o_path, mode='a', index=False, header=not os.path.exists(gpt4o_path))
    pd.DataFrame([sonnet_output]).to_csv(sonnet_path, mode='a', index=False, header=not os.path.exists(sonnet_path))
    pd.DataFrame([gpt_mini_output]).to_csv(gptmini_path, mode='a', index=False, header=not os.path.exists(gptmini_path))

    # Optionally store in memory too
    gpt_4o_rows.append(gpt_4o_output)
    sonnet_rows.append(sonnet_output)
    gpt_mini_rows.append(gpt_mini_output)

# Final DataFrames (if needed for further use)
gpt_4o_df = pd.DataFrame(gpt_4o_rows).set_index("species_id")
sonnet_df = pd.DataFrame(sonnet_rows).set_index("species_id")
gpt_mini_df = pd.DataFrame(gpt_mini_rows).set_index("species_id")

100%|██████████| 1/1 [01:26<00:00, 86.21s/it]


## Compare Extractions (Manual vs 3 LLMs)

In [27]:
## Import manually extracted data for comparison
# Read the manual extractions, build the "<family>_<genus>_<species>" key used to
# match rows against the model outputs, and drop the two 'Unnamed' columns
# (presumably index columns left over from earlier CSV saves).
manual_extracted_data_df = (
    pd.read_csv(f"{data_dir}/manual_wagner_extractions.csv")
    .assign(
        species_id=lambda frame: (
            frame['family'].astype(str) + '_'
            + frame['genus'].astype(str) + '_'
            + frame['species'].astype(str)
        )
    )
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)

manual_extracted_data_df.head()

Unnamed: 0,family,genus,species,common_name,wagner_pg_number,description,infraspecific_epithet,stem_hair_type,phyllotaxy_type,leaf_hair_description,...,male_corrola_lobes_length,male_corrola_lobes_width,fruit_length,fruit_width,fruit_diameter,seeds_perfruit,seed_length,seed_width,seed_diameter,species_id
0,Asteraceae,Ambrosia,artemisiifolia,common ragweed,pg 256-257,['DICOTS'],,['HIRSUTE'],"['OPPOSITE', 'ALTERNATE']",,...,,,,,,"{'exmin': None, 'min': nan, 'max': nan, 'exmax...",,,,Asteraceae_Ambrosia_artemisiifolia
1,Asteraceae,Dubautia,laxa,na`ena`e pua melemele,"pg 292-295,301",['DICOTS'],pseudoplantaginea,"['GLABROUS (SMOOTH)', 'HISPID']",['WHORLED'],glabrous,...,,,"{'exmin': nan, 'min': 2.0, 'max': 3.0, 'exmax'...",,,"{'exmin': None, 'min': nan, 'max': nan, 'exmax...",,,,Asteraceae_Dubautia_laxa
2,Asteraceae,Tetramolopium,filiforme,no common name,"pg 361-362, 365, 366",['DICOTS'],polyphyllum,['GLABROUS (SMOOTH)'],['ALTERNATE'],,...,,,"{'exmin': nan, 'min': 2.0, 'max': 2.7, 'exmax'...","{'exmin': nan, 'min': 0.6, 'max': nan, 'exmax'...",,"{'exmin': None, 'min': nan, 'max': nan, 'exmax...",,,,Asteraceae_Tetramolopium_filiforme
3,Asteraceae,Encelia,farinosa,brittle bush,pg 312-313,['DICOTS'],,['PUBERULENT'],['ALTERNATE'],densely white tomentose,...,,,"{'exmin': nan, 'min': 4.0, 'max': 4.5, 'exmax'...",,,"{'exmin': None, 'min': nan, 'max': nan, 'exmax...",,,,Asteraceae_Encelia_farinosa
4,Aristolochiaceae,Aristolochia,littoralis,calico flower,"pg 237-238,239",['DICOTS'],,['NAN'],['ALTERNATE'],,...,,,"{'exmin': nan, 'min': nan, 'max': 45.0, 'exmax...",,"{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': None, 'min': nan, 'max': nan, 'exmax...","{'exmin': nan, 'min': nan, 'max': 0.6, 'unit':...",,,Aristolochiaceae_Aristolochia_littoralis


In [22]:
# Structured verdict used as the validation agent's result type: a single
# boolean saying whether the manual and the automatic annotation agree.
class AreAnntoationsEqual(BaseModel):
    # True when the two annotations are judged equal; the description asks for leniency.
    are_equal: bool = Field(..., description="Are the two annotations equal. Be lenient with evaluations.")
    # Optional free-text rationale, currently disabled:
    # justification: str = Field(..., description="Justification of the propose value for are_equal. Justifications should be as concise as possible.")
    

## Setting the prompt and model for validation agent
# o3-mini judges whether an automatic annotation matches the manual one; its
# answer is returned as an AreAnntoationsEqual object. The prompt tells it to
# tolerate wording/formatting differences and apparent unit mismatches.
validation_agent = Agent(
    model="openai:o3-mini",
    result_type=AreAnntoationsEqual,
    system_prompt = """You are an expert taxonomist. You are comparing the outcome of a manually extracted result versus an automatically extracted result. You need to compare the automatic results and determine whether the result is synonymous or equal the manual one; taking into consideration
    linguisitc and formatting nuances. If the measurements are correct but are seemigly in the wrong units, you can mark that as the results being the same = True. Your answer is whether the two results are similar True/False.""",
)

## Models to compare
# These names match the keys of the dict returned by process_with_models.
# NOTE(review): this list is not referenced by the comparison code visible here — confirm it is still needed.
models = ["gpt_4o_output", "gpt_mini_output", "sonnet_output"]

In [30]:
def is_missing(val):
    '''
    Return True when ``val`` carries no information, so the validation agent
    is not asked to compare two empty annotations.

    Treated as missing:
      * None, the strings 'None' / 'nan', and float NaN
      * a measurement dict whose 'exmin', 'min', 'max' and 'exmax' entries
        are all missing (other keys, e.g. the unit, are ignored)
      * the string form of such a dict, which is how dict-valued columns
        come back after being written to and read from a CSV file
      * an object exposing ``model_dump()`` (e.g. a pydantic model) whose
        dumped dict is missing by the dict rule above

    Anything else returns False.
    '''
    if _is_missing_scalar(val):
        return True

    if isinstance(val, str):
        # CSV round trips turn dicts into their repr; recover the dict if possible
        val = _parse_measurement_repr(val)
    elif not isinstance(val, type) and callable(getattr(val, 'model_dump', None)):
        val = val.model_dump()

    if isinstance(val, dict):
        relevant_keys = ['exmin', 'min', 'max', 'exmax']
        return all(_is_missing_scalar(val.get(k)) for k in relevant_keys)
    return False


def _is_missing_scalar(value):
    '''True for None, the strings 'None'/'nan', and float NaN; False otherwise.'''
    if value is None:
        return True
    if isinstance(value, str):
        return value in ('None', 'nan')
    if isinstance(value, (float, np.floating)):
        return bool(np.isnan(value))
    return False


def _parse_measurement_repr(text):
    '''Parse a "{...}" dict repr (bare ``nan`` allowed) into a dict, else return None.'''
    stripped = text.strip()
    if not (stripped.startswith('{') and stripped.endswith('}')):
        return None
    try:
        # literal_eval cannot read a bare `nan`, so it is mapped to None first
        parsed = ast.literal_eval(re.sub(r'\bnan\b', 'None', stripped))
    except (ValueError, SyntaxError, TypeError):
        return None
    return parsed if isinstance(parsed, dict) else None

def create_empty_df_with_species_id(df):
    '''
    Build a blank score table shaped like ``df``.

    Returns a copy of ``df`` in which every column except 'species_id' is
    overwritten with NaN; ``df`` itself is left untouched. The blanks are
    later filled with 1 or 0 depending on whether an automated extraction
    matched the manual one.
    '''
    blank_scores = df.copy()
    score_columns = [name for name in blank_scores.columns if name != 'species_id']
    for name in score_columns:
        blank_scores[name] = np.nan
    return blank_scores

# One NaN-filled score table per model, each indexed by species_id and built
# from its own blank copy of the manual extractions.
validation_scores_model_dfs = {
    model_name: create_empty_df_with_species_id(manual_extracted_data_df).set_index('species_id')
    for model_name in ("gpt_mini", "sonnet", "gpt_4o")
}


In [32]:
## Code to Compare Validations

plant_species = list(gpt_4o_df.index)

for input_species in tqdm.tqdm(plant_species):
    ground_truth_row = manual_extracted_data_df.loc[manual_extracted_data_df['species_id'] == input_species].iloc[0]

    model_rows = {
        "gpt_4o": gpt_4o_df.loc[input_species],
        "sonnet": sonnet_df.loc[input_species],
        "gpt_mini": gpt_mini_df.loc[input_species]
    }

    for gt_key, gt_value in ground_truth_row.items():
        # # Skip if all models and ground truth have missing values
        # if all(is_missing(row[gt_key]) and is_missing(gt_value) for row in model_rows.values()):
        #     continue

        for model_name, row in model_rows.items():
            auto_value = row[gt_key]

            # Skip if both are missing
            if is_missing(auto_value) and is_missing(gt_value):
                continue

            # Skip if values are exactly the same
            if auto_value == gt_value:
                validation_scores_model_dfs[model_name].loc[input_species, gt_key] = 1
                continue

            # Otherwise, validate via prompt
            prompt = (
                f"manually annotated {gt_key}: {gt_value} "
                f"automatically annotated {gt_key}: {auto_value}"
            )
            result = await validation_agent.run(prompt)
            is_correct = result.data.are_equal
            validation_scores_model_dfs[model_name].loc[input_species, gt_key] = int(bool(is_correct))

# Write results once at the end
validation_scores_model_dfs['gpt_mini'].to_csv(f'{output_dir}/gpt_mini_validation.csv')
validation_scores_model_dfs['sonnet'].to_csv(f'{output_dir}/sonnet_validation.csv')
validation_scores_model_dfs['gpt_4o'].to_csv(f'{output_dir}/gpt_4o_validation.csv')
