In [1]:
# pip install --upgrade pydantic-ai openai

# Wagner Database AI Extraction

#### Code works best with a python version 3.10 < version < 3.12 (I am using 3.10.13)

In [1]:
from enum import Enum
from typing import Optional, List
import os
from openai import OpenAI
import json
import base64
import pandas as pd
import numpy as np
import sys
sys.path.append('/Users/williamharrigan/Desktop/UH/Year_3/semester_2/wagner')
import creds

## pydantic
from pydantic import BaseModel, Field
from openai.lib._pydantic import to_strict_json_schema
from pydantic_ai import Agent
from openai.types.chat.chat_completion_content_part_param import (
    ChatCompletionContentPartTextParam,
    ChatCompletionContentPartImageParam
)

from openai.types.chat.chat_completion_content_part_image_param import (
    ImageURL
)

photo_dir = "../training_set/"

In [2]:
class Description(str, Enum):
    DICOTS = "Dicots"
    MONOCOTS = "Monocots"
    CONIFERS = "Conifers"
    FERNS = "Ferns and fern allies"

class StemHairType(str, Enum):
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"
    HIRSUTE = "HIRSUTE"
    HISPID = "HISPID"
    LEPIDOTE = "LEPIDOTE"
    PILOSE = "PILOSE"
    PUBERULENT = "PUBERULENT"
    STRIGOSE = "STRIGOSE"
    STELLATE = "STELLATE"
    TOMENTOSE = "TOMENTOSE"
    VILLOUS = "VILLOUS"
    GLAUCOUS = "GLAUCOUS"
    
class FruitType(str, Enum):
    ACHENE = "ACHENE"
    AGGREGATE = "AGGREGATE"
    ARTICLE = "ARTICLE"
    BERRY = "BERRY"
    CAPSULE = "CAPSULE"
    CARYOPSIS = "CARYOPSIS"
    DRUPE = "DRUPE"
    FOLLICLE = "FOLLICLE"
    LEGUME = "LEGUME"
    MERICARP = "MERICARP"
    MULTIPLE = "MULTIPLE"
    NUT = "NUT"
    PEPO = "PEPO"
    POME = "POME"
    SCHIZOCARP = "SCHIZOCARP"
    SILICLE = "SILICLE"
    SILIQUE = "SILIQUE"
    SYCONIUM = "SYCONIUM"
    
class LeafShapeType(str, Enum):
    ACEROSE = "ACEROSE"
    AWL_SHAPED = "AWL_SHAPED"
    GLADIATE = "GLADIATE"
    HASTATE = "HASTATE"
    CORDATE = "CORDATE"
    DELTOID = "DELTOID"
    LANCEOLATE = "LANCEOLATE"
    LINEAR = "LINEAR"
    ELLIPTIC = "ELLIPTIC"
    ENSIFORM = "ENSIFORM"
    LYRATE = "LYRATE"
    OBCORDATE = "OBCORDATE"
    FALCATE = "FALCATE"
    FLABELLATE = "FLABELLATE"
    OBDELTOID = "OBDELTOID"
    OBELLIPTIC = "OBELLIPTIC"
    OBLANCEOLATE = "OBLANCEOLATE"
    OBLONG = "OBLONG"
    PERFOLIATE = "PERFOLIATE"
    QUADRATE = "QUADRATE"
    OBOVATE = "OBOVATE"
    ORBICULAR = "ORBICULAR"
    RENIFORM = "RENIFORM"
    RHOMBIC = "RHOMBIC"
    OVAL = "OVAL"
    OVATE = "OVATE"
    ROTUND = "ROTUND"
    SAGITTATE = "SAGITTATE"
    PANDURATE = "PANDURATE"
    PELTATE = "PELTATE"
    SPATULATE = "SPATULATE"
    SUBULATE = "SUBULATE"

class StemHairs(str, Enum):
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"

class PhyllotaxyType(str, Enum):
    ALTERNATE = "ALTERNATE"
    OPPOSITE = "OPPOSITE"
    WHORLED = "WHORLED"
    DECUSSATE = "DECUSSATE"
    DISTICHOUS = "DISTICHOUS"
    EQUITANT = "EQUITANT"
    TERNATE = "TERNATE"
    CAULINE = "CAULINE"

class InflorescenceType(str, Enum):
    CATKIN = "CATKIN"
    CYME = "CYME"
    HEAD = "HEAD"
    PANICLE = "PANICLE"
    RACEME = "RACEME"
    SPATHE_SPADIX = "SPATHE_SPADIX"
    THYRSE = "THYRSE"
    UMBEL = "UMBEL"
    VERTISCILLATE = "VERTISCILLATE"
    SOLITARY = "SOLITARY"
    SPIKE = "SPIKE"
    LANCEOLATE = "LANCEOLATE"
    GLOBOSE = "GLOBOSE"
    INVOLUCRE = "INVOLUCRE"
    CORYMBOSE = "CORYMBOSE"
    STROBILOID = "STROBILOID"

class LeafHairType(str, Enum):
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"
    HIRSUTE = "HIRSUTE"
    HISPID = "HISPID"
    LEPIDOTE = "LEPIDOTE"
    PILOSE = "PILOSE"
    PUBERULENT = "PUBERULENT"
    STRIGOSE = "STRIGOSE"
    STELLATE = "STELLATE"
    TOMENTOSE = "TOMENTOSE"
    VILLOUS = "VILLOUS"
    GLAUCOUS = "GLAUCOUS"

class LeafType(str, Enum):
    SIMPLE = "SIMPLE"
    COMPOUND = "COMPOUND"

class BreedingType(str, Enum):
    MONOECIOUS = "MONOECIOUS"  # Male and female flowers on the same plant
    ANDROMONOECIOUS = "ANDROMONOECIOUS"
    CHASMOGAMOUS = "CHASMOGAMOUS"
    DIOECIOUS = "DIOECIOUS"  # Separate male and female plants
    GYNODIOECIOUS = "GYNODIOECIOUS"
    POLYGAMO_MONOECIOUS = "POLYGAMO-MONOECIOUS"
    POLYGAMOUS = "POLYGAMOUS"
    POLYGAMO_DIOECIOUS = "POLYGAMO-DIOECIOUS"
    GYNOMONOECIOUS = "GYNOMONOECIOUS"
    STERILE = "STERILE"

class LifeMarginType(str, Enum):
    TEETH = "TEETH"
    LOBED = "LOBED"
    ENTIRE = "ENTIRE"


class LifeFormType(str, Enum):
    ANNUAL_HERB = "ANNUAL_HERB"
    PERENNIAL_HERB = "PERENNIAL_HERB"
    EPIPHYTE = "EPIPHYTE"
    VINE = "VINE"
    SHRUB = "SHRUB"
    TREE = "TREE"

class CorollaType(str, Enum):
    ADNATE = "ADNATE"
    BILABIATE = "BILABIATE"
    CAMPANULATE = "CAMPANULATE"
    CORYMBOSE = "CORYMBOSE"
    CONVOLUTE = "CONVOLUTE"
    CORONA = "CORONA"
    CUNEATE = "CUNEATE"
    CYLINDRICAL = "CYLINDRICAL"
    DISK = "DISK"
    DELTATE = "DELTATE"
    ELLIPTIC = "ELLIPTIC"
    FUNNELFORM = "FUNNELFORM"
    FLABELLATE = "FLABELLATE"
    FILIFORM = "FILIFORM"
    HOOD = "HOOD"
    IRREGULAR = "IRREGULAR"
    KEEL = "KEEL"
    LABELLUM = "LABELLUM"
    LANCEOLATE = "LANCEOLATE"
    LINEAR = "LINEAR"
    LIPPED = "LIPPED"
    LOBBED = "LOBBED"
    OVATE = "OVATE"
    OBLONG = "OBLONG"
    OBCORDATE = "OBCORDATE"
    OBOVATE = "OBOVATE"
    OBLANCEOLATE = "OBLANCEOLATE"
    ORBICULAR = "ORBICULAR"
    PALATE = "PALATE"
    PSEUDORACEMES = "PSEUDORACEMES"
    ROTATE = "ROTATE"
    RAY = "RAY"
    REFLEXED = "REFLEXED"
    RHOMBIC = "RHOMBIC"
    RENIFORM = "RENIFORM"
    SALVERFORM = "SALVERFORM"
    SUBORBICULAR = "SUBORBICULAR"
    SUBRHOMBIC = "SUBRHOMBIC"
    SPUR = "SPUR"
    SPATULATE = "SPATULATE"
    SPICATE = "SPICATE"
    SUBROTATE = "SUBROTATE"
    STANDARD = "STANDARD"
    TUBULAR = "TUBULAR"
    TRIANGULAR = "TRIANGULAR"
    URCEOLATE = "URCEOLATE"
    UNILABIATE = "UNILABIATE"
    VALVATE = "VALVATE"
    VERTICIL = "VERTICIL"
    ZYGOMORPHIC = "ZYGOMORPHIC"
    CUP = "CUP"
    UNGUICULATE = "UNGUICULATE"
    CLAW = "CLAW"
    FASICLE = "FASICLE"
    STELLATE = "STELLATE"
    SUBPANICULATE = "SUBPANICULATE"
    PENTAGONAL = "PENTAGONAL"

class OriginType(str, Enum):
    NATURALIZED = "NATURALIZED"
    INDIGENOUS = "INDIGENOUS"
    ENDEMIC = "ENDEMIC"
    POLYNESIAN_INTRODUCTION = "POLYNESIAN INTRODUCTION"

class Location(str, Enum):
    HAWAII = "HAWAII"
    MAUI = "MAUI"
    KAHOOLAWE = "KAHOOLAWE"
    MOLOKAI = "MOLOKAI"
    LANAI = "LANAI"
    OAHU = "OAHU"
    KAUAI = "KAUAI"
    NIIHAU = "NIIHAU"
    ALL_ISLANDS = "ALL ISLANDS"

class FederalStatusType(str, Enum):
    SPECIES_OF_CONCERN = "SPECIES_OF_CONCERN"
    ENDANGERED = "ENDANGERED"
    THREATENED = "THREATENED"
    WITHDRAWN = "WITHDRAWN" 
    
class StatusType(str, Enum):
    NATURALIZED = "NATURALIZED"
    ENDEMIC = "ENDEMIC"
    RARE = "RARE"
    SECURE = "SECURE"
    VULNERABLE = "VULNERABLE"

class Measurements(BaseModel):
    min: Optional[float] = None
    max: Optional[float] = None
    extreme_min: Optional[float] = None
    extreme_max: Optional[float] = None

class HawaiianPlant(BaseModel):
    # Basic Information
    family: str = Field(..., description="Plant family name (should only be 1 object)")
    genus: str = Field(..., description="Plant genus name (should only be 1 object)")
    species: str = Field(..., description="Plant species name (should only be 1 object)")
    common_name: Optional[str] = Field(None, description="Common name of the plant")
    wagner_pg_number: Optional[str] = Field(None, description="Wagner book reference number")
    description: Optional[Description] = Field(None, description="Take knowledge from outside the passage to infer whether the plant is DICOTS, MONOCOTS, CONIFERS or FERNS")
    infraspecific_epithet: str = Field(..., description="The third word in the scientific name of an infraspecific taxon, following the name of the species. This applies only to formal names of plants and fungi, and not to the formal names of bacteria or animals. In the name Cannabis sativa subsp. indica, the word indica is the infraspecific epithet.")

    hawaiian_name: Optional[List[str]] = Field(None, description="List of Hawaiian names")
    
    stem_hair_type: Optional[StemHairType] = Field(None, description="Type of hair on stem")
    
    phyllotaxy_type: Optional[PhyllotaxyType] = Field(None, description="The arrangement of leaves around the stem.")

    leaf_hair_description: Optional[str] = Field(None, description="Description of leaf hair.")
    leaf_hair_upper_description: Optional[str] = Field(None, description="Description of Upper leaf hairs.")
    leaf_hair_lower_description: Optional[str] = Field(None, description="Description of Lower leaf hairs.")
    
    breeding_type: Optional[BreedingType] = Field(None, description="Plant reproductive class.")
    
    inflorescence_type: Optional[InflorescenceType] = Field(None, description="In a flowering plant, a cluster of flowers on a branch or a system of branches")
    
    ray_color: Optional[str] = Field(None, description="Color of ray")
    floret_color: Optional[str] = Field(None, description="Color of florets")
    spathe_color: Optional[str] = Field(None, description="Color of spathe")
    perianth_outer_color: Optional[str] = Field(None, description="Color of perianth outer flower")
    perianth_inner_color: Optional[str] = Field(None, description="Color of perianth inner flower")
    perianth_color: Optional[str] = Field(None, description="Color of perianth")
    labellum_color: Optional[str] = Field(None, description="Color of labellum")

    corolla_type: Optional[CorollaType] = Field(None, description="Type of corolla")
    corolla_color: Optional[str] = Field(None, description="Color of corolla")
    staminate_corolla_type: Optional[CorollaType] = Field(None, description="Type of staminate corolla")
    pistillate_corolla_type: Optional[CorollaType] = Field(None, description="Type of pistillate corolla")
    
    fruit_type: Optional[FruitType] = Field(None, description="Type of fruit")
    fruit_length: Optional[Measurements] = Field(None, description="Fruit length in millimeters")
    fruit_width: Optional[Measurements] = Field(None, description="Fruit width in millimeters")
    fruit_diameter: Optional[Measurements] = Field(None, description="Fruit diameter in millimeters")
    
    ploidy: Optional[str] = Field(None, description="Ploidy level expressed as a function of n (e.g., 1n, 2n or 3n, etc..)")
    chromosome_number: Optional[int] = Field(None, description="The integer Number of chromosomes")
    average_chromosome_number: Optional[float] = Field(None, description="Average chromosome number")
    
    origin: Optional[OriginType] = Field(None, description="Origin type of the plant")
    fed_status: Optional[FederalStatusType] = Field(None, description="Federal conservation status")
    status: Optional[StatusType] = Field(None, description="General status")

    life_form_type: Optional[LifeFormType] = Field(None, description="Growth habit or life form")
    leaf_type: Optional[LeafType] = Field(None, description="Simple or compound leaf type")
    leaf_shape_type: Optional[LeafShapeType] = Field(None, description="Shape of leaves")
    leaf_margin_type: Optional[LifeMarginType] = Field(None, description="Type of leaf margin")
    
    juvenile_leaf_type: Optional[LeafType] = Field(None, description="Simple or compound juvenile leaf type")
    juvenile_leaf_shape_type: Optional[LeafShapeType] = Field(None, description="Shape of juvenile leaves")
    juvenile_leaf_margin_type: Optional[LeafMarginType] = Field(None, description="Type of leaf margin on juvenile plants")
    
    leaflets_leaf_type: Optional[LeafType] = Field(None, description="Simple or compound juvenile leaf type")
    leaflets_leaf_shape_type: Optional[LeafShapeType] = Field(None, description="Shape of juvenile leaves")
    leaflets_leaf_margin_type: Optional[LeafMarginType] = Field(None, description="Type of leaf margin on juvenile plants")
    
    leaf_hair_upper_type: Optional[LeafHairType] = Field(None, description="Type of upper leaf hairs")
    leaf_hair_lower_type: Optional[LeafHairType] = Field(None, description="Type of lower leaf hairs")
    leaf_hair_type: Optional[LeafHairType] = Field(None, description="Type of leaf hairs")
    juvenile_leaf_hair_type: Optional[LeafHairType] = Field(None, description="Type of juvenile leaf hairs")
    
    island_type: Optional[List[Location]] = Field(None, description="Islands where the plant is found")
    
    stem_height: Optional[Measurements] = Field(None, description="Stem or general plant height measurements in meters")
    leaf_length: Optional[Measurements] = Field(None, description="Length of leaves in millimeters")
    leaf_width: Optional[Measurements] = Field(None, description="Width of leaves in millimeters")
    juvenile_leaf_length: Optional[Measurements] = Field(None, description="Juvenile length of leaves in millimeters")
    juvenile_leaf_width: Optional[Measurements] = Field(None, description="Juvenile width of leaves in millimeters")
    leaflets_leaf_length: Optional[Measurements] = Field(None, description="Leaflets length of leaves in millimeters")
    leaflets_leaf_width: Optional[Measurements] = Field(None, description="Leaflets width of leaves in millimeters")    
    
    petioles: Optional[Measurements] = Field(None, description="Length of petiole stalk in centimeters") 
    staminate_inflorescence_length: Optional[Measurements] = Field(None, description="The measured length in millimeters of the male (pollen-producing) flower cluster. This specifically refers to catkins or other inflorescences containing only staminate (male) flowers.")    
    pistillate_inflorescence_length: Optional[Measurements] = Field(None, description="The measured length of the female (seed-producing) flower cluster. This specifically refers to inflorescences containing only pistillate (female) flowers.")    
    staminate_inflorescence_width: Optional[Measurements] = Field(None, description="The measured width in millimeters of the male (pollen-producing) flower cluster. This specifically refers to catkins or other inflorescences containing only staminate (male) flowers.")    
    pistillate_inflorescence_width: Optional[Measurements] = Field(None, description="The measured width of the female (seed-producing) flower cluster. This specifically refers to inflorescences containing only pistillate (female) flowers.")        
    
    inflorescence_flower_length: Optional[Measurements] = Field(None, description="The length of an inflorescence flower in millimeters.")
    inflorescence_flower_width: Optional[Measurements] = Field(None, description="The width of an inflorescence flower in millimeters.")
    
    flower_length: Optional[Measurements] = Field(None, description="Flower length in centimeters")
    flower_width: Optional[Measurements] = Field(None, description="Flower width in centimeters")
    
    rachis_length: Optional[Measurements] = Field(None, description="Rachis length in millimeters")
    rachis_diameter: Optional[Measurements] = Field(None, description="Rachis diameter in millimeters") 
    
    head_length: Optional[Measurements] = Field(None, description="The measured length of the capitulum (flower head) in millimeters.")
    head_diameter: Optional[Measurements] = Field(None, description="The measured diameters of the capitulum (flower head) in millimeters.")
    
    bur_length: Optional[Measurements] = Field(None, description="The measured length of the bur in millimeters.")
    tepal_length: Optional[Measurements] = Field(None, description="The measured length of the tepal in millimeters.")
    staminate_tepal_length: Optional[Measurements] = Field(None, description="The measured length of the staminate tepal in millimeters.")
    pistillate_tepal_length: Optional[Measurements] = Field(None, description="The measured length of the pistillate tepal in millimeters.")
      
    ray_length: Optional[Measurements] = Field(None, description="The measured length of the ray in millimeters.")
    ray_width: Optional[Measurements] = Field(None, description="The measured width of the ray in millimeters.")
    
    florets_length: Optional[Measurements] = Field(None, description="The measured length of the florets in millimeters.")
    
    involucre_length: Optional[Measurements] = Field(None, description="Involucre length in millimeters")
    involucre_width: Optional[Measurements] = Field(None, description="Involucre width in millimeters")
    staminate_involucre_length: Optional[Measurements] = Field(None, description="Staminate involucre length in millimeters")
    pistillate_involucre_length: Optional[Measurements] = Field(None, description="Pistillate involucre length in millimeters")   
    
    bract_length: Optional[Measurements] = Field(None, description="Bract length in millimeters")
    bract_width: Optional[Measurements] = Field(None, description="Bract width in millimeters")
    bract_lower_length: Optional[Measurements] = Field(None, description="Lower bract length in millimeters")
    bract_outer_length: Optional[Measurements] = Field(None, description="Outer bract length in millimeters")
    
    bracteoles_length: Optional[Measurements] = Field(None, description="Bracteoles length in millimeters")
    bracteole_width: Optional[Measurements] = Field(None, description="Bracteole width in millimeters")
    
    pedicel_length: Optional[Measurements] = Field(None, description="Pedicel length in millimeters")
    pedicel_width: Optional[Measurements] = Field(None, description="Pedicel width in millimeters")
    staminate_pedicel_length: Optional[Measurements] = Field(None, description="Staminate pedicel length in millimeters")
    pistillate_pedicel_length: Optional[Measurements] = Field(None, description="Pistillate pedicel length in millimeters")
    staminate_pedicel_width: Optional[Measurements] = Field(None, description="Staminate pedicel width in millimeters")
    pistillate_pedicel_width: Optional[Measurements] = Field(None, description="Pistillate pedicel width in millimeters") 
    
    hypanthium_length: Optional[Measurements] = Field(None, description="Hypanthium length in millimeters")
    hypanthium_width: Optional[Measurements] = Field(None, description="Hypanthium width in millimeters")
    
    peduncle_length: Optional[Measurements] = Field(None, description="Peduncle length in millimeters")
    peduncle_width: Optional[Measurements] = Field(None, description="Peduncle width in millimeters")
    staminate_peduncle_length: Optional[Measurements] = Field(None, description="Staminate peduncle length in millimeters")
    staminate_peduncle_width: Optional[Measurements] = Field(None, description="Staminate peduncle width in millimeters")
    pistillate_peduncle_length: Optional[Measurements] = Field(None, description="Pistillate peduncle length in millimeters")
    pistillate_peduncle_width: Optional[Measurements] = Field(None, description="Pistillate peduncle width in millimeters")

    spathe_width: Optional[Measurements] = Field(None, description="Spathe width dimensions in millimeters")
    spathe_length: Optional[Measurements] = Field(None, description="Spathe length dimensions in millimeters")
    spadix_length: Optional[Measurements] = Field(None, description="Spadix length dimensions in millimeters")
    
    perianth_width: Optional[Measurements] = Field(None, description="Perianth width dimensions in millimeters")
    perianth_length: Optional[Measurements] = Field(None, description="Perianth length dimensions in millimeters")   
    perianth_outer_width: Optional[Measurements] = Field(None, description="Outer perianth width dimensions in millimeters")
    perianth_outer_length: Optional[Measurements] = Field(None, description="Outer perianth length dimensions in millimeters") 
    perianth_inner_width: Optional[Measurements] = Field(None, description="Inner perianth width dimensions in millimeters")
    perianth_inner_length: Optional[Measurements] = Field(None, description="Inner perianth length dimensions in millimeters") 
    
    perianth_lobes_width: Optional[Measurements] = Field(None, description="Perianth lobes width dimensions in millimeters")
    perianth_lobes_length: Optional[Measurements] = Field(None, description="Perianth lobes length dimensions in millimeters") 
    perianth_tube_length: Optional[Measurements] = Field(None, description="Perianth tube length dimensions in millimeters") 
    pistillate_perianth_tube_length: Optional[Measurements] = Field(None, description="Pistillate perianth tube length dimensions in millimeters") 
    staminate_perianth_tube_length: Optional[Measurements] = Field(None, description="Staminate perianth tube length dimensions in millimeters") 
    
    pappus_length: Optional[Measurements] = Field(None, description="Pappus length in millimeters")
    umbellet_length: Optional[Measurements] = Field(None, description="Umbellet length in millimeters")
    labellum_width: Optional[Measurements] = Field(None, description="Labellum width dimensions in millimeters")
    labellum_length: Optional[Measurements] = Field(None, description="Labellum length dimensions in millimeters")
    
    calyx_length: Optional[Measurements] = Field(None, description="Calyx length in millimeters")
    calyx_width: Optional[Measurements] = Field(None, description="Calyx width in millimeters")
    calyx_teeth_length: Optional[Measurements] = Field(None, description="Calyx teeth length in millimeters")
    calyx_teeth_width: Optional[Measurements] = Field(None, description="Calyx teeth width in millimeters")
    calyx_lobes_length: Optional[Measurements] = Field(None, description="Calyx lobe length in millimeters")
    calyx_lobes_width: Optional[Measurements] = Field(None, description="Calyx lobe width in millimeters")
    
    upper_calyx_length: Optional[Measurements] = Field(None, description="Upper calyx length in millimeters")
    lower_calyx_length: Optional[Measurements] = Field(None, description="Lower calyx length in millimeters")
    
    inner_calyx_lobes_length: Optional[Measurements] = Field(None, description="Inner calyx lobes length in millimeters")
    inner_calyx_lobes_width: Optional[Measurements] = Field(None, description="Inner calyx lobes width in millimeters")
    outer_calyx_lobes_length: Optional[Measurements] = Field(None, description="Outer calyx lobes length in millimeters")
    outer_calyx_lobes_width: Optional[Measurements] = Field(None, description="Outer calyx lobes width in millimeters")
    
    calyx_tube_length: Optional[Measurements] = Field(None, description="Calyx tube length in millimeters")
    calyx_tube_width: Optional[Measurements] = Field(None, description="Calyx tube width in millimeters")
    
    male_calyx_length: Optional[Measurements] = Field(None, description="Male calyx length in millimeters")
    male_calyx_width: Optional[Measurements] = Field(None, description="Male calyx width in millimeters")
    
    male_calyx_lobes_length: Optional[Measurements] = Field(None, description="Male calyx length in millimeters")
    male_calyx_lobes_width: Optional[Measurements] = Field(None, description="Male calyx width in millimeters")
    
    female_calyx_length: Optional[Measurements] = Field(None, description="female calyx length in millimeters")
    female_calyx_width: Optional[Measurements] = Field(None, description="female calyx width in millimeters")
    
    female_calyx_lobes_length: Optional[Measurements] = Field(None, description="Female calyx length in millimeters")
    female_calyx_lobes_width: Optional[Measurements] = Field(None, description="Female calyx width in millimeters")
    
# male_calyx_lobes_length_inner
# male_calyx_lobes_length_outer
# male_calyx_lobes_width_outer
# male_calyx_tube_length
# female_calyx_lobes_length_inner
# female_calyx_lobes_length_outer
# female_calyx_lobes_width_inner
# female_calyx_lobes_width_outer
# female_calyx_length
# inner_calyx_length
# outer_calyx_length
# corolla_length
# corolla_width
# corolla_tube_length
# corolla_tube_width
# corolla_lobes_length
# corolla_lobes_width
# upper_corolla
# lower_corolla
# upper_corolla_lobes_length
# lower_corolla_lobes_length
# corolla_lip
# staminate_corolla_length
# pistillate_corolla_length
# staminate_corolla_tube_length
# staminate_corolla_tube_width
# pistillate_corolla_tube_length
# pistillate_corolla_tube_width
# female_corolla_lobes_length
# female_corolla_lobes_width
# male_corrola_lobes_length
# male_corrola_lobes_width
# fruit_length
# fruit_width
# fruit_diameter
# seeds_per
# seed_length
# seed_length_ex
# seed_width
# seed_diameter


In [3]:
## Input to base64 format to be intepreted by extraction agents (chatbots)
with open(f"{photo_dir}/Acanthaceae_Dicliptera_chinensis.jpeg", "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode("utf-8")

## Setting the overall framework for how the extraction agent operates across all interactions
system_prompt  = """
You are an expert plant taxonomist. Please analyze this image and return the details according to the schema provided.
You are exhaustive; you include ALL the details mentioned. Do not make any assumptions about the data and do not try to
interpret what is not obvious from the text. When extracting information, ensure that you return a structured response in JSON format matching the `HawaiianPlant` schema. """

## Prompt to set specific focus for each extraction agent interaction (can be dynamic as need be)
user_prompt= "Transcribe the plant information you see in this image"


## GPT-4o Text Transcription (Text Extraction Only)

In [5]:
## Setting extraction agent to be gpt-4o
## System prompt is being set to specifically transcribe information
openai_img_to_text_transcription_agent = Agent(
    model="openai:gpt-4o",
    result_type=str,
    system_prompt = "You are a vision model capable of accurately performing OCR on an image",
)

## Data to be extracted from input image
image_urls = [
    f"data:image/png;base64,{base64_image}",
]

## Setting chat parameters: low detail (for efficiency) from input image
image_params = [
    ChatCompletionContentPartImageParam(
        type='image_url', 
        image_url=ImageURL(url=url, detail='low')
    ) for url in image_urls
]

## Setting chat prompt to 'user_prompt'
msg_open_ai = [
            ChatCompletionContentPartTextParam(text="Convert this to text. Don't miss any text.", type='text'),
            *image_params
]

## Running data extraction
r = await openai_img_to_text_transcription_agent.run(msg_open_ai)

## Output to 'text'
text = r.data.split("---")[1]
print(text)



171

1. Dicliptera chinensis (L.) Juss.
[Justicia chinensis L.]

nat

Sprawling or decumbent perennial herbs; stems 2-7 dm long. Leaves green, lower surface slightly paler, ovate, 2.5-13.5 cm long, sparsely strigillose, especially on the veins, spathes prominent on upper surface, white raised streaks the size of the leaf blades, 1-3.5 cm long. Flowers in axillary cymes, each composed of 2 irregular, ovate bracts of unequal size, the larger one ca. 12-14 mm long, the smaller one ca. 8-9 mm long, all bracts short-villous especially along the margins, the veins inconspicuous, pedicels 10 mm long; calyx lobes of unequal size, 5-7 mm long; corolla white to purple, the throat with purple spots, 5-13 mm long. Capsules oblong, 6-7 mm long, short-villous. Seeds 4, discoid. Native to tropical areas worldwide; in Hawaiian naturalized only on Midway Atoll but appears at least on Kaua'i and O'ahu, but perhaps more widespread. Popper (1999) states that this plant was recently introduced to Hawai'i

## DeepSeek Model Extraction (R1 distill llama 70B) from text to Hawaiian Plant Schema

In [6]:
## Setting deepseek LLM as extraction agent
## Data to be extracted in HawaiianPlant schema
groq_extraction_agent = Agent(
    model="groq:deepseek-r1-distill-llama-70b",
    retries=3,
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
    # Had to raise temperature to 0.5
    model_settings = {'temperature': 0.5}
)

## Setting variables and extracting data
r = await groq_extraction_agent.run(text)
deepseek_llama = r.data


BadRequestError: Error code: 400 - {'error': {'message': "Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '\nAlright, so I\'m trying to figure out how to fix the validation errors in my JSON response. Let\'s see what the errors are. \n\nFirst, there\'s an error with "infraspecific_epithet" where it says the input should be a valid string, but I have null. That makes sense because in the original data, there\'s no infraspecific epithet mentioned. Maybe I can leave it as null since it\'s not required, but the schema says it\'s required. Hmm, that\'s confusing. Maybe I should set it to an empty string or see if there\'s a default value.\n\nNext, the "leaf_shape_type" is set to "ovate", but the enum expects it to be uppercase like "OVATE". So I need to correct that to "OVATE".\n\nThen, both "leaf_hair_lower" and "leaf_hair_upper" are set to "sparsely strigillose", but the enum doesn\'t recognize that. I think "strigillose" might not be the correct term here. Maybe it should be "STRIGOSE" since that\'s one of the enum options.\n\nI\'ll make these corrections and try again. Hopefully, that fixes all the validation errors.\n</think>\n\n<tool_call>{"id":"call_p16e","name":"final_result","arguments":{"family":"Acanthaceae","genus":"Justicia","species":"chinensis","infraspecific_epithet":"","average_chromosome_number":null,"bract_length":{"min":8,"max":14},"bract_width":null,"bracteole_width":null,"bracteoles_length":null,"breeding_type":null,"bur_length":null,"calyx_dimensions":null,"calyx_length":{"min":5,"max":7},"calyx_length_lobes":null,"calyx_teeth_length":null,"calyx_teeth_width":null,"calyx_width":null,"calyx_width_lobes":null,"chromosome_number":null,"common_name":null,"corolla_color":"white to purple","corolla_length":{"min":5,"max":13},"corolla_type":null,"corolla_width":null,"deciduous":null,"description":null,"federal_status":null,"female_inflorescence_type":null,"floret_color":null,"florets_length":null,"flower_length":null,"flower_width":null,"fruit_diameter":null,"fruit_length":{"min":6,"max":7},"fruit_type":"CAPSULE","fruit_width":null,"hawaiian_names":null,"head_diameter":null,"head_length":null,"hypanthium_length":null,"hypanthium_width":null,"imm_leaf_type":null,"inflorescence_flower_length":null,"inflorescence_type":"CYME","involucre_length":null,"involucre_width":null,"juvenile_leaf_dimensions":null,"juvenile_leaf_hair":null,"juvenile_leaf_length_dimensions":null,"juvenile_leaf_margin_type":null,"juvenile_leaf_shape_type":null,"juvenile_leaf_type":null,"juvenile_leaf_width_dimensions":null,"juvenile_petiole_length":null,"leaf_hair_lower":"STRIGOSE","leaf_hair_upper":"STRIGOSE","leaf_hairs":null,"leaf_length_dimensions":{"min":2.5,"max":13.5},"leaf_margin_type":null,"leaf_shape_type":"OVATE","leaf_teeth":null,"leaf_type":null,"leaf_width_dimensions":null,"leaflet_leaf_shape":null,"life_form_type":"PERENNIAL_HERB","locations":["HAWAII","KAUAI","OAHU"],"male_inflorescence_type":null,"origin":"NATURALIZED","outer_bract_length":{"min":12,"max":14},"page_number":null,"pedicel_length":{"min":10,"max":10},"pedicel_width":null,"peduncle_dimensions":null,"peduncle_length":null,"peduncle_width":null,"petiole_length":null,"phyllotaxy_type":null,"pistillate_inflorescence_length":null,"pistillate_tepal_length":null,"ploidy":null,"rachis_diameter":null,"rachis_length":null,"ray_color":null,"ray_length":null,"ray_width":null,"seed_dimensions":null,"seeds_per_fruit":null,"staminate_inflorescence_length":null,"staminate_tepal_length":null,"status":null,"stem_hair_type":null,"stem_height":{"min":0.2,"max":0.7},"subspecies":null,"tepal_length":null,"wagner_book_number":null}}<｜tool▁calls▁end｜>'}}

## GPT-4o Data Extraction to Hawaiian Plant Schema

In [7]:
## Setting extraction agent to be gpt-4o
## Extracted data will be formatted according to HawaiianPlant schema
openai_extraction_agent = Agent(
    model="openai:gpt-4o",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
)

## Data to be extracted from input image
image_urls = [
    f"data:image/png;base64,{base64_image}",
]

## Setting chat parameters: low detail (for efficiency) from input image
image_params = [
    ChatCompletionContentPartImageParam(
        type='image_url', 
        image_url=ImageURL(url=url, detail='low')
    ) for url in image_urls
]

## Setting chat prompt to 'user_prompt'
msg_open_ai = [
            ChatCompletionContentPartTextParam(text=user_prompt, type='text'),
            *image_params
]

## Running data extraction
r = await openai_extraction_agent.run(msg_open_ai)

## Setting results to 'gpt_4o_output'
gpt_4o_output = r.data

## Anthropic Model Data Extraction

In [8]:
## Setting extraction agent to be Claude Sonnet model
## Extracted data will be formatted according to HawaiianPlant schema
sonnet_extraction_agent = Agent(
    model="anthropic:claude-3-5-sonnet-latest",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
    model_settings = {'temperature': 0.2}

)

## Setting context
msg_claude = [
    ChatCompletionContentPartTextParam(text=user_prompt, type='text'),
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": f"{base64_image}",
                    }
                },
    
]

## Extracting data from image
r = await sonnet_extraction_agent.run(msg_claude)
sonnet_output = r.data

## GPT o3-Mini Model Data Extraction

In [9]:
## Setting system prompt
gptmini_extraction_agent = Agent(
    model="openai:o3-mini",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
)

## Extracting output from extracted text
r = await gptmini_extraction_agent.run(text)

## Setting output
gpt_mini_output = r.data

## Comparing Model Outputs

In [10]:
### IGNORE THE FOLLOWING CODE, IT IS A WORK SPACE

In [11]:
## Checking manual results versus automatically extracted results
## Setting class for boolean output based on intepretation if results are equal
class AreAnntoationsEqual(BaseModel):
    are_equal: bool = Field(..., description="Are the two annotations equal")
    justificaiton: str = Field(..., description="Justification of the propose value for are_equal")

## Setting the prompt and model for validation agent
validation_agent = Agent(
    model="groq:llama-3.3-70b-specdec",
    result_type=AreAnntoationsEqual,
    system_prompt = """You are an expert taxonomist. You are comparing the outcome of a manually extracted result versus an automatically extracted result. You need to compare the automatic results and determine whether the result is synonymous or equal the manual one; taking into consideration
    linguisitc and formatting nuances. Your answer is whether the two results are similar True/False and a justificaiton for your answer""",
)

## Models to compare
models = ["gpt_4o_output", "gpt_mini_output", "sonnet_output", "deepseek_llama"]


In [12]:
## Column Names/ properties
props = HawaiianPlant.model_json_schema()['properties'].keys()
props

dict_keys(['family', 'genus', 'species', 'subspecies', 'common_name', 'hawaiian_names', 'infraspecific_epithet', 'deciduous', 'wagner_book_number', 'page_number', 'description', 'life_form_type', 'stem_height', 'stem_hair_type', 'phyllotaxy_type', 'imm_leaf_type', 'leaf_type', 'leaf_shape_type', 'leaf_margin_type', 'leaf_width_dimensions', 'leaf_length_dimensions', 'leaf_hairs', 'leaf_hair_upper', 'leaf_hair_lower', 'petiole_length', 'leaflet_leaf_shape', 'leaf_teeth', 'juvenile_leaf_type', 'juvenile_leaf_shape_type', 'juvenile_leaf_margin_type', 'juvenile_leaf_dimensions', 'juvenile_leaf_hair', 'juvenile_petiole_length', 'juvenile_leaf_width_dimensions', 'juvenile_leaf_length_dimensions', 'flower_width', 'peduncle_dimensions', 'flower_length', 'rachis_length', 'rachis_diameter', 'head_length', 'head_diameter', 'bur_length', 'tepal_length', 'staminate_tepal_length', 'pistillate_tepal_length', 'ray_length', 'ray_width', 'florets_length', 'corolla_width', 'corolla_length', 'inflorescence

In [13]:
manual_results = {
    "datasheet": [
        {
            "Family": "Asteraceae",
            "Genus": "Bidens",
            "Species": "pilosa",
            "Common_Name": "spanish needle",
            "Hawaiian_name_1": "Ki",
            "Hawaiian_name_2": "Ki nehe",
            "Hawaiian_name_3": "Ki pipili",
            "Hawaiian_name_4": "Nehe",
            "Wagner_Book_#": 279,
            "Pg_#": "pg 267-271,279-281",
            "Description": "Dicots",
            "Life_Form_Type": "AH",
            "Stem_Height_(m)_min": 0.3,
            "Stem_Height_(m)_max": 1.8,
            "Phyllotaxy_Type": "O",
            "Leaf_Type": "Compound",
            "Leaflet_leaf_type": np.nan,
            "Leaf_Margin_Type": np.nan,
            "Leaflets_Shape_Type": np.nan,
            "Leaf_Length_(cm)_min": 2.5,
            "Leaf_Length_(cm)_max": 13.5,
            "Leaflet_Length_(cm)_min": np.nan,
            "Leaflet_Length_(cm)_max": np.nan,
            "Leaflet_Width_(cm)_min": np.nan,
            "Leaflet_Width_(cm)_max": np.nan,
            "Leaf_Hair_Type": "G",
            "Breeding_Type": "M",
            "Infloresence_Type": "Cy",
            "Head Length (mm) Min": 8,
            "Head Length (mm) Max": 10,
            "Ray Length (mm) min": 2,
            "Ray Length (mm) max": 8,
            "Ray Width (mm) Min": np.nan,
            "Ray Width (mm) Max": np.nan,
            "Ray Color": "Yellow or White",
            "Bract_Length_(mm)_Min": 2.5,
            "Bract_Length_(mm)_Max": 5,
            "Peduncle_Length_Min_(mm)": 10,
            "Peduncle_Length_Max_(mm)": 90,
            "Pappus Length (mm) Min": 1,
            "Pappus Length (mm) Max": 2,
            "Corolla_Type": "T",
            "Corolla_color": "yellow",
            "Fruit_Type": "A",
            "Fruit_length_(mm)_min": 8,
            "Fruit_length_(mm)_max": 16,
            "Fruit_width_(mm)_ min": np.nan,
            "Fruit_width_(mm)_max": np.nan,
            "Chromosome_#": "24, 36, 46, 48, 72, ca. 76",
            "Average_Chromosome_#": 50.33,
            "Origin_t1": "PC",
            "Island_H": 1.0,
            "Island_MA": 1.0,
            "Island_KAH": 1.0,
            "Island_MO": 1,
            "Island_L": 1.0,
            "Island_O": 1,
            "Island_KAU": 1.0,
            "Island_NI": 1.0,
            "Island_A": 1.0,
            "FedStatus_t1 (do at end) ": "NS"
        },
        {
            "Family": "Asteraceae",
            "Genus": "Bidens",
            "Species": "molokaiensis",
            "Common_Name": "ko`oko`olau",
            "Hawaiian_name_1": "ko`oko`olau",
            "Hawaiian_name_2": np.nan,
            "Hawaiian_name_3": np.nan,
            "Hawaiian_name_4": np.nan,
            "Wagner_Book_#": 279,
            "Pg_#": "pg 267-271,279",
            "Description": "Dicots",
            "Life_Form_Type": "PH",
            "Stem_Height_(m)_min": 0.1,
            "Stem_Height_(m)_max": 0.3,
            "Phyllotaxy_Type": "O",
            "Leaf_Type": "S or C ",
            "Leaflet_leaf_type": "S",
            "Leaf_Margin_Type": "T",
            "Leaflets_Shape_Type": "C or OVA",
            "Leaf_Length_(cm)_min": 3.0,
            "Leaf_Length_(cm)_max": 7.0,
            "Leaflet_Length_(cm)_min": 1.5,
            "Leaflet_Length_(cm)_max": 4.0,
            "Leaflet_Width_(cm)_min": 1.0,
            "Leaflet_Width_(cm)_max": 4.5,
            "Leaf_Hair_Type": "G",
            "Breeding_Type": "M",
            "Infloresence_Type": "Cy",
            "Head Length (mm) Min": 30,
            "Head Length (mm) Max": 55,
            "Ray Length (mm) min": 17,
            "Ray Length (mm) max": 25,
            "Ray Width (mm) Min": 9.0,
            "Ray Width (mm) Max": 11.0,
            "Ray Color": "Yellow or White",
            "Bract_Length_(mm)_Min": 3.0,
            "Bract_Length_(mm)_Max": 6,
            "Peduncle_Length_Min_(mm)": 50,
            "Peduncle_Length_Max_(mm)": 180,
            "Pappus Length (mm) Min": "Abset",
            "Pappus Length (mm) Max": 1,
            "Corolla_Type": "T",
            "Corolla_color": "yellow",
            "Fruit_Type": "A",
            "Fruit_length_(mm)_min": 6,
            "Fruit_length_(mm)_max": 12,
            "Fruit_width_(mm)_ min": 1.0,
            "Fruit_width_(mm)_max": 1.5,
            "Chromosome_#": 72,
            "Average_Chromosome_#": 72.0,
            "Origin_t1": np.nan,
            "Island_H": np.nan,
            "Island_MA": np.nan,
            "Island_KAH": np.nan,
            "Island_MO": 1,
            "Island_L": np.nan,
            "Island_O": 1,
            "Island_KAU": np.nan,
            "Island_NI": np.nan,
            "Island_A": np.nan,
            "FedStatus_t1 (do at end) ": "SOC"
        }
    ]
}

# Convert to DataFrame



In [None]:
groun_truth = {"family": manual_results['datasheet'][0]['Family'], "genus": manual_results['datasheet'][0]['Genus'], "species": "Species"}


In [None]:
manual_results['datasheet'][0]['Species']

'pilosa'

In [None]:
for prop in list(props)[0:3]:
    print("prop")
    for model in models:
        user_prompt = f""" manually annotated {prop}: {groun_truth[prop]}
        automatically annotated {prop}:  {getattr(eval(model),  prop)}
        """
        print("\n\n"+user_prompt)
        r = await validation_agent.run(user_prompt)
        print(r.data)

        print(f"{prop}:{getattr(eval(model),  prop)}:{r.data.are_equal}", end="\t")
    print("\n")  

prop


 manually annotated family: Asteraceae
        automatically annotated family:  Asteraceae
        
are_equal=True justificaiton="The manually annotated family 'Asteraceae' and the automatically annotated family 'Asteraceae' are identical, with no linguistic or formatting nuances that would suggest otherwise."
family:Asteraceae:True	

 manually annotated family: Asteraceae
        automatically annotated family:  Acanthaceae
        
are_equal=False justificaiton='The manually annotated family is Asteraceae, while the automatically annotated family is Acanthaceae. These are two distinct and different plant families, with Asteraceae being the family of sunflowers and daisies, and Acanthaceae being the family of acanthus plants. Therefore, the two results are not equal.'
family:Acanthaceae:False	

 manually annotated family: Asteraceae
        automatically annotated family:  Acanthaceae
        
are_equal=False justificaiton='The manually annotated family, Asteraceae, and the aut

## Testing Area

In [4]:
auto_df = pd.read_csv("/Users/williamharrigan/Desktop/UH/Year_3/semester_2/wagner/auto_extract_2.csv")
auto_df.columns = auto_df.columns.str.lower()
auto_df.head()

Unnamed: 0,unnamed: 0,family,genus,species,subspecies,common_name,hawaiian_names,infraspecific_epithet,deciduous,wagner_book_number,...,calyx_length_lobes,calyx_width_lobes,ploidy,chromosome_number,average_chromosome_number,breeding_type,locations,origin,federal_status,status
0,0,Aizoaceae,Trianthema,portulacastrum,,,,,,179,...,,,2n,26.0,,MONOECIOUS,['OAHU'],NATURALIZED,,SECURE
1,1,Apiaceae,Hydrocotyle,sibthorpioides,,,,,,"24, 48, 64",...,,,"2n = 24, 48, 64",,,,"['KAUAI', 'OAHU', 'MAUI', 'HAWAII']",NATURALIZED,,
2,2,Amaranthaceae,Achyranthes,mutica,,,,nelsonii,,,...,,,,,,,['KAUAI'],ENDEMIC,,RARE
3,3,Apiaceae,Daucus,pusillus,,American carrot,[],,,,...,,,2n = 22,22.0,,,"['OAHU', 'MOLOKAI', 'LANAI', 'HAWAII']",INDIGENOUS,,
4,4,Apiaceae,Centella,asiatica,,Asiatic pennywort,['pohe kula'],,,1,...,,,2n,,,,"['HAWAII', 'MAUI', 'MOLOKAI', 'LANAI', 'OAHU',...",NATURALIZED,,NATURALIZED


In [5]:
df= pd.read_csv("/Users/williamharrigan/Desktop/Github/ai_wagner_trait_data_extraction/files/man_extract.csv")
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,family,genus,species,common_name,hawaiian_name_1,hawaiian_name_2,hawaiian_name_3,hawaiian_name_4,wagner_pg_#,description,...,island_type_ma,island_type_kah,island_type_mo,island_type_l,island_type_o,island_type_kau,island_type_ni,island_type_a,fedstatus_t1 (do at end),status
0,Asteraceae,Ambrosia,artemisiifolia,common ragweed,,,,,pg 256-257,Dicots,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,NS,Naturalized
1,Asteraceae,Dubautia,laxa,na`ena`e pua melemele,Na`ena`e pua melemele,,,,"pg 292-295,301",Dicots,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,NS,Endemic
2,Asteraceae,Tetramolopium,filiforme,no common name,,,,,"pg 361-362, 365, 366",Dicots,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,E,Endemic
3,Asteraceae,Encelia,farinosa,brittle bush,,,,,pg 312-313,Dicots,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NS,Naturalized
4,Aristolochiaceae,Aristolochia,littoralis,calico flower,,,,,"pg 237-238,239",Dicots,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,NS,Naturalized


In [6]:
def collapse_type_columns(df):
    # Identify all columns that have '_type_' in their names
    type_cols = [col for col in df.columns if '_type_' in col]

    # Group columns by their prefix (everything before '_type_' + last element)
    col_groups = {}
    for col in type_cols:
        prefix = '_'.join(col.split('_')[:-1])  # Get the prefix
        if prefix not in col_groups:
            col_groups[prefix] = []
        col_groups[prefix].append(col)

    # Create new collapsed columns
    for prefix, cols in col_groups.items():
        df[prefix] = df.apply(lambda row: [col.split('_')[-1].upper() for col in cols if row[col] == 1], axis=1)
        
    # Drop the original type columns
    df.drop(columns=type_cols, inplace=True)

    return df

def merge_hawaiian_name_columns(df):
    # Identify columns that contain 'hawaiian_name'
    hawaiian_cols = [col for col in df.columns if 'hawaiian_name' in col]

    # Ensure there are columns to merge
    if not hawaiian_cols:
        return df
    
    # Merge values into a list, ensuring all values are strings and filtering out empty values
    df['hawaiian_name'] = df[hawaiian_cols].apply(lambda row: [str(val) for val in row if pd.notna(val) and str(val).strip() != ''], axis=1)

    # Drop the original columns
    df.drop(columns=hawaiian_cols, inplace=True)

    return df


In [7]:
type_cols = [col for col in df.columns if '_type_' in col]
# type_cols
# Group columns by their prefix (everything before '_type_' + last element)
col_groups = {}
for col in type_cols:
    prefix = '_'.join(col.split('_')[:-1])
    # print(prefix)
    if prefix not in col_groups:
        col_groups[prefix] = []
    col_groups[prefix].append(col)
    # print(col)

# # Create new collapsed columns
for prefix, cols in col_groups.items():
    df[prefix] = df.apply(lambda row: [col.split('_')[-1].upper() for col in cols if row[col] == 1], axis=1)
    
# Drop the original type columns
df.drop(columns=type_cols, inplace=True)

df['hawaiian_name'] = df[['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4']].apply(
    lambda row: {x for x in row if pd.notna(x)}, axis=1
)

# Drop original columns if needed
df.drop(columns=['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4'], inplace=True)



# hawaiian_cols = [col for col in df.columns if 'hawaiian_name' in col]
# # Merge values into a list, ensuring all values are strings and filtering out empty values
# df['hawaiian_name'] = df[hawaiian_cols].apply(lambda row: [str(val) for val in row if pd.notna(val) and str(val).strip() != ''], axis=1)

# # Drop the original columns
# df.drop(columns=hawaiian_cols, inplace=True)

In [8]:
df['island_type']

0     [H, MA, MO, O]
1                [O]
2                [O]
3               [MA]
4       [MA, O, KAU]
           ...      
94                []
95                []
96                []
97                []
98          [O, KAU]
Name: island_type, Length: 99, dtype: object

In [9]:
def collapse_measurements(df):
    # Group columns by their base name (everything before _min, _max, etc.)
    base_columns = {}
    
    for col in df.columns:
        parts = col.split('_')
        if len(parts) > 2:
            suffix = parts[-1]
            if suffix in ['exmin', 'min', 'max', 'exmax']:
                base_name = '_'.join(parts[:-1])
                if base_name not in base_columns:
                    base_columns[base_name] = []
                base_columns[base_name].append(col)
    
    # Columns to copy directly (non-measurement columns)
    non_measurement_cols = [col for col in df.columns 
                           if not any(col in cols for cols in base_columns.values())]
    
    # Start with non-measurement columns
    result_df = df[non_measurement_cols].copy()
    
    # Process each group of measurement columns
    collapsed_data = {}
    
    for base_name, cols in base_columns.items():
        # Extract just the name without the unit
        name_parts = base_name.split('_')
        unit = name_parts[-1]
        col_name = '_'.join(name_parts[:-1])
        
        # Create dictionaries for each row
        collapsed_data[col_name] = []
        
        for _, row in df.iterrows():
            measurement_dict = {
                'exmin': row[f"{base_name}_exmin"] if f"{base_name}_exmin" in cols else None,
                'min': row[f"{base_name}_min"] if f"{base_name}_min" in cols else None,
                'max': row[f"{base_name}_max"] if f"{base_name}_max" in cols else None,
                'exmax': row[f"{base_name}_exmax"] if f"{base_name}_exmax" in cols else None,
                'unit': unit
            }
            collapsed_data[col_name].append(measurement_dict)
    
    # Convert to DataFrame and concatenate with result_df
    collapsed_df = pd.DataFrame(collapsed_data)
    result_df = pd.concat([result_df, collapsed_df], axis=1)
    
    return result_df

# Example usage
# df is your original dataframe with columns like stem_height_m_exmin, etc.
new_df = collapse_measurements(df)

In [10]:
new_df = collapse_measurements(df)

In [11]:
for i in new_df.columns:
    print(i)

family
genus
species
common_name
wagner_pg_#
description
infraspecific_epithet
stem_hair_type
phyllotaxy_type
leaf_hair_description
leaf_hair_upper_description
leaf_hair_lower_description
breeding_type
inflorescence_type
ray_color
floret_color
hypanthium_length_mm_max.1
spathe_color
perianth_outer_color
perianth_inner_color
perianth_color
labellum_color
lower_calyx_length_mm_exmax.1
corolla_type
staminate_corolla_type
pistillate_corolla_type
corolla_color
fruit_type
fruit_width_mm_max.1
fruit_diameter_mm_min.1
ploidy
chromosome_#
average_chromosome_#
origin_t1
fedstatus_t1 (do at end) 
status
life_form_type
leaf_type
leaf_margin_type
leaf_shape_type
juvenile_leaf_type
juvenile_leaf_margin_type
juvenile_leaf_shape_type
leaflets_leaf_type
leaflets_leaf_margin_type
leaflets_leaf_shape_type
leaf_hair_type
leaf_hair_upper_type
leaf_hair_lower_type
juvenile_leaf_hair_type
island_type
hawaiian_name
stem_height
leaf_length
leaf_width
juvenile_leaf_length
juvenile_leaf_width
leaflet_leaf_length

In [13]:
pwd

'/Users/williamharrigan/Desktop/Github/ai_wagner_trait_data_extraction/ipynb'

In [15]:
# new_df.to_csv('/Users/williamharrigan/Desktop/test_csv')

In [16]:
for i in new_df.columns:
    if len(i.split('_')) > 2:
        thing = metric = i.split('_')[-1]
        metric = i.split('_')[-2]
        col_name = '_'.join(i.split('_')[:-2])
        if thing == 'max' or thing == 'min' or thing == 'exmax' or thing == 'exmin':
            print(i)

In [39]:
list(HawaiianPlant.schema()['$defs'].keys())

/var/folders/6n/xxr1dffx2lbdwlqml4kf1yy40000gn/T/ipykernel_97017/1977610553.py:1: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  list(HawaiianPlant.schema()['$defs'].keys())


['BreedingType',
 'CorollaType',
 'FederalStatusType',
 'FemaleInflorescenceType',
 'FruitType',
 'InflorescenceType',
 'LeafHairType',
 'LeafShapeType',
 'LeafTeeth',
 'LeafType',
 'LifeFormType',
 'Location',
 'Measurements',
 'OriginType',
 'PhyllotaxyType',
 'StatusType',
 'StemHairType']

In [40]:
for i in new_df.columns:
    print(i)

family
genus
species
common_name
wagner_pg_#
description
infraspecific_epithet
stem_hair_type
phyllotaxy_type
leaf_hair_description
leaf_hair_upper_description
leaf_hair_lower_description
breeding_type
inflorescence_type
ray_color
floret_color
hypanthium_length_mm_max.1
spathe_color
perianth_outer_color
perianth_inner_color
perianth_color
labellum_color
lower_calyx_length_mm_exmax.1
corolla_type
staminate_corolla_type
pistillate_corolla_type
corolla_color
fruit_type
fruit_width_mm_max.1
fruit_diameter_mm_min.1
ploidy
chromosome_#
average_chromosome_#
origin_t1
fedstatus_t1 (do at end) 
status
life_form_type
leaf_type
leaf_margin_type
leaf_shape_type
juvenile_leaf_type
juvenile_leaf_margin_type
juvenile_leaf_shape_type
leaflets_leaf_type
leaflets_leaf_margin_type
leaflets_leaf_shape_type
leaf_hair_type
leaf_hair_upper_type
leaf_hair_lower_type
juvenile_leaf_hair_type
island_type
hawaiian_name
stem_height
leaf_length
leaf_width
juvenile_leaf_length
juvenile_leaf_width
leaflet_leaf_length