In [None]:
# pip install --upgrade pydantic-ai openai

# Wagner Database AI Extraction

#### This code works best with Python 3.10 or 3.11 (3.10 <= version < 3.12); it was developed on 3.10.13

In [1]:
from enum import Enum
from typing import Optional, List
import os
from openai import OpenAI
import json
import base64
import pandas as pd
import numpy as np
import sys
sys.path.append('/Users/williamharrigan/Desktop/UH/Year_3/semester_2/wagner')
import creds

## pydantic
from pydantic import BaseModel, Field
from openai.lib._pydantic import to_strict_json_schema
from pydantic_ai import Agent
from openai.types.chat.chat_completion_content_part_param import (
    ChatCompletionContentPartTextParam,
    ChatCompletionContentPartImageParam
)

from openai.types.chat.chat_completion_content_part_image_param import (
    ImageURL
)

photo_dir = "../training_set/"

In [135]:
class Description(str, Enum):
    # Major plant group of a taxon; used as the type of `HawaiianPlant.description`.
    # Unlike the other enums in this cell, the values are capitalised words rather
    # than upper-case tokens.
    DICOTS = "Dicots"
    MONOCOTS = "Monocots"
    CONIFERS = "Conifers"
    FERNS = "Ferns and fern allies"

class StemHairType(str, Enum):
    # Allowed values for `HawaiianPlant.stem_hair_type`.
    # The member list is identical to `LeafHairType`; the two are kept as separate
    # classes, so each appears under its own name in the schema.
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"
    HIRSUTE = "HIRSUTE"
    HISPID = "HISPID"
    LEPIDOTE = "LEPIDOTE"
    PILOSE = "PILOSE"
    PUBERULENT = "PUBERULENT"
    STRIGOSE = "STRIGOSE"
    STELLATE = "STELLATE"
    TOMENTOSE = "TOMENTOSE"
    VILLOUS = "VILLOUS"
    GLAUCOUS = "GLAUCOUS"
    
class FruitType(str, Enum):
    # Allowed values for `HawaiianPlant.fruit_type`; each value equals its member name.
    ACHENE = "ACHENE"
    AGGREGATE = "AGGREGATE"
    ARTICLE = "ARTICLE"
    BERRY = "BERRY"
    CAPSULE = "CAPSULE"
    CARYOPSIS = "CARYOPSIS"
    DRUPE = "DRUPE"
    FOLLICLE = "FOLLICLE"
    LEGUME = "LEGUME"
    MERICARP = "MERICARP"
    MULTIPLE = "MULTIPLE"
    NUT = "NUT"
    PEPO = "PEPO"
    POME = "POME"
    SCHIZOCARP = "SCHIZOCARP"
    SILICLE = "SILICLE"
    SILIQUE = "SILIQUE"
    SYCONIUM = "SYCONIUM"
    
class LeafShapeType(str, Enum):
    # Allowed leaf-shape values; shared by `HawaiianPlant.leaf_shape_type`,
    # `juvenile_leaf_shape_type` and `leaflets_leaf_shape_type`.
    ACEROSE = "ACEROSE"
    AWL_SHAPED = "AWL_SHAPED"
    GLADIATE = "GLADIATE"
    HASTATE = "HASTATE"
    CORDATE = "CORDATE"
    DELTOID = "DELTOID"
    LANCEOLATE = "LANCEOLATE"
    LINEAR = "LINEAR"
    ELLIPTIC = "ELLIPTIC"
    ENSIFORM = "ENSIFORM"
    LYRATE = "LYRATE"
    OBCORDATE = "OBCORDATE"
    FALCATE = "FALCATE"
    FLABELLATE = "FLABELLATE"
    OBDELTOID = "OBDELTOID"
    OBELLIPTIC = "OBELLIPTIC"
    OBLANCEOLATE = "OBLANCEOLATE"
    OBLONG = "OBLONG"
    PERFOLIATE = "PERFOLIATE"
    QUADRATE = "QUADRATE"
    OBOVATE = "OBOVATE"
    ORBICULAR = "ORBICULAR"
    RENIFORM = "RENIFORM"
    RHOMBIC = "RHOMBIC"
    OVAL = "OVAL"
    OVATE = "OVATE"
    ROTUND = "ROTUND"
    SAGITTATE = "SAGITTATE"
    PANDURATE = "PANDURATE"
    PELTATE = "PELTATE"
    SPATULATE = "SPATULATE"
    SUBULATE = "SUBULATE"

class StemHairs(str, Enum):
    # Two-member enum whose members also exist in `StemHairType`.
    # NOTE(review): no field of `HawaiianPlant` in this cell uses this class — it
    # looks like a leftover; confirm it is unused elsewhere before removing it.
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"

class PhyllotaxyType(str, Enum):
    # Allowed values for `HawaiianPlant.phyllotaxy_type` (the arrangement of leaves
    # around the stem, per that field's description).
    ALTERNATE = "ALTERNATE"
    OPPOSITE = "OPPOSITE"
    WHORLED = "WHORLED"
    DECUSSATE = "DECUSSATE"
    DISTICHOUS = "DISTICHOUS"
    EQUITANT = "EQUITANT"
    TERNATE = "TERNATE"
    CAULINE = "CAULINE"

class InflorescenceType(str, Enum):
    # Allowed values for `HawaiianPlant.inflorescence_type`.
    # NOTE(review): "VERTISCILLATE" looks like a misspelling of "VERTICILLATE".
    # The value is emitted in extracted data, so confirm against the target
    # database vocabulary before renaming it.
    CATKIN = "CATKIN"
    CYME = "CYME"
    HEAD = "HEAD"
    PANICLE = "PANICLE"
    RACEME = "RACEME"
    SPATHE_SPADIX = "SPATHE_SPADIX"
    THYRSE = "THYRSE"
    UMBEL = "UMBEL"
    VERTISCILLATE = "VERTISCILLATE"
    SOLITARY = "SOLITARY"
    SPIKE = "SPIKE"
    LANCEOLATE = "LANCEOLATE"
    GLOBOSE = "GLOBOSE"
    INVOLUCRE = "INVOLUCRE"
    CORYMBOSE = "CORYMBOSE"
    STROBILOID = "STROBILOID"

class LeafHairType(str, Enum):
    # Allowed leaf-hair values; shared by `HawaiianPlant.leaf_hair_type`,
    # `leaf_hair_upper_type`, `leaf_hair_lower_type` and `juvenile_leaf_hair_type`.
    # The member list is identical to `StemHairType`.
    DENDRITIC = "DENDRITIC"
    GLABROUS = "GLABROUS"
    HIRSUTE = "HIRSUTE"
    HISPID = "HISPID"
    LEPIDOTE = "LEPIDOTE"
    PILOSE = "PILOSE"
    PUBERULENT = "PUBERULENT"
    STRIGOSE = "STRIGOSE"
    STELLATE = "STELLATE"
    TOMENTOSE = "TOMENTOSE"
    VILLOUS = "VILLOUS"
    GLAUCOUS = "GLAUCOUS"

class LeafType(str, Enum):
    # Simple vs. compound leaf; shared by `HawaiianPlant.leaf_type`,
    # `juvenile_leaf_type` and `leaflets_leaf_type`.
    SIMPLE = "SIMPLE"
    COMPOUND = "COMPOUND"

class BreedingType(str, Enum):
    # Allowed values for `HawaiianPlant.breeding_type`.
    # The two POLYGAMO_* members use an underscore in the member name but a hyphen
    # in the emitted value.
    MONOECIOUS = "MONOECIOUS"  # Male and female flowers on the same plant
    ANDROMONOECIOUS = "ANDROMONOECIOUS"
    CHASMOGAMOUS = "CHASMOGAMOUS"
    DIOECIOUS = "DIOECIOUS"  # Separate male and female plants
    GYNODIOECIOUS = "GYNODIOECIOUS"
    POLYGAMO_MONOECIOUS = "POLYGAMO-MONOECIOUS"
    POLYGAMOUS = "POLYGAMOUS"
    POLYGAMO_DIOECIOUS = "POLYGAMO-DIOECIOUS"
    GYNOMONOECIOUS = "GYNOMONOECIOUS"
    STERILE = "STERILE"

class LeafMarginType(str, Enum):
    # Leaf-margin category; shared by `HawaiianPlant.leaf_margin_type`,
    # `juvenile_leaf_margin_type` and `leaflets_leaf_margin_type`.
    TEETH = "TEETH"
    LOBED = "LOBED"
    ENTIRE = "ENTIRE"


class LifeFormType(str, Enum):
    # Allowed values for `HawaiianPlant.life_form_type` (growth habit or life form,
    # per that field's description).
    ANNUAL_HERB = "ANNUAL_HERB"
    PERENNIAL_HERB = "PERENNIAL_HERB"
    EPIPHYTE = "EPIPHYTE"
    VINE = "VINE"
    SHRUB = "SHRUB"
    TREE = "TREE"

class CorollaType(str, Enum):
    # Allowed corolla descriptors; shared by `HawaiianPlant.corolla_type`,
    # `staminate_corolla_type` and `pistillate_corolla_type`.
    # NOTE(review): "LOBBED" and "FASICLE" look like misspellings of "LOBED" and
    # "FASCICLE". These values are emitted in extracted data, so confirm against
    # the target database vocabulary before renaming them.
    ADNATE = "ADNATE"
    BILABIATE = "BILABIATE"
    CAMPANULATE = "CAMPANULATE"
    CORYMBOSE = "CORYMBOSE"
    CONVOLUTE = "CONVOLUTE"
    CORONA = "CORONA"
    CUNEATE = "CUNEATE"
    CYLINDRICAL = "CYLINDRICAL"
    DISK = "DISK"
    DELTATE = "DELTATE"
    ELLIPTIC = "ELLIPTIC"
    FUNNELFORM = "FUNNELFORM"
    FLABELLATE = "FLABELLATE"
    FILIFORM = "FILIFORM"
    HOOD = "HOOD"
    IRREGULAR = "IRREGULAR"
    KEEL = "KEEL"
    LABELLUM = "LABELLUM"
    LANCEOLATE = "LANCEOLATE"
    LINEAR = "LINEAR"
    LIPPED = "LIPPED"
    LOBBED = "LOBBED"
    OVATE = "OVATE"
    OBLONG = "OBLONG"
    OBCORDATE = "OBCORDATE"
    OBOVATE = "OBOVATE"
    OBLANCEOLATE = "OBLANCEOLATE"
    ORBICULAR = "ORBICULAR"
    PALATE = "PALATE"
    PSEUDORACEMES = "PSEUDORACEMES"
    ROTATE = "ROTATE"
    RAY = "RAY"
    REFLEXED = "REFLEXED"
    RHOMBIC = "RHOMBIC"
    RENIFORM = "RENIFORM"
    SALVERFORM = "SALVERFORM"
    SUBORBICULAR = "SUBORBICULAR"
    SUBRHOMBIC = "SUBRHOMBIC"
    SPUR = "SPUR"
    SPATULATE = "SPATULATE"
    SPICATE = "SPICATE"
    SUBROTATE = "SUBROTATE"
    STANDARD = "STANDARD"
    TUBULAR = "TUBULAR"
    TRIANGULAR = "TRIANGULAR"
    URCEOLATE = "URCEOLATE"
    UNILABIATE = "UNILABIATE"
    VALVATE = "VALVATE"
    VERTICIL = "VERTICIL"
    ZYGOMORPHIC = "ZYGOMORPHIC"
    CUP = "CUP"
    UNGUICULATE = "UNGUICULATE"
    CLAW = "CLAW"
    FASICLE = "FASICLE"
    STELLATE = "STELLATE"
    SUBPANICULATE = "SUBPANICULATE"
    PENTAGONAL = "PENTAGONAL"

class OriginType(str, Enum):
    # Allowed values for `HawaiianPlant.origin`.
    # POLYNESIAN_INTRODUCTION emits a value containing a space, not an underscore.
    NATURALIZED = "NATURALIZED"
    INDIGENOUS = "INDIGENOUS"
    ENDEMIC = "ENDEMIC"
    POLYNESIAN_INTRODUCTION = "POLYNESIAN INTRODUCTION"

class Location(str, Enum):
    # Island names; `HawaiianPlant.island_type` is a list of these.
    # ALL_ISLANDS emits a value containing a space, not an underscore.
    HAWAII = "HAWAII"
    MAUI = "MAUI"
    KAHOOLAWE = "KAHOOLAWE"
    MOLOKAI = "MOLOKAI"
    LANAI = "LANAI"
    OAHU = "OAHU"
    KAUAI = "KAUAI"
    NIIHAU = "NIIHAU"
    ALL_ISLANDS = "ALL ISLANDS"

class FederalStatusType(str, Enum):
    # Allowed values for `HawaiianPlant.fed_status` (federal conservation status,
    # per that field's description).
    SPECIES_OF_CONCERN = "SPECIES_OF_CONCERN"
    ENDANGERED = "ENDANGERED"
    THREATENED = "THREATENED"
    WITHDRAWN = "WITHDRAWN" 
    
class StatusType(str, Enum):
    # Allowed values for `HawaiianPlant.status` (general status, per that field's
    # description). NATURALIZED and ENDEMIC also appear in `OriginType`.
    NATURALIZED = "NATURALIZED"
    ENDEMIC = "ENDEMIC"
    RARE = "RARE"
    SECURE = "SECURE"
    VULNERABLE = "VULNERABLE"

class Measurements(BaseModel):
    # Numeric range attached to every dimensional field of `HawaiianPlant`.
    # All four bounds are optional floats defaulting to None; the unit is not
    # stored here but stated in the description of the field that uses this model.
    # NOTE(review): `extreme_min` / `extreme_max` presumably hold the parenthesised
    # outliers of ranges written like "(2-)3-5(-7)" — confirm with the prompt author.
    min: Optional[float] = None
    max: Optional[float] = None
    extreme_min: Optional[float] = None
    extreme_max: Optional[float] = None

class HawaiianPlant(BaseModel):
    # Target schema for one plant treatment: the extraction agents below are
    # created with `result_type=HawaiianPlant`, so every field description here is
    # part of the instructions the model receives.
    #
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a class docstring into the generated JSON schema, which would change
    # what is sent to the model.
    #
    # Each field name is declared exactly once. Re-declaring a name inside a class
    # body silently replaces the earlier definition, which previously hid a
    # millimetre/centimetre conflict on the fruit dimensions.
    #
    # NOTE(review): `pistilate_involucre_length`, `male_corrola_lobes_length` and
    # `male_corrola_lobes_width` are misspelled field names. They are kept as-is
    # because renaming would change the output keys consumers may rely on.

    # Basic Information
    family: str = Field(..., description="Plant family name (should only be 1 object)")
    genus: str = Field(..., description="Plant genus name (should only be 1 object)")
    species: str = Field(..., description="Plant species name (should only be 1 object)")
    common_name: Optional[str] = Field(None, description="Common name of the plant")
    wagner_pg_number: Optional[str] = Field(None, description="Wagner book reference number")
    description: Optional[Description] = Field(None, description="Take knowledge from outside the passage to infer whether the plant is DICOTS, MONOCOTS, CONIFERS or FERNS")
    # NOTE(review): this is a required field although not every taxon has an
    # infraspecific rank — consider Optional[str] if consumers can handle None.
    infraspecific_epithet: str = Field(..., description="The third word in the scientific name of an infraspecific taxon, following the name of the species. This applies only to formal names of plants and fungi, and not to the formal names of bacteria or animals. In the name Cannabis sativa subsp. indica, the word indica is the infraspecific epithet.")

    hawaiian_name: Optional[List[str]] = Field(None, description="List of Hawaiian names")

    stem_hair_type: Optional[StemHairType] = Field(None, description="Type of hair on stem")

    phyllotaxy_type: Optional[PhyllotaxyType] = Field(None, description="The arrangement of leaves around the stem.")

    leaf_hair_description: Optional[str] = Field(None, description="Description of leaf hair.")
    leaf_hair_upper_description: Optional[str] = Field(None, description="Description of Upper leaf hairs.")
    leaf_hair_lower_description: Optional[str] = Field(None, description="Description of Lower leaf hairs.")

    breeding_type: Optional[BreedingType] = Field(None, description="Plant reproductive class.")

    inflorescence_type: Optional[InflorescenceType] = Field(None, description="In a flowering plant, a cluster of flowers on a branch or a system of branches")

    # Colors (free text)
    ray_color: Optional[str] = Field(None, description="Color of ray")
    floret_color: Optional[str] = Field(None, description="Color of florets")
    spathe_color: Optional[str] = Field(None, description="Color of spathe")
    perianth_outer_color: Optional[str] = Field(None, description="Color of perianth outer flower")
    perianth_inner_color: Optional[str] = Field(None, description="Color of perianth inner flower")
    perianth_color: Optional[str] = Field(None, description="Color of perianth")
    labellum_color: Optional[str] = Field(None, description="Color of labellum")

    corolla_type: Optional[CorollaType] = Field(None, description="Type of corolla")
    corolla_color: Optional[str] = Field(None, description="Color of corolla")
    staminate_corolla_type: Optional[CorollaType] = Field(None, description="Type of staminate corolla")
    pistillate_corolla_type: Optional[CorollaType] = Field(None, description="Type of pistillate corolla")

    # Fruit. The three dimensions used to be declared twice (millimeters here,
    # centimeters near the end of the class); the later centimeter definitions were
    # the ones in effect, so that text is kept at the original field position.
    # NOTE(review): confirm centimeters is the intended unit.
    fruit_type: Optional[FruitType] = Field(None, description="Type of fruit")
    fruit_length: Optional[Measurements] = Field(None, description="Fruit length in centimeters")
    fruit_width: Optional[Measurements] = Field(None, description="Fruit width in centimeters")
    fruit_diameter: Optional[Measurements] = Field(None, description="Fruit diameter in centimeters")

    ploidy: Optional[str] = Field(None, description="Ploidy level expressed as a function of n (e.g., 1n, 2n or 3n, etc..)")
    chromosome_number: Optional[int] = Field(None, description="The integer Number of chromosomes")
    average_chromosome_number: Optional[float] = Field(None, description="Average chromosome number")

    origin: Optional[OriginType] = Field(None, description="Origin type of the plant")
    fed_status: Optional[FederalStatusType] = Field(None, description="Federal conservation status")
    status: Optional[StatusType] = Field(None, description="General status")

    life_form_type: Optional[LifeFormType] = Field(None, description="Growth habit or life form")
    leaf_type: Optional[LeafType] = Field(None, description="Simple or compound leaf type")
    leaf_shape_type: Optional[LeafShapeType] = Field(None, description="Shape of leaves")
    leaf_margin_type: Optional[LeafMarginType] = Field(None, description="Type of leaf margin")

    juvenile_leaf_type: Optional[LeafType] = Field(None, description="Simple or compound juvenile leaf type")
    juvenile_leaf_shape_type: Optional[LeafShapeType] = Field(None, description="Shape of juvenile leaves")
    juvenile_leaf_margin_type: Optional[LeafMarginType] = Field(None, description="Type of leaf margin on juvenile plants")

    # Leaflet descriptions previously repeated the juvenile-leaf wording.
    leaflets_leaf_type: Optional[LeafType] = Field(None, description="Simple or compound leaflet type")
    leaflets_leaf_shape_type: Optional[LeafShapeType] = Field(None, description="Shape of leaflets")
    leaflets_leaf_margin_type: Optional[LeafMarginType] = Field(None, description="Type of margin on leaflets")

    leaf_hair_upper_type: Optional[LeafHairType] = Field(None, description="Type of upper leaf hairs")
    leaf_hair_lower_type: Optional[LeafHairType] = Field(None, description="Type of lower leaf hairs")
    leaf_hair_type: Optional[LeafHairType] = Field(None, description="Type of leaf hairs")
    juvenile_leaf_hair_type: Optional[LeafHairType] = Field(None, description="Type of juvenile leaf hairs")

    island_type: Optional[List[Location]] = Field(None, description="Islands where the plant is found")

    # Vegetative measurements (units differ per field — see each description)
    stem_height: Optional[Measurements] = Field(None, description="Stem or general plant height measurements in meters")
    leaf_length: Optional[Measurements] = Field(None, description="Length of leaves in millimeters")
    leaf_width: Optional[Measurements] = Field(None, description="Width of leaves in millimeters")
    juvenile_leaf_length: Optional[Measurements] = Field(None, description="Juvenile length of leaves in millimeters")
    juvenile_leaf_width: Optional[Measurements] = Field(None, description="Juvenile width of leaves in millimeters")
    leaflets_leaf_length: Optional[Measurements] = Field(None, description="Leaflets length of leaves in millimeters")
    leaflets_leaf_width: Optional[Measurements] = Field(None, description="Leaflets width of leaves in millimeters")

    petioles: Optional[Measurements] = Field(None, description="Length of petiole stalk in centimeters")

    # Inflorescence measurements. The pistillate descriptions now state the same
    # unit as their staminate counterparts.
    staminate_inflorescence_length: Optional[Measurements] = Field(None, description="The measured length in millimeters of the male (pollen-producing) flower cluster. This specifically refers to catkins or other inflorescences containing only staminate (male) flowers.")
    pistillate_inflorescence_length: Optional[Measurements] = Field(None, description="The measured length in millimeters of the female (seed-producing) flower cluster. This specifically refers to inflorescences containing only pistillate (female) flowers.")
    staminate_inflorescence_width: Optional[Measurements] = Field(None, description="The measured width in millimeters of the male (pollen-producing) flower cluster. This specifically refers to catkins or other inflorescences containing only staminate (male) flowers.")
    pistillate_inflorescence_width: Optional[Measurements] = Field(None, description="The measured width in millimeters of the female (seed-producing) flower cluster. This specifically refers to inflorescences containing only pistillate (female) flowers.")

    inflorescence_flower_length: Optional[Measurements] = Field(None, description="The length of an inflorescence flower in millimeters.")
    inflorescence_flower_width: Optional[Measurements] = Field(None, description="The width of an inflorescence flower in millimeters.")

    flower_length: Optional[Measurements] = Field(None, description="Flower length in centimeters")
    flower_width: Optional[Measurements] = Field(None, description="Flower width in centimeters")

    rachis_length: Optional[Measurements] = Field(None, description="Rachis length in millimeters")
    rachis_diameter: Optional[Measurements] = Field(None, description="Rachis diameter in millimeters")

    head_length: Optional[Measurements] = Field(None, description="The measured length of the capitulum (flower head) in millimeters.")
    head_diameter: Optional[Measurements] = Field(None, description="The measured diameter of the capitulum (flower head) in millimeters.")

    bur_length: Optional[Measurements] = Field(None, description="The measured length of the bur in millimeters.")
    tepal_length: Optional[Measurements] = Field(None, description="The measured length of the tepal in millimeters.")
    staminate_tepal_length: Optional[Measurements] = Field(None, description="The measured length of the staminate tepal in millimeters.")
    pistillate_tepal_length: Optional[Measurements] = Field(None, description="The measured length of the pistillate tepal in millimeters.")

    ray_length: Optional[Measurements] = Field(None, description="The measured length of the ray in millimeters.")
    ray_width: Optional[Measurements] = Field(None, description="The measured width of the ray in millimeters.")

    florets_length: Optional[Measurements] = Field(None, description="The measured length of the florets in millimeters.")

    involucre_length: Optional[Measurements] = Field(None, description="Involucre length in millimeters")
    involucre_width: Optional[Measurements] = Field(None, description="Involucre width in millimeters")
    staminate_involucre_length: Optional[Measurements] = Field(None, description="Staminate involucre length in millimeters")
    pistilate_involucre_length: Optional[Measurements] = Field(None, description="Pistillate involucre length in millimeters")

    bract_length: Optional[Measurements] = Field(None, description="Bract length in millimeters")
    bract_width: Optional[Measurements] = Field(None, description="Bract width in millimeters")
    bract_lower_length: Optional[Measurements] = Field(None, description="Lower bract length in millimeters")
    bract_outer_length: Optional[Measurements] = Field(None, description="Outer bract length in millimeters")

    bracteoles_length: Optional[Measurements] = Field(None, description="Bracteoles length in millimeters")
    bracteoles_width: Optional[Measurements] = Field(None, description="Bracteole width in millimeters")

    pedicel_length: Optional[Measurements] = Field(None, description="Pedicel length in millimeters")
    pedicel_width: Optional[Measurements] = Field(None, description="Pedicel width in millimeters")
    staminate_pedicel_length: Optional[Measurements] = Field(None, description="Staminate pedicel length in millimeters")
    pistillate_pedicel_length: Optional[Measurements] = Field(None, description="Pistillate pedicel length in millimeters")
    staminate_pedicel_width: Optional[Measurements] = Field(None, description="Staminate pedicel width in millimeters")
    pistillate_pedicel_width: Optional[Measurements] = Field(None, description="Pistillate pedicel width in millimeters")

    hypanthium_length: Optional[Measurements] = Field(None, description="Hypanthium length in millimeters")
    hypanthium_width: Optional[Measurements] = Field(None, description="Hypanthium width in millimeters")

    peduncle_length: Optional[Measurements] = Field(None, description="Peduncle length in millimeters")
    peduncle_width: Optional[Measurements] = Field(None, description="Peduncle width in millimeters")
    staminate_peduncle_length: Optional[Measurements] = Field(None, description="Staminate peduncle length in millimeters")
    staminate_peduncle_width: Optional[Measurements] = Field(None, description="Staminate peduncle width in millimeters")
    pistillate_peduncle_length: Optional[Measurements] = Field(None, description="Pistillate peduncle length in millimeters")
    pistillate_peduncle_width: Optional[Measurements] = Field(None, description="Pistillate peduncle width in millimeters")

    spathe_width: Optional[Measurements] = Field(None, description="Spathe width dimensions in millimeters")
    spathe_length: Optional[Measurements] = Field(None, description="Spathe length dimensions in millimeters")
    spadix_length: Optional[Measurements] = Field(None, description="Spadix length dimensions in millimeters")

    perianth_width: Optional[Measurements] = Field(None, description="Perianth width dimensions in millimeters")
    perianth_length: Optional[Measurements] = Field(None, description="Perianth length dimensions in millimeters")
    perianth_outer_width: Optional[Measurements] = Field(None, description="Outer perianth width dimensions in millimeters")
    perianth_outer_length: Optional[Measurements] = Field(None, description="Outer perianth length dimensions in millimeters")
    perianth_inner_width: Optional[Measurements] = Field(None, description="Inner perianth width dimensions in millimeters")
    perianth_inner_length: Optional[Measurements] = Field(None, description="Inner perianth length dimensions in millimeters")

    perianth_lobes_width: Optional[Measurements] = Field(None, description="Perianth lobes width dimensions in millimeters")
    perianth_lobes_length: Optional[Measurements] = Field(None, description="Perianth lobes length dimensions in millimeters")
    perianth_tube_length: Optional[Measurements] = Field(None, description="Perianth tube length dimensions in millimeters")
    pistillate_perianth_tube_length: Optional[Measurements] = Field(None, description="Pistillate perianth tube length dimensions in millimeters")
    staminate_perianth_tube_length: Optional[Measurements] = Field(None, description="Staminate perianth tube length dimensions in millimeters")

    pappus_length: Optional[Measurements] = Field(None, description="Pappus length in millimeters")
    umbellet_length: Optional[Measurements] = Field(None, description="Umbellet length in millimeters")
    labellum_width: Optional[Measurements] = Field(None, description="Labellum width dimensions in millimeters")
    labellum_length: Optional[Measurements] = Field(None, description="Labellum length dimensions in millimeters")

    # Calyx measurements
    calyx_length: Optional[Measurements] = Field(None, description="Calyx length in millimeters")
    calyx_width: Optional[Measurements] = Field(None, description="Calyx width in millimeters")
    calyx_teeth_length: Optional[Measurements] = Field(None, description="Calyx teeth length in millimeters")
    calyx_teeth_width: Optional[Measurements] = Field(None, description="Calyx teeth width in millimeters")
    calyx_lobes_length: Optional[Measurements] = Field(None, description="Calyx lobe length in millimeters")
    calyx_lobes_width: Optional[Measurements] = Field(None, description="Calyx lobe width in millimeters")

    upper_calyx_length: Optional[Measurements] = Field(None, description="Upper calyx length in millimeters")
    lower_calyx_length: Optional[Measurements] = Field(None, description="Lower calyx length in millimeters")

    inner_calyx_lobes_length: Optional[Measurements] = Field(None, description="Inner calyx lobes length in millimeters")
    inner_calyx_lobes_width: Optional[Measurements] = Field(None, description="Inner calyx lobes width in millimeters")
    outer_calyx_lobes_length: Optional[Measurements] = Field(None, description="Outer calyx lobes length in millimeters")
    outer_calyx_lobes_width: Optional[Measurements] = Field(None, description="Outer calyx lobes width in millimeters")

    calyx_tube_length: Optional[Measurements] = Field(None, description="Calyx tube length in millimeters")
    calyx_tube_width: Optional[Measurements] = Field(None, description="Calyx tube width in millimeters")

    male_calyx_length: Optional[Measurements] = Field(None, description="Male calyx length in millimeters")
    male_calyx_width: Optional[Measurements] = Field(None, description="Male calyx width in millimeters")

    # The *_calyx_lobes_* descriptions previously duplicated the whole-calyx text,
    # making them indistinguishable from male_calyx_* / female_calyx_* for the model.
    male_calyx_lobes_length: Optional[Measurements] = Field(None, description="Male calyx lobes length in millimeters")
    male_calyx_lobes_width: Optional[Measurements] = Field(None, description="Male calyx lobes width in millimeters")

    female_calyx_length: Optional[Measurements] = Field(None, description="female calyx length in millimeters")
    female_calyx_width: Optional[Measurements] = Field(None, description="female calyx width in millimeters")

    female_calyx_lobes_length: Optional[Measurements] = Field(None, description="Female calyx lobes length in millimeters")
    female_calyx_lobes_width: Optional[Measurements] = Field(None, description="Female calyx lobes width in millimeters")

    male_calyx_lobes_length_inner: Optional[Measurements] = Field(None, description="Male calyx inner lobe length in millimeters")
    male_calyx_lobes_length_outer: Optional[Measurements] = Field(None, description="Male calyx outer lobe length in millimeters")
    female_calyx_lobes_length_inner: Optional[Measurements] = Field(None, description="Female calyx inner lobe length in millimeters")
    female_calyx_lobes_length_outer: Optional[Measurements] = Field(None, description="Female calyx outer lobe length in millimeters")

    male_calyx_lobes_width_outer: Optional[Measurements] = Field(None, description="Male calyx outer lobe width in millimeters")
    male_calyx_tube_length: Optional[Measurements] = Field(None, description="Male calyx tube length in millimeters")

    female_calyx_lobes_width_inner: Optional[Measurements] = Field(None, description="Female calyx inner lobe width in millimeters")
    female_calyx_lobes_width_outer: Optional[Measurements] = Field(None, description="Female calyx outer lobe width in millimeters")
    female_calyx_tube_length: Optional[Measurements] = Field(None, description="Female calyx tube length in millimeters")

    inner_calyx_length: Optional[Measurements] = Field(None, description="Inner calyx length in millimeters")
    outer_calyx_length: Optional[Measurements] = Field(None, description="Outer calyx length in millimeters")

    # Corolla measurements (corolla_length / corolla_width were declared twice
    # with identical definitions; one copy is kept)
    corolla_length: Optional[Measurements] = Field(None, description="Corolla length in millimeters")
    corolla_width: Optional[Measurements] = Field(None, description="Corolla width in millimeters")
    corolla_tube_length: Optional[Measurements] = Field(None, description="Corolla tube length in millimeters")
    corolla_tube_width: Optional[Measurements] = Field(None, description="Corolla tube width in millimeters")
    corolla_lobes_length: Optional[Measurements] = Field(None, description="Corolla lobes length in millimeters")
    corolla_lobes_width: Optional[Measurements] = Field(None, description="Corolla lobes width in millimeters")

    upper_corolla: Optional[Measurements] = Field(None, description="Upper corolla length in millimeters")
    lower_corolla: Optional[Measurements] = Field(None, description="Lower corolla length in millimeters")
    upper_corolla_lobes_length: Optional[Measurements] = Field(None, description="Upper corolla lobes length in millimeters")
    lower_corolla_lobes_length: Optional[Measurements] = Field(None, description="Lower corolla lobes length in millimeters")

    corolla_lip: Optional[Measurements] = Field(None, description="Corolla lip length in millimeters")

    staminate_corolla_length: Optional[Measurements] = Field(None, description="Staminate corolla length in millimeters")
    pistillate_corolla_length: Optional[Measurements] = Field(None, description="Pistillate corolla length in millimeters")

    staminate_corolla_tube_length: Optional[Measurements] = Field(None, description="Staminate corolla tube length in millimeters")
    pistillate_corolla_tube_length: Optional[Measurements] = Field(None, description="Pistillate corolla tube length in millimeters")

    # Width fields previously carried the "length" description of the pair above.
    staminate_corolla_tube_width: Optional[Measurements] = Field(None, description="Staminate corolla tube width in millimeters")
    pistillate_corolla_tube_width: Optional[Measurements] = Field(None, description="Pistillate corolla tube width in millimeters")

    female_corolla_lobes_length: Optional[Measurements] = Field(None, description="Female corolla lobes length in millimeters")
    female_corolla_lobes_width: Optional[Measurements] = Field(None, description="Female corolla lobes width in millimeters")
    male_corrola_lobes_length: Optional[Measurements] = Field(None, description="Male corolla lobes length in millimeters")
    male_corrola_lobes_width: Optional[Measurements] = Field(None, description="Male corolla lobes width in millimeters")

    # Seeds
    seeds_perfruit: Optional[Measurements] = Field(None, description="Number of seeds per fruit")

    seed_length: Optional[Measurements] = Field(None, description="Seed length in centimeters")
    seed_width: Optional[Measurements] = Field(None, description="Seed width in centimeters")
    seed_diameter: Optional[Measurements] = Field(None, description="Seed diameter in centimeters")


In [None]:
## Read the page image and base64-encode it so it can be embedded in the requests
## sent to the extraction agents (chatbots).
## os.path.join avoids the doubled separator produced by appending "/<file>" to a
## photo_dir that already ends in "/".
image_path = os.path.join(photo_dir, "Acanthaceae_Dicliptera_chinensis.jpeg")
with open(image_path, "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

## Setting the overall framework for how the extraction agent operates across all interactions
system_prompt  = """
You are an expert plant taxonomist. Please analyze this image and return the details according to the schema provided.
You are exhaustive; you include ALL the details mentioned. Do not make any assumptions about the data and do not try to
interpret what is not obvious from the text. When extracting information, ensure that you return a structured response in JSON format matching the `HawaiianPlant` schema. """

## Prompt to set specific focus for each extraction agent interaction (can be dynamic as need be)
user_prompt= "Transcribe the plant information you see in this image"


## GPT-4o Text Transcription (Text Extraction Only)

In [None]:
## Setting extraction agent to be gpt-4o
## System prompt is being set to specifically transcribe information
openai_img_to_text_transcription_agent = Agent(
    model="openai:gpt-4o",
    result_type=str,
    system_prompt = "You are a vision model capable of accurately performing OCR on an image",
)

## Data to be extracted from input image
image_urls = [
    f"data:image/png;base64,{base64_image}",
]

## Setting chat parameters: low detail (for efficiency) from input image
image_params = [
    ChatCompletionContentPartImageParam(
        type='image_url', 
        image_url=ImageURL(url=url, detail='low')
    ) for url in image_urls
]

## Setting chat prompt to 'user_prompt'
msg_open_ai = [
            ChatCompletionContentPartTextParam(text="Convert this to text. Don't miss any text.", type='text'),
            *image_params
]

## Running data extraction
r = await openai_img_to_text_transcription_agent.run(msg_open_ai)

## Output to 'text'
text = r.data.split("---")[1]
print(text)

## DeepSeek Model Extraction (R1 distill llama 70B) from text to Hawaiian Plant Schema

In [None]:
# ## Setting deepseek LLM as extraction agent
# ## Data to be extracted in HawaiianPlant schema
# groq_extraction_agent = Agent(
#     model="groq:deepseek-r1-distill-llama-70b",
#     retries=3,
#     result_type=HawaiianPlant,
#     system_prompt = system_prompt,
#     # Had to raise temperature to 0.5
#     model_settings = {'temperature': 0.5}
# )

# ## Setting variables and extracting data
# r = await groq_extraction_agent.run(text)
# deepseek_llama = r.data


## GPT-4o Data Extraction to Hawaiian Plant Schema

In [None]:
## Setting extraction agent to be gpt-4o
## Extracted data will be formatted according to HawaiianPlant schema
openai_extraction_agent = Agent(
    model="openai:gpt-4o",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
)

## Data to be extracted from input image
image_urls = [
    f"data:image/png;base64,{base64_image}",
]

## Setting chat parameters: low detail (for efficiency) from input image
image_params = [
    ChatCompletionContentPartImageParam(
        type='image_url', 
        image_url=ImageURL(url=url, detail='low')
    ) for url in image_urls
]

## Setting chat prompt to 'user_prompt'
msg_open_ai = [
            ChatCompletionContentPartTextParam(text=user_prompt, type='text'),
            *image_params
]

## Running data extraction
r = await openai_extraction_agent.run(msg_open_ai)

## Setting results to 'gpt_4o_output'
gpt_4o_output = r.data

## Anthropic Model Data Extraction

In [None]:
## Setting extraction agent to be Claude Sonnet model
## Extracted data will be formatted according to HawaiianPlant schema
sonnet_extraction_agent = Agent(
    model="anthropic:claude-3-5-sonnet-latest",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
    model_settings = {'temperature': 0.2}

)

## Setting context
msg_claude = [
    ChatCompletionContentPartTextParam(text=user_prompt, type='text'),
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": f"{base64_image}",
                    }
                },
    
]

## Extracting data from image
r = await sonnet_extraction_agent.run(msg_claude)
sonnet_output = r.data

## GPT o3-Mini Model Data Extraction

In [None]:
## Setting extraction agent to be o3-mini
## Extracted data will be formatted according to HawaiianPlant schema
gptmini_extraction_agent = Agent(
    model="openai:o3-mini",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
)

## Extracting output from extracted text
## (this agent receives the transcribed 'text', not the image)
r = await gptmini_extraction_agent.run(text)

## Setting output
gpt_mini_output = r.data

In [None]:
## Spot-check one randomly drawn row of df
df.sample(1)

In [None]:
## Number of rows in df (a bare len() call raises TypeError)
len(df)

In [125]:
family = 'Araliaceae'
genus = 'Cheirodendron'
species = 'forbesii'

## Input to base64 format to be interpreted by extraction agents (chatbots)
## os.path.join avoids the doubled separator that f"{photo_dir}/..." produces
## when photo_dir already ends with '/'.
with open(os.path.join(photo_dir, f"{family}_{genus}_{species}.jpeg"), "rb") as image_file:
    base64_image = base64.b64encode(image_file.read()).decode("utf-8")

## Setting the overall framework for how the extraction agent operates across all interactions
system_prompt  = """
You are an expert plant taxonomist. Please analyze this image and return the details according to the schema provided.
You are exhaustive; you include ALL the details mentioned. Do not make any assumptions about the data and do not try to
interpret what is not obvious from the text. When extracting information, ensure that you return a structured response in JSON format matching the `HawaiianPlant` schema. """

## Prompt to set specific focus for each extraction agent interaction (can be dynamic as need be)
user_prompt= "Transcribe the plant information you see in this image"

## Setting extraction agent to be gpt-4o
## System prompt is being set to specifically transcribe information
openai_img_to_text_transcription_agent = Agent(
    model="openai:gpt-4o",
    result_type=str,
    system_prompt = "You are a vision model capable of accurately performing OCR on an image",
)

## Data to be extracted from input image
image_urls = [
    f"data:image/png;base64,{base64_image}",
]

## Setting chat parameters: low detail (for efficiency) from input image
image_params = [
    ChatCompletionContentPartImageParam(
        type='image_url', 
        image_url=ImageURL(url=url, detail='low')
    ) for url in image_urls
]

## Setting chat prompt to 'user_prompt'
msg_open_ai = [
            ChatCompletionContentPartTextParam(text="Convert this to text. Don't miss any text.", type='text'),
            *image_params
]

## Running data extraction
r = await openai_img_to_text_transcription_agent.run(msg_open_ai)

## Output to 'text'

if len(r.data.split("---")) == 1:
    text = r.data.split("---")[0]
else:
    text = r.data.split("---")[1]
print(text)

## Setting extraction agent to be gpt-4o
## Extracted data will be formatted according to HawaiianPlant schema
openai_extraction_agent = Agent(
    model="openai:gpt-4o",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
)

## Data to be extracted from input image
image_urls = [
    f"data:image/png;base64,{base64_image}",
]

## Setting chat parameters: low detail (for efficiency) from input image
image_params = [
    ChatCompletionContentPartImageParam(
        type='image_url', 
        image_url=ImageURL(url=url, detail='low')
    ) for url in image_urls
]

## Setting chat prompt to 'user_prompt'
msg_open_ai = [
            ChatCompletionContentPartTextParam(text=user_prompt, type='text'),
            *image_params
]

## Running data extraction
r = await openai_extraction_agent.run(msg_open_ai)

## Setting results to 'gpt_4o_output'
gpt_4o_output = r.data

print('Done: gpt_4o_output')

## Setting extraction agent to be Claude Sonnet model
## Extracted data will be formatted according to HawaiianPlant schema
sonnet_extraction_agent = Agent(
    model="anthropic:claude-3-5-sonnet-latest",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
    model_settings = {'temperature': 0.2}

)

## Setting context
msg_claude = [
    ChatCompletionContentPartTextParam(text=user_prompt, type='text'),
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": f"{base64_image}",
                    }
                },
    
]

## Extracting data from image
r = await sonnet_extraction_agent.run(msg_claude)
sonnet_output = r.data

print('Done: sonnet_output')


## Setting extraction agent to be o3-mini
## Extracted data will be formatted according to HawaiianPlant schema
gptmini_extraction_agent = Agent(
    model="openai:o3-mini",
    result_type=HawaiianPlant,
    system_prompt = system_prompt,
)

## Extracting output from extracted text
## (this agent receives the transcribed 'text', not the image)
r = await gptmini_extraction_agent.run(text)

## Setting output
gpt_mini_output = r.data

print('Done: miniGPT ')


3. Cheirodendron forbesii (Sherff) Lowry  
[= Kaduaea Kraj. var. Forbesii Sherff]  
(end)  

Trees 3-4 m tall. Leaves 3 (occasionally 4 on juvenile leaves), subcoriaceous, elliptic to ovate, 5-12 cm long and less than 3.5 cm wide, venation sunken on upper surface in dried material (weakly so on juvenile leaves), margins entire, apex rounded-acute to acuminate, base narrowly cuneate to attenuate, petioles 3-7(-10) mm. inflorescence: umbo, pedicels 0.5-1.5 in long surpassing leaves, pubescence on calyx lobes sparse dense, styles 3-1 mm long, corl (4-5)-conic, styles (4)(5) 1-1.5 mm long, concave. Vein: the length into a conical configuration: triangular or verruculent. styles 5 in spaces.  
Propagation: USU can be made by strong-hybridizing in water ac. Collector: Natural cycle: Prepare a cutting handling of water flow, 90-100 M beneath 110K endnote.  
Distribution: Wet forests, 600-900 m, Mount Kahili, north fork of Kana ohe, Punoe, Malakala w. Mountains, and around Makaeo.  
Note: Spec

In [136]:
## Load the manually extracted trait table and lower-case its column names
df = pd.read_csv("/Users/williamharrigan/Desktop/Github/ai_wagner_trait_data_extraction/files/man_extract.csv")
df.columns = df.columns.str.lower()

## Flag columns are named '<prefix>_type_<code>'; bucket them by the name minus its last token
type_cols = [col for col in df.columns if '_type_' in col]
col_groups = {}
for col in type_cols:
    prefix = col.rsplit('_', 1)[0]
    col_groups.setdefault(prefix, []).append(col)

## Each bucket becomes one list column holding the upper-cased codes whose flag equals 1
for prefix, cols in col_groups.items():
    df[prefix] = df.apply(
        lambda row: [flag.rsplit('_', 1)[-1].upper() for flag in cols if row[flag] == 1],
        axis=1,
    )

## The individual flag columns are no longer needed
df.drop(columns=type_cols, inplace=True)

## Gather the non-missing Hawaiian names of every row into a set
## (merge_hawaiian_name_columns, defined further down, is a list-based alternative)
df['hawaiian_name'] = df[
    ['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4']
].apply(
    lambda row: {name for name in row if pd.notna(name)},
    axis=1,
)
df.drop(
    columns=['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4'],
    inplace=True,
)

## Fold the *_exmin/_min/_max/_exmax columns into one dict column per measurement
new_df = collapse_measurements(df)

In [137]:
## Ground truth: first row of new_df matching the selected family / genus / species, as a dict
ground_truth = new_df[
    (new_df['family'] == family)
    & (new_df['genus'] == genus)
    & (new_df['species'] == species)
].to_dict(orient='records')[0]

## Each model's structured output converted to a dict, keyed by the output variable's name
model_outputs = {
    'gpt_4o_output': dict(gpt_4o_output),
    'gpt_mini_output': dict(gpt_mini_output),
    'sonnet_output': dict(sonnet_output),
}

In [138]:
## Print every ground-truth field next to the value each model extracted for it
for gt_key, gt_value in ground_truth.items():
    print(f"Groundtruth {gt_key}: {gt_value}")
    for model in model_outputs.keys():
        # Some ground-truth columns (e.g. 'chromosome_#') have no counterpart in the
        # model output; .get() reports them instead of raising KeyError.
        print(f"Model {model}: {model_outputs[model].get(gt_key, '<not in model output>')}")

Groundtruth family: Araliaceae
Model gpt_4o_output: Araliaceae
Model gpt_mini_output: Araliaceae
Model sonnet_output: Araliaceae
Groundtruth genus: Cheirodendron
Model gpt_4o_output: Cheirodendron
Model gpt_mini_output: Cheirodendron
Model sonnet_output: Cheirodendron
Groundtruth species: forbesii
Model gpt_4o_output: forbesii
Model gpt_mini_output: forbesii
Model sonnet_output: forbesii
Groundtruth common_name: olapa, lapalapa
Model gpt_4o_output: None
Model gpt_mini_output: None
Model sonnet_output: None
Groundtruth wagner_pg_number: pg 225,227
Model gpt_4o_output: 227
Model gpt_mini_output: None
Model sonnet_output: 227
Groundtruth description: Dicots
Model gpt_4o_output: Dicots
Model gpt_mini_output: Dicots
Model sonnet_output: None
Groundtruth infraspecific_epithet: nan
Model gpt_4o_output: subsp.
Model gpt_mini_output: 
Model sonnet_output: forbesii
Groundtruth stem_hair_type: G
Model gpt_4o_output: None
Model gpt_mini_output: None
Model sonnet_output: None
Groundtruth phyllotaxy

In [None]:


## Show the family reported by each model
for model in model_outputs:
    print(model)
    print(model_outputs[model]['family'])

## Comparing Model Outputs

In [None]:
## Print the first two elements of every item yielded by the o3-mini result
## (presumably (field name, value) pairs — verify against HawaiianPlant)
for entry in gpt_mini_output:
    first, second = entry[0], entry[1]
    print(first, second)

In [None]:
### IGNORE THE FOLLOWING CODE, IT IS A WORK SPACE

In [None]:
## Checking manual results versus automatically extracted results
## Result schema for the validation agent: a boolean verdict plus a text justification
# NOTE(review): the class name and the `justificaiton` field are misspelled, but the class
# is referenced as result_type by validation_agent, so both are left unchanged here.
class AreAnntoationsEqual(BaseModel):
    # Verdict: whether the two annotations are considered equal
    are_equal: bool = Field(..., description="Are the two annotations equal")
    # Explanation for the value given in are_equal
    justificaiton: str = Field(..., description="Justification of the propose value for are_equal")

## Setting the prompt and model for validation agent
## The agent's answer is returned as an AreAnntoationsEqual instance (result_type)
validation_agent = Agent(
    model="groq:llama-3.3-70b-specdec",
    result_type=AreAnntoationsEqual,
    system_prompt = """You are an expert taxonomist. You are comparing the outcome of a manually extracted result versus an automatically extracted result. You need to compare the automatic results and determine whether the result is synonymous or equal the manual one; taking into consideration
    linguisitc and formatting nuances. Your answer is whether the two results are similar True/False and a justificaiton for your answer""",
)

## Models to compare: names of the notebook variables holding each model's output
## (the variant including "deepseek_llama" is kept commented out)
# models = ["gpt_4o_output", "gpt_mini_output", "sonnet_output", "deepseek_llama"]
models = ["gpt_4o_output", "gpt_mini_output", "sonnet_output"]



In [None]:
## Column Names/ properties
## Property names declared in the HawaiianPlant JSON schema (a dict keys view)
props = HawaiianPlant.model_json_schema()['properties'].keys()
props

In [None]:
for prop in list(props)[0:3]:
    print("prop")
    for model in models:
        user_prompt = f""" manually annotated {prop}: {groun_truth[prop]}
        automatically annotated {prop}:  {getattr(eval(model),  prop)}
        """
        print("\n\n"+user_prompt)
        r = await validation_agent.run(user_prompt)
        print(r.data)

        print(f"{prop}:{getattr(eval(model),  prop)}:{r.data.are_equal}", end="\t")
    print("\n")  

## Testing Area

In [None]:
# auto_df = pd.read_csv("/Users/williamharrigan/Desktop/UH/Year_3/semester_2/wagner/auto_extract_2.csv")
# auto_df.columns = auto_df.columns.str.lower()
# auto_df.head()

In [33]:
## Load the manually extracted trait table and lower-case its column names
# NOTE(review): absolute, user-specific path — the cell only runs on the author's machine
df= pd.read_csv("/Users/williamharrigan/Desktop/Github/ai_wagner_trait_data_extraction/files/man_extract.csv")
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,family,genus,species,common_name,hawaiian_name_1,hawaiian_name_2,hawaiian_name_3,hawaiian_name_4,wagner_pg_number,description,...,island_type_ma,island_type_kah,island_type_mo,island_type_l,island_type_o,island_type_kau,island_type_ni,island_type_a,fedstatus_t1 (do at end),status
0,Asteraceae,Ambrosia,artemisiifolia,common ragweed,,,,,pg 256-257,Dicots,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,NS,Naturalized
1,Asteraceae,Dubautia,laxa,na`ena`e pua melemele,Na`ena`e pua melemele,,,,"pg 292-295,301",Dicots,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,NS,Endemic
2,Asteraceae,Tetramolopium,filiforme,no common name,,,,,"pg 361-362, 365, 366",Dicots,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,E,Endemic
3,Asteraceae,Encelia,farinosa,brittle bush,,,,,pg 312-313,Dicots,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NS,Naturalized
4,Aristolochiaceae,Aristolochia,littoralis,calico flower,,,,,"pg 237-238,239",Dicots,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,NS,Naturalized


In [34]:
## Columns whose name contains 'stem' for the Agavaceae / Pleomele rows of the raw table
df[(df['family'] == 'Agavaceae') & (df['genus'] == 'Pleomele')].filter(like='stem')
# amp[amp['family'] == 'Brassicaceae']['leaflet_leaf_width'].values

Unnamed: 0,stem_height_m_exmin,stem_height_m_min,stem_height_m_max,stem_height_m_exmax,stem_hair_type
47,,6.0,12.0,,
48,,6.0,8.0,,
49,,5.0,10.0,,


In [35]:
## Same Agavaceae / Pleomele selection on new_df, shown as a raw value array
new_df[(new_df['family'] == 'Agavaceae') & (new_df['genus'] == 'Pleomele')].filter(like='stem').values


array([[nan,
        {'exmin': nan, 'min': 6.0, 'max': 12.0, 'exmax': nan, 'unit': 'm'}],
       [nan,
        {'exmin': nan, 'min': 6.0, 'max': 8.0, 'exmax': nan, 'unit': 'm'}],
       [nan,
        {'exmin': nan, 'min': 5.0, 'max': 10.0, 'exmax': nan, 'unit': 'm'}]],
      dtype=object)

In [36]:
def collapse_type_columns(df):
    """Collapse one-hot '<prefix>_type_<code>' columns into one list column per prefix.

    Every column whose name contains '_type_' is grouped under its own name
    minus the last '_'-separated token. For each group a column named after
    that prefix is written; each of its cells is the list of upper-cased last
    tokens of the group's columns whose value equals 1 in that row. The
    original flag columns are then dropped.

    The frame is modified in place and is also returned.
    """
    flag_columns = [name for name in df.columns if '_type_' in name]

    # prefix -> flag columns sharing that prefix, in column order
    grouped = {}
    for name in flag_columns:
        grouped.setdefault(name.rsplit('_', 1)[0], []).append(name)

    for target, members in grouped.items():
        def flagged_codes(row, members=members):
            # Codes (last name token, upper-cased) of the flags set to 1 in this row
            return [m.rsplit('_', 1)[-1].upper() for m in members if row[m] == 1]

        df[target] = df.apply(flagged_codes, axis=1)

    # The individual flag columns are replaced by the collapsed ones
    df.drop(columns=flag_columns, inplace=True)
    return df

def merge_hawaiian_name_columns(df):
    """Merge every column containing 'hawaiian_name' into one 'hawaiian_name' list column.

    Each row's list holds the string form of its non-missing values that are
    not blank after stripping whitespace, in column order. The source columns
    are dropped afterwards. When no column name contains 'hawaiian_name' the
    frame is returned untouched.

    The frame is modified in place and is also returned.
    """
    name_columns = [c for c in df.columns if 'hawaiian_name' in c]
    if len(name_columns) == 0:
        return df

    def non_empty_names(row):
        # Keep values that are neither missing nor whitespace-only
        kept = []
        for value in row:
            if pd.notna(value) and str(value).strip() != '':
                kept.append(str(value))
        return kept

    df['hawaiian_name'] = df[name_columns].apply(non_empty_names, axis=1)
    df.drop(columns=name_columns, inplace=True)
    return df


In [37]:
## Flag columns are named '<prefix>_type_<code>'; bucket them by the name minus its last token
type_cols = [col for col in df.columns if '_type_' in col]
col_groups = {}
for col in type_cols:
    prefix = col.rsplit('_', 1)[0]
    col_groups.setdefault(prefix, []).append(col)

## Each bucket becomes one list column holding the upper-cased codes whose flag equals 1
for prefix, cols in col_groups.items():
    df[prefix] = df.apply(
        lambda row: [flag.rsplit('_', 1)[-1].upper() for flag in cols if row[flag] == 1],
        axis=1,
    )

## The individual flag columns are no longer needed
df.drop(columns=type_cols, inplace=True)

## Gather the non-missing Hawaiian names of every row into a set
## (merge_hawaiian_name_columns is a list-based alternative)
df['hawaiian_name'] = df[
    ['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4']
].apply(
    lambda row: {name for name in row if pd.notna(name)},
    axis=1,
)
df.drop(
    columns=['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4'],
    inplace=True,
)

In [38]:
## Inspect the collapsed island_type column (one list of island codes per row)
df['island_type']

0     [H, MA, MO, O]
1                [O]
2                [O]
3               [MA]
4       [MA, O, KAU]
           ...      
94                []
95                []
96                []
97                []
98          [O, KAU]
Name: island_type, Length: 99, dtype: object

In [39]:
def collapse_measurements(df):
    """Collapse '<name>_<unit>_{exmin,min,max,exmax}' columns into one dict column per name.

    A column counts as a measurement column when its name has more than two
    '_'-separated tokens and the last token is one of exmin / min / max /
    exmax. Columns sharing everything before that last token form a group.
    The last token of the group's base name is taken as the unit and the rest
    as the output column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table with string column names. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-measurement columns of ``df`` followed by one column per
        measurement group. Each cell of such a column is a dict with keys
        'exmin', 'min', 'max', 'exmax' and 'unit'; a bound whose source column
        does not exist in ``df`` is None. Rows keep the index of ``df``.

    Notes
    -----
    Two groups that differ only in their unit map to the same output name;
    the one appearing later in ``df`` overwrites the earlier one.
    """
    suffixes = ('exmin', 'min', 'max', 'exmax')

    # Group measurement columns by base name (everything before the bound suffix)
    base_columns = {}
    for col in df.columns:
        parts = col.split('_')
        if len(parts) > 2 and parts[-1] in suffixes:
            base_columns.setdefault('_'.join(parts[:-1]), []).append(col)

    # Every other column is copied through unchanged
    measurement_cols = {col for cols in base_columns.values() for col in cols}
    non_measurement_cols = [col for col in df.columns if col not in measurement_cols]
    result_df = df[non_measurement_cols].copy()

    n_rows = len(df)
    collapsed_data = {}
    for base_name, cols in base_columns.items():
        # Base name is '<name>_<unit>': split the unit off the end
        name_parts = base_name.split('_')
        unit = name_parts[-1]
        col_name = '_'.join(name_parts[:-1])

        # One value list per bound; a bound without a source column is all None
        bound_values = [
            df[f"{base_name}_{suffix}"].tolist()
            if f"{base_name}_{suffix}" in cols
            else [None] * n_rows
            for suffix in suffixes
        ]
        collapsed_data[col_name] = [
            {'exmin': exmin, 'min': low, 'max': high, 'exmax': exmax, 'unit': unit}
            for exmin, low, high, exmax in zip(*bound_values)
        ]

    # Reuse df's index: with a fresh RangeIndex the axis=1 concat would align on
    # labels and misplace rows whenever df is not indexed 0..n-1.
    collapsed_df = pd.DataFrame(collapsed_data, index=df.index)
    result_df = pd.concat([result_df, collapsed_df], axis=1)

    return result_df

# Example usage
# df is your original dataframe with columns like stem_height_m_exmin, etc.
new_df = collapse_measurements(df)

In [40]:
new_df = collapse_measurements(df)

In [41]:
## List every column of the collapsed table, one name per line
for column_name in new_df.columns:
    print(column_name)

family
genus
species
common_name
wagner_pg_number
description
infraspecific_epithet
stem_hair_type
phyllotaxy_type
leaf_hair_description
leaf_hair_upper_description
leaf_hair_lower_description
breeding_type
inflorescence_type
ray_color
floret_color
spathe_color
perianth_outer_color
perianth_inner_color
perianth_color
labellum_color
corolla_type
staminate_corolla_type
pistillate_corolla_type
corolla_color
fruit_type
fruit_width_mm_max.1
fruit_diameter_mm_min.1
ploidy
chromosome_#
average_chromosome_#
origin_t1
fedstatus_t1 (do at end) 
status
life_form_type
leaf_type
leaf_margin_type
leaf_shape_type
juvenile_leaf_type
juvenile_leaf_margin_type
juvenile_leaf_shape_type
leaflets_leaf_type
leaflets_leaf_margin_type
leaflets_leaf_shape_type
leaf_hair_type
leaf_hair_upper_type
leaf_hair_lower_type
juvenile_leaf_hair_type
island_type
hawaiian_name
stem_height
leaf_length
leaf_width
juvenile_leaf_length
juvenile_leaf_width
leaflet_leaf_length
leaflet_leaf_width
petioles
staminate_inflorescence

In [49]:
## Reload the manually extracted trait table and lower-case its column names
df = pd.read_csv("/Users/williamharrigan/Desktop/Github/ai_wagner_trait_data_extraction/files/man_extract.csv")
df.columns = df.columns.str.lower()

## Flag columns are named '<prefix>_type_<code>'; bucket them by the name minus its last token
type_cols = [col for col in df.columns if '_type_' in col]
col_groups = {}
for col in type_cols:
    prefix = col.rsplit('_', 1)[0]
    col_groups.setdefault(prefix, []).append(col)

## Each bucket becomes one list column holding the upper-cased codes whose flag equals 1
for prefix, cols in col_groups.items():
    df[prefix] = df.apply(
        lambda row: [flag.rsplit('_', 1)[-1].upper() for flag in cols if row[flag] == 1],
        axis=1,
    )

## The individual flag columns are no longer needed
df.drop(columns=type_cols, inplace=True)

## Gather the non-missing Hawaiian names of every row into a set
## (merge_hawaiian_name_columns is a list-based alternative)
df['hawaiian_name'] = df[
    ['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4']
].apply(
    lambda row: {name for name in row if pd.notna(name)},
    axis=1,
)
df.drop(
    columns=['hawaiian_name_1', 'hawaiian_name_2', 'hawaiian_name_3', 'hawaiian_name_4'],
    inplace=True,
)

## Fold the *_exmin/_min/_max/_exmax columns into one dict column per measurement
new_df = collapse_measurements(df)

In [42]:
## Random 5-row sample of the collapsed table (no random_state is passed, so it varies per run)
amp = new_df.sample(5)

In [43]:
## Brassicaceae rows of the sample; which rows appear depends on the sample drawn above
amp[amp['family'] == 'Brassicaceae']

Unnamed: 0,family,genus,species,common_name,wagner_pg_number,description,infraspecific_epithet,stem_hair_type,phyllotaxy_type,leaf_hair_description,...,female_corolla_lobes_width,male_corrola_lobes_length,male_corrola_lobes_width,fruit_length,fruit_width,fruit_diameter,seeds,seed_length,seed_width,seed_diameter
28,Brassicaceae,Nasturtium,microphyllum,watercress,pg 411,Dicots,,G,A,glabrous,...,"{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': nan, 'min': 16.0, 'max': 25.0, 'exma...","{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': None, 'min': None, 'max': nan, 'exma...","{'exmin': nan, 'min': nan, 'max': nan, 'exmax'...","{'exmin': None, 'min': nan, 'max': nan, 'exmax...","{'exmin': None, 'min': nan, 'max': nan, 'exmax..."


In [44]:
## Check that 'leaflet_leaf_length' is among the column names of the filtered sample
'leaflet_leaf_length' in amp[amp['family'] == 'Brassicaceae'].columns

True

In [45]:
# new_df.to_csv('/Users/williamharrigan/Desktop/test_csv')

In [46]:
# for i in new_df.columns:
#     if len(i.split('_')) > 2:
#         thing = metric = i.split('_')[-1]
#         metric = i.split('_')[-2]
#         col_name = '_'.join(i.split('_')[:-2])
#         if thing == 'max' or thing == 'min' or thing == 'exmax' or thing == 'exmin':
#             print(i)