In [1]:
import pandas as pd

In [2]:
annotations_df = pd.read_csv('walcott_annotations_from_powerapp.csv')
annotations_df.head()

Unnamed: 0,Title,bhl_pageid,scientific_name,common_name,author,geographic_range,specimen_location,altitude_feet,colors_listed,Created,Created By,[annotation notes]
0,,42667723,Prunus angustifolia,CHICKASAW PLUM,Marshall,"from Florida to southern New Jersey, and westw...","near Washington, District of Columbia",,"red, yellow",11/21/2025 1:57 PM,"Naples, Richard",
1,,42602674,Liriodendron tulipifera,Tuliptree,Linnaeus,eastern United States from Louisiana to Florid...,"Washington, District of Columbia",,"green, orange, gold",11/21/2025 2:03 PM,"Sobers, Kira",
2,,42667749,Zygadenus elegans,Deathcamas,Pursh,mountains of Nevada and New Mexico and northwa...,"Clearwater River, thirty-five miles by trail n...",6500.0,"greenish white, bright green",11/21/2025 2:05 PM,"Sobers, Kira",
3,,42669109,Tsuga mertensiana,Mountain Hemlock,(Bongard) Sargent,western Montana to California and Alaska,"near Glacier House, Glacier, British Columbia",3500.0,"dark green,",11/21/2025 2:06 PM,"Sobers, Kira",
4,,42602678,Dodecatheon meadia,Shootingstar,Linnaeus,"Texas to Georgia, and northward into Manitoba;...","near Washington, District of Columbia",,red,11/21/2025 2:08 PM,"Sobers, Kira",


In [3]:
annotations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               0 non-null      float64
 1   bhl_pageid          167 non-null    int64  
 2   scientific_name     167 non-null    object 
 3   common_name         167 non-null    object 
 4   author              167 non-null    object 
 5   geographic_range    166 non-null    object 
 6   specimen_location   159 non-null    object 
 7   altitude_feet       84 non-null     object 
 8   colors_listed       120 non-null    object 
 9   Created             167 non-null    object 
 10  Created By          167 non-null    object 
 11  [annotation notes]  16 non-null     object 
dtypes: float64(1), int64(1), object(10)
memory usage: 15.8+ KB


In [4]:
annotations_df['bhl_pageid'] = annotations_df['bhl_pageid'].astype(str)

In [5]:
import json
from pathlib import Path

In [6]:
model_dirs = {
    "gemini-2.5-pro": Path("gemini-2.5-pro_results"),
    "gemini-2.5-flash": Path("gemini-2.5-flash_results"),
    "gemini-2.5-flash-lite": Path("gemini-2.5-flash-lite_results"),
}

In [7]:
records = []
for model_name, dir_path in model_dirs.items():
    for json_file in dir_path.glob("*.json"):
        pageid = json_file.stem
        with open(json_file) as f:
            data = json.load(f)
        data["bhl_pageid"] = pageid
        data["model"] = model_name
        records.append(data)
predictions_df = pd.DataFrame(records)
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   plate_number       1176 non-null   object
 1   common_name        1176 non-null   object
 2   scientific_name    1176 non-null   object
 3   author             1176 non-null   object
 4   altitude_feet      1176 non-null   object
 5   geographic_range   1176 non-null   object
 6   specimen_location  1176 non-null   object
 7   colors_listed      1176 non-null   object
 8   bhl_pageid         1176 non-null   object
 9   model              1176 non-null   object
dtypes: object(10)
memory usage: 92.0+ KB


In [8]:
merged = predictions_df.merge(annotations_df, on="bhl_pageid", suffixes=("_pred", "_gt"))
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   plate_number            501 non-null    object 
 1   common_name_pred        501 non-null    object 
 2   scientific_name_pred    501 non-null    object 
 3   author_pred             501 non-null    object 
 4   altitude_feet_pred      501 non-null    object 
 5   geographic_range_pred   501 non-null    object 
 6   specimen_location_pred  501 non-null    object 
 7   colors_listed_pred      501 non-null    object 
 8   bhl_pageid              501 non-null    object 
 9   model                   501 non-null    object 
 10  Title                   0 non-null      float64
 11  scientific_name_gt      501 non-null    object 
 12  common_name_gt          501 non-null    object 
 13  author_gt               501 non-null    object 
 14  geographic_range_gt     498 non-null    ob

In [9]:
def normalize(s):
    if pd.isna(s):
        return ""
    return str(s).strip().lower()

In [10]:
merged["common_name_pred_norm"] = merged["common_name_pred"].apply(normalize)
merged["common_name_gt_norm"] = merged["common_name_gt"].apply(normalize)
merged["common_name_match"] = merged["common_name_pred_norm"] == merged["common_name_gt_norm"]

merged.groupby("model")["common_name_match"].mean()

model
gemini-2.5-flash         1.000000
gemini-2.5-flash-lite    0.988024
gemini-2.5-pro           1.000000
Name: common_name_match, dtype: float64

In [11]:
mismatches = merged[~merged["common_name_match"]][["bhl_pageid", "model", "common_name_pred_norm", "common_name_gt_norm"]]
mismatches.head(10)

Unnamed: 0,bhl_pageid,model,common_name_pred_norm,common_name_gt_norm
381,42669697,gemini-2.5-flash-lite,springbeauty,virginia springbeauty
488,42602734,gemini-2.5-flash-lite,mountain lady slipper,mountain ladyslipper


In [12]:
def calculate_accuracy(merged_df, column):
    """Calculate exact-match accuracy for a column across models."""
    pred_col = f"{column}_pred"
    gt_col = f"{column}_gt"
    match_col = f"{column}_match"
    
    pred_norm = merged_df[pred_col].apply(normalize)
    gt_norm = merged_df[gt_col].apply(normalize)
    merged_df[match_col] = pred_norm == gt_norm
    
    return merged_df.groupby("model")[match_col].mean()


In [13]:
for col in ["common_name", "scientific_name", "author"]:
    print(f"\n{col}:")
    print(calculate_accuracy(merged, col))


common_name:
model
gemini-2.5-flash         1.000000
gemini-2.5-flash-lite    0.988024
gemini-2.5-pro           1.000000
Name: common_name_match, dtype: float64

scientific_name:
model
gemini-2.5-flash         0.964072
gemini-2.5-flash-lite    0.874251
gemini-2.5-pro           0.994012
Name: scientific_name_match, dtype: float64

author:
model
gemini-2.5-flash         0.742515
gemini-2.5-flash-lite    0.712575
gemini-2.5-pro           0.964072
Name: author_match, dtype: float64


In [14]:
import re

def tokenize(s):
    """Normalize and split into word set."""
    if pd.isna(s):
        return set()
    # Lowercase, remove punctuation, split into words
    s = str(s).lower()
    s = re.sub(r'[^\w\s]', '', s)  # Remove punctuation
    return set(s.split())

def word_difference(s1, s2):
    """Count words that differ between two strings."""
    words1 = tokenize(s1)
    words2 = tokenize(s2)
    return len(words1.symmetric_difference(words2))

def calculate_fuzzy_accuracy(merged_df, column, max_diff=2):
    """Calculate accuracy allowing up to max_diff word differences."""
    pred_col = f"{column}_pred"
    gt_col = f"{column}_gt"
    match_col = f"{column}_match"
    diff_col = f"{column}_diff"
    
    merged_df[diff_col] = merged_df.apply(
        lambda row: word_difference(row[pred_col], row[gt_col]), 
        axis=1
    )
    merged_df[match_col] = merged_df[diff_col] <= max_diff
    
    return merged_df.groupby("model")[match_col].mean()

# Run it
for col in ["geographic_range", "specimen_location"]:
    print(f"\n{col} (max 2 words different):")
    print(calculate_fuzzy_accuracy(merged, col, max_diff=2))


geographic_range (max 2 words different):
model
gemini-2.5-flash         0.688623
gemini-2.5-flash-lite    0.748503
gemini-2.5-pro           0.778443
Name: geographic_range_match, dtype: float64

specimen_location (max 2 words different):
model
gemini-2.5-flash         0.904192
gemini-2.5-flash-lite    0.886228
gemini-2.5-pro           0.886228
Name: specimen_location_match, dtype: float64


In [15]:
merged["geographic_range_diff"].value_counts().sort_index().head(10)

geographic_range_diff
0    145
1    194
2     31
3     24
4     18
5     10
6     17
7     14
8     12
9     11
Name: count, dtype: int64

In [18]:
mismatches = merged[~merged["geographic_range_match"]][["bhl_pageid", "model", "geographic_range_pred", "geographic_range_gt"]]
mismatches.head(10).to_dict(orient='records')

[{'bhl_pageid': '42602672',
  'model': 'gemini-2.5-pro',
  'geographic_range_pred': 'Southern Appalachians, ranging from the uplands of Georgia northward, and being abundant as far as West Virginia. In the mountains of Pennsylvania it is rare, and though reported many years ago to grow in southern New York State, it has long since disappeared from there as a native plant.',
  'geographic_range_gt': 'from the uplands of Georgia northward, and being abundant as far as West Virginia'},
 {'bhl_pageid': '42669644',
  'model': 'gemini-2.5-pro',
  'geographic_range_pred': 'Colorado northward through the Rockies to the Arctic regions and westward to Oregon. It grows also in Greenland and Asia.',
  'geographic_range_gt': 'from Colorado northward through the Rockies to the Arctic regions and westward to Oregon, also in Greenland and Asia'},
 {'bhl_pageid': '42669097',
  'model': 'gemini-2.5-pro',
  'geographic_range_pred': 'Florida and Alabama northward to southeastern Virginia. It has been repo

In [None]:
def parse_colors(s):
    """Parse comma-separated colors into a normalized set."""
    if pd.isna(s) or str(s).strip() == "":
        return set()
    # Split on comma, strip whitespace, lowercase, remove empty strings
    colors = [c.strip().lower() for c in str(s).split(",")]
    return set(c for c in colors if c)  # Filter out empty strings

def calculate_set_accuracy(merged_df, column):
    """Calculate exact set match accuracy."""
    pred_col = f"{column}_pred"
    gt_col = f"{column}_gt"
    match_col = f"{column}_match"
    
    merged_df[f"{column}_pred_set"] = merged_df[pred_col].apply(parse_colors)
    merged_df[f"{column}_gt_set"] = merged_df[gt_col].apply(parse_colors)
    merged_df[match_col] = merged_df[f"{column}_pred_set"] == merged_df[f"{column}_gt_set"]
    
    return merged_df.groupby("model")[match_col].mean()

print("colors_listed (exact set match):")
print(calculate_set_accuracy(merged, "colors_listed"))

In [None]:
def normalize_altitude(s):
    """Normalize altitude: remove commas, 'feet', and whitespace."""
    if pd.isna(s) or str(s).strip() == "":
        return ""
    s = str(s).lower()
    s = s.replace(",", "")      # Remove commas
    s = s.replace("feet", "")   # Remove "feet"
    s = s.strip()
    return s

def calculate_altitude_accuracy(merged_df):
    """Calculate exact match accuracy for altitude_feet."""
    merged_df["altitude_feet_pred_norm"] = merged_df["altitude_feet_pred"].apply(normalize_altitude)
    merged_df["altitude_feet_gt_norm"] = merged_df["altitude_feet_gt"].apply(normalize_altitude)
    merged_df["altitude_feet_match"] = merged_df["altitude_feet_pred_norm"] == merged_df["altitude_feet_gt_norm"]
    
    return merged_df.groupby("model")["altitude_feet_match"].mean()

print("altitude_feet:")
print(calculate_altitude_accuracy(merged))

In [None]:
# All match columns we created
match_cols = [
    "common_name_match",
    "scientific_name_match",
    "author_match",
    "geographic_range_match",
    "specimen_location_match",
    "altitude_feet_match",
    "colors_listed_match"
]

# Accuracy per field per model
accuracy_summary = merged.groupby("model")[match_cols].mean()

# Rename columns to remove "_match" suffix for cleaner display
accuracy_summary.columns = [col.replace("_match", "") for col in accuracy_summary.columns]

# Add overall average across all fields
accuracy_summary["overall"] = accuracy_summary.mean(axis=1)

print(accuracy_summary.round(3))

In [None]:
model_order = [
    "gemini-2.5-flash-lite",
    "gemini-2.5-flash",
    "gemini-2.5-pro"
]

accuracy_summary = accuracy_summary.reindex(model_order)

In [None]:
styled = accuracy_summary.style \
    .bar(axis=None, cmap='RdYlGn', vmin=0, vmax=1) \
    .format("{:.1%}") \
    .set_properties(**{
        'text-align': 'left',
        'padding-left': '5px'
    })

styled

In [None]:
styled = accuracy_summary.style \
    .bar(axis=None, cmap='RdYlGn', vmin=0, vmax=1) \
    .format("{:.1%}") \
    .set_properties(**{
        'text-align': 'left',
        'padding-left': '5px',
        'color': 'white',
        'text-shadow': '1px 1px 2px black'
    })

styled

In [None]:
styled.to_html("gemini_accuracy_table.html")