In [1]:
import pandas as pd

In [2]:
# Human-entered annotations exported from the Power App; used below as the
# reference ("_gt") side when scoring model predictions.
annotations_path = 'walcott_annotations_from_powerapp.csv'
annotations_df = pd.read_csv(annotations_path)
annotations_df.head()

Unnamed: 0,Title,bhl_pageid,scientific_name,common_name,author,geographic_range,specimen_location,altitude_feet,colors_listed,Created,Created By,[annotation notes]
0,,42667723,Prunus angustifolia,CHICKASAW PLUM,Marshall,"from Florida to southern New Jersey, and westw...","near Washington, District of Columbia",,"red, yellow",11/21/2025 1:57 PM,"Naples, Richard",
1,,42602674,Liriodendron tulipifera,Tuliptree,Linnaeus,eastern United States from Louisiana to Florid...,"Washington, District of Columbia",,"green, orange, gold",11/21/2025 2:03 PM,"Sobers, Kira",
2,,42667749,Zygadenus elegans,Deathcamas,Pursh,mountains of Nevada and New Mexico and northwa...,"Clearwater River, thirty-five miles by trail n...",6500.0,"greenish white, bright green",11/21/2025 2:05 PM,"Sobers, Kira",
3,,42669109,Tsuga mertensiana,Mountain Hemlock,(Bongard) Sargent,western Montana to California and Alaska,"near Glacier House, Glacier, British Columbia",3500.0,"dark green,",11/21/2025 2:06 PM,"Sobers, Kira",
4,,42602678,Dodecatheon meadia,Shootingstar,Linnaeus,"Texas to Georgia, and northward into Manitoba;...","near Washington, District of Columbia",,red,11/21/2025 2:08 PM,"Sobers, Kira",


In [3]:
# Inspect column dtypes and non-null counts of the annotations table.
annotations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               0 non-null      float64
 1   bhl_pageid          167 non-null    int64  
 2   scientific_name     167 non-null    object 
 3   common_name         167 non-null    object 
 4   author              167 non-null    object 
 5   geographic_range    166 non-null    object 
 6   specimen_location   159 non-null    object 
 7   altitude_feet       84 non-null     object 
 8   colors_listed       120 non-null    object 
 9   Created             167 non-null    object 
 10  Created By          167 non-null    object 
 11  [annotation notes]  16 non-null     object 
dtypes: float64(1), int64(1), object(10)
memory usage: 15.8+ KB


In [4]:
# Page ids are read as integers from the CSV, while prediction page ids come
# from file names (strings); cast so the two can be joined on equal keys.
annotations_df = annotations_df.astype({'bhl_pageid': str})

In [5]:
import json
from pathlib import Path

In [6]:
# Display name of each model -> directory holding its per-page JSON outputs.
model_dirs = {
    label: Path(folder)
    for label, folder in (
        ("Qwen3-0.6b", "qwen06"),
        ("Qwen3-1.7b", "qwen17"),
        ("Qwen3-4b", "Qwen3-4B-Instruct-2507"),
        ("Qwen3-8b", "qwen8b"),
        ("Qwen3-14b", "qwen14b"),
        ("Qwen3-32b", "qwen32b"),
    )
}

In [7]:
# Collect every model's per-page JSON prediction into one flat table.
# Each file contributes one row; the file stem is the BHL page id that is
# later used as the join key against the annotations.
records = []
for model_name, dir_path in model_dirs.items():
    # sorted(): Path.glob yields files in arbitrary, filesystem-dependent
    # order; sorting makes the row order reproducible across runs/machines.
    for json_file in sorted(dir_path.glob("*.json")):
        pageid = json_file.stem
        # JSON is UTF-8 by specification. Without an explicit encoding,
        # open() decodes with the platform locale, which can corrupt or
        # reject non-ASCII text on some systems.
        with open(json_file, encoding="utf-8") as f:
            data = json.load(f)
        data["bhl_pageid"] = pageid
        data["model"] = model_name
        records.append(data)
predictions_df = pd.DataFrame(records)
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   plate_number       2352 non-null   object
 1   common_name        2352 non-null   object
 2   scientific_name    2352 non-null   object
 3   author             2352 non-null   object
 4   altitude_feet      2352 non-null   object
 5   geographic_range   2352 non-null   object
 6   specimen_location  2352 non-null   object
 7   colors_listed      2352 non-null   object
 8   bhl_pageid         2352 non-null   object
 9   model              2352 non-null   object
dtypes: object(10)
memory usage: 183.9+ KB


In [8]:
# Inner-join predictions to annotations on the page id: only pages present in
# both tables survive. Columns that exist on both sides are disambiguated
# with "_pred" (model output) and "_gt" (annotation).
merged = pd.merge(
    predictions_df,
    annotations_df,
    how="inner",
    on="bhl_pageid",
    suffixes=("_pred", "_gt"),
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   plate_number            1002 non-null   object 
 1   common_name_pred        1002 non-null   object 
 2   scientific_name_pred    1002 non-null   object 
 3   author_pred             1002 non-null   object 
 4   altitude_feet_pred      1002 non-null   object 
 5   geographic_range_pred   1002 non-null   object 
 6   specimen_location_pred  1002 non-null   object 
 7   colors_listed_pred      1002 non-null   object 
 8   bhl_pageid              1002 non-null   object 
 9   model                   1002 non-null   object 
 10  Title                   0 non-null      float64
 11  scientific_name_gt      1002 non-null   object 
 12  common_name_gt          1002 non-null   object 
 13  author_gt               1002 non-null   object 
 14  geographic_range_gt     996 non-null    

In [9]:
def normalize(s):
    """Return a canonical lowercase form of ``s`` for exact-match comparison.

    Missing scalars (``None``, ``NaN``, ``pd.NA``) become ``""``. Any other
    value is converted with ``str()``, lowercased, stripped, and has every
    run of internal whitespace (spaces, tabs, newlines) collapsed to a single
    space, so purely typographic spacing differences do not count as
    mismatches.

    Non-scalar values (e.g. a list decoded from JSON) are stringified rather
    than passed to ``pd.isna``, whose array result cannot be used in ``if``.
    """
    # is_scalar guard: pd.isna(list) returns an array, and truth-testing an
    # array with more than one element raises ValueError.
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return ""
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so join() both strips and collapses.
    return " ".join(str(s).split()).lower()

In [10]:
# Exact-match check for common_name: normalize both sides, keep the
# normalized strings for later inspection, and flag rows where they agree.
merged["common_name_pred_norm"] = merged["common_name_pred"].map(normalize)
merged["common_name_gt_norm"] = merged["common_name_gt"].map(normalize)
merged["common_name_match"] = merged["common_name_pred_norm"].eq(merged["common_name_gt_norm"])

# Share of matching rows per model.
merged.groupby("model")["common_name_match"].mean()

model
Qwen3-0.6b    0.814371
Qwen3-1.7b    0.850299
Qwen3-14b     0.982036
Qwen3-32b     0.982036
Qwen3-4b      0.970060
Qwen3-8b      0.976048
Name: common_name_match, dtype: float64

In [11]:
# Rows whose normalized common names disagree, with just the columns needed
# to eyeball what the model produced versus the annotation.
mismatch_columns = ["bhl_pageid", "model", "common_name_pred_norm", "common_name_gt_norm"]
mismatches = merged.loc[~merged["common_name_match"], mismatch_columns]
mismatches.head(10)

Unnamed: 0,bhl_pageid,model,common_name_pred_norm,common_name_gt_norm
3,42669668,Qwen3-0.6b,fremontia,mexican fremontia
4,42667748,Qwen3-0.6b,amelanchier alnifolia,saskatoon
7,42669351,Qwen3-0.6b,salix drummondiana barratt,drummond willow
12,42602668,Qwen3-0.6b,viola pedata linnaeus,birdsfoot violet
17,42667744,Qwen3-0.6b,achillea lanulosa,western yarrow
18,42667713,Qwen3-0.6b,oxytropis splendens,showy oxytrope
19,42669127,Qwen3-0.6b,pyrola secunda,sidebells pyrola
23,42667801,Qwen3-0.6b,loyds strawberry-cactus,lloyds strawberry-cactus
31,42602673,Qwen3-0.6b,cracca virginiana,rabbitbean
34,42669674,Qwen3-0.6b,cyrtopodium,spotted cyrtopodium


In [12]:
def calculate_accuracy(merged_df, column):
    """Per-model exact-match rate for one field.

    Compares ``{column}_pred`` with ``{column}_gt`` after passing both through
    ``normalize``. The boolean result is stored on ``merged_df`` in place as
    ``{column}_match``; the return value is the mean of that flag grouped by
    ``model``.
    """
    flag_name = f"{column}_match"
    predicted = merged_df[f"{column}_pred"].map(normalize)
    expected = merged_df[f"{column}_gt"].map(normalize)

    # Written back onto the caller's frame so later cells can aggregate it.
    merged_df[flag_name] = predicted.eq(expected)
    return merged_df.groupby("model")[flag_name].mean()


In [13]:
# Exact-match accuracy for the short, name-like fields.
for field in ("common_name", "scientific_name", "author"):
    print(f"\n{field}:")
    field_accuracy = calculate_accuracy(merged, field)
    print(field_accuracy)


common_name:
model
Qwen3-0.6b    0.814371
Qwen3-1.7b    0.850299
Qwen3-14b     0.982036
Qwen3-32b     0.982036
Qwen3-4b      0.970060
Qwen3-8b      0.976048
Name: common_name_match, dtype: float64

scientific_name:
model
Qwen3-0.6b    0.251497
Qwen3-1.7b    0.502994
Qwen3-14b     0.994012
Qwen3-32b     0.994012
Qwen3-4b      0.976048
Qwen3-8b      0.928144
Name: scientific_name_match, dtype: float64

author:
model
Qwen3-0.6b    0.670659
Qwen3-1.7b    0.670659
Qwen3-14b     0.700599
Qwen3-32b     0.700599
Qwen3-4b      0.700599
Qwen3-8b      0.712575
Name: author_match, dtype: float64


In [14]:
import re

def tokenize(s):
    """Return the set of distinct lowercase words in ``s``.

    Missing values give an empty set. Every character that is neither a word
    character nor whitespace is deleted (not replaced by a space) before the
    text is split on whitespace, so "thirty-five" becomes the single token
    "thirtyfive". Word order and repetition are discarded.
    """
    if pd.isna(s):
        return set()
    without_punctuation = re.sub(r'[^\w\s]', '', str(s).lower())
    return set(without_punctuation.split())

def word_difference(s1, s2):
    """Number of distinct words that appear in exactly one of the two strings.

    Both inputs are reduced to word sets with ``tokenize``; the result is the
    size of their symmetric difference.
    """
    return len(tokenize(s1) ^ tokenize(s2))

def calculate_fuzzy_accuracy(merged_df, column, max_diff=2):
    """Per-model match rate tolerating up to ``max_diff`` differing words.

    For every row, ``word_difference`` is computed between ``{column}_pred``
    and ``{column}_gt`` and stored on ``merged_df`` in place as
    ``{column}_diff``. A row counts as a match (``{column}_match``) when that
    count is at most ``max_diff``. Returns the mean of the match flag grouped
    by ``model``.

    Note: the threshold is absolute, not relative to length, so a value of at
    most ``max_diff`` words is counted as matching an empty/missing value.
    """
    diff_name = f"{column}_diff"
    flag_name = f"{column}_match"

    predicted_values = merged_df[f"{column}_pred"]
    reference_values = merged_df[f"{column}_gt"]
    merged_df[diff_name] = [
        word_difference(predicted, reference)
        for predicted, reference in zip(predicted_values, reference_values)
    ]
    merged_df[flag_name] = merged_df[diff_name].le(max_diff)

    return merged_df.groupby("model")[flag_name].mean()

# Fuzzy accuracy for the free-text location fields.
for field in ("geographic_range", "specimen_location"):
    print(f"\n{field} (max 2 words different):")
    field_accuracy = calculate_fuzzy_accuracy(merged, field, max_diff=2)
    print(field_accuracy)


geographic_range (max 2 words different):
model
Qwen3-0.6b    0.754491
Qwen3-1.7b    0.772455
Qwen3-14b     0.850299
Qwen3-32b     0.808383
Qwen3-4b      0.814371
Qwen3-8b      0.814371
Name: geographic_range_match, dtype: float64

specimen_location (max 2 words different):
model
Qwen3-0.6b    0.556886
Qwen3-1.7b    0.826347
Qwen3-14b     0.910180
Qwen3-32b     0.904192
Qwen3-4b      0.892216
Qwen3-8b      0.898204
Name: specimen_location_match, dtype: float64


In [15]:
# How many rows have each word-difference count for geographic_range,
# ordered by the count itself; only the ten smallest difference values are shown.
merged["geographic_range_diff"].value_counts().sort_index().head(10)

geographic_range_diff
0    395
1    318
2     91
3     48
4     15
5      7
6     31
7     11
8     13
9      6
Name: count, dtype: int64

In [16]:
def parse_colors(s):
    """Turn a comma-separated color string into a set of normalized names.

    Each piece is stripped and lowercased; empty pieces (e.g. from a trailing
    comma) are dropped. Missing or blank input yields an empty set.
    """
    text = "" if pd.isna(s) else str(s)
    pieces = {piece.strip().lower() for piece in text.split(",")}
    # Blank input and stray commas both surface as the empty string here.
    return pieces - {""}

def calculate_set_accuracy(merged_df, column):
    """Per-model rate at which predicted and annotated value sets are equal.

    ``{column}_pred`` and ``{column}_gt`` are each parsed with
    ``parse_colors``; the parsed sets are stored on ``merged_df`` in place as
    ``{column}_pred_set`` / ``{column}_gt_set`` and their equality as
    ``{column}_match``. Returns the mean of the match flag grouped by
    ``model``.
    """
    pred_set_name = f"{column}_pred_set"
    gt_set_name = f"{column}_gt_set"
    flag_name = f"{column}_match"

    merged_df[pred_set_name] = merged_df[f"{column}_pred"].map(parse_colors)
    merged_df[gt_set_name] = merged_df[f"{column}_gt"].map(parse_colors)
    # Element-wise set equality: order and duplicates are irrelevant.
    merged_df[flag_name] = merged_df[pred_set_name].eq(merged_df[gt_set_name])

    return merged_df.groupby("model")[flag_name].mean()

print("colors_listed (exact set match):")
colors_accuracy = calculate_set_accuracy(merged, "colors_listed")
print(colors_accuracy)

colors_listed (exact set match):
model
Qwen3-0.6b    0.179641
Qwen3-1.7b    0.335329
Qwen3-14b     0.514970
Qwen3-32b     0.598802
Qwen3-4b      0.479042
Qwen3-8b      0.502994
Name: colors_listed_match, dtype: float64


In [17]:
def normalize_altitude(s):
    """Return a canonical string for an altitude value, for exact comparison.

    Missing or blank input gives ``""``. Otherwise the value is stringified,
    lowercased, and stripped of thousands separators (``,``), the word
    ``feet`` and surrounding whitespace. If what remains parses as a number
    with an integral value, it is rendered without a fractional part, so that
    ``6500``, ``6500.0``, ``"6500.0"`` and ``"6,500 feet"`` all normalize to
    ``"6500"``. Anything non-numeric (e.g. a range like ``"5000-6000"``) is
    returned as cleaned text.
    """
    if pd.isna(s) or str(s).strip() == "":
        return ""
    s = str(s).lower()
    s = s.replace(",", "")      # Remove thousands separators
    s = s.replace("feet", "")   # Remove the unit word
    s = s.strip()
    # A numeric column containing NaNs is float-typed by pandas, so a value
    # can arrive as 6500.0 and stringify to "6500.0"; canonicalize it so it
    # compares equal to a prediction of "6500".
    try:
        value = float(s)
    except ValueError:
        return s
    # is_integer() is False for nan/inf, so those fall through unchanged.
    if value.is_integer():
        return str(int(value))
    return s

def calculate_altitude_accuracy(merged_df):
    """Per-model exact-match rate for ``altitude_feet``.

    Both the predicted and annotated altitude are passed through
    ``normalize_altitude`` and stored on ``merged_df`` in place as
    ``altitude_feet_pred_norm`` / ``altitude_feet_gt_norm``; their equality is
    stored as ``altitude_feet_match``. Returns the mean of that flag grouped
    by ``model``.
    """
    for side in ("pred", "gt"):
        merged_df[f"altitude_feet_{side}_norm"] = merged_df[f"altitude_feet_{side}"].map(normalize_altitude)

    merged_df["altitude_feet_match"] = merged_df["altitude_feet_pred_norm"].eq(
        merged_df["altitude_feet_gt_norm"]
    )
    return merged_df.groupby("model")["altitude_feet_match"].mean()

print("altitude_feet:")
altitude_accuracy = calculate_altitude_accuracy(merged)
print(altitude_accuracy)

altitude_feet:
model
Qwen3-0.6b    0.568862
Qwen3-1.7b    0.976048
Qwen3-14b     0.922156
Qwen3-32b     0.970060
Qwen3-4b      0.976048
Qwen3-8b      0.982036
Name: altitude_feet_match, dtype: float64


In [18]:
# One "<field>_match" flag per scored field, in display order.
match_cols = [
    f"{field}_match"
    for field in (
        "common_name",
        "scientific_name",
        "author",
        "geographic_range",
        "specimen_location",
        "altitude_feet",
        "colors_listed",
    )
]

# Per-model accuracy for every field, with the "_match" suffix dropped from
# the column labels and an unweighted mean across fields appended as "overall".
accuracy_summary = (
    merged.groupby("model")[match_cols]
    .mean()
    .rename(columns=lambda label: label.replace("_match", ""))
    .assign(overall=lambda table: table.mean(axis=1))
)

print(accuracy_summary.round(3))

            common_name  scientific_name  author  geographic_range  \
model                                                                
Qwen3-0.6b        0.814            0.251   0.671             0.754   
Qwen3-1.7b        0.850            0.503   0.671             0.772   
Qwen3-14b         0.982            0.994   0.701             0.850   
Qwen3-32b         0.982            0.994   0.701             0.808   
Qwen3-4b          0.970            0.976   0.701             0.814   
Qwen3-8b          0.976            0.928   0.713             0.814   

            specimen_location  altitude_feet  colors_listed  overall  
model                                                                 
Qwen3-0.6b              0.557          0.569          0.180    0.542  
Qwen3-1.7b              0.826          0.976          0.335    0.705  
Qwen3-14b               0.910          0.922          0.515    0.839  
Qwen3-32b               0.904          0.970          0.599    0.851  
Qwen3-4b     

In [19]:
# groupby sorts the model labels as text, which places "14b" and "32b" before
# "4b" and "8b"; reorder the rows by model size instead.
model_order = ["Qwen3-0.6b", "Qwen3-1.7b", "Qwen3-4b", "Qwen3-8b", "Qwen3-14b", "Qwen3-32b"]

accuracy_summary = accuracy_summary.reindex(index=model_order)

In [20]:
# Accuracy table with in-cell bars on a fixed 0-1 scale and values shown as
# percentages with one decimal.
cell_css = {
    'text-align': 'left',
    'padding-left': '5px',
}
styled = (
    accuracy_summary.style
    .bar(axis=None, cmap='RdYlGn', vmin=0, vmax=1)
    .format("{:.1%}")
    .set_properties(**cell_css)
)

styled

Unnamed: 0_level_0,common_name,scientific_name,author,geographic_range,specimen_location,altitude_feet,colors_listed,overall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Qwen3-0.6b,81.4%,25.1%,67.1%,75.4%,55.7%,56.9%,18.0%,54.2%
Qwen3-1.7b,85.0%,50.3%,67.1%,77.2%,82.6%,97.6%,33.5%,70.5%
Qwen3-4b,97.0%,97.6%,70.1%,81.4%,89.2%,97.6%,47.9%,83.0%
Qwen3-8b,97.6%,92.8%,71.3%,81.4%,89.8%,98.2%,50.3%,83.1%
Qwen3-14b,98.2%,99.4%,70.1%,85.0%,91.0%,92.2%,51.5%,83.9%
Qwen3-32b,98.2%,99.4%,70.1%,80.8%,90.4%,97.0%,59.9%,85.1%


In [21]:
# Accuracy table with in-cell bars on a fixed 0-1 scale, percentages with one
# decimal, and white text with a dark shadow drawn over the bars.
cell_css = {
    'text-align': 'left',
    'padding-left': '5px',
    'color': 'white',
    'text-shadow': '1px 1px 2px black',
}
styled = (
    accuracy_summary.style
    .bar(axis=None, cmap='RdYlGn', vmin=0, vmax=1)
    .format("{:.1%}")
    .set_properties(**cell_css)
)

styled

Unnamed: 0_level_0,common_name,scientific_name,author,geographic_range,specimen_location,altitude_feet,colors_listed,overall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Qwen3-0.6b,81.4%,25.1%,67.1%,75.4%,55.7%,56.9%,18.0%,54.2%
Qwen3-1.7b,85.0%,50.3%,67.1%,77.2%,82.6%,97.6%,33.5%,70.5%
Qwen3-4b,97.0%,97.6%,70.1%,81.4%,89.2%,97.6%,47.9%,83.0%
Qwen3-8b,97.6%,92.8%,71.3%,81.4%,89.8%,98.2%,50.3%,83.1%
Qwen3-14b,98.2%,99.4%,70.1%,85.0%,91.0%,92.2%,51.5%,83.9%
Qwen3-32b,98.2%,99.4%,70.1%,80.8%,90.4%,97.0%,59.9%,85.1%


In [22]:
# Write the styled accuracy table to an HTML file in the working directory.
styled.to_html("qwen3_accuracy_table.html")