In [32]:
"""
Author        : Aditya Jain
Date Started  : September 15, 2022
About         : Builds box plots of test accuracy, binned by number of training images
"""
import json
import os

import pandas as pd
import plotly.express as px

# Input file locations.
# Each path can be overridden with an environment variable so the notebook can
# run outside the original author's account without editing this cell; when the
# variable is unset the original hard-coded location is used unchanged.

# JSON file whose 'species' entry holds the per-species test accuracy.
test_accuracy_file = os.environ.get(
    'MOTH_TEST_ACCURACY_FILE',
    '/home/mila/a/aditya.jain/logs/uk-denmark-moth-model_v01_taxon-accuracy.json',
)

# JSON file whose 'species' entry holds the per-species training image count.
train_points_file = os.environ.get(
    'MOTH_TRAIN_POINTS_FILE',
    '/home/mila/a/aditya.jain/gbif_species_trainer/model_training/data/uk-denmark_count_training_points.json',
)

In [33]:
# Read both JSON inputs, keeping only what is stored under the top-level
# 'species' key of each file.
with open(test_accuracy_file, 'r') as acc_handle:
    test_acc_data = json.load(acc_handle)['species']

with open(train_points_file, 'r') as pts_handle:
    train_pts_data = json.load(pts_handle)['species']

In [34]:
# Exclusive upper edge of every bin below 500 training images, paired with the
# label used for that bin on the plot's x-axis. Ordered from smallest to largest
# so the first edge that exceeds a count identifies its bin.
_BIN_UPPER_EDGES = (
    (5, '0-4'),
    (10, '5-9'),
    (20, '10-19'),
    (50, '20-49'),
    (100, '50-99'),
    (200, '100-199'),
    (500, '200-499'),
)


def _training_bin(n_images):
    """Return the bin label for a species' number of training images."""
    for upper_edge, label in _BIN_UPPER_EDGES:
        if n_images < upper_edge:
            return label
    # The last closed bin includes its upper limit: exactly 1000 images is
    # still '500-1K', anything above goes to '1K+'.
    return '500-1K' if n_images <= 1000 else '1K+'


# Parallel lists: entry i of each list belongs to the same species.
test_acc_data_list = []
train_pts_data_list = []

for species in test_acc_data:
    # The first element of each species' accuracy entry is used as its
    # test accuracy.
    test_acc_data_list.append(test_acc_data[species][0])
    train_pts_data_list.append(_training_bin(train_pts_data[species]))

# Column names shared by the DataFrame, the plot axes and the annotations.
acc_col = 'Test Accuracy'
bin_col = 'Binned Number of Training Images'

# One row per species: its test accuracy and the bin of its training count.
df = pd.DataFrame(
    list(zip(test_acc_data_list, train_pts_data_list)),
    columns=[acc_col, bin_col],
)

# Box plot of accuracy per bin; category_orders fixes the x-axis to ascending
# bin size instead of the order in which bins first appear in the data.
fig = px.box(
    df,
    x=bin_col,
    y=acc_col,
    title="UK-Denmark Model Accuracy (EfficientNet-V2B3)",
    category_orders={
        bin_col: ["0-4", "5-9", "10-19", "20-49", "50-99", "100-199", "200-499", "500-1K", "1K+"],
    },
)

# Label every box with the number of rows (species) that fall in its bin,
# anchored at the bin's highest accuracy and shifted upward via yshift.
bins = df[bin_col].unique()
for bin_var in bins:
    rows_in_bin = df[df[bin_col] == bin_var]
    fig.add_annotation(
        x=bin_var,
        y=rows_in_bin[acc_col].max(),
        text=str(len(rows_in_bin)),
        yshift=10,
        showarrow=False,
    )

# Save the figure as a PNG in the current working directory.
fig.write_image("UK-Denmark_boxplot_binned_test_accuracy.png")

1053
