This notebook parses Zooniverse classification results and compares the human answers to the "ground truth" values.

In [55]:
# Raw Zooniverse exports (classifications, subjects). The classifications file is
# the original one -- an anonymized version will be made and saved in "resources".
zoonie_file, zoonie_subjects = (
    '/Users/jnaiman/Downloads/tmp/JCDL2025/zooniverse/first-workflow-for-histograms-classifications.csv',
    '/Users/jnaiman/Downloads/tmp/JCDL2025/zooniverse/jcdl-et-al-subjects.csv',
)

# directory holding the ground-truth JSON files
gt_jsons = './example_hists/jsons/'

In [19]:
import pandas as pd
import json
import numpy as np

## Subject set

Select the subject set used for this analysis and collect its subject IDs.

In [9]:
# Zooniverse subjects export; the columns used in this notebook are
# 'subject_set_id', 'subject_id' and 'metadata'
dfsub = pd.read_csv(zoonie_subjects)

In [12]:
# ID of the subject set to analyze, kept as a string so it can be compared
# against the stringified 'subject_set_id' column
sub_id_choose = '130875'

# unique subject IDs that belong to that subject set
in_chosen_set = dfsub['subject_set_id'].astype('str') == sub_id_choose
subject_ids = dfsub.loc[in_chosen_set, 'subject_id'].unique()
subject_ids[:5]

array([112268234, 112268235, 112268236, 112268237, 112268238])

## Clean & anonymize data

Get data for just these subject IDs, anonymize annotators.

In [14]:
# full classifications export
df_in = pd.read_csv(zoonie_file)

# keep only classifications made on subjects from the chosen subject set
in_subject_set = df_in['subject_ids'].isin(subject_ids)
df = df_in.loc[in_subject_set]

In [39]:
# --- Anonymize annotators ---------------------------------------------------
# Each distinct user name gets a label "annotator<N>", numbered from 1 in order
# of first appearance in the filtered classifications.
user_names = df['user_name'].unique()
nAnn = np.arange(1, len(user_names) + 1)
annotator_label = {un: 'annotator' + str(n) for un, n in zip(user_names, nAnn)}
annotators = [annotator_label[un] for un in df['user_name']]

# Keep only the columns needed downstream; 'user_name' is deliberately left out.
dfout = df[['metadata', 'annotations', 'subject_ids']].copy()
dfout['annotators'] = annotators

# --- Image file name for each classification --------------------------------
# The subject's 'metadata' column is a JSON string whose 'Filename' entry names
# the image; it is used later to find the matching ground-truth JSON.
# Each subject is resolved once, then mapped back onto the rows.
filename_by_subject = {}
for subject_id in dfout['subject_ids'].unique():
    meta = dfsub.loc[dfsub['subject_id'] == subject_id, 'metadata']
    if len(meta) != 1:
        # fail loudly with the offending ID instead of exiting the interpreter
        raise ValueError(
            'expected exactly one row in the subjects table for subject_id '
            + str(subject_id) + ', found ' + str(len(meta))
        )
    filename_by_subject[subject_id] = json.loads(meta.iloc[0])['Filename']

image_id = [filename_by_subject[subject_id] for subject_id in dfout['subject_ids']]
dfout['image file'] = image_id

In [40]:
# preview the anonymized classifications (user names replaced by 'annotators' labels)
dfout.head()

Unnamed: 0,metadata,annotations,subject_ids,annotators,image file
1,"{""source"":""api"",""session"":""bc9ecd19a25e8303652...","[{""task"":""T0"",""value"":""1"",""task_label"":""How ma...",112268275,annotator1,id_0041.jpeg
2,"{""source"":""api"",""session"":""bc9ecd19a25e8303652...","[{""task"":""T0"",""value"":""3"",""task_label"":""How ma...",112268248,annotator1,id_0014.jpeg
3,"{""source"":""api"",""session"":""bc9ecd19a25e8303652...","[{""task"":""T0"",""value"":""2"",""task_label"":""How ma...",112268262,annotator1,id_0028.jpeg
4,"{""source"":""api"",""session"":""bc9ecd19a25e8303652...","[{""task"":""T0"",""value"":""2"",""task_label"":""How ma...",112268291,annotator1,id_0057.jpeg
5,"{""source"":""api"",""session"":""bc9ecd19a25e8303652...","[{""task"":""T0"",""value"":""1"",""task_label"":""How ma...",112268254,annotator1,id_0020.jpeg


In [None]:
def _parse_annotations(annotations_json):
    """Extract the three human answers from one classification's annotations.

    Parameters
    ----------
    annotations_json : str
        JSON-encoded list of task dicts; each is read for its 'task_label'
        and 'value' entries.

    Returns
    -------
    tuple
        (number of Gaussians, median typed as a number, x of the drawn median
        line). An entry is np.nan when its task is absent from the
        classification or its answer is empty / cannot be converted.
    """
    n_gauss, med_num, med_draw = np.nan, np.nan, np.nan
    for ann in json.loads(annotations_json):
        label = (ann.get('task_label') or '').lower()
        value = ann.get('value')
        try:
            if 'how many gauss' in label:
                n_gauss = int(value)
            elif ('median' in label) and ('as a number' in label):
                med_num = float(value)
            elif ('median' in label) and ('draw a line' in label):
                # drawing task: value is a list of drawn marks, the first one is used
                med_draw = float(value[0]['x'])
        except (TypeError, ValueError, IndexError, KeyError):
            # skipped or malformed answer: keep NaN so the row stays aligned
            pass
    return n_gauss, med_num, med_draw


# parse tasks to human lists -- exactly one entry per classification row, so the
# lists always line up with dfout even when an annotator skipped a task
ann_nGaussians = []
ann_medNum = []
ann_medDraw = []
for annotations_json in dfout['annotations']:
    n_gauss, med_num, med_draw = _parse_annotations(annotations_json)
    ann_nGaussians.append(n_gauss)
    ann_medNum.append(med_num)
    ann_medDraw.append(med_draw)

# TODO: for human line draw -- need to translate this from pixel location to x/y;
# until then 'Human median (draw)' is NOT in the same units as the other medians

df2 = dfout[['annotators', 'image file']].copy()
df2['Human nGaussians'] = ann_nGaussians
df2['Human median (number)'] = ann_medNum
df2['Human median (draw)'] = ann_medDraw

# report rows where at least one answer could not be read
n_incomplete = int(
    df2[['Human nGaussians', 'Human median (number)', 'Human median (draw)']]
    .isna().any(axis=1).sum()
)
if n_incomplete:
    print(str(n_incomplete) + ' classification(s) have a missing or unparseable answer (stored as NaN)')

# get ground truths from jsons
# Several annotators classify the same image, so each JSON is read only once.
gt_cache = {}
gt_medians = []
gt_nGaussians = []
for image_file in df2['image file']:
    jfile = image_file.removesuffix('.jpeg')

    if jfile not in gt_cache:
        with open(gt_jsons + jfile + '.json', 'r') as f:
            j = json.load(f)
        # the files hold a JSON-encoded string, so a second decode is needed;
        # the check also lets a plainly-encoded file load
        if isinstance(j, str):
            j = json.loads(j)
        gt_cache[jfile] = j
    # NOTE: `j` is left bound to the last row's JSON; later cells inspect it
    j = gt_cache[jfile]

    # assume single plot
    data = j['plot0']['data']
    # median from distribution
    gt_medians.append(np.median(data['xs']))
    # nGaussian from data params
    gt_nGaussians.append(data['data params']['nclusters'])

df2['Ground Truth nGaussian'] = gt_nGaussians
df2['Ground Truth Median'] = gt_medians

In [80]:
# preview human answers next to the ground-truth values
df2.head()

Unnamed: 0,annotators,image file,Human nGaussians,Human median (number),Human median (draw),Ground Truth nGaussian,Ground Truth Median
1,annotator1,id_0041.jpeg,1,0.055,501.769531,1,0.054428
2,annotator1,id_0014.jpeg,3,0.45,380.515625,1,0.478811
3,annotator1,id_0028.jpeg,2,-0.42,458.597656,2,-0.417302
4,annotator1,id_0057.jpeg,2,-0.7,389.261719,2,-0.758364
5,annotator1,id_0020.jpeg,1,-0.2,463.078125,1,-0.178007


In [77]:
# NOTE: `j` is the ground-truth JSON left over from the last iteration of the
# loop that fills gt_medians / gt_nGaussians; this cell only inspects it
j['plot0']['data']['data params']['nclusters']

1

In [82]:
# keys stored under 'data pixels' for plot0
j['plot0']['data pixels'].keys()

dict_keys(['xs', 'ys', 'xs_right', 'ys_right', 'xs_left', 'ys_left', 'colors'])

In [85]:
# keys stored under 'data from plot' for plot0
j['plot0']['data from plot'].keys()

dict_keys(['data', 'plot params'])

In [87]:
# plotting parameters recorded for plot0 (bar color, orientation, nbins, ...)
j['plot0']['data from plot']['plot params']

{'linethick': 1,
 'linestyles': '-',
 'bar color': 'cyan',
 'edge color': [[0, 0, 0]],
 'orientation': 'vertical',
 'rwidth': 1.0,
 'nbins': 20}

In [88]:
# top-level sections of a ground-truth JSON
j.keys()

dict_keys(['figure', 'plot0', 'VQA'])

In [89]:
# figure-level settings (dpi, aspect ratio, grid layout, font sizes, ...)
j['figure']

{'dpi': 300,
 'base': 5,
 'aspect ratio': 1.333,
 'nrows': 1,
 'ncols': 1,
 'plot style': 'default',
 'color map': 'Blues',
 'title fontsize': 12,
 'xlabel fontsize': 8,
 'ylabel fontsize': 8,
 'plot indexes': [[0, 0]]}

In [93]:
# everything recorded for the single plot in this figure
j['plot0'].keys()

dict_keys(['type', 'distribution', 'data', 'data pixels', 'data from plot', 'xticks', 'yticks', 'square', 'title', 'xlabel', 'ylabel'])

In [94]:
# bounding box recorded for plot0 -- presumably the axes area in image pixels
# (TODO confirm); likely needed to convert the drawn-line pixel x into data x
j['plot0']['square']

{'xmin': 192.6404517796116,
 'ymin': 154.83333333333334,
 'xmax': 754.0450112528131,
 'ymax': 492.99999999999994}