This notebook parses Zooniverse classification results and compares them to the "ground truth".

In [1]:
# original file from Zooniverse -- anonymized version will be made and saved in "resources"
# (this export includes a 'user_name' column, which is replaced by anonymous labels further down)
# NOTE(review): the save cell at the end writes './cleaned_zooniverse.csv', not a "resources" folder -- confirm the intended location
zoonie_file = '/Users/jnaiman/Dropbox/Paper_JCDL2025/zooniverse/first-workflow-for-histograms-classifications.csv'

# Zooniverse subject export; used below for subject-set membership and for each subject's image file name
zoonie_subjects = '/Users/jnaiman/Dropbox/Paper_JCDL2025/zooniverse/jcdl-et-al-subjects.csv'

# where ground truth jsons are
# (used as a string prefix in front of '<image name>.json', so the trailing '/' is required)
gt_jsons = '../example_hists/jsons/'

In [2]:
import pandas as pd
import json
import numpy as np

## Subject set

Select the subject set whose classifications are analyzed below.

In [3]:
# subject export; columns used below: 'subject_id', 'subject_set_id' and 'metadata' (a JSON string)
dfsub = pd.read_csv(zoonie_subjects)

In [4]:
# get subject IDs from the right subject set
sub_id_choose = '130875'

subject_ids = dfsub[dfsub['subject_set_id'].astype('str')==sub_id_choose]['subject_id'].unique()
subject_ids[:5]

array([112268234, 112268235, 112268236, 112268237, 112268238])

## Clean & anonymize data

Keep only the classifications for these subject IDs and replace user names with anonymous annotator labels.

In [5]:
# load every classification, then keep only those made on the selected subjects
df_in = pd.read_csv(zoonie_file)

on_selected_subject = df_in['subject_ids'].isin(subject_ids)
df = df_in.loc[on_selected_subject]

In [6]:
# --- anonymize annotators ---
# Number the distinct user names 1..N in order of first appearance and replace
# every name by the label 'annotator<k>'.
user_names = df['user_name'].unique()
nAnn = np.arange(1, len(user_names) + 1)
annotator_label = {name: 'annotator' + str(k) for name, k in zip(user_names, nAnn)}
annotators = df['user_name'].map(annotator_label).tolist()

# keep only the columns needed downstream; the user names themselves are not carried over
dfout = df[['metadata', 'annotations', 'subject_ids']].copy()
dfout['annotators'] = annotators

# --- also, get the image metadata file name for ground truth ---
# Resolve each distinct subject once (many classifications share a subject),
# then map the result back onto the classification rows.
filename_by_subject = {}
for subject_id in dfout['subject_ids'].unique():
    subject_meta = dfsub.loc[dfsub['subject_id'] == subject_id, 'metadata']
    # a subject must match exactly one row of the subject export, otherwise
    # the image file name would be ambiguous (or unknown)
    if len(subject_meta) != 1:
        raise ValueError(
            f'wrong size! expected exactly one row for subject {subject_id} '
            f'in the subjects file, found {len(subject_meta)}'
        )
    # the 'metadata' column holds a JSON string; 'Filename' is the image file name
    filename_by_subject[subject_id] = json.loads(subject_meta.iloc[0])['Filename']

image_id = dfout['subject_ids'].map(filename_by_subject).tolist()

dfout['image file'] = image_id

In [7]:
#dfout.head()

In [8]:
# parse tasks to human lists
# One value per classification (row of dfout), in row order, for each task:
#   - number of Gaussians, as answered
#   - median typed in as a number
#   - x position of the drawn median line (in image pixels; converted below)
task_names = ('nGaussians', 'median (number)', 'median (draw)')
ann_nGaussians = []
ann_medNum = []
ann_medDraw = []
for row_pos, annotations_json in enumerate(dfout['annotations']):
    answers = {}
    for ann in json.loads(annotations_json):
        label = ann['task_label'].lower()
        if 'how many gauss' in label:
            answers['nGaussians'] = int(ann['value'])
        elif ('median' in label) and ('as a number' in label):
            answers['median (number)'] = float(ann['value'])
        elif ('median' in label) and ('draw a line' in label):
            drawn = ann['value']
            if drawn:  # an empty value means no line was drawn
                answers['median (draw)'] = float(drawn[0]['x'])

    # Every row must contribute exactly one value to each list; otherwise the
    # lists would no longer line up with the rows of dfout.
    missing = [task for task in task_names if task not in answers]
    if missing:
        raise ValueError(f'classification at row position {row_pos} has no answer for: {missing}')

    ann_nGaussians.append(answers['nGaussians'])
    ann_medNum.append(answers['median (number)'])
    ann_medDraw.append(answers['median (draw)'])


df2 = dfout[['annotators','image file']].copy()
df2['Human nGaussians'] = ann_nGaussians
df2['Human median (number)'] = ann_medNum
# the drawn median is added further down, after conversion from pixels to data units

# get ground truths from jsons
# Several classifications refer to the same image, so each ground-truth file is
# read once and the derived quantities are cached per image file:
#   image file -> (median, nclusters, pixel min, pixel max, data min, data max)
ground_truth = {}
ann_medDraw_convert = []
gt_medians = []
gt_nGaussians = []
for row_pos, image_file in enumerate(df2['image file']):
    if image_file not in ground_truth:
        jfile = image_file.removesuffix('.jpeg')

        with open(gt_jsons + jfile + '.json', 'r') as f:
            j = json.load(f)
        # the files are decoded twice in the original workflow, i.e. they hold a
        # JSON document stored as a JSON string; a plain JSON object is accepted too
        if isinstance(j, str):
            j = json.loads(j)

        # assume single plot
        plot = j['plot0']
        xs = plot['data']['xs']
        xs_pixels = plot['data pixels']['xs'] # bar centers in pixels
        edges = np.array(plot['data from plot']['data'][1]) # [0,1] are height, edges output from histogram
        centers = edges[:-1] + 0.5*np.abs(edges[:-1] - edges[1:])

        ground_truth[image_file] = (
            np.median(xs),                               # median from distribution
            plot['data']['data params']['nclusters'],    # nGaussian from data params
            np.min(xs_pixels), np.max(xs_pixels),
            np.min(centers), np.max(centers),
        )

    gt_median, gt_nclusters, px_min, px_max, data_min, data_max = ground_truth[image_file]
    gt_medians.append(gt_median)
    gt_nGaussians.append(gt_nclusters)

    # for human draw, translate from pixels to data units: linear map that sends
    # the pixel range of the bar centers onto the data range of the bar centers
    med = (ann_medDraw[row_pos] - px_min)/(px_max-px_min)*(data_max - data_min) + data_min
    ann_medDraw_convert.append(med)

df2['Human median (draw)'] = ann_medDraw_convert
df2['Ground Truth nGaussians'] = gt_nGaussians
df2['Ground Truth Median'] = gt_medians

In [9]:
# preview the first rows: human answers next to the ground-truth values
df2.head()

Unnamed: 0,annotators,image file,Human nGaussians,Human median (number),Human median (draw),Ground Truth nGaussians,Ground Truth Median
1,annotator1,id_0041.jpeg,1,0.055,0.055269,1,0.054428
2,annotator1,id_0014.jpeg,3,0.45,0.457472,1,0.478811
3,annotator1,id_0028.jpeg,2,-0.42,-0.415863,2,-0.417302
4,annotator1,id_0057.jpeg,2,-0.7,-0.707283,2,-0.758364
5,annotator1,id_0020.jpeg,1,-0.2,-0.171428,1,-0.178007


In [10]:
# save to file (index=False keeps the DataFrame index out of the CSV)
df2.to_csv('./cleaned_zooniverse.csv', index=False)