In [2]:
import json
from functools import cmp_to_key

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the Zooniverse 'classify' classifications export into a DataFrame.
df = pd.read_csv(
    './zooniverse_exports/classify-classifications.csv',
    sep=",",
)

In [4]:
# List the column labels available in the export.
print(df.columns)

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')


In [5]:
# The 'subject_data' column; the helper functions defined below decode each
# entry with json.loads.
subject_data = df['subject_data']

In [23]:
def extract_subject_keys(data):
    '''
    Collect every metadata field name used by any subject.
    Input: data, an iterable (list, pandas Series, generator, ...) of JSON strings,
           each decoding to a mapping of subject id -> dict of metadata fields
    Returns: set of all field names found across all subjects
    '''
    all_keys = set()

    # Iterate over the values themselves instead of data[i]: integer lookup on a
    # pandas Series is label-based, so it fails once the index is filtered or
    # otherwise not 0..n-1.
    for datum in data:
        for fields in json.loads(datum).values():
            all_keys.update(fields.keys())

    return all_keys

def extract_subject_names(data):
    '''
    Extract the image filename recorded for each subject.
    Input: data, an iterable of JSON strings, each decoding to a mapping of
           subject id -> dict of metadata fields
    Returns: list of filenames, one per subject that has a non-empty
             'Filename', 'IMAGE' or 'image' field (duplicates are kept)
    Prints: how many subjects carry each of the three fields, then the number
            of subjects for which no filename was found
    '''
    # Candidate fields in increasing priority: when a subject carries more than
    # one of them, the last one listed here supplies the filename.
    filename_fields = ("Filename", "IMAGE", "image")

    subject_filenames = []
    field_counts = {field: 0 for field in filename_fields}
    without_filename = []   # ids of subjects with no usable filename field

    for datum in data:
        for subject_id, fields in json.loads(datum).items():
            filename = None
            for field in filename_fields:
                if field in fields:
                    filename = fields[field]
                    field_counts[field] += 1

            if filename:
                subject_filenames.append(filename)
            else:
                # Record the subject so the count printed below reflects the
                # subjects that actually lack a filename.
                without_filename.append(subject_id)

    print(field_counts)
    print(len(without_filename))
    return subject_filenames

def split_name(name):
    '''
    Pull the number designation off the end of a UDG candidate name string
    Input: name, e.g. "UDGcand_1261"
    Returns: the text after the last underscore as an int, e.g. 1261
    '''

    _, _, designation = name.rpartition('_')
    return int(designation)

def compare(a, b):
    '''
    Comparator that orders UDG candidate names by ascending number designation
    Input: params a, b as strings, e.g. "UDGcand_1303", "UDGcand_1311"
    Returns: split_name(a) - split_name(b), i.e. negative when a sorts first,
             zero on a tie, positive when b sorts first
    '''

    return split_name(a) - split_name(b)

In [14]:
# Union of the metadata field names used by any subject in subject_data.
unique_subject_keys = extract_subject_keys(subject_data)
print(unique_subject_keys)

{'DEC', 'R', 'Filename', 'Image_id', 'image_cross', 'image', 'G-I', 'ID', 'IMAGE', 'RA', 'retired'}


In [24]:
# Collapse the extracted filenames to the set of distinct ones; the call also
# prints the per-field counts and the number of subjects without a filename.
unique_names = set(extract_subject_names(subject_data))

{'Filename': 6295, 'IMAGE': 4259, 'image': 93154}
0


In [28]:
# Display the distinct filenames. unique_names is a set, so the order shown is arbitrary.
# NOTE(review): this shows every name; consider previewing a sorted slice instead.
list(unique_names)

['UDGcand_1541_insp.png',
 'UDGcand_5603_insp.png',
 'UDGcand_6077_insp.png',
 'UDGcand_5840_insp.png',
 'UDGcand_5600_insp.png',
 'UDGcand_1156_insp.png',
 'UDGcand_6215_insp.png',
 'UDGcand_5551_insp.png',
 'UDGcand_866_insp.png',
 'UDGcand_1917_insp.png',
 'UDGcand_6896_insp.png',
 'UDGcand_2143_insp.png',
 'UDGcand_6106_insp.png',
 'UDGcand_3839_insp.png',
 'UDGcand_4802_insp.png',
 'UDGcand_2357_insp.png',
 'UDGcand_3795_insp.png',
 'UDGcand_2453_insp.png',
 'UDGcand_6018_insp.png',
 'UDGcand_6594_insp.png',
 'UDGcand_757_insp.png',
 'UDGcand_6943_insp.png',
 'UDGcand_3794_insp.png',
 'UDGcand_1664_insp.png',
 'UDGcand_983_insp.png',
 'UDGcand_934_insp.png',
 'UDGcand_7083_insp.png',
 'UDGcand_4309_insp.png',
 'UDGcand_1093_insp.png',
 'UDGcand_1292_insp.png',
 'UDGcand_4674_insp.png',
 'UDGcand_1550_insp.png',
 'UDGcand_4734_insp.png',
 'UDGcand_1464_insp.png',
 'UDGcand_3313_insp.png',
 'UDGcand_5167_insp.png',
 'UDGcand_4748_insp.png',
 'UDGcand_2355_insp.png',
 'UDGcand_1553_i

In [10]:
# Strip the trailing '_insp.png' from each image filename to get the candidate
# name, e.g. 'UDGcand_1541_insp.png' -> 'UDGcand_1541'. Names that do not end in
# the suffix are kept whole rather than having their last characters chopped off.
INSPECTION_SUFFIX = '_insp.png'

parsed_names = [
    name[:-len(INSPECTION_SUFFIX)] if name.endswith(INSPECTION_SUFFIX) else name
    for name in unique_names
]

In [11]:
# Sort the candidate names by ascending number designation, for presentation purposes.
# (cmp_to_key is imported in the notebook's import cell.)

sorted_parsed_names = sorted(parsed_names, key=cmp_to_key(compare))

In [30]:
# Save the sorted list of unique candidate names, and the raw image filename strings, to .txt files.
#   Uncomment the two lines below to re-save.
# np.savetxt('sf_candidate_names__classification-classify.txt', sorted_parsed_names, delimiter=',', fmt="%s")
# np.savetxt('sf_objectImageStrings__classification-classify.txt', list(unique_names), delimiter=',', fmt="%s")