In [1]:
import json
from functools import cmp_to_key

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# load the classifications export (CSV) into a DataFrame
df = pd.read_csv('../SpaceFluff/zooniverse_exports/classify-classifications.csv', sep=",")

In [3]:
# list the column names available in the export
print(df.columns)

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')


In [4]:
# pull out the 'subject_data' column; the helpers below decode each entry with json.loads
subject_data = df.loc[:, 'subject_data']

In [5]:
def extract_subject_keys(data):
    '''
    Collect every metadata field name used in the subject records.
    Input: data, an iterable of JSON strings (e.g. the 'subject_data' column); each string
        decodes to a dict whose values (presumably one per subject id) are dicts of fields
    Returns: set of all field names found across all records
    '''

    all_keys = set()

    # iterate over the values directly instead of data[i]: integer lookups on a pandas Series
    # are label-based and raise KeyError once the index is no longer 0..n-1 (e.g. after filtering)
    for datum in data:
        for fields in json.loads(datum).values():
            all_keys.update(fields.keys())

    return all_keys

def extract_subject_names(data):
    '''
    Collect the image filename recorded for each subject in the subject records.
    Input: data, an iterable of JSON strings; each string decodes to a dict whose values
        are dicts of metadata fields
    Returns: list of filenames, one per subject that has a non-empty one, in input order
    Prints: how often each filename field occurs, then the number of subjects without a filename
    '''

    # the filename is stored under one of these fields; when a subject carries several,
    # the later field in this tuple takes precedence (and every field present is counted)
    filename_fields = ("Filename", "IMAGE", "image")
    field_counts = {field: 0 for field in filename_fields}

    subject_filenames = []
    without_filename = []   # ids of subjects with none of the fields above (or an empty value)

    for datum in data:
        for subject_id, fields in json.loads(datum).items():
            filename = None
            for field in filename_fields:
                if field in fields:
                    filename = fields[field]
                    field_counts[field] += 1

            if filename:
                subject_filenames.append(filename)
            else:
                # record the miss so the count printed below reflects the data
                without_filename.append(subject_id)

    print(field_counts)
    print(len(without_filename))
    return subject_filenames

def split_name(name):
    '''
    Parse the number designation at the end of a UDG candidate name.
    Input: name, a string whose last underscore-separated piece is the number, e.g. "UDGcand_1261"
    Returns: that number as an int, e.g. 1261
    '''

    _, _, designation = name.rpartition('_')
    return int(designation)

def compare(a, b):
    '''
    Comparator that orders UDG candidate names by ascending number designation.
    Input: a, b, candidate names as strings, e.g. "UDGcand_1303", "UDGcand_1311"
    Returns: split_name(a) - split_name(b), i.e. negative when a comes first,
        zero on a tie, positive when b comes first
    '''

    number_a, number_b = split_name(a), split_name(b)
    return number_a - number_b

In [6]:
# gather the set of metadata field names that occur in any subject record and show it
unique_subject_keys = extract_subject_keys(subject_data)
print(unique_subject_keys)

{'RA', 'image_cross', 'DEC', 'image', 'Image_id', 'Filename', 'ID', 'R', 'G-I', 'IMAGE', 'retired'}


In [7]:
# de-duplicate the image filenames collected from the subject records
unique_names = {*extract_subject_names(subject_data)}

{'Filename': 6295, 'IMAGE': 4259, 'image': 93154}
0


In [8]:
# display the unique image filenames (they come from a set, so the order is arbitrary)
[*unique_names]

['UDGcand_1564_insp.png',
 'UDGcand_4271_insp.png',
 'UDGcand_4442_insp.png',
 'UDGcand_4775_insp.png',
 'UDGcand_5827_insp.png',
 'UDGcand_1794_insp.png',
 'UDGcand_440_insp.png',
 'UDGcand_3718_insp.png',
 'UDGcand_3797_insp.png',
 'UDGcand_6476_insp.png',
 'UDGcand_2524_insp.png',
 'UDGcand_4433_insp.png',
 'UDGcand_2812_insp.png',
 'UDGcand_4998_insp.png',
 'UDGcand_955_insp.png',
 'UDGcand_849_insp.png',
 'UDGcand_3979_insp.png',
 'UDGcand_6245_insp.png',
 'UDGcand_4284_insp.png',
 'UDGcand_3181_insp.png',
 'UDGcand_2771_insp.png',
 'UDGcand_6049_insp.png',
 'UDGcand_1332_insp.png',
 'UDGcand_5472_insp.png',
 'UDGcand_1526_insp.png',
 'UDGcand_5096_insp.png',
 'UDGcand_2237_insp.png',
 'UDGcand_4952_insp.png',
 'UDGcand_6202_insp.png',
 'UDGcand_206_insp.png',
 'UDGcand_6563_insp.png',
 'UDGcand_131_insp.png',
 'UDGcand_2908_insp.png',
 'UDGcand_1952_insp.png',
 'UDGcand_5209_insp.png',
 'UDGcand_7233_insp.png',
 'UDGcand_267_insp.png',
 'UDGcand_2122_insp.png',
 'UDGcand_6301_ins

In [9]:
# strip the trailing '_insp.png' from each image filename to recover the candidate name,
#   e.g. 'UDGcand_1564_insp.png' -> 'UDGcand_1564'
# a name without that suffix is kept unchanged instead of blindly losing its last 9 characters
IMAGE_SUFFIX = '_insp.png'

parsed_names = [
    name[:-len(IMAGE_SUFFIX)] if name.endswith(IMAGE_SUFFIX) else name
    for name in unique_names
]

In [10]:
# sort for presentation purposes: ascending by number designation
#   a key function parses each name once and yields the same stable ordering as
#   comparing split_name(a) - split_name(b) pairwise
sorted_parsed_names = sorted(parsed_names, key=split_name)

In [12]:
# save the unique target names and the unique image strings to .txt files, one entry per line
#   NOTE: running this cell overwrites both files; comment the calls out to keep existing copies
np.savetxt('sf_candidate_names__classification-classify.txt', sorted_parsed_names, delimiter=',', fmt="%s")
# unique_names is a set and has no defined order, so write it sorted (lexicographically)
#   to get the same file contents on every run
np.savetxt('sf_objectImageStrings__classification-classify.txt', sorted(unique_names), delimiter=',', fmt="%s")