In [1]:
import json
from functools import cmp_to_key

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Zooniverse classifications export into a DataFrame.
df = pd.read_csv(
    '../SpaceFluff/zooniverse_exports/classify-classifications.csv',
    delimiter=",",
)

In [3]:
# List the column labels available in the export.
print(df.columns)

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')


In [4]:
# Subject metadata column; each entry is a JSON string (decoded with json.loads by the helpers below).
subject_data = df['subject_data']

In [5]:
def extract_subject_keys(data):
    '''
    Collect every metadata field name used across all subjects.
    Input: data, an iterable of JSON strings (list, pandas Series, ...); each
           string decodes to a dict whose values are dicts of metadata fields
    Returns: set of all field names found in those inner dicts
    '''
    all_keys = set()

    # Iterate over the values themselves rather than by position: `data[i]` on
    # a pandas Series is a label lookup and raises KeyError as soon as the
    # index is not 0..n-1 (e.g. after filtering rows).
    for datum in data:
        for metadata in json.loads(datum).values():
            all_keys.update(metadata.keys())

    return all_keys

def extract_subject_names(data):
    '''
    Collect the image filename recorded for each subject.

    The filename is read from the 'Filename', 'IMAGE' or 'image' field; when a
    subject carries more than one of them, the later field in that order wins.
    Prints how often each field occurs, then how many subjects had no
    filename at all.

    Input: data, an iterable of JSON strings; each string decodes to a dict
           whose values are dicts of metadata fields
    Returns: list of filenames, one per subject that has one (duplicates kept)
    '''
    # Checked in this order, so a later field overrides an earlier one.
    filename_fields = ("Filename", "IMAGE", "image")

    subject_filenames = []
    # check to see how many files have an 'image' or 'IMAGE' field vs. 'Filename'
    field_counts = dict.fromkeys(filename_fields, 0)
    # Keys of the subjects for which no (non-empty) filename was found.
    without_filename = []

    for datum in data:
        for subject_key, metadata in json.loads(datum).items():
            filename = False
            for field in filename_fields:
                if field in metadata:
                    filename = metadata[field]
                    field_counts[field] += 1

            if filename:
                subject_filenames.append(filename)
            else:
                # This list used to stay empty, so the count printed below
                # was always 0 regardless of the data.
                without_filename.append(subject_key)

    print(field_counts)
    print(len(without_filename))
    return subject_filenames

def split_name(name):
    '''
    Return the number designation at the end of a UDG candidate name.
    Input: name as a string, e.g. "UDGcand_1261"
    Returns: the text after the last underscore converted to int, e.g. 1261
    '''
    _, _, designation = name.rpartition('_')
    return int(designation)

def compare(a, b):
    '''
    Comparator that orders UDG candidate names by ascending number designation.
    Input: a, b as strings, e.g. "UDGcand_1303", "UDGcand_1311"
    Returns: split_name(a) - split_name(b); negative when a sorts first,
             zero when the designations are equal, positive when b sorts first
    '''
    return split_name(a) - split_name(b)

In [6]:
# Every metadata field name that occurs in at least one subject entry.
unique_subject_keys = extract_subject_keys(subject_data)
print(unique_subject_keys)

{'image_cross', 'G-I', 'IMAGE', 'ID', 'Filename', 'RA', 'DEC', 'Image_id', 'retired', 'R', 'image'}


In [7]:
# De-duplicate the extracted filenames; the helper also prints its field counts as a side effect.
unique_names = set(extract_subject_names(subject_data))

{'Filename': 6295, 'IMAGE': 4259, 'image': 93154}
0


In [8]:
# Show the unique filenames; a set has no defined order, so the listing order is arbitrary.
list(unique_names)

['UDGcand_6748_insp.png',
 'UDGcand_2945_insp.png',
 'UDGcand_4183_insp.png',
 'UDGcand_2884_insp.png',
 'UDGcand_3298_insp.png',
 'UDGcand_9_insp.png',
 'UDGcand_4412_insp.png',
 'UDGcand_6560_insp.png',
 'UDGcand_1916_insp.png',
 'UDGcand_1722_insp.png',
 'UDGcand_6817_insp.png',
 'UDGcand_4199_insp.png',
 'UDGcand_4873_insp.png',
 'UDGcand_1504_insp.png',
 'UDGcand_3341_insp.png',
 'UDGcand_5941_insp.png',
 'UDGcand_4302_insp.png',
 'UDGcand_562_insp.png',
 'UDGcand_531_insp.png',
 'UDGcand_4795_insp.png',
 'UDGcand_1924_insp.png',
 'UDGcand_2906_insp.png',
 'UDGcand_1379_insp.png',
 'UDGcand_7189_insp.png',
 'UDGcand_2697_insp.png',
 'UDGcand_41_insp.png',
 'UDGcand_2934_insp.png',
 'UDGcand_5155_insp.png',
 'UDGcand_6272_insp.png',
 'UDGcand_4334_insp.png',
 'UDGcand_2854_insp.png',
 'UDGcand_6013_insp.png',
 'UDGcand_1624_insp.png',
 'UDGcand_6726_insp.png',
 'UDGcand_1692_insp.png',
 'UDGcand_812_insp.png',
 'UDGcand_2120_insp.png',
 'UDGcand_236_insp.png',
 'UDGcand_5143_insp.p

In [9]:
# Trailing part of each image filename that follows the candidate name,
# e.g. 'UDGcand_6748_insp.png' -> 'UDGcand_6748'.
IMAGE_SUFFIX = '_insp.png'

# Strip the suffix only when it is actually present; blindly slicing off a
# fixed number of characters would silently mangle any differently named file.
parsed_names = [
    name[:-len(IMAGE_SUFFIX)] if name.endswith(IMAGE_SUFFIX) else name
    for name in unique_names
]

In [10]:
# Sort for presentation purposes: ascending by number designation, using the
# custom comparator `compare` adapted into a sort key.
sorted_parsed_names = sorted(parsed_names, key=cmp_to_key(compare))

In [11]:
# # save the sorted list of unique target names to a .txt file
# #   uncomment to re-save
# np.savetxt('sf_candidate_names__classification-classify.txt', sorted_parsed_names, delimiter=',', fmt="%s")
# np.savetxt('sf_objectImageStrings__classification-classify.txt', list(unique_names), delimiter=',', fmt="%s")