In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read the Zooniverse classification export into a dataframe (one row per classification)
df = pd.read_csv('./zooniverse_exports/classify-classifications.csv', sep=",")

In [3]:
# list the available columns
print(df.columns)

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')


In [4]:
# inspect columns to see what we're working with
# (at this point 'subject_data' and 'annotations' are still raw JSON strings)
firstEntry = df.loc[0]  # the row with index label 0
firstEntry['subject_data']  # not rendered: a notebook only displays the cell's last expression
firstEntry['annotations']  # rendered as the cell output

'[{"task":"T0","task_label":"Look at the very center of the image: do you see a single galaxy or a group of far away objects? ","value":"Galaxy"},{"task":"T1","task_label":"Is the galaxy fluffy or is it bright?","value":"Fluffy"}]'

In [5]:
def getFilename(row):
    '''
    Extract the image filename from one parsed entry of the subject_data column.

    `row` is a dict whose first value is the subject's metadata dict. The filename is
    looked up in that metadata under 'Filename', then 'image', then 'IMAGE'.

    Returns the filename, or None (after printing a notice) when `row` is empty or
    none of the three keys is present.
    '''
    subjects = list(row.values()) if row else []

    if subjects:
        # look the subject metadata up once instead of re-indexing the dict for every access
        subject = subjects[0]

        for accessKey in ('Filename', 'image', 'IMAGE'):
            if accessKey in subject:
                return subject[accessKey]

    print('No filename found!')
    return None
    
def extract_task_value(task_index, row):
    '''
    Return the 'value' of the task at position `task_index` in a parsed annotations list.

    `row` is a list of task dicts such as {"task": "T0", "task_label": ..., "value": "Galaxy"}.

    Returns None when the task is absent (list too short), the task has no 'value' key,
    or `row` is not subscriptable in the expected way. Any other error is propagated
    instead of being silently swallowed.
    '''
    try:
        return row[task_index]['value']
    except (IndexError, KeyError, TypeError):
        return None

In [6]:
# JSON-parse the 'annotations' and 'subject_data' columns.
# Only cells that are still strings are parsed, so re-running this cell on already-parsed
# columns is a no-op (json.loads raises TypeError when handed a list or dict).
for json_column in ('annotations', 'subject_data'):
    df[json_column] = df[json_column].apply(
        lambda cell: json.loads(cell) if isinstance(cell, str) else cell
    )

# image filename of the classified subject
df['Filename'] = df['subject_data'].apply(getFilename)

# answers to the first and second task (None when the task was not answered)
df['Task0'] = df['annotations'].apply(lambda x: extract_task_value(0, x))
df['Task1'] = df['annotations'].apply(lambda x: extract_task_value(1, x))

## Goal of this notebook:

- For every unique object in the dataset, extract the following fields:
    - name,
    - 'task 0' value (recall that 'task 0' asks whether the user sees a galaxy or a group of objects),
    - @todo: 'task 1' value if it exists

In [7]:
def percentageVotesForAnswer(counts, answer):
    '''
    Percentage of all votes in `counts` that went to `answer`, rounded to a whole number.

    `counts` is a dict of vote tallies for one object, like
        {'Galaxy': 15, 'Group of objects (Cluster)': 10, 'Something else/empty center': 2}
    `answer` is the answer to look up (normally one of the keys of counts)

    Returns 0 when `answer` is not in `counts` or when there are no votes at all;
    otherwise the result of round(..., 0), e.g. 27.0 for 4 votes out of 15.
    '''

    totalVotes = sum(counts.values())

    if answer not in counts:
        return 0

    # guard against dividing by zero when every tally is 0
    if totalVotes == 0:
        return 0

    votesForAnswer = counts[answer]

    return round(100*votesForAnswer/totalVotes, 0)

In [8]:
# extract unique object names that resulted from another notebook
# (read as strings; the names appear as e.g. 'UDGcand_1541_insp.png' in the outputs further down)
candidate_names_classify = np.loadtxt('./sf_objectImageStrings__classification-classify.txt', dtype=str)

In [9]:
# one group per object: every classification row that shares the same filename
gr = df.groupby(by='Filename')

# the per-object vote counts are collected in this list
task0Values = list()

In [10]:
# loop over every candidate object to accumulate its 'task 0' votes
# ('galaxy'/'group of objects'/'something else')

# start from an empty list in this cell, so that re-running it does not append every object a second time
task0Values = []

for objectName in candidate_names_classify:
    # get_group raises KeyError if the candidate has no classification rows in df
    task0 = gr.get_group(objectName)['Task0']

    # value_counts() leaves out missing values, so classifications without a task 0 answer are not tallied
    counts = task0.value_counts().to_dict()

    countObj = {
        "name": objectName,
        "counts": counts,
    }

    task0Values.append(countObj)

In [11]:
# one row per object: its name and the dict of task 0 vote counts
df_task0 = pd.DataFrame(data=task0Values)
df_task0.head(5)

Unnamed: 0,name,counts
0,UDGcand_1541_insp.png,{'Galaxy': 15}
1,UDGcand_5603_insp.png,"{'Group of objects (Cluster)': 11, 'Galaxy': 4}"
2,UDGcand_6077_insp.png,{'Group of objects (Cluster)': 17}
3,UDGcand_5840_insp.png,{'Something else/empty center': 14}
4,UDGcand_5600_insp.png,{'Something else/empty center': 16}


In [12]:
# the three possible answers to task 0
answer_types = ['Galaxy', 'Group of objects (Cluster)', 'Something else/empty center']

# total number of task 0 votes each object received
df_task0['# votes'] = df_task0['counts'].apply(lambda vote_counts: sum(vote_counts.values()))

# add one '% votes <answer>' column per possible answer
for ans_type in answer_types:
    vote_percentage_column = df_task0['counts'].apply(
        lambda vote_counts: percentageVotesForAnswer(vote_counts, ans_type)
    )
    df_task0['% votes ' + ans_type] = vote_percentage_column

In [13]:
# rough initial separation just for inspection:
# objects where more than half of the votes went to 'Galaxy' / to 'Something else/empty center'
mostly_something = df_task0[df_task0['% votes Galaxy'] > 50]
mostly_nothing = df_task0[df_task0['% votes Something else/empty center'] > 50]

# strip the trailing 9 characters ('_insp.png' in names like 'UDGcand_1541_insp.png') from the names.
# Names that no longer end in '.png' are kept as they are: df_task0['name'] is stripped in place further
# down, and re-running this cell afterwards must not cut the names a second time.
mostly_nothing = [name[:-9] if name.endswith('.png') else name for name in mostly_nothing['name']]
mostly_something = [name[:-9] if name.endswith('.png') else name for name in mostly_something['name']]

In [14]:
# catalogue targets produced by another notebook, read as strings
ct = np.loadtxt('../aku/sf_catalogue_targets.txt', dtype='str')
catalogue_targets = set(ct)

# differing: catalogue objects that people mostly vote 'something else' on
# (despite its name this is a set intersection of the catalogue with mostly_nothing)
difference = list(catalogue_targets & set(mostly_nothing))

# corresponding: catalogue objects that are also in mostly_something
intersection = list(catalogue_targets & set(mostly_something))

In [15]:
# sizes of the two overlaps with the catalogue
n_differing, n_corresponding = len(difference), len(intersection)
print(n_differing, n_corresponding)

# number of catalogue entries that fall in neither list
len(ct) - n_differing - n_corresponding

95 80


90

In [16]:
# catalogue targets that are in neither `difference` nor `intersection` (sorted, unique)
remaining_objects = np.setdiff1d(ct, list(difference) + list(intersection))

In [17]:
# slice df_task0 to see votes for objects that are included in the catalogue as potential UDGs

# Drop the trailing 9 characters ('_insp.png' in names like 'UDGcand_1541_insp.png') so the names can be
# compared with the catalogue names. This overwrites the column in place, so names that no longer end in
# '.png' are left alone; otherwise every re-run of this cell would chop off 9 more characters.
df_task0['name'] = df_task0['name'].apply(lambda x: x[:-9] if x.endswith('.png') else x)
df_task0[df_task0['name'].isin(remaining_objects)]

Unnamed: 0,name,counts,# votes,% votes Galaxy,% votes Group of objects (Cluster),% votes Something else/empty center
79,UDGcand_5044,"{'Galaxy': 7, 'Group of objects (Cluster)': 5,...",17,41.0,29.0,29.0
203,UDGcand_7158,"{'Something else/empty center': 7, 'Group of o...",16,25.0,31.0,44.0
263,UDGcand_4474,"{'Something else/empty center': 6, 'Group of o...",15,27.0,33.0,40.0
335,UDGcand_2824,"{'Galaxy': 7, 'Group of objects (Cluster)': 5,...",15,47.0,33.0,20.0
387,UDGcand_1765,"{'Galaxy': 7, 'Something else/empty center': 6...",17,41.0,24.0,35.0
...,...,...,...,...,...,...
5856,UDGcand_4237,"{'Galaxy': 6, 'Something else/empty center': 6...",15,40.0,20.0,40.0
5859,UDGcand_4321,"{'Group of objects (Cluster)': 11, 'Galaxy': 3...",16,19.0,69.0,12.0
6086,UDGcand_949,"{'Something else/empty center': 8, 'Group of o...",16,25.0,25.0,50.0
6221,UDGcand_6648,"{'Something else/empty center': 8, 'Galaxy': 7...",16,44.0,6.0,50.0


---

## Q: Are all votes counted properly?

In [18]:
# total number of tallied task 0 votes vs. number of classification rows
df_task0['# votes'].agg(['sum']), len(df)

(sum    100871
 Name: # votes, dtype: int64,
 103708)

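^ The number of rows in the dataframe doesn't match the total number of galaxy/cluster/something-else votes. Why? (Possible causes to check: `value_counts()` skips rows whose `Task0` is missing, and rows whose `Filename` is not in `candidate_names_classify` are never counted.)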

---

In [19]:
def extract_retired_info(row):
    '''
    Return the "retired" entry of the first subject in a parsed subject_data dict.
    '''
    subjects = list(row.values())
    first_subject = subjects[0]
    return first_subject["retired"]

def extract_retired_object_props(row):
    '''
    List the property names stored for the first subject in a parsed subject_data dict.

    `row` is a dict whose first value is the subject's metadata dict; the keys of that
    metadata dict are returned as a list, in their stored order.
    '''
    # the previous version read an undefined name (`first`) and returned nothing
    return list(list(row.values())[0].keys())
    

In [20]:
# attach each classification's "retired" entry, then keep only the rows where it is set
df["retired"] = df["subject_data"].apply(extract_retired_info)
is_retired = df["retired"].notnull()
df_retired = df[is_retired]

- Group retired objects by filename,
- extract properties (RA, DEC, etc.) from them,
- append properties as new columns to task0 dataframe

In [21]:
# group by filename; a scalar key (rather than a one-element list) keeps get_group(<name>) working with
# a plain string -- recent pandas versions deprecate scalar lookups on groupings made with a length-1 list
gr_retired = df_retired.groupby("Filename")
props = ["R", "RA", "DEC", "G-I"]              # object properties to extract

props_list = []

for objectName in candidate_names_classify:
    try:
        # subject_data of every classification of this (retired) object
        retired_subject_data = gr_retired.get_group(objectName)['subject_data']

        # take the first entry in the group (props should be the same for every entry since they all
        # describe the same object)
        first_subject_data = retired_subject_data.iloc[0]
        values = list(first_subject_data.values())[0]

        # create object with name (without the trailing 9 characters, '_insp.png') and properties
        entry = {'name': objectName[:-9]}

        for key in props:
            entry[key] = values[key]
    except (KeyError, IndexError):
        # KeyError: the object has no retired classification, or one of the props is missing;
        # IndexError: the subject_data dict is empty. Such objects are skipped; other errors surface.
        continue

    props_list.append(entry)

# explicit columns keep the 'name' column available even when no object had properties
df_props = pd.DataFrame(props_list, columns=['name'] + props)

In [22]:
# Attach the object properties to the vote table, joining explicitly on the object name.
# df_props names have their trailing '_insp.png' removed, so df_task0['name'] must already be stripped
# as well (done in place in an earlier cell) for the names to match.
# how='outer' keeps objects without properties; their R/RA/DEC/G-I are NaN.
df_with_props = df_task0.merge(df_props, how='outer', on='name')

In [25]:
# preview the first rows of the merged table
df_with_props.head(5)

Unnamed: 0,name,counts,# votes,% votes Galaxy,% votes Group of objects (Cluster),% votes Something else/empty center,R,RA,DEC,G-I
0,UDGcand_1541,{'Galaxy': 15},15,100.0,0.0,0.0,18.1924,56.6240234375,-35.0544166565,0.587689876556
1,UDGcand_5603,"{'Group of objects (Cluster)': 11, 'Galaxy': 4}",15,27.0,73.0,0.0,19.9224,52.5327110291,-35.0038948059,1.24255418777
2,UDGcand_6077,{'Group of objects (Cluster)': 17},17,0.0,100.0,0.0,20.8128,57.528503418,-34.5431938171,1.43765449524
3,UDGcand_5840,{'Something else/empty center': 14},14,0.0,0.0,100.0,18.8742,51.9040298462,-35.703163147,1.45068645477
4,UDGcand_5600,{'Something else/empty center': 16},16,0.0,0.0,100.0,19.1851,51.8673019409,-35.73462677,1.50421380997


### Objects that don't have enough votes haven't been retired, so their properties have to be obtained elsewhere (Q: where?)

In [23]:
# objects that aren't retired: no properties were found for them, so RA is missing
has_no_props = df_with_props['RA'].isna()
df_with_props[has_no_props]

Unnamed: 0,name,counts,# votes,% votes Galaxy,% votes Group of objects (Cluster),% votes Something else/empty center,R,RA,DEC,G-I
10,UDGcand_6896,{'Group of objects (Cluster)': 1},1,0.0,100.0,0.0,,,,
21,UDGcand_6943,{'Group of objects (Cluster)': 1},1,0.0,100.0,0.0,,,,
43,UDGcand_6915,{'Galaxy': 1},1,100.0,0.0,0.0,,,,
175,UDGcand_6977,{'Galaxy': 1},1,100.0,0.0,0.0,,,,
259,UDGcand_6882,{'Galaxy': 1},1,100.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...
6209,UDGcand_6877,{'Group of objects (Cluster)': 1},1,0.0,100.0,0.0,,,,
6306,UDGcand_7017,{'Galaxy': 2},2,100.0,0.0,0.0,,,,
6307,UDGcand_6986,{'Galaxy': 1},1,100.0,0.0,0.0,,,,
6348,UDGcand_6931,{'Group of objects (Cluster)': 2},2,0.0,100.0,0.0,,,,
