In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [21]:
# load classifications
df = pd.read_csv('./zooniverse_exports/classify-classifications.csv', delimiter=",")

In [3]:
print(df.keys())

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')


In [29]:
firstEntry = df.loc[0]
firstEntry['subject_data']
firstEntry['annotations']

'[{"task":"T0","task_label":"Look at the very center of the image: do you see a single galaxy or a group of far away objects? ","value":"Galaxy"},{"task":"T1","task_label":"Is the galaxy fluffy or is it bright?","value":"Fluffy"}]'

In [4]:
def parse_task(row):
    loads = json.loads(row)
    return loads

def getFilename(row):
    '''
    extract the 'Filename'|'image'|'IMAGE' field from the subject_data column in every row
    '''
    keys = list(row.values())[0].keys()
    accessKey = 'Filename' if 'Filename' in keys else 'image' if 'image' in keys else 'IMAGE' if 'IMAGE' in keys else None
    
    if accessKey:
        return list(row.values())[0][accessKey]
    
def extract_task(task_index, row):
    try:
        return row[task_index]['value']
    except: 
        return

In [5]:
# json parse 'annotations' column 
df['annotations'] = df['annotations'].apply(parse_task)
df['subject_data'] = df['subject_data'].apply(json.loads)

df['Filename'] = df['subject_data'].apply(getFilename)

df['Task0'] = df['annotations'].apply(lambda x: extract_task(0, x))
df['Task1'] = df['annotations'].apply(lambda x: extract_task(1, x))

## Goal of this notebook:

- For every unique object in the dataset, extract the following fields:
    - name,
    - 'task 0' value (recall 'task 0' asks if the users sees a galaxy or a group of objects

In [20]:
def percentageVotesForAnswer(counts, answer):
    '''
    `counts` is a df column like {galaxy: 15, group of objects (cluster): 10, something else/empty center: 2}
    `answer` is one of the keys of counts
    '''
    
    totalVotes = sum(counts.values())
    
    if not answer in counts.keys():
        return 0
    
    votesForAnswer = counts[answer]
    
    return round(100*votesForAnswer/totalVotes,0)

In [8]:
# extract unique object names that resulted from another notebook 
candidate_names_classify = np.loadtxt('./sf_objectImageStrings__classification-classify.txt', dtype=str)

In [9]:
# group df by filename, so that each group contains only rows belonging to that object
gr = df.groupby('Filename')

# create empty list to push results to
task0Values = []

In [10]:
# loop over every group created above to accumulate 'task 0' votes ('galaxy'/'group of objects'/'something else')
for objectName in candidate_names_classify:
    task0 = gr.get_group(objectName)['Task0']
    
    counts = task0.value_counts().to_dict()
    
    countObj = {
        "name": objectName,
        "counts": counts
    }
    
    task0Values.append(countObj)

In [11]:
df_task0 = pd.DataFrame(task0Values)
df_task0.head()

Unnamed: 0,name,counts
0,UDGcand_1541_insp.png,{'Galaxy': 15}
1,UDGcand_5603_insp.png,"{'Group of objects (Cluster)': 11, 'Galaxy': 4}"
2,UDGcand_6077_insp.png,{'Group of objects (Cluster)': 17}
3,UDGcand_5840_insp.png,{'Something else/empty center': 14}
4,UDGcand_5600_insp.png,{'Something else/empty center': 16}


In [12]:
answer_types = ['Galaxy', 'Group of objects (Cluster)', 'Something else/empty center']

df_task0['# votes'] = df_task0['counts'].apply(lambda x: sum(x.values()))

for ans_type in answer_types:
    df_task0['% votes {}'.format(ans_type)] = df_task0['counts'].apply(
        lambda x: percentageVotesForAnswer(x, ans_type)
    )


In [13]:
mostly_something = df_task0[df_task0['% votes Something else/empty center'] < 30]
mostly_nothing = df_task0[df_task0['% votes Something else/empty center'] > 70]

In [14]:
# extract names of objects where people votes mostly 'Something else/empty center' with a 50% threshold (@todo: refine!)
probably_not_galaxy = list(map(lambda x: x[:-9], list(mostly_nothing['name'])))
probably_galaxy = list(map(lambda x: x[:-9], list(mostly_something['name'])))

In [15]:
# extract catalogue targets from other notebook
ct = np.loadtxt('../aku/sf_catalogue_targets.txt', dtype='str')

differing_answers = list(set(ct) & set(probably_not_galaxy))
corresponding_answers = list(set(ct) & set(probably_galaxy))

In [16]:
len(differing_answers), len(corresponding_answers)

len(ct) - len(differing_answers) - len(corresponding_answers)

107

In [17]:
remaining_objects = np.setdiff1d(np.setdiff1d(ct, differing_answers), corresponding_answers)

In [18]:
# slice df_task0 to see votes for objects that are included in the catalogue as potential UDGs
df_task0['name'] = df_task0['name'].apply(lambda x: x[:-9])
df_task0[df_task0['name'].isin(remaining_objects)]

Unnamed: 0,name,counts,# votes,% votes Galaxy,% votes Group of objects (Cluster),% votes Something else/empty center
122,UDGcand_2367,"{'Something else/empty center': 8, 'Galaxy': 5...",14,36.0,7.0,57.0
203,UDGcand_7158,"{'Something else/empty center': 7, 'Group of o...",16,25.0,31.0,44.0
263,UDGcand_4474,"{'Something else/empty center': 6, 'Group of o...",15,27.0,33.0,40.0
275,UDGcand_197,"{'Galaxy': 8, 'Something else/empty center': 5...",14,57.0,7.0,36.0
387,UDGcand_1765,"{'Galaxy': 7, 'Something else/empty center': 6...",17,41.0,24.0,35.0
...,...,...,...,...,...,...
6014,UDGcand_5097,"{'Something else/empty center': 11, 'Group of ...",16,12.0,19.0,69.0
6022,UDGcand_6339,"{'Something else/empty center': 10, 'Galaxy': ...",16,19.0,19.0,62.0
6086,UDGcand_949,"{'Something else/empty center': 8, 'Galaxy': 4...",16,25.0,25.0,50.0
6221,UDGcand_6648,"{'Something else/empty center': 8, 'Galaxy': 7...",16,44.0,6.0,50.0


In [19]:
df_task0['# votes'].agg(['sum']), df.shape[0]

(sum    100871
 Name: # votes, dtype: int64,
 103708)