# Processing Zooniverse Data Exports for Space Fluff

In [1]:
import pandas as pd
import json
from datetime import date

In [2]:
global exports_dir
global outputs_dir
#exports_dir = /home/anna/Desktop/SUNDIAL/images/
#outputs_dir = /home/anna/Desktop/SUNDIAL/images/
exports_dir = 'zooniverse_exports/'
outputs_dir = 'outputs'

filename_classify = exports_dir+'classify-classifications.csv'
filename_onthego = exports_dir+'classify-on-the-go-classifications.csv'
filename_hardcore = exports_dir+'classify-hardcore-edition-classifications.csv'

outputfile_classify = exports_dir+'space-fluff_classifications_clean.csv'
outputfile_onthego = exports_dir+'space-fluff_onthego_clean.csv'
outputfile_hardcore = exports_dir+'space-fluff_hardcore_clean.csv'

In [3]:
columns_out_classify = ['classification_id', 'created_at', 'user_name', 'user_id',
               'workflow_id', 'workflow_version', 'subject_ids', 
               'taskvalue_T0_classify', 'taskvalue_T1_classify', 'taskvalue_T2_classify',
                'subject_name_classify']
columns_out_onthego = ['classification_id', 'created_at', 'user_name', 'user_id',
               'workflow_id', 'workflow_version', 'subject_ids', 
               'taskvalue_T0_onthego', 'subject_name_onthego']

# not all tasks are active in the hardcore workflow: amt there is T0, T2, T1, T3, T4, T5, T9 

columns_out_hardcore = ['classification_id', 'created_at', 'user_name', 'user_id',
               'workflow_id', 'workflow_version', 'subject_ids', 
               'taskvalue_T0', 'taskvalue_T1','taskvalue_T2', 'taskvalue_T3',
                'taskvalue_T4', 'taskvalue_T5', 'taskvalue_T9', 'subject_name']

In [4]:

columns_in = ['classification_id', 'user_name', 'user_id', 'user_ip', 
              'workflow_id','workflow_name', 'workflow_version', 'created_at', 
              'gold_standard', 'expert', 'metadata', 'annotations', 
              'subject_data', 'subject_ids']
       
columns_new_classify = ['metadata_json_classify', 'annotations_json_classify',
                'subject_data_json_classify', 
               'taskvalue_T0_classify', 'taskvalue_T1_classify', 'taskvalue_T2_classify',
                'subject_name_classify']
columns_new_onthego = ['metadata_json_onthego', 'annotations_json_onthego',
                'subject_data_json_onthego', 
               'taskvalue_T0_onthego', 'subject_name_onthego']
columns_new_hardcore = ['metadata_json', 'annotations_json', 'subject_data_json', 
               'taskvalue_T0', 'taskvalue_T1','taskvalue_T2', 'taskvalue_T3',
                'taskvalue_T4', 'taskvalue_T5', 'taskvalue_T9', 'subject_name']

In [5]:
classifications_classify = pd.read_csv(filename_classify)
classifications_classify = classifications_classify[6295:]

classifications_onthego = pd.read_csv(filename_onthego)
classifications_onthego = classifications_onthego[19989:]

classifications_hardcore = pd.read_csv(filename_hardcore)
classifications_hardcore = classifications_hardcore[5945:]



In [6]:
classifications_classify['metadata_json'] = [json.loads(q) for q in classifications_classify.metadata]
classifications_classify['annotations_json'] = [json.loads(q) for q in classifications_classify.annotations]
classifications_classify['subject_data_json'] = [json.loads(q) for q in classifications_classify.subject_data]

In [7]:
classifications_onthego['metadata_json'] = [json.loads(q) for q in classifications_onthego.metadata]
classifications_onthego['annotations_json'] = [json.loads(q) for q in classifications_onthego.annotations]
classifications_onthego['subject_data_json'] = [json.loads(q) for q in classifications_onthego.subject_data]

In [8]:
classifications_hardcore['metadata_json'] = [json.loads(q) for q in classifications_hardcore.metadata]
classifications_hardcore['annotations_json'] = [json.loads(q) for q in classifications_hardcore.annotations]
classifications_hardcore['subject_data_json'] = [json.loads(q) for q in classifications_hardcore.subject_data]

In [9]:
# some functions to make things a bit cleaner

def clean_answers_onthego(classifications, annotations_columns): 
    for i, row in classifications.iterrows():
        answers = {'T0':''}
        for t in row[annotations_columns]:
            answers[t['task']] = t['value']
        taskvalue_T0_onthego.append(answers['T0'])
    
    
    
    
def clean_answers_classify(classifications, annotations_columns): 
    for i, row in classifications.iterrows():
        answers = {'T0':'', 'T1':'', 'T2':''}
        for t in row[annotations_columns]:
            answers[t['task']] = t['value']
        taskvalue_T0_classify.append(answers['T0'])
        taskvalue_T1_classify.append(answers['T1'])
        taskvalue_T2_classify.append(answers['T2'])

    

def clean_answers_hardcore(classifications, annotations_columns): 
    for i, row in classifications.iterrows():
        answers = {'T0':'', 'T1':'', 'T2':'', 'T3':'', 'T4':'', 'T5':'', 'T9':''}
        for t in row[annotations_columns]:
            #print(t['task'],',  ',t['value'])
            answers[t['task']] = t['value']
        #print(answered)
        taskvalue_T0.append(answers['T0'])
        taskvalue_T1.append(answers['T1'])
        taskvalue_T2.append(answers['T2'])
        taskvalue_T3.append(answers['T3'])
        taskvalue_T4.append(answers['T4'])
        taskvalue_T5.append(answers['T5'])
        taskvalue_T9.append(answers['T9'])
         
        #print('len of taskvalue: ', len(taskvalue_T0))

def add_subject_name(classifications):
    subject_name = []
    for row in classifications['subject_data']:
        image_no = row.split('IMAGE":"')[1]
        subject_name.append(image_no.split('"')[0])
    #print('len of subj:  ',len(subject_name))    
    return subject_name
            

In [10]:
taskvalue_T0 = []
taskvalue_T1 = []
taskvalue_T2 = []
taskvalue_T3 = []
taskvalue_T4 = []
taskvalue_T5 = []
taskvalue_T9 = []


clean_answers_hardcore(classifications_hardcore, 'annotations_json')
subject_names = add_subject_name(classifications_hardcore)

classifications_hardcore['taskvalue_T0'] = taskvalue_T0
classifications_hardcore['taskvalue_T1'] = taskvalue_T1
classifications_hardcore['taskvalue_T2'] = taskvalue_T2
classifications_hardcore['taskvalue_T3'] = taskvalue_T3
classifications_hardcore['taskvalue_T4'] = taskvalue_T4
classifications_hardcore['taskvalue_T5'] = taskvalue_T5
classifications_hardcore['taskvalue_T9'] = taskvalue_T9
classifications_hardcore['subject_name'] = subject_names

print(classifications_hardcore['taskvalue_T0'].head(20))

IndexError: list index out of range

In [None]:
taskvalue_T0_onthego = []

clean_answers_onthego(classifications_onthego, 'annotations_json')
subject_names_onthego = add_subject_name(classifications_onthego)

classifications_onthego['taskvalue_T0_onthego'] = taskvalue_T0_onthego
classifications_onthego['subject_name_onthego'] = subject_names_onthego

In [None]:
taskvalue_T0_classify = []
taskvalue_T1_classify = []
taskvalue_T2_classify = []

clean_answers_classify(classifications_classify, 'annotations_json')
subject_names_classify = add_subject_name(classifications_classify)


classifications_classify['taskvalue_T0_classify'] = taskvalue_T0_classify
classifications_classify['taskvalue_T1_classify'] = taskvalue_T1_classify
classifications_classify['taskvalue_T2_classify'] = taskvalue_T2_classify
classifications_classify['subject_name_classify'] = subject_names_classify



In [None]:
output_hardcore = classifications_hardcore[columns_out_hardcore]
output_hardcore.to_csv(outputfile_hardcore, index=False)

In [None]:
output_classify = classifications_classify[columns_out_classify]
output_classify.to_csv(outputfile_classify, index=False)

In [None]:
output_onthego = classifications_onthego[columns_out_onthego]
output_onthego.to_csv(outputfile_onthego, index=False)