## Goal
Figure out how to properly (efficiently) parse the datetimes in the SpaceFluff data.

In [1]:
import json
import pandas as pd
from datetime import date
from sf import parseTime

## Load `classify-classifications.csv`, parse stringified columns, print column names:

In [2]:
# Read the Zooniverse classification export into a DataFrame.
df = pd.read_csv(
    '../SpaceFluff/zooniverse_exports/classify-classifications.csv',
    sep=",",
)

# These columns are stored as JSON strings; decode each value into Python objects.
for column in ('annotations', 'subject_data'):
    df[column] = df[column].apply(json.loads)

# Convert the `created_at` strings using the project's parseTime helper.
df['created_at'] = parseTime(df['created_at'])

In [3]:
df.columns  # display the column names of the loaded frame for reference

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')

In [4]:
# Show the entries created before 2020-10-20 (the supposed beta start date).
# Note: this only displays those rows; `df` itself is left unchanged.
df.loc[df['created_at'] < pd.Timestamp(date(2020, 10, 20), tz='utc')]

Unnamed: 0,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,expert,metadata,annotations,subject_data,subject_ids
0,271393634,SUNDIAL-Itn,2048353.0,f37e74914974b1786028,16138,Classify!,15.28,2020-09-02 07:47:42+00:00,,,"{""source"":""api"",""session"":""e1526a14b6f188c31d4...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43856466': {'retired': None, 'Filename': 'UD...",43856466
1,271929397,not-logged-in-0abc0f712c4255eb980f,,0abc0f712c4255eb980f,16138,Classify!,15.28,2020-09-05 14:26:33+00:00,,,"{""source"":""api"",""session"":""f8d6e3b8746b19b6ad2...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43851832': {'retired': None, 'Filename': 'UD...",43851832
2,272079802,not-logged-in-f33c0dda4287fcab99c1,,f33c0dda4287fcab99c1,16138,Classify!,15.28,2020-09-06 15:46:50+00:00,,,"{""source"":""api"",""session"":""5138e36df25d0a784a2...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43854768': {'retired': None, 'Filename': 'UD...",43854768
3,272079992,not-logged-in-f33c0dda4287fcab99c1,,f33c0dda4287fcab99c1,16138,Classify!,15.28,2020-09-06 15:48:02+00:00,,,"{""source"":""api"",""session"":""5138e36df25d0a784a2...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43852307': {'retired': None, 'Filename': 'UD...",43852307
4,272080282,not-logged-in-f33c0dda4287fcab99c1,,f33c0dda4287fcab99c1,16138,Classify!,15.28,2020-09-06 15:49:42+00:00,,,"{""source"":""api"",""session"":""5138e36df25d0a784a2...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43850884': {'retired': None, 'Filename': 'UD...",43850884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6290,281169253,not-logged-in-23e5cf4956c21acfe542,,23e5cf4956c21acfe542,16138,Classify!,15.28,2020-10-17 20:03:45+00:00,,,"{""source"":""api"",""session"":""fe21b80b97103224fe7...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43858257': {'retired': None, 'Filename': 'UD...",43858257
6291,281169306,not-logged-in-23e5cf4956c21acfe542,,23e5cf4956c21acfe542,16138,Classify!,15.28,2020-10-17 20:03:59+00:00,,,"{""source"":""api"",""session"":""fe21b80b97103224fe7...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43856147': {'retired': None, 'Filename': 'UD...",43856147
6292,281169330,not-logged-in-23e5cf4956c21acfe542,,23e5cf4956c21acfe542,16138,Classify!,15.28,2020-10-17 20:04:06+00:00,,,"{""source"":""api"",""session"":""fe21b80b97103224fe7...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43843542': {'retired': None, 'Filename': 'UD...",43843542
6293,281169351,not-logged-in-23e5cf4956c21acfe542,,23e5cf4956c21acfe542,16138,Classify!,15.28,2020-10-17 20:04:13+00:00,,,"{""source"":""api"",""session"":""fe21b80b97103224fe7...","[{'task': 'T0', 'task_label': 'Look at the ver...","{'43844279': {'retired': None, 'Filename': 'UD...",43844279


---

### Group classifications by workflow_version

In [5]:
# Group the classifications by the workflow version they were recorded under.
gr_version = df.groupby(by=['workflow_version'])

In [6]:
# Display the mapping from each workflow_version value to the row labels in that group.
gr_version.groups

{15.28: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]}

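All classifications in this .csv file were made under the same workflow version (15.28). Can we therefore assume that they were all made after the beta phase, and are thus all properly usable? Note that the date filter above still returns rows created before 2020-10-20 (the supposed beta start date), so the workflow version alone may not be enough to answer this.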