In [1]:
import sys
sys.path.append('../..')

# reload sf import while I'm working on extracting functionality to it from notebooks
from importlib import reload
import sf_lib; import sf_lib.sf; import sf_lib.df
reload(sf_lib), reload(sf_lib.sf), reload(sf_lib.df)

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from datetime import date

from sf_lib.df import (
    make_df_classify, 
    make_df_tasks_with_props
)
from sf_lib.sf import (
    get_running_vote_fraction,
    getFilename, 
    getMetadataValue, 
    parseTime, 
    getGroupSize, 
    extractTaskValue
)

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Functions:

In [2]:
def df_to_json(df, path):
    """Write `df` to `path` as JSON using pandas' default settings.

    Nothing is returned; the file at `path` is the only result.
    """
    df.to_json(path_or_buf=path)

### Load auxiliary data, like object info and names of targets in the catalogue

In [3]:
# Per-object catalogue data; lines starting with '#' in the CSV are skipped as comments.
# `object_info` is later merged onto the vote table on its 'name' column.
object_info = pd.read_csv('../../catalogue/sf_spacefluff_object_data.csv', comment="#")
# Plain-text name lists, loaded as arrays of strings.
candidate_names_classify = np.loadtxt('../sf_candidate_names__classification-classify.txt', dtype='str')
catalogue_targets = np.loadtxt('../../catalogue/sf_catalogue_targets.txt', dtype='str')

### Define tasks so we can load the dataframes properly

<span style="color: red;"><strong>IMPORTANT!</strong></span> Before combining the three dataframes, make sure to swap the T1 and T2 columns of the `hardcore` workflow. For some reason, the question assigned to T1 in `Classify!` is assigned to T2 in `Hardcore`. If we don't swap them, the stacked T1 and T2 columns will mix answers to different questions and the data will be useless.

In [4]:
# Task numbers handed to the dataframe loader for each workflow.
tasks_onthego = [0]
tasks_classify = [0, 1]
tasks_hardcore = [0, 2, 1, 3, 4, 5, 9]

# Column labels ('T0', 'T2', ...) matching the hardcore task numbers, in the same order.
task_strings = list(map('T{}'.format, tasks_hardcore))

### Load dataframes

In [5]:
# One dataframe per workflow, each built with that workflow's task list.
# NOTE(review): `make_df_classify` is imported from sf_lib.df and not shown here;
# presumably the first argument selects the workflow export to read and the second
# the task numbers to extract into 'T<n>' columns — confirm against sf_lib.df.
df_classify = make_df_classify('classify', tasks_classify)
df_hardcore = make_df_classify('hardcore', tasks_hardcore)
df_onthego = make_df_classify('onthego', tasks_onthego)

#### Swap T1 and T2 columns in `hardcore` workflow

In [6]:
# Exchange the contents of the T1 and T2 columns so that hardcore's columns hold the
# same questions as the other workflows' columns.
# NOTE: the swap is its own inverse — running this cell a second time undoes it.
df_hardcore[['T1', 'T2']] = df_hardcore[['T2', 'T1']]

### Stack the three dataframes

Note the following inconsistency: `onthego` formulates one of the _task 0_ answers as "Group of objects (cluster)", while `classify` and `hardcore` have it formulated with uppercase 'C': "Group of objects (Cluster)". Fix this by just coercing all answers to lowercase.

In [7]:
# Stack the three workflows into one frame. `pd.concat` replaces the chained
# `DataFrame.append`, which was deprecated in pandas 1.4 and removed in 2.0; like
# `append`, it keeps each frame's original row labels.
df = pd.concat([df_classify, df_hardcore, df_onthego])
# Lowercase every T0 answer so that differently-cased spellings of the same answer
# compare equal. The `.str` accessor passes missing values through instead of
# raising, which the per-element `x.lower()` call did not.
df['T0'] = df['T0'].str.lower()

In [8]:
# Order classifications by their 'created_at' value. The duplicate filter further
# down keeps the first occurrence of each (user, object) pair in frame order, so
# this ordering decides which classification survives.
df = df.sort_values('created_at')

## Find duplicate classifications of an object made by the same user across workflows

For readability, the code below favours clarity over the best possible time complexity.

In [9]:
# Group only the three columns the duplicate search needs; indexing a narrow
# frame is much faster than indexing the full one.
groupby_user = df.loc[:, ['user_name', 'Filename', 'T0']].groupby(by='user_name')
# Every distinct user name occurring in the stacked frame.
unique_users = df['user_name'].unique()

In [10]:
# Collect the users whose number of classifications differs from the number of
# distinct objects they classified, i.e. who saw at least one object repeatedly.

users_with_duplicates = []

for user in unique_users:
    filenames = groupby_user.get_group(user)['Filename']
    n_distinct = filenames.nunique(dropna=False)
    if n_distinct != len(filenames):
        users_with_duplicates.append(user)

### Generate a list with each user and the objects they saw multiple times:

In [11]:
# Map each affected user to the objects they classified again after a first visit.
# An object seen k times contributes k - 1 entries, in the order the repeats occur.
duplicate_classifications = {}

for user in users_with_duplicates:
    objects = groupby_user.get_group(user)['Filename']
    # `duplicated()` flags every occurrence after the first. It yields the same
    # entries as scanning a growing "seen" list, without the quadratic lookups.
    duplicate_classifications[user] = objects[objects.duplicated()].tolist()

### Loop through all duplicate classifications and extract classification_id of every classification where the user had already seen that object:

In [12]:
to_filter = []  # classification ids of repeat views are appended to this list

for user, dupes in duplicate_classifications.items():
    # A set gives constant-time membership tests; the list used before made this
    # loop quadratic for users with thousands of repeated objects.
    seen = set()

    # All of this user's classifications of objects they saw more than once,
    # in the row order of `df`.
    vals = df.query("user_name == @user & Filename.isin(@dupes)")[['Filename', 'classification_id']].values
    for name, _id in vals:
        if name in seen:
            # Not the first time this user sees the object: mark for removal.
            to_filter.append(_id)
        else:
            seen.add(name)

## Filter all classifications where user had already seen that object:

In [13]:
# Keep only the rows whose classification_id was not marked as a repeat view.
df = df[~df['classification_id'].isin(to_filter)]

# Rewrite creation of 'df_tasks_with_props' for stacked dataframe
Rewriting it from scratch beats wasting hours adapting the earlier `df_tasks_with_props` code.

In [14]:
# Column labels per workflow: hardcore's are derived from its task numbers,
# classify's are spelled out.
task_strings_hc = list(map('T{}'.format, tasks_hardcore))
task_strings_classify = ['T0', 'T1']

def df_votes_with_vote_count(df_votes):
    """Add a 'vote_count' column as the second column of `df_votes`.

    Each row's count is the sum of the values in that row's 'T0' dict.
    The frame is modified in place and the same object is returned.
    """
    totals = df_votes['T0'].map(lambda counts: sum(counts.values()))
    df_votes.insert(1, 'vote_count', totals)
    return df_votes

def df_votes_with_object_info(df_votes):
    """Attach the catalogue columns from the module-level `object_info` to `df_votes`.

    The two frames are outer-merged on 'name'; rows without a 'vote_count' value
    (catalogue objects that have no votes) are then dropped. A new frame is returned.
    """
    merged = df_votes.merge(object_info, how='outer', on='name')
    has_votes = merged['vote_count'].notna()
    return merged[has_votes]

def get_answer_vote_percentage(row, entry, decimal_places=1):
    """Return the percentage of votes in `row` that went to `entry`.

    `row` maps answers to vote counts. Votes stored under the key 'None' are
    excluded from the denominator. The result is rounded to `decimal_places`;
    when no votes remain after that exclusion, None is returned.
    """
    counted = sum(row.values()) - row.get('None', 0)
    if not counted > 0:
        return None

    share = 100*row.get(entry, 0)/counted
    return round(share, decimal_places)

# Same hardcore task list and labels as defined near the top of the notebook,
# repeated here so the functions in this cell find them at module level.
tasks_hardcore = [0, 2, 1, 3, 4, 5, 9]
task_strings = list(map('T{}'.format, tasks_hardcore))

def df_votes_with_vote_percentages(df_votes, df):
    """Add one percentage column per (task, answer) pair to `df_votes`.

    For every task label in the module-level `task_strings`, and every distinct
    non-null answer found in `df[task]`, a column named '<task> % <answer in
    lowercase>' is added. It holds `get_answer_vote_percentage` of that answer
    for each row's vote-count dict in `df_votes[task]`.

    Tasks that are missing from either frame are skipped, so a frame built from
    a workflow with fewer tasks no longer raises on the absent columns.
    `df_votes` is modified in place and returned.
    """
    for task in task_strings:
        if task not in df.columns or task not in df_votes.columns:
            continue

        for entry in df[task].dropna().unique().tolist():
            # str() guards the column name against non-string answers; the lookup
            # itself still uses the answer exactly as it appears in the data.
            column = "{} % {}".format(task, str(entry).lower())
            df_votes[column] = df_votes[task].apply(
                lambda counts, entry=entry: get_answer_vote_percentage(counts, entry)
            )

    return df_votes

def make_df_votes(df, task_strings):
    """Build a per-object vote table from the stacked classification frame.

    Parameters
    ----------
    df : DataFrame
        Classifications with a 'Filename' column and one column per label in
        `task_strings`.
    task_strings : list of str
        Task column labels (e.g. 'T0') to count votes for. 'T0' must be among
        them, because the vote count is derived from it.

    Returns
    -------
    DataFrame
        One row per object: 'name', 'vote_count', one dict of answer -> count per
        task, the merged catalogue columns and the per-answer percentage columns.
    """
    gr = df[['Filename', *task_strings]].groupby('Filename')

    vals_list = []
    for name in df['Filename'].unique().tolist():
        # Fetch the object's rows once instead of once per task.
        group = gr.get_group(name)
        vals = { "name": name }
        # Iterate the labels that were passed in; the module-level hardcore list
        # used before raised for any other workflow's columns.
        for t in task_strings:
            vals[t] = group[t].value_counts().to_dict()

        vals_list.append(vals)

    df_votes = pd.DataFrame(vals_list)
    df_votes = df_votes_with_vote_count(df_votes)
    df_votes = df_votes_with_object_info(df_votes)
    df_votes = df_votes_with_vote_percentages(df_votes, df)

    return df_votes

##### Export `df_votes` to json:

In [15]:
# Build the per-object vote table for the hardcore task columns and write it to
# 'df_votes.json', a path relative to the notebook's working directory.
df_votes = make_df_votes(df, task_strings_hc)
df_to_json(df_votes, 'df_votes.json')

##### Export `df` (stacked df) to json:

In [16]:
# Replace the row labels left over from stacking, sorting and filtering with a
# fresh 0..n-1 index, then write the stacked frame to disk.
df = df.reset_index(drop=True)
df_to_json(df, 'df_stacked.json')

---

# Some statistics from the filtering process above, like # of users that saw any object multiple times, number of classifications to be filtered out, etc.

In [17]:
# Discovery: 233 users saw the same object multiple times across workflows.
print('# users that saw at least one object multiple times:', len(users_with_duplicates))

# Number of repeat views per affected user.
duplicate_count = [len(objects) for objects in duplicate_classifications.values()]

# Frequency table of repeat views: first column is the number of repeat views a
# user had, second column is how many users had exactly that many.
duplicate_count_frequency = np.unique(duplicate_count, return_counts=True)
# Discovery: there is one user that saw 4363 duplicate objects, and one that saw
# 2046 duplicates. What happened here?
print('[# of objects seen multiple times per user,  frequency]', '\n', np.column_stack(duplicate_count_frequency))

# users that saw at least one object multiple times: 233
[# of objects seen multiple times per user,  frequency] 
 [[   1  121]
 [   2   32]
 [   3   14]
 [   4    8]
 [   5    4]
 [   6    7]
 [   7    3]
 [   8    1]
 [   9    1]
 [  10    3]
 [  12    2]
 [  13    1]
 [  14    1]
 [  15    1]
 [  16    2]
 [  17    1]
 [  18    2]
 [  19    1]
 [  20    2]
 [  24    1]
 [  28    2]
 [  34    1]
 [  38    1]
 [  42    1]
 [  48    1]
 [  55    1]
 [  58    1]
 [  61    1]
 [  65    1]
 [  73    1]
 [  90    1]
 [ 102    1]
 [ 104    1]
 [ 120    1]
 [ 121    1]
 [ 142    1]
 [ 171    1]
 [ 183    1]
 [ 224    1]
 [ 231    1]
 [ 473    1]
 [ 793    1]
 [2046    1]
 [4363    1]]


### Extract usernames of users that saw more than 1000 duplicates:

In [18]:
# Names of the users with more than 1000 repeat views.
strange_users = [
    user
    for user, dupes in duplicate_classifications.items()
    if len(dupes) > 1000
]

#### Print number of votes cast per option for T0 (task 0) by these users:

In [19]:
# For each flagged user: their T0 answer counts and their total number of rows
# in the (already filtered) frame.
for user in strange_users:
    user_votes = df.query("user_name == @user")
    print('\n')
    print(user_votes['T0'].value_counts().to_dict())
    print('# votes by user:', len(user_votes))



{'galaxy': 3422, 'group of objects (cluster)': 1724, 'something else/empty center': 805}
# votes by user: 5951


{'galaxy': 3837, 'group of objects (cluster)': 1233, 'something else/empty center': 690}
# votes by user: 5760


##### Print # of 'duplicate' classifications that were filtered out:

In [20]:
# Number of classifications left in `df`, i.e. after the repeat views were removed.
len(df)

223059

In [21]:
# `to_filter` holds one classification id per repeat view, so its length is the
# number of classifications that were removed.
print('Number of classifications filtered as duplicates:', len(to_filter))

Number of classifications filtered as duplicates: 10316


Discovery: there are 10316 classifications (approx. 5% of the total — far more than I expected) made by users who had already seen that object at least once.


# @idea: plot distribution of votes per user between galaxy, group, empty center as a function of votes per user

Discovery (cells deleted): classification may have retired state without me having noticed it

## Plot properties

In [22]:
def get_cols(df, cols):
    """Return the values of the columns `cols` of `df` as an array, one row per column.

    This allows unpacking such as `x, y = get_cols(df, ['a', 'b'])`.
    """
    selected = df[cols]
    return selected.to_numpy().T

In [23]:
def fancy_plot(x, y,
   xlabel, ylabel, title,
   hist1=None, hist2=None,
   savepath=None,
   figsize=(12,8),
   invert_x=True, invert_y=True
):
    """Scatter `y` against `x` with a marginal histogram along each axis.

    The figure uses a 5x5 grid without spacing: the scatter panel fills the
    upper-right 4x4 cells, a horizontal histogram occupies the column to its
    left and a vertical histogram the row below it. Both histogram panels have
    their axes hidden and their count axis reversed, so the bars grow away from
    the scatter panel.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the scatter points.
    xlabel, ylabel, title : str
        Axis labels and title of the scatter panel.
    hist1 : array-like, optional
        Data for the histogram below the scatter panel; defaults to `x`.
    hist2 : array-like, optional
        Data for the histogram left of the scatter panel; defaults to `y`.
    savepath : str, optional
        When given, the figure is saved there at 400 dpi before it is shown.
    figsize : tuple
        Figure size passed to `plt.figure`.
    invert_x, invert_y : bool
        Reverse the corresponding axis of the scatter panel and of the
        histogram that shares it.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # Compare against None instead of testing truthiness: a NumPy array or pandas
    # Series has no unambiguous truth value, so `not hist1` raised for such input.
    if hist1 is None:
        hist1 = x
    if hist2 is None:
        hist2 = y

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(5,5, wspace=0, hspace=0)

    fr1 = fig.add_subplot(gs[:-1,1:])   # scatter panel
    fr2 = fig.add_subplot(gs[:-1, 0])   # histogram left of the scatter panel
    fr3 = fig.add_subplot(gs[-1, 1:])   # histogram below the scatter panel

    common_hist_args = {
        "bins": 50,
        "color": '#ddd',
        'histtype': 'step',
        "edgecolor": '#333'
    }

    common_args = {
        's': 6,
        "alpha": 0.6,
        "marker": "D",
        "facecolors": 'none',
        "edgecolors": "xkcd:lightish blue",
        "lw": 1
    }

    # The histograms now draw hist1/hist2, which were previously assigned but
    # never used; with the defaults this is still x and y.
    fr2.hist(hist2, orientation='horizontal', **common_hist_args)
    fr3.hist(hist1, **common_hist_args)

    fr1.scatter(x, y, **common_args)
    fr1.grid(alpha=0.3, which='both')

    fr1.set_xlabel(xlabel, bbox={'alpha': 0.75, 'color': 'white'}, fontsize=12)
    fr1.set_ylabel(ylabel, bbox={'alpha': 0.75, 'color': 'white'}, fontsize=12)

    tick_params = {
        "pad": -15,
        "left": "off",
        "labelleft": "on"
    }

    # Negative padding moves the tick labels towards the inside of the panel.
    # NOTE(review): the 'on'/'off' strings are left exactly as they were. Newer
    # Matplotlib releases expect real booleans here and may treat any non-empty
    # string as True — confirm the intended tick visibility before changing them.
    fr1.tick_params(axis="y", pad=-25, left='off', labelleft='off')
    fr1.tick_params(axis="x", **tick_params)

    if invert_x:
        fr1.set_xlim(fr1.get_xlim()[::-1])
        fr3.set_xlim(fr3.get_xlim()[::-1])

    if invert_y:
        fr1.set_ylim(fr1.get_ylim()[::-1])
        fr2.set_ylim(fr2.get_ylim()[::-1])

    # Reverse the count axes so the bars start at the edge shared with the
    # scatter panel and grow outwards.
    fr2.set_xlim(fr2.get_xlim()[::-1])
    fr3.set_ylim(fr3.get_ylim()[::-1])

    fr2.set_axis_off()
    fr3.set_axis_off()

    fr1.set_title(title)
    fig.tight_layout()

    if savepath:
        fig.savefig(savepath, dpi=400)

    plt.show()