# Evaluation of different crowd analysis metrics


(1) Dataset filtering:

* Exclude workers based on:
    - contradiction ratio (absolute threshold, batch stdv, pair stdv, total stdv)
    - worker quality score (thresholds)
    - attention check fails 
    
(2) Label aggregation:

* Majority vote
* Top vote
* CT unit-label score (thresholds)

# Aggregate labels and evaluate against gold

In [3]:
import pandas as pd

from evaluation import evaluate 
from aggregation import aggregate_binary_labels
from load_data import load_expert_data
from load_data import load_experiment_data
from clean_annotations import clean_worker_cont_rate
from calculate_iaa import get_agreement 

def get_performance_overview(data_dict_list_exp, run, group, n_q, batch):
    """Score each cleaning/aggregation combination of the crowd data against the expert labels.

    The crowd annotations are loaded with
    ``load_experiment_data(run, group, n_q, batch)``. They are evaluated once
    uncleaned ('raw'/'raw') and once for every pairing of a non-raw unit
    ('pair', 'batch', 'total') with a numeric stdv setting, where
    ``clean_worker_cont_rate`` is applied first. Pairings in which exactly one
    of the two settings is 'raw' are skipped.

    For every retained setting, each vote name is printed and evaluated twice
    (``label='relation'`` and ``label='level'``) against the aggregated expert
    labels.

    Returns:
        A list of dicts, one per (cleaning setting, vote) pair, with the keys
        'cleaning', 'aggregation', 'f1_relation', 'p_relation', 'r_relation',
        'f1_level', 'iaa' and 'iaa_levels'.
    """
    crowd_annotations = load_experiment_data(run, group, n_q, batch)

    cleaning_units = ('raw', 'pair', 'batch', 'total')
    stdv_settings = ('raw', 0.5, 1, 1.5, 2)
    vote_names = (
        'majority_vote',
        'top_vote',
        'ct_vote_0.5',
        'ct_vote_0.6',
        'ct_vote_0.7',
        'ct_vote_0.8',
        'ct_vote_0.9',
    )

    rows = []
    gold_labels = aggregate_binary_labels(data_dict_list_exp)

    settings = ((u, s) for u in cleaning_units for s in stdv_settings)
    for unit, n_stdv in settings:
        unit_is_raw = unit == 'raw'
        stdv_is_raw = str(n_stdv) == 'raw'

        # Only "both raw" (no cleaning) and "neither raw" (real cleaning)
        # are meaningful; a half-raw pairing has nothing to evaluate.
        if unit_is_raw != stdv_is_raw:
            continue

        if unit_is_raw:
            cleaned_annotations = crowd_annotations
        else:
            cleaned_annotations = clean_worker_cont_rate(
                crowd_annotations, run, group, batch, unit, n_stdv
            )
        crowd_labels = aggregate_binary_labels(cleaned_annotations)

        if crowd_labels != []:
            # Agreement depends on the cleaned data only, not on the vote,
            # so it is computed once per setting.
            agreement = get_agreement(
                cleaned_annotations,
                collapse_relations=False,
                v=False,
                disable_kappa=True,
            )
            agreement_levels = get_agreement(
                cleaned_annotations,
                collapse_relations='levels',
                v=False,
                disable_kappa=True,
            )
            cleaning_name = f'cont_rate_{unit}_{n_stdv}'

            for vote_name in vote_names:
                print(vote_name)
                relation_scores = evaluate(
                    gold_labels, crowd_labels, vote=vote_name, label='relation'
                )
                level_scores = evaluate(
                    gold_labels, crowd_labels, vote=vote_name, label='level'
                )
                # TODO: write a function to get the IAA of the selected labels only.
                rows.append({
                    'cleaning': cleaning_name,
                    'aggregation': vote_name,
                    'f1_relation': relation_scores['f1'],
                    'p_relation': relation_scores['p'],
                    'r_relation': relation_scores['r'],
                    'f1_level': level_scores['f1'],
                    'iaa': agreement['Krippendorff'],
                    'iaa_levels': agreement_levels['Krippendorff'],
                })
    return rows

# Selection shared by the expert and the crowd data: run 4, all question
# counts, all batches.
run = '4'
n_q = '*'
batch = '*'

# Expert annotations serve as the gold standard.
group = 'expert_inspection1'
data_dict_list_exp = load_expert_data(run, group, n_q, batch)

# Crowd annotations to be evaluated against the expert labels.
group = 'experiment2'
overview_dicts = get_performance_overview(
    data_dict_list_exp, run, group, n_q, batch
)

# One row per (cleaning setting, aggregation vote) combination.
df = pd.DataFrame(overview_dicts)

Discarded 0.0 annotations.
Discarded 0.0 annotations.
majority_vote
top_vote
ct_vote_0.5
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False True
True False
False False
False False
False False
ct_vote_0.6
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False False
True False
False False
False False
False False
ct_vote_0.7
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True False
False False
False False
False False
False False
False False
True False
False False
True False
False False
F

  'precision', 'predicted', average, warn_for)


majority_vote
top_vote
ct_vote_0.5
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False True
True False
False False
False False
False False
ct_vote_0.6
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False False
True False
False False
False False
False False
ct_vote_0.7
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True False
False False
False False
False False
False False
False False
True False
False False
True False
False False
False False
False False
ct_vote_0.8
False False
False F

True False
False False
False False
False False
False False
False False
True False
False False
True False
False False
False False
False False
ct_vote_0.9
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True False
False False
False False
False False
False False
False False
True False
False False
True False
False False
False False
False False
majority_vote
top_vote
ct_vote_0.5
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False True
True False
False False
False False
False False
ct_vote_0.6
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False F

majority_vote
top_vote
ct_vote_0.5
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False True
True False
False False
False False
False False
ct_vote_0.6
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True True
False False
False False
False False
False False
False False
True True
False False
True False
False False
False False
False False
ct_vote_0.7
False False
False False
False False
False False
True False
True False
True False
False False
True False
False False
False False
False False
False False
False False
True False
False False
False False
False False
False False
False False
True False
False False
True False
False False
False False
False False
ct_vote_0.8
False False
False F

In [4]:
# Rank the settings by relation F1 (best first) and round for display.
df = (
    df.sort_values(by='f1_relation', ascending=False)
      .round(decimals=2)
)
df

Unnamed: 0,aggregation,cleaning,f1_level,f1_relation,iaa,iaa_levels,p_relation,r_relation
64,top_vote,cont_rate_total_0.5,0.91,0.85,0.39,0.56,0.89,0.87
71,top_vote,cont_rate_total_1,0.91,0.85,0.38,0.55,0.89,0.87
77,majority_vote,cont_rate_total_1.5,0.91,0.82,0.38,0.55,0.83,0.83
35,majority_vote,cont_rate_batch_0.5,0.91,0.82,0.41,0.68,0.83,0.83
70,majority_vote,cont_rate_total_1,0.91,0.82,0.38,0.55,0.83,0.83
63,majority_vote,cont_rate_total_0.5,0.91,0.82,0.39,0.56,0.83,0.83
84,majority_vote,cont_rate_total_2,0.91,0.82,0.37,0.55,0.83,0.83
14,majority_vote,cont_rate_pair_1,0.83,0.79,0.41,0.63,0.79,0.80
78,top_vote,cont_rate_total_1.5,0.91,0.75,0.38,0.55,0.84,0.80
85,top_vote,cont_rate_total_2,0.91,0.75,0.37,0.55,0.84,0.80


In [None]:
# Print the overview table as LaTeX source, without the index column.
print(df.to_latex(index=False))

# Evaluate individual workers