# Evaluation of different crowd analysis metrics


(1) Dataset filtering:

* Exclude workers based on:
    - contradiction ration (absolut thresh, batch stdv, pair stdv, total stdv)
    - worker quality score (thresholds)
    - attention check fails 
    
(2) Label aggregation:

* Majority vote
* Top vote
* CT unit-label score (tresholds)

In [3]:
from clean_annotations import clean_workers
from calculate_iaa import get_agreement
from aggregation import aggregate_binary_labels
from evaluation import evaluate_configs


from utils_data import load_experiment_data, load_expert_data, load_gold_data
from utils_analysis import sort_by_key
from utils_analysis import load_analysis, load_ct

from sklearn.metrics import precision_recall_fscore_support as p_r_f1
from collections import defaultdict
import pandas as pd


def iaa_dis_agreement(data_dict_list, expert_unit_agreement_dict):
    
    data_by_agreement = defaultdict(list)
    data_by_triple = sort_by_key(data_dict_list, ['relation', 'property', 'concept'])
    
    for t, gold_expect in expert_unit_agreement_dict.items():
        data = data_by_triple[t]
        data_by_agreement[gold_expect].extend(data)
        
    for exp, data in data_by_agreement.items():
        agreement = get_agreement(data, v=False)
        print(exp, agreement['Krippendorff'])
        

def get_expert_agreement_labels(expert_annotations):
    expert_annotations_by_unit = sort_by_key(expert_annotations, ['relation',
                                                              'property', 'concept'])
    unit_agreement_dict = dict()
    for unit, data in expert_annotations_by_unit.items():
        agreements = []
        for d in data:
            w = d['workerid']
            if not w.endswith('_test1'):
                for k in d.keys():
                    #print(k)
                    if k.startswith('disagreement_'):
                        agreements.append(k)
        n_agreement_annotations = len(agreements)
        n_agree = agreements.count('disagreement_agreement')
        prop_agreement = n_agree/n_agreement_annotations

        if prop_agreement == 1.0:
            unit_agreement_dict[unit] = 'agreement'
        elif 'disagreement_agreement' in agreements:
            unit_agreement_dict[unit] = 'possible_disagreement'
        else:
            unit_agreement_dict[unit] = 'disagreement'
    return unit_agreement_dict




def get_gold_dis_agreement(gold, agreement_labels):
    
    gold_agree = defaultdict(list)
    
    for d in gold:
        t = f"{d['relation']}-{d['property']}-{d['concept']}"
        label = agreement_labels[t]
        gold_agree[label].append(d)
    return gold_agree

    


# Total evaluation

In [4]:
# load gold

gold = load_gold_data()
print(len(gold))
# remove instances without gold
gold = [d for d in gold if d['answer'] != 'NOGOLD']
print(len(gold))



154
131


In [5]:
# load crowd:
run = '*'
group = 'experiment*'
n_q = '*'
batch = '*'
crowd = load_experiment_data(run, group, n_q, batch)

Discarded 655.0 annotations.


In [7]:
ct_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1]
stds = [ 0.5, 1, 1.5, 2]
overview_dicts = evaluate_configs(gold, crowd, ct_thresholds, stds)
df =  pd.DataFrame(overview_dicts) 
df_total = df.sort_values(by=['relations-f1'], ascending=False)[['filtering',
                                                           'filtering_unit',
                                                           'n_stdv',
                                                           'aggregation',
                                                            'relations-f1',
                                                            'relations-p',
                                                           'relations-r',
                                                           'alpha', 'relations-coverage']]
df_total.round(2).to_csv('../analyses/evaluation_accuracy_full.csv')
df_total

----Label distribution----
True: 21
False 110
----------------------------
131 17917 1491
aggretation
no filtering - different aggretation methods
cleaning and aggregation


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


clean all contradictory annotations


Unnamed: 0,filtering,filtering_unit,n_stdv,aggregation,relations-f1,relations-p,relations-r,alpha,relations-coverage
28,contradictions,batch,0.5,majority_vote,0.876159,0.917413,0.862595,0.205896,1.0
48,contradictions,total,1,majority_vote,0.859888,0.888003,0.847328,0.213531,1.0
60,exclude_contradictory_annotations,-,-,majority_vote,0.854730,0.866298,0.847328,0.337895,1.0
52,contradictions,total,1.5,majority_vote,0.853606,0.885221,0.839695,0.204368,1.0
44,contradictions,total,0.5,majority_vote,0.852112,0.877439,0.839695,0.232688,1.0
36,contradictions,batch,1.5,majority_vote,0.849769,0.899538,0.832061,0.197188,1.0
32,contradictions,batch,1,majority_vote,0.848639,0.890882,0.832061,0.191913,1.0
16,contradictions,pair,1,majority_vote,0.839664,0.871849,0.824427,0.198477,1.0
12,contradictions,pair,0.5,majority_vote,0.827994,0.845697,0.816794,0.212183,1.0
56,contradictions,total,2,majority_vote,0.827289,0.866747,0.809160,0.199709,1.0


In [8]:
from collections import defaultdict
# Filter table for paper
from collections import Counter
# include: top thee

# top config of each filtering-voting combo
selected_rows = []
res_dicts = df_total.to_dict('records')

selected_rows.extend(res_dicts[:3])


top_dict = defaultdict(list)

for d in res_dicts:
    ag = d['aggregation']
    if ag.startswith('uas'):
        ag, thresh = ag.split('-')
    f_a = (d['filtering'], ag)
    f1 = d['relations-f1']
    top_dict[f_a].append((f1, d))


for f_a, rows in top_dict.items():
    f1s = [f1 for f1, d in rows]
    top = max(f1s)
    for f1, d in rows:
        if f1 == top:
            if d not in selected_rows:
                selected_rows.append(d)
           
df = pd.DataFrame(selected_rows).sort_values(by=['relations-f1'], ascending=False)[['filtering',
                                                           'filtering_unit',
                                                           'n_stdv',
                                                           'aggregation',
                                                            'relations-f1',
                                                            'relations-p',
                                                           'relations-r',
                                                           'alpha', 'relations-coverage']]
df


Unnamed: 0,filtering,filtering_unit,n_stdv,aggregation,relations-f1,relations-p,relations-r,alpha,relations-coverage
0,contradictions,batch,0.5,majority_vote,0.876159,0.917413,0.862595,0.205896,1.0
1,contradictions,total,1,majority_vote,0.859888,0.888003,0.847328,0.213531,1.0
2,exclude_contradictory_annotations,-,-,majority_vote,0.85473,0.866298,0.847328,0.337895,1.0
9,ct_wqs,pair,0.5,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0
15,-,-,-,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0
14,ct_wqs,batch,1,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0
13,ct_wqs,total,2,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0
11,ct_wqs,total,1,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0
10,ct_wqs,total,0.5,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0
12,ct_wqs,total,1.5,majority_vote,0.814956,0.862047,0.793893,0.185651,1.0


In [4]:
#print(df.round(2).to_latex(index = False))

 # Evaluation with respect to expected worker behavior

In [6]:
from utils_analysis import sort_by_key

# get agreement data

run = "4"
#group1 = 'reason_agreement_expert_inspection1'
group = 'reason_agreement*_expert_inspection*'
batch = '*'
n_q = '*'

#run4-group_reason_agreement_expert_inspection1
#expert_annotations = load_expert_data(run, group, n_q, batch)
#expert_annotations2 = load_expert_data(run, group2, n_q, batch)
#expert_annotations = expert_annotations1 + expert_annotations2
#expert_unit_agreement_dict = get_expert_agreement_labels(expert_annotations)
#agreement_labels = get_expert_agreement_labels(expert_annotations)

gold_by_agreement = sort_by_key(gold, ['expected_agreement'])
print('\n--- agreement categories---')
for l, data in gold_by_agreement.items():
    print(l, len(data))


--- agreement categories---
disagreement 34
agreement 49
possible_disagreement 48


In [10]:
# evaluate agree category:
gold_agree = gold_by_agreement['agreement']
gold_poss_disagree = gold_by_agreement['possible_disagreement']
gold_disagree = gold_by_agreement['disagreement']
print(len(gold_agree))
print(len(gold_poss_disagree))
print(len(gold_disagree))

NameError: name 'gold_by_agreement' is not defined

In [13]:
#for d in gold_agree:
 #   print(d['quid'], d['answer'])

In [15]:
for d in gold_poss_disagree:
    if d['quid'] == 'typical_of_concept-rhino-black':
        print(d['quid'], d['answer'])
        print(d.keys())



typical_of_concept-rhino-black false
dict_keys(['answer', 'completionurl', 'concept', 'disagreement_cnt', 'expected_agreement', 'property', 'quid', 'relation', 'workerid'])


In [8]:
for d in gold_disagree:
    print(d['quid'], d['answer'])

impossible-shovel-roll false
implied_category-freebooter-dangerous true
rare-carrot-red true
variability_limited-recliner-square false
variability_open-recliner-square false
implied_category-recliner-square false
typical_of_concept-recliner-square false
rare-recliner-square false
impossible-leopard-yellow false
variability_limited-pen-round true
rare-washer-roll false
unusual-washer-roll true
rare-acaridae-dangerous true
unusual-acaridae-dangerous true
creative-acaridae-dangerous false
implied_category-stock-fly false
typical_of_property-stock-fly false
typical_of_concept-stock-fly false
afforded_unusual-stock-fly false
afforded_usual-stock-fly false
variability_limited-stock-fly false
creative-cruiser-wheels false
variability_open-cruiser-wheels false
rare-cruiser-wheels false
impossible-cruiser-wheels false
creative-rhino-black false
rare-rhino-black false
creative-leopard-yellow false
impossible-recliner-square false
afforded_usual-plaice-lay_eggs true
unusual-shovel-roll true
impos

In [35]:
# disagree vs agree

overview_dicts_agree = evaluate_configs(gold_agree, crowd)
for d in overview_dicts_agree:
    d['behav.'] = 'agree'
    
gold_disagree_all = []
gold_disagree_all.extend(gold_poss_disagree)
gold_disagree_all.extend(gold_disagree)


overview_dicts_disagree_all = evaluate_configs(gold_disagree_all, crowd)
for d in overview_dicts_disagree_all:
    d['behav.'] = 'disagree'
    
#overview_dicts_disagree = evaluate_configs(gold_disagree, crowd)
#for d in overview_dicts_disagree:
 #   d['behav.'] = 'disagree'
    #overview_dicts_total.append(d)

df_agree = pd.DataFrame(overview_dicts_agree).sort_values(by=['relations-f1'], ascending=False)
dicts_agree = df_agree.to_dict("records")

df_disagree = pd.DataFrame(overview_dicts_disagree).sort_values(by=['relations-f1'], ascending=False)
dicts_disagree = df_disagree.to_dict("records")


#df_disagree = pd.DataFrame(overview_dicts_disagree).sort_values(by=['relations-f1'], ascending=False)
#dicts_disagree = df_disagree.to_dict("records")

#df_agree

----Label distribution----
True: 6
False 43
----------------------------
49 17917 608
aggretation
no filtering - different aggretation methods
cleaning and aggregation
clean all contradictory annotations
----Label distribution----
True: 15
False 67
----------------------------
82 17917 883
aggretation
no filtering - different aggretation methods
cleaning and aggregation
clean all contradictory annotations


In [9]:
overview_dicts_total = []
overview_dicts_total.extend(overview_dicts_agree)
overview_dicts_total.extend(overview_dicts_disagree)
#overview_dicts_total.extend(overview_dicts_poss_disagree)

df =  pd.DataFrame(overview_dicts_total) 
df = df.sort_values(by=['relations-f1'], ascending=False)[['behav.', 'filtering',
                                                           'filtering_unit',
                                                           'n_stdv',
                                                           'aggregation',
                                                            'relations-f1',
                                                            'relations-p',
                                                           'relations-r',
                                                           'alpha']]
df.round(2).to_csv('../analyses/evaluation_accuracy_disagree.csv')
df

NameError: name 'overview_dicts_agree' is not defined

In [37]:
# Get df for paper


configs_to_keep = []
for d in selected_rows:
    config = (d['filtering'], d['filtering_unit'], d['aggregation'], d['n_stdv'])
    config = [c  if c is not None else '-' for c in config]
    #if x is not None else ''
    if d['filtering'] != 'ct_wqs' and d['aggregation'] != 'top_vote':
        #print(config)
        print(config)
        configs_to_keep.append(config)

# add top three
selected_rows_ag_dis = []
selected_rows_ag_dis.extend(dicts_agree[:3])
selected_rows_ag_dis.extend(dicts_disagree[:3])
#for d in overview_dicts_agree:

#configs_agree = [(d['filtering'], d['filtering_unit'], d['aggregation'], d['n_stdv'])
 #                  for d in dicts_agree]


for d in dicts_agree:
    config = (d['filtering'], d['filtering_unit'], d['aggregation'], d['n_stdv'])
    config = [c  if c is not None else '-' for c in config]
    if config in configs_to_keep:
        print('keeping', config)
        if d not in selected_rows_ag_dis :
            selected_rows_ag_dis.append(d)
        #else:
         #   print('already collected')
    #else:
     #   print(config, 'not found')
 
        
for d in dicts_disagree:
    config = (d['filtering'], d['filtering_unit'], d['aggregation'], d['n_stdv'])
    config = [c  if c is not None else '-' for c in config]
    if config in configs_to_keep and d not in selected_rows_ag_dis:
        selected_rows_ag_dis.append(d)
    

df = pd.DataFrame(selected_rows_ag_dis).sort_values(by=['relations-f1'], ascending=False)[[
                                                            'behav.',
                                                           'aggregation',
                                                            'filtering',
                                                            'filtering_unit',
                                                            'n_stdv',
                                                           'relations-f1',
                                                            'relations-p',
                                                           'relations-r', 'alpha']]
df 

['contradictions', 'batch', 'majority_vote', 0.5]
['contradictions', 'total', 'majority_vote', 1]
['exclude_contradictory_annotations', '-', 'majority_vote', '-']
['-', '-', 'uas-0.65', '-']
['-', '-', 'majority_vote', '-']
keeping ['-', '-', 'uas-0.65', '-']
keeping ['contradictions', 'total', 'majority_vote', 1]
keeping ['contradictions', 'batch', 'majority_vote', 0.5]
keeping ['exclude_contradictory_annotations', '-', 'majority_vote', '-']
keeping ['-', '-', 'majority_vote', '-']


Unnamed: 0,behav.,aggregation,filtering,filtering_unit,n_stdv,relations-f1,relations-p,relations-r,alpha
0,agree,majority_vote,contradictions,pair,0.5,0.909816,0.944341,0.897959,0.277441
1,agree,uas-0.65,-,-,-,0.906319,0.923639,0.897959,0.225307
2,agree,uas-0.7,-,-,-,0.901284,0.905734,0.897959,0.225307
6,agree,majority_vote,contradictions,total,1,0.893367,0.938776,0.877551,0.278331
3,disagree,majority_vote,contradictions,batch,1.5,0.885784,0.895838,0.882353,0.201318
4,disagree,majority_vote,contradictions,batch,1,0.885784,0.895838,0.882353,0.201797
5,disagree,majority_vote,contradictions,batch,0.5,0.885784,0.895838,0.882353,0.214447
7,agree,majority_vote,contradictions,batch,0.5,0.861015,0.930029,0.836735,0.276553
8,agree,majority_vote,exclude_contradictory_annotations,-,-,0.861015,0.930029,0.836735,0.324119
10,disagree,majority_vote,-,-,-,0.855311,0.859314,0.852941,0.207792


In [28]:
print(df.round(2).to_latex(index = False))

\begin{tabular}{lllllrrrr}
\toprule
   behav. &    aggregation &                          filtering & filtering\_unit & n\_stdv &  relations-f1 &  relations-p &  relations-r &  alpha \\
\midrule
    agree &  majority\_vote &                     contradictions &           pair &    0.5 &          0.91 &         0.94 &         0.90 &   0.28 \\
    agree &       uas-0.65 &                                  - &              - &      - &          0.91 &         0.92 &         0.90 &   0.23 \\
    agree &        uas-0.7 &                                  - &              - &      - &          0.90 &         0.91 &         0.90 &   0.23 \\
    agree &  majority\_vote &                     contradictions &          total &      1 &          0.89 &         0.94 &         0.88 &   0.28 \\
 disagree &  majority\_vote &                     contradictions &          batch &    0.5 &          0.89 &         0.91 &         0.88 &   0.16 \\
    agree &  majority\_vote &                     contradictio