# Analysis of the results of an MTurk experiment to annotate the reasons why sentences need citations, and comparison with the original WikiLabels experiment.

In [13]:
from IPython.display import HTML

# Render an HTML snippet whose script hides every `div.input` element once the
# document is ready, plus a link that calls code_toggle() to show/hide them again.
# The script relies on `$` being available in the page (presumably the classic
# Notebook's jQuery - TODO confirm for other front ends).
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [None]:

import pandas as pd
import holoviews as hv
from bokeh.sampledata.les_mis import data
import sys
import numpy as np
import json
import matplotlib.pyplot as plt
# Python 2 only: make UTF-8 the implicit str<->unicode codec so that non-ASCII
# sentences do not raise UnicodeEncodeError/UnicodeDecodeError further down.
# Neither the `reload` builtin nor `sys.setdefaultencoding` exists on Python 3,
# so the hack is skipped there instead of crashing the cell.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

# Citation-reason categories. A category's position in 'name' is its integer id
# (0='direct quotation', 1='statistics', ..., 7='other'). The dict is also
# turned into the node table of the chord diagrams, hence 'group' and 'index'.
true_reasons={'name':['direct quotation','statistics','controversial','opinion',
                      'life','scientific','historical','other']}
# Derive the per-category columns from the label list so the three columns
# cannot drift out of sync if a category is added or removed.
true_reasons['group']=np.zeros(len(true_reasons['name']))
true_reasons['index']=list(range(len(true_reasons['name'])))
# Languages whose task files are loaded below.
languages=['english']

# Load the HoloViews notebook extension up front. It registers the %opts/%%opts
# magics and the bokeh renderer, which the plotting cells need; loading it only
# in a later cell makes the first %%opts cell fail on a fresh kernel
# (Restart & Run All). Calling hv.extension again later is harmless.
hv.extension('bokeh')


In [2]:
# Map each statement's unique id, "<revision>_<section>_<paragraph>_<sentence>",
# to its text, for every language's task file.
raw_text={}
for l in languages:
    filename='../results_second_round/'+l+'.json'
    # Binary mode: json does the decoding itself (UTF-8 on Python 2, auto-detected
    # UTF-8/16/32 on Python >= 3.6) instead of depending on the platform locale.
    with open(filename,'rb') as f:
        data=json.load(f)
    for row in data['tasks']:
        r=row['data']
        unique='_'.join(str(r[key]) for key in
                        ('revision','section_index','paragraph_index','sentence_index'))
        # json already returns text strings. The former .decode() only round-tripped
        # them through the default codec on Python 2 and raises AttributeError on
        # Python 3, so the statement is stored as-is.
        raw_text[unique]=r['statement']

In [3]:

# Zero-initialised tallies keyed [WikiLabels category][MTurk category]:
#   aggregated_votes          - one count per individual Turker judgement
#   aggregated_votes_peritem  - one count per statement (majority category)
n_reasons=len(true_reasons['name'])
aggregated_votes={gt:{choice:0 for choice in range(n_reasons)} for gt in range(n_reasons)}
aggregated_votes_peritem={gt:{choice:0 for choice in range(n_reasons)} for gt in range(n_reasons)}

# Category name -> list of per-statement agreement scores (filled in below).
agreement={name:[] for name in true_reasons['name']}

# Statement id -> WikiLabels category id / per-category vote counts.
real={}
voted={}

# MTurk result files to read.
result_files=['result_test_'+str(i)+'.csv' for i in range(1,3)]
# Tally the Turkers' judgements from every result file.
# NOTE(review): rows are split on bare commas, so a quoted field that itself
# contains a comma shifts positional column 30. The last two columns are read
# from the end and are unaffected. Confirm against the real files before
# switching to the csv module, since that could change which field index 30 is.
for results in result_files:
    with open(results) as f:
        f.readline()  # discard the first line (presumably the CSV header)
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            row=line.split(',')
            # Statement id: the part of column 30 between its last '/' and last '.'.
            index=row[30][row[30].rfind('/')+1:row[30].rfind('.')]
            if index not in voted:
                # One counter per category; sized from the label list rather than a
                # literal 8, consistent with the tallies initialised above.
                voted[index]=np.zeros(len(true_reasons['name']))
            # Last column: quoted "<word> <n>"; n is the Turker's 1-based category.
            vote=int(row[-1].replace('"','').rstrip().split(' ')[1])
            # Second-to-last column: quoted "<text>-<n>"; n is the 1-based WikiLabels category.
            gt=int(row[-2].replace('"','').split('-')[1])
            real[index]=gt-1
            voted[index][vote-1]+=1
            aggregated_votes[gt-1][vote-1]+=1

# For every statement: take the category with the most Turker votes, score the
# agreement as (votes for that category) / 3, and record it per WikiLabels
# category. Statements whose agreement exceeds 0.4 are counted once in the
# per-item tally; those whose majority category differs from WikiLabels are
# also collected in `mismatch`.
all_agreement=[]
mismatch={'WikiLabels':[],'MTurk':[],'sentence':[],'agreement':[]}
for index,wiki_label in real.items():
    votes=voted[index]
    chosen=np.argmax(votes)
    ag=votes[chosen]/float(3)
    wiki_name=true_reasons['name'][wiki_label]

    all_agreement.append(ag)
    agreement[wiki_name].append(ag)

    if not ag>0.4:
        continue  # no agreement among the Turkers: leave the statement out
    aggregated_votes_peritem[wiki_label][chosen]+=1

    if wiki_label==chosen:
        continue  # Turkers and WikiLabels agree: nothing to report
    mismatch['WikiLabels'].append(wiki_name)
    mismatch['MTurk'].append(true_reasons['name'][chosen])
    mismatch['sentence'].append(raw_text[index])
    mismatch['agreement'].append(ag)

# Collapse each category's list of agreement scores into its mean.
# A category with no statements (e.g. 'controversial' in this experiment) gets
# NaN explicitly; averaging the empty list yields the same NaN but only via
# NumPy's empty-slice RuntimeWarning.
for reason,scores in list(agreement.items()):
    agreement[reason]=np.average(scores) if np.size(scores) else np.nan
def _tally_to_columns(tally):
    """Flatten a nested ``{real: {vote: count}}`` tally into parallel columns.

    Returns a dict with the keys 'real', 'vote' and 'counts', each a list of
    ints with one entry per (real, vote) pair, in the tally's iteration order.
    """
    columns={'real':[], 'vote':[], 'counts':[]}
    for r in tally:
        for v in tally[r]:
            columns['real'].append(int(r))
            columns['vote'].append(int(v))
            columns['counts'].append(int(tally[r][v]))
    return columns


# Link tables for the two chord diagrams: every judgement / one per statement.
plot_input=_tally_to_columns(aggregated_votes)
plot_input_peritem=_tally_to_columns(aggregated_votes_peritem)

# Heatmap input: per-item counts divided by the row total of their WikiLabels
# category (0 for a category with no statements), labelled by category name.
plot_input_normalized={'real':[], 'vote':[], 'counts':[]}
for r in aggregated_votes_peritem:
    row_counts=[aggregated_votes_peritem[r][c] for c in aggregated_votes_peritem[r]]
    tot=sum(row_counts)
    print(row_counts)  # diagnostic: raw per-item counts for this category
    for v in aggregated_votes_peritem[r]:
        count=aggregated_votes_peritem[r][v]
        plot_input_normalized['real'].append(true_reasons['name'][r])
        plot_input_normalized['vote'].append(true_reasons['name'][v])
        plot_input_normalized['counts'].append(int(count)/float(tot) if tot>0 else 0)
        if r==v:
            print(count)  # diagnostic: statements where Turkers matched WikiLabels


0.68333333333333335

## Agreement in the MTurk experiment
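Here we see the breakdown of average agreement per class. The agreement of a statement is the fraction of its 3 Turkers who chose the most voted category. The category 'controversial' has 0 entries. The overall average agreement is 68%.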

In [7]:
%%opts Bars [tools=['hover'] height=300 width=600 fontsize={'title':10, 'xlabel':15, 'ylabel':15, 'ticks':15} xrotation=45]
%%opts Bars (cmap='Spectral' edge_cmap='Spectral')
# Bar chart of the average agreement per category: `agreement` maps each
# category name (the 'Category' dimension) to its mean agreement score.
bars = hv.Bars(agreement, hv.Dimension('Category'), 'agreement')
bars

## Comparison between individual Turkers' judgements and categories assigned in WikiLabels 
The lines in this plot reflect the individual judgements of each statement in the MTurk experiment. A line starts at the category assigned to the statement by Wikipedians through WikiLabels, which also determines the line's color, and ends at the category assigned by the Turker. The more lines that start and end on the same category, the higher the agreement between Turkers and Wikipedians. There are 3 judgements per statement.

In [8]:
# Chord diagram of the individual judgements: the nodes are the categories
# (keyed by 'index'), the links are the (real, vote) pairs weighted by 'counts'.
hv.extension('bokeh')
nodes = hv.Dataset(pd.DataFrame(true_reasons), 'index')
links = pd.DataFrame(plot_input)
chord = hv.Chord((links, nodes), ['real', 'vote'], ['counts'])


In [9]:
%%opts Chord (label_text_font_size='18pt')
%%opts Chord [edge_color_index='real' label_index='name' color_index='index' height=800 width=800] 
%%opts Chord (cmap='Spectral' edge_cmap='Spectral')

# Display the chord diagram built in the previous cell. Edges are colored by
# their 'real' category and nodes are labelled with the category 'name'.
chord

## Comparison between average Turkers' judgements and categories assigned in WikiLabels 
* We computed a single category for each statement by taking the class chosen by the majority of the Turkers. If there was no majority (i.e. an agreement of 0.33, with each of the 3 Turkers choosing a different class), we excluded the statement.
* The lines in this plot reflect the categorization of each statement in the MTurk experiment. A line starts at the category assigned to the statement by Wikipedians through WikiLabels, which also determines the line's color, and ends at the category assigned by the majority of the Turkers. The more lines that start and end on the same category, the higher the agreement between Turkers and Wikipedians.

In [10]:
# Same chord diagram as above, but built from the per-statement tally (one
# count per statement). Note that `links` is rebound here; `nodes` is reused
# from the earlier chord cell.
links = pd.DataFrame.from_dict(plot_input_peritem)
chord_peritem=hv.Chord((links,nodes),['real','vote'],['counts'])
# Styling for Chord elements, set with the %opts line magic.
%opts Chord (label_text_font_size='18pt')
%opts Chord [edge_color_index='real' label_index='name' color_index='index' height=800 width=800] 
%opts Chord (cmap='Spectral' edge_cmap='Spectral')
chord_peritem

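A better representation of the plot above, in the form of a heatmap. Each row of counts is normalized by the number of statements in its WikiLabels category, so the darker a cell, the larger the share of that category's statements that Turkers assigned to the corresponding category. Dark cells on the diagonal mean that Turkers and Wikipedians categorized the statements in the same way.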

In [11]:
# Heatmap of the normalized per-statement tally: 'real' (WikiLabels category)
# against 'vote' (Turkers' category), with 'counts' as the cell value.
links_normalized=pd.DataFrame.from_dict(plot_input_normalized)
%opts HeatMap (cmap='BuPu')

%opts HeatMap [tools=['hover'] height=800 width=800 fontsize={'title':10, 'xlabel':15, 'ylabel':15, 'ticks':15} xrotation=45]

heatmap=hv.HeatMap((links_normalized),['real','vote'],['counts']).sort()
# Overlay text labels built from the heatmap itself on top of the cells.
heatmap* hv.Labels(heatmap).options(text_font_size='10pt')


In [77]:
# Export the statements whose majority MTurk category differs from the
# WikiLabels category, one row per statement, as a tab-separated file.
# `sep` is passed by keyword because newer pandas deprecates positional
# arguments after the path. The encoding is explicit so that non-ASCII
# sentences are written as UTF-8 on any interpreter.
pd.DataFrame.from_dict(mismatch).to_csv('mismatched_sentences.tsv',sep='\t',encoding='utf-8')