In [1]:
import pandas as pd
import re

In [2]:
df_parc = pd.read_csv('./../../../Data/parc_error_analysis.csv', sep='\t', index_col=0)
df_polnear = pd.read_csv('./../../../Data/polnear_error_analysis.csv', sep='\t', index_col=0)
df_polnear.drop(['filename_y', 'doc_token_number_y'], axis=1, inplace=True)
df_polnear.rename(columns={'filename_x':'filename'}, inplace=True)

In [3]:
def clean_att_columns(df):
    for column_name in ['gold_attribution', 'pred_attribution']:
        cleaned_atts = []
        for element in df[column_name]:
            if '-NE-' in element:
                element = re.sub("[B,I]-.+-NE-.+", "", element)
                #element = element.replace('.-...-NE-..', '')
            cleaned_atts.append(element)

        df[column_name] = cleaned_atts
        df[column_name] = df[column_name].apply(lambda row: row.strip('_ '))
        


In [4]:
clean_att_columns(df_parc)
clean_att_columns(df_polnear)

In [5]:
df_parc.head()

Unnamed: 0,filename,token,POS,doc_token_number,sentence_number,sentence_token_number,ne_info,relevant_ne,cue_label,candidate_source_label,gold_attribution,pred_attribution
0,wsj_2400.tsv,The,DT,1,1,1,O,0,0,O,,
1,wsj_2400.tsv,economy,NN,2,1,2,O,0,0,O,,
2,wsj_2400.tsv,'s,POS,3,1,3,O,0,0,O,,
3,wsj_2400.tsv,temperature,NN,4,1,4,O,0,0,O,,
4,wsj_2400.tsv,will,MD,5,1,5,O,0,0,O,,


In [6]:
df_polnear.head()

Unnamed: 0,filename,token,POS,doc_token_number_x,sentence_number,sentence_token_number,ne_info,relevant_ne,cue_label,candidate_source_label,gold_attribution,pred_attribution
0,breitbart_2016-09-12_stealth-over-health-hilla...,Stealth,NNP,1,1,1,O,0,0,O,,
1,breitbart_2016-09-12_stealth-over-health-hilla...,Over,IN,2,1,2,O,0,0,O,,
2,breitbart_2016-09-12_stealth-over-health-hilla...,Health,NNP,3,1,3,O,0,0,O,,
3,breitbart_2016-09-12_stealth-over-health-hilla...,:,:,4,1,4,O,0,0,O,,
4,breitbart_2016-09-12_stealth-over-health-hilla...,Hillary,NNP,5,1,5,B-PERSON,1,0,O,,


In [7]:
def unmatched_contents(df):
    filenames = set(df['filename'])

    count_unmatched = 0
    count_total = 0
    for filename in filenames:
        df_per_file = df[df['filename'] == filename]
        att_label_set = set(df_per_file['pred_attribution'])

        content_ids = set()
        source_ids = set()
        
        for att_label in att_label_set:
            att_split = att_label.split('-')
            if len(att_split) < 4:
                continue
            if att_split[1] == 'CONTENT':
                content_ids.add(att_split[3])
            elif att_split[1] == 'SOURCE':
                source_ids.add(att_split[3])

        count_unmatched += len(content_ids - source_ids)
        count_total += len(content_ids)
        if len(content_ids - source_ids) != 0:
            print(filename)

    return (count_unmatched, count_total)

In [8]:
def count_gold_candidate_sources(df, num_att_id=3):
    filenames = set(df['filename'])
    
    missed_count = 0
    partial_count = 0
    full_count = 0

    for filename in filenames:
        df_per_file = df[df['filename'] == filename]
        cand_source_dict = dict()

        for index, row in df_per_file.iterrows():
            att_split = row['gold_attribution'].split('-')
            if len(att_split) != num_att_id+1:
                continue
            if att_split[1] == 'SOURCE':
                att_id = att_split[num_att_id]
                if att_id not in cand_source_dict:
                    cand_source_dict[att_id] = set()
                cand_source_dict[att_id].add(row['candidate_source_label'])


        for cand_labels in cand_source_dict.values():
            if 'O' in cand_labels and len(cand_labels) == 1:
                missed_count += 1
            elif 'O' in cand_labels and len(cand_labels) > 1:
                partial_count += 1
            elif 'O' not in cand_labels:
                full_count += 1
                #print(filename, source_id, cand_labels)

    return (missed_count, partial_count, full_count)



In [9]:
print('PARC \nFiles with missing sources:')
count_unmatched, count_total = unmatched_contents(df_parc)
print('\nPrecentage of contents which do not have a source linked to it')
print(count_unmatched * 100 /count_total)

PARC 
Files with missing sources:
wsj_2412.tsv
wsj_2407.tsv
wsj_2417.tsv
wsj_2450.tsv
wsj_2413.tsv
wsj_2425.tsv
wsj_2418.tsv

Precentage of contents which do not have a source linked to it
1.8480492813141685


In [10]:
print('POLNEAR \nFiles with missing sources:')
count_unmatched, count_total = unmatched_contents(df_polnear)
print('\nPrecentage of contents which do not have a source linked to it')
print(count_unmatched * 100 /count_total)

POLNEAR 
Files with missing sources:
wash-post_2016-09-18_e-u-leaders-weigh-plans-for-grea.tsv
politico_2016-09-25_poll-clinton-and-trump-in-dead-h.tsv
politico_2016-09-23_partisan-fireworks-over-clinton-.tsv
usa-today_2016-09-08_candidates-vie-for-vets-support-.tsv
politico_2016-09-30_flight-attendants-union-endorses.tsv

Precentage of contents which do not have a source linked to it
0.585480093676815


In [11]:
missed_count, partial_count, full_count = count_gold_candidate_sources(df_polnear, num_att_id=2)
total = missed_count + partial_count + full_count
print("POLNEAR")
print('Percentage of sources which were missed completely, partially or fully in the candidate source list')
print('missed:', missed_count * 100 /total)
print('partial:', partial_count * 100 /total)
print('full:', full_count * 100 /total)

POLNEAR
Percentage of sources which were missed completely, partially or fully in the candidate source list
missed: 4.312114989733059
partial: 32.49486652977413
full: 63.19301848049281


In [12]:
missed_count, partial_count, full_count = count_gold_candidate_sources(df_parc, num_att_id=3)
total = missed_count + partial_count + full_count
print("PARC")
print('Percentage of sources which were missed completely, partially or fully in the candidate source list')
print('missed:', missed_count * 100 /total)
print('partial:', partial_count * 100 /total)
print('full:', full_count * 100 /total)

PARC
Percentage of sources which were missed completely, partially or fully in the candidate source list
missed: 4.8
partial: 46.8
full: 48.4
