In [1]:
import pandas as pd
import re

In [2]:
# Load the two tab-separated error-analysis tables; the first column of each
# file becomes the index.
df_parc = pd.read_csv('./../../../Data/parc_error_analysis.csv', sep='\t', index_col=0)

# The POLNEAR table carries duplicated `_x`/`_y` columns (presumably left over
# from an upstream merge -- TODO confirm): the `_y` copies are discarded and
# `filename_x` is renamed to the plain `filename` used by the rest of the notebook.
df_polnear = (
    pd.read_csv('./../../../Data/polnear_error_analysis.csv', sep='\t', index_col=0)
    .drop(columns=['filename_y', 'doc_token_number_y'])
    .rename(columns={'filename_x': 'filename'})
)

In [18]:
def clean_att_columns(df):
    """Strip named-entity labels and padding from the attribution columns.

    Rewrites the 'gold_attribution' and 'pred_attribution' columns of `df`
    in place. For every cell that contains '-NE-', the text matched by the
    NE-label pattern is removed; afterwards leading/trailing underscores and
    spaces are stripped from every cell.

    Args:
        df: DataFrame with string-valued 'gold_attribution' and
            'pred_attribution' columns. It is modified in place.

    Returns:
        None.
    """
    # NOTE(review): `[B,I]` also matches a literal comma, and both `.+` are
    # greedy, so a match runs from the first B-/I- prefix to the end of the
    # cell -- any other label sharing the cell with an -NE- label is removed
    # as well. Confirm this is intended.
    ne_label_pattern = "[B,I]-.+-NE-.+"

    def _drop_ne_labels(label):
        # Cells without an NE marker are passed through untouched.
        if '-NE-' not in label:
            return label
        return re.sub(ne_label_pattern, "", label)

    for column_name in ('gold_attribution', 'pred_attribution'):
        without_ne = [_drop_ne_labels(label) for label in df[column_name]]
        # Removing a label can leave separator characters behind; trim them.
        df[column_name] = [label.strip('_ ') for label in without_ne]
        
# Clean the attribution columns of both corpora (each frame is modified in place).
for corpus_df in (df_parc, df_polnear):
    clean_att_columns(corpus_df)

In [19]:
def unmatched_contents(df, num_att_id=3):
    """Count predicted CONTENT ids that have no SOURCE with the same id in the same file.

    Rows are grouped by 'filename'. Within each file, every distinct
    'pred_attribution' label is split on '-'; the second field is the role
    ('CONTENT' or 'SOURCE') and the field at position `num_att_id` is the
    attribution id. Labels with too few fields to contain that position are
    ignored. A CONTENT id is "unmatched" when no SOURCE label in the same
    file carries the same id.

    The name of every file with at least one unmatched CONTENT id is printed,
    in sorted filename order so the output is reproducible across runs.

    Args:
        df: DataFrame with 'filename' and string-valued 'pred_attribution'
            columns.
        num_att_id: index of the attribution id within the '-'-split label.
            Defaults to 3, the position this function has always used.

    Returns:
        Tuple `(count_unmatched, count_total)`: the number of unmatched
        CONTENT ids and the total number of CONTENT ids, summed over files.
    """
    count_unmatched = 0
    count_total = 0

    # groupby splits the frame once (instead of re-filtering it per file) and
    # yields the files in sorted order; iterating a set of strings would give
    # a different print order on every kernel restart.
    for filename, df_per_file in df.groupby('filename', sort=True):
        content_ids = set()
        source_ids = set()

        for att_label in set(df_per_file['pred_attribution']):
            att_split = att_label.split('-')
            # Too short to carry an id at the requested position.
            if len(att_split) <= num_att_id:
                continue
            role = att_split[1]
            if role == 'CONTENT':
                content_ids.add(att_split[num_att_id])
            elif role == 'SOURCE':
                source_ids.add(att_split[num_att_id])

        unmatched_ids = content_ids - source_ids
        count_unmatched += len(unmatched_ids)
        count_total += len(content_ids)
        if unmatched_ids:
            print(filename)

    return (count_unmatched, count_total)

In [20]:
# Print the (unmatched, total) predicted-content counts per corpus, PARC first;
# unmatched_contents itself prints the affected filenames before each tuple.
for corpus_df in (df_parc, df_polnear):
    print(unmatched_contents(corpus_df))

wsj_2425.tsv
wsj_2450.tsv
wsj_2412.tsv
wsj_2407.tsv
wsj_2413.tsv
wsj_2417.tsv
wsj_2418.tsv
(9, 487)
politico_2016-09-23_partisan-fireworks-over-clinton-.tsv
usa-today_2016-09-08_candidates-vie-for-vets-support-.tsv
politico_2016-09-25_poll-clinton-and-trump-in-dead-h.tsv
wash-post_2016-09-18_e-u-leaders-weigh-plans-for-grea.tsv
politico_2016-09-30_flight-attendants-union-endorses.tsv
(5, 854)


In [43]:
def count_gold_candidate_sources(df, num_att_id=3):
    """Print how gold CONTENT spans are covered by candidate-source labels.

    For each file (rows sharing a 'filename'), every row whose
    'gold_attribution' splits on '-' into exactly `num_att_id + 1` fields and
    whose second field is 'CONTENT' contributes its 'candidate_source_label'
    to the set kept for that content id (the field at index `num_att_id`).
    Each content id is then classified by its label set:

    * only 'O'                -> missed
    * 'O' plus other labels   -> partial
    * no 'O' at all           -> counted in the third figure

    Args:
        df: DataFrame with 'filename', 'gold_attribution' (strings) and
            'candidate_source_label' columns.
        num_att_id: index of the attribution id within the '-'-split gold
            label; labels must have exactly `num_att_id + 1` fields.

    Returns:
        None. The three counts are printed on one line, space-separated, in
        the order missed, partial, no-'O'.
    """
    expected_parts = num_att_id + 1
    n_missed = 0
    n_partial = 0
    n_without_o = 0

    for _, file_rows in df.groupby('filename', sort=False):
        # content id -> set of candidate-source labels seen on its rows
        labels_by_content = {}

        label_pairs = zip(file_rows['gold_attribution'],
                          file_rows['candidate_source_label'])
        for gold_label, candidate_label in label_pairs:
            parts = gold_label.split('-')
            if len(parts) == expected_parts and parts[1] == 'CONTENT':
                labels_by_content.setdefault(parts[num_att_id], set()).add(candidate_label)

        for seen_labels in labels_by_content.values():
            if 'O' not in seen_labels:
                n_without_o += 1
            elif len(seen_labels) == 1:
                n_missed += 1
            else:
                n_partial += 1

    print(n_missed, n_partial, n_without_o)



In [35]:
# Show the rows of one POLNEAR article that carry a (non-empty) gold attribution label.
is_target_file = df_polnear['filename'] == 'breitbart_2016-09-30_the-latest-trump-urges-supporter.tsv'
has_gold_label = df_polnear['gold_attribution'] != ''
df_polnear.loc[is_target_file & has_gold_label]

Unnamed: 0,filename,token,POS,doc_token_number_x,sentence_number,sentence_token_number,ne_info,relevant_ne,cue_label,candidate_source_label,gold_attribution,pred_attribution
2674,breitbart_2016-09-30_the-latest-trump-urges-su...,Trump,NNP,4,1,4,S-PERSON,1,0,B,B-SOURCE-1,
2675,breitbart_2016-09-30_the-latest-trump-urges-su...,urges,VBZ,5,1,5,O,0,1,O,B-CUE-1,
2676,breitbart_2016-09-30_the-latest-trump-urges-su...,supporters,NNS,6,1,6,O,0,0,B,B-CONTENT-1,
2677,breitbart_2016-09-30_the-latest-trump-urges-su...,to,TO,7,1,7,O,0,0,O,I-CONTENT-1,
2678,breitbart_2016-09-30_the-latest-trump-urges-su...,monitor,VB,8,1,8,O,0,0,O,I-CONTENT-1,
2679,breitbart_2016-09-30_the-latest-trump-urges-su...,polling,NN,9,1,9,O,0,0,B,I-CONTENT-1,
2680,breitbart_2016-09-30_the-latest-trump-urges-su...,places,NNS,10,1,10,O,0,0,B,I-CONTENT-1,
2702,breitbart_2016-09-30_the-latest-trump-urges-su...,Donald,NNP,32,2,21,B-PERSON,1,0,B,B-SOURCE-3,
2703,breitbart_2016-09-30_the-latest-trump-urges-su...,Trump,NNP,33,2,22,E-PERSON,1,0,I,I-SOURCE-3,
2704,breitbart_2016-09-30_the-latest-trump-urges-su...,is,VBZ,34,2,23,O,0,0,O,B-CUE-2,


In [41]:
# POLNEAR gold labels have three '-'-separated fields (e.g. 'B-CONTENT-1'),
# so the attribution id sits at index 2.
count_gold_candidate_sources(df_polnear, num_att_id=2)

38 2042 110


In [42]:
# PARC gold labels are read with the id at index 3, i.e. four '-'-separated
# fields are expected -- presumably the PARC label format; verify against the data.
count_gold_candidate_sources(df_parc, num_att_id=3)

35 500 4
