In [2]:
from IPython.core.display import display, HTML

display(HTML('<style>.container { width:85% !important; }</style>'))


# Prepare sip list
Source indicating predicates are a list of verbs typically used to indicate attribution predicates.
can be saved as external file (.txt would work)

In [3]:
import glob
import os

In [4]:
# These are the SIPs we will use to filter out the predicates in the main program
with open ('./data/sip.txt', 'r') as infile:
    split_these = infile.read()

In [5]:
# here we split the SIP list into the verb (lemma) plus its frequency in the parc corpus
import re

sip_splitter = re.compile('\w+\s\d+')
sips_to_sort = sip_splitter.findall(split_these)

# the least frequent verbs are removed (fequency = <=10)
filtered_sips = [items for items in sips_to_sort if int(items[-2:])>10]
filtered_sips_alone =[sips.strip('0123456789').strip() for sips in filtered_sips]

# Extract all triples (subject predicate object) 

In [6]:
def extract_subj_pred_obj (file):
    '''
    takes filename as input (conll format) and returns a list of every subject predicate & object triple 
    in the text based on the allen semantic role labelling system (ARG0, B-V, ARG1)
    '''
    #read in conll file
    with open (file, 'r', encoding = 'utf-8') as infile:
        text = infile.read()
    text_rows = text.splitlines()
    rows = [row.split('\t') for row in text_rows]
    
    # to iterate through each sentence the for loop should be as long as the number of sentences in the file
    # as indicated by the last row [-1], first index position [0].
    
    num_sents = int(rows[-1][0])
    # skip header
    rows_only = rows[1:]
    
    file_tuples_list = []
    
    for n in range(1, num_sents):
        sent_id = n
        col_subj = set()
        col_obj = set()
        col_pred = set()
    
        for row in rows_only:
            # for a particular sentence_id
            if row[0] == str(sent_id):
                # get the index of each item in each row in that sentence
                for i, item in enumerate(row):

                    if 'ARG0' in item: 
                        col_subj.add(i)
                    elif 'ARG1' in item:
                        col_obj.add(i)
                    elif item == 'B-V':
                        col_pred.add (i)

        # since the index of a row is the column position in the file, 
        # if a col (i) has a subj, pred and obj in a given sentence span, we want it:
        # we can find the intersection (shared numbers) in the above sets and keep the overlapping indexes
        
        search_col = set.intersection(col_subj, col_pred, col_obj)
        
        if len(search_col)>0:
            # the value in search col is the index_number of the column we want in each row
            for value in search_col:
                subj_phrase= ''
                obj_phrase = ''
                pred=''
                # we want to cycle through the rows again now we know which columns we are interested in
                for row in rows_only:
                    try:
                        if row[0] == str(sent_id):
                            column = row[value] 
                            word = row[2] 
                            lemma = row[3]
                            # we extract the id of the predicate to evaluate (compare with human annotations)
                            index = rows.index(row)
                            if 'ARG0' in column:
                                subj_phrase+=' '+word
                            elif 'ARG1' in column:
                                obj_phrase+=' '+word
                            elif column == 'B-V':
                                ind = index+1
                                pred = lemma #we use the lemma here instead of the word to filter on later
                                
                    except:
                        pass
#                         print('some isssue with', file)
                #once extracted, a tuple is created for each column (this may be multiple for embedded phrases)
                one_shot = (subj_phrase, pred, obj_phrase, ind)
                file_tuples_list.append(one_shot)

    return file_tuples_list

# clean tuples and creates output dictionary

## getting the category of the text source to add to the output dictionary

In [7]:
import pandas as pd
# text_source_category.csv is a list of the publishers and their respective classification (news, journal)

with open ('./data/text_source_category.csv', 'r') as infile:
    text_source_classes_text = infile.readlines()
# convert text to list of 2 items (publisher | text source class)   
publishers_type_list = [item.strip().split(';') for item in text_source_classes_text[1:]]

# some cleaning - stripping \n from the second element
publishers_type_list[0][0]=publishers_type_list[0][0].strip(',')
# convert list to dict to perform mapping in dataframe
publishers_type_dict = {key:value for key, value in publishers_type_list}
# look up which filename is associated with which publisher (info in tsv file)
df = pd.read_csv('./data/metadata.tsv', sep ='\t')
# create new column in dataframe where col 'type' is determined by the col 'publisher' 
df['type']=df.Publisher.map(publishers_type_dict)

# this resulting df can be used to lookup text-class based on publisher info

In [8]:
def source_type (stripped_basename):
    '''
    takes a stripped basename of a file, searches a datafrane for the filename; looks up the text source 
    category and returns the appropriate classification to be appended to a dictionary value list 
    '''
    source_class = df.loc[df['File_ID'] == stripped_basename, 'type'].values[0]
    
    return source_class

## creating the output dictionary


In [11]:
#  change for relative file path to conll-allen-nlp
allen_nlp_directory = '../conll-allen-nlp'

import glob
import os

# final output is dictionary: key is basename of file; values are SIP filtered triples from that file 
triple_tuple_dict = {}
dict_of_SIP_indexes={}

for filename in glob.glob(f'{allen_nlp_directory}/*'):
    # the stripped basename of the file will be used twice below (as dict_key)
    dict_key = os.path.splitext(os.path.basename(filename))[0]
    # we identify the text source class
    try:
        text_class = source_type(dict_key)
    except:
#         print(filename)
        # two files are causing issues - @berkleywellness & Science-_-AAS - both are news so forced the class below
        text_class = 'news'
    # here we execute the above function
    triple_list = extract_subj_pred_obj(filename)
    cleaned_triples = [triple for triple in triple_list if len(triple)>0]
    sip_filtered_triple = []
    # this isolates the index of the SIP
    ind_alone=[]
    for triple in cleaned_triples:
        if triple[1] in filtered_sips_alone:
            sip_filtered_triple.append(triple)
            ind_alone.append(triple[-1])
    sip_filtered_triple.append(text_class)
    
    triple_tuple_dict[dict_key]=sip_filtered_triple
    
    #dict of basename: list of indeces for SIPs in basename file
    dict_of_SIP_indexes[dict_key]=ind_alone


# accessing output

In [12]:
# sample category look-up
triple_tuple_dict['Activist-Post_20170704T090503'][-1]

'news'

In [13]:
# sample attribution content extraction (the [:-1] avoids the non-triple - text_source category at the end of the list)

for triples in triple_tuple_dict['Activist-Post_20170704T090503'][:-1]:
    print (triples[2], '\n')

 a failing measles vaccine is behind the outbreak 

 the marketing and cheerleading arm of the vaccine industry and the medical-industrial complex 

 an article titled , “ The 2013 Measles Outbreak : A Failing Vaccine , Not A Failure To Vaccinate , ” which deconstructed the myth that the minimally – or non-vaccinated were responsible for outbreaks of measles in highly vaccination-compliant populations 

 to subject themselves to them 

 this 

 We conclude that outbreaks of measles can occur in secondary schools , even when more than 99 percent of the students have been vaccinated and more than 95 percent are immune 

 that outbreaks of measles can occur in secondary schools , even when more than 99 percent of the students have been vaccinated and more than 95 percent are immune 

 “ This outbreak suggests that measles transmission may persist in some settings despite appropriate implementation of the current measles elimination strategy 

 that measles transmission may persist in some

# Evaluating Output

- the precision, recall and f1 scores are evaluated by comparing the indexes of the B-Cue in gold (allen.conll) and the system identified B-V semantic role label

In [1]:
# dict of indexes of cues (values) per file (key)
dict_of_SIP_indexes


In [31]:
# create list of file-name + list of indexes of attr_cue
allen_nlp_directory = '../conll-allen-nlp'
gold_attr_index_dict = {}
for filename in glob.glob(f'{allen_nlp_directory}/*'):
    dict_key = os.path.splitext(os.path.basename(filename))[0]
    with open (filename, 'r', encoding = 'utf-8') as infile:
        text = infile.read()
    output_list =[]
    text_rows = text.splitlines()
    rows = [row.split('\t') for row in text_rows]
    for row in rows:
        #some lines are blank
        try:
            if 'B-cue' in row[12]:
                index = rows.index(row)
                output_list.append(index+1)
                
        except:
            pass
    gold_attr_index_dict[dict_key]= output_list

In [2]:
gold_attr_index_dict



In [15]:
# matching == True Positives(tp); in dict, but not in conll = False Positives(fp); in conll not in dict = False Negatives(fn).
# this provides file by file stats for potential bug-hunting
matching = {}
# these are aggregates scores for final analysis
tp = 0
fp = 0
fn = 0

for key, values in dict_of_SIP_indexes.items():
    for gold_key, gold_values in gold_attr_index_dict.items():
        matching_list = 0
        if gold_key == key:
            len_gold = len(gold_values)
            len_system = len(values)
            for value in values:
                # if system in gold
                if value in gold_values:
                    matching_list+=1
                else:
                    pass
            TP = matching_list
            FP = len_system-matching_list
            FN = len_gold - matching_list
            
            tp+=TP
            fp+=FP
            fn+=FN
            # cannot divide by 0
            if TP >0:
                precision = TP/(TP+FP)
                recall = TP/(TP+FN)
            else:
                precision = 'div by zero'
                recall = 'div by zero'
            inner_dict = {}
            inner_dict['precision']=precision
            inner_dict['recall'] = recall
            matching[key]= inner_dict     

In [3]:
matching


In [16]:
recall = tp/(tp+fn)
precision = tp/(tp+fp)
f1 = 2*((precision*recall)/(precision+recall))

In [17]:
recall 

0.6424983620877921

In [18]:
precision

0.46207004868855034

In [19]:
f1

0.5375479627261099