In [1]:
import pandas as pd
from collections import defaultdict
df = pd.read_csv("./Data/source-content/parc_features/parc_train_features.tsv", sep="\t", index_col=0, header=0)

  mask |= (ar1 == a)


### Part 1: Finding where sources occur (same sentence as content? Different?)

In [3]:
current_filename = df["filename"][0]
att_sentences = [] # This will be a list of 2-lists representing for each attribution in a file 
                    # which sentence the (source, content) appear in (this is the B label location)
attribution_list = []

for index, attribution, filename, sentence_num in zip(df.index, df["attribution"], df["filename"], df["sentence_number"]):
    if filename != current_filename:
        attribution_list.append(att_sentences)
        att_sentences = []
        current_filename = filename
    
    att_list = attribution.split(" ")
    if att_sentences == []:
        att_sentences = [[None,None]] * len(att_list)
    for i in range(len(att_list)):
        att = att_list[i]
        att_split = att.split("-")
        if att_split[0] not in {"_", "0", ""} and att_split[0] == "B":
            if att_split[1] == "SOURCE":
                att_sentences[i][0] = sentence_num
            elif att_split[1] == "CONTENT":
                att_sentences[i][1] = sentence_num
    

In [4]:
count_dict = defaultdict(int)
for l in attribution_list:
    for s, c in l:
        if s != None:
            if (s - c > 1 or s - c < -1):
                count_dict[s-c] += 1
total = sum(count_dict.values())
print(f"Source-content are more than 1 sentence apart: {total}")

Source-content are more than 1 sentence apart: 555


In [5]:
count_dict = defaultdict(int)
for l in attribution_list:
    for s, c in l:
        if s != None:
            if (s - c != 0):
                count_dict[s-c] += 1
total = sum(count_dict.values())
print(f"Source-content are in different sentences: {total}")

Source-content are in different sentences: 1126


In [11]:
total_attributions = 0
for l in attribution_list:
    if l != [[None, None]]:
        total_attributions += len(l)
print(f"Total attributions: {total_attributions}")

Total attributions: 17667


In [9]:
1126/17667

0.06373464651610347

##### Conclusion: 
~6% of attributions in parc training have source, content in different sentences
~3% of attributions in part training have source, content more than one sentence apart

So, we could use either only same sentence, or +=1 sentence, and be rather successful

### Part 2: Gold sources; how long are they? What are they composed of, POS-wise?

Make a list of gold sources as lists of (token, lemma, POS, ne_info)

In [15]:
current_source = []
source_list = []
in_source = False

for token, lemma, pos, ne_info, relevant_ne, attribution in zip(df["token"], df["lemma"], df["POS"], df["ne_info"], df["relevant_ne"], df["attribution"]):
    att_list = attribution.split(" ")
    for i in range(len(att_list)):
        att = att_list[i]
        att_split = att.split("-")
        if att_split[0] not in {"_", "0", ""} and att_split[1] == "SOURCE":
            if att_split[0] == "B":
                source_list.append(current_source)
                current_source = [(token, lemma, pos, ne_info, relevant_ne)]
            elif att_split[0] == "I":
                current_source.append((token, lemma, pos, ne_info, relevant_ne))

                

In [7]:
print(f"Total number of sources: {len(source_list)}")

Total number of sources: 16349


In [12]:
len_dict = defaultdict(int)
for l in source_list:
    len_dict[len(l)] += 1
print(f"Number of sources with len 1: {len_dict[1]}, len 2: {len_dict[2]}, len 3: {len_dict[3]}") 
print(f"Number of sources with len <=3: {(5697+4543+1521)}")

Number of sources with len 1: 5697, len 2: 4543, len 3: 1521
Number of sources with len <=3: 11761


In [24]:
for l in source_list:
    if len(l) > 3:
        print(l)

[('James', 'James', 'NNP', 'B-PERSON', 1), ('A.', 'A.', 'NNP', 'I-PERSON', 1), ('Talcott', 'Talcott', 'NNP', 'E-PERSON', 1), ('of', 'of', 'IN', 'O', 0), ('Boston', 'Boston', 'NNP', 'S-GPE', 1), ("'s", "'s", 'POS', 'O', 0), ('Dana-Farber', 'Dana-Farber', 'NNP', 'B-ORG', 1), ('Cancer', 'Cancer', 'NNP', 'I-ORG', 1), ('Institute', 'Institute', 'NNP', 'E-ORG', 1)]
[('Darrell', 'Darrell', 'NNP', 'B-PERSON', 1), ('Phillips', 'Phillips', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('vice', 'vice', 'NN', 'O', 0), ('president', 'president', 'NN', 'O', 0), ('of', 'of', 'IN', 'O', 0), ('human', 'human', 'JJ', 'O', 0), ('resources', 'resource', 'NNS', 'O', 0), ('for', 'for', 'IN', 'O', 0), ('Hollingsworth', 'Hollingsworth', 'NNP', 'B-ORG', 1), ('&amp;', '&amp;', 'NNP', 'I-ORG', 1), ('Vose', 'Vose', 'NNP', 'E-ORG', 1)]
[('Brenda', 'Brenda', 'NNP', 'B-PERSON', 1), ('Malizia', 'Malizia', 'NNP', 'I-PERSON', 1), ('Negus', 'Negus', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('editor', 'editor', 

[('more', 'more', 'JJR', 'O', 0), ('and', 'and', 'CC', 'O', 0), ('more', 'more', 'JJR', 'O', 0), ('Japanese', 'japanese', 'JJ', 'S-NORP', 1), ('companies', 'company', 'NNS', 'O', 0)]
[('Yoshio', 'Yoshio', 'NNP', 'B-PERSON', 1), ('Hatakeyama', 'Hatakeyama', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('the', 'the', 'DT', 'O', 0), ('president', 'president', 'NNP', 'O', 0), ('of', 'of', 'IN', 'O', 0), ('the', 'the', 'DT', 'B-ORG', 1), ('Japan', 'Japan', 'NNP', 'I-ORG', 1), ('Management', 'Management', 'NNP', 'I-ORG', 1), ('Association', 'Association', 'NNP', 'E-ORG', 1)]
[('Matsuo', 'Matsuo', 'NNP', 'B-PERSON', 1), ('Toshimitsu', 'Toshimitsu', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('a', 'a', 'DT', 'O', 0), ('66-year-old', '66-year-old', 'JJ', 'O', 0), ('executive', 'executive', 'JJ', 'O', 0), ('vice', 'vice', 'NN', 'O', 0), ('president', 'president', 'NN', 'O', 0), ('of', 'of', 'IN', 'O', 0), ('Japan', 'Japan', 'NNP', 'B-ORG', 1), ('Air', 'Air', 'NNP', 'I-ORG', 1), ('Lines',

[('Mr.', 'Mr.', 'NNP', 'O', 0), ('Frank', 'Frank', 'NNP', 'S-PERSON', 1), (',', ',', ',', 'O', 0), ('senior', 'senior', 'JJ', 'O', 0), ('vice', 'vice', 'NN', 'O', 0), ('president', 'president', 'NNP', 'O', 0), ('at', 'at', 'IN', 'O', 0), ('International', 'International', 'NNP', 'B-ORG', 1), ('Management', 'Management', 'NNP', 'I-ORG', 1), ('Group', 'Group', 'NNP', 'E-ORG', 1)]
[('David', 'David', 'NNP', 'B-PERSON', 1), ('J.', 'J.', 'NNP', 'I-PERSON', 1), ('Stern', 'Stern', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('the', 'the', 'DT', 'O', 0), ('commissioner', 'commissioner', 'NN', 'O', 0), ('of', 'of', 'IN', 'O', 0), ('the', 'the', 'DT', 'B-ORG', 1), ('National', 'National', 'NNP', 'I-ORG', 1), ('Basketball', 'Basketball', 'NNP', 'I-ORG', 1), ('Association', 'Association', 'NNP', 'E-ORG', 1)]
[('media-stock', 'media-stock', 'JJ', 'O', 0), ('analyst', 'analyst', 'NN', 'O', 0), ('Richard', 'Richard', 'NNP', 'B-PERSON', 1), ('J.', 'J.', 'NNP', 'I-PERSON', 1), ('MacDonald', 'MacDon

[('Jack', 'Jack', 'NNP', 'B-PERSON', 1), ('MacAllister', 'MacAllister', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('chairman', 'chairman', 'NN', 'O', 0), ('and', 'and', 'CC', 'O', 0), ('chief', 'chief', 'JJ', 'O', 0), ('executive', 'executive', 'JJ', 'O', 0), ('officer', 'officer', 'NN', 'O', 0)]
[('William', 'William', 'NNP', 'B-PERSON', 1), ('C.', 'C.', 'NNP', 'I-PERSON', 1), ('Ferguson', 'Ferguson', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('chairman', 'chairman', 'NN', 'O', 0), ('and', 'and', 'CC', 'O', 0), ('chief', 'chief', 'JJ', 'O', 0), ('executive', 'executive', 'JJ', 'O', 0), ('officer', 'officer', 'NN', 'O', 0)]
[('Southern', 'southern', 'JJ', 'B-ORG', 1), ('New', 'New', 'NNP', 'I-ORG', 1), ('England', 'England', 'NNP', 'I-ORG', 1), ('Telecommunications', 'telecommunication', 'NNS', 'E-ORG', 1)]
[('Walter', 'Walter', 'NNP', 'B-PERSON', 1), ('H.', 'H.', 'NNP', 'I-PERSON', 1), ('Monteith', 'Monteith', 'NNP', 'I-PERSON', 1), ('Jr.', 'Jr.', 'NNP', 'E-PERSON', 1), ('

[('A', 'a', 'DT', 'O', 0), ('Christie', 'Christie', 'NNP', 'B-ORG', 1), ("'s", "'s", 'POS', 'E-ORG', 1), ('spokeswoman', 'spokeswoman', 'NN', 'O', 0)]
[('Trevor', 'Trevor', 'NNP', 'B-PERSON', 1), ('Woodland', 'Woodland', 'NNP', 'E-PERSON', 1), (',', ',', ',', 'O', 0), ('chief', 'chief', 'JJ', 'O', 0), ('corporate', 'corporate', 'JJ', 'O', 0), ('trader', 'trader', 'NN', 'O', 0), ('at', 'at', 'IN', 'O', 0), ('Harris', 'Harris', 'NNP', 'B-ORG', 1), ('Trust', 'trust', 'NNP', 'I-ORG', 1), ('&amp;', '&amp;', 'NNP', 'I-ORG', 1), ('Savings', 'Savings', 'NNPS', 'I-ORG', 1), ('Bank', 'Bank', 'NNP', 'E-ORG', 1), ('in', 'in', 'IN', 'O', 0), ('New', 'New', 'NNP', 'B-GPE', 1), ('York', 'York', 'NNP', 'E-GPE', 1)]
[('Nauman', 'Nauman', 'NNP', 'B-PERSON', 1), ('Barakat', 'Barakat', 'NNP', 'E-PERSON', 1), ('of', 'of', 'IN', 'O', 0), ('Shearson', 'Shearson', 'NNP', 'B-ORG', 1), ('Lehman', 'Lehman', 'NNP', 'I-ORG', 1), ('Hutton', 'Hutton', 'NNP', 'I-ORG', 1), ('Inc', 'Inc', 'NNP', 'E-ORG', 1)]
[('Tom', '

##### How many sources DO NOT have any NEs or pronouns?

In [21]:
none_count = 0
for l in source_list:
    ne = False
    pronoun = False
    for item in l:
        if item[4] == 1:
            ne = True
        if item[2] == "PRP" or item[2] == "PRP$":
            pronoun = True
    if not ne and not pronoun:
        none_count += 1

In [22]:
print(f"Number of sources with no pronoun or relevant_ne: {none_count}")

Number of sources with no pronoun or relevant_ne: 4422


~25% of sources have no pronoun or relevant ne

In [25]:
none_count = 0
for l in source_list:
    ne = False
    pronoun = False
    for item in l:
        if item[3] != "O":
            ne = True
        if item[2] == "PRP" or item[2] == "PRP$":
            pronoun = True
    if not ne and not pronoun:
        none_count += 1
print(f"Number of sources with no pronoun or any ne: {none_count}")

Number of sources with no pronoun or any ne: 4118


Using NE instead of relevant_ne makes almost no difference (That's good info)

In [2]:
def content_in_sentence(df):
    '''
    Takes a df with "attribution", "filename", and "sentence_number" columns and returns a list (column) of binary
    "sentence contains a content" labels
    '''
    sent_with_content = set()
    for filename, sentence_number, attribution in zip(df["filename"], df["sentence_number"], df["attribution"]):
        for att in attribution.split(" "):
            att_split = att.split("-")
            if att_split[0] not in {"_", "0", ""} and att_split[1] == "CONTENT":
                sent_with_content.add((filename, sentence_number))
    labels = []
    for filename, sentence_number in zip(df["filename"], df["sentence_number"]):
        if (filename, sentence_number) in sent_with_content:
            label = 1
        else:
            label = 0
        labels.append(label)
    return labels

In [3]:
df["content_in_sentence"] = content_in_sentence(df)

In [4]:
def candidate_sources(df):
    '''
    Takes a df and returns a list of IOB labels for candidate sources
    Requires columns POS, relevant_ne, ne_info, content_in_sentence
    '''
    labels = []
    source_pos = {"NN", "NNP", "PRP", "PRP$", "NNS", "NNPS"}
    zipped = zip(df["POS"], df["relevant_ne"], df["ne_info"], df["content_in_sentence"])
    for pos, relevant_ne, ne_info, content_in_sentence in zipped:
        label = "O"
        if content_in_sentence == 0:
            labels.append(label)
            continue
        elif relevant_ne == 1:
            ne_IOB = ne_info.split("-")[0]
            if ne_IOB == "E" or ne_IOB == "I":
                label = "I"
            else: #ne_IOB=="B" or "S"
                label = "B"
        elif pos in source_pos:
            label = "B"
        labels.append(label)
    return labels

In [5]:
candidate_source_labels = candidate_sources(df)

In [42]:
count = 0
for entry in candidate_source_labels:
    if entry == "B":
        count += 1
print(f"Number of Bs in candidate_source_labels: {count}")
len(candidate_source_labels)

Number of Bs in candidate_source_labels: 134598


1066853

##### Thoughts:
We have 16349 sources and 134598 candidate sources. That means we have a) a lot of negative labels, although not necessarily too many because: b) we'll potentially have many positive labels per content. That's kind of okay, I think, we'll see I guess.

In [6]:
def collect_source_content(df):
    '''
    Takes a df with columns "attribution" and "condidate_source_label" and returns a list of tuples of tuples
    representing pairs of source, content spans
    '''
    #### This operates by going through a df and constructing this list sentence-by-sentence
    ### For every sentence, collects a list of contents, a list of candidate sources (represented by spans)
    ### and a set of indices for each content representing the source that it is paired with
    ## Then, it pairs each candidate source with each content, and assigns labels to these, based on whether
    ## the source candidate contains indices in the index list for the content
    # Consider making this function operate on a single sentence?
    
    current_sentence = 1
    current_filename = df["filename"][0]
    zipper = zip(range(len(df["filename"])), df["sentence_number"], df["attribution"], df["candidate_source_label"], df["filename"])
    in_content = False
    in_source = False
    
    # These dictionaries are of the form {(filename, sentence_num): [content1, ...]}
    # With content1 = (start_index, end_index)
    # with len(contents) = num_attributions
    content_dict = dict()
    source_dict = dict()
    cand_source_dict = dict()
    
    contents = [None] * len(df["attribution"][0].split(" "))
    sources = [None] * len(df["attribution"][0].split(" "))
    cand_sources = []
    
    for index, sentence, attribution, source_label, filename in zipper:
        if sentence != current_sentence or current_filename != filename:
            content_dict[(current_filename, current_sentence)] = contents
            source_dict[(current_filename, current_sentence)] = sources
            cand_source_dict[(current_filename, current_sentence)] = cand_sources
            
            current_sentence = sentence
            current_filename = filename
            
            contents = [None] * len(attribution.split(" "))
            sources = [None] * len(attribution.split(" "))
            cand_sources = []
            in_content = False
            in_source = False
            
        else:
            att_list = attribution.split(" ")
            for i in range(len(att_list)):
                att = att_list[i]
                att_split = att.split("-")
                if att_split[0] in {"_", "0", ""}:
                    if type(contents[i]) == int:
                        # Found the end of a content span
                        contents[i] = (contents[i], index)
                    elif type(sources[i]) == int:
                        # Found the end of a source span
                        sources[i] = (sources[i], index)
                    continue
                
                IOB_label = att_split[0]
                att_part = att_split[1]
                att_type = att_split[2]
                if att_type != "NE":
                    if att_part == "CONTENT":
                        if IOB_label == "B":
                            # Start of a content
                            contents[i] = index
                        elif type(sources[i]) == int:
                            # Found the end of a source span
                            sources[i] = (sources[i], index)
                    elif att_part == "SOURCE":
                        if IOB_label == "B":
                            # Start of a source
                            sourcess[i] = index
                        elif type(contents[i]) == int:
                            # Found the end of a content span
                            contents[i] = (contents[i], index)

                    
            
    
    
    
    

In [7]:
def collect_span_dict(df, att_part):
    '''
    Extracts a dictionary {(filename, sent_num): [content1, ...], ...}    
    '''
    current_sentence = 1
    current_filename = df["filename"][0]
    zipper = zip(range(len(df["filename"])), df["sentence_number"], df["attribution"], df["filename"])
    
    # These dictionaries are of the form {(filename, sentence_num): [content1, ...]}
    # With content1 = (start_index, end_index)
    # with len(contents) = num_attributions
    return_dict = dict()
    
    contents = [None] * len(df["attribution"][0].split(" "))
    
    for index, sentence, attribution, filename in zipper:
        if sentence != current_sentence or current_filename != filename:
            if check_for_contents(contents):
                for i in range(len(contents)):
                    if type(contents[i]) == int:
                        contents[i] = contents[i],index
                return_dict[(current_filename, current_sentence)] = contents
            current_sentence = sentence
            current_filename = filename
            contents = [None] * len(attribution.split(" "))
        att_list = attribution.split(" ")
        for i in range(len(att_list)):
            att_split = att_list[i].split("-")
            if att_split[0] in {"_", "0", ""} or att_split[1] != att_part or att_split[2] == "NE":
                # Outside of span for attribution i
                if type(contents[i]) == int:
                    # Have a start index for att i
                    contents[i] = contents[i], index
            else:
                if att_split[1] == att_part and att_split[0] == "B":
                    # Beginning of attribution i
                    contents[i] = index
    
    if check_for_contents(contents):
        for i in range(len(contents)):
            if type(contents[i]) == int:
                contents[i] = contents[i],index
        return_dict[(current_filename, current_sentence)] = contents

    return return_dict
    
    

In [8]:
def check_for_contents(l):
    '''
    Checks to see if l contains anything other than None objects
    '''
    for entry in l:
        if entry != None:
            return True
    return False

In [9]:
content_dict = collect_span_dict(df, "CONTENT")

In [10]:
source_dict = collect_span_dict(df, "SOURCE")

In [21]:
for location, content in content_dict.items():
    source = source_dict[location]
    print(location, source, content)

('wsj_0003.tsv', 1) [None, None, (90, 91), None, None, None, None, None, None, None, None, None, None] [None, None, (57, 89), None, None, None, None, None, None, None, None, None, None]
('wsj_0003.tsv', 2) [None, None, None, None, None, None, None, None, None, None, None, None, (122, 123)] [None, None, None, None, None, None, None, None, None, None, None, None, (93, 122)]
('wsj_0003.tsv', 5) [None, (186, 189), None, None, None, None, None, None, None, None, None, None, None] [None, (191, 198), None, None, None, None, None, None, None, None, None, None, None]
('wsj_0003.tsv', 9) [None, None, None, (260, 269), None, None, None, None, None, None, None, None, None] [None, None, None, (245, 259), None, None, None, None, None, None, None, None, None]
('wsj_0003.tsv', 11) [None, None, None, None, None, None, None, None, (293, 296), None, None, None, None] [None, None, None, None, None, None, None, None, (297, 326), None, None, None, None]
('wsj_0003.tsv', 12) [None, None, None, None, None, No

KeyError: ('wsj_0004.tsv', 5)

In [38]:
def collect_candidate_sources(df):
    '''
    Outputs a dictionary of of {(filename, sent_num): [cand_source, ...]}
    with cand_source in the form of index tuple
    '''
    current_sentence = 1
    current_filename = df["filename"][0]
    zipper = zip(range(len(df["filename"])), df["sentence_number"], df["candidate_source_label"], df["filename"])
    
    # These dictionaries are of the form {(filename, sentence_num): [content1, ...]}
    # With content1 = (start_index, end_index)
    # with len(contents) = num_attributions
    return_dict = dict()
    
    candidate_sources = []
    in_span = False
    start_index = 0
    
    for index, sentence, label, filename in zipper:
        #print(filename,sentence,label,in_span)
        if sentence != current_sentence or current_filename != filename:
            if candidate_sources:
                return_dict[(current_filename, current_sentence)] = candidate_sources
            current_sentence = sentence
            current_filename = filename
            candidate_sources = []
        if in_span:
            if label == "O":
                # If in_span, label 'should' be I or O, so we continue or close span, respectively
                # If label is B, we treat it as one continuous span, instead of 2 separate ones
                candidate_sources.append((start_index, index))
                in_span = False
        else:
            assert label != "I"
            if label == "B":
                start_index = index
                in_span = True
    if candidate_sources:
        return_dict[(current_filename, current_sentence)] = candidate_sources
    
    return return_dict

In [32]:
def collect_instances(content_dict, source_dict, cand_source_dict):
    '''
    Takes three dictionaries of {(filename, sent_num): [index_tuple, ...], ...}
    and returns a list of [(source), (content), label] representing instances for a source, content
    classifier. source and content are tuples representing spans, label is a binary label for whether
    source and content are attributed together
    '''
    return_list = []
    # First make dict of {location: [(content, source), ...]} pairs. If no source for a content, (source)==None
    gold_pairs = defaultdict(list)
    # Then make function to compare spans for overlap (get label from two source spans)
    for location, content_list in content_dict.items():
        if location in source_dict:
            source_list = source_dict[location]
        else:
            source_list = [None] * len(content_list)
        assert len(content_list)==len(source_list)
        
        for i in range(len(content_list)):
            content_span = content_list[i]
            if content_span != None:
                source_span = source_list[i]
                gold_pairs[location].append((source_span, content_span))
    
    for location, pair_list in gold_pairs.items():
        if location in cand_source_dict:
            cand_source_list = cand_source_dict[location]
        else:
            cand_source_list = []
        for source, content in pair_list:
            for cand_source in cand_source_list:
                if source != None and compare_spans(source, cand_source):
                    return_list.append([cand_source, content, 1])
                else:
                    return_list.append([cand_source, content, 0])
    return return_list
    
    

In [13]:
def compare_spans(span1, span2):
    '''
    Given two tuples of integers, determines whether there'a any overlap between them
    '''
    for i in range(span1[0], span1[1]):
        if span2[0] <= i and i < span2[1]:
            return True
    return False
    

In [39]:
df["candidate_source_label"] = candidate_source_labels
cand_source_dict = collect_candidate_sources(df)

In [None]:
for location, cand_source_list in cand_source_dict.items():
    for i in range(len(cand_source_list)):
        if 

In [40]:
instances = collect_instances(content_dict, source_dict, cand_source_dict)

In [41]:
instances

[[(58, 59), (57, 89), 0],
 [(60, 61), (57, 89), 0],
 [(65, 68), (57, 89), 0],
 [(72, 73), (57, 89), 0],
 [(74, 76), (57, 89), 0],
 [(78, 79), (57, 89), 0],
 [(80, 81), (57, 89), 0],
 [(83, 84), (57, 89), 0],
 [(87, 88), (57, 89), 0],
 [(90, 91), (57, 89), 1],
 [(94, 96), (93, 122), 0],
 [(97, 98), (93, 122), 0],
 [(103, 104), (93, 122), 0],
 [(106, 107), (93, 122), 0],
 [(111, 112), (93, 122), 0],
 [(113, 114), (93, 122), 0],
 [(115, 116), (93, 122), 0],
 [(119, 120), (93, 122), 0],
 [(122, 123), (93, 122), 1],
 [(187, 189), (191, 198), 1],
 [(196, 197), (191, 198), 0],
 [(246, 247), (245, 259), 0],
 [(250, 251), (245, 259), 0],
 [(253, 254), (245, 259), 0],
 [(256, 257), (245, 259), 0],
 [(260, 263), (245, 259), 1],
 [(264, 265), (245, 259), 1],
 [(266, 269), (245, 259), 1],
 [(294, 296), (297, 326), 1],
 [(297, 298), (297, 326), 0],
 [(304, 305), (297, 326), 0],
 [(308, 309), (297, 326), 0],
 [(311, 312), (297, 326), 0],
 [(315, 316), (297, 326), 0],
 [(321, 322), (297, 326), 0],
 [(

In [42]:
pos_count = 0
neg_count = 0
for source, content, label in instances:
    if label == 1:
        pos_count += 1
    else:
        neg_count += 1
pos_count, neg_count

(21165, 84690)

In [45]:
zipper = zip(df["POS"], df["token"], range(len(df["filename"])), df["sentence_number"], df["attribution"], df["candidate_source_label"], df["filename"])
for POS, token, index, sentence, attribution, source_label, filename in zipper:
    if (index > 56 and index < 125) or (index > 242 and index < 350):
        print(filename, sentence, index, attribution, token, POS, source_label)

wsj_0003.tsv 1 57 _ _ B-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ A DT O
wsj_0003.tsv 1 58 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ form NN B
wsj_0003.tsv 1 59 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ of IN O
wsj_0003.tsv 1 60 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ asbestos NN B
wsj_0003.tsv 1 61 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ once RB O
wsj_0003.tsv 1 62 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ used VBD O
wsj_0003.tsv 1 63 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ to TO O
wsj_0003.tsv 1 64 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ make VB O
wsj_0003.tsv 1 65 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ Kent NNP B
wsj_0003.tsv 1 66 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ cigarette NN B
wsj_0003.tsv 1 67 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ filters NNS B
wsj_0003.tsv 1 68 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ has VBZ O
wsj_0003.tsv 1 69 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ caused VBN O
wsj_0003.tsv 1 70 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _ _ a DT O
wsj_0003.tsv 1 71 _ _ I-CONTENT-AT-1 _ _ _ _ _ _ _ _ _

##### Final Notes:
Very important note: All consecutive cand_source spans (mostly consecutive candidate source POS "B" labels) are treated as single continuous spans. This assumes that sources appear consectively rarely enough that it is worth it to consider, for instance, consectutive nouns like "Kent cigarette filters" as one instead of 3 candidate sources. Ideally some stats could be run on how this affects 'correctness' (eg. how many gold sources appear back to back, making this assumption false, and the like)

This code runs on ALL files in < 2 minutes on my machine.

Instance count:

| filename      | pos_instance | neg_instance |
|---------------|--------------|--------------|
| parc_train    | 21165        | 84690        |
| parc_dev      | 746          | 3205         |
| parc_test     | 1419         | 5719         |
| polnear_train | 20988        | 130305       |
| polnear_dev   | 2318         | 13875        |
| polnear_test  | 2234         | 13361        |

In [3]:
df.iloc[57:89]

Unnamed: 0,POS,dependency_head,dependency_label,doc_token_number,lemma,ne_info,sentence_number,sentence_token_number,token,cue_label,...,quotation_pn,quotation_ne,quotation_qm,filename,content_label_gold,ancestor_is_cue,cue_in_window_of_5,distance_to_prev_cue,distance_to_next_cue,cue_in_sentence
57,DT,2,det,1,a,O,1,1,A,0,...,O0,O1,O0,wsj_0003.tsv,B,1,0,.,34,1
58,NN,13,nsubj,2,form,O,1,2,form,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,33,1
59,IN,4,case,3,of,O,1,3,of,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,32,1
60,NN,2,nmod,4,asbesto,O,1,4,asbestos,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,31,1
61,RB,6,advmod,5,once,O,1,5,once,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,30,1
62,VBD,35,ccomp,6,use,O,1,6,used,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,29,1
63,TO,8,mark,7,to,O,1,7,to,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,28,1
64,VB,6,xcomp,8,make,O,1,8,make,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,27,1
65,NNP,11,compound,9,Kent,S-GPE,1,9,Kent,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,26,1
66,NN,11,compound,10,cigarette,O,1,10,cigarette,0,...,O0,O1,O0,wsj_0003.tsv,I,1,0,.,25,1
