In [13]:
import pandas as pd
import instance_generation

In [39]:
# Location of the tab-separated feature file read in the next cell. The path is
# relative, so it resolves against the kernel's current working directory.
filepath = "./../../Data/parc_features/parc_train_features.tsv"

In [40]:
# Read the tab-separated feature table (first column becomes the index) and
# keep only the first 100 rows for the rest of the notebook.
token_df = pd.read_csv(filepath, sep='\t', index_col=0)[:100]

In [41]:
# Build the candidate instances from the token table. Downstream code indexes
# each returned item as [0], [1] and [2] -- presumably (source span, content
# span, gold label); confirm against instance_generation.collect_instances_main.
instance_output = instance_generation.collect_instances_main(token_df)

In [55]:
def seperate_instance_gold(instance_output):
    """
    Split instance records into span pairs and their gold labels.

    Each record is read by position: element 0 and element 1 are paired into a
    tuple, element 2 is taken as the gold label.

    :param instance_output: iterable of indexable records with at least three elements
    :return: (pairs, golds) -- two lists of equal length, in input order
    """
    pairs, golds = [], []

    for record in instance_output:
        first_span, second_span, gold = record[0], record[1], record[2]
        golds.append(gold)
        pairs.append((first_span, second_span))

    return pairs, golds

def create_instance_list(list_of_tuples, df=None):
    """
    Expand (source, content) span boundaries into explicit index lists.

    For every pair this produces
      * the list of all indices covered by the source span followed by all
        indices covered by the content span (both boundaries inclusive), and
      * a two-element "gap" list: [end of the span that starts first,
        start of the span that starts later].

    :param list_of_tuples: iterable of ((b_source, e_source), (b_content, e_content))
        pairs with integer boundaries
    :param df: unused; kept (and made optional) so existing two-argument calls
        keep working
    :return: (main_attribution, main_gap) -- two lists aligned with the input order
    """
    main_attribution = list()
    main_gap = list()

    # Iterate over the argument, not a notebook-level variable, so the function
    # works on whatever pairs the caller passes in.
    for source, content in list_of_tuples:
        b_source, e_source = source
        b_content, e_content = content

        # Source indices first, then content indices; overlapping spans
        # therefore contribute the shared indices twice.
        attribution_indices = [
            *range(b_source, e_source + 1),
            *range(b_content, e_content + 1),
        ]

        # Find gap indices
        if b_source < b_content:
            # If content follows source, gap is between last token of source and first of content
            gap_indices = [e_source, b_content]
        else:
            # If source follows content, then vice versa.
            # NOTE(review): when the source starts inside the content span this
            # yields a reversed pair such as [89, 58] -- confirm that is intended.
            gap_indices = [e_content, b_source]

        main_attribution.append(attribution_indices)
        main_gap.append(gap_indices)

    return main_attribution, main_gap

In [99]:
# Extract relevant lists
pair_list, gold_labels = seperate_instance_gold(instance_output)

# Pass token_df: no cell above defines a variable called `df`, so the previous
# call depended on leftover kernel state and fails after a kernel restart.
sequence_indices, gap_indices = create_instance_list(pair_list, token_df)

In [100]:
# One row per (source, content) pair; columns are created in this order.
pair_df = pd.DataFrame({
    'source_content_boundaries': pair_list,
    'gold_labels': gold_labels,
    'all_indices_in_span': sequence_indices,
    'gap_indices': gap_indices,
})

In [102]:
# Preview the first rows to sanity-check the assembled pair table.
pair_df.head()

Unnamed: 0,source_content_boundaries,gold_labels,all_indices_in_span,gap_indices
0,"((58, 59), (57, 89))",0,"[58, 59, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 58]"
1,"((60, 61), (57, 89))",0,"[60, 61, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 60]"
2,"((65, 68), (57, 89))",0,"[65, 66, 67, 68, 57, 58, 59, 60, 61, 62, 63, 6...","[89, 65]"
3,"((72, 73), (57, 89))",0,"[72, 73, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 72]"
4,"((74, 76), (57, 89))",0,"[74, 75, 76, 57, 58, 59, 60, 61, 62, 63, 64, 6...","[89, 74]"


In [113]:
def get_word_length(df, list_of_tuples):
    """
    Add the token length of each content and source span as columns of ``df``.

    Lengths count both boundaries, i.e. ``end + 1 - start``.

    :param df: DataFrame that receives the new 'content_length' and
        'source_length' columns (modified in place); it must have one row per
        entry of ``list_of_tuples``
    :param list_of_tuples: iterable of ((b_source, e_source), (b_content, e_content))
        pairs with integer boundaries
    :return: None
    """
    content_len_list = []
    source_len_list = []

    for (s_start, s_end), (c_start, c_end) in list_of_tuples:
        content_len_list.append(c_end + 1 - c_start)
        source_len_list.append(s_end + 1 - s_start)

    # Write to the frame that was passed in rather than to a notebook global,
    # so the function works for any caller-supplied DataFrame.
    df['content_length'] = content_len_list
    df['source_length'] = source_len_list

In [115]:
# Add the content_length / source_length columns to pair_df, derived from its
# stored span boundaries.
get_word_length(pair_df, pair_df['source_content_boundaries'])

In [116]:
# Display the pair table with the length columns added.
# NOTE(review): the recorded output below also shows a `c_length` column that
# no visible cell creates -- likely left over from an earlier kernel state.
pair_df

Unnamed: 0,source_content_boundaries,gold_labels,all_indices_in_span,gap_indices,c_length,content_length,source_length
0,"((58, 59), (57, 89))",0,"[58, 59, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 58]",7,33,2
1,"((60, 61), (57, 89))",0,"[60, 61, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 60]",7,33,2
2,"((65, 68), (57, 89))",0,"[65, 66, 67, 68, 57, 58, 59, 60, 61, 62, 63, 6...","[89, 65]",7,33,4
3,"((72, 73), (57, 89))",0,"[72, 73, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 72]",7,33,2
4,"((74, 76), (57, 89))",0,"[74, 75, 76, 57, 58, 59, 60, 61, 62, 63, 64, 6...","[89, 74]",7,33,3
5,"((78, 79), (57, 89))",0,"[78, 79, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 78]",7,33,2
6,"((80, 81), (57, 89))",0,"[80, 81, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 80]",7,33,2
7,"((83, 84), (57, 89))",0,"[83, 84, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 83]",7,33,2
8,"((87, 88), (57, 89))",0,"[87, 88, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 87]",7,33,2
9,"((90, 91), (57, 89))",1,"[90, 91, 57, 58, 59, 60, 61, 62, 63, 64, 65, 6...","[89, 90]",7,33,2


In [117]:
def get_distance_c2sentstart(token_df, pair_df, list_of_tuples):
    """
    Annotate content-span tokens with their distance to document and sentence start.

    For every (source, content) pair, each index ``i`` in the inclusive content
    span receives, written in place into ``token_df``:
      * 'distance_c2docstart'  = i - index of the first row with the same filename
      * 'distance_c2sentstart' = i - index of the first row with the same
        sentence_number within that file

    :param token_df: token-level DataFrame with 'filename' and 'sentence_number'
        columns; modified in place.
        NOTE(review): span boundaries are used both positionally (``.iloc``) and
        as labels (``.loc``), so this assumes an integer index whose labels
        equal the row positions -- confirm.
    :param pair_df: unused; kept so existing three-argument calls keep working
    :param list_of_tuples: iterable of ((b_source, e_source), (b_content, e_content))
        pairs; only the content boundaries are used
    :return: None
    """
    for _source, content in list_of_tuples:
        start, end = content

        # File and sentence of the first content token identify the reference rows.
        first_content_row = token_df.iloc[start]
        c_doc = first_content_row['filename']
        c_sent = first_content_row['sentence_number']

        # Restrict to the file first: sentence numbers can repeat across files.
        file_df = token_df.loc[token_df['filename'] == c_doc]
        index_doc_start = file_df.first_valid_index()

        # Build the sentence mask from file_df itself so mask and frame share
        # the same index (no implicit reindexing of a mismatched boolean key).
        sentence_df = file_df.loc[file_df['sentence_number'] == c_sent]
        index_sent_start = sentence_df.first_valid_index()

        for i in range(start, end + 1):
            token_df.loc[i, 'distance_c2docstart'] = i - index_doc_start
            token_df.loc[i, 'distance_c2sentstart'] = i - index_sent_start
            
# Compute the distance_c2docstart / distance_c2sentstart features for the
# content span of every pair stored in pair_df.
get_distance_c2sentstart(token_df, pair_df, pair_df['source_content_boundaries'])