In [1]:
import pandas as pd
import glob

In [2]:
# List the files in the sample preprocessing output folder.
# glob returns matches in arbitrary order, so sort for a reproducible listing.
sorted(glob.glob('./sample_preprocessing_output/*'))

['./sample_preprocessing_output\\wsj_0001.tsv',
 './sample_preprocessing_output\\wsj_0002.tsv',
 './sample_preprocessing_output\\wsj_0003.tsv',
 './sample_preprocessing_output\\wsj_0004.tsv',
 './sample_preprocessing_output\\wsj_0005.tsv',
 './sample_preprocessing_output\\wsj_0006.tsv',
 './sample_preprocessing_output\\wsj_0007.tsv',
 './sample_preprocessing_output\\wsj_0008.tsv',
 './sample_preprocessing_output\\wsj_0009.tsv',
 './sample_preprocessing_output\\wsj_0010.tsv']

In [3]:
# Sample file that is read into `df` below.
filepath = './sample_preprocessing_output/wsj_0001.tsv'

In [4]:
# Read the tab-separated sample (its first column becomes the row index)
# and keep only the columns listed here, in this order.
selected_columns = [
    'sentence_number', 'doc_token_number', 'sentence_token_number',
    'token', 'lemma', 'POS',
    'dependency_head', 'dependency_label', 'ne_info', 'cue_label',
]
df = pd.read_csv(filepath, sep='\t', index_col=0).loc[:, selected_columns]
df.head()

Unnamed: 0,sentence_number,doc_token_number,sentence_token_number,token,lemma,POS,dependency_head,dependency_label,ne_info,cue_label
0,1,1,1,Pierre,Pierre,NNP,9,nsubj,B-PERSON,0
1,1,2,2,Vinken,Vinken,NNP,1,flat,E-PERSON,0
2,1,3,3,",",",",",",1,punct,O,0
3,1,4,4,61,61,CD,5,nummod,B-DATE,0
4,1,5,5,years,year,NNS,6,obl:npmod,I-DATE,0


In [5]:
# Row labels of every token whose `ne_info` value is something other than 'O'.
df.index[df['ne_info'] != 'O'].tolist()

[0, 1, 3, 4, 5, 15, 16, 19, 23, 24, 27]

In [6]:
def get_boundary_indices(df, boundary_type='sent'):
    """
    Collect the row indices that lie next to a sentence or document boundary.

    The first row and the last two rows of the frame are always included
    (document boundaries). When ``boundary_type`` is ``'sent'`` (the default),
    every row whose ``sentence_token_number`` is 1 is included as well,
    together with the two rows directly before it. Any other value of
    ``boundary_type`` yields the document boundaries only.

    Neighbouring rows are found by integer arithmetic on the row labels, so
    the frame is expected to be labelled 0..len(df)-1.

    Returns a set of indices. Candidates outside 0..len(df)-1 are dropped, so
    a frame with fewer than two rows never yields a negative index and an
    empty frame yields an empty set.
    """
    n_rows = df.shape[0]
    # First row and last two rows: boundaries of the document itself.
    candidates = {0, n_rows - 2, n_rows - 1}

    if boundary_type == 'sent':
        for start in df.index[df['sentence_token_number'] == 1]:
            # Last two tokens of the previous sentence plus the first token
            # of the new one; out-of-range values are filtered out below.
            candidates.update((start - 2, start - 1, start))

    return {index for index in candidates if 0 <= index < n_rows}

def add_boundary_column(df, boundary_type):
    """
    Add a 0/1 column named ``near_<boundary_type>_boundary`` to ``df`` in place.

    ``boundary_type`` is 'sent' or 'doc' and is forwarded to
    ``get_boundary_indices``; the rows it returns are set to 1 and every
    other row is set to 0. Returns None.
    """
    column = f'near_{boundary_type}_boundary'
    df[column] = 0
    # .loc needs an ordered list-like: pandas 2.x raises a TypeError when a
    # set is passed as an indexer.
    row_labels = sorted(get_boundary_indices(df, boundary_type=boundary_type))
    df.loc[row_labels, column] = 1
    
def get_sent_start_indices(df):
    """
    Return the set of row indices whose ``sentence_token_number`` equals 1,
    i.e. the rows that open a sentence.
    """
    opens_sentence = df['sentence_token_number'] == 1
    return set(df.index[opens_sentence])

def get_sent_bound_indices(df):
    """
    Return the (start, end) row-index pair of every sentence, in document order.

    A sentence starts at each row whose ``sentence_token_number`` is 1 and
    ends on the row just before the next start; the last sentence ends on the
    final row of the frame. Row labels are expected to be the integers
    0..len(df)-1.

    The starts are sorted before pairing. Zipping two unordered sets (as an
    earlier version did) pairs starts and ends in hash order, which can match
    a start with the end of a different sentence.

    Returns a list of (start, end) tuples; empty when no row starts a sentence.
    """
    starts = sorted(df.index[df['sentence_token_number'] == 1])
    if not starts:
        return []

    # Each sentence ends right before the next one starts ...
    ends = [next_start - 1 for next_start in starts[1:]]
    # ... and the last sentence runs to the end of the document.
    ends.append(df.shape[0] - 1)

    return list(zip(starts, ends))

In [7]:
# Add the near_sent_boundary column first, then the near_doc_boundary column.
for boundary_kind in ('sent', 'doc'):
    add_boundary_column(df, boundary_type=boundary_kind)

In [8]:
# Distance of each token from the first token of its sentence (0 for the first).
df['dist_beg_sent'] = df['sentence_token_number'] - 1

# Still computed so that `sent_bound_indices` remains available to other cells.
sent_bound_indices = get_sent_bound_indices(df)

# A sentence starts wherever sentence_token_number is 1, so the running count
# of those rows gives every token a per-sentence group key.
sentence_id = (df['sentence_token_number'] == 1).cumsum().rename('sentence_id')

# Sentence length = sentence_token_number of the sentence's last token,
# broadcast to every row of that sentence. Computing it with a grouped
# transform avoids the partially filled float columns (and the cast back to
# int) that a row-range loop needs.
sent_len = df.groupby(sentence_id)['sentence_token_number'].transform('last')

# Assigned in this order so the new columns appear as
# dist_beg_sent, dist_end_sent, sent_len.
df['dist_end_sent'] = (sent_len - df['dist_beg_sent'] - 1).astype('int64')
df['sent_len'] = sent_len.astype('int64')

In [9]:
# TODO: remove this cell later. For testing only: it overwrites the POS value
# of the row labelled 19 with 'PNP' so that a check for POS == 'PNP' has a
# row to match.
df.at[19, 'POS'] = 'PNP'

In [10]:
# Sentence-level 0/1 flags, broadcast to every token of the sentence:
#   pn_in_sent - 1 if any token of the sentence has POS 'PNP'
#   ne_in_sent - placeholder, always 0. TODO: implement; the earlier draft of
#                this check was incomplete (it referenced an empty column name).
#   qm_in_sent - 1 if any token of the sentence is the literal string 'quote'
# NOTE(review): 'PNP' is the tag injected by the testing cell above, and
# 'quote' is matched verbatim - confirm both against the real preprocessing
# output.

# A sentence starts wherever sentence_token_number is 1, so the running count
# of those rows is a per-sentence group key. Grouping on it keeps this cell
# independent of variables defined in other cells.
sentence_id = (df['sentence_token_number'] == 1).cumsum().rename('sentence_id')

df['pn_in_sent'] = (
    (df['POS'] == 'PNP').groupby(sentence_id).transform('any').astype('int64')
)
df['ne_in_sent'] = 0
df['qm_in_sent'] = (
    (df['token'] == 'quote').groupby(sentence_id).transform('any').astype('int64')
)

In [11]:
# Preview the first rows with all added feature columns instead of dumping
# the whole frame into the notebook output.
df.head(10)

Unnamed: 0,sentence_number,doc_token_number,sentence_token_number,token,lemma,POS,dependency_head,dependency_label,ne_info,cue_label,near_sent_boundary,near_doc_boundary,dist_beg_sent,dist_end_sent,sent_len,pn_in_sent,ne_in_sent,qm_in_sent
0,1,1,1,Pierre,Pierre,NNP,9,nsubj,B-PERSON,0,1,1,0,17,18,0,0,0
1,1,2,2,Vinken,Vinken,NNP,1,flat,E-PERSON,0,0,0,1,16,18,0,0,0
2,1,3,3,",",",",",",1,punct,O,0,0,0,2,15,18,0,0,0
3,1,4,4,61,61,CD,5,nummod,B-DATE,0,0,0,3,14,18,0,0,0
4,1,5,5,years,year,NNS,6,obl:npmod,I-DATE,0,0,0,4,13,18,0,0,0
5,1,6,6,old,old,JJ,1,amod,E-DATE,0,0,0,5,12,18,0,0,0
6,1,7,7,",",",",",",9,punct,O,0,0,0,6,11,18,0,0,0
7,1,8,8,will,will,MD,9,aux,O,0,0,0,7,10,18,0,0,0
8,1,9,9,join,join,VB,0,root,O,0,0,0,8,9,18,0,0,0
9,1,10,10,the,the,DT,11,det,O,0,0,0,9,8,18,0,0,0
