In [1]:
import pandas as pd
import numpy
import glob

In [2]:
# List everything inside the sample preprocessing output folder.
glob.glob('./sample_preprocessing_output/*')

['./sample_preprocessing_output\\wsj_0001.tsv',
 './sample_preprocessing_output\\wsj_0002.tsv',
 './sample_preprocessing_output\\wsj_0003.tsv',
 './sample_preprocessing_output\\wsj_0004.tsv',
 './sample_preprocessing_output\\wsj_0005.tsv',
 './sample_preprocessing_output\\wsj_0006.tsv',
 './sample_preprocessing_output\\wsj_0007.tsv',
 './sample_preprocessing_output\\wsj_0008.tsv',
 './sample_preprocessing_output\\wsj_0009.tsv',
 './sample_preprocessing_output\\wsj_0010.tsv']

In [3]:
# Path of the single .tsv document that is loaded into `df` below.
filepath = './sample_preprocessing_output/wsj_0001.tsv'

In [4]:
# Read the tab-separated file; its first column becomes the row index.
# NOTE(review): with pandas' default NA handling, tokens such as "NA" or
# "null" are parsed as NaN rather than kept as strings.
df = pd.read_csv(filepath, sep='\t', index_col=0)

In [5]:
# Inspect the column names available in the loaded frame.
df.columns

Index(['POS', 'dependency_head', 'dependency_label', 'doc_token_number',
       'lemma', 'ne_info', 'sentence_number', 'sentence_token_number', 'token',
       'cue_label', 'attribution'],
      dtype='object')

In [6]:
# Display the frame with its columns reordered: position information first,
# then the token itself, its linguistic annotations and finally the cue label.
df[[
    'sentence_number',
    'doc_token_number',
    'sentence_token_number',
    'token',
    'lemma',
    'POS',
    'dependency_head',
    'dependency_label',
    'ne_info',
    'cue_label',
]]

Unnamed: 0,sentence_number,doc_token_number,sentence_token_number,token,lemma,POS,dependency_head,dependency_label,ne_info,cue_label
0,1,1,1,Pierre,Pierre,NNP,9,nsubj,B-PERSON,0
1,1,2,2,Vinken,Vinken,NNP,1,flat,E-PERSON,0
2,1,3,3,",",",",",",1,punct,O,0
3,1,4,4,61,61,CD,5,nummod,B-DATE,0
4,1,5,5,years,year,NNS,6,obl:npmod,I-DATE,0
5,1,6,6,old,old,JJ,1,amod,E-DATE,0
6,1,7,7,",",",",",",9,punct,O,0
7,1,8,8,will,will,MD,9,aux,O,0
8,1,9,9,join,join,VB,0,root,O,0
9,1,10,10,the,the,DT,11,det,O,0


In [7]:
def get_previous_or_following(df, column_name, step=-1):
    """
    Adds a column holding the value of `column_name` found `step` rows away,
    for instance the previous POS tag or the following token.

    The new column is named '<column_name>_<step>' with an explicit sign,
    e.g. 'token_-1' or 'POS_+2'. Rows whose neighbour would fall outside the
    dataframe get '.' as value. Neighbours are looked up by row position, so
    the result does not depend on the index labels of the dataframe.

    :param df: Dataframe to apply to (modified in place)
    :param column_name str: column to apply to
    :param step integer: -1 for previous, 1 for next, -2 for one before previous...
        A step of 0 adds nothing.

    :returns df: the same dataframe, with the extra column added
    """
    # A step of 0 would only duplicate the column itself, so nothing is added
    if step == 0:
        return df

    if step > 0:
        step_str = f'+{step}'
    else:
        step_str = str(step)

    # Row i must receive the value of row i + step, i.e. the column moved by -step.
    # Casting to object first allows the '.' placeholder to sit next to
    # non-string values without changing them.
    neighbours = df[column_name].astype(object).shift(-step, fill_value='.')

    # Assign the raw values so that the index labels play no role
    df[f'{column_name}_{step_str}'] = neighbours.to_numpy()

    return df

In [8]:
def add_tokens_window5(df):
    """
    Adds the five preceding and the five following tokens of every row as
    separate columns ('token_-1' ... 'token_-5', 'token_+1' ... 'token_+5').

    :param df: Dataframe with a 'token' column (modified in place)

    :returns df: the same dataframe
    """
    # Previous tokens first, then following ones, nearest neighbour first
    window_steps = (-1, -2, -3, -4, -5, 1, 2, 3, 4, 5)
    for offset in window_steps:
        get_previous_or_following(df, 'token', step=offset)

    return df

In [9]:
def add_lemmas_window5(df):
    """
    Adds the five preceding and the five following lemmas of every row as
    separate columns ('lemma_-1' ... 'lemma_-5', 'lemma_+1' ... 'lemma_+5').

    :param df: Dataframe with a 'lemma' column (modified in place)

    :returns df: the same dataframe
    """
    # Previous lemmas first, then following ones, nearest neighbour first
    window_steps = (-1, -2, -3, -4, -5, 1, 2, 3, 4, 5)
    for offset in window_steps:
        get_previous_or_following(df, 'lemma', step=offset)

    return df

In [10]:
def add_pos_window5(df):
    """
    Adds the five preceding and the five following POS tags of every row as
    separate columns ('POS_-1' ... 'POS_-5', 'POS_+1' ... 'POS_+5').

    This function used to be defined under the name add_lemmas_window5, which
    silently replaced the lemma version defined earlier in the notebook, so
    the lemma window columns could never be created. It now carries its own
    name.

    :param df: Dataframe with a 'POS' column (modified in place)

    :returns df: the same dataframe
    """
    # Previous tags first, then following ones, nearest neighbour first
    for step in (-1, -2, -3, -4, -5, 1, 2, 3, 4, 5):
        get_previous_or_following(df, 'POS', step=step)

    return df

In [11]:
def add_bigrams_prev(df):
    """
    Adds bigram columns that join the value of each row with the value of the
    previous row, separated by a space (current value first).

    Needs the columns 'token_-1', 'lemma_-1' and 'POS_-1' to be present.

    :param df: Dataframe to apply to (modified in place)

    :returns df: the same dataframe
    """
    for base in ('token', 'lemma', 'POS'):
        current = df[base]
        previous = df[f'{base}_-1']
        df[f'bigram_prev_{base}'] = current + ' ' + previous

    return df

In [12]:
def add_bigrams_following(df):
    """
    Adds bigram columns that join the value of each row with the value of the
    following row, separated by a space (current value first).

    Needs the columns 'token_+1', 'lemma_+1' and 'POS_+1' to be present.

    :param df: Dataframe to apply to (modified in place)

    :returns df: the same dataframe
    """
    for base in ('token', 'lemma', 'POS'):
        current = df[base]
        following = df[f'{base}_+1']
        df[f'bigram_following_{base}'] = current + ' ' + following

    return df

In [13]:
def shape(token):
    '''
    Takes token (str), and returns str of shape.

    Get short shape of token: every character is mapped to a symbol and runs
    of the same symbol are collapsed into one.
    lower = x
    upper = X
    digit = d
    other = o
    i.e
    cats -> x
    Cats -> Xx
    USoA -> XxX
    1999 -> d
    13dec19 -> dxd
    U.S.A -> XoXoX

    Anything that is not a string (for instance NaN) gets the shape 'o'.
    An empty string gets an empty shape.
    '''

    # To prevent breaking on NaN values. isinstance (rather than an exact
    # type comparison) so that str subclasses are still treated as text.
    if not isinstance(token, str):
        return 'o'

    # Shape symbols collected so far
    symbols = []

    for character in token:
        if character.isupper():
            symbol = 'X'
        elif character.islower():
            symbol = 'x'
        elif character.isdigit():
            symbol = 'd'
        else:
            symbol = 'o'

        # Only store the symbol when it differs from the previous one,
        # so that a run of the same kind of character yields one symbol
        if not symbols or symbols[-1] != symbol:
            symbols.append(symbol)

    return ''.join(symbols)


In [14]:
def add_shape(df):
    """
    Adds a 'shape' column with the short shape of every token.

    :param df: Dataframe with a 'token' column (modified in place)

    :returns df: the same dataframe
    """
    # Only the 'token' column is needed, so map over that column directly
    # instead of building a row object for every row of the dataframe
    df['shape'] = df['token'].map(shape)
    return df

In [28]:
def add_ne_info_window5(df, window=5):
    """
    Adds a 0/1 column telling whether a row lies within `window` rows
    (before or after) of a row whose 'relevant_ne' value is 1. Rows that are
    relevant themselves are flagged as well.

    Distances are counted in row positions, so the index labels of the
    dataframe do not matter.

    :param df: Dataframe with a 'relevant_ne' column (modified in place)
    :param window integer: number of rows to look at on each side, 5 by default

    :returns df: the same dataframe, with the column 'ne_+-<window>' added
        (that is 'ne_+-5' for the default window)
    """
    is_relevant = df['relevant_ne'].eq(1).astype('int64')

    # A centered rolling maximum over 2 * window + 1 rows is 1 exactly when
    # at least one relevant row lies within `window` rows of the current row.
    # min_periods=1 keeps the rows near the start and end of the frame.
    nearby = is_relevant.rolling(2 * window + 1, center=True, min_periods=1).max()

    # Assign the raw values so that the index labels play no role
    df[f'ne_+-{window}'] = nearby.astype('int64').to_numpy()

    return df

In [30]:
def add_relevant_ne(df):
    """
    Adds two columns derived from 'ne_info':

    - 'ne_short': the 'ne_info' value without its first two characters,
      e.g. 'B-PERSON' -> 'PERSON' and 'O' -> ''. Missing values stay missing.
    - 'relevant_ne': 1 when 'ne_short' is one of PERSON, ORG, GPE, LOC, NORP
      or FAC, otherwise 0.

    :param df: Dataframe with an 'ne_info' column (modified in place)

    :returns df: the same dataframe
    """
    relevant_ne = ['PERSON', 'ORG', 'GPE', 'LOC', 'NORP', 'FAC']

    # The .str accessor skips missing values instead of failing on them
    df['ne_short'] = df['ne_info'].str[2:]

    # One membership test for all labels; missing values count as not relevant
    df['relevant_ne'] = df['ne_short'].isin(relevant_ne).astype('int64')

    return df

Unnamed: 0,POS,dependency_head,dependency_label,doc_token_number,lemma,ne_info,sentence_number,sentence_token_number,token,cue_label,attribution,ne_short,relevant_ne,ne_+-5
0,NNP,9,nsubj,1,Pierre,B-PERSON,1,1,Pierre,0,0,PERSON,1,1
1,NNP,1,flat,2,Vinken,E-PERSON,1,2,Vinken,0,0,PERSON,1,1
2,",",1,punct,3,",",O,1,3,",",0,0,,0,1
3,CD,5,nummod,4,61,B-DATE,1,4,61,0,0,DATE,0,1
4,NNS,6,obl:npmod,5,year,I-DATE,1,5,years,0,0,DATE,0,1
5,JJ,1,amod,6,old,E-DATE,1,6,old,0,0,DATE,0,1
6,",",9,punct,7,",",O,1,7,",",0,0,,0,1
7,MD,9,aux,8,will,O,1,8,will,0,0,,0,0
8,VB,0,root,9,join,O,1,9,join,0,0,,0,0
9,DT,11,det,10,the,O,1,10,the,0,0,,0,0


In [29]:
# The window feature reads the 'relevant_ne' column, so create that column
# right here instead of relying on a cell that happened to be run earlier.
add_ne_info_window5(add_relevant_ne(df))

Unnamed: 0,POS,dependency_head,dependency_label,doc_token_number,lemma,ne_info,sentence_number,sentence_token_number,token,cue_label,attribution,ne_short,relevant_ne,ne_+-5
0,NNP,9,nsubj,1,Pierre,B-PERSON,1,1,Pierre,0,0,PERSON,1,1
1,NNP,1,flat,2,Vinken,E-PERSON,1,2,Vinken,0,0,PERSON,1,1
2,",",1,punct,3,",",O,1,3,",",0,0,,0,1
3,CD,5,nummod,4,61,B-DATE,1,4,61,0,0,DATE,0,1
4,NNS,6,obl:npmod,5,year,I-DATE,1,5,years,0,0,DATE,0,1
5,JJ,1,amod,6,old,E-DATE,1,6,old,0,0,DATE,0,1
6,",",9,punct,7,",",O,1,7,",",0,0,,0,1
7,MD,9,aux,8,will,O,1,8,will,0,0,,0,0
8,VB,0,root,9,join,O,1,9,join,0,0,,0,0
9,DT,11,det,10,the,O,1,10,the,0,0,,0,0


In [31]:
# Words to look for in the 'token' column; passed to add_lexicon_check below.
lexicon = ['said', 'says', 'according', 'publishing']

In [53]:
def add_lexicon_check(df, lexicon):
    """
    Adds a 0/1 column 'lexicon_check' telling whether the token of a row is
    literally one of the entries of the lexicon (exact, case-sensitive match).

    :param df: Dataframe with a 'token' column (modified in place)
    :param lexicon: iterable of strings to look for

    :returns df: the same dataframe
    """
    # A single membership test per row; the flag is decided row by row, so
    # rows that share an index label cannot influence each other
    df['lexicon_check'] = df['token'].isin(list(lexicon)).astype('int64')

    return df

In [54]:
# Flag every row of `df` whose token equals one of the entries in `lexicon`.
add_lexicon_check(df, lexicon)

Unnamed: 0,POS,dependency_head,dependency_label,doc_token_number,lemma,ne_info,sentence_number,sentence_token_number,token,cue_label,attribution,ne_short,relevant_ne,ne_+-5,lexicon_check
0,NNP,9,nsubj,1,Pierre,B-PERSON,1,1,Pierre,0,0,PERSON,1,1,0
1,NNP,1,flat,2,Vinken,E-PERSON,1,2,Vinken,0,0,PERSON,1,1,0
2,",",1,punct,3,",",O,1,3,",",0,0,,0,1,0
3,CD,5,nummod,4,61,B-DATE,1,4,61,0,0,DATE,0,1,0
4,NNS,6,obl:npmod,5,year,I-DATE,1,5,years,0,0,DATE,0,1,0
5,JJ,1,amod,6,old,E-DATE,1,6,old,0,0,DATE,0,1,0
6,",",9,punct,7,",",O,1,7,",",0,0,,0,1,0
7,MD,9,aux,8,will,O,1,8,will,0,0,,0,0,0
8,VB,0,root,9,join,O,1,9,join,0,0,,0,0,0
9,DT,11,det,10,the,O,1,10,the,0,0,,0,0,0
