I worked with this notbeook inside the development data folder, so if you would like to run the code, download it and put it in the right folder (or change the paths in the notebook).

In [1]:
import pprint
import json
import codecs
from nltk.tree import Tree
from collections import defaultdict

In [2]:
# Specify path to data to be analyzed (I entered the path to the tutorial data for now, so that the code runs)

with codecs.open ('tutorial/conll16st-en-01-12-16-trial/relations.json', 'r', encoding = 'utf-8') as pdtb_file:
    

    # Assign all relations (a list) to a variable
    relations = [json.loads(x) for x in pdtb_file];

In [4]:
# Loop through relations and select only explicit relations:

# List for explicit relations:

relations_explicit = []

for relation in relations:
    if relation['Type'] == 'Explicit':
        relations_explicit.append(relation)
        
print(len(relations_explicit))
pprint.pprint(relations_explicit[0])

13
{'Arg1': {'CharacterSpanList': [[783, 877]],
          'RawText': 'Several big securities firms backed off from program '
                     'trading a few months after the 1987 crash',
          'TokenList': [[783, 790, 143, 4, 0],
                        [791, 794, 144, 4, 1],
                        [795, 805, 145, 4, 2],
                        [806, 811, 146, 4, 3],
                        [812, 818, 147, 4, 4],
                        [819, 822, 148, 4, 5],
                        [823, 827, 149, 4, 6],
                        [828, 835, 150, 4, 7],
                        [836, 843, 151, 4, 8],
                        [844, 845, 152, 4, 9],
                        [846, 849, 153, 4, 10],
                        [850, 856, 154, 4, 11],
                        [857, 862, 155, 4, 12],
                        [863, 866, 156, 4, 13],
                        [867, 871, 157, 4, 14],
                        [872, 877, 158, 4, 15]]},
 'Arg2': {'CharacterSpanList': [[883, 957]],
    

In [9]:
# Split in two lists according to arg1 in same sentence and arg1 in previous sentence:

relations_ss = []
relations_ps = []
relations_other = []

for relation in relations_explicit:
    sentence_id_arg1 = relation['Arg1']['TokenList'][0][3]
    sentence_id_connective = relation['Connective']['TokenList'][0][3]
    sentence_id_arg2 = relation['Arg2']['TokenList'][0][3]
    
    if sentence_id_arg1 == sentence_id_connective == sentence_id_arg2:
        relations_ss.append(relation)
    elif int(sentence_id_arg1) == int(sentence_id_connective) - 1 == int(sentence_id_arg2) -1:
        relations_ps.append(relation)
    else: 
        relations_other.append(relation)
print(len(relations_ps))
print(len(relations_ss))

3
9


In [17]:
# Open parse file:

with codecs.open('tutorial/conll16st-en-01-12-16-trial/parses.json', 'r', encoding = 'utf8') as parse_file:
    parses = json.load(parse_file)

In [95]:
# NOT NEEDED 

def normalize_str(text):
    """"""
    
    text = text.lstrip('(')
    
    text = text.rstrip(')')
    text_list = text.split()
    
    clean_text_list = []
    
    for word in text_list:
        word = word.strip()
        clean_text_list.append(word)
    
    clean_text = ' '.join(clean_text_list)
    
    return(clean_text)

my_rel = relations_ss[7]

arg1_str = my_rel['Arg1']['RawText']
print(arg1_str)
print(normalize_str(arg1_str))
print(normalize_str("(rtc's! Work\n didn t doesn t $50"))

We would stop index arbitrage when the market is under stress
We would stop index arbitrage when the market is under stress
rtc's! Work didn t doesn t $50


In [85]:
def get_const_type(constituent_list, constituent_name):
    """
    Input: set of constituents, name of a constituent type (str)
    Output: list of constituents of the selected type 
    """
    const_type_list = []
    
    for constituent in constituent_list:
        
        
        if constituent.startswith('('+constituent_name):
            const_type_list.append(constituent)
    
    return const_type_list
            
# E.g. get clauses:   

In [113]:
def get_subtrees(parsed_sentence, const_type = 0):
    """"""
    tree_nltk = Tree.fromstring(parsed_sentence)
    
    tree_height = tree_nltk.height()
    
        
    const_tuple_list = []

    for s in tree_nltk.subtrees():

        label = s.label()
        
        if (label != '``') and (label != ''):
            
            subtree_tokens_str = str(s.flatten())
            #print(type(subtree_tokens_str))
            clean_subtree_str = normalize_str(subtree_tokens_str).lstrip(label).lstrip(' ')



            if const_type != 0:
                if label == const_type:
                    const_tuple_list.append((label, clean_subtree_str))

            else:

                const_tuple_list.append((label, clean_subtree_str))



    return const_tuple_list

doc_id = my_rel['DocID']
sentence_id_arg1 = my_rel['Arg1']['TokenList'][0][3]
parsed_sentence = parses[doc_id]['sentences'][sentence_id_arg1]['parsetree']  

consts = get_subtrees(parsed_sentence)

pprint.pprint(consts)

[('S',
  "They said , ` Too bad , ' so we finally said we 're not going to do "
  "business with them . ''"),
 ('S', 'They said , ` Too bad'),
 ('NP', 'They'),
 ('PRP', 'They'),
 ('VP', 'said , ` Too bad'),
 ('VBD', 'said'),
 (',', ','),
 ('ADJP', 'Too bad'),
 ('RB', 'Too'),
 ('JJ', 'bad'),
 (',', ','),
 ("''", "'"),
 ('IN', 'so'),
 ('S', "we finally said we 're not going to do business with them"),
 ('NP', 'we'),
 ('PRP', 'we'),
 ('ADVP', 'finally'),
 ('RB', 'finally'),
 ('VP', "said we 're not going to do business with them"),
 ('VBD', 'said'),
 ('SBAR', "we 're not going to do business with them"),
 ('S', "we 're not going to do business with them"),
 ('NP', 'we'),
 ('PRP', 'we'),
 ('VP', "'re not going to do business with them"),
 ('VBP', "'re"),
 ('RB', 'not'),
 ('VP', 'going to do business with them'),
 ('VBG', 'going'),
 ('S', 'to do business with them'),
 ('VP', 'to do business with them'),
 ('TO', 'to'),
 ('VP', 'do business with them'),
 ('VB', 'do'),
 ('NP', 'business'),
 ('

In [114]:
def get_data(relation, arg):
    
    """returns DocID, sentenceID, Argument raw text and sentence_token_id in relations.json"""
    
    doc_id = relation['DocID']
    sentence_id_arg = relation[arg]['TokenList'][0][3]
    arg_str = relation[arg]['RawText']   
    
    sentence_token_id_relations = []
    
    relations_token_list = relation[arg]['TokenList']
    
    for line in relations_token_list:
        sentence_token_id = line[4]
        sentence_token_id_relations.append(sentence_token_id)
    
    return doc_id, sentence_id_arg, arg_str, sentence_token_id_relations

my_rel = relations_ss[3]
my_id, my_sid, my_arg, my_st = get_data(my_rel, 'Arg1')
print(my_id, my_st)

wsj_1000 [0, 1, 2, 3, 4, 5]


In [115]:
def parse_sentence_token_id(doc_id, sentence_id_arg, sentence_token_id_relations):
    """"""
    
    argument_id_tuples_list = []
    
    parsed_words_list = parses[doc_id]['sentences'][sentence_id_arg]['words']
    
    for number, word in enumerate(parsed_words_list):
        
        if number in sentence_token_id_relations:
        
            argument_id_tuples_list.append((number, word[0]))
        
        
    return argument_id_tuples_list

parse_sentence_token_id('wsj_1000', 22, my_st)

# Looks good, if you compare these ids to the sentence_token_ids taken from relations.json. 
# Covers the span of the argument

[(0, '``'), (1, 'The'), (2, 'impression'), (3, 'I'), (4, "'ve"), (5, 'got')]

In [116]:
def get_arg_list(relation, arg):
    
    """"""
    
    doc_id, sentence_id_arg, arg_str, sentence_token_id_relations = get_data(relation, arg)

    
    sentence_token_id_parse_tuples = parse_sentence_token_id(doc_id, sentence_id_arg, sentence_token_id_relations)

    # check if argument is interrupted by comparing to the sentence token ids in the relations.json file
    
    for index, pair in enumerate(sentence_token_id_parse_tuples):
        if pair == sentence_token_id_parse_tuples[-1]:
            break

        next_pair = sentence_token_id_parse_tuples[index + 1]


        if (next_pair[0] - pair[0]) > 1:
            slice_index = index
            
            break
        else:
            slice_index = None
    
    argument_tokens_as_in_parsing = []
    
    for st_id, token in sentence_token_id_parse_tuples:
        
        argument_tokens_as_in_parsing.append(token)  
    # Get the argument as either 1 string in a list (if it is not interrupted) or as 2 strings in a list
    # if it is interrupted

    if slice_index:
        arg1_1 = ' '.join(argument_tokens_as_in_parsing[:slice_index + 1])

        arg1_2 = ' '.join(argument_tokens_as_in_parsing[slice_index + 1:])

        arg_str_list = [arg1_1, arg1_2]
    else:

        arg_str_list = [' '.join(argument_tokens_as_in_parsing)]
    
    
    return arg_str_list

for relation in relations_ss:
    print(get_arg_list(relation, 'Arg2'))

["it scares natural buyers '' of stock"]
["Mr. Timbers explained he 's `` not totally convinced index arbitrage changes the overall level of the stock market"]
["they 're ruining the market"]
["we finally said we 're not going to do business with them"]
["they -LCB- the exchange -RCB- ca n't do it"]
["that all the exchange can do is `` slow down the process '' by using its circuit breakers and shock absorbers"]
['the market is under stress']
['we have recently']
['But it may become much more important']


In [117]:
# Map arguments to constituents

def constituent_structure(relation, arg):
    """"""

    # Dicts with argument as key and constituent as value (tuple: (label, constituent))
    arg_is_const = defaultdict(list)
    arg_starts_const = defaultdict(list)
    arg_in_const = defaultdict(list)
    const_part_arg = defaultdict(list)
    
    
    arg_list = get_arg_list(relation, arg)
    
    doc_id, sentence_id_arg, arg_str, sentence_token_id_relations = get_data(relation, arg)


    sentence = parses[doc_id]['sentences'][sentence_id_arg]['parsetree']

    const_list = get_subtrees(sentence)


    for argument in arg_list:
        




        for constituent in const_list:

            label, const = constituent
            
            


            # Constituents the argument is part of:

            if const == argument:

                arg_is_const[argument].append((label, const))




            elif const.startswith(argument):

                arg_starts_const[argument].append((label, const))


            elif argument in const:

                arg_in_const[argument].append((label, const))



            # Constituents that are part of the argument

            if const in argument:
                const_part_arg[argument].append((label, const))
                
    return arg_is_const, arg_starts_const, arg_in_const, const_part_arg



In [118]:
constituent_structure(relations_ss[0], 'Arg1')

(defaultdict(list,
             {"Index arbitrage does n't work": [('S',
                "Index arbitrage does n't work")]}),
 defaultdict(list, {}),
 defaultdict(list,
             {"Index arbitrage does n't work": [('S',
                "`` Index arbitrage does n't work , and it scares natural buyers '' of stock .")]}),
 defaultdict(list,
             {"Index arbitrage does n't work": [('S',
                "Index arbitrage does n't work"),
               ('NP', 'Index arbitrage'),
               ('NN', 'Index'),
               ('NN', 'arbitrage'),
               ('VP', "does n't work"),
               ('VBZ', 'does'),
               ('RB', "n't"),
               ('VP', 'work'),
               ('VB', 'work'),
               ('NP', 'it'),
               ('PRP', 'it')]}))