In [1]:
import pprint
import json
import codecs
from nltk.tree import Tree
from collections import defaultdict

In [2]:
# Specify path to data to be analyzed (I entered the path to the tutorial data for now, so that the code runs)

with codecs.open ('relations.json', 'r', encoding = 'utf-8') as pdtb_file:
    

    # Assign all relations (a list) to a variable
    relations = [json.loads(x) for x in pdtb_file];

In [3]:
# Loop through relations and select only explicit relations:


def explicit_relations(relations):
    '''Extract all explicit relations from the relations file'''
    relations_explicit = []

    for relation in relations:
        if relation['Type'] == 'Explicit':
            relations_explicit.append(relation)
    return (relations_explicit)

In [4]:
ex_relations = explicit_relations(relations)

In [5]:
def ss_ps_relations(explicit_relations):
    
    '''Sort relations into same sentence and previous sentence relations'''

    relations_ss = []
    relations_ps = []
    #relations_other = []

    for relation in explicit_relations:
        sentence_id_arg1 = relation['Arg1']['TokenList'][0][3]
        sentence_id_connective = relation['Connective']['TokenList'][0][3]
        sentence_id_arg2 = relation['Arg2']['TokenList'][0][3]
    
        if sentence_id_arg1 == sentence_id_connective == sentence_id_arg2:
            relations_ss.append(relation)
        elif int(sentence_id_arg1) == int(sentence_id_connective) - 1 == int(sentence_id_arg2) -1:
            relations_ps.append(relation)
        #else: 
            #relations_other.append(relation)
            
    return relations_ss, relations_ps

In [6]:
ss_rel, ps_rel = ss_ps_relations(ex_relations)

In [7]:
my_relation = ss_rel[0]
pprint.pprint(my_relation)

{'Arg1': {'CharacterSpanList': [[1611, 1613], [1645, 1672]],
          'RawText': 'or receive cash from the funds',
          'TokenList': [[1611, 1613, 284, 11, 20],
                        [1645, 1652, 292, 11, 28],
                        [1653, 1657, 293, 11, 29],
                        [1658, 1662, 294, 11, 30],
                        [1663, 1666, 295, 11, 31],
                        [1667, 1672, 296, 11, 32]]},
 'Arg2': {'CharacterSpanList': [[1618, 1643]],
          'RawText': 'their jobs are terminated',
          'TokenList': [[1618, 1623, 287, 11, 23],
                        [1624, 1628, 288, 11, 24],
                        [1629, 1632, 289, 11, 25],
                        [1633, 1643, 290, 11, 26]]},
 'Connective': {'CharacterSpanList': [[1615, 1617]],
                'RawText': 'if',
                'TokenList': [[1615, 1617, 286, 11, 22]]},
 'DocID': 'wsj_0204',
 'ID': 3182,
 'Sense': ['Contingency.Condition'],
 'Type': 'Explicit'}


In [8]:
def get_data(relation, arg):
    
    """returns DocID, sentenceID, Argument raw text and sentence_token_id in relations.json"""
    
    relation_id = relation['ID']
    doc_id = relation['DocID']
    sentence_id_arg = relation[arg]['TokenList'][0][3]
    #arg_str = relation[arg]['RawText']   
    
    sentence_token_id_relations = []
    
    relations_token_list = relation[arg]['TokenList']
    
    for line in relations_token_list:
        sentence_token_id = line[4]
        sentence_token_id_relations.append(sentence_token_id)
    
    return relation_id, doc_id, sentence_id_arg,  sentence_token_id_relations


In [9]:
relid, docid, sid, stid = get_data(my_relation, 'Arg1')
print(get_data(my_relation, 'Arg1'))

(3182, 'wsj_0204', 11, [20, 28, 29, 30, 31, 32])


In [10]:
with codecs.open('parses.json', 'r', encoding = 'utf8') as parse_file:
        parses = json.load(parse_file)

Data structure:

1 dict for each argument

keys: sentence-token-ids of the entire sentence containing the argument (this is not ideal, we always want to look at the sentence of the connective and the previous sentence)

values: dics:

        label: IN / OUT
        token: token
        constituent: constituent
        parent constituent: parent constituent
        daughter constituent: daughter constituent
        POS: POS


In [11]:
def parsed_sentence_token_id(doc_id, sentence_id_arg, sentence_token_id_relations):
    
    """Enumerate words of a sentence in the parse file, so that they can be matched with the 
    sentence token ids in the relations file"""
    
    argument_id_tuples_list = []
    
    parsed_words_list = parses[doc_id]['sentences'][sentence_id_arg]['words']
    
    word_list = []
    
    word_dict = dict()
    
    for number, word in enumerate(parsed_words_list):
        word_dict[number] = dict()
        word_dict[number]['token'] = word[0]
        word_list.append(word[0])
        
        if number in sentence_token_id_relations:
            word_dict[number]['label'] = 'IN'
            
           
        
            argument_id_tuples_list.append((number, word[0]))
        else:
        
            word_dict[number]['label'] = 'OUT'
        
    return argument_id_tuples_list, word_dict, word_list

In [12]:
arg_token_ids, word_dict, word_list = parsed_sentence_token_id(docid, sid, stid)
#print(arg_token_ids)
#print(type(arg_token_ids))
pprint.pprint(word_dict)


{0: {'label': 'OUT', 'token': 'Under'},
 1: {'label': 'OUT', 'token': 'two'},
 2: {'label': 'OUT', 'token': 'new'},
 3: {'label': 'OUT', 'token': 'features'},
 4: {'label': 'OUT', 'token': ','},
 5: {'label': 'OUT', 'token': 'participants'},
 6: {'label': 'OUT', 'token': 'will'},
 7: {'label': 'OUT', 'token': 'be'},
 8: {'label': 'OUT', 'token': 'able'},
 9: {'label': 'OUT', 'token': 'to'},
 10: {'label': 'OUT', 'token': 'transfer'},
 11: {'label': 'OUT', 'token': 'money'},
 12: {'label': 'OUT', 'token': 'from'},
 13: {'label': 'OUT', 'token': 'the'},
 14: {'label': 'OUT', 'token': 'new'},
 15: {'label': 'OUT', 'token': 'funds'},
 16: {'label': 'OUT', 'token': 'to'},
 17: {'label': 'OUT', 'token': 'other'},
 18: {'label': 'OUT', 'token': 'investment'},
 19: {'label': 'OUT', 'token': 'funds'},
 20: {'label': 'IN', 'token': 'or'},
 21: {'label': 'OUT', 'token': ','},
 22: {'label': 'OUT', 'token': 'if'},
 23: {'label': 'OUT', 'token': 'their'},
 24: {'label': 'OUT', 'token': 'jobs'},
 25

In [13]:
def arg_rest_dict(word_dict):
    
    """"""
    
    arg_dict = dict()
    rest_dict = dict()
    

    for number, subdict in word_dict.items():
        
   
        
        if subdict['label'] == 'IN':
        
         
            arg_dict[number] = subdict['token']
        else:
            rest_dict[number] = subdict['token']
    
    return arg_dict, rest_dict
        
    
my_arg_dict, my_rest_dict = arg_rest_dict(word_dict)
pprint.pprint(my_rest_dict)

{0: 'Under',
 1: 'two',
 2: 'new',
 3: 'features',
 4: ',',
 5: 'participants',
 6: 'will',
 7: 'be',
 8: 'able',
 9: 'to',
 10: 'transfer',
 11: 'money',
 12: 'from',
 13: 'the',
 14: 'new',
 15: 'funds',
 16: 'to',
 17: 'other',
 18: 'investment',
 19: 'funds',
 21: ',',
 22: 'if',
 23: 'their',
 24: 'jobs',
 25: 'are',
 26: 'terminated',
 27: ',',
 33: '.'}


In [20]:
def discontinuous_chunk(arg_dict):
    
    """"""
    
    my_list = list(sorted(arg_dict.items()))
    
    
    slice_index = -1
    for number, pair in enumerate(my_list):

        token_id, token = pair
        print(token_id, token)
        token = pair[1]



        next_pair = my_list[number+1]
        
        next_token_id = next_pair[0]
        
        next_token = next_pair[1]
        #print(next_token_id, next_token)

        if (next_token_id - token_id) >=2:
            slice_index = number

            break
      

            
 
    return slice_index, my_list
print(len(my_rest_dict))    
slice_index, my_list = discontinuous_chunk(my_arg_dict)
print(slice_index)

28
20 or
0


In [21]:
def arg_list(slice_index, sorted_list):
    
    '''Return a list of arguments. In case of a discontinuous argument, the list will have two items, 
    in case of a continuous argument, it will only have one.'''
   
  
        
        
    if slice_index != -1:
    
        arg1_1 = sorted_list[:slice_index + 1]

        arg1_2 = sorted_list[slice_index + 1:]

        arg_str_list = [arg1_1, arg1_2]
        
    else:
        arg_str_list = [sorted_list]
    
    return arg_str_list

my_chunks = arg_list(slice_index, my_list)
print(my_chunks)
print(len(my_chunks))


[[(20, 'or')], [(28, 'receive'), (29, 'cash'), (30, 'from'), (31, 'the'), (32, 'funds')]]
2


In [22]:
def parsed_sentence(parses, doc_id, sentence_id_arg):
    '''Return the parsed sentence as a tree'''
    
    parsed_sentence = parses[doc_id]['sentences'][sentence_id_arg]['parsetree']
    
    tree_nltk = Tree.fromstring(parsed_sentence)
    

    
    return tree_nltk
    
    

In [24]:
my_tree = parsed_sentence(parses, docid, sid)
#print(my_tree)
my_chunk = my_chunks[1]
print(my_chunk)
len(my_chunk)

[(28, 'receive'), (29, 'cash'), (30, 'from'), (31, 'the'), (32, 'funds')]


5

I want to get the lowest constituent containing a chunk (either the entire argument, a part of the argument or 
a part of the sentence that does not belong to the argument).
I am trying to match the items in the constituent I find to the sentence token ids. 

In [51]:
def get_chunk_constituent(tree, chunk):
    '''Loop through the subtrees of the tree (low to high) and return the subtree if is the argument 
    or contains the argument'''
    
    chunk_t_list = []
   
    for pair in chunk:
        n = pair[0]
        t = pair[1]
        chunk_t_list.append(t)
        
    chunk_str = ' '.join(chunk_t_list)
    
        
    
    height = tree.height()
    
    status = 'not found'
    
    

    for h in range(height):

        if status == 'found':
            break

        for subtree in tree.subtrees(lambda t: t.height() == h):
            

            label = subtree.label()
            
            leaves = subtree.leaves()

            my_leaves = ' '.join(subtree.leaves())
          

            if chunk == leaves:


                my_sub = subtree.leaves()
                status = 'found'
                return my_sub, subtree.label()
                break


            elif chunk_str in my_leaves:
                
          

                my_sub = subtree.leaves()
                status = 'found'
                return my_sub, subtree.label()

                break
  
        
    

my_const, l = get_chunk_constituent(my_tree, my_chunk)
print(l, my_const)

VP ['receive', 'cash', 'from', 'the', 'funds']


In [48]:
A = [1,2,3,4,5,6,7]
B = [3,4,5]

if l2 in l1:
    print('yes')
    
if listA in listB: 
    return True

SyntaxError: 'return' outside function (<ipython-input-48-39717d989393>, line 8)

In [19]:
def parent_const(tree, subtree):
    
    '''Returns parent constituent of the constituent that is the argument or the constituent that
    contains the argument'''
    
    tree_leaves = ' '.join(tree.leaves())
    
    height_const = subtree.height()
    height_total = tree.height()

    const_leaves = ' '.join(subtree.leaves())
  

    status = 'not found'

    for n in range(height_const, height_total):
        if status == 'found':
            break

        for sub in tree.subtrees((lambda t: t.height() == n)):

            label = subtree.label()


            my_leaves = ' '.join(sub.leaves())
    


        if (const_leaves in my_leaves) and (const_leaves != my_leaves):

            my_sub = sub
            status = 'found'
            return label, my_sub.leaves()
            break
        
    parent_list = my_sub.leaves()
    
    
                
           
print(parent_const(my_tree, my_const))

AttributeError: 'list' object has no attribute 'height'

In [None]:
def daughters(subtree):
    
    label = subtree.label()
    original_tree = str(subtree.flatten()).lstrip('('+label).rstrip(')').strip()
 
    
    
    
    h = subtree.height()
    
    daughters_list = []
    
    rest_str = str(subtree.flatten()).lstrip('('+label).rstrip(')').strip()
    
    
    
    for sub in subtree.subtrees():
        l = sub.label()

        my_sub = str(sub.flatten()).lstrip('('+l).rstrip(')').strip()

        
        rest_list = rest_str.split()
        
        
        if (my_sub in rest_str) and (my_sub != original_tree):
        
            
            if len(my_sub) > 2 :
            
                daughters_list.append((my_sub, l))

                rest_str = rest_str.replace(my_sub, '').strip()

               
                
        elif len(my_sub) <= 2 and my_sub in rest_list:
            
            daughters_list.append((my_sub, l))
            rest_str = rest_str.replace(my_sub, '').strip()

            
        
    return(daughters_list)
    
daughters(subtree)
#print(my_arg[1])

In [None]:


for relation in ss_rel[20:30]:
    
    relid, docid, sid, stid = get_data(relation, 'Arg1')
    arg_token_ids = parsed_sentence_token_id(docid, sid, stid)
    slice_index = discontinuous_arg(arg_token_ids)
    my_args = arg_list(slice_index, arg_token_ids)
    
    for arg in my_args:
        my_tree = parsed_sentence(parses, docid, sid)

        subtree, height, total_height = get_argument_constituent(my_tree, arg)
        print(str(subtree.flatten()))
    
    