In [10]:
import pprint
import json
import codecs
from nltk.tree import Tree

In [17]:
# View data structure: relations.json is read line by line, one JSON object per line.
# The result is a list of dicts that can have dicts or lists as values.

# The built-in open() is used rather than codecs.open(): codecs.open() is deprecated
# as of Python 3.14, and iterating over its file object also breaks lines at characters
# such as U+0085 or U+2028, which would cut a JSON record in two.
with open('tutorial/conll16st-en-01-12-16-trial/relations.json', 'r', encoding='utf-8') as pdtb_file:

    # Assign all relations (a list) to a variable.
    # Blank lines are skipped so that they cannot make json.loads() raise.
    relations = [json.loads(line) for line in pdtb_file if line.strip()]

# Assign one relation to a variable
example_relation = relations[10]

# Dict
print(type(example_relation))
pprint.pprint(example_relation)

# Assign parts of the relation to variables.
# 'Arg1', 'Arg2' and 'Connective' are dicts with the keys
# 'CharacterSpanList', 'RawText' and 'TokenList'.
arg1 = example_relation['Arg1']
arg2 = example_relation['Arg2']
connective = example_relation['Connective']

# Element [3] of a TokenList entry is used as the sentence id; only the first token of
# each part is looked at, so every TokenList has to contain at least one token.
sentence_id_arg1 = arg1['TokenList'][0][3]
sentence_id_arg2 = arg2['TokenList'][0][3]
sentence_id_connective = connective['TokenList'][0][3]
# We can easily see which relations have arg1 in the previous sentence and which in the
# same sentence by comparing the sentence ids of arg1 and connective or arg2.

# Other keys:

DocID = example_relation['DocID']
relationID = example_relation['ID']
relationtype = example_relation['Type']
    


<class 'dict'>
{'Arg1': {'CharacterSpanList': [[2493, 2517]],
          'RawText': 'and told them to cool it',
          'TokenList': [[2493, 2496, 465, 15, 8],
                        [2497, 2501, 466, 15, 9],
                        [2502, 2506, 467, 15, 10],
                        [2507, 2509, 468, 15, 11],
                        [2510, 2514, 469, 15, 12],
                        [2515, 2517, 470, 15, 13]]},
 'Arg2': {'CharacterSpanList': [[2526, 2552]],
          'RawText': "they're ruining the market",
          'TokenList': [[2526, 2530, 472, 15, 15],
                        [2530, 2533, 473, 15, 16],
                        [2534, 2541, 474, 15, 17],
                        [2542, 2545, 475, 15, 18],
                        [2546, 2552, 476, 15, 19]]},
 'Connective': {'CharacterSpanList': [[2518, 2525]],
                'RawText': 'because',
                'TokenList': [[2518, 2525, 471, 15, 14]]},
 'DocID': 'wsj_1000',
 'ID': 14887,
 'Sense': ['Contingency.Cause.Reason'],
 '

In [7]:
# Open the JSON file containing the parses and assign its content to a variable.
# The built-in open() replaces codecs.open(), which is deprecated as of Python 3.14.
with open('tutorial/conll16st-en-01-12-16-trial/parses.json', 'r', encoding='utf-8') as parse_file:
    parses = json.load(parse_file)

# Explore data structure
print(type(parses))

# Take a look at the structure (left disabled; uncomment to print the first item):
#pprint.pprint(list(parses.items())[0])

<class 'dict'>


In [8]:
# Work with the example relation defined above.

# Print the parse tree of the sentence that contains Arg1 of the example relation.
# For this relation Arg1, the connective and Arg2 all carry the same sentence id,
# so this single tree covers the whole relation.
print(parses[DocID]['sentences'][sentence_id_arg1]['parsetree'])

( (S (NP (PRP We)) (VP (VBP 've) (VP (VP (VBN talked) (PP (TO to) (NP (NP (NNS proponents)) (PP (IN of) (NP (NN index) (NN arbitrage)))))) (CC and) (VP (VBD told) (NP (PRP them)) (S (VP (TO to) (VP (VB cool) (NP (PRP it)) (SBAR (IN because) (S (NP (PRP they)) (VP (VBP 're) (VP (VBG ruining) (NP (DT the) (NN market)))))))))))) (. .)) )



In [61]:
# Get the words and POS tags of the sentence that contains Arg1 of the example relation.

example_sentence = parses[DocID]['sentences'][sentence_id_arg1]
#pprint.pprint(example_sentence['words'])

# Explore the data structure: print the type of 'words' and the type of
# element [1] of its first entry.
print(type(example_sentence['words']))
print(type(example_sentence['words'][0][1]))

# Take a look at the full data structure (left disabled; uncomment to print it):
#pprint.pprint(example_sentence['words'])
    
##################################

# Function to get all tokens and POS tags of a sentence

def get_pos(doc_id, sentence_id):
    """
    Collect the tokens of one sentence together with their POS tags.

    Input: document ID (key into the global `parses`), sentence ID (index into
           that document's 'sentences')
    Output: a list of (token, pos) tuples, in the order of the sentence's 'words'
    """
    # Every entry of 'words' holds the token at position 0 and, at position 1,
    # a dict from which only the 'PartOfSpeech' value is read.
    words = parses[doc_id]['sentences'][sentence_id]['words']

    return [(entry[0], entry[1]['PartOfSpeech']) for entry in words]


# Show the (token, pos) tuples of the example sentence as the output of this cell.
# Such a list can then be added to a dict with sentence ids as keys and lists of
# (token, pos) tuples as values.
get_pos(DocID, sentence_id_arg1)

<class 'list'>
<class 'dict'>


[('We', 'PRP'),
 ("'ve", 'VBP'),
 ('talked', 'VBN'),
 ('to', 'TO'),
 ('proponents', 'NNS'),
 ('of', 'IN'),
 ('index', 'NN'),
 ('arbitrage', 'NN'),
 ('and', 'CC'),
 ('told', 'VBD'),
 ('them', 'PRP'),
 ('to', 'TO'),
 ('cool', 'VB'),
 ('it', 'PRP'),
 ('because', 'IN'),
 ('they', 'PRP'),
 ("'re", 'VBP'),
 ('ruining', 'VBG'),
 ('the', 'DT'),
 ('market', 'NN'),
 ('.', '.')]

In [56]:
# Explore parse tree

#example_tree = parses[DocID]['sentences'][sentence_id_arg1]['parsetree']

#print(example_tree)
#print(type(example_tree))

# Load string as tree:

#tree_nltk = Tree.fromstring(example_tree)
#print(type(tree_nltk))

# Parsing the tree:

#print(tree_nltk.flatten())

# Draw your tree :-) 
#tree_nltk.draw() 
#tree_nltk.freeze()

# Height of the tree


# Tree.subtrees() called without a filter already walks every subtree (i.e. constituent)
# of the tree, so one pass is enough; there is no need to loop over all possible heights
# and filter the subtrees for each of them.
# We want all possible constituents (maybe not the smallest ones consisting of only one
# word, but they are not excluded here).


##############################

# Function to extract all constituents:

def get_constituents(doc_id, sentence_id):
    """
    Input: document ID (str), sentence ID (int, index into the document's sentences)
    Output: a set with the string form of every constituent of the sentence,
            i.e. of every subtree of the parse tree except the root node itself
    """

    tree = parses[doc_id]['sentences'][sentence_id]['parsetree']

    tree_nltk = Tree.fromstring(tree)

    # The root itself is not reported: in the parse trees printed in this notebook it is
    # only the unlabeled wrapper "( ... )" around the top-level S node.
    return {str(subtree) for subtree in tree_nltk.subtrees() if subtree is not tree_nltk}

# Look at individual constituents:
# get_constituents returns a set, so it is converted to a list to allow indexing.
constituents = list(get_constituents(DocID, sentence_id_arg1))

# NOTE: the list comes from a set, so which constituent sits at a given index may
# differ between runs.
#print(constituents[11])


# Get only constituents of a particular type:

def get_const_type(constituent_list, constituent_name):
    """
    Get only the constituents of a particular type.

    Input: list of constituents in bracketed string form (e.g. '(NP (PRP they))'),
           constituent label (str, e.g. 'SBAR')
    Output: a list with those constituents whose top-level label is exactly
            constituent_name, in their original order
    """
    prefix = '(' + constituent_name
    const_type_list = []

    for constituent in constituent_list:
        if not constituent.startswith(prefix):
            continue

        # A bare prefix test is not enough: '(S' is also the start of '(SBAR ...' and
        # '(NN' the start of '(NNS ...', so the label has to end right after the prefix
        # (followed by whitespace, a closing bracket, or nothing at all).
        remainder = constituent[len(prefix):]
        if not remainder or remainder[0].isspace() or remainder[0] == ')':
            const_type_list.append(constituent)

    return const_type_list
            
# E.g. get clauses (the constituents labelled 'SBAR') and pretty-print them:
pprint.pprint(get_const_type(constituents, 'SBAR'))

['(SBAR\n'
 '  (IN because)\n'
 '  (S\n'
 '    (NP (PRP they))\n'
 "    (VP (VBP 're) (VP (VBG ruining) (NP (DT the) (NN market))))))"]


# Data Structure:

Dictionary with unique ID
    For each relation:
        Connector
        Form
        PoS
        Constituents
    Previous sentence
        Tokens
        Lemma
        PoS
        Constituents ([CAT, [w1, .. ,wn]])
        Dependency
    Current sentence
        Tokens
        Lemma
        PoS
        Constituents
        Dependency
    Gold Arg1
    Gold Arg2
    Relation
