# Arg classifier

In [206]:
import json
from nltk.tree import Tree
from nltk.tree import ParentedTree
from collections import defaultdict

In [207]:
with open('conll16st-en-01-12-16-train/relations.json', encoding='utf-8') as pdtb_file:
    relations = [json.loads(x) for x in pdtb_file];
    
with open('conll16st-en-01-12-16-train/parses.json', encoding='utf8') as parse_file:
    parses = json.load(parse_file)

In [208]:
# We only want explicit relations:

def explicit_relations(relations):
    '''Extract all explicit relations from the relations file'''
    relations_explicit = []

    for relation in relations:
        if relation['Type'] == 'Explicit':
            relations_explicit.append(relation)
    return (relations_explicit)

ex_relations = explicit_relations(relations)

In [209]:
# Seperate them into PS and SS relations

def ss_ps_relations(explicit_relations):
    '''
    Sort relations into same sentence and previous sentence relations
    '''
    
    relations_ss = []
    relations_ps = []
    #relations_other = []

    for relation in explicit_relations:
        sentence_id_arg1 = relation['Arg1']['TokenList'][0][3]
        sentence_id_connective = relation['Connective']['TokenList'][0][3]
        sentence_id_arg2 = relation['Arg2']['TokenList'][0][3]
    
        if sentence_id_arg1 == sentence_id_connective == sentence_id_arg2:
            relations_ss.append(relation)
        elif int(sentence_id_arg1) == int(sentence_id_connective) - 1 == int(sentence_id_arg2) -1:
            relations_ps.append(relation)
        #else: 
            #relations_other.append(relation)
            
    return relations_ss, relations_ps

ss_relations, ps_relations = ss_ps_relations(ex_relations)

In [210]:
def get_phrase_structure(docID, sentenceID):
    """
    Retrieve phrase_structure from the parses.json file by filename and sentenceID.
    """
         
    sentencelist = parses[docID]["sentences"]    
    phrase_structure = sentencelist[sentenceID]["parsetree"]
    
    return phrase_structure

# Output format

In [211]:
def get_constituent(docID,sentenceID,index,pos,token):
    """
    Finds parent node of input token. 
    """
    
    phrase_structure = get_phrase_structure(docID, sentenceID)    
    tree = Tree.fromstring(phrase_structure)    
    tree = ParentedTree.convert(tree)
    
    for subtree in tree.subtrees(filter=lambda x:
                                 x.label() == pos and x[0] == token):
        return subtree.parent().label()

#get_constituent("wsj_0204",6,0,"DT","The")

### SS relations

In [221]:
tuplelist = []

for relation in ss_relations[:20]:
    
    tokenlist = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID) #should be the same
    
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)

        
    # Every word that precedes the connective
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"][:connective_index]
    
    for index, word in enumerate(word_data):
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        tokenlist.append(((word[0],argpart),pos,constituent,connective))
            
    tuplelist.append(tokenlist)
    


In [222]:
tuplelist

[[(('Under', False), 'IN', 'PP', 'if'),
  (('two', False), 'CD', 'NP', 'if'),
  (('new', False), 'JJ', 'NP', 'if'),
  (('features', False), 'NNS', 'NP', 'if'),
  ((',', False), ',', 'S', 'if'),
  (('participants', False), 'NNS', 'NP', 'if'),
  (('will', False), 'MD', 'VP', 'if'),
  (('be', False), 'VB', 'VP', 'if'),
  (('able', False), 'JJ', 'ADJP', 'if'),
  (('to', False), 'TO', 'VP', 'if'),
  (('transfer', False), 'VB', 'VP', 'if'),
  (('money', False), 'NN', 'NP', 'if'),
  (('from', False), 'IN', 'PP', 'if'),
  (('the', False), 'DT', 'NP', 'if'),
  (('new', False), 'JJ', 'NP', 'if'),
  (('funds', False), 'NNS', 'NP', 'if'),
  (('to', False), 'TO', 'VP', 'if'),
  (('other', False), 'JJ', 'NP', 'if'),
  (('investment', False), 'NN', 'NP', 'if'),
  (('funds', False), 'NNS', 'NP', 'if'),
  (('or', True), 'CC', 'VP', 'if'),
  ((',', False), ',', 'S', 'if')],
 [(('Solo', True), 'NNP', 'NP', 'if'),
  (('woodwind', True), 'JJ', 'NP', 'if'),
  (('players', True), 'NNS', 'NP', 'if'),
  (('hav

### PS relations

In [218]:
tuplelist = []

for relation in ps_relations[:2]:
    
    tokenlist = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID)
    
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)

        
    # Every word in the sentence 
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"]
    
    for index, word in enumerate(word_data):
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        tokenlist.append(((word[0],argpart),pos,constituent,connective))
            
    tuplelist.append(tokenlist)
    


In [219]:
tuplelist

[[(('The', True), 'DT', 'NP', 'Also'),
  (('new', True), 'JJ', 'NP', 'Also'),
  (('``', True), '``', 'NP', 'Also'),
  (('social', True), 'JJ', 'NP', 'Also'),
  (('choice', True), 'NN', 'NP', 'Also'),
  (("''", True), "''", 'NP', 'Also'),
  (('fund', True), 'NN', 'NP', 'Also'),
  (('will', True), 'MD', 'VP', 'Also'),
  (('shun', True), 'VB', 'VP', 'Also'),
  (('securities', True), 'NNS', 'NP', 'Also'),
  (('of', True), 'IN', 'PP', 'Also'),
  (('companies', True), 'NNS', 'NP', 'Also'),
  (('linked', True), 'VBN', 'VP', 'Also'),
  (('to', True), 'TO', 'PP', 'Also'),
  (('South', True), 'NNP', 'NP', 'Also'),
  (('Africa', True), 'NNP', 'NP', 'Also'),
  ((',', True), ',', 'NP', 'Also'),
  (('nuclear', True), 'JJ', 'NP', 'Also'),
  (('power', True), 'NN', 'NP', 'Also'),
  (('and', True), 'CC', 'NP', 'Also'),
  (('in', True), 'IN', 'PP', 'Also'),
  (('some', True), 'DT', 'NP', 'Also'),
  (('cases', True), 'NNS', 'NP', 'Also'),
  ((',', True), ',', 'NP', 'Also'),
  (('Northern', True), 'NNP', 