# Arg classifier

In [1]:
import json
from nltk.tree import Tree
from nltk.tree import ParentedTree
from collections import defaultdict

In [2]:
with open('conll16st-en-01-12-16-train/relations.json', encoding='utf-8') as pdtb_file:
    relations = [json.loads(x) for x in pdtb_file];
    
with open('conll16st-en-01-12-16-train/parses.json', encoding='utf8') as parse_file:
    parses = json.load(parse_file)

In [3]:
# We only want explicit relations:

def explicit_relations(relations):
    '''Extract all explicit relations from the relations file'''
    relations_explicit = []

    for relation in relations:
        if relation['Type'] == 'Explicit':
            relations_explicit.append(relation)
    return (relations_explicit)

ex_relations = explicit_relations(relations)

In [4]:
# Seperate them into PS and SS relations

def ss_ps_relations(explicit_relations):
    '''
    Sort relations into same sentence and previous sentence relations
    '''
    
    relations_ss = []
    relations_ps = []
    #relations_other = []

    for relation in explicit_relations:
        sentence_id_arg1 = relation['Arg1']['TokenList'][0][3]
        sentence_id_connective = relation['Connective']['TokenList'][0][3]
        sentence_id_arg2 = relation['Arg2']['TokenList'][0][3]
    
        if sentence_id_arg1 == sentence_id_connective == sentence_id_arg2:
            relations_ss.append(relation)
        elif int(sentence_id_arg1) == int(sentence_id_connective) - 1 == int(sentence_id_arg2) -1:
            relations_ps.append(relation)
        #else: 
            #relations_other.append(relation)
            
    return relations_ss, relations_ps

ss_relations, ps_relations = ss_ps_relations(ex_relations)

In [5]:
def get_phrase_structure(docID, sentenceID):
    """
    Retrieve phrase_structure from the parses.json file by filename and sentenceID.
    """
         
    sentencelist = parses[docID]["sentences"]    
    phrase_structure = sentencelist[sentenceID]["parsetree"]
    
    
    return phrase_structure

In [6]:
def enumerate_parsetree(parse_tree_sentence):
    
    
    for number, subtree in enumerate(parse_tree_sentence.subtrees(lambda t: t.height() == 2)):
            subtree.set_label(number)

    return parse_tree_sentence

In [7]:
def find_constituent(parse_tree_sentence):

    constituent_dict = {'VP': [], 'NP': [], 'PP': [], 'SBAR': [], 'SINV': [], 'S': []}
    
    parse_tree_sentence = enumerate_parsetree(parse_tree_sentence)
    len_sentence = len(parse_tree_sentence)

    for subtree in parse_tree_sentence.subtrees():
        label = subtree.label()
        pos_list = subtree.pos()
        len_subtree = len(subtree)

        if label == 'SBAR':
            constituent_dict['SBAR'].append(pos_list)
            
        elif (label == 'S') and (len_subtree < len_sentence):
             constituent_dict['S'].append(pos_list)

        elif label == 'VP':
             constituent_dict['VP'].append(pos_list)

        elif label == 'SINV':
             constituent_dict['SINV'].append(pos_list)

        elif label == 'NP':
             constituent_dict['NP'].append(pos_list)

        elif label == 'PP':
             constituent_dict['PP'].append(pos_list)

    return constituent_dict

# Output format

In [8]:
def get_constituent(docID,sentenceID,index,pos,token):
    """
    Finds parent node of input token. 
    """
    
    phrase_structure = get_phrase_structure(docID, sentenceID)    
    tree = Tree.fromstring(phrase_structure)    
    tree = ParentedTree.convert(tree)
    
    for subtree in tree.subtrees(filter=lambda x:
                                 x.label() == pos and x[0] == token):
        return subtree.parent().label()

get_constituent("wsj_0204",6,0,"DT","The")

'NP'

In [9]:
def get_arg_constituent():
    pass

In [21]:
featuresdict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

### SS relations

In [14]:
tuplelist = []
featurelist = []

for relation in ss_relations:
    
    tokenlist = []
    sentencefeatures = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID) #should be the same
    
    # Get POS here, because I change the pos tags later 
    phrase  = get_phrase_structure(docID, arg1_sentenceID)
    phrase_tree = Tree.fromstring(phrase)
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)
    
    # Get argument of connective (this cannot be done on token level)
    
    
    # Get lists of tokens in the following constituents (which are most freuqently containing constituents):
    
   
    constituent_dict = find_constituent(phrase_tree)
    
    
    
    
        
    # Every word that precedes the connective (list of tokens preceding the connective)
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"][:connective_index]
    
   
    for index, word in enumerate(word_data):
        
        
        features = dict()
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        #constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        # Add the most frequent constituents as features (each token is part or not part of SBAR_list, VP_list, SINV_list, NP_list, PP_list)
        
        for constituent, word_list in constituent_dict.items():
            # Set of words belonging to one constituent:
            word_set = set()
            # loop through the lists that contain the token-position pairs of each constituent
            for const_list in word_list:
                for word, n in const_list: 
                    # Add each position to a set, so the postions occur only once 
                    word_set.add(n)
            if index in word_set:
                # If the word is in one of the constituent sets, give it the value 'true', else give it the values 'false'
                features[constituent] = True
            else:
                features[constituent] = False
    
        
     
        
        #tokenlist.append(((word[0],argpart),pos,constituent,connective))
        
        
        
        features["pos"] = pos
        #features["constituent"] = constituent
#         features["upper_constituent"] = FUNCTION
        features["token"] = token
        
        features["connective"] = connective
        
        features['label'] = argpart
        
        features['SS/PS'] = 'SS'
        
        sentencefeatures.append(features)
        
        tokenlist.append((word[0], argpart))
     
        
        featuresdict[docID]["Sentences"][arg1_sentenceID].append(features)
#         featuresdict[docID]["Sentences"][arg1_sentenceID].append({"features":{"constituent" : constituent}})
#         featuresdict[docID]["Sentences"][arg1_sentenceID].append({"features":{"connective" : connective}})
        
        
        
       
    
    
    featurelist.append(sentencefeatures)
    tuplelist.append(tokenlist)
    


In [16]:
json.dump(featuresdict, open("features_arg1_SS.json", "w"))

In [14]:
#tuplelist
#featurelist



In [20]:
featuresdict

defaultdict(<function __main__.<lambda>>,
            {'wsj_0204': defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
                         {'Sentences': defaultdict(list,
                                      {11: [{'NP': False,
                                         'PP': True,
                                         'S': False,
                                         'SBAR': False,
                                         'SINV': False,
                                         'SS/PS': 'SS',
                                         'VP': False,
                                         'connective': 'if',
                                         'label': False,
                                         'pos': 'IN',
                                         'token': 'Under'},
                                        {'NP': True,
                                         'PP': True,
                                         'S': False,
                                       

In [47]:
for doc, sentences in featuresdict.items():
    for sid, sentence in sentences.items():
        for sid, s in sentence.items():
            for tokendict in s:
                if tokendict['S'] == True:
                    print(tokendict)
        
        
        

### PS relations

In [17]:
featuresdict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

In [18]:
tuplelist = []
featurelist = []

for relation in ps_relations:
    
    tokenlist = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID)
    
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)

        
    # Every word in the sentence 
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"]
    sentencefeatures = []
    
    for index, word in enumerate(word_data):
        
        features = dict()
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        #constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        #tokenlist.append(((word[0],argpart),pos,constituent,connective))
        
        # Add the most frequent constituents as features (each token is part or not part of SBAR_list, VP_list, SINV_list, NP_list, PP_list)
        
        for constituent, word_list in constituent_dict.items():
            # Set of words belonging to one constituent:
            word_set = set()
            # loop through the lists that contain the token-position pairs of each constituent
            for const_list in word_list:
                for word, n in const_list: 
                    # Add each position to a set, so the postions occur only once 
                    word_set.add(n)
            if index in word_set:
                # If the word is in one of the constituent sets, give it the value 'true', else give it the values 'false'
                features[constituent] = True
            else:
                features[constituent] = False
       
        
        
        features["pos"] = pos
        #features["constituent"] = constituent
        features["connective"] = connective
        features['token'] = token
        features['label'] = argpart
        features['SS/PS'] = 'PS'
        sentencefeatures.append(features)
        
        tokenlist.append((word[0], argpart))
    
        featuresdict[docID]["Sentences"][arg1_sentenceID].append(features)
    
    featurelist.append(sentencefeatures)
    tuplelist.append(tokenlist)
    
    


In [19]:
json.dump(featuresdict, open("features_arg1_PS.json", "w"))

## Arg2

In [29]:
tuplelist = []
featurelist = []

for relation in (ss_relations + ps_relations):
    
    tokenlist = []
    sentencefeatures = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID) #should be the same
    
    # Get POS here, because I change the pos tags later 
    phrase  = get_phrase_structure(docID, arg1_sentenceID)
    phrase_tree = Tree.fromstring(phrase)
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg2_tokenlist:

        index = wordspan[4]        
        indices.append(index)
    
    # Get argument of connective (this cannot be done on token level)
    
    
    # Get lists of tokens in the following constituents (which are most freuqently containing constituents):
    
   
    constituent_dict = find_constituent(phrase_tree)
    
    
    
    
        
    # Every word that precedes the connective (list of tokens preceding the connective)
    word_data = parses[docID]["sentences"][arg2_sentenceID]["words"][connective_index+1:] #correct for connective index
    
   
    for index, word in enumerate(word_data):
        
        
        features = dict()
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        #constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        # Add the most frequent constituents as features (each token is part or not part of SBAR_list, VP_list, SINV_list, NP_list, PP_list)
        
        for constituent, word_list in constituent_dict.items():
            # Set of words belonging to one constituent:
            word_set = set()
            # loop through the lists that contain the token-position pairs of each constituent
            for const_list in word_list:
                for word, n in const_list: 
                    # Add each position to a set, so the postions occur only once 
                    word_set.add(n)
            if index in word_set:
                # If the word is in one of the constituent sets, give it the value 'true', else give it the values 'false'
                features[constituent] = True
            else:
                features[constituent] = False
    
        
     
        
        #tokenlist.append(((word[0],argpart),pos,constituent,connective))
        
        
        
        features["pos"] = pos
        #features["constituent"] = constituent
#         features["upper_constituent"] = FUNCTION
        features["token"] = token
        
        features["connective"] = connective
        
        features['label'] = argpart
        
        features['SS/PS'] = 'SS'
        
        sentencefeatures.append(features)
        
        tokenlist.append((word[0], argpart))
     
        
        featuresdict[docID]["Sentences"][arg2_sentenceID].append(features)
#         featuresdict[docID]["Sentences"][arg1_sentenceID].append({"features":{"constituent" : constituent}})
#         featuresdict[docID]["Sentences"][arg1_sentenceID].append({"features":{"connective" : connective}})
        
        
        
       
    
    
    featurelist.append(sentencefeatures)
    tuplelist.append(tokenlist)
    


In [30]:
json.dump(featuresdict, open("features_arg2.json", "w"))

## The classifier

In [19]:
import nltk

In [20]:
def arg_features(sentence, i, history, n_sent):
    
    features = featurelist[n_sent][i]
    #print(features)
    
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
        
#     print(features)
    return features

In [21]:
class ConsecutiveArgTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for n_sent, tagged_sent in enumerate(train_sents):
            untagged_sent = nltk.tag.untag(tagged_sent)
#             print(untagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = arg_features(untagged_sent, i, history, n_sent)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        
        global count
        
        history = []
#         print(sentence,"\n")
        for i, word in enumerate(sentence):
            featureset = arg_features(sentence, i, history, count)
            tag = self.classifier.classify(featureset)
            history.append(tag)
            
        count += 1 #for the sentenceID
        
        return list(zip(sentence, history))

In [22]:
count = 0 #reset the sentenceID
tagger = ConsecutiveArgTagger(tuplelist)

In [23]:
size = int(len(tuplelist) * 0.1)

In [24]:
train_sents, test_sents = tuplelist[size:], tuplelist[:size]

In [25]:
print(tagger.evaluate(test_sents))

0.9629629629629629


In [26]:
tagger.tag_sents(test_sents)

[[(('f', True), True),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), False),
  (('f', True), True),
  (('f', True), True),
  (('f', True), True),
  (('f', True), True),
  (('f', True), True),
  (('f', True), True),
  (('f', True), True),
  (('f', False), True)]]