# Arg classifier

In [2]:
import json
from nltk.tree import Tree
from nltk.tree import ParentedTree
from collections import defaultdict

In [3]:
with open('conll16st-en-01-12-16-train/relations.json', encoding='utf-8') as pdtb_file:
    relations = [json.loads(x) for x in pdtb_file];
    
with open('conll16st-en-01-12-16-train/parses.json', encoding='utf8') as parse_file:
    parses = json.load(parse_file)

In [4]:
# We only want explicit relations:

def explicit_relations(relations):
    '''Extract all explicit relations from the relations file'''
    relations_explicit = []

    for relation in relations:
        if relation['Type'] == 'Explicit':
            relations_explicit.append(relation)
    return (relations_explicit)

ex_relations = explicit_relations(relations)

In [5]:
# Seperate them into PS and SS relations

def ss_ps_relations(explicit_relations):
    '''
    Sort relations into same sentence and previous sentence relations
    '''
    
    relations_ss = []
    relations_ps = []
    #relations_other = []

    for relation in explicit_relations:
        sentence_id_arg1 = relation['Arg1']['TokenList'][0][3]
        sentence_id_connective = relation['Connective']['TokenList'][0][3]
        sentence_id_arg2 = relation['Arg2']['TokenList'][0][3]
    
        if sentence_id_arg1 == sentence_id_connective == sentence_id_arg2:
            relations_ss.append(relation)
        elif int(sentence_id_arg1) == int(sentence_id_connective) - 1 == int(sentence_id_arg2) -1:
            relations_ps.append(relation)
        #else: 
            #relations_other.append(relation)
            
    return relations_ss, relations_ps

ss_relations, ps_relations = ss_ps_relations(ex_relations)

In [6]:
def get_phrase_structure(docID, sentenceID):
    """
    Retrieve phrase_structure from the parses.json file by filename and sentenceID.
    """
         
    sentencelist = parses[docID]["sentences"]    
    phrase_structure = sentencelist[sentenceID]["parsetree"]
    
    return phrase_structure

# Output format

In [163]:
def get_constituent(docID,sentenceID,index,pos,token):
    """
    Finds parent node of input token. 
    """
    
    phrase_structure = get_phrase_structure(docID, sentenceID)    
    tree = Tree.fromstring(phrase_structure)    
    tree = ParentedTree.convert(tree)
    
    for subtree in tree.subtrees(filter=lambda x:
                                 x.label() == pos and x[0] == token):
        return subtree.parent().label()

#get_constituent("wsj_0204",6,0,"DT","The")

In [257]:
def get_upper_constituent():
    pass

In [288]:
featuresdict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

### SS relations

In [291]:
tuplelist = []
featurelist = []

for relation in ss_relations[:20]:
    
    tokenlist = []
    sentencefeatures = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID) #should be the same
    
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)

        
    # Every word that precedes the connective
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"][:connective_index]
    
    for index, word in enumerate(word_data):
        
        features = dict()
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        #tokenlist.append(((word[0],argpart),pos,constituent,connective))
        
        
        
        features["pos"] = pos
        features["constituent"] = constituent
#         features["upper_constituent"] = FUNCTION
        
        features["connective"] = connective
        
        sentencefeatures.append(features)
        
        tokenlist.append((word[0], argpart))
        
        featuresdict[docID]["Sentences"][arg1_sentenceID].append({"token": word[0], "pos" : pos, "constituent" : constituent, "connective" : connective})
#         featuresdict[docID]["Sentences"][arg1_sentenceID].append({"features":{"constituent" : constituent}})
#         featuresdict[docID]["Sentences"][arg1_sentenceID].append({"features":{"connective" : connective}})
        
        
        
        
        
        
    
    
    featurelist.append(sentencefeatures)
    tuplelist.append(tokenlist)
    


In [292]:
json.dump(featuresdict, open("features.json", "w"), indent=4)

In [None]:
tuplelist
featurelist

In [None]:
tuplelist

### PS relations

In [248]:
tuplelist = []
featurelist = []

for relation in ps_relations:
    
    tokenlist = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID)
    
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)

        
    # Every word in the sentence 
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"]
    
    for index, word in enumerate(word_data):
        
        features = dict()
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        #tokenlist.append(((word[0],argpart),pos,constituent,connective))
        
        
        
        features["pos"] = pos
        features["constituent"] = constituent
        features["connective"] = connective
        
        sentencefeatures.append(features)
        
        tokenlist.append((word[0], argpart))
    
    
    featurelist.append(sentencefeatures)
    tuplelist.append(tokenlist)
    
    


In [None]:
tuplelist

## The classifier

In [48]:
import nltk

In [239]:
def arg_features(sentence, i, history, n_sent):
    
    features = featurelist[n_sent][i]
    #print(features)
    
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
        
#     print(features)
    return features

In [240]:
class ConsecutiveArgTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for n_sent, tagged_sent in enumerate(train_sents):
            untagged_sent = nltk.tag.untag(tagged_sent)
#             print(untagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = arg_features(untagged_sent, i, history, n_sent)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        
        global count
        
        history = []
#         print(sentence,"\n")
        for i, word in enumerate(sentence):
            featureset = arg_features(sentence, i, history, count)
            tag = self.classifier.classify(featureset)
            history.append(tag)
            
        count += 1 #for the sentenceID
        
        return list(zip(sentence, history))

In [249]:
count = 0 #reset the sentenceID
tagger = ConsecutiveArgTagger(tuplelist)

In [250]:
size = int(len(tuplelist) * 0.1)

In [251]:
train_sents, test_sents = tuplelist[size:], tuplelist[:size]

In [252]:
print(tagger.evaluate(test_sents))

0.6883632752550329


In [237]:
tagger.tag_sents(test_sents)

[[(('Under', False), False),
  (('two', False), False),
  (('new', False), False),
  (('features', False), False),
  ((',', False), False),
  (('participants', False), False),
  (('will', False), False),
  (('be', False), False),
  (('able', False), False),
  (('to', False), False),
  (('transfer', False), False),
  (('money', False), False),
  (('from', False), False),
  (('the', False), False),
  (('new', False), False),
  (('funds', False), False),
  (('to', False), False),
  (('other', False), False),
  (('investment', False), False),
  (('funds', False), False),
  (('or', True), False),
  ((',', False), False)],
 [(('Solo', True), False),
  (('woodwind', True), False),
  (('players', True), False),
  (('have', True), False),
  (('to', True), False),
  (('be', True), False),
  (('creative', True), False)],
 [(('Solo', True), False),
  (('woodwind', True), False),
  (('players', True), False),
  (('have', True), False),
  (('to', True), False),
  (('be', True), False),
  (('creative