# Arg classifier

In [130]:
import json
import pickle
import nltk
from nltk.tree import Tree
from nltk.tree import ParentedTree
from collections import defaultdict

In [2]:
with open('conll16st-en-01-12-16-train/relations.json', encoding='utf-8') as pdtb_file:
    relations = [json.loads(x) for x in pdtb_file];
    
with open('conll16st-en-01-12-16-train/parses.json', encoding='utf8') as parse_file:
    parses = json.load(parse_file)

In [3]:
# We only want explicit relations:

def explicit_relations(relations):
    '''Extract all explicit relations from the relations file'''
    relations_explicit = []

    for relation in relations:
        if relation['Type'] == 'Explicit':
            relations_explicit.append(relation)
    return (relations_explicit)

ex_relations = explicit_relations(relations)

In [4]:
# Seperate them into PS and SS relations

def ss_ps_relations(explicit_relations):
    '''
    Sort relations into same sentence and previous sentence relations
    '''
    
    relations_ss = []
    relations_ps = []
    #relations_other = []

    for relation in explicit_relations:
        sentence_id_arg1 = relation['Arg1']['TokenList'][0][3]
        sentence_id_connective = relation['Connective']['TokenList'][0][3]
        sentence_id_arg2 = relation['Arg2']['TokenList'][0][3]
    
        if sentence_id_arg1 == sentence_id_connective == sentence_id_arg2:
            relations_ss.append(relation)
        elif int(sentence_id_arg1) == int(sentence_id_connective) - 1 == int(sentence_id_arg2) -1:
            relations_ps.append(relation)
        #else: 
            #relations_other.append(relation)
            
    return relations_ss, relations_ps

ss_relations, ps_relations = ss_ps_relations(ex_relations)

In [5]:
def get_phrase_structure(docID, sentenceID):
    """
    Retrieve phrase_structure from the parses.json file by filename and sentenceID.
    """
         
    sentencelist = parses[docID]["sentences"]    
    phrase_structure = sentencelist[sentenceID]["parsetree"]
    
    return phrase_structure

# Output format

In [6]:
def get_constituent(docID,sentenceID,index,pos,token):
    """
    Finds parent node of input token. 
    """
    
    phrase_structure = get_phrase_structure(docID, sentenceID)    
    tree = Tree.fromstring(phrase_structure)    
    tree = ParentedTree.convert(tree)
    
    for subtree in tree.subtrees(filter=lambda x:
                                 x.label() == pos and x[0] == token):
        return subtree.parent().label()

#get_constituent("wsj_0204",6,0,"DT","The")

In [7]:
def get_upper_constituent():
    pass

In [60]:
featuresdict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

### SS relations

In [123]:
tuplelist = []
featurelist = []

for relation in ss_relations:
    
    tokenlist = []
    sentencefeatures = []
    
    connective = relation["Connective"]["RawText"]
    connective_index = relation["Connective"]["TokenList"][0][4]
    docID = relation["DocID"]
    
    arg1_tokenlist = relation["Arg1"]["TokenList"]
    arg2_tokenlist = relation["Arg2"]["TokenList"]
    
    #retrieve phrase structure for sentence the argument is in
    arg1_sentenceID = arg1_tokenlist[0][3]
    arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
    
    arg2_sentenceID = arg2_tokenlist[0][3]
    arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID) #should be the same
    
    
    # Build a list with indices of words in the argument
    indices = []
    
    for wordspan in arg1_tokenlist:

        index = wordspan[4]        
        indices.append(index)

        
    # Every word that precedes the connective
    word_data = parses[docID]["sentences"][arg1_sentenceID]["words"][:connective_index]
    
    for index, word in enumerate(word_data):
        
        features = dict()
        
        pos = word[1]["PartOfSpeech"]
        token = word[0]
        
        if index in indices:
            argpart = True
        else:
            argpart = False
        
        constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)
        
        #tokenlist.append(((word[0],argpart),pos,constituent,connective))
        
        
        
        features["pos"] = pos
        features["constituent"] = constituent
#         features["upper_constituent"] = FUNCTION
        
        features["connective"] = connective
        
        sentencefeatures.append(features)
        
        tokenlist.append((word[0], argpart))
        
        featuresdict[docID]["Sentences"][arg1_sentenceID].append({
                "token": word[0],
                "pos": pos,
                "constituent": constituent,
                "connective": connective,
                "label": argpart
            })
  
    
    featurelist.append(sentencefeatures)
    tuplelist.append(tokenlist)
    


In [124]:
json.dump(featuresdict, open("features_SS.json", "w", encoding="utf-8"), indent=4)

### relations

In [180]:
def get_features(ps_relations, sentence_type):

    #preparing the format for the json output
    featuresdict_arg1 = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    featuresdict_arg2 = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for relation in ps_relations:

        connective = relation["Connective"]["RawText"]
        connective_index = relation["Connective"]["TokenList"][0][4]
        docID = relation["DocID"]

        arg1_tokenlist = relation["Arg1"]["TokenList"]
        arg2_tokenlist = relation["Arg2"]["TokenList"]

        #retrieve phrase structure for sentence the argument is in
        arg1_sentenceID = arg1_tokenlist[0][3]
        arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)

        arg2_sentenceID = arg2_tokenlist[0][3]
        arg2_phrase_structure = get_phrase_structure(docID, arg2_sentenceID)


        # Build a list with indices of words in the argument
        indices = []

        for wordspan in arg1_tokenlist:

            index = wordspan[4]        
            indices.append(index)

        #SENTENCE TYPE
        if sentence_type == "SS":
            # Every word that precedes the connective
            word_data_arg1 = parses[docID]["sentences"][arg1_sentenceID]["words"][:connective_index] 
            
        if sentence_type == "PS":
            # Every word in the sentence 
            word_data_arg1 = parses[docID]["sentences"][arg1_sentenceID]["words"]
            
        word_data_arg2 = parses[docID]["sentences"][arg2_sentenceID]["words"][connective_index:]

        
        for index, word in enumerate(word_data_arg1):
                pos = word[1]["PartOfSpeech"]
                token = word[0]

                if index in indices:
                    argpart = True
                else:
                    argpart = False

                constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)

                featuresdict_arg1[docID]["Sentences"][arg1_sentenceID].append({
                        "token": word[0],
                        "pos": pos,
                        "constituent": constituent,
                        "connective": connective,
                        "label": argpart
                    })
                
        for index, word in enumerate(word_data_arg2):
                pos = word[1]["PartOfSpeech"]
                token = word[0]

                if index in indices:
                    argpart = True
                else:
                    argpart = False

                constituent = get_constituent(docID,arg1_sentenceID,index,pos,token)

                featuresdict_arg2[docID]["Sentences"][arg1_sentenceID].append({
                        "token": word[0],
                        "pos": pos,
                        "constituent": constituent,
                        "connective": connective,
                        "label": argpart
                    })

          
    
    return featuresdict_arg1, featuresdict_arg2
            






In [181]:
def wrap(ss_relations, ps_relations):
    featuresdict_ps, featuresdict_ps_arg2 = get_features(ps_relations, "PS")
    print("Got features from PS relations!")
    
    featuresdict_ss, featuresdict_ss_arg2 = get_features(ss_relations, "SS")
    print("Got features from SS relations!")
    
    json.dump(featuresdict_ps, open("features_PS.json", "w", encoding="utf-8"))
    json.dump(featuresdict_ps_arg2, open("features_PS_arg2.json", "w", encoding="utf-8"))
    
    json.dump(featuresdict_ss, open("features_SS.json", "w", encoding="utf-8"))
    json.dump(featuresdict_ss_arg2, open("features_SS_arg2.json", "w", encoding="utf-8"))

In [182]:
wrap(ss_relations, ps_relations)

## Building the Tuplelist

In [155]:
def tuplebuilder(features_json):
    """
    Makes a tuplelist and corresponding featurelist
    from an inputted features.json file. 
    
    Outputs a tuplelist and a featurelist in a list for every sentence. 
    
    """
    
    with open(features_json, encoding='utf8') as features_file:
        features = json.load(features_file)

    tuplelist = []
    featurelist = []

    for docID in features.keys():
        for sentence_n in features[docID]["Sentences"]:

            #reset content of tokenlist
            tokenlist = []
            sentencetokenfeatures = []

            for token in features[docID]["Sentences"][sentence_n]:
                tokenfeatures = dict()

                #importing variables from tokeninfo
                for feature, value in token.items():
                    if feature != "label":
                        tokenfeatures[feature] = value

                tokenstring = token["token"]
                argpart = token["label"]

                tokentuple = (tokenstring, argpart)    

                tokenlist.append(tokentuple)           
                sentencetokenfeatures.append(tokenfeatures)

            #Building tuple
            tuplelist.append(tokenlist)
            featurelist.append(sentencetokenfeatures)
            
    return tuplelist, featurelist
            
tuplelist, featurelist = tuplebuilder("features.json")
            

## The classifier

In [157]:
def arg_features(sentence, i, history, n_sent):
    
    features = featurelist[n_sent][i]
    #print(features)
    
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
        
#     print(features)
    return features

In [158]:
class ConsecutiveArgTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for n_sent, tagged_sent in enumerate(train_sents):
            untagged_sent = nltk.tag.untag(tagged_sent)
#             print(untagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = arg_features(untagged_sent, i, history, n_sent)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        
        global count
        
        history = []
#         print(sentence,"\n")
        for i, word in enumerate(sentence):
            featureset = arg_features(sentence, i, history, count)
            tag = self.classifier.classify(featureset)
            history.append(tag)
            
        count += 1 #for the sentenceID
        
        return list(zip(sentence, history))

In [159]:
tagger = ConsecutiveArgTagger(tuplelist)

In [160]:
pickle.dump(tagger, open("tagger_SS.pickle", "wb"))


### Evaluating

In [161]:
count = 0 #reset the sentenceID
size = int(len(tuplelist) * 0.1)

In [162]:
train_sents, test_sents = tuplelist[size:], tuplelist[:size]

In [163]:
print(tagger.evaluate(test_sents))

0.6594021376085505


In [150]:
tagger.tag_sents(test_sents)

[[(('He', False), False),
  (('said', False), False),
  (('construction', True), False),
  (('would', True), False),
  (("n't", True), False),
  (('resume', True), False)],
 [(('Mr.', False), False),
  (('Maxwell', False), False),
  (('said', False), False),
  (('he', True), False),
  (('would', True), False),
  (("n't", True), False),
  (('be', True), False),
  (('surprised', True), False)],
 [(('But', True), False),
  (('the', True), False),
  (('Memphis', True), False),
  ((',', True), False),
  (('Tenn.', True), False),
  ((',', True), False),
  (('facility', True), False),
  (('was', True), False),
  (("n't", True), False),
  (('to', True), False),
  (('begin', True), False),
  (('turning', True), False),
  (('out', True), False),
  (('product', True), False),
  (('until', True), False),
  (('1993', True), False),
  ((',', False), False)],
 [(('Last', False), True),
  (('Friday', False), True),
  (("'s", False), True),
  (('announcement', False), True),
  (('was', False), True),
 