In [6]:
import json
import pickle
import nltk
from collections import defaultdict

## Building the Tuplelist

In [7]:
def tuplebuilder(features_json):
    """
    Read a features JSON file and separate labels from features.

    The code expects the file to hold a mapping of document IDs to
    objects with a "Sentences" entry; that entry maps sentence keys to
    sequences of token dicts, each of which must contain the keys
    "token" and "label".

    Returns a pair (tuplelist, featurelist). Both have one entry per
    sentence, in iteration order of the loaded data:
      - tuplelist[s] is a list of (token, label) tuples
      - featurelist[s] is a list of dicts holding every field of the
        token except "label"
    """
    with open(features_json, encoding='utf8') as handle:
        documents = json.load(handle)

    tuplelist, featurelist = [], []

    for document in documents.values():
        sentences = document["Sentences"]
        for sentence_key in sentences:
            labelled_tokens = []
            feature_dicts = []

            for token_info in sentences[sentence_key]:
                # every field except the gold label is kept as a feature
                feature_dicts.append(
                    {name: value
                     for name, value in token_info.items()
                     if name != "label"}
                )
                labelled_tokens.append(
                    (token_info["token"], token_info["label"])
                )

            tuplelist.append(labelled_tokens)
            featurelist.append(feature_dicts)

    return tuplelist, featurelist
            
#tuplelist, featurelist = tuplebuilder("features_arg1_SS.json")
            

## The classifier

In [8]:
def arg_features(sentence, i, history, n_sent):
    """
    Build the feature dict for token `i` of sentence number `n_sent`.

    The precomputed features are read from the module-level
    `featurelist` (entry featurelist[n_sent][i]) and extended with two
    context features:

      - "prev-word": sentence[i-1], or "<START>" for the first token
      - "prev-tag":  history[i-1],  or "<START>" for the first token

    Parameters:
      sentence -- the sentence being processed; only sentence[i-1] is read
      i        -- position of the current token within the sentence
      history  -- tags already assigned to the tokens before position i
      n_sent   -- index of the sentence within `featurelist`

    Returns a new dict. The entry stored in `featurelist` is not
    modified, so the shared feature data stays free of context features
    (and of gold tags) no matter how often it is used for training or
    tagging.

    Raises IndexError if `n_sent`/`i` do not exist in `featurelist`, or
    if `history` is shorter than `i`.
    """
    # Copy first: writing into the stored dict would alter the global
    # featurelist for every later reader (including the pickled copy).
    features = dict(featurelist[n_sent][i])

    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]

    return features

In [9]:
class ConsecutiveArgTagger(nltk.TaggerI):
    """
    Left-to-right sequence tagger backed by a Naive Bayes classifier.

    Each token is classified from the feature dict returned by
    arg_features(), which includes the previous word and the previous
    tag, so every decision feeds into the next one.

    arg_features() looks features up by sentence number. While training,
    that number is the position of the sentence in `train_sents`. While
    tagging, it is the module-level counter `count`, which tag()
    advances by one per sentence; reset `count` to the index of the
    first sentence before tagging a new batch.
    """

    def __init__(self, train_sents):
        """
        Train the classifier.

        train_sents -- list of sentences, each a list of (word, tag)
                       pairs; sentence n is paired with feature row n
                       by arg_features().
        """
        train_set = []
        for n_sent, tagged_sent in enumerate(train_sents):
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []  # gold tags of the tokens seen so far
            for i, (_, tag) in enumerate(tagged_sent):
                featureset = arg_features(untagged_sent, i, history, n_sent)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """
        Tag one sentence and return a list of (token, tag) pairs.

        The sentence number is taken from the module-level `count`; if
        that name has not been defined yet, tagging starts at sentence 0
        instead of failing with a NameError. After the sentence has been
        tagged, `count` is set to the next sentence number.
        """
        global count

        # Fall back to 0 so the first call works on a fresh kernel.
        sentence_id = globals().get("count", 0)

        history = []  # predicted tags of the tokens seen so far
        for i, _ in enumerate(sentence):
            featureset = arg_features(sentence, i, history, sentence_id)
            history.append(self.classifier.classify(featureset))

        count = sentence_id + 1  # advance to the next sentence

        return list(zip(sentence, history))

## Training and saving the models

In [10]:
# Train one tagger per feature file and pickle it together with the
# (tuplelist, featurelist) pair it was built from.
# The arg2 entry stays last: the evaluation cell below uses the `tagger`
# that this cell leaves behind.
for features_path, model_name in (
    ("features_arg1_SS.json", "arg1_SS"),
    ("features_arg1_PS.json", "arg1_PS"),
    ("features_arg2.json", "arg2"),
):
    # arg_features() reads the module-level `featurelist`, so it has to
    # be rebound here, at top level, before the tagger is trained.
    tuplelist, featurelist = tuplebuilder(features_path)
    tagger = ConsecutiveArgTagger(tuplelist)

    # `with` closes each file as soon as it is written; passing a bare
    # open() to pickle.dump left every handle open.
    with open("models/tagger_{}.pickle".format(model_name), "wb") as model_file:
        pickle.dump(tagger, model_file)
    with open("models/tuplelist_featurelist_{}.pickle".format(model_name), "wb") as data_file:
        pickle.dump((tuplelist, featurelist), data_file)

### Evaluating

In [11]:
# for testing: load the arg2 data again, i.e. the same file the most recently
# built `tagger` was trained on. This also rebinds the global `featurelist`
# that arg_features() reads.
tuplelist, featurelist = tuplebuilder("features_arg2.json")

In [12]:
# tag() uses the global `count` as the number of the sentence being tagged and
# increments it after every sentence, so it has to be 0 before the test
# sentences (taken from the start of tuplelist) are tagged.
count = 0 #reset the sentenceID
# number of sentences in the test split: 10% of the data, rounded down
size = int(len(tuplelist) * 0.1)

In [13]:
# The first `size` sentences form the test split, the rest the training split.
# NOTE(review): train_sents is not used in any of the visible cells.
train_sents, test_sents = tuplelist[size:], tuplelist[:size]

In [14]:
# Score `tagger` against the gold labels in test_sents.
# NOTE(review): `tagger` was trained in the model-creation cell on every
# sentence of features_arg2.json, and test_sents is a slice of that same data.
# The printed score is therefore measured on sentences seen during training,
# not on held-out data. A held-out score would need a tagger trained on
# train_sents only (with `featurelist` sliced to match) -- TODO confirm intent.
# NOTE(review): newer NLTK releases appear to offer this metric as
# `accuracy`; check against the installed version.
print(tagger.evaluate(test_sents))

0.7138628309360017


In [None]:
#tagger.tag_sents(test_sents)