In [1]:
import nltk
from nltk.classify import MaxentClassifier

In [2]:
def ReadData(path):
    """Parse a whitespace-separated training file into per-token feature rows.

    Every non-blank line is split on whitespace; the first four fields are
    read as token, POS tag, chunk tag and name tag. A line with no fields
    (empty or whitespace-only) is a sentence boundary.

    Args:
        path: path of the training file to read.

    Returns:
        A tuple ``(features, startwords)``:

        * ``features`` -- one list per token, laid out as
          ``[pretag, prepos, prechunk] + fields + [nextword]``. The three
          "previous" slots hold ``'start'`` for the first token after a
          boundary; ``nextword`` stays ``'end'`` when no token follows in
          the same sentence.
        * ``startwords`` -- the tokens that opened a sentence, in file order
          (duplicates kept).

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    startwords = []
    features = []
    pretag = prepos = prechunk = 'start'
    # Context manager: the handle is released even if a malformed line raises.
    with open(path, "r") as lines:
        for line in lines:
            values = line.split()
            if not values:
                # Sentence boundary (also covers whitespace-only lines, which
                # would otherwise fail on values[0]): reset the left context.
                pretag = prepos = prechunk = 'start'
                continue
            if pretag == 'start':
                startwords.append(values[0])
            else:
                # Back-fill the previous row's next-word slot now that the
                # following token is known.
                features[-1][-1] = values[0]
            features.append([pretag, prepos, prechunk] + values + ['end'])
            prepos = values[1]
            prechunk = values[2]
            pretag = values[3]
    return features, startwords
# Parse the training corpus into feature rows plus the list of sentence-initial tokens.
features,startwords = ReadData("CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_train.pos-chunk-name")
# Bare expression: the notebook renders the entire `features` list as this cell's output.
features

[['start', 'start', 'start', '-DOCSTART-', '-X-', 'O', 'O', 'end'],
 ['start', 'start', 'start', 'EU', 'NNP', 'I-NP', 'I-ORG', 'rejects'],
 ['I-ORG', 'NNP', 'I-NP', 'rejects', 'VBZ', 'I-VP', 'O', 'German'],
 ['O', 'VBZ', 'I-VP', 'German', 'JJ', 'I-NP', 'I-MISC', 'call'],
 ['I-MISC', 'JJ', 'I-NP', 'call', 'NN', 'I-NP', 'O', 'to'],
 ['O', 'NN', 'I-NP', 'to', 'TO', 'I-VP', 'O', 'boycott'],
 ['O', 'TO', 'I-VP', 'boycott', 'VB', 'I-VP', 'O', 'British'],
 ['O', 'VB', 'I-VP', 'British', 'JJ', 'I-NP', 'I-MISC', 'lamb'],
 ['I-MISC', 'JJ', 'I-NP', 'lamb', 'NN', 'I-NP', 'O', '.'],
 ['O', 'NN', 'I-NP', '.', '.', 'O', 'O', 'end'],
 ['start', 'start', 'start', 'Peter', 'NNP', 'I-NP', 'I-PER', 'Blackburn'],
 ['I-PER', 'NNP', 'I-NP', 'Blackburn', 'NNP', 'I-NP', 'I-PER', 'end'],
 ['start', 'start', 'start', 'BRUSSELS', 'NNP', 'I-NP', 'I-LOC', '1996-08-22'],
 ['I-LOC', 'NNP', 'I-NP', '1996-08-22', 'CD', 'I-NP', 'O', 'end'],
 ['start', 'start', 'start', 'The', 'DT', 'I-NP', 'O', 'European'],
 ['O', 'DT',

In [4]:
def FeatureBuilder(pretag,prepos,prechunk,token,pos,chunk,nextword):
    """Assemble the feature dictionary describing one token in context.

    Args:
        pretag: name tag assigned to the previous token.
        prepos, prechunk: POS and chunk tag of the previous token; they are
            combined into the single ``'preposchunk'`` feature.
        token, pos, chunk: the current token and its POS / chunk tags.
        nextword: the following token; only whether it is all digits is used.

    Returns:
        dict mapping feature name to value. Reads the module-level
        ``startwords`` collection for the sentence-initial features.
    """
    # Evaluate the two shared predicates once; 'cap_start' combines them.
    capitalized = token[0].isupper()
    sentence_initial = token in startwords
    return {
        'token': token,
        'pos': pos,
        'chunk': chunk,
        'cap': capitalized,
        'startwords': sentence_initial,
        # Capitalised although never seen opening a sentence.
        'cap_start': (not sentence_initial) and capitalized,
        'pretag': pretag,
        'name_list1': "Jan" in token,
        'name_list2': "Tom" in token,
        'preposchunk': (prepos, prechunk),
        'nextword': nextword.isdigit(),
    }
def MEtrain(feature, iterations = 20):
    """Fit a maximum-entropy classifier on labelled feature sets.

    Args:
        feature: training data, handed unchanged to ``MaxentClassifier.train``.
        iterations: forwarded as the ``max_iter`` keyword (default 20).

    Returns:
        Whatever ``MaxentClassifier.train`` returns for those arguments.
    """
    classifier = MaxentClassifier.train(feature, max_iter=iterations)
    return classifier
# Turn every parsed row into a (feature dict, gold tag) pair and train on all of them;
# the fitted classifier is kept in the global MEMM.
MEMM = MEtrain([(FeatureBuilder(pretag,prepos,prechunk,token,pos,chunk,nextword),tag) for pretag,prepos,prechunk,token,pos,chunk,tag,nextword in features])

  ==> Training (20 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.07944        0.834
             2          -0.23736        0.834
             3          -0.20178        0.853
             4          -0.17413        0.901
             5          -0.15489        0.937
             6          -0.14104        0.949
             7          -0.13062        0.955
             8          -0.12245        0.961
             9          -0.11582        0.964
            10          -0.11029        0.968
            11          -0.10556        0.970
            12          -0.10145        0.972
            13          -0.09782        0.974
            14          -0.09456        0.975
            15          -0.09162        0.978
            16          -0.08893        0.979
            17          -0.08646        0.980
            18          -0.08417        0.981
            19          -0.08204        0.982
  

In [5]:
def ReadTestData(path):
    """Read an untagged token/POS/chunk file and group it into sentences.

    Every non-blank line is split on whitespace and its first three fields
    are read as token, POS tag and chunk tag. A line with no fields (empty
    or whitespace-only) closes the current sentence; consecutive blank lines
    therefore yield empty sentences, exactly one per blank line.

    Args:
        path: path of the file to read.

    Returns:
        ``(words, poslist, chunklist)`` -- three parallel lists holding, per
        sentence, the list of tokens, POS tags and chunk tags.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    words = []
    poslist = []
    chunklist = []
    tokens, pos, chunk = [], [], []
    # Context manager: the handle is released even if a malformed line raises.
    with open(path, "r") as lines:
        for line in lines:
            values = line.split()
            if not values:
                # Sentence boundary: store what was collected and start afresh.
                words.append(tokens)
                poslist.append(pos)
                chunklist.append(chunk)
                tokens, pos, chunk = [], [], []
            else:
                tokens.append(values[0])
                pos.append(values[1])
                chunk.append(values[2])
    if tokens:
        # The file did not end with a blank line: keep the last sentence
        # instead of silently dropping it.
        words.append(tokens)
        poslist.append(pos)
        chunklist.append(chunk)
    return words, poslist, chunklist
# Load the untagged development set as parallel per-sentence lists of tokens, POS tags and chunk tags.
words, poslist,chunklist = ReadTestData("CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_dev.pos-chunk")

In [6]:
def MEtag(tokens,pos,chunk):
    """Find the most probable tag sequence for one sentence (Viterbi search).

    The inputs are expected in the padded form produced by ``WriteOutput``:
    the last element of ``tokens`` is never tagged and only serves as the
    "next word" of the final real token, and because the previous-token
    context is read with index ``i - 1``, the last elements of ``pos`` and
    ``chunk`` act as the left context of the first token.

    Args:
        tokens: tokens of the sentence followed by one trailing sentinel.
        pos, chunk: POS / chunk tags aligned with ``tokens``.

    Returns:
        ``(probability, path)`` for the best complete hypothesis, where
        ``path`` is the list of tags for every token except the sentinel.
        With no real tokens the result is ``(1, [])``.
    """
    labels = ['I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']
    # best[tag] = (probability, tag path) of the best hypothesis ending in tag.
    best = {'start': (1, [])}
    for idx in range(len(tokens) - 1):
        current = tokens[idx]
        following = tokens[idx + 1]
        left_pos = pos[idx - 1]
        left_chunk = chunk[idx - 1]
        candidates = {}
        for prev_tag, (prev_prob, prev_path) in best.items():
            dist = MEMM.prob_classify(
                FeatureBuilder(prev_tag, left_pos, left_chunk,
                               current, pos[idx], chunk[idx], following))
            for label in labels:
                joint = dist.prob(label) * prev_prob
                incumbent = candidates.get(label)
                threshold = incumbent[0] if incumbent is not None else -float('inf')
                # Strict comparison: on a tie the earlier hypothesis is kept.
                if joint > threshold:
                    candidates[label] = (joint, prev_path + [label])
        best = candidates
    return max(best.values())

In [7]:
def WriteOutput(path,words,poslist,chunklist):
    """Tag every sentence with ``MEtag`` and write the result to ``path``.

    Output format: one ``token<TAB>tag`` line per token, followed by one
    empty line after each sentence (an empty sentence yields just the empty
    line).

    Args:
        path: file to create or overwrite.
        words, poslist, chunklist: parallel per-sentence lists of tokens,
            POS tags and chunk tags.
    """
    # Context manager: the file is flushed and closed even if tagging raises
    # part-way through.
    with open(path, 'w') as out:
        for tokens, pos, chunk in zip(words, poslist, chunklist):
            # Pad the sentence: 'end' is the next-word of the last token, and
            # the trailing 'start' entries are what MEtag reads (via index -1)
            # as the left context of the first token.
            tags = MEtag(tokens + ['end'], pos + ['start'], chunk + ['start'])[1]
            for i in range(len(tokens)):
                out.write(tokens[i] + '\t' + tags[i] + '\n')
            out.write('\n')

In [8]:
# Tag the development sentences and write the predictions to CONLL_dev.name in the working directory.
WriteOutput('CONLL_dev.name',words,poslist,chunklist)

In [9]:
def score (keyFileName, responseFileName):
    """Compare a tagged response file with the answer key and print metrics.

    Both files must have the same number of lines, each line being either
    empty (sentence break) or ``token<TAB>tag``. Printed results: per-token
    tag accuracy, the number of name groups in key and response, the number
    of groups that match exactly (same start line and same type, ending on
    the same line), and group-level precision / recall / F1.

    A group is only counted when a later token ends it (an ``O`` tag, a
    ``B`` tag, or an ``I`` tag of another type). Empty lines are skipped
    before that check, so a sentence break alone does not close a group and
    a group still open at the end of the file is not counted.

    On a length, format, token or sentence-break mismatch a message is
    printed and ``exit()`` is called. Ratios with a zero denominator are
    reported as 0.00.

    Args:
        keyFileName: path of the gold-standard file.
        responseFileName: path of the file to evaluate.
    """
    def percent(numerator, denominator):
        # Undefined ratios (nothing to count) are reported as 0 instead of
        # raising ZeroDivisionError.
        return 100.0 * numerator / denominator if denominator else 0.0

    # Read both files up front; the context managers close the handles.
    with open(keyFileName, 'r') as keyFile:
        key = keyFile.readlines()
    with open(responseFileName, 'r') as responseFile:
        response = responseFile.readlines()
    if len(key) != len(response):
        print("length mismatch between key and submitted file")
        exit()
    correct = 0
    incorrect = 0
    keyGroupCount = 0
    # Start line of the group currently open, or None when outside a group.
    # None (rather than 0) is the sentinel because 0 is a valid line index.
    keyStart = None
    keyGroupType = None
    responseGroupCount = 0
    responseStart = None
    responseGroupType = None
    correctGroupCount = 0
    for i in range(len(key)):
        key[i] = key[i].rstrip('\n')
        response[i] = response[i].rstrip('\n')
        if key[i] == "":
            if response[i] == "":
                continue
            else:
                print("sentence break expected at line " + str(i))
                exit()
        keyFields = key[i].split('\t')
        if len(keyFields) != 2:
            print("format error in key at line " + str(i) + ":" + key[i])
            exit()
        keyToken = keyFields[0]
        keyTag = keyFields[1]
        responseFields = response[i].split('\t')
        if len(responseFields) != 2:
            print("format error at line " + str(i))
            exit()
        responseToken = responseFields[0]
        responseTag = responseFields[1]
        if responseToken != keyToken:
            print("token mismatch at line " + str(i))
            exit()
        if responseTag == keyTag:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
        # the previous token ends a group if
        #   we are in a group AND
        #   the current tag is O OR the current tag is a B tag
        #   the current tag is an I tag with a different type from the current group
        responseEnd = responseStart is not None and (responseTag=='O' or responseTag[0:1]=='B' or (responseTag[0:1]=='I' and responseTag[2:]!=responseGroupType))
        # the current token begins a group if
        #   the previous token was not in a group or ended a group AND
        #   the current tag is an I or B tag
        responseBegin = (responseStart is None or responseEnd) and (responseTag[0:1]=='B' or responseTag[0:1]=='I')
        keyEnd = keyStart is not None and (keyTag=='O' or keyTag[0:1]=='B' or (keyTag[0:1]=='I' and keyTag[2:]!=keyGroupType))
        keyBegin = (keyStart is None or keyEnd) and (keyTag[0:1]=='B' or keyTag[0:1]=='I')
        if responseEnd:
            responseGroupCount = responseGroupCount + 1
        if keyEnd:
            keyGroupCount = keyGroupCount + 1
        # A group is correct when both groups end here, started on the same
        # line and carry the same type.
        if responseEnd and keyEnd and responseStart == keyStart and responseGroupType == keyGroupType:
            correctGroupCount = correctGroupCount + 1
        if responseBegin:
            responseStart = i
            responseGroupType = responseTag[2:]
        elif responseEnd:
            responseStart = None
        if keyBegin:
            keyStart = i
            keyGroupType = keyTag[2:]
        elif keyEnd:
            keyStart = None
    print(correct, "out of", str(correct + incorrect) + " tags correct")
    accuracy = percent(correct, correct + incorrect)
    print("  accuracy: %5.2f" % accuracy)
    print(keyGroupCount, "groups in key")
    print(responseGroupCount, "groups in response")
    print(correctGroupCount, "correct groups")
    precision = percent(correctGroupCount, responseGroupCount)
    recall = percent(correctGroupCount, keyGroupCount)
    if precision + recall:
        F = 2 * precision  * recall / (precision + recall)
    else:
        F = 0.0
    print("  precision: %5.2f" % precision)
    print("  recall:    %5.2f" % recall)
    print("  F1:        %5.2f" % F)

# Evaluate the predictions written above against the gold-standard development tags.
score('CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_dev.name', 'CONLL_dev.name')

49365 out of 51578 tags correct
  accuracy: 95.71
5917 groups in key
5749 groups in response
4567 correct groups
  precision: 79.44
  recall:    77.18
  F1:        78.30
