In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.metrics.scores import accuracy

In [2]:
from nltk.corpus import brown
brown.words(categories='fiction')

['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]

In [3]:
# for label in brown.categories():
#     print('Iteration for category {}'.format(label))
#     for fileid in brown.fileids(categories=label):
#         print('Iteration for FileId {}'.format(fileid))
#         print(brown.words(fileids=fileid))

In [5]:
# have nltk_data directory in home directory
import os, os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)

os.path.exists(path)

import nltk.data
path in nltk.data.path

True

----
Accuracy using Wiki Gold dataset
----

In [6]:
with open("./wiki_gold.txt", encoding="utf8") as f:
    raw_annotations = f.read()

split_annotations = raw_annotations.split()

# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(split_annotations):
    if i == "I-PER":
        split_annotations[n] = "PERSON"
    if i == "I-ORG":
        split_annotations[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_annotations[n] = "LOCATION"
    if i == "I-MISC":
        split_annotations[n] = "MISC"

# Group NE data into tuples
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)

reference_annotations = list(group(split_annotations, 2))
pure_tokens = split_annotations[::2]

In [7]:
tagged_words = nltk.pos_tag(pure_tokens)
nltk_unformatted_prediction = nltk.ne_chunk(tagged_words)

In [8]:
type(nltk_unformatted_prediction)

nltk.tree.Tree

In [9]:
multiline_string = nltk.chunk.tree2conllstr(nltk_unformatted_prediction)
listed_pos_and_ne = multiline_string.split()

In [10]:
#Convert prediction to multiline string and then to list (includes pos tags)
multiline_string = nltk.chunk.tree2conllstr(nltk_unformatted_prediction)
listed_pos_and_ne = multiline_string.split()

# Delete pos tags and rename
del listed_pos_and_ne[1::3]
listed_ne = listed_pos_and_ne

# Amend class annotations for consistency with reference_annotations
for n,i in enumerate(listed_ne):
    if i == "B-PERSON":
        listed_ne[n] = "PERSON"
    if i == "I-PERSON":
        listed_ne[n] = "PERSON"    
    if i == "B-ORGANIZATION":
        listed_ne[n] = "ORGANIZATION"
    if i == "I-ORGANIZATION":
        listed_ne[n] = "ORGANIZATION"
    if i == "B-LOCATION":
        listed_ne[n] = "LOCATION"
    if i == "I-LOCATION":
        listed_ne[n] = "LOCATION"
    if i == "B-GPE":
        listed_ne[n] = "LOCATION"
    if i == "I-GPE":
        listed_ne[n] = "LOCATION"

# Group prediction into tuples
nltk_formatted_prediction = list(group(listed_ne, 2))
nltk_formatted_prediction = nltk_formatted_prediction[:37501]
reference_annotations = reference_annotations[:37501]

In [11]:
nltk_accuracy_Wiki = accuracy(reference_annotations, nltk_formatted_prediction)
print(nltk_accuracy_Wiki)

0.8999493346844084


In [12]:
# for word in range(len(nltk_formatted_prediction)):
#     print("NLTK Prediction: ", nltk_formatted_prediction[word], "\t\tReference Annotations: ", reference_annotations[word])

----
NER from FAQ Sample
----

In [13]:
with open("./FAQ_Full.txt") as f:
    train_text = f.read()

with open("./FAQ_Sample.txt") as f:
    sample_text = f.read()

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)    # Train custom tokenizer
tokenized = custom_sent_tokenizer.tokenize(sample_text)       # Tokenize input

def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)        # Tag parts of speech for words
            namedEnt = nltk.ne_chunk(tagged)    # Extract named entities
            namedEnt.draw()                     # Visualize results tree
    except Exception as e:
        print(str(e))

# process_content()

----
Accuracy using manually annotated FAQ question dataset
----

In [14]:
# Create unique words list to be manually annotated
# import re
# with open("./FAQ_small.txt",'r') as f:
#     anno_text = f.read()
#     anno_text = re.sub('\W+', ' ', anno_text)
#     anno_text = anno_text.split()

# len(anno_text)

# # with open("./FAQ_anno_samp.txt",'w') as f:
# #     for word in anno_text:
# #         f.write(word+' \n')
# #         f.truncate()

In [21]:
with open("./FAQ_anno_samp.txt", encoding="utf8") as f:
    raw_annotations = f.read()

split_annotations = raw_annotations.split()

# Amend class annotations for consistency
for n,i in enumerate(split_annotations):
    if i == "I-PER":
        split_annotations[n] = "PERSON"
    if i == "I-ORG":
        split_annotations[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_annotations[n] = "LOCATION"
    if i == "I-MISC":
        split_annotations[n] = "MISC"

# Group NE data into tuples
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)

reference_annotations = list(group(split_annotations, 2))
pure_tokens = split_annotations[::2]

In [22]:
tagged_words = nltk.pos_tag(pure_tokens)
nltk_unformatted_prediction = nltk.ne_chunk(tagged_words)

In [23]:
# Convert prediction to multiline string and then to list (includes pos tags)
multiline_string = nltk.chunk.tree2conllstr(nltk_unformatted_prediction)
listed_pos_and_ne = multiline_string.split()

# Delete pos tags and rename
del listed_pos_and_ne[1::3]
listed_ne = listed_pos_and_ne

# Amend class annotations for consistency with reference_annotations
for n,i in enumerate(listed_ne):
    if i == "B-PERSON":
        listed_ne[n] = "PERSON"
    if i == "I-PERSON":
        listed_ne[n] = "PERSON"    
    if i == "B-ORGANIZATION":
        listed_ne[n] = "ORGANIZATION"
    if i == "I-ORGANIZATION":
        listed_ne[n] = "ORGANIZATION"
    if i == "B-LOCATION":
        listed_ne[n] = "LOCATION"
    if i == "I-LOCATION":
        listed_ne[n] = "LOCATION"
    if i == "B-GPE":
        listed_ne[n] = "LOCATION"
    if i == "I-GPE":
        listed_ne[n] = "LOCATION"

# Group prediction into tuples
nltk_formatted_prediction = list(group(listed_ne, 2))

In [25]:
nltk_accuracy_FAQ = accuracy(reference_annotations, nltk_formatted_prediction)
print(nltk_accuracy_FAQ)

0.9033457249070632


In [24]:
# for word in range(len(nltk_formatted_prediction)):
#     print("NLTK Prediction: ", nltk_formatted_prediction[word], "\t\tReference Annotations: ", reference_annotations[word])

----
Investigating NLTK's NE Chunker
----

In [18]:
# Loads the serialized NEChunkParser object
chunker = nltk.data.load('chunkers/maxent_ne_chunker/english_ace_multiclass.pickle')

# The MaxEnt classifier
maxEnt = chunker._tagger.classifier()

def maxEnt_report():
    maxEnt = chunker._tagger.classifier()
    print("These are the labels used by the NLTK\'s NEC:\n")
    print(maxEnt.labels())
    print("These are the most informative features found in the ACE corpora:\n")
    print(maxEnt.show_most_informative_features())

def ne_report(sentence, report_all=False):
    tokens = nltk.word_tokenize(sentence)    # Tokenize input into words
    tagged_tokens = nltk.pos_tag(tokens)     # Tag parts of speech for each word
    tags = []
    for i in range(len(tagged_tokens)):
        featureset = chunker._tagger.feature_detector(tagged_tokens, i, tags)    # Return the feature detector that this tagger uses to generate featuresets for its classifier.
        tag = chunker._tagger.choose_tag(tagged_tokens, i, tags)                 # Decide which tag should be used for the specified token and return that tag.
        if tag != 'O' or report_all:
            print ('\nExplanation on the why the word \'' + tagged_tokens[i][0] + '\' was tagged:')
            featureset = chunker._tagger.feature_detector(tagged_tokens, i, tags)
            maxEnt.explain(featureset)    # Print a table showing the effect of each of the features in
                                          # the given feature set, and how they combine to determine the
                                          # probabilities of each label for that featureset.
        tags.append(tag)

In [19]:
maxEnt_report()

These are the labels used by the NLTK's NEC:

['I-GSP', 'B-LOCATION', 'B-GPE', 'I-ORGANIZATION', 'I-PERSON', 'O', 'I-FACILITY', 'I-LOCATION', 'B-PERSON', 'B-FACILITY', 'B-GSP', 'B-ORGANIZATION', 'I-GPE']
These are the most informative features found in the ACE corpora:

  10.125 bias==True and label is 'O'
   6.631 suffix3=='day' and label is 'O'
  -6.207 bias==True and label is 'I-GSP'
   5.628 prevtag=='O' and label is 'O'
  -4.740 shape=='upcase' and label is 'O'
   4.106 shape+prevtag=='<function shape at 0x8bde0d4>+O' and label is 'O'
  -3.994 shape=='mixedcase' and label is 'O'
   3.992 pos+prevtag=='NNP+B-PERSON' and label is 'I-PERSON'
   3.890 prevtag=='I-ORGANIZATION' and label is 'I-ORGANIZATION'
   3.879 shape+prevtag=='<function shape at 0x8bde0d4>+I-ORGANIZATION' and label is 'I-ORGANIZATION'
None


In [20]:
ne_report('STAR act, is a California law designed to improve the interface between community college programs and CSU degree programs.')


Explanation on the why the word 'California' was tagged:
  Feature                                            B-GPE B-ORGAN       O   B-GSP
  --------------------------------------------------------------------------------
  prevtag=='O' (1)                                   3.767
  shape=='upcase' (1)                                2.701
  pos+prevtag=='NNP+O' (1)                           2.254
  en-wordlist==False (1)                             2.095
  label is 'B-GPE' (1)                              -2.005
  bias==True (1)                                    -1.975
  suffix3=='nia' (1)                                 1.700
  prefix3=='cal' (1)                                 1.139
  pos=='NNP' (1)                                     0.681
  prevword=='a' (1)                                  0.641
  nextpos=='nn' (1)                                  0.597
  word=='California' (1)                             0.556
  wordlen==10 (1)                                   -0.399
  prevpos

Features are used in NLTK's ne_chunk
----

The shape of the word (e.g., does it contain numbers? does it begin with a capital letter?)

The length of the word

The first three letters of the word

The last three letters of the word

The POS tag of the word

The word itself

Does the word exist in an English dictionary?

The tag of the word that precedes this word (i.e., was the previous word identified as a NE)

The POS tag of the preceding word

The POS tag of the following word

The word that precedes this word

The word that follows this word

The word combined with the POS tag of the following word

The POS tag of the word combined with the tag of the preceding word

The shape of the word combined with the tag of the preceding word

----
Using Stanford NERTagger
----

In [15]:
import os
os.environ['CLASSPATH'] = "./stanford-ner/"
os.environ['STANFORD_MODELS'] = "./stanford-ner/classifiers/"

# need this when running on win10
os.environ['JAVAHOME'] = "C:\Program Files\Java\jdk1.8.0_92\\bin"

In [16]:
from nltk.tag import StanfordNERTagger
st = StanfordNERTagger('./stanford-ner/classifiers/english.conll.4class.distsim.crf.ser.gz')

In [17]:
from nltk import word_tokenize
text = 'STAR act, is a California law designed to improve the interface between community college programs and CSU degree programs.'

tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)

[('STAR', 'O'), ('act', 'O'), (',', 'O'), ('is', 'O'), ('a', 'O'), ('California', 'LOCATION'), ('law', 'O'), ('designed', 'O'), ('to', 'O'), ('improve', 'O'), ('the', 'O'), ('interface', 'O'), ('between', 'O'), ('community', 'O'), ('college', 'O'), ('programs', 'O'), ('and', 'O'), ('CSU', 'ORGANIZATION'), ('degree', 'O'), ('programs', 'O'), ('.', 'O')]


In [24]:
# Stanford NERTagger accuracy for wikigold dataset

with open("./wiki_gold.txt", encoding="utf8") as f:
    raw_annotations = f.read()

split_annotations = raw_annotations.split()

# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(split_annotations):
    if i == "I-PER":
        split_annotations[n] = "PERSON"
    if i == "I-ORG":
        split_annotations[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_annotations[n] = "LOCATION"

# Group NE data into tuples
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)

reference_annotations = list(group(split_annotations, 2))
pure_tokens = split_annotations[::2]

stanford_prediction = st.tag(pure_tokens)
stanford_prediction = stanford_prediction[:37501]
reference_annotations = reference_annotations[:37501]
stanford_accuracy_Wiki = accuracy(reference_annotations, stanford_prediction)
print(stanford_accuracy_Wiki)

# print(len(reference_annotations))
# print(len(stanford_prediction))

0.9150689314951601


In [18]:
# Stanford NERTagger accuracy for manually annotated FAQ dataset

with open("./FAQ_anno_samp.txt", encoding="utf8") as f:
    raw_annotations = f.read()

split_annotations = raw_annotations.split()

# Amend class annotations for consistency
for n,i in enumerate(split_annotations):
    if i == "I-PER":
        split_annotations[n] = "PERSON"
    if i == "I-ORG":
        split_annotations[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_annotations[n] = "LOCATION"
    if i == "I-MISC":
        split_annotations[n] = "MISC"

# Group NE data into tuples
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)

reference_annotations = list(group(split_annotations, 2))
pure_tokens = split_annotations[::2]

stanford_prediction = st.tag(pure_tokens)
stanford_prediction = stanford_prediction
stanford_accuracy_FAQ = accuracy(reference_annotations, stanford_prediction)
print(stanford_accuracy_FAQ)

# print(len(reference_annotations))
# print(len(stanford_prediction))

0.9033457249070632


In [25]:
# for word in range(len(stanford_prediction)):
#     print("Stanford Prediction: ", stanford_prediction[word], "\tReference Annotations: ", reference_annotations[word])

----
Visual comparison
----

In [26]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style

N = 1
ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars

fig, ax = plt.subplots()

stanford_percentage_Wiki = stanford_accuracy_Wiki * 100
rects1 = ax.bar(ind, stanford_percentage_Wiki, width, color='b')

nltk_percentage_Wiki = nltk_accuracy_Wiki * 100
rects2 = ax.bar(ind+width, nltk_percentage_Wiki, width, color='g')

stanford_percentage_FAQ = stanford_accuracy_FAQ * 100
rects3 = ax.bar(ind+width+width, stanford_percentage_FAQ, width, color='b')

nltk_percentage_FAQ = nltk_accuracy_FAQ * 100
rects4 = ax.bar(ind+width+width+width, nltk_percentage_FAQ, width, color='g')

# add some text for labels, title and axes ticks
ax.set_xlabel('Wiki_Gold                                FAQ_Sample')
ax.set_ylabel('Accuracy (by percentage)')
ax.set_title('NER Classifier Accuracy')
ax.set_xticks(ind+width)
ax.set_xticklabels( ('') )

ax.legend( (rects1[0], rects2[0]), ('Stanford', 'NLTK'), bbox_to_anchor=(1.05, 1), loc=4, borderaxespad=0. )

def autolabel(rects):
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., 1.02*height, '%10.2f' % float(height),
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
autolabel(rects4)

plt.show()