In [79]:
from time import time
import re
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.classify.util import accuracy as accuracy_model
from nltk.metrics.scores import accuracy as accuracy_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("./wiki_gold.txt", encoding="utf8") as f:
    raw_annotations = f.read()

split_annotations = raw_annotations.split()

# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(split_annotations):
    if i == "I-PER":
        split_annotations[n] = "PERSON"
    if i == "I-ORG":
        split_annotations[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_annotations[n] = "LOCATION"
    if i == "I-MISC":
        split_annotations[n] = "MISC"
        
# Group NE data into tuples
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)

reference_annotations = list(group(split_annotations, 2))
pure_tokens = split_annotations[::2]

In [3]:
# feature extractor to be used with on a wordlist
def feature_extractor(tokens):
    tagged_tokens = nltk.pos_tag(tokens)
    feature_list = []

    def shape(word):
        if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word, re.UNICODE):
            return 'number'
        elif re.match('\W+$', word, re.UNICODE):
            return 'punct'
        elif re.match('\w+$', word, re.UNICODE):
            if word.istitle():
                return 'upcase'
            elif word.islower():
                return 'downcase'
            else:
                return 'mixedcase'
        else:
            return 'other'

    def simplify_pos(s):
        if s.startswith('V'): return "V"
        else: return s.split('-')[0]

    def _feature_detector(tokens, index):
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        english_wordlist = set(w.lower() for w in nltk.corpus.words.words())
        if index == 0:
            prevword = None
            prevpos = None
            prevshape = None
            nextword = tokens[index+1][0].lower()
            nextpos = tokens[index+1][1].lower()
        elif index == len(tokens)-1:
            prevword = tokens[index-1][0].lower()
            prevpos = tokens[index-1][1].lower()
            prevshape = shape(prevword)
            nextword = None
            nextpos = None
        else:
            prevword = tokens[index-1][0].lower()
            prevpos = tokens[index-1][1].lower()
            prevshape = shape(prevword)
            nextword = tokens[index+1][0].lower()
            nextpos = tokens[index+1][1].lower()

        # features dictionary
        features = {
            'bias': True,
            'shape': shape(word),
            'wordlen': len(word),
            'prefix3': word[:3].lower(),
            'suffix3': word[-3:].lower(),
            'pos': pos,
            'word': word,
            'en-wordlist': (word in english_wordlist),
            'prevpos': prevpos,
            'nextpos': nextpos,
            'prevword': prevword,
            'nextword': nextword,
            'word+nextpos': '%s+%s' % (word.lower(), nextpos),
            'word+prevpos': '%s+%s' % (word.lower(), prevpos),
            'prevshape': prevshape,
            }

        return features

    for token in tagged_tokens:
        features = _feature_detector(tagged_tokens,tagged_tokens.index(token))
        feature_list.append(features)
    
    return feature_list

In [4]:
# load extracted list of features dictionaries from the WikiGold dataset (extracted from ~39k words)
import pickle
feature_dict_list = pickle.load(open('feature_dict_list.p', 'rb'))

In [5]:
# create list of corresponding labels
token_labels = []
for word,tag in reference_annotations:
    token_labels.append(tag)
    
len(token_labels)

39152

In [6]:
# zip features dictionaries with corresponding lebels
label_feats = list(zip(feature_dict_list,token_labels))

In [7]:
# replace None feature values with with ' '
for feats,label in label_feats:
    for feat in feats:
        if feats[feat] == None:
            feats[feat] = ' '

In [8]:
# split the dataset into 75% training and 25% testing sets
def split_train_test(l, n):
    return [x[1] for x in enumerate(l) if x[0] % n], l[::n]

train_feats, test_feats = split_train_test(label_feats, 5)
len(train_feats), len(test_feats)

(31321, 7831)

In [9]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
sk_classifier = SklearnClassifier(LogisticRegression(multi_class='multinomial', solver='lbfgs'))

In [10]:
# train maximum entropy classifier
start_time = time()
sk_classifier.train(train_feats)
print('Time to train: {:0.2f} sec'.format(time() - start_time))
print(sk_classifier)

Time to train: 3.52 sec
<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))>


In [11]:
# check accuracy of model using testing set
accuracy_model(sk_classifier, test_feats)

0.9337249393436343

In [12]:
# classify named entities from given features
def make_predictions(test_feats):
    pred_list = []
    if type(test_feats[0]) is dict:
        for feats in test_feats:
            pred_list.append(sk_classifier.classify(feats))
    elif type(test_feats[0]) is tuple:
        for feats,label in test_feats:
            pred_list.append(sk_classifier.classify(feats))

    return pred_list

In [45]:
# classify using model and verify accuracy score
start_time = time()
pred_list = make_predictions(test_feats)
true_labels = [label for feat,label in test_feats]
print('Time to classify: {:0.2f} sec'.format(time() - start_time))
print('Accuracy score: {}'.format(accuracy_score(true_labels, pred_list)))

Time to classify: 3.36 sec
Accuracy score: 0.9337249393436343


In [48]:
# evaluate using precision, recall and f1 scores
print('Precision score: {}'.format(precision_score(true_labels, pred_list, average='weighted')))
print('Recall score: {}'.format(recall_score(true_labels, pred_list, average='weighted')))
print('F1 score: {}'.format(f1_score(true_labels, pred_list, average='weighted')))

print('Confusion Matrix: \n{}'.format(confusion_matrix(true_labels, pred_list)))

Precision score: 0.9296451609262882
Recall score: 0.9337249393436343
F1 score: 0.930873141898728
Confusion Matrix: 
[[ 194    9   17   34   19]
 [  14  143   77   36   23]
 [  11   30 6454   25   12]
 [  34   28   56  219   42]
 [  12    6   19   15  302]]


In [61]:
# classifying words from the manually annotated FAQ sample
with open("./FAQ_anno_samp.txt", encoding="utf8") as f:
    raw_annotations = f.read()

split_annotations = raw_annotations.split()

# Amend class annotations for consistency
for n,i in enumerate(split_annotations):
    if i == "I-PER":
        split_annotations[n] = "PERSON"
    if i == "I-ORG":
        split_annotations[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_annotations[n] = "LOCATION"
    if i == "I-MISC":
        split_annotations[n] = "MISC"

# Group NE data into tuples
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i+n]
        if len(val) == n:
            yield tuple(val)

reference_annotations = list(group(split_annotations, 2))
pure_tokens = split_annotations[::2]

start_time = time()
feats_dict_list = feature_extractor(pure_tokens)
print('Time for feature extracttion: {:0.2f} sec'.format(time()-start_time))

Time for feature extracttion: 34.22 sec


In [62]:
# classify using model and verify accuracy score
start_time = time()
pred_list = make_predictions(feats_dict_list)
true_labels = [label for word,label in reference_annotations]
print('Time to classify: {:0.2f} sec'.format(time() - start_time))
print('Accuracy score: {}'.format(accuracy_score(true_labels, pred_list)))

Time to classify: 0.12 sec
Accuracy score: 0.8921933085501859


In [80]:
# evaluate using precision, recall and f1 scores

print('Precision score: {}'.format(precision_score(true_labels, pred_list, average='weighted')))
print('Recall score: {}'.format(recall_score(true_labels, pred_list, average='weighted')))
print('F1 score: {}'.format(f1_score(true_labels, pred_list, average='weighted', labels=['',])))

print('Confusion Matrix: \n{}'.format(confusion_matrix(true_labels, pred_list)))

Precision score: 0.9207333468017035
Recall score: 0.8921933085501859
F1 score: 0.904256085450804
Confusion Matrix: 
[[  1   0   0   0   0]
 [  0   9   1   3   2]
 [  5   2 220   6   2]
 [  0   0   8  10   0]
 [  0   0   0   0   0]]


In [78]:
for i in range(len(pure_tokens)):
    check = '<Correct>' if true_labels[i] == pred_list[i] else '<Incorrect>'
    print(pure_tokens[i], true_labels[i], pred_list[i], (check))

What O MISC <Incorrect>
is O O <Correct>
the O O <Correct>
STAR ORGANIZATION ORGANIZATION <Correct>
Act O PERSON <Incorrect>
STAR ORGANIZATION ORGANIZATION <Correct>
act O O <Correct>
is O O <Correct>
a O O <Correct>
California LOCATION LOCATION <Correct>
law O O <Correct>
designed O O <Correct>
to O O <Correct>
improve O O <Correct>
the O O <Correct>
interface O O <Correct>
between O O <Correct>
community O O <Correct>
college O O <Correct>
programs O O <Correct>
and O O <Correct>
CSU ORGANIZATION ORGANIZATION <Correct>
degree O O <Correct>
programs O O <Correct>
C O PERSON <Incorrect>
ID O ORGANIZATION <Incorrect>
is O O <Correct>
a O O <Correct>
course O O <Correct>
numbering O O <Correct>
system O O <Correct>
used O O <Correct>
in O O <Correct>
the O O <Correct>
implementation O O <Correct>
of O O <Correct>
this O O <Correct>
act O O <Correct>
Students O O <Correct>
with O O <Correct>
an O O <Correct>
appropriate O O <Correct>
AS MISC MISC <Correct>
degree O O <Correct>
an O O <Cor

In [16]:
# using text from another corpus
from nltk.corpus import brown

tokens = brown.words(fileids='ca01')
print('Number of words in text: {}'.format(len(tokens)),
      '| Text: {}'.format(tokens))

Number of words in text: 2242 | Text: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [140]:
# extract features from text
start_time = time()
feats_dict_list = feature_extractor(tokens)
print('Time for feature extracttion: {:0.2f} sec'.format(time()-start_time))

Time for feature extracttion: 257.79 sec


In [141]:
# predict labels
start_time = time()
pred_list = make_predictions(feats_dict_list)
print('Time to classify: {:0.2f} sec'.format(time() - start_time))

Time to classify: 0.89 sec


In [142]:
# no ground truth available for new text
# we can observe words in text with their predicted labels

for i in range(len(tokens)):
    print(tokens[i], pred_list[i])

The O
Fulton LOCATION
County LOCATION
Grand MISC
Jury PERSON
said O
Friday MISC
an O
investigation O
of O
Atlanta's O
recent O
primary O
election O
produced O
`` O
no O
evidence O
'' O
that O
any O
irregularities O
took O
place O
. O
The O
jury O
further O
said O
in O
term-end O
presentments O
that O
the O
City ORGANIZATION
Executive ORGANIZATION
Committee ORGANIZATION
, O
which O
had O
over-all O
charge O
of O
the O
election O
, O
`` O
deserves O
the O
praise O
and O
thanks O
of O
the O
City ORGANIZATION
of O
Atlanta LOCATION
'' O
for O
the O
manner O
in O
which O
the O
election O
was O
conducted O
. O
The O
September-October O
term O
jury O
had O
been O
charged O
by O
Fulton LOCATION
Superior PERSON
Court ORGANIZATION
Judge PERSON
Durwood PERSON
Pye PERSON
to O
investigate O
reports O
of O
possible O
`` O
irregularities O
'' O
in O
the O
hard-fought O
primary O
which O
was O
won O
by O
Mayor-nominate ORGANIZATION
Ivan PERSON
Allen PERSON
Jr. PERSON
. O
`` O
Only O
a O
relative O
hand