# py-CRFsuite NER model

This version ignores capitalization (all words are lowercased with `.lower()`) in the hope of producing a case-independent model. Such a model is intended for situations where case information is not available, for example when tagging the output of an automatic speech recognition (speech-to-text) system.

code modified from:
https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

In [1]:
import pandas as pd
import numpy as np
import pycrfsuite
from nltk import pos_tag
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from collections import Counter

## read travel data

In [2]:
# Load the pre-encoded corpus: parallel per-sentence sequences of tokens,
# POS tags and NER tags.
# NOTE(review): the arrays are presumably ragged (one Python list per
# sentence) and therefore stored as object arrays. NumPy >= 1.16.3 refuses
# to load those unless allow_pickle=True. Unpickling executes arbitrary
# code, so only load files from a trusted source.
X_tokens = list(np.load('../00_data/encoded/add_tokens.npy', allow_pickle=True))
X_postags = list(np.load('../00_data/encoded/add_postags.npy', allow_pickle=True))
# Pair every token with its POS tag: one list of (token, postag) per sentence.
X_zips = [list(zip(tokens, X_postags[i])) for i, tokens in enumerate(X_tokens)]
y_nertags = list(np.load('../00_data/encoded/add_nertags.npy', allow_pickle=True))

## gazetteers

precompiled lists for feature extraction

In [3]:
# Gazetteers: hand-compiled word lists used as boolean membership features.
# Every list is stored lower-cased so that it can be matched against
# lower-cased tokens.

# time-of-day vocabulary
timewords = ['midnight', 'noon', 'am', 'pm', 'morning', 'afternoon', 'evening', 'night', "o'clock"]
timewords = [s.lower() for s in timewords]

# weekdays, months and relative-day vocabulary
datewords = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February',
             'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',
             'tomorrow']
datewords = [s.lower() for s in datewords]

# from transdict finds
# NOTE(review): some spellings look unusual ('Pancreas', 'Picadilly',
# 'Pittsburg'); they are left untouched because they may mirror the
# transcriptions in the data -- confirm before "correcting" them.
placewords = ['Euston', 'London', 'Birmingham', 'Street', 'Chicago', 'Wilmslow', 'Macclesfield', 'Stockport',
              'Piedmont', 'Dallas', 'Newark', 'Wigan', 'Preston', 'Denver', 'Liverpool', 'Seattle', 'Tokyo',
              'Wrexham', 'Richmond', 'Manchester', 'Crewe', 'Baltimore', 'Ottawa', 'Toronto', 'Vancouver',
              'Moscow', 'Boston', 'Edinburgh', 'Oakland', 'Newcastle', 'Durham', 'Lime', 'Taunton',
              'Copenhagen', 'Heathrow', 'Helsinki', 'Pittsburg', 'Raleigh', 'Picadilly', 'Watford', 'Hertford',
              'Leicester', 'Newton', 'Abbot', 'Greenbay', 'Miami', 'Orlando', 'Washington', 'Dulles', 'Charlotte',
              'Tahoe', 'Southbend', 'Springfield', 'York', 'Burbank', 'Syracuse', 'Cleveland', 'Fairbanks',
              'Carolina', 'Montreal', 'Wolverhampton', 'Leeds', 'Derby', 'Blackpool', 'Oxenholme', 'Ontario',
              'Riyadh', 'Portland', 'Barclay', 'Calgary', 'Bangkok', 'Burigan', 'Nantucket', 'Menlo', 'Cottage',
              'Wisconsin', 'Gatwick', 'Singapore', 'Irvine', 'Frankfurt', 'Jersey', 'Columbus', 'Merseyside',
              'Fanshawe', 'Essex', 'Stafford', 'Philadelphia', 'Switzerland', 'Denmark', 'Sandestrom',
              'Braniff', 'Stockholm', 'Sweden', 'Germany', 'Belgium', 'Brussels', 'Rochester', 'Anchorage',
              'California', 'Arkansas', 'England', 'Michigan', 'Detroit', 'Indiana', 'Louisville', 'Ohio',
              'Tulsa', 'Indianapolis', 'Milwaukee', 'Oklahoma', 'Colorado', 'Virginia', 'Coxhoe', 'Redwich',
              'Camden', 'Leicestershire', 'Cumbria', 'Heswall', 'Wirral', 'Liver', 'Cheshire', 'Goldhampton',
              'International', 'Airport', 'Central', 'General', 'St', 'Atlanta', 'Nuneaton', 'Beijing', 'Europe',
              'LA', 'ORD', 'SFO', 'JFK', 'LAX', 'SAS', 'SF', 'Pancreas', 'Rugby',
              'Palo', 'Green', 'New', 'Lime', 'Orange', 'San', 'Los', 'Hong', 'John', 'Las', 'Saudi', 'Saint',
              # The comma between 'Lime' and 'Street' is required: without it
              # Python concatenates the adjacent literals into 'LimeStreet'.
              'Santa', 'Soviet', 'Little', 'Alto', 'Bay', 'Lime', 'Street', 'County', 'Francisco', 'Angeles',
              'Diego', 'Jose', 'Bernadino', 'Kong', 'Wayne', 'Vegas', 'Arabia', 'Louis', 'Petersburg', 'Ana',
              'Union', 'Rock']
placewords = [s.lower() for s in placewords]

# from transdict finds
companywords = ['United', 'American', 'Virgin', 'Express', 'Delta', 'Trainlines', 'Visa', 'Airlines', 'Travel',
                'Hertz', 'Lufthansa', 'Northwest', 'Canadian', 'British', 'Trainline', 'Northwestern',
                'Mastercard', 'Airline', 'Southwestern', 'Telesales', 'Merchandising', 'Aeroflight',
                'Airways', 'Korean', 'Hotel', 'Alaskan', 'Alaska', 'TWA', 'Eagle', 'Saudia', 'Arabian',
                'PanAm', 'CP', 'Air', 'trainlines', 'Continental', 'Cathay', 'Pan', 'Pacific', 'Am']
companywords = [s.lower() for s in companywords]

# seat-class words
ticketwords = ['Coach', 'Business', 'non-stop', 'Round-trip', 'Advance',
               'Super', 'Value', 'Return', 'Saver', 'First', 'Class', 'Single']
ticketwords = [s.lower() for s in ticketwords]

# currency units (already lower-case)
currencies = ['pounds', 'dollars', 'cents', 'dollar']

# magnitude words (already lower-case)
numbers = ['hundred', 'hundreds', 'thousand', 'thousands']

vowels = ['a', 'e', 'i', 'o', 'u']

# Lists defined above:
# timewords, datewords, placewords, companywords, ticketwords, currencies, numbers, vowels

## feature extractors

In [4]:
def _token_features(word, postag, prefix=''):
    """Return the suffix, POS and gazetteer features shared by every token.

    `prefix` is prepended to each feature name ('' for the current token,
    '-1:' / '+1:' for its neighbours).
    """
    return [
        prefix + 'wordending[-3:]=' + word[-3:],
        prefix + 'wordending[-2:]=' + word[-2:],
        prefix + 'wordending[-1:]=' + word[-1:],
        prefix + 'postag=' + postag,
        prefix + 'posclass=' + postag[:2],
        # this feature tests for the '#' character, not for 0-9
        prefix + 'word.hasdigit=%s' % ('#' in word),
        prefix + 'word.istime=%s' % (word in timewords),
        prefix + 'word.isdate=%s' % (word in datewords),
        prefix + 'word.isplace=%s' % (word in placewords),
        prefix + 'word.iscompany=%s' % (word in companywords),
        prefix + 'word.istixtype=%s' % (word in ticketwords),
        prefix + 'word.iscurrency=%s' % (word in currencies),
        prefix + 'word.isnumber=%s' % (word in numbers),
    ]


def word2features(sent, i):
    """Build the CRF feature list for the token at position `i` of `sent`.

    Args:
        sent: sequence of (token, postag) pairs for one sentence.
        i: index of the token to describe.

    Returns:
        A list of feature strings: features of the token itself, followed by
        '-1:'-prefixed features of the previous token (or 'BOS' when `i` is
        the first position) and '+1:'-prefixed features of the next token
        (or 'EOS' when `i` is the last position).
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = ['bias', 'wordlength=' + str(len(word))]
    features.extend(_token_features(word, postag))
    features.extend([
        # Slices instead of word[0] / word[-1]: an empty token yields ''
        # (not in the vowel list -> False) instead of raising IndexError.
        'word.startsvowel=%s' % (word[:1] in vowels),
        'word.endsvowel=%s' % (word[-1:] in vowels),
    ])

    if i > 0:
        features.extend(_token_features(sent[i-1][0], sent[i-1][1], prefix='-1:'))
    else:
        features.append('BOS')

    if i < len(sent)-1:
        features.extend(_token_features(sent[i+1][0], sent[i+1][1], prefix='+1:'))
    else:
        features.append('EOS')

    return features


def sent2features(sent):
    """Return one feature list per token of `sent`, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2tokens(sent):
    """Return only the tokens of a sentence given as (token, postag) pairs."""
    tokens = []
    for token, _postag in sent:
        tokens.append(token)
    return tokens

In [5]:
# Positional 90/10 split (no shuffling): the first 90% of the sentences are
# used for training and the remainder is held out for testing.
split_idx = int(0.9 * len(X_tokens))
train_sents, test_sents = X_zips[:split_idx], X_zips[split_idx:]
y_train, y_test = y_nertags[:split_idx], y_nertags[split_idx:]

In [6]:
%%time
# Turn every sentence of both splits into its per-token feature lists.
X_train = list(map(sent2features, train_sents))
X_test = list(map(sent2features, test_sents))

CPU times: user 2.22 s, sys: 52 ms, total: 2.28 s
Wall time: 2.28 s


In [7]:
# Sanity check: show the features of the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 'wordlength=4',
 'wordending[-3:]=hat',
 'wordending[-2:]=at',
 'wordending[-1:]=t',
 'postag=WP',
 'posclass=WP',
 'word.hasdigit=False',
 'word.istime=False',
 'word.isdate=False',
 'word.isplace=False',
 'word.iscompany=False',
 'word.istixtype=False',
 'word.iscurrency=False',
 'word.isnumber=False',
 'word.startsvowel=False',
 'word.endsvowel=False',
 'BOS',
 '+1:wordending[-3:]=ime',
 '+1:wordending[-2:]=me',
 '+1:wordending[-1:]=e',
 '+1:postag=NN',
 '+1:posclass=NN',
 '+1:word.hasdigit=False',
 '+1:word.istime=False',
 '+1:word.isdate=False',
 '+1:word.isplace=False',
 '+1:word.iscompany=False',
 '+1:word.istixtype=False',
 '+1:word.iscurrency=False',
 '+1:word.isnumber=False']

## create model and train

In [8]:
%%time
# Hyper-parameters handed to CRFsuite.
training_params = {
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 200,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
}

trainer = pycrfsuite.Trainer(verbose=False)

# Register every training sentence (feature sequence + label sequence).
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params(training_params)

CPU times: user 1.65 s, sys: 16 ms, total: 1.67 s
Wall time: 1.67 s


In [9]:
%%time
# Fit the CRF on the appended sequences and write the model to this path.
trainer.train('../00_data/model/baseline.crfsuite')

CPU times: user 30.1 s, sys: 4 ms, total: 30.1 s
Wall time: 30.1 s


## create tagger, evaluation

In [10]:
# Load the trained model from disk into a tagger used for all predictions below.
tagger = pycrfsuite.Tagger()
# open() returns a contextlib.closing wrapper (echoed as the cell output).
tagger.open('../00_data/model/baseline.crfsuite')

<contextlib.closing at 0x7f71d4544c88>

In [11]:
# Show one held-out sentence next to its predicted and gold NER tags.
idx = 4
example_sent = test_sents[idx]
example_tokens = sent2tokens(example_sent)
print(' '.join(example_tokens), end='\n\n')

predicted_tags = tagger.tag(sent2features(example_sent))
print("Predicted:", ' '.join(predicted_tags))
print("Correct:  ", ' '.join(y_test[idx]))

that 's on the ##th of october

Predicted: O O O O DAT O DAT
Correct:   O O O O DAT O DAT


In [12]:
# Token-level accuracy over the held-out sentences: one entry per token,
# 1 when the predicted tag equals the gold tag and 0 otherwise.
# NOTE(review): this counts every token, including those tagged 'O', so it
# may look optimistic if 'O' dominates the data; a per-label report (e.g.
# the already imported classification_report) would be more informative.
scores = []

for sent, trues in zip(test_sents, y_test):
    preds = tagger.tag(sent2features(sent))
    # Assumes the tagger returns one tag per input token, aligned with the
    # gold tags.
    scores.extend(int(pred == true) for pred, true in zip(preds, trues))

# An empty test split would otherwise raise ZeroDivisionError.
accuracy = sum(scores)/len(scores) if scores else float('nan')
print('acc:', accuracy)

acc: 0.998266078184111
