In [1]:
import spacy
import re
from skweak import heuristics, gazetteers, aggregation, utils

In [2]:
def money_detector(doc):
    """Yield ``(start, end, "MONEY")`` for each currency-symbol + number pair.

    Every token after the first is inspected: when its text begins with a
    digit and the token immediately before it is flagged ``is_currency``,
    a span covering both tokens is produced (``end`` is exclusive).
    """
    for token in doc[1:]:
        # Cheap textual check first; tokens not starting with a digit are skipped.
        if not token.text[0].isdigit():
            continue
        previous = token.nbor(-1)
        if previous.is_currency:
            start = token.i - 1
            end = token.i + 1
            yield start, end, "MONEY"

In [3]:
# Wrap money_detector in a skweak FunctionAnnotator so it can be applied to a doc.
# NOTE(review): "money" is presumably the annotator's name / output layer — confirm.
lf1 = heuristics.FunctionAnnotator("money", money_detector)

In [4]:
# Token-level annotator tagging four-character year tokens as DATE.
# The constraint returns a match object (truthy) when the token text is
# "19" or "20" followed by exactly two digits, and None otherwise.
# The pattern is a raw string so `\d` reaches the regex engine verbatim rather
# than being an invalid string escape (a SyntaxWarning on Python 3.12+).
# NOTE(review): "years" is presumably the annotator's name — confirm.
lf2 = heuristics.TokenConstraintAnnotator(
    "years", lambda tok: re.match(r"(19|20)\d{2}$", tok.text), "DATE")


In [5]:
# Person names to look up, each given as a (first name, last name) tuple.
NAMES = [
    ("Barack", "Obama"),
    ("Donald", "Trump"),
    ("Joe", "Biden"),
]

# Build a trie from the name tuples and hand it to a gazetteer annotator
# under the "PERSON" key.
# NOTE(review): "presidents" is presumably the annotator's name — confirm.
trie = gazetteers.Trie(NAMES)
lf3 = gazetteers.GazetteerAnnotator(
    "presidents",
    {"PERSON": trie},
)

In [6]:
# Load the spaCy pipeline "en_core_web_sm" and run it on one example sentence;
# the resulting `doc` is what the labelling functions are applied to.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Donald Trump paid $750 in federal income taxes in 2016")

In [7]:
# Apply the three labelling functions one after another (money, years,
# presidents), each receiving the document returned by the previous one.
doc = lf1(doc)
doc = lf2(doc)
doc = lf3(doc)

In [8]:
# Create an HMM aggregator over the three labels emitted by the labelling
# functions (PERSON, DATE, MONEY), then fit it on a one-document list.
# The call is left as the cell's last expression so its return value is shown.
# NOTE(review): "hmm" is presumably the name under which aggregated spans are
# stored on the doc — the same string is passed to display_entities; confirm.
hmm = aggregation.HMM("hmm", ["PERSON", "DATE", "MONEY"])
hmm.fit_and_aggregate([doc])

Starting iteration 1
Finished E-step with 1 documents
Starting iteration 2
Finished E-step with 1 documents


         1         -18.9513             +nan
         2         -19.0673          -0.1160


[Donald Trump paid $750 in federal income taxes in 2016]

In [9]:
# Show the entities on `doc` for the "hmm" key (the name given to the HMM aggregator).
# NOTE(review): presumably renders the aggregated spans inline — confirm against skweak docs.
utils.display_entities(doc, "hmm")