In [1]:
import sys
# Add the parent directory to the module search path. Presumably this is what makes the
# local `hc_nlp` package importable from this notebook's folder -- TODO confirm.
sys.path.append("..")

import spacy

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter
from hc_nlp.spacy_helpers import display_ner_annotations

## ThesaurusMatcher

In [6]:
# Load the small English model with its built-in `ner` component disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Build a case-insensitive ThesaurusMatcher from the JSONL thesaurus and append it
# as the last stage of the pipeline.
thes = ThesaurusMatcher(
    nlp,
    thesaurus_path="../data/labels_all_unambiguous_types_people_orgs.jsonl",
    case_sensitive=False,
)
nlp.add_pipe(thes, last=True)

# Show the resulting component order as the cell output.
nlp.pipe_names

2020-12-08 14:19:47,425 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-08 14:19:52,464 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 5s


['tagger', 'parser', 'ThesaurusMatcher']

In [15]:
# Sample passage (about Babbage's Difference Engine No. 1) that the cells below run
# through the pipeline and display.
text = """
Charles Babbage's calculating engines are among the most celebrated icons in the prehistory of computing. His Difference Engine No. 1 was the first successful automatic calculator and remains one of the finest examples of precision engineering of the time. The portion shown was assembled in 1832 by Babbage's engineer, Joseph Clement. 
"""

In [16]:
# Baseline: annotate the sample text WITHOUT an EntityFilter in the pipeline.
# If a previous run of a later cell left one attached, detach it first.
if nlp.has_pipe("EntityFilter"):
    nlp.remove_pipe("EntityFilter")

# Process the passage and render the annotations it produces.
doc = nlp(text)
display_ner_annotations(doc)

In [17]:
# Comparison: annotate the same text WITH an EntityFilter in the pipeline.
# Start from a pipeline that has no EntityFilter, so this cell can be re-run.
if nlp.has_pipe("EntityFilter"):
    nlp.remove_pipe("EntityFilter")

# The filter is appended as the last stage here; it could alternatively be placed
# before the ner component.
# NOTE(review): the exact meaning of max_token_length=1 is defined by EntityFilter -- confirm.
entityfilter = EntityFilter(max_token_length=1)
nlp.add_pipe(entityfilter, last=True)

# Confirm the component order before running the pipeline.
print(nlp.pipe_names)

doc = nlp(text)
display_ner_annotations(doc)

['tagger', 'parser', 'ThesaurusMatcher', 'EntityFilter']
