# Model Playground
For experimenting with `hc-nlp`'s best model.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import spacy
from hc_nlp import pipeline

def load_augmented_model(thesaurus_path: str, model_type: str = "en_core_web_trf"):
    """
    Load a spaCy pipeline and extend it with the custom `hc_nlp` components.

    Components are added in this order:
      * 'date_matcher'       - placed before 'ner'
      * 'thesaurus_matcher'  - placed after 'ner', case-insensitive and not
                               overwriting existing entities
      * 'entity_filter'      - appended last, configured to ignore "DATE" labels
      * 'map_entity_types'   - appended last (so it ends up after 'entity_filter')

    Args:
        thesaurus_path: path handed to 'thesaurus_matcher' as its `thesaurus_path`.
        model_type: name of the spaCy model passed to `spacy.load`.

    Returns:
        The loaded pipeline with the four extra components attached.
    """
    thesaurus_config = {
        "case_sensitive": False,
        "overwrite_ents": False,
        "thesaurus_path": thesaurus_path,
    }
    filter_config = {"ent_labels_ignore": ["DATE"]}

    augmented = spacy.load(model_type)

    # Order of the add_pipe calls matters: both `last=True` components are
    # appended in sequence, so 'map_entity_types' runs after 'entity_filter'.
    augmented.add_pipe("date_matcher", before="ner")
    augmented.add_pipe("thesaurus_matcher", config=thesaurus_config, after="ner")
    augmented.add_pipe("entity_filter", config=filter_config, last=True)
    augmented.add_pipe("map_entity_types", last=True)

    return augmented

In [2]:
# Baseline: the stock transformer pipeline with no extra components.
nlp = spacy.load("en_core_web_trf")
# Augmented: same default model plus the hc_nlp components added by
# load_augmented_model, using the thesaurus file at the given path.
nlp_aug = load_augmented_model("../data/labels_all_unambiguous_types_people_orgs.jsonl")

# Show both component lists side by side to compare the pipelines.
nlp.pipe_names, nlp_aug.pipe_names

2021-01-20 14:54:43,532 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2021-01-20 14:54:45,812 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 2s


(['transformer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'],
 ['transformer',
  'tagger',
  'parser',
  'date_matcher',
  'ner',
  'thesaurus_matcher',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_types'])

---

## text

In [72]:
# Sample text that every pipeline in the cells below is run on.
text = """
Illuminated album presented to Mr Joseph Henry Morton on his retirement from the London & North Eastern Railway after 51 years faithful service, 31 March 1924. Green leather bound book inscribed with the signatures of those who identified themselves with the presentation.
"""

## base model

In [73]:
# Run the baseline pipeline on the sample text.
doc = nlp(text)

# Render the recognised entities inline with displaCy's entity visualiser.
spacy.displacy.render(doc, style='ent')

In [74]:
# Plain-text list of the entities found by the baseline pipeline.
[ent.text for ent in doc.ents]

['Joseph Henry Morton',
 'the London & North Eastern Railway',
 '51 years',
 '31 March 1924']

## augmented model

In [75]:
# Run the augmented pipeline on the same sample text.
doc_aug = nlp_aug(text)

# Print the entity texts (for comparison with the baseline list above),
# then render them inline with displaCy.
print([ent.text for ent in doc_aug.ents])
spacy.displacy.render(doc_aug, style='ent')

['Joseph Henry Morton', 'the London & North Eastern Railway', '51 years', '31 March 1924']


In [76]:
# For each entity found by the baseline pipeline, show its text, its token
# span (start, end) and the token immediately before it.
# An entity that starts at token 0 has no predecessor: without the guard,
# `doc[-1]` would wrap around and report the *last* token of the document.
[
    (ent.text, ent.start, ent.end, doc[ent.start - 1] if ent.start > 0 else None)
    for ent in doc.ents
]

[('Joseph Henry Morton', 6, 9, Mr),
 ('the London & North Eastern Railway', 13, 19, from),
 ('51 years', 20, 22, after),
 ('31 March 1924', 25, 28, ,)]

In [81]:
from hc_nlp import constants

def _remove_n_years_from_date_entities(
    doc: spacy.tokens.Doc
) -> spacy.tokens.Doc:
    """
    Removes any DATE entities with the format 'n years'

    An entity is dropped when all of the following hold:
      * its label is "DATE",
      * its lower-cased text contains "years",
      * its first token is number-like (`like_num`).

    The input `doc` is left untouched; the filtering is applied to a deep copy.

    Args:
        doc: the processed document whose entities should be filtered.

    Returns:
        A deep copy of `doc` whose `ents` exclude the matching DATE entities.
    """
    # Imported here because the notebook never imports `copy` at the top;
    # without this the function raises NameError on a fresh kernel.
    import copy

    def _is_n_years(ent) -> bool:
        return (ent.label_ == "DATE") and ("years" in ent.text.lower()) and ent[0].like_num

    newdoc = copy.deepcopy(doc)

    # Single filtering pass instead of reassigning `newdoc.ents` once per match.
    newdoc.ents = [ent for ent in newdoc.ents if not _is_n_years(ent)]

    return newdoc


In [82]:
# Strip 'n years' DATE entities from the augmented pipeline's output
# (doc_aug itself is not modified; the function works on a copy).
newdoc_aug = _remove_n_years_from_date_entities(doc_aug)
# Render the remaining entities with displaCy.
spacy.displacy.render(newdoc_aug, style='ent')

In [65]:
# Experiment: stock pipeline plus the 'pattern_matcher' component only.
# NOTE(review): presumably this import is what makes the "pattern_matcher"
# factory available to add_pipe — confirm against hc_nlp.pipeline.
from hc_nlp.pipeline import pattern_matcher
from spacy.pipeline.morphologizer import DEFAULT_MORPH_MODEL
# NOTE(review): `config` is not used by any of the visible cells — confirm
# whether it (and the DEFAULT_MORPH_MODEL import) can be dropped.
config = {"model": DEFAULT_MORPH_MODEL}

n = spacy.load("en_core_web_trf")
# `constants` is imported in the cell that defines
# _remove_n_years_from_date_entities, so that cell must have been run first.
n.add_pipe("pattern_matcher", before='ner', config={"patterns": constants.COLLECTION_NAME_PATTERNS})

# Display the component order to confirm 'pattern_matcher' sits before 'ner'.
n.pipe_names

['transformer',
 'tagger',
 'parser',
 'pattern_matcher',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [66]:
# Run the pattern-matcher pipeline on the sample text.
d = n(text)

# Render the resulting entities with displaCy.
spacy.displacy.render(d, style='ent')