# Model Playground
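A scratch notebook for experimenting with `hc-nlp`'s best model: it runs the base spaCy pipeline and the augmented pipeline on a sample text, then scores both against labelled data.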

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

import spacy
from hc_nlp import pipeline, constants

def load_augmented_model(thesaurus_path: str, model_type: str = "en_core_web_trf"):
    """Load a spaCy model and attach the extra matcher/filter components.

    Components are registered in this order (position arguments in brackets):
    ``date_matcher`` [before ``ner``], ``pattern_matcher`` [before
    ``date_matcher``], ``thesaurus_matcher`` [after ``ner``], then
    ``entity_filter`` and ``map_entity_types`` [both appended last].

    Args:
        thesaurus_path: path handed to ``thesaurus_matcher`` via its config.
        model_type: name of the spaCy model passed to ``spacy.load``.

    Returns:
        The loaded pipeline with the additional components attached.
    """
    augmented = spacy.load(model_type)

    thesaurus_config = {
        "case_sensitive": False,
        "overwrite_ents": False,
        "thesaurus_path": thesaurus_path,
    }

    # (component name, add_pipe keyword arguments), listed in registration order.
    # The order matters: `pattern_matcher` is positioned relative to
    # `date_matcher`, so `date_matcher` has to be in the pipeline first.
    component_specs = [
        ("date_matcher", {"before": "ner"}),
        (
            "pattern_matcher",
            {
                "before": "date_matcher",
                "config": {"patterns": constants.COLLECTION_NAME_PATTERNS},
            },
        ),
        ("thesaurus_matcher", {"config": thesaurus_config, "after": "ner"}),
        ("entity_filter", {"config": {"ent_labels_ignore": ["DATE"]}, "last": True}),
        ("map_entity_types", {"last": True}),
    ]

    for component_name, add_kwargs in component_specs:
        augmented.add_pipe(component_name, **add_kwargs)

    return augmented

In [2]:
# Baseline pipeline: the stock transformer model with only `map_entity_types`
# appended as the final component.
nlp = spacy.load("en_core_web_trf")
nlp.add_pipe('map_entity_types', last=True)

# Augmented pipeline: built by `load_augmented_model` (default model type) using
# the thesaurus file below.
nlp_aug = load_augmented_model("../data/labels_all_unambiguous_types_people_orgs.jsonl")

# Display the component names of both pipelines for comparison.
nlp.pipe_names, nlp_aug.pipe_names

2021-01-21 12:47:14,412 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2021-01-21 12:47:16,886 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 2s


(['transformer',
  'tagger',
  'parser',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'map_entity_types'],
 ['transformer',
  'tagger',
  'parser',
  'pattern_matcher',
  'date_matcher',
  'ner',
  'thesaurus_matcher',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_types'])

---

## text

In [3]:
# Sample description that is fed to both `nlp` and `nlp_aug` in the cells below.
text = """
Illuminated album presented to Mr Joseph Henry Morton, born in London, on his retirement from the London & North Eastern Railway after 51 years faithful service, 31 March 1924. Green leather bound book inscribed with the signatures of those who identified themselves with the presentation.
"""

## base model

In [4]:
# Run the baseline pipeline over the sample text.
doc = nlp(text)

# Render the detected entities with displaCy's entity visualiser.
spacy.displacy.render(doc, style='ent')

In [5]:
# Text of every entity span the baseline pipeline produced for `doc`.
[ent.text for ent in doc.ents]

['Joseph Henry Morton',
 'London',
 'the London & North Eastern Railway',
 '51 years',
 '31 March 1924']

## augmented model

In [6]:
# Run the augmented pipeline over the same sample text.
doc_aug = nlp_aug(text)

# Print the entity texts first, then render them with displaCy so the list can be
# compared against the baseline output above.
print([ent.text for ent in doc_aug.ents])
spacy.displacy.render(doc_aug, style='ent')

['Joseph Henry Morton', 'London', 'the London & North Eastern Railway', '31 March 1924']


In [7]:
# NOTE(review): this inspects `doc` (the baseline pipeline's output) even though
# the cell sits under the "augmented model" heading — confirm that is intended.
#
# For each entity: its text, its token start/end offsets, and the token directly
# before it. An entity starting at token 0 has no predecessor; without the guard,
# `doc[ent.start - 1]` would become `doc[-1]` and silently return the document's
# last token, so `None` is reported instead.
[
    (ent.text, ent.start, ent.end, doc[ent.start - 1] if ent.start > 0 else None)
    for ent in doc.ents
]

[('Joseph Henry Morton', 6, 9, Mr),
 ('London', 12, 13, in),
 ('the London & North Eastern Railway', 18, 24, from),
 ('51 years', 25, 27, after),
 ('31 March 1924', 30, 33, ,)]

---

# test model

In [8]:
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.model_testing import test_ner
import pandas as pd
import time

def generate_results_table(pipelines: dict, examples: list):
    """Run ``test_ner`` for every pipeline and gather the scores into one table.

    Args:
        pipelines: mapping of a display name to the pipeline object that is
            passed to ``test_ner``.
        examples: evaluation examples, forwarded unchanged to ``test_ner``.

    Returns:
        A ``(results, times)`` tuple. ``results`` is a DataFrame indexed by
        ``pipeline`` (the display name) whose columns are the flattened
        (``pd.json_normalize``) output of ``test_ner``. ``times`` maps each
        display name to the elapsed wall-clock seconds of its evaluation.
        An empty ``pipelines`` mapping yields an empty table and an empty dict.
    """
    frames = []
    times = dict()
    for name, p in pipelines.items():
        start = time.time()
        temp_res = pd.json_normalize(test_ner(p, examples=examples))
        end = time.time()
        temp_res['pipeline'] = name

        # Collect the per-pipeline frames and concatenate once after the loop:
        # DataFrame.append was removed in pandas 2.0 and re-copied the whole
        # table on every iteration.
        frames.append(temp_res)
        times[name] = end - start
        print(f"{name}: {int(end-start)}s")

    if not frames:
        # Nothing was evaluated; return an empty table with the expected index
        # name instead of failing on a missing 'pipeline' column.
        return pd.DataFrame(index=pd.Index([], name='pipeline')), times

    results = pd.concat(frames).set_index('pipeline')

    return results, times


In [9]:
# Load the labelled examples from the exported zip. `nlp` is handed to the loader;
# presumably it is used to tokenise/align the annotations — confirm in `hc_nlp.io`.
data = load_text_and_annotations_from_labelstudio("../labelling/export/2020-12-10-12-43-04.zip", nlp)


In [10]:
# Score the baseline ("core") and augmented pipelines on the labelled examples.
res, times = generate_results_table({"core": nlp, "augmented": nlp_aug}, data)

# Transpose so each pipeline is a column and each metric is a row.
res.T

Failed:  Poster, London & North Eastern Railway, Bamburgh by Tom Purvis, 1936. Coloured lithograph depicting a stylised view of the coast with Bamburgh castle, the beach and village. Printed by Chorley & Pickersgill Ltd, Lithographers, Leeds. Format: double royal. Dimensions: 40 x 25 inches, 1016 x 635mm.
[E103] Trying to set conflicting doc.ents: '(40, 48, 'LOC')' and '(40, 51, 'NORP')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Glass bottle containing unknown grey, metallic-looking powder. Part of Statham's student chemical laboratory.
[E103] Trying to set conflicting doc.ents: '(71, 78, 'PERSON')' and '(71, 78, 'ORG')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Bone gouge probably by McQueen of Newcastle, England, late 19th early 20th century, nickel plated steel, handle probably brass, nickel plated
[E103] Trying to set conflicting doc.ents: '(23, 43, 'ORG')

pipeline,core,augmented
ents_p,0.602317,0.646254
ents_r,0.731903,0.728552
ents_f,0.660817,0.68494
support,1549,1549
labels_missing_from_annotations,"[CARDINAL, ORDINAL, TIME, QUANTITY, LANGUAGE, ...","[CARDINAL, ORDINAL, TIME, QUANTITY, LANGUAGE, ..."
ents_per_type.LOC.p,0.893238,0.896057
ents_per_type.LOC.r,0.801917,0.798722
ents_per_type.LOC.f,0.845118,0.844595
ents_per_type.LOC.support,322,322
ents_per_type.ORG.p,0.612745,0.615385
