In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp import pipeline
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import display_manual_annotations, display_ner_annotations
from hc_nlp import constants

import pandas as pd
import spacy

import time

In [2]:
# !{sys.executable} -m spacy download en_core_web_trf

In [3]:
# LOAD MODELS

model_name = "en_core_web_lg"

# Thesaurus file shared by every pipeline that uses the "thesaurus_matcher" component.
THESAURUS_PATH = "../data/labels_all_unambiguous_types_people_orgs.jsonl"


def build_pipeline(date_matcher=False, thesaurus_position=None, overwrite_ents=False):
    """Load `model_name` and add the requested custom components around 'ner'.

    Components are added in a fixed order so that every variant is built the
    same way: optional "date_matcher" before 'ner', optional "thesaurus_matcher"
    before/after 'ner' followed by "entity_filter" (last), and finally
    "map_entity_types" (last).

    Args:
        date_matcher: if True, add the "date_matcher" component before 'ner'.
        thesaurus_position: None for no thesaurus, or "before" / "after" to place
            "thesaurus_matcher" relative to 'ner'. Whenever a thesaurus is added,
            "entity_filter" is appended with `ent_labels_ignore=["DATE"]`.
        overwrite_ents: value of the thesaurus matcher's "overwrite_ents" config
            (only used when `thesaurus_position` is not None).

    Returns:
        The configured spaCy pipeline.

    Raises:
        ValueError: if `thesaurus_position` is not None, "before" or "after".
    """
    if thesaurus_position not in (None, "before", "after"):
        raise ValueError(
            f"thesaurus_position must be None, 'before' or 'after', got {thesaurus_position!r}"
        )

    nlp = spacy.load(model_name)

    if date_matcher:
        nlp.add_pipe("date_matcher", before="ner")

    if thesaurus_position is not None:
        thesaurus_config = {
            "case_sensitive": False,
            "overwrite_ents": overwrite_ents,
            "thesaurus_path": THESAURUS_PATH,
        }
        if thesaurus_position == "before":
            nlp.add_pipe("thesaurus_matcher", config=thesaurus_config, before="ner")
        else:
            nlp.add_pipe("thesaurus_matcher", config=thesaurus_config, after="ner")
        nlp.add_pipe("entity_filter", last=True, config={"ent_labels_ignore": ["DATE"]})

    # mapping from spaCy to HC types is added to all pipelines, always as the final component
    nlp.add_pipe("map_entity_types")

    return nlp


# pure NER
nlp_lg = build_pipeline()

# model with people & orgs thesaurus annotations before model
nlp_thes = build_pipeline(thesaurus_position="before")

# model with people & orgs thesaurus annotations after model
nlp_thes_aft = build_pipeline(thesaurus_position="after")

# model with people & orgs thesaurus annotations after model, overwriting NER annotations
nlp_thes_aft_ow = build_pipeline(thesaurus_position="after", overwrite_ents=True)

# model with rules for dates
nlp_dates_bef = build_pipeline(date_matcher=True)

# model with rules for dates before & thesaurus after
nlp_dates_bef_thes_aft = build_pipeline(date_matcher=True, thesaurus_position="after")

# model with rules for dates before & thesaurus after, with overwrite
nlp_dates_bef_thes_aft_ow = build_pipeline(
    date_matcher=True, thesaurus_position="after", overwrite_ents=True
)

(
    nlp_lg.pipe_names,
    nlp_thes.pipe_names,
    nlp_thes_aft.pipe_names,
    nlp_thes_aft_ow.pipe_names,
    nlp_dates_bef.pipe_names,
    nlp_dates_bef_thes_aft.pipe_names,
    nlp_dates_bef_thes_aft_ow.pipe_names,
)


2021-01-11 17:35:02,734 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-11 17:35:30,309 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 27s


17016 term thesaurus imported in 27s


2021-01-11 17:35:33,186 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-11 17:35:51,691 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 18s


17016 term thesaurus imported in 18s


2021-01-11 17:35:55,246 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-11 17:36:17,371 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 22s


17016 term thesaurus imported in 22s


2021-01-11 17:36:23,170 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-11 17:36:39,631 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 16s


17016 term thesaurus imported in 16s


2021-01-11 17:36:42,024 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-11 17:36:59,204 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 17s


17016 term thesaurus imported in 17s


(['tok2vec',
  'tagger',
  'parser',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'map_entity_types'],
 ['tok2vec',
  'tagger',
  'parser',
  'thesaurus_matcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_types'],
 ['tok2vec',
  'tagger',
  'parser',
  'ner',
  'thesaurus_matcher',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_types'],
 ['tok2vec',
  'tagger',
  'parser',
  'ner',
  'thesaurus_matcher',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_types'],
 ['tok2vec',
  'tagger',
  'parser',
  'date_matcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'map_entity_types'],
 ['tok2vec',
  'tagger',
  'parser',
  'date_matcher',
  'ner',
  'thesaurus_matcher',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_types'],
 ['tok2vec',
  'tagger',
  'parser',
  'date_matcher',
  'ner',
  'thesaurus_matcher',
  'attribute_ruler',
  'lemmatizer',
  'entity_filter',
  'map_entity_typ

In [4]:
# Label Studio export to evaluate against; the loader is also handed the plain NER pipeline.
# Each item of `data` is unpacked as (text, annotations) further down the notebook.
labelstudio_export = "../labelling/export/2020-12-10-12-43-04.zip"
data = load_text_and_annotations_from_labelstudio(labelstudio_export, nlp_lg)

len(data)

526

In [5]:
def generate_results_table(pipelines: dict, examples: list):
    """Run `test_ner` for each pipeline and collect the scores into one table.

    Args:
        pipelines: mapping of display name -> pipeline object passed to `test_ner`.
        examples: examples forwarded unchanged to `test_ner(..., examples=examples)`.

    Returns:
        A `(results, times)` tuple. `results` is a DataFrame indexed by
        'pipeline' with one row per entry of `pipelines`, whose columns are the
        flattened (`pd.json_normalize`) keys of each `test_ner` result. `times`
        maps each pipeline name to the seconds spent in its evaluation. For an
        empty `pipelines` mapping an empty DataFrame (index named 'pipeline')
        and an empty dict are returned.
    """
    frames = []
    times = dict()

    for name, p in pipelines.items():
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments
        start = time.perf_counter()
        temp_res = pd.json_normalize(test_ner(p, examples=examples))
        elapsed = time.perf_counter() - start

        temp_res['pipeline'] = name
        frames.append(temp_res)

        times[name] = elapsed
        print(f"{name}: {int(elapsed)}s")

    if not frames:
        # nothing was evaluated: keep the return shape without failing on set_index
        return pd.DataFrame(index=pd.Index([], name='pipeline')), times

    # DataFrame.append was removed in pandas 2.0; concatenating once also avoids
    # re-copying the accumulated frame on every iteration
    results = pd.concat(frames).set_index('pipeline')

    return results, times

# Score every pipeline variant against the labelled examples; keys become the
# 'pipeline' index of the results table.
res, times = generate_results_table(
    pipelines=dict(
        NER=nlp_lg,
        NER_thes_bef=nlp_thes,
        NER_thes_aft=nlp_thes_aft,
        NER_thes_aft_ow=nlp_thes_aft_ow,
        NER_datematcher_bef=nlp_dates_bef,
        NER_datematcher_bef_thes_aft=nlp_dates_bef_thes_aft,
        NER_datematcher_bef_thes_aft_ow=nlp_dates_bef_thes_aft_ow,
    ),
    examples=data,
)

# transpose so that pipelines are columns and metrics are rows
res.T
        

Failed:  Poster, London & North Eastern Railway, Bamburgh by Tom Purvis, 1936. Coloured lithograph depicting a stylised view of the coast with Bamburgh castle, the beach and village. Printed by Chorley & Pickersgill Ltd, Lithographers, Leeds. Format: double royal. Dimensions: 40 x 25 inches, 1016 x 635mm.
[E103] Trying to set conflicting doc.ents: '(40, 51, 'NORP')' and '(40, 51, 'ORG')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Glass bottle containing unknown grey, metallic-looking powder. Part of Statham's student chemical laboratory.
[E103] Trying to set conflicting doc.ents: '(71, 78, 'PERSON')' and '(71, 78, 'ORG')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Bone gouge probably by McQueen of Newcastle, England, late 19th early 20th century, nickel plated steel, handle probably brass, nickel plated
[E103] Trying to set conflicting doc.ents: '(23, 43, 'ORG')

pipeline,NER,NER_thes_bef,NER_thes_aft,NER_thes_aft_ow,NER_datematcher_bef,NER_datematcher_bef_thes_aft,NER_datematcher_bef_thes_aft_ow
ents_p,0.508428,0.548974,0.545026,0.550767,0.522862,0.560209,0.565928
ents_r,0.626676,0.627346,0.620643,0.625335,0.651475,0.645442,0.650134
ents_f,0.561393,0.585549,0.580382,0.585687,0.580125,0.599813,0.605115
support,1549,1549,1549,1549,1549,1549,1549
labels_missing_from_annotations,"[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL...","[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL...","[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL...","[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL...","[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL...","[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL...","[FAC, MONEY, LANGUAGE, TIME, PERCENT, CARDINAL..."
ents_per_type.LOC.p,0.789655,0.833333,0.802817,0.831502,0.789655,0.802817,0.831502
ents_per_type.LOC.r,0.731629,0.734824,0.728435,0.72524,0.731629,0.728435,0.72524
ents_per_type.LOC.f,0.759536,0.780985,0.763819,0.774744,0.759536,0.763819,0.774744
ents_per_type.LOC.support,322,322,322,322,322,322,322
ents_per_type.ORG.p,0.517483,0.491803,0.509524,0.497925,0.517483,0.509524,0.497925


In [6]:
# Side-by-side view of predictions from three pipelines and the manual (ground truth)
# annotations for the first 30 examples.
# NOTE(review): the "best" headings point at the non-overwrite thesaurus pipelines, while the
# ents_f row of the results table above is highest for the *_ow variants — confirm the choice.
comparison_pipelines = (
    ("NER", nlp_lg),
    ("best thesaurus", nlp_thes_aft),
    ("best thesaurus w/ date patterns", nlp_dates_bef_thes_aft),
)

for text, annotations in data[:30]:
    for heading, pipe in comparison_pipelines:
        print(heading)
        display_ner_annotations(text, pipe)

    print("GT")
    display_manual_annotations(text, annotations)

    # two separator lines between examples
    for _ in range(2):
        print("-----")

NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus




best thesaurus w/ date patterns


GT


-----
-----
