In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter, MapEntityTypes, DateMatcher
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import display_manual_annotations, display_ner_annotations
from hc_nlp import constants

import pandas as pd
import spacy

import time

In [6]:
# One-off setup: uncomment and run once to download the transformer model used below.
# !{sys.executable} -m spacy download en_core_web_trf

[38;5;3m⚠ Skipping pipeline package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the package dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the pipeline package has
other dependencies, you'll have to install them manually.[0m
Collecting en_core_web_trf==3.0.0a0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0a0/en_core_web_trf-3.0.0a0.tar.gz (459.7 MB)
[K     |████████████████████████████████| 459.7 MB 16.4 MB/s eta 0:00:013 |                                | 604 kB 609 kB/s eta 0:12:33     |█████████████████████▍          | 307.4 MB 10.6 MB/s eta 0:00:15
[?25hBuilding wheels for collected packages: en-core-web-trf
  Building wheel for en-core-web-trf (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-trf: filename=en_core_web_trf-3.0.0a0-py3-none-any.whl size=459703091 sha256

In [3]:
# LOAD MODELS
#
# Every pipeline starts from the same pretrained spaCy model and differs only in
# which custom components are inserted around the stock `ner` component.

model_name = "en_core_web_trf"
thesaurus_path = "../data/labels_all_unambiguous_types_people_orgs.jsonl"


def build_pipeline(date_rules=False, thesaurus_position=None, overwrite_ents=False):
    """Load a fresh copy of `model_name` and add the requested custom components.

    Args:
        date_rules: when True, add "DateMatcher" before `ner`.
        thesaurus_position: "before" or "after" to add "ThesaurusMatcher" on that
            side of `ner`, followed by an "EntityFilter" (configured with
            `ent_labels_ignore=["DATE"]`) at the end of the pipeline.
            `None` adds neither component.
        overwrite_ents: forwarded as the `overwrite_ents` setting of
            "ThesaurusMatcher"; only relevant when `thesaurus_position` is set.

    Returns:
        The loaded pipeline, with "MapEntityTypes" (mapping from spaCy to HC
        entity types) appended as its final component.

    Raises:
        ValueError: if `thesaurus_position` is not None, "before" or "after".
    """
    if thesaurus_position not in (None, "before", "after"):
        raise ValueError(
            f"thesaurus_position must be None, 'before' or 'after', got {thesaurus_position!r}"
        )

    nlp = spacy.load(model_name)

    if date_rules:
        nlp.add_pipe("DateMatcher", before="ner")

    if thesaurus_position is not None:
        nlp.add_pipe(
            "ThesaurusMatcher",
            config={
                "case_sensitive": False,
                "overwrite_ents": overwrite_ents,
                "thesaurus_path": thesaurus_path,
            },
            # expands to before="ner" or after="ner"
            **{thesaurus_position: "ner"},
        )
        nlp.add_pipe("EntityFilter", last=True, config={"ent_labels_ignore": ["DATE"]})

    # Added last so it sees the output of every other component.
    nlp.add_pipe("MapEntityTypes", config={"validate_mapping": False})
    return nlp


# pure NER
nlp_lg = build_pipeline()

# model with people & orgs thesaurus annotations before model
nlp_thes = build_pipeline(thesaurus_position="before")

# model with people & orgs thesaurus annotations after model
nlp_thes_aft = build_pipeline(thesaurus_position="after")

# model with people & orgs thesaurus annotations after model, overwriting NER annotations
nlp_thes_aft_ow = build_pipeline(thesaurus_position="after", overwrite_ents=True)

# model with rules for dates
nlp_dates_bef = build_pipeline(date_rules=True)

# model with rules for dates before & thesaurus after
nlp_dates_bef_thes_aft = build_pipeline(date_rules=True, thesaurus_position="after")

# model with rules for dates before & thesaurus after, with overwrite
nlp_dates_bef_thes_aft_ow = build_pipeline(
    date_rules=True, thesaurus_position="after", overwrite_ents=True
)

# Show the component order of every pipeline as a sanity check.
(
    nlp_lg.pipe_names,
    nlp_thes.pipe_names,
    nlp_thes_aft.pipe_names,
    nlp_thes_aft_ow.pipe_names,
    nlp_dates_bef.pipe_names,
    nlp_dates_bef_thes_aft.pipe_names,
    nlp_dates_bef_thes_aft_ow.pipe_names,
)


2020-12-10 16:54:45,755 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-10 16:54:49,736 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2020-12-10 16:54:54,100 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-10 16:54:57,790 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2020-12-10 16:55:01,189 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-10 16:55:04,573 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2020-12-10 16:55:18,016 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-10 16:55:25,640 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 7s


17016 term thesaurus imported in 7s


2020-12-10 16:55:29,772 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-10 16:55:33,294 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


(['transformer',
  'tagger',
  'parser',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ThesaurusMatcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTy

In [4]:
# Identifier of the Label Studio export to evaluate against
# (looks like a timestamped export name — TODO confirm against hc_nlp.io).
labelstudio_export = "2020-12-10-12-43-04"
data = load_text_and_annotations_from_labelstudio(labelstudio_export, nlp_lg)

# Number of loaded examples; each one is unpacked as (text, annotations) further down.
len(data)

526

In [5]:
def generate_results_table(pipelines: dict, examples: list):
    """Evaluate every pipeline with `test_ner` and gather the scores in one table.

    Args:
        pipelines: mapping of display name -> pipeline object; each value is
            passed to `test_ner` as its first argument.
        examples: evaluation examples, forwarded unchanged to `test_ner`.

    Returns:
        A `(results, times)` tuple. `results` is a DataFrame indexed by
        'pipeline' whose columns are the metrics returned by `test_ner`,
        flattened with `pd.json_normalize` (nested keys become dotted column
        names). `times` maps each pipeline name to the seconds spent
        evaluating it. If `pipelines` is empty, `results` is an empty frame
        with an empty 'pipeline' index and `times` is `{}`.
    """
    frames = []
    times = dict()
    for name, p in pipelines.items():
        # perf_counter is monotonic, so the interval cannot be skewed by clock changes.
        start = time.perf_counter()
        scores = pd.json_normalize(test_ner(p, examples=examples))
        elapsed = time.perf_counter() - start
        scores['pipeline'] = name

        frames.append(scores)
        times[name] = elapsed
        print(f"{name}: {int(elapsed)}s")

    if not frames:
        # pd.concat refuses an empty list; return an empty table of the same shape.
        return pd.DataFrame(index=pd.Index([], name='pipeline')), times

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
    # frame inside the loop copies all previous rows on every iteration.
    results = pd.concat(frames).set_index('pipeline')

    return results, times

# Pipelines to benchmark, keyed by the label that becomes the index of the results table.
pipelines_to_compare = {
    'NER': nlp_lg,
    'NER_thes_bef': nlp_thes,
    'NER_thes_aft': nlp_thes_aft,
    'NER_thes_aft_ow': nlp_thes_aft_ow,
    'NER_datematcher_bef': nlp_dates_bef,
    'NER_datematcher_bef_thes_aft': nlp_dates_bef_thes_aft,
    'NER_datematcher_bef_thes_aft_ow': nlp_dates_bef_thes_aft_ow,
}
# NOTE: a 'NER_datematcher_aft' entry (nlp_dates_aft) is intentionally left out;
# no such pipeline is built in the cells visible here.

res, times = generate_results_table(pipelines_to_compare, data)

# Transposed for display: one row per metric, one column per pipeline.
res.T
        

Failed:  Poster, London & North Eastern Railway, Bamburgh by Tom Purvis, 1936. Coloured lithograph depicting a stylised view of the coast with Bamburgh castle, the beach and village. Printed by Chorley & Pickersgill Ltd, Lithographers, Leeds. Format: double royal. Dimensions: 40 x 25 inches, 1016 x 635mm.
[(8, 38, 'ORG'), (52, 62, 'PERSON'), (64, 68, 'DATE'), (185, 210, 'ORG'), (227, 232, 'LOC'), (40, 48, 'LOC'), (134, 142, 'LOC'), (8, 38, 'ORG'), (40, 51, 'ORG'), (40, 51, 'NORP'), (52, 62, 'PERSON'), (64, 68, 'DATE'), (70, 99, 'ORG'), (185, 210, 'ORG'), (212, 225, 'LOC'), (227, 232, 'LOC')]
Failed:  Glass bottle containing unknown grey, metallic-looking powder. Part of Statham's student chemical laboratory.
[(71, 78, 'ORG'), (71, 78, 'PERSON')]
Failed:  Bone gouge probably by McQueen of Newcastle, England, late 19th early 20th century, nickel plated steel, handle probably brass, nickel plated
[(45, 52, 'LOC'), (23, 43, 'ORG'), (54, 82, 'DATE'), (45, 52, 'LOC'), (54, 82, 'DATE'), (23, 

pipeline,NER,NER_thes_bef,NER_thes_aft,NER_thes_aft_ow,NER_datematcher_bef,NER_datematcher_bef_thes_aft,NER_datematcher_bef_thes_aft_ow
ents_p,0.59375,0.621185,0.640292,0.625912,0.59375,0.640292,0.625912
ents_r,0.733287,0.715369,0.725017,0.709166,0.733287,0.725017,0.709166
ents_f,0.656183,0.664958,0.680026,0.664943,0.656183,0.680026,0.664943
support,1549,1549,1549,1549,1549,1549,1549
labels_missing_from_annotations,"[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F...","[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F...","[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F...","[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F...","[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F...","[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F...","[TIME, PERCENT, QUANTITY, CARDINAL, ORDINAL, F..."
ents_per_type.LOC.p,0.860902,0.862963,0.860377,0.863118,0.860902,0.860377,0.863118
ents_per_type.LOC.r,0.768456,0.781879,0.765101,0.761745,0.768456,0.765101,0.761745
ents_per_type.LOC.f,0.812057,0.820423,0.809947,0.809269,0.812057,0.809947,0.809269
ents_per_type.LOC.support,322,322,322,322,322,322,322
ents_per_type.ORG.p,0.621749,0.541485,0.614634,0.557303,0.621749,0.614634,0.557303


In [6]:
# Persist the transposed results table (one row per metric, one column per pipeline).
res.transpose().to_csv("../data/AAIL_results_nlp_trf_2.csv")

In [7]:
# Pipelines whose predictions are rendered for each example, in display order.
labelled_pipelines = (
    ("NER", nlp_lg),
    ("best thesaurus", nlp_thes_aft),
    ("best thesaurus w/ date patterns", nlp_dates_bef_thes_aft),
)

# Show the first 30 examples: model predictions first, then the manual
# annotations ("GT") for comparison.
for text, annotations in data[:30]:
    for label, pipeline in labelled_pipelines:
        print(label)
        display_ner_annotations(text, pipeline)
    print("GT")
    display_manual_annotations(text, annotations)
    # Two separator lines between consecutive examples.
    for _ in range(2):
        print("-----")

NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
