In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter, MapEntityTypes, DateMatcher
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import display_manual_annotations, display_ner_annotations
from hc_nlp import constants

import pandas as pd
import spacy

import time

In [2]:
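# One-off setup: uncomment and run once to download the transformer model loaded below
# into the environment of the running kernel, then comment it out again.
# !{sys.executable} -m spacy download en_core_web_trf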

In [3]:
# LOAD MODELS

model_name = "en_core_web_trf"

# Thesaurus of unambiguous people & organisation labels; shared by every
# ThesaurusMatcher variant below so the path is only written once.
THESAURUS_PATH = "../data/labels_all_unambiguous_types_people_orgs.jsonl"


def build_pipeline(date_rules=False, thesaurus_position=None, overwrite_ents=False):
    """Load a fresh copy of `model_name` and attach the requested custom components.

    Each call loads its own model, so the returned pipelines are independent
    of each other.

    Args:
        date_rules: if True, add "DateMatcher" immediately before the 'ner' pipe.
        thesaurus_position: None for no thesaurus, or "before" / "after" to add
            "ThesaurusMatcher" on that side of the 'ner' pipe. Whenever the
            thesaurus is added, an "EntityFilter" is also appended at the end.
        overwrite_ents: forwarded to the ThesaurusMatcher config as
            `overwrite_ents`; only used when `thesaurus_position` is set.

    Returns:
        The spaCy pipeline, with "MapEntityTypes" (Spacy -> HC type mapping)
        appended as the final component.

    Raises:
        ValueError: if `thesaurus_position` is not None, "before" or "after".
    """
    if thesaurus_position not in (None, "before", "after"):
        raise ValueError(
            f"thesaurus_position must be None, 'before' or 'after', got {thesaurus_position!r}"
        )

    nlp = spacy.load(model_name)

    if date_rules:
        nlp.add_pipe("DateMatcher", before="ner")

    if thesaurus_position is not None:
        nlp.add_pipe(
            "ThesaurusMatcher",
            config={
                "case_sensitive": False,
                "overwrite_ents": overwrite_ents,
                "thesaurus_path": THESAURUS_PATH,
            },
            # expands to add_pipe(..., before="ner") or add_pipe(..., after="ner")
            **{thesaurus_position: "ner"},
        )
        # NOTE(review): presumably "ent_labels_ignore" exempts DATE entities from
        # filtering - confirm against hc_nlp.pipeline.EntityFilter.
        nlp.add_pipe("EntityFilter", last=True, config={"ent_labels_ignore": ["DATE"]})

    # add mapping from Spacy to HC types (add_pipe appends at the end by default)
    nlp.add_pipe("MapEntityTypes")
    return nlp


# pure NER
nlp_lg = build_pipeline()

# model with people & orgs thesaurus annotations before model
nlp_thes = build_pipeline(thesaurus_position="before")

# model with people & orgs thesaurus annotations after model
nlp_thes_aft = build_pipeline(thesaurus_position="after")

# model with people & orgs thesaurus annotations after model, overwriting NER annotations
nlp_thes_aft_ow = build_pipeline(thesaurus_position="after", overwrite_ents=True)

# model with rules for dates
nlp_dates_bef = build_pipeline(date_rules=True)

# model with rules for dates before & thesaurus after
nlp_dates_bef_thes_aft = build_pipeline(date_rules=True, thesaurus_position="after")

# model with rules for dates before & thesaurus after, with overwrite
nlp_dates_bef_thes_aft_ow = build_pipeline(
    date_rules=True, thesaurus_position="after", overwrite_ents=True
)

# Show the component order of every pipeline as the cell output.
(
    nlp_lg.pipe_names,
    nlp_thes.pipe_names,
    nlp_thes_aft.pipe_names,
    nlp_thes_aft_ow.pipe_names,
    nlp_dates_bef.pipe_names,
    nlp_dates_bef_thes_aft.pipe_names,
    nlp_dates_bef_thes_aft_ow.pipe_names,
)


2021-01-06 15:38:39,614 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-06 15:38:43,300 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2021-01-06 15:38:47,786 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-06 15:38:51,306 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2021-01-06 15:38:55,203 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-06 15:38:58,853 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2021-01-06 15:39:07,586 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-06 15:39:11,365 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2021-01-06 15:39:15,111 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-06 15:39:18,493 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


(['transformer',
  'tagger',
  'parser',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ThesaurusMatcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTy

In [4]:
# Load the annotated evaluation examples. Each item is consumed later in this
# notebook as a (text, annotations) pair.
# NOTE(review): the first argument looks like the timestamp name of a Label Studio
# export, and nlp_lg is presumably used to tokenise/align the annotations -
# confirm against hc_nlp.io.
data = load_text_and_annotations_from_labelstudio("2020-12-10-12-43-04", nlp_lg)

# Number of loaded examples (displayed as the cell output).
len(data)

526

In [5]:
def generate_results_table(pipelines: dict, examples: list):
    """Evaluate several NER pipelines on the same examples and tabulate the scores.

    Each pipeline is scored with `test_ner`; the (possibly nested) result is
    flattened with `pd.json_normalize`, so nested keys become dotted column
    names such as ``ents_per_type.LOC.p``.

    Args:
        pipelines: mapping of display name -> pipeline object handed to `test_ner`.
        examples: annotated examples, forwarded unchanged to `test_ner`.

    Returns:
        A tuple ``(results, times)`` where ``results`` is a DataFrame indexed by
        ``pipeline`` (one row per entry of `pipelines`, in iteration order) and
        ``times`` maps each pipeline name to its evaluation time in seconds.
        An empty `pipelines` mapping yields an empty table and an empty dict.

    NOTE(review): the recorded run printed "Failed: ... [E103]" for some
    examples; those messages appear to be reported by `test_ner` rather than
    raised, so failed examples may be missing from the scores - confirm.
    """
    frames = []
    times = dict()
    for name, p in pipelines.items():
        # perf_counter is monotonic, so the interval cannot be skewed by clock changes
        start = time.perf_counter()
        temp_res = pd.json_normalize(test_ner(p, examples=examples))
        elapsed = time.perf_counter() - start

        frames.append(temp_res.assign(pipeline=name))
        times[name] = elapsed
        print(f"{name}: {int(elapsed)}s")

    if not frames:
        # pd.concat rejects an empty list; return a table of the same shape instead
        return pd.DataFrame(index=pd.Index([], name='pipeline')), times

    # DataFrame.append was removed in pandas 2.0; a single concat is also linear,
    # where appending inside the loop copied the accumulated frame every time.
    results = pd.concat(frames).set_index('pipeline')

    return results, times

# Score every pipeline variant on the full annotated set. The keyword names
# below become the 'pipeline' index labels of the results table.
res, times = generate_results_table(
    pipelines=dict(
        NER=nlp_lg,
        NER_thes_bef=nlp_thes,
        NER_thes_aft=nlp_thes_aft,
        NER_thes_aft_ow=nlp_thes_aft_ow,
        NER_datematcher_bef=nlp_dates_bef,
        NER_datematcher_bef_thes_aft=nlp_dates_bef_thes_aft,
        NER_datematcher_bef_thes_aft_ow=nlp_dates_bef_thes_aft_ow,
        # disabled: nlp_dates_aft is not built in the cells shown
        # NER_datematcher_aft=nlp_dates_aft,
    ),
    examples=data,
)

# Transposed so that metrics are rows and pipelines are columns.
res.transpose()
        

Failed:  Poster, London & North Eastern Railway, Bamburgh by Tom Purvis, 1936. Coloured lithograph depicting a stylised view of the coast with Bamburgh castle, the beach and village. Printed by Chorley & Pickersgill Ltd, Lithographers, Leeds. Format: double royal. Dimensions: 40 x 25 inches, 1016 x 635mm.
[E103] Trying to set conflicting doc.ents: '(40, 51, 'NORP')' and '(40, 48, 'LOC')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Glass bottle containing unknown grey, metallic-looking powder. Part of Statham's student chemical laboratory.
[E103] Trying to set conflicting doc.ents: '(71, 78, 'PERSON')' and '(71, 78, 'ORG')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Bone gouge probably by McQueen of Newcastle, England, late 19th early 20th century, nickel plated steel, handle probably brass, nickel plated
[E103] Trying to set conflicting doc.ents: '(23, 43, 'ORG')

pipeline,NER,NER_thes_bef,NER_thes_aft,NER_thes_aft_ow,NER_datematcher_bef,NER_datematcher_bef_thes_aft,NER_datematcher_bef_thes_aft_ow
ents_p,0.590022,0.62069,0.639335,0.625297,0.590022,0.639335,0.625297
ents_r,0.731675,0.71419,0.723605,0.708137,0.731675,0.723605,0.708137
ents_f,0.653257,0.664165,0.678864,0.664144,0.653257,0.678864,0.664144
support,1549,1549,1549,1549,1549,1549,1549
labels_missing_from_annotations,"[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ...","[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ...","[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ...","[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ...","[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ...","[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ...","[QUANTITY, ORDINAL, TIME, LANGUAGE, CARDINAL, ..."
ents_per_type.LOC.p,0.856115,0.858156,0.855596,0.858182,0.856115,0.855596,0.858182
ents_per_type.LOC.r,0.765273,0.778135,0.762058,0.758842,0.765273,0.762058,0.758842
ents_per_type.LOC.f,0.808149,0.816189,0.806122,0.805461,0.808149,0.806122,0.805461
ents_per_type.LOC.support,322,322,322,322,322,322,322
ents_per_type.ORG.p,0.62645,0.544872,0.619617,0.56044,0.62645,0.619617,0.56044


In [6]:
# Persist the metrics-by-pipeline table (same orientation as displayed above).
res.transpose().to_csv("../data/AAIL_results_nlp_trf_2.csv")

In [7]:
# Qualitative check on the first 30 examples: render the predictions of three
# pipelines, then the manual ground-truth annotations, for each text.
for text, annotations in data[:30]:
    for heading, model in (
        ("NER", nlp_lg),
        ("best thesaurus", nlp_thes_aft),
        ("best thesaurus w/ date patterns", nlp_dates_bef_thes_aft),
    ):
        print(heading)
        display_ner_annotations(text, model)
    print("GT")
    display_manual_annotations(text, annotations)
    # two separator lines between examples
    print("-----", "-----", sep="\n")

NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
