In [3]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter, MapEntityTypes, DateMatcher
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import display_manual_annotations, display_ner_annotations
from hc_nlp import constants

import pandas as pd
import spacy

In [6]:
!{sys.executable} -m spacy download en_core_web_trf

[38;5;3m⚠ Skipping pipeline package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the package dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the pipeline package has
other dependencies, you'll have to install them manually.[0m
Collecting en_core_web_trf==3.0.0a0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0a0/en_core_web_trf-3.0.0a0.tar.gz (459.7 MB)
[K     |████████████████████████████████| 459.7 MB 16.4 MB/s eta 0:00:013 |                                | 604 kB 609 kB/s eta 0:12:33     |█████████████████████▍          | 307.4 MB 10.6 MB/s eta 0:00:15
[?25hBuilding wheels for collected packages: en-core-web-trf
  Building wheel for en-core-web-trf (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-trf: filename=en_core_web_trf-3.0.0a0-py3-none-any.whl size=459703091 sha256

In [4]:
# LOAD MODELS
#
# Seven variants of the same base spaCy model are built so that their NER
# output can be compared: the plain model, and the model combined with the
# custom hc_nlp components (ThesaurusMatcher, EntityFilter, DateMatcher).
# Every variant ends with MapEntityTypes.

model_name = "en_core_web_trf"

# People & organisations thesaurus used by every ThesaurusMatcher below.
thesaurus_path = "../data/labels_all_unambiguous_types_people_orgs.jsonl"


def build_pipeline(date_matcher=False, thesaurus_position=None, overwrite_ents=False):
    """Load a fresh copy of `model_name` and add the requested custom components.

    Components are added in this order, matching the hand-written construction
    this helper replaces:

    1. "DateMatcher" before 'ner' (only if `date_matcher` is true);
    2. "ThesaurusMatcher" before/after 'ner', followed by "EntityFilter" as the
       last component (only if `thesaurus_position` is given);
    3. "MapEntityTypes" (always, with `validate_mapping` disabled).

    Args:
        date_matcher: add the "DateMatcher" component before 'ner'.
        thesaurus_position: None for no thesaurus, or "before" / "after" to
            place "ThesaurusMatcher" relative to 'ner'.
        overwrite_ents: forwarded to the "ThesaurusMatcher" config; only used
            when `thesaurus_position` is given.

    Returns:
        The configured spaCy pipeline.

    Raises:
        ValueError: if `thesaurus_position` is not None, "before" or "after".
    """
    if thesaurus_position not in (None, "before", "after"):
        raise ValueError(
            f"thesaurus_position must be None, 'before' or 'after', got {thesaurus_position!r}"
        )

    nlp = spacy.load(model_name)

    if date_matcher:
        nlp.add_pipe("DateMatcher", before="ner")

    if thesaurus_position is not None:
        nlp.add_pipe(
            "ThesaurusMatcher",
            config={
                "case_sensitive": False,
                "overwrite_ents": overwrite_ents,
                "thesaurus_path": thesaurus_path,
            },
            # Expands to before="ner" or after="ner".
            **{thesaurus_position: "ner"},
        )
        nlp.add_pipe("EntityFilter", last=True)

    # add mapping from Spacy to HC types to all pipelines
    nlp.add_pipe("MapEntityTypes", config={"validate_mapping": False})

    return nlp


# pure NER (the `_lg` suffix is historical: the model loaded is `model_name`)
nlp_lg = build_pipeline()

# model with people & orgs thesaurus annotations before model
nlp_thes = build_pipeline(thesaurus_position="before")

# model with people & orgs thesaurus annotations after model
nlp_thes_aft = build_pipeline(thesaurus_position="after")

# model with people & orgs thesaurus annotations after model, overwriting NER annotations
nlp_thes_aft_ow = build_pipeline(thesaurus_position="after", overwrite_ents=True)

# model with rules for dates
nlp_dates_bef = build_pipeline(date_matcher=True)

# model with rules for dates before & thesaurus after
nlp_dates_bef_thes_aft = build_pipeline(date_matcher=True, thesaurus_position="after")

# model with rules for dates before & thesaurus after, with overwrite
nlp_dates_bef_thes_aft_ow = build_pipeline(
    date_matcher=True, thesaurus_position="after", overwrite_ents=True
)

nlp_lg.pipe_names, nlp_thes.pipe_names, nlp_thes_aft.pipe_names, nlp_thes_aft_ow.pipe_names, nlp_dates_bef.pipe_names, nlp_dates_bef_thes_aft.pipe_names, nlp_dates_bef_thes_aft_ow.pipe_names


2020-12-04 10:17:04,551 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-04 10:17:07,980 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2020-12-04 10:17:12,601 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-04 10:17:16,988 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 4s


17016 term thesaurus imported in 4s


2020-12-04 10:17:20,993 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-04 10:17:24,563 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2020-12-04 10:17:32,832 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-04 10:17:36,671 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


2020-12-04 10:17:40,782 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2020-12-04 10:17:44,402 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


17016 term thesaurus imported in 3s


(['transformer',
  'tagger',
  'parser',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ThesaurusMatcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'attribute_ruler',
  'lemmatizer',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTypes'],
 ['transformer',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter',
  'MapEntityTy

In [5]:
# First argument handed to the Label Studio loader.
# NOTE(review): presumably the timestamped name of the export — confirm in hc_nlp.io.
labelstudio_export = "2020-11-25-17-28-58"

# Later cells iterate over `data` as (text, annotations) pairs.
data = load_text_and_annotations_from_labelstudio(labelstudio_export, nlp_lg)

len(data)

500

In [6]:
def map_annotations_spacy_to_hc(data):
    """Relabel annotations via ``constants.SPACY_TO_HC_ENTITY_MAPPING``.

    Args:
        data: iterable of ``(text, annotations)`` pairs. Each annotation must be
            indexable with at least three elements; the third is the label.

    Returns:
        A new list of ``(text, annotations)`` pairs in which every annotation is
        a 3-tuple: the first two elements are copied unchanged and the label is
        replaced by its mapped value, or kept as-is when it has no mapping.
        The input is not modified.
    """
    def relabel(annotation):
        # Labels without an entry in the mapping pass through untouched.
        label = annotation[2]
        return (
            annotation[0],
            annotation[1],
            constants.SPACY_TO_HC_ENTITY_MAPPING.get(label, label),
        )

    return [
        (text, [relabel(annotation) for annotation in annotations])
        for text, annotations in data
    ]
        
# Relabel the annotations so their labels go through the spaCy -> HC mapping.
# NOTE(review): this rebinds `data`, so re-running the cell passes already-mapped
# labels through the mapping again — presumably harmless if no HC label is also a
# key of the mapping; confirm.
data = map_annotations_spacy_to_hc(data)

In [8]:
def generate_results_table(pipelines: dict, examples: list):
    """Evaluate several pipelines with `test_ner` and collect the scores.

    Args:
        pipelines: mapping of display name -> pipeline object, each passed to
            `test_ner` as its first argument.
        examples: examples forwarded to `test_ner` via its `examples` keyword.

    Returns:
        A DataFrame indexed by 'pipeline' (the keys of `pipelines`, in order),
        with one row per pipeline and one column per flattened key of the
        `test_ner` result (nested keys are joined by `pd.json_normalize`).
        If `pipelines` is empty, an empty DataFrame with an empty 'pipeline'
        index is returned.
    """
    # Build all per-pipeline frames first and concatenate once:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside a
    # loop copies it on every iteration.
    per_pipeline = [
        pd.json_normalize(test_ner(nlp, examples=examples)).assign(pipeline=name)
        for name, nlp in pipelines.items()
    ]

    if not per_pipeline:
        # pd.concat raises on an empty list; return an empty, correctly indexed table.
        return pd.DataFrame(index=pd.Index([], name="pipeline"))

    return pd.concat(per_pipeline).set_index("pipeline")

# Pipelines to score, keyed by the name shown in the results table.
# A 'NER_datematcher_aft' variant (`nlp_dates_aft`) is not included.
pipelines_to_compare = dict(
    [
        ('NER', nlp_lg),
        ('NER_thes_bef', nlp_thes),
        ('NER_thes_aft', nlp_thes_aft),
        ('NER_thes_aft_ow', nlp_thes_aft_ow),
        ('NER_datematcher_bef', nlp_dates_bef),
        ('NER_datematcher_bef_thes_aft', nlp_dates_bef_thes_aft),
        ('NER_datematcher_bef_thes_aft_ow', nlp_dates_bef_thes_aft_ow),
    ]
)

res = generate_results_table(pipelines_to_compare, data)

# Transposed for display: one column per pipeline, one row per metric.
res.T
        

Failed:  Gold plated BBC Micro computer with two 1-megabyte disc drives and keyboard, made by Acorn Computers Limited, Cambridge, England,1985.  From 'The Micro User' competition.
[(12, 30, 'OBJECT'), (85, 108, 'ORG'), (110, 119, 'LOC'), (121, 133, 'LOC'), (133, 133, 'DATE')]
Failed:  Gold plated BBC Micro computer with two 1-megabyte disc drives and keyboard, made by Acorn Computers Limited, Cambridge, England,1985.  From 'The Micro User' competition.
[(12, 30, 'OBJECT'), (85, 108, 'ORG'), (110, 119, 'LOC'), (121, 133, 'LOC'), (133, 133, 'DATE')]
Failed:  Gold plated BBC Micro computer with two 1-megabyte disc drives and keyboard, made by Acorn Computers Limited, Cambridge, England,1985.  From 'The Micro User' competition.
[(12, 30, 'OBJECT'), (85, 108, 'ORG'), (110, 119, 'LOC'), (121, 133, 'LOC'), (133, 133, 'DATE')]
Failed:  Gold plated BBC Micro computer with two 1-megabyte disc drives and keyboard, made by Acorn Computers Limited, Cambridge, England,1985.  From 'The Micro User' co

pipeline,NER,NER_thes_bef,NER_thes_aft,NER_thes_aft_ow,NER_datematcher_bef,NER_datematcher_bef_thes_aft,NER_datematcher_bef_thes_aft_ow
ents_p,0.485795,0.509706,0.527191,0.516336,0.483191,0.527494,0.516645
ents_r,0.673759,0.64145,0.64933,0.635146,0.668243,0.650118,0.635934
ents_f,0.564543,0.568039,0.581921,0.569611,0.560847,0.582421,0.570117
support,1274,1274,1274,1274,1274,1274,1274
labels_missing_from_annotations,"[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ...","[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ...","[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ...","[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ...","[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ...","[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ...","[QUANTITY, NORP, EVENT, PERCENT, MONEY, TIME, ..."
ents_per_type.ORG.p,0.491228,0.418182,0.485788,0.434679,0.491228,0.485788,0.434679
ents_per_type.ORG.r,0.662162,0.621622,0.635135,0.618243,0.662162,0.635135,0.618243
ents_per_type.ORG.f,0.564029,0.5,0.550512,0.51046,0.564029,0.550512,0.51046
ents_per_type.ORG.support,297,297,297,297,297,297,297
ents_per_type.LOC.p,0.828897,0.834586,0.831418,0.833977,0.828897,0.831418,0.833977


In [7]:
# Qualitative check on the first 30 examples: for each one, show the output of
# three pipelines followed by the manual (ground-truth) annotations.
comparison_pipelines = (
    ("NER", nlp_lg),
    ("best thesaurus", nlp_thes_aft),
    ("best thesaurus w/ date patterns", nlp_dates_bef_thes_aft),
)
separator = "-----"

for text, annotations in data[:30]:
    for heading, nlp_model in comparison_pipelines:
        print(heading)
        display_ner_annotations(text, nlp_model)
    print("GT")
    display_manual_annotations(text, annotations)
    # Two separator lines between examples.
    print(separator)
    print(separator)

NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
NER


best thesaurus


best thesaurus w/ date patterns


GT


-----
-----
