In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter, MapEntityTypes, DateMatcher
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import display_manual_annotations, display_ner_annotations
from hc_nlp import constants

import pprint
pp = pprint.PrettyPrinter(indent=2)

import pandas as pd
import spacy

In [64]:
# LOAD MODELS

SPACY_MODEL = "en_core_web_lg"
THESAURUS_PATH = "../data/labels_all_unambiguous_types_people_orgs.jsonl"

# Keyword arguments for `add_pipe` describing where a component is inserted.
BEFORE_NER = {"before": "ner"}
AFTER_NER = {"after": "ner"}
AT_END = {"last": True}


def _load_pipeline(*placed_components):
    """Load a fresh copy of the base model and insert custom components.

    Each argument is a ``(component, placement)`` pair; ``placement`` is a dict of
    keyword arguments forwarded to ``add_pipe``. Components are added in the
    order in which they are given.
    """
    pipeline = spacy.load(SPACY_MODEL)
    for component, placement in placed_components:
        pipeline.add_pipe(component, **placement)
    return pipeline


# pure NER
nlp_lg = spacy.load(SPACY_MODEL)

# custom components (the same component instances are shared by several pipelines)
thes = ThesaurusMatcher(nlp_lg, thesaurus_path=THESAURUS_PATH, case_sensitive=False)
thes_ow = ThesaurusMatcher(
    nlp_lg, thesaurus_path=THESAURUS_PATH, case_sensitive=False, overwrite_ents=True
)
entityfilter = EntityFilter()

# people & orgs thesaurus annotations before the NER model
nlp_thes = _load_pipeline((thes, BEFORE_NER), (entityfilter, AT_END))

# people & orgs thesaurus annotations after the NER model
nlp_thes_aft = _load_pipeline((thes, AFTER_NER), (entityfilter, AT_END))

# people & orgs thesaurus annotations after the NER model, overwriting NER annotations
nlp_thes_aft_ow = _load_pipeline((thes_ow, AFTER_NER), (entityfilter, AT_END))

# rules for dates before the NER model
datematcher = DateMatcher(nlp_lg)
nlp_dates_bef = _load_pipeline((datematcher, BEFORE_NER))

# rules for dates before & thesaurus after
nlp_dates_bef_thes_aft = _load_pipeline(
    (datematcher, BEFORE_NER), (thes, AFTER_NER), (entityfilter, AT_END)
)

# rules for dates before & thesaurus after, with overwrite
nlp_dates_bef_thes_aft_ow = _load_pipeline(
    (datematcher, BEFORE_NER), (thes_ow, AFTER_NER), (entityfilter, AT_END)
)

# every pipeline ends with the mapping from Spacy to HC types
mapentitytypes = MapEntityTypes(nlp_lg, validate_mapping=False)
all_pipelines = (
    nlp_lg,
    nlp_thes,
    nlp_thes_aft,
    nlp_thes_aft_ow,
    nlp_dates_bef,
    nlp_dates_bef_thes_aft,
    nlp_dates_bef_thes_aft_ow,
)
for pipeline in all_pipelines:
    pipeline.add_pipe(mapentitytypes)

# show the component order of each pipeline as a sanity check
tuple(pipeline.pipe_names for pipeline in all_pipelines)


2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-03 12:48:39,418 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unam

(['tagger', 'parser', 'ner', 'MapEntityTypes'],
 ['tagger',
  'parser',
  'ThesaurusMatcher',
  'ner',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger', 'parser', 'DateMatcher', 'ner', 'MapEntityTypes'],
 ['tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'])

In [4]:
# Load the labelled examples from the Label Studio export. Later cells unpack each
# element of `data` as a (text, annotations) pair; the loader's use of `nlp_lg` is
# not visible from this notebook.
data = load_text_and_annotations_from_labelstudio("../labelling/export/2020-11-25-17-28-58.zip", nlp_lg)

# Number of loaded examples.
len(data)

500

In [23]:
def map_annotations_spacy_to_hc(data, mapping=None):
    """Relabel annotations from spaCy entity types to HC entity types.

    Args:
        data: iterable of ``(text, annotations)`` pairs. Each annotation must be
            indexable, with the label at position 2; positions 0 and 1 are copied
            through unchanged.
        mapping: dict from source label to target label. Defaults to
            ``constants.SPACY_TO_HC_ENTITY_MAPPING`` when not given.

    Returns:
        A new list of ``(text, annotations)`` pairs in which every annotation is a
        3-tuple. Labels absent from ``mapping`` are kept as they are. The input
        is not modified.
    """
    if mapping is None:
        # Resolved at call time so the project default is only needed when used.
        mapping = constants.SPACY_TO_HC_ENTITY_MAPPING

    return [
        (
            text,
            [(item[0], item[1], mapping.get(item[2], item[2])) for item in annotations],
        )
        for text, annotations in data
    ]
        
# Rebind `data` so the annotation labels go through the spaCy -> HC mapping
# (labels without a mapping entry are left unchanged).
data = map_annotations_spacy_to_hc(data)

In [66]:
def generate_results_table(pipelines: dict, examples: list) -> pd.DataFrame:
    """Score several pipelines on the same examples and tabulate the results.

    Args:
        pipelines: dict mapping a display name to a pipeline object accepted by
            ``test_ner``.
        examples: the examples passed to ``test_ner`` for every pipeline.

    Returns:
        A DataFrame indexed by ``pipeline`` (the display name), with one row per
        pipeline and one column per flattened key of the ``test_ner`` result.
        If ``pipelines`` is empty, an empty DataFrame with that index name.
    """
    per_pipeline = []
    for name, p in pipelines.items():
        # json_normalize flattens nested result dicts into dotted column names.
        scores = pd.json_normalize(test_ner(p, examples=examples))
        scores['pipeline'] = name
        per_pipeline.append(scores)

    if not per_pipeline:
        # pd.concat raises on an empty list; return an explicit empty table instead.
        return pd.DataFrame(index=pd.Index([], name='pipeline'))

    # Concatenate once: DataFrame.append in a loop is quadratic and no longer
    # exists in pandas >= 2.0.
    return pd.concat(per_pipeline).set_index('pipeline')

# Pipelines under comparison, keyed by the name shown in the results table.
pipelines_to_compare = {
    'NER': nlp_lg,
    'NER_thes_bef': nlp_thes,
    'NER_thes_aft': nlp_thes_aft,
    'NER_thes_aft_ow': nlp_thes_aft_ow,
    'NER_datematcher_bef': nlp_dates_bef,
    'NER_datematcher_bef_thes_aft': nlp_dates_bef_thes_aft,
    'NER_datematcher_bef_thes_aft_ow': nlp_dates_bef_thes_aft_ow,
}

res = generate_results_table(pipelines_to_compare, data)

# Transposed for display: one column per pipeline, one row per metric.
res.T
        

pipeline,NER,NER_thes_bef,NER_thes_aft,NER_thes_aft_ow,NER_datematcher_bef,NER_datematcher_bef_thes_aft,NER_datematcher_bef_thes_aft_ow
ents_p,39.557,42.908,42.8401,43.2594,40.5858,44.3262,44.7478
ents_r,58.9159,56.795,56.4022,56.7164,60.9584,58.9159,59.2302
ents_f,47.3335,48.8844,48.6945,49.0823,48.7284,50.5902,50.9804
support,1274,1274,1274,1274,1274,1274,1274
labels_missing_from_annotations,"[NORP, MONEY, CARDINAL, TIME, PERCENT, FAC, LA...","[NORP, MONEY, CARDINAL, TIME, PERCENT, FAC, LA...","[NORP, MONEY, CARDINAL, TIME, PERCENT, FAC, LA...","[NORP, MONEY, CARDINAL, TIME, PERCENT, FAC, LA...","[NORP, MONEY, CARDINAL, TIME, FAC, LANGUAGE, L...","[NORP, MONEY, CARDINAL, TIME, FAC, LANGUAGE, E...","[NORP, MONEY, CARDINAL, TIME, FAC, LANGUAGE, E..."
ents_per_type.ORG.p,38.9908,37.0763,38.7409,37.797,38.9908,38.7409,37.797
ents_per_type.ORG.r,57.2391,58.9226,53.8721,58.9226,57.2391,53.8721,58.9226
ents_per_type.ORG.f,46.3847,45.5137,45.0704,46.0526,46.3847,45.0704,46.0526
ents_per_type.ORG.support,297,297,297,297,297,297,297
ents_per_type.LOC.p,69.967,73.6842,71.7687,73.6842,69.967,71.7687,73.6842


In [27]:
# Compare the thesaurus-before-NER and thesaurus-after-NER pipelines against the
# manual (ground truth) annotations on the first 50 examples.
# The original cell referenced `nlp_thes_po` / `nlp_thes_aft_po`, which are not
# defined anywhere in this notebook; the people & orgs thesaurus pipelines are
# `nlp_thes` and `nlp_thes_aft`.
pipelines_to_show = (
    ("thes people & orgs", nlp_thes),
    ("thes people & orgs (after)", nlp_thes_aft),
)

for text, annotations in data[0:50]:
    for heading, pipeline in pipelines_to_show:
        print(heading)
        display_ner_annotations(text, pipeline)
        # Entity text alongside its ent_id_ (empty string when no id is set).
        print([(ent.text, ent.ent_id_) for ent in pipeline(text).ents])
    print("GT")
    display_manual_annotations(text, annotations)
    print("-----")
    print("-----")

thes people & orgs


[('c.1930-1969', ''), ('Smith', 'https://collection.sciencemuseumgroup.org.uk/people/cp120932'), ('Britain', ''), ('569', '')]
thes people & orgs (after)


[('c.1930-1969', ''), ('Smith Alarm', ''), ('Britain', ''), ('569', '')]
GT


-----
-----
thes people & orgs


[('Italy', '')]
thes people & orgs (after)


[('Italy', '')]
GT


-----
-----
thes people & orgs


[('Doulton', ''), ('20 gallon', ''), ('1934', '')]
thes people & orgs (after)


[('Doulton', ''), ('20 gallon', ''), ('1934', '')]
GT


-----
-----
thes people & orgs


[('Engraver', ''), ('Chicago', '')]
thes people & orgs (after)


[('Engraver', ''), ('Chicago', '')]
GT


-----
-----
thes people & orgs


[('Keystone', ''), ('October 1938', '')]
thes people & orgs (after)


[('Keystone', ''), ('October 1938', '')]
GT


-----
-----
thes people & orgs


[('Two', ''), ("Space Monkeys' Warehouse Party", '')]
thes people & orgs (after)


[('Two', ''), ("Space Monkeys' Warehouse Party", '')]
GT


-----
-----
thes people & orgs


[('Great Central Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp3885')]
thes people & orgs (after)


[('Great Central Railway', '')]
GT


-----
-----
thes people & orgs


[('British Railways', 'https://collection.sciencemuseumgroup.org.uk/people/cp3862'), ('Special Train Service', '')]
thes people & orgs (after)


[('British Railways', ''), ('Special Train Service', '')]
GT


-----
-----
thes people & orgs


[('33mm', ''), ('London & North Western Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp1589'), ('Richards', '')]
thes people & orgs (after)


[('33mm', ''), ('London & North Western Railway', ''), ('Richards', '')]
GT


-----
-----
thes people & orgs


[('Roman', ''), ('Sforza', '')]
thes people & orgs (after)


[('Roman', ''), ('Sforza', '')]
GT


-----
-----
thes people & orgs


[('John Pollock & Co.', '')]
thes people & orgs (after)


[('John Pollock & Co.', '')]
GT


-----
-----
thes people & orgs


[('Moroccan', ''), ('1890-1925', '')]
thes people & orgs (after)


[('Moroccan', ''), ('1890-1925', '')]
GT


-----
-----
thes people & orgs


[('Six', ''), ('C.V. Boys', ''), ('175 mm', '')]
thes people & orgs (after)


[('Six', ''), ('C.V. Boys', ''), ('175 mm', '')]
GT


-----
-----
thes people & orgs


[('Baillie', ''), ('Clockmakers of the World', ''), ('366 \n  \n ', '')]
thes people & orgs (after)


[('Baillie', ''), ('Clockmakers of the World', '')]
GT


-----
-----
thes people & orgs


[('Dudson Brothers', ''), ('Great North Eastern Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp17759'), ('39 mm', '')]
thes people & orgs (after)


[('Dudson Brothers', ''), ('Great North Eastern Railway', ''), ('39 mm', '')]
GT


-----
-----
thes people & orgs


[('National Union of Railwaymen', 'https://collection.sciencemuseumgroup.org.uk/people/cp7884')]
thes people & orgs (after)


[('National Union of Railwaymen', '')]
GT


-----
-----
thes people & orgs


[('Nash Contra logoscope', ''), ('Firmin Nash', 'https://collection.sciencemuseumgroup.org.uk/people/cp51965'), ('Medical Data Systems', ''), ('Nottingham', ''), ('England', ''), ('1974-1975', '')]
thes people & orgs (after)


[('Nash Contra logoscope', ''), ('Dr Firmin Nash', ''), ('Medical Data Systems', ''), ('Nottingham', ''), ('England', ''), ('1974-1975', '')]
GT


-----
-----
thes people & orgs


[('Ferguson', 'https://collection.sciencemuseumgroup.org.uk/people/cp126966'), ('London', ''), ('1822-1869', '')]
thes people & orgs (after)


[('Ferguson', ''), ('London', ''), ('1822-1869', '')]
GT


-----
-----
thes people & orgs


[('One', 'https://collection.sciencemuseumgroup.org.uk/people/cp113279'), ('Pearlware', ''), ('Graham', ''), ('Conduit House', ''), ('Pentonville', ''), ('London', '')]
thes people & orgs (after)


[('One', ''), ('Pearlware', ''), ('Graham', ''), ('Conduit House', ''), ('Pentonville', ''), ('London', '')]
GT


-----
-----
thes people & orgs


[('English', ''), ('Dutch', ''), ('1700-1850', '')]
thes people & orgs (after)


[('English', ''), ('Dutch', ''), ('1700-1850', '')]
GT


-----
-----
thes people & orgs


[('Petzval', ''), ('Front Lens', ''), ('7 inches 6-0.5', ''), ('Gundlach-Manhatten Optical Co.', '')]
thes people & orgs (after)


[('Petzval', ''), ('Front Lens', ''), ('7 inches 6-0.5', ''), ('Gundlach-Manhatten Optical Co.', '')]
GT


-----
-----
thes people & orgs


[('Cecil Beaton', 'https://collection.sciencemuseumgroup.org.uk/people/cp163137'), ('Rolling Stones', ''), ('Mick Jagger', '')]
thes people & orgs (after)


[('Cecil Beaton', ''), ('Rolling Stones', ''), ('Mick Jagger', '')]
GT


-----
-----
thes people & orgs


[('Taylor', ''), ('Taylor Hobson', '')]
thes people & orgs (after)


[('Taylor', ''), ('Taylor Hobson', '')]
GT


-----
-----
thes people & orgs


[('the Coaching Arrangements Book', ''), ('Railway Clearing House', 'https://collection.sciencemuseumgroup.org.uk/people/cp4558'), ('October 1921', '')]
thes people & orgs (after)


[('the Coaching Arrangements Book', ''), ('the Railway Clearing House', ''), ('October 1921', '')]
GT


-----
-----
thes people & orgs


[('Stephenson', ''), ('English', '')]
thes people & orgs (after)


[('Stephenson', ''), ('English', '')]
GT


-----
-----
thes people & orgs


[('London & North Eastern Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp1796'), ('Bamburgh', ''), ('Tom Purvis', 'https://collection.sciencemuseumgroup.org.uk/people/cp2312'), ('1936', ''), ('Coloured lithograph', ''), ('Bamburgh', ''), ('Chorley & Pickersgill Ltd', ''), ('Lithographers', ''), ('Leeds', ''), ('40', ''), ('1016', '')]
thes people & orgs (after)


[('London & North Eastern Railway', ''), ('Bamburgh', ''), ('Tom Purvis', ''), ('1936', ''), ('Coloured lithograph', ''), ('Bamburgh', ''), ('Chorley & Pickersgill Ltd', ''), ('Lithographers', ''), ('Leeds', ''), ('40', ''), ('1016', '')]
GT


-----
-----
thes people & orgs


[('Kodak', 'https://collection.sciencemuseumgroup.org.uk/people/cp3742'), ('Mr J Herbert', ''), ('1888', ''), ('11', ''), ('London', '')]
thes people & orgs (after)


[('Kodak', ''), ('Mr J Herbert', ''), ('1888', ''), ('11', ''), ('London', '')]
GT


-----
-----
thes people & orgs




[]
thes people & orgs (after)


[]
GT


-----
-----
thes people & orgs


[('Train Alterations', ''), ('April 15th, 1912', ''), ('South Eastern & Chatham Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp1689')]
thes people & orgs (after)


[('Train Alterations', ''), ('April 15th, 1912', ''), ('South Eastern & Chatham Railway', '')]
GT


-----
-----
thes people & orgs


[('Merck', ''), ('Sharp', 'https://collection.sciencemuseumgroup.org.uk/people/cp137172'), ('Dohme Limited', ''), ('Hertfordshire', ''), ('England', ''), ('1992-2000', ''), ('Royal Free Hospital', 'https://collection.sciencemuseumgroup.org.uk/people/cp2490'), ('London', '')]
thes people & orgs (after)


[('Merck Sharp &', ''), ('Hertfordshire', ''), ('England', ''), ('1992-2000', ''), ('Royal Free Hospital', ''), ('London', '')]
GT


-----
-----
thes people & orgs


[('British', '')]
thes people & orgs (after)


[('British', '')]
GT


-----
-----
thes people & orgs


[('Benedictine Order', ''), ('French', ''), ('1860-1920', '')]
thes people & orgs (after)


[('Benedictine Order', ''), ('French', ''), ('1860-1920', '')]
GT


-----
-----
thes people & orgs


[('1940', ''), ('Crompton Parkinson', ''), ('618698', '')]
thes people & orgs (after)


[('1940', ''), ('Crompton Parkinson', ''), ('618698', '')]
GT


-----
-----
thes people & orgs


[('Bronze medal', ''), ('S.M. Bouret', ''), ('France', ''), ('1747', '')]
thes people & orgs (after)


[('Bronze medal', ''), ('S.M. Bouret', ''), ('France', ''), ('1747', '')]
GT


-----
-----
thes people & orgs


[('French', ''), ('17th Century', '')]
thes people & orgs (after)


[('French', ''), ('17th Century', '')]
GT


-----
-----
thes people & orgs


[('Joshanda', ''), ('Unani Tibb', ''), ('Hamdard Laboratories', ''), ('Pakistan', ''), ('2005', '')]
thes people & orgs (after)


[('Joshanda', ''), ('Unani Tibb', ''), ('Hamdard Laboratories', ''), ('Pakistan', ''), ('2005', '')]
GT


-----
-----
thes people & orgs


[('British Railways', 'https://collection.sciencemuseumgroup.org.uk/people/cp3862'), ('Banana van', ''), ('Hornby', ''), ('England', '')]
thes people & orgs (after)


[('British Railways', ''), ('Banana van', ''), ('Hornby', ''), ('England', '')]
GT


-----
-----
thes people & orgs


[('British Railways, Eastern Region', 'https://collection.sciencemuseumgroup.org.uk/people/cp32472'), ('1974', '')]
thes people & orgs (after)


[('British Railways', ''), ('Eastern Region', ''), ('1974', '')]
GT


-----
-----
thes people & orgs


[('North America', ''), ('English', ''), ('1851-1920', '')]
thes people & orgs (after)


[('North America', ''), ('English', ''), ('1851-1920', '')]
GT


-----
-----
thes people & orgs


[('Barrie Trinder', ''), ('Reynolds', ''), ('Oxford Dictionary of National Biography,', ''), ('Oxford University Press', 'https://collection.sciencemuseumgroup.org.uk/people/cp6581'), ('2004', ''), ('Richard Reynolds', 'https://collection.sciencemuseumgroup.org.uk/people/cp51227'), ('1735-1816', ''), ('British', ''), ('English', '')]
thes people & orgs (after)


[('Barrie Trinder', ''), ('Reynolds', ''), ('Oxford Dictionary of National Biography, Oxford University Press', ''), ('2004', ''), ('Richard Reynolds', ''), ('1735-1816', ''), ('British', ''), ('English', '')]
GT


-----
-----
thes people & orgs


[('Eighty five', ''), ('1970s-1990s', ''), ('Collett Dickenson Pearce', '')]
thes people & orgs (after)


[('Eighty five', ''), ('1970s-1990s', ''), ('Collett Dickenson Pearce', '')]
GT


-----
-----
thes people & orgs


[('Howard Grubb', 'https://collection.sciencemuseumgroup.org.uk/people/cp27950'), ('Dublin', ''), ('Norman Lockyer', ''), ('Solar Physics Observatory, South Kensington', 'https://collection.sciencemuseumgroup.org.uk/people/cp20581')]
thes people & orgs (after)


[('Howard Grubb', ''), ('Dublin', ''), ('Norman Lockyer', ''), ('the Solar Physics Observatory', ''), ('South Kensington', '')]
GT


-----
-----
thes people & orgs


[('40 degrees', '')]
thes people & orgs (after)


[('40 degrees', '')]
GT


-----
-----
thes people & orgs


[]
thes people & orgs (after)


[]
GT


-----
-----
thes people & orgs


[('New Hebrides', ''), ('1851-1900', '')]
thes people & orgs (after)


[('New Hebrides', ''), ('1851-1900', '')]
GT


-----
-----
thes people & orgs


[('1-12', ''), ('Regd', ''), ('443714', '')]
thes people & orgs (after)


[('1-12', ''), ('Regd', ''), ('443714', '')]
GT


-----
-----
thes people & orgs


[('John Pollock & Co.', '')]
thes people & orgs (after)


[('John Pollock & Co.', '')]
GT


-----
-----
thes people & orgs


[('Association of French First Aid', ''), ('Lancelot', ''), ('Bescher', ''), ('French', ''), ('1901', '')]
thes people & orgs (after)


[('Association of French First Aid', ''), ('Lancelot', ''), ('Bescher', ''), ('French', ''), ('1901', '')]
GT


-----
-----
thes people & orgs


[('British', ''), ('Wembley', '')]
thes people & orgs (after)


[('British', ''), ('Wembley', '')]
GT


-----
-----
thes people & orgs


[('Brass X-ray tube', ''), ('Debye-Scherrer', ''), ('A. Taylor', ''), ('Nimonic" Alloys', ''), ('1951', '')]
thes people & orgs (after)


[('Brass X-ray tube', ''), ('Debye-Scherrer', ''), ('A. Taylor', ''), ('Nimonic" Alloys', ''), ('1951', '')]
GT


-----
-----


In [10]:
# Run the pure-NER pipeline on the text of the first example and list the label
# of every entity it finds, one per line.
first_text = data[0][0]
doc = nlp_lg(first_text)

for entity in doc.ents:
    print(entity.label_)
    

DATE
ORG
GPE
CARDINAL
