In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter, MapEntityTypes, DateMatcher
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import display_manual_annotations, display_ner_annotations
from hc_nlp import constants

import pprint
pp = pprint.PrettyPrinter(indent=2)

import pandas as pd
import spacy
import time

In [8]:
# LOAD MODELS
#
# Builds the baseline spaCy model plus six variants that add rule-based
# components (thesaurus matcher, date matcher) before and/or after the
# statistical `ner` step, so they can be scored against each other below.

SPACY_MODEL = "en_core_web_lg"
THESAURUS_PATH = "../data/labels_all_unambiguous_types_people_orgs.jsonl"


def build_pipeline(before_ner=None, after_ner=None, last=None):
    """Load a fresh copy of SPACY_MODEL and attach optional custom components.

    Args:
        before_ner: component inserted immediately before the 'ner' pipe, or None.
        after_ner: component inserted immediately after the 'ner' pipe, or None.
        last: component appended at the end of the pipeline, or None.

    Returns:
        The loaded pipeline with the requested components added, in the
        order before_ner -> after_ner -> last.
    """
    nlp = spacy.load(SPACY_MODEL)
    if before_ner is not None:
        nlp.add_pipe(before_ner, before='ner')
    if after_ner is not None:
        nlp.add_pipe(after_ner, after='ner')
    if last is not None:
        nlp.add_pipe(last, last=True)
    return nlp


# pure NER
nlp_lg = spacy.load(SPACY_MODEL)

# custom components
# NOTE(review): these components are constructed from `nlp_lg` but are run
# inside separately loaded models below, and the same instances are shared
# between several pipelines -- confirm they do not depend on nlp_lg's own state.
thes = ThesaurusMatcher(nlp_lg, thesaurus_path=THESAURUS_PATH, case_sensitive=False)
thes_ow = ThesaurusMatcher(nlp_lg, thesaurus_path=THESAURUS_PATH,
                           case_sensitive=False, overwrite_ents=True)
entityfilter = EntityFilter(ignore_ent_labels=['DATE'])

# model with people & orgs thesaurus annotations before model
nlp_thes = build_pipeline(before_ner=thes, last=entityfilter)

# model with people & orgs thesaurus annotations after model
nlp_thes_aft = build_pipeline(after_ner=thes, last=entityfilter)

# model with people & orgs thesaurus annotations after model, overwriting NER annotations
nlp_thes_aft_ow = build_pipeline(after_ner=thes_ow, last=entityfilter)

# model with rules for dates (no EntityFilter on this variant)
datematcher = DateMatcher(nlp_lg)
nlp_dates_bef = build_pipeline(before_ner=datematcher)

# model with rules for dates before & thesaurus after
nlp_dates_bef_thes_aft = build_pipeline(before_ner=datematcher, after_ner=thes, last=entityfilter)

# model with rules for dates before & thesaurus after, with overwrite
nlp_dates_bef_thes_aft_ow = build_pipeline(before_ner=datematcher, after_ner=thes_ow, last=entityfilter)

all_pipelines = (
    nlp_lg,
    nlp_thes,
    nlp_thes_aft,
    nlp_thes_aft_ow,
    nlp_dates_bef,
    nlp_dates_bef_thes_aft,
    nlp_dates_bef_thes_aft_ow,
)

# add mapping from Spacy to HC types to all pipelines
# (added last, so it runs after every other component, including EntityFilter)
mapentitytypes = MapEntityTypes(nlp_lg, validate_mapping=False)
for _nlp in all_pipelines:
    _nlp.add_pipe(mapentitytypes)

# show the final component order of every pipeline as a sanity check
tuple(_nlp.pipe_names for _nlp in all_pipelines)


2020-12-10 16:41:21,344 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-10 16:41:21,344 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-10 16:41:32,707 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 11s
2020-12-10 16:41:32,707 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 11s
2020-12-10 16:41:32,709 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-10 16:41:32,709 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-10 16:41:36,189 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s
2020-12-10 16:41:36,189 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 3s


(['tagger', 'parser', 'ner', 'MapEntityTypes'],
 ['tagger',
  'parser',
  'ThesaurusMatcher',
  'ner',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger',
  'parser',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger', 'parser', 'DateMatcher', 'ner', 'MapEntityTypes'],
 ['tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'],
 ['tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'EntityFilter',
  'MapEntityTypes'])

In [9]:
# Label Studio export holding the manually annotated examples.
LABELSTUDIO_EXPORT_PATH = "../labelling/export/2020-12-10-12-43-04.zip"

# `data` is iterated further down as (text, annotations) pairs.
data = load_text_and_annotations_from_labelstudio(LABELSTUDIO_EXPORT_PATH, nlp_lg)

# number of labelled examples loaded
len(data)

526

In [10]:
def generate_results_table(pipelines: dict, examples: list):
    """Score every pipeline with `test_ner` and gather the results in one table.

    Args:
        pipelines: mapping of display name -> pipeline; each value is passed
            to `test_ner` as its first argument.
        examples: labelled examples, forwarded unchanged as `examples=`.

    Returns:
        A DataFrame indexed by 'pipeline' (the keys of `pipelines`, in
        iteration order). Columns are the keys of the `test_ner` result,
        flattened by `pd.json_normalize` (nested keys joined with '.').
        An empty `pipelines` mapping yields an empty frame with the same
        index name.

    Side effects:
        Prints the elapsed whole seconds for each pipeline.
    """
    frames = []
    for name, p in pipelines.items():
        # perf_counter is monotonic, so the duration cannot be skewed by
        # wall-clock adjustments.
        start = time.perf_counter()
        frame = pd.json_normalize(test_ner(p, examples=examples))
        elapsed = time.perf_counter() - start

        frame['pipeline'] = name
        frames.append(frame)

        print(f"{name}: {int(elapsed)}s")

    if not frames:
        # Nothing to evaluate: return an empty table rather than failing on
        # the missing 'pipeline' column.
        return pd.DataFrame(index=pd.Index([], name='pipeline'))

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole table on every iteration.
    return pd.concat(frames).set_index('pipeline')

# Pipelines to compare; each key becomes the pipeline's label in the results table.
# (A 'NER_datematcher_aft' variant is not included: no `nlp_dates_aft`
# pipeline is built in the cells shown here.)
pipelines_to_compare = {
    'NER': nlp_lg,
    'NER_thes_bef': nlp_thes,
    'NER_thes_aft': nlp_thes_aft,
    'NER_thes_aft_ow': nlp_thes_aft_ow,
    'NER_datematcher_bef': nlp_dates_bef,
    'NER_datematcher_bef_thes_aft': nlp_dates_bef_thes_aft,
    'NER_datematcher_bef_thes_aft_ow': nlp_dates_bef_thes_aft_ow,
}

res = generate_results_table(pipelines_to_compare, data)

# transpose: one column per pipeline, one row per metric
res.T
        

NER: 5s
NER_thes_bef: 6s
NER_thes_aft: 6s
NER_thes_aft_ow: 6s
NER_datematcher_bef: 6s
NER_datematcher_bef_thes_aft: 6s
NER_datematcher_bef_thes_aft_ow: 6s


pipeline,NER,NER_thes_bef,NER_thes_aft,NER_thes_aft_ow,NER_datematcher_bef,NER_datematcher_bef_thes_aft,NER_datematcher_bef_thes_aft_ow
ents_p,51.3795,56.25,55.9517,56.738,52.3932,57.0131,57.7955
ents_r,67.7885,67.3764,66.8269,67.3764,69.9176,68.956,69.5055
ents_f,58.4542,61.3125,60.9077,61.6013,59.9,62.4184,63.1119
support,1549,1549,1549,1549,1549,1549,1549
labels_missing_from_annotations,"[ORDINAL, CARDINAL, LANGUAGE, PERCENT, LAW, FA...","[ORDINAL, CARDINAL, LANGUAGE, PERCENT, LAW, FA...","[ORDINAL, CARDINAL, LANGUAGE, PERCENT, LAW, FA...","[ORDINAL, CARDINAL, LANGUAGE, PERCENT, LAW, FA...","[ORDINAL, CARDINAL, LANGUAGE, LAW, FAC, MONEY,...","[ORDINAL, CARDINAL, LANGUAGE, LAW, FAC, MONEY,...","[ORDINAL, CARDINAL, LANGUAGE, LAW, FAC, MONEY,..."
ents_per_type.LOC.p,78.2895,81.8815,80.339,82.5175,78.2895,80.339,82.5175
ents_per_type.LOC.r,79.5987,78.5953,79.2642,78.9298,79.5987,79.2642,78.9298
ents_per_type.LOC.f,78.9386,80.2048,79.798,80.6838,78.9386,79.798,80.6838
ents_per_type.LOC.support,322,322,322,322,322,322,322
ents_per_type.ORG.p,51.0917,50.303,51.7564,51.1294,51.0917,51.7564,51.1294


In [11]:
# res.T.to_csv("../data/AAIL_results_nlp_lg_2.csv")

In [7]:
# Visual side-by-side check on the first 50 labelled examples: for each text,
# render every pipeline's entities, list (entity text, entity id) pairs, then
# render the manual ground-truth annotations.
# To also inspect the plain NER model, add ("NER", nlp_lg) to this list.
pipelines_to_show = [
    ("thes people & orgs", nlp_thes),
    ("thes people & orgs (after)", nlp_thes_aft),
]

for text, annotations in data[:50]:
    for heading, pipeline in pipelines_to_show:
        print(heading)
        display_ner_annotations(text, pipeline)
        # ent_id_ is printed next to each entity's text
        print([(ent.text, ent.ent_id_) for ent in pipeline(text).ents])

    print("GT")
    display_manual_annotations(text, annotations)

    # two separator lines between examples
    for _ in range(2):
        print("-----")

thes people & orgs


[('Newhaven', ''), ('Dieppe', ''), ('C Mc D Mann & Co Ltd', ''), ('Hanley', ''), ('44 mm', ''), ('0.73kg', '')]
thes people & orgs (after)


[('Newhaven', ''), ('Dieppe', ''), ('C Mc D Mann & Co Ltd', ''), ('Hanley', ''), ('44 mm', ''), ('0.73kg', '')]
GT


-----
-----
thes people & orgs


[('c.1930-1969', ''), ('Smith', 'https://collection.sciencemuseumgroup.org.uk/people/cp120932'), ('Britain', ''), ('569', '')]
thes people & orgs (after)


[('c.1930-1969', ''), ('Smith Alarm', ''), ('Britain', ''), ('569', '')]
GT


-----
-----
thes people & orgs


[('Italy', '')]
thes people & orgs (after)


[('Italy', '')]
GT


-----
-----
thes people & orgs


[('Doulton', ''), ('20 gallon', ''), ('1934', '')]
thes people & orgs (after)


[('Doulton', ''), ('20 gallon', ''), ('1934', '')]
GT


-----
-----
thes people & orgs


[('Wood', ''), ('Liverpool', ''), ('1850', '')]
thes people & orgs (after)


[('Wood', ''), ('Liverpool', ''), ('1850', '')]
GT


-----
-----
thes people & orgs


[('Engraver', ''), ('Chicago', '')]
thes people & orgs (after)


[('Engraver', ''), ('Chicago', '')]
GT


-----
-----
thes people & orgs


[('Keystone', ''), ('October 1938', '')]
thes people & orgs (after)


[('Keystone', ''), ('October 1938', '')]
GT


-----
-----
thes people & orgs


[('Two', ''), ("Space Monkeys' Warehouse Party", '')]
thes people & orgs (after)


[('Two', ''), ("Space Monkeys' Warehouse Party", '')]
GT


-----
-----
thes people & orgs


[('Great Central Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp3885')]
thes people & orgs (after)


[('Great Central Railway', '')]
GT


-----
-----
thes people & orgs


[('33mm', ''), ('London & North Western Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp1589'), ('Richards', '')]
thes people & orgs (after)


[('33mm', ''), ('London & North Western Railway', ''), ('Richards', '')]
GT


-----
-----
thes people & orgs


[('Roman', ''), ('Sforza', '')]
thes people & orgs (after)


[('Roman', ''), ('Sforza', '')]
GT


-----
-----
thes people & orgs


[('Lancashire Derbyshire &', ''), ('East Coast', 'https://collection.sciencemuseumgroup.org.uk/people/cp128305'), ('Chesterfield', ''), ('Lincoln', ''), ('Sutton', ''), ('1896', ''), ('Bemrose', 'https://collection.sciencemuseumgroup.org.uk/people/cp11583')]
thes people & orgs (after)


[('Lancashire Derbyshire & East Coast Railway', ''), ('Chesterfield', ''), ('Lincoln', ''), ('Sutton', ''), ('1896', ''), ('Bemrose & Sons Ltd.', '')]
GT


-----
-----
thes people & orgs


[('John Pollock & Co.', '')]
thes people & orgs (after)


[('John Pollock & Co.', '')]
GT


-----
-----
thes people & orgs


[('Fumigator', ''), ('Alformant A', ''), ('the Formalin Hygienic Co. Ltd.', ''), ('London', '')]
thes people & orgs (after)


[('Fumigator', ''), ('Alformant A', ''), ('the Formalin Hygienic Co. Ltd.', ''), ('London', '')]
GT


-----
-----
thes people & orgs


[('Moroccan', ''), ('1890-1925', '')]
thes people & orgs (after)


[('Moroccan', ''), ('1890-1925', '')]
GT


-----
-----
thes people & orgs


[('Harrison', 'https://collection.sciencemuseumgroup.org.uk/people/cp19704'), ('2013', ''), ("Copper Horses'", ''), ('Harrison', 'https://collection.sciencemuseumgroup.org.uk/people/cp19704'), ('The Bradford Fellowship', ''), ('2012-2013', '')]
thes people & orgs (after)


[('Chris Harrison', ''), ('2013', ''), ("Copper Horses'", ''), ('Chris Harrison', ''), ('The Bradford Fellowship', ''), ('2012-2013', '')]
GT


-----
-----
thes people & orgs


[('Six', ''), ('C.V. Boys', ''), ('175 mm', '')]
thes people & orgs (after)


[('Six', ''), ('C.V. Boys', ''), ('175 mm', '')]
GT


-----
-----
thes people & orgs


[('Dudson Brothers', ''), ('Great North Eastern Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp17759'), ('39 mm', '')]
thes people & orgs (after)


[('Dudson Brothers', ''), ('Great North Eastern Railway', ''), ('39 mm', '')]
GT


-----
-----
thes people & orgs


[('National Union of Railwaymen', 'https://collection.sciencemuseumgroup.org.uk/people/cp7884')]
thes people & orgs (after)


[('National Union of Railwaymen', '')]
GT


-----
-----
thes people & orgs


[('Perken', ''), ('Double plano convex condensers', ''), ('Thornton Pickard', '')]
thes people & orgs (after)


[('Perken', ''), ('Double plano convex condensers', ''), ('Thornton Pickard', '')]
GT


-----
-----
thes people & orgs


[('Charles Frederick Grindrod', 'https://collection.sciencemuseumgroup.org.uk/people/cp110758'), ('Royal Photographic Society', 'https://collection.sciencemuseumgroup.org.uk/people/cp17553'), ('early 1900s', '')]
thes people & orgs (after)


[('Dr Charles Frederick Grindrod', ''), ('Royal Photographic Society', ''), ('early 1900s', '')]
GT


-----
-----
thes people & orgs


[('One', 'https://collection.sciencemuseumgroup.org.uk/people/cp113279'), ('Pearlware', ''), ('Graham', ''), ('Conduit House', ''), ('Pentonville', ''), ('London', '')]
thes people & orgs (after)


[('One', ''), ('Pearlware', ''), ('Graham', ''), ('Conduit House', ''), ('Pentonville', ''), ('London', '')]
GT


-----
-----
thes people & orgs


[('Cecil Beaton', 'https://collection.sciencemuseumgroup.org.uk/people/cp163137'), ('Rolling Stones', ''), ('Mick Jagger', '')]
thes people & orgs (after)


[('Cecil Beaton', ''), ('Rolling Stones', ''), ('Mick Jagger', '')]
GT


-----
-----
thes people & orgs


[('1953', '')]
thes people & orgs (after)


[('1953', '')]
GT


-----
-----
thes people & orgs


[('Taylor', ''), ('Taylor Hobson', '')]
thes people & orgs (after)


[('Taylor', ''), ('Taylor Hobson', '')]
GT


-----
-----
thes people & orgs


[('the Coaching Arrangements Book', ''), ('Railway Clearing House', 'https://collection.sciencemuseumgroup.org.uk/people/cp4558'), ('October 1921', '')]
thes people & orgs (after)


[('the Coaching Arrangements Book', ''), ('the Railway Clearing House', ''), ('October 1921', '')]
GT


-----
-----
thes people & orgs


[('Badge', ''), ('1986', ''), ('World Cup Snooker', '')]
thes people & orgs (after)


[('Badge', ''), ('1986', ''), ('World Cup Snooker', '')]
GT


-----
-----
thes people & orgs


[('London & North Eastern Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp1796'), ('Bamburgh', ''), ('Tom Purvis', 'https://collection.sciencemuseumgroup.org.uk/people/cp2312'), ('1936', ''), ('Coloured lithograph', ''), ('Bamburgh', ''), ('Chorley & Pickersgill Ltd', ''), ('Lithographers', ''), ('Leeds', ''), ('40', ''), ('1016', '')]
thes people & orgs (after)


[('London & North Eastern Railway', ''), ('Bamburgh', ''), ('Tom Purvis', ''), ('1936', ''), ('Coloured lithograph', ''), ('Bamburgh', ''), ('Chorley & Pickersgill Ltd', ''), ('Lithographers', ''), ('Leeds', ''), ('40', ''), ('1016', '')]
GT


-----
-----
thes people & orgs


[('John Pollock & Co.', '')]
thes people & orgs (after)


[('John Pollock & Co.', '')]
GT


-----
-----
thes people & orgs




[]
thes people & orgs (after)


[]
GT


-----
-----
thes people & orgs


[('Air Data Computer', ''), ('0510', ''), ('Smith', 'https://collection.sciencemuseumgroup.org.uk/people/cp120932'), ('1965', ''), ('-11', ''), ('XX105', '')]
thes people & orgs (after)


[('Air Data Computer', ''), ('0510', ''), ('Smith', ''), ('1965', ''), ('-11', ''), ('XX105', '')]
GT


-----
-----
thes people & orgs


[('Train Alterations', ''), ('April 15th, 1912', ''), ('South Eastern & Chatham Railway', 'https://collection.sciencemuseumgroup.org.uk/people/cp1689')]
thes people & orgs (after)


[('Train Alterations', ''), ('April 15th, 1912', ''), ('South Eastern & Chatham Railway', '')]
GT


-----
-----
thes people & orgs


[('Benedictine Order', ''), ('French', ''), ('1860-1920', '')]
thes people & orgs (after)


[('Benedictine Order', ''), ('French', ''), ('1860-1920', '')]
GT


-----
-----
thes people & orgs


[('French', ''), ('17th Century', '')]
thes people & orgs (after)


[('French', ''), ('17th Century', '')]
GT


-----
-----
thes people & orgs


[('Joshanda', ''), ('Unani Tibb', ''), ('Hamdard Laboratories', ''), ('Pakistan', ''), ('2005', '')]
thes people & orgs (after)


[('Joshanda', ''), ('Unani Tibb', ''), ('Hamdard Laboratories', ''), ('Pakistan', ''), ('2005', '')]
GT


-----
-----
thes people & orgs


[('British Railways', 'https://collection.sciencemuseumgroup.org.uk/people/cp3862'), ('Banana van', ''), ('Hornby', ''), ('England', '')]
thes people & orgs (after)


[('British Railways', ''), ('Banana van', ''), ('Hornby', ''), ('England', '')]
GT


-----
-----
thes people & orgs


[('British Railways, Eastern Region', 'https://collection.sciencemuseumgroup.org.uk/people/cp32472'), ('1974', '')]
thes people & orgs (after)


[('British Railways', ''), ('Eastern Region', ''), ('1974', '')]
GT


-----
-----
thes people & orgs


[('North America', ''), ('English', ''), ('1851-1920', '')]
thes people & orgs (after)


[('North America', ''), ('English', ''), ('1851-1920', '')]
GT


-----
-----
thes people & orgs


[('Barrie Trinder', ''), ('Reynolds', ''), ('Oxford Dictionary of National Biography,', ''), ('Oxford University Press', 'https://collection.sciencemuseumgroup.org.uk/people/cp6581'), ('2004', ''), ('Richard Reynolds', 'https://collection.sciencemuseumgroup.org.uk/people/cp51227'), ('1735-1816', ''), ('British', ''), ('English', '')]
thes people & orgs (after)


[('Barrie Trinder', ''), ('Reynolds', ''), ('Oxford Dictionary of National Biography, Oxford University Press', ''), ('2004', ''), ('Richard Reynolds', ''), ('1735-1816', ''), ('British', ''), ('English', '')]
GT


-----
-----
thes people & orgs


[('Bayer', 'https://collection.sciencemuseumgroup.org.uk/people/cp68155'), ('George', 'https://collection.sciencemuseumgroup.org.uk/people/cp68811')]
thes people & orgs (after)


[('Farbenfabriken Bayer AG', ''), ('W. & J. George', '')]
GT


-----
-----
thes people & orgs


[('Eighty five', ''), ('1970s-1990s', ''), ('Collett Dickenson Pearce', '')]
thes people & orgs (after)


[('Eighty five', ''), ('1970s-1990s', ''), ('Collett Dickenson Pearce', '')]
GT


-----
-----
thes people & orgs


[('40 degrees', '')]
thes people & orgs (after)


[('40 degrees', '')]
GT


-----
-----
thes people & orgs


[]
thes people & orgs (after)


[]
GT


-----
-----
thes people & orgs


[('Leslie Manufacturing Co. Ltd.', ''), ('Billinghurst', ''), ('1920', '')]
thes people & orgs (after)


[('Leslie Manufacturing Co. Ltd.', ''), ('Billinghurst', ''), ('1920', '')]
GT


-----
-----
thes people & orgs


[('New Hebrides', ''), ('1851-1900', '')]
thes people & orgs (after)


[('New Hebrides', ''), ('1851-1900', '')]
GT


-----
-----
thes people & orgs


[('Ectron', ''), ('St. Francis Psychiatric Hospital', ''), ('1950-1990', '')]
thes people & orgs (after)


[('Ectron', ''), ('St. Francis Psychiatric Hospital', ''), ('1950-1990', '')]
GT


-----
-----
thes people & orgs


[('1-12', ''), ('Regd', ''), ('443714', '')]
thes people & orgs (after)


[('1-12', ''), ('Regd', ''), ('443714', '')]
GT


-----
-----
thes people & orgs


[('John Pollock & Co.', '')]
thes people & orgs (after)


[('John Pollock & Co.', '')]
GT


-----
-----
thes people & orgs


[('British', ''), ('Wembley', '')]
thes people & orgs (after)


[('British', ''), ('Wembley', '')]
GT


-----
-----
thes people & orgs


[('John Pollock & Co.', '')]
thes people & orgs (after)


[('John Pollock & Co.', '')]
GT


-----
-----
