# Training a new model


In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from hc_nlp.pipeline import ThesaurusMatcher, EntityFilter, MapEntityTypes, DateMatcher
from hc_nlp.model_testing import test_ner
from hc_nlp.io import load_text_and_annotations_from_labelstudio
from hc_nlp.spacy_helpers import correct_entity_boundaries
from hc_nlp import constants

import pprint
pp = pprint.PrettyPrinter(indent=2)

import pandas as pd
import time

import random
import warnings

from tqdm.auto import tqdm

import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example


In [2]:
nlp = spacy.load("en_core_web_lg")

## 1. Create training set
This is all the data that isn't in the test set at `../data/TEST_SET_2020-12-10-12-43-04.zip`

In [3]:
TEST_DATA = load_text_and_annotations_from_labelstudio('../data/TEST_SET_2020-12-10-12-43-04.zip', spacy_model=nlp)
test_text = [i[0] for i in TEST_DATA]

In [4]:
TEST_DATA[0], test_text[0]

(('Soup plate, white ceramic, Newhaven to Dieppe service logo at top. Made by C Mc D Mann & Co Ltd, Hanley. Overall: 44 mm x 253 mm, 0.73kg.',
  [(97, 103, 'LOC'), (27, 35, 'LOC'), (39, 45, 'LOC'), (75, 95, 'ORG')]),
 'Soup plate, white ceramic, Newhaven to Dieppe service logo at top. Made by C Mc D Mann & Co Ltd, Hanley. Overall: 44 mm x 253 mm, 0.73kg.')

In [5]:
text = pd.read_json("../data/text_all.json")

In [6]:
# text_test may contain duplicates as the descriptions come from labelled data, but text_train will not
text_train = text[~text['text'].isin(test_text)].drop_duplicates(subset='text')
text_test = text[text['text'].isin(test_text)]

len(text), len(text_train), len(text_test)

(291620, 264098, 2297)

In [7]:
train_size = None

text_train = text_train.sample(train_size, random_state=42) if train_size is not None else text_train

## 2. Train new model

### 2.1 Initialise model with rule- and thesaurus-based matching

- `nlp` is our out-of-the-box model
- `nlp_thes` is our model with the additional components

In [8]:
# model with rules for dates before & thesaurus after, with overwrite
# thes_ow = ThesaurusMatcher(nlp, thesaurus_path="../data/labels_all_unambiguous_types_people_orgs.jsonl", 
#                                   case_sensitive=False, overwrite_ents=True)
# entityfilter = EntityFilter(ent_labels_ignore=['DATE'])
# mapentitytypes = MapEntityTypes(nlp, validate_mapping=False)

nlp_thes = spacy.load("en_core_web_lg")
nlp_thes.add_pipe('DateMatcher', before='ner')
nlp_thes.add_pipe('ThesaurusMatcher', config={"case_sensitive": False, "overwrite_ents": True, "thesaurus_path":"../data/labels_all_unambiguous_types_people_orgs.jsonl"}, after='ner')
nlp_thes.add_pipe('EntityFilter', config={"ent_labels_ignore": ["DATE"]}, last=True)
# nlp_thes.add_pipe(mapentitytypes)

# nlp.add_pipe(mapentitytypes)

nlp.pipe_names, nlp_thes.pipe_names

2021-01-07 15:53:21,212 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl


2021-01-07 15:53:23,493 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 2s


17016 term thesaurus imported in 2s


(['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'],
 ['tok2vec',
  'tagger',
  'parser',
  'DateMatcher',
  'ner',
  'ThesaurusMatcher',
  'attribute_ruler',
  'lemmatizer',
  'EntityFilter'])

### 2.2 Create training data

In [9]:
def get_entity_list(doc, correct=True):
    entity_list = []
    
    for ent in doc.ents:
        start = doc[ent.start].idx

        # TODO: correct end in hc-nlp
        end = doc[ent.end-1].idx + len(doc[ent.end-1].text)
        entity_list.append((start, end, ent.label_))
        
    if correct:
        entity_list_old = entity_list
        entity_list = correct_entity_boundaries(nlp, doc.text, entity_list)

        if entity_list_old != entity_list:
            print(entity_list_old, entity_list, doc.text)

    return entity_list

text = "Who is Shaka Khan?"
doc = nlp_thes(text)

get_entity_list(doc)

[(7, 17, 'PERSON')]

In [10]:
TRAIN_DATA = []
model = nlp_thes

start = time.time()
for doc in model.pipe(text_train['text'].head(5000).tolist()):
    TRAIN_DATA.append(
        Example.from_dict(doc, {"entities": get_entity_list(doc)})
#         (doc.text, {"entities": get_entity_list(doc)})
    )
end = time.time()

print(f"{len(TRAIN_DATA)} records created in {int(end-start)} seconds")

[(123, 142, 'ORG')] [(130, 141, 'ORG')] Window, showing figure of St.Paul when viewed through polaroid screens. (includes illuminating box and 2 viewing screens):Children's Gallery: Polarised Light (St. Pauls Window).
5000 records created in 55 seconds


### 2.3 Train new model

In [11]:
nlp_new = spacy.load('en_core_web_lg')
ner = nlp_new.get_pipe("ner")

n_iter = 100

In [12]:
required_pipes = ['ner']
disable_pipes = [pipe for pipe in nlp_new.pipe_names if pipe not in required_pipes]

with nlp_new.disable_pipes(*disable_pipes), warnings.catch_warnings():
    # show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
#             examples = zip(*batch)
            try:
                nlp_new.update(
                    batch,
                    drop=0.5, # dropout for regularization
                    losses=losses,
                )
                # TODO: drop this except and let spaCy handle the errors
            except ValueError:
#                 print(batch)
#                 print()
                print('failure')
                break

        print("Losses", losses)
        print("Test performance", test_ner(nlp_new, examples=TEST_DATA))
        

failure
Losses {'ner': 3602.1182443512066}
Failed:  Poster, London & North Eastern Railway, Bamburgh by Tom Purvis, 1936. Coloured lithograph depicting a stylised view of the coast with Bamburgh castle, the beach and village. Printed by Chorley & Pickersgill Ltd, Lithographers, Leeds. Format: double royal. Dimensions: 40 x 25 inches, 1016 x 635mm.
[E103] Trying to set conflicting doc.ents: '(40, 48, 'LOC')' and '(40, 51, 'NORP')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Glass bottle containing unknown grey, metallic-looking powder. Part of Statham's student chemical laboratory.
[E103] Trying to set conflicting doc.ents: '(71, 78, 'ORG')' and '(71, 78, 'PERSON')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.
Failed:  Bone gouge probably by McQueen of Newcastle, England, late 19th early 20th century, nickel plated steel, handle probably brass, nickel plated
[E103] Trying to

KeyboardInterrupt: 