# Exporting Spacy Labels to Label Studio

In [1]:
import sys
sys.path.append("..")

import spacy
from hc_nlp.pipeline import EntityFilter, ThesaurusMatcher, DateMatcher, MapEntityTypes

import json
from tqdm.auto import tqdm

In [2]:
nlp = spacy.load("en_core_web_lg")

thes_ow = ThesaurusMatcher(nlp, thesaurus_path="../data/labels_all_unambiguous_types_people_orgs.jsonl", 
                                  case_sensitive=False, overwrite_ents=True)
entityfilter = EntityFilter()
datematcher = DateMatcher(nlp)
mapentitytypes = MapEntityTypes(nlp, validate_mapping=False)

nlp.add_pipe(datematcher, before='ner')
nlp.add_pipe(thes_ow, after='ner')
nlp.add_pipe(entityfilter, last=True)
nlp.add_pipe(mapentitytypes, last=True)

nlp.pipe_names

2020-12-08 18:46:35,362 - hc_nlp.pipeline - INFO - Loading thesaurus from ../data/labels_all_unambiguous_types_people_orgs.jsonl
2020-12-08 18:46:40,537 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 5s


['tagger',
 'parser',
 'DateMatcher',
 'ner',
 'ThesaurusMatcher',
 'EntityFilter',
 'MapEntityTypes']

In [3]:
with open("../data/text_processed_750.json", "r") as f:
    data = json.load(f)
    
data[0]

{'uri': 'https://collection.sciencemuseumgroup.org.uk/objects/co8663782',
 'text': "Two-minute phonograph cylinder containing ‘The Dawn ’ by Amy Evans, associated with an Edison 'Amberola' phonograph, 1880-1912"}

In [4]:
allowed_labels = ["PERSON", "ORG", "NORP", "LOC", "OBJECT", "EVENT", "DATE"]

In [7]:
new_data = []
ignored_labels = []

for idx, item in tqdm(enumerate(data), total=len(data)):
    new_item = {"id": idx, "data": item}
    
    completions = [{
#         "completed_at": 10000000,
        "id": idx*1000 + 1,
        "lead_time": 0,
        "result": []
    }]
    
    doc = nlp(item['text'])
    
    for idx, ent in enumerate(doc.ents):
        if ent.label_ in allowed_labels:
            completions[0]['result'].append(
                {
                    "from_name": "label",
                    "id": str(hash(ent.text)),
                    "to_name": "text",
                    "type": "labels",
                    "value": {
                      "labels": [
                        ent.label_
                      ],
                      "start": doc[ent.start].idx,
                      "end": doc[ent.end-1].idx + len(doc[ent.end-1].text),
                      "text": ent.text,
                      "score": 1,
                    },
                }
            )
        else:
            ignored_labels.append(ent.label_)
        
    new_item['predictions'] = completions
    new_data.append(new_item)    
    
    
set(ignored_labels)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=750.0), HTML(value='')))




{'CARDINAL', 'FAC', 'LANGUAGE', 'LAW', 'MONEY', 'ORDINAL', 'QUANTITY', 'TIME'}

In [8]:
assert len(new_data) == len(data)
# output_path = "../data/text_processed_spacy_lg_750.json"

with open(output_path, 'w') as f:
    json.dump(new_data, f)

In [6]:
# -- DEBUG
text = "Filing coherer, glass tube fitted on to mounting bracket with terminals, designed or used by Sir Oliver Lodge, England, 1894-1904"

doc = nlp(text)
for idx, ent in enumerate(doc.ents):
    print(ent.text, ent.label_)
    print(doc[ent.end].idx + len(doc[ent.end]) - 1 if ent.end < len(doc) else len(doc.text)-1)
    print()

Oliver Lodge PERSON
109

England LOC
118

1894-1904 DATE
128

