install libraries just for this notebook:

In [1]:
!pip install rich
!pip install ipywidgets

You should consider upgrading via the '/Users/kalyan/.pyenv/versions/3.9.1/envs/hc/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/kalyan/.pyenv/versions/3.9.1/envs/hc/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../../..")

import spacy
from tqdm.auto import tqdm
from rich import print as rprint
from IPython.display import clear_output
import pickle
import random

from config.best_spacy_pipeline import load_model
from heritageconnector.datastore import es, index
from heritageconnector.disambiguation.search import es_text_search
from heritageconnector.utils.generic import get_timestamp
from es_utils import ElasticsearchConnector, simplify_document, get_hc_candidates, get_wiki_candidates

2021-01-27 16:58:03,540 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


In [3]:
nlp = load_model("en_core_web_trf")

2021-01-27 16:58:06,765 - hc_nlp.pipeline - INFO - Loading thesaurus from ../../../config/../GITIGNORE_DATA/labels_all_unambiguous_types_people_orgs.jsonl
2021-01-27 16:58:08,057 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 1s


## 1. Create dataset of NER-annotated record descriptions, with one annotation per record

```python
[(uri, doc), ...]
```

In [4]:
# hc_doc_generator is paginated so that spaCy's nlp.pipe can be used
no_docs = 1000
batch_size = 100

esc = ElasticsearchConnector(es, index)
hc_doc_generator = esc.get_document_generator(limit=no_docs, batch_size=batch_size)

In [5]:
def ner_annotate_page_of_docs(hc_doc_list: list, nlp):
    """
    Creates list of (uri, Doc: spacy.tokens.Doc) tuples from a list of (uri, text) tuples.
    Each doc object only contains one entity annotation, meaning the output list will be larger than the input list.
    """
    
    ner_data = []
    #     uris = [item[0] for item in hc_doc_list]
    descriptions = [item[1] for item in hc_doc_list]
    try:
        docs = list(nlp.pipe(descriptions, batch_size=50, n_process=1))
    except Exception as e:
        print(descriptions)
        raise e
    
    for idx, doc in enumerate(docs):
        for ent in doc.ents:
            tempdoc = spacy.tokens.Doc(nlp.vocab, words=[str(tok) for tok in doc])
            tempdoc.ents = [ent]
            
            ner_data.append((hc_doc_list[idx][0], tempdoc))
    
    return ner_data

In [6]:
"""
Create a list of annotated docs (no longer paginated), with one annotation per doc.

Output: [(uri, doc), ...]
"""

annotated_docs = []

# with nlp.disable_pipes(['tok2vec', 'tagger', 'lemmatizer']):
for doc_page in tqdm(hc_doc_generator, total=no_docs/batch_size):
    annotated_docs += ner_annotate_page_of_docs(doc_page, nlp)
        
len(annotated_docs)


  0%|          | 0/10.0 [00:00<?, ?it/s]

3120

In [7]:
annotated_docs[0]

('https://collection.sciencemuseumgroup.org.uk/objects/co8410829',
 Box containing preformed curves used by Brockbank , Wilson and Mulliner in connection with the surveying of railway lines around Manchester )

## 2. Candidate retrieval for entity mentions

Get candidate matches for each entity match by looking up the entity span in the HC Elasticsearch index.

Outputs `[(uri, doc, candidates), ...]`, where candidates is a dict with keys `uri`, `topconcept`, `label`, `description`.

In [8]:
docs_with_candidates = []

for uri, doc in tqdm(annotated_docs):
    ent_text = doc.ents[0].text
    candidates = get_hc_candidates(es, ent_text, limit=10)
    
    docs_with_candidates.append((uri, doc, candidates))
    

  0%|          | 0/3120 [00:00<?, ?it/s]

In [9]:
print(len(docs_with_candidates))

docs_with_candidates[0]

3120


('https://collection.sciencemuseumgroup.org.uk/objects/co8410829',
 Box containing preformed curves used by Brockbank , Wilson and Mulliner in connection with the surveying of railway lines around Manchester ,
 [{'uri': 'https://collection.sciencemuseumgroup.org.uk/objects/co8410829',
   'topconcept': 'OBJECT',
   'label': 'Box containing preformed curves used by Brockbank, Wilson and Mulliner',
   'description': 'Box containing preformed curves used by Brockbank, Wilson and Mulliner in connection with the surveying of railway lines around Manchester'},
  {'uri': 'https://collection.sciencemuseumgroup.org.uk/objects/co226954',
   'topconcept': 'OBJECT',
   'label': 'Ralph Brocklebank',
   'description': 'Painting, oil on canvas. Ralph Brocklebank, London & North Western Railway, "A Director of the LNWR .  Portrait was presented by his friends'},
  {'uri': 'https://collection.sciencemuseumgroup.org.uk/people/cp86298',
   'topconcept': 'PERSON',
   'label': 'Gibson, William Wilson',
   '

## 3. Interface for labelling correct linked entity

Input is one of:
* `1-n`: choice of HC entity
* `x`: 'none of above'
* `https://collection.sciencemuseum...`: custom entity

**First, we shuffle the list of records to annotate:**

In [10]:
random.Random(42).shuffle(docs_with_candidates)

In [13]:
def display_annotation_interface_page(doc_with_candidates):
    uri, doc, candidates = doc_with_candidates
    
    spacy.displacy.render(doc, style='ent')
    rprint(f"[link {uri}]🔗")
    print("")
    
    for idx, c in enumerate(candidates):
        rprint(f"{idx+1}: [bold red]{c['label']}, {c['topconcept']}[/bold red] \n[grey]{c['description']}[/grey] \n{c['uri']} \n ")
        
def annotate(number: int, offset: int = 0, save_to_disk = True, exclude_labels: list = ["NORP", "LOC", "EVENT", "DATE", "QUANTITY", "FAC", "PERCENT", "CARDINAL"]):
    """Launch annotation"""
    records_to_annotate = docs_with_candidates[0 + offset:number + offset]
    records_annotated = []
    
    for idx, record in enumerate(records_to_annotate):
        uri, doc, candidates = record
        
        if doc.ents[0].label_ in exclude_labels:
            continue
        
        rprint(f"[green]{idx+1}/{len(records_to_annotate)}[/green]")
        display_annotation_interface_page(record)
        user_input = input("> ")
        
        # TODO: number retrieved != number requested as some are skipped. Use while loop instead
        
        if user_input.isdigit():
            user_input = int(user_input)
            if user_input in range(1, len(candidates) + 1):
                annotation = candidates[user_input - 1]['uri']
        elif user_input == "x":
            annotation = "NO_MATCH"
        elif user_input.startswith("http"):
            annotation = user_input
        elif user_input == "q":
            rprint("[bold red]SESSION EXITED[/bold red]")
            break
        else:
            annotation = "INVALID/SKIPPED"
        
        records_annotated.append((uri, doc, candidates, annotation))
        
        clear_output()
        
    if save_to_disk:
        strtime = get_timestamp()
        with open(f"annotations_{strtime}.pkl", "wb") as f:
            pickle.dump(records_annotated, f)
        
    return records_annotated

def load_annotations_from_disk(file_path):
    with open(file_path, "rb") as f:
        a = pickle.load(f)

    return a


In [14]:
annotation_results = annotate(500, save_to_disk=True)




KeyboardInterrupt: Interrupted by user

In [None]:
annotation_results