In [1]:
%load_ext autoreload
%autoreload 2

# Elasticsearch access (project datastore + scan helper)
import sys
sys.path.append("../..")
from heritageconnector import datastore
from heritageconnector.nlp.nel import BLINKServiceWrapper
from elasticsearch import helpers
from itertools import islice

# BLINK entity-linking service (HTTP requests + timing)
import requests, json
import time

# URL of the BLINK service route that the requests in this notebook are sent to.
endpoint = "http://54.195.144.9:8000/blink/multiple"
headers = {"Content-Type": "application/json"}

# Count of unlinked entity mentions over every category other than DATE, LANGUAGE and NORP,
# measured on 15.04.21 with the en_core_web_lg model. Used to extrapolate total linking time.
no_unlinked_mentions = 462_921

2021-04-16 14:27:45,199 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


## 1. Load unlinked entity mentions from Elasticsearch

In [3]:
# Elasticsearch field paths of the entity types to look up.
# DATE, LANGUAGE and NORP are deliberately left out:
#   "graph.@hc:entityDATE.@value", "graph.@hc:entityLANGUAGE.@value", "graph.@hc:entityNORP.@value"
entity_fields = [
    "graph.@hc:entityPERSON.@value",
    "graph.@hc:entityORG.@value",
    "graph.@hc:entityLOC.@value",
    "graph.@hc:entityFAC.@value",
    "graph.@hc:entityOBJECT.@value",
]

# One "exists" clause per entity field, combined below as the `should` clauses of a bool query.
fields_exist = [{"exists": {"field": field_path}} for field_path in entity_fields]

es_query = {"query": {"bool": {"should": fields_exist}}}


# Lazily iterate over every hit of `es_query` in the 'heritageconnector' index.
# NOTE(review): preserve_order=True is documented as potentially expensive for scans — confirm it is needed.
doc_generator = helpers.scan(
    client=datastore.es,
    index="heritageconnector",
    query=es_query,
    preserve_order=True,
)

In [3]:
def get_lookup_expressions_from_doc(doc, field_names):
    """Build one lookup item per entity mention found on an Elasticsearch hit.

    For every mention stored under one of the allowed entity fields, a copy of the
    document's description is produced in which the first occurrence of the mention
    is wrapped as ``[[mention]]``. Later occurrences are left untouched.

    Args:
        doc: an Elasticsearch hit. Reads ``doc['_id']``, ``doc['_source']['data']``
            and ``doc['_source']['graph']``.
        field_names: allowed entity fields, given either as graph keys
            (``"@hc:entityPERSON"``) or as full field paths
            (``"graph.@hc:entityPERSON.@value"``).

    Returns:
        A list of ``{"id", "text", "metadata": {"mention", "label"}}`` dicts, one per
        mention, ordered by ``field_names``. The list is empty when the document has
        no description or none of the requested fields.
    """
    path_prefix, path_suffix = "graph.", ".@value"

    def _graph_key(name):
        # Reduce "graph.<key>.@value" to "<key>"; names already in key form pass through.
        if name.startswith(path_prefix):
            name = name[len(path_prefix):]
        if name.endswith(path_suffix):
            name = name[:-len(path_suffix)]
        return name

    source = doc['_source']
    uri = doc['_id']
    description = source.get('data', {}).get('http://www.w3.org/2001/XMLSchema#description')
    if not description:
        # Nothing to highlight a mention in.
        return []

    graph = source.get('graph', {})
    ent_mentions = []

    for field_name in map(_graph_key, field_names):
        # The search only requires *one* of the entity fields to exist on a hit,
        # so any individual field may be absent.
        field_val = graph.get(field_name)

        if isinstance(field_val, dict):
            ent_mentions.append((field_val['@value'], field_name))
        elif isinstance(field_val, list):
            ent_mentions.extend((v['@value'], field_name) for v in field_val)

    # Only the first occurrence of each mention is highlighted.
    return [
        {
            "id": uri,
            "text": description.replace(mention, f"[[{mention}]]", 1),
            "metadata": {"mention": mention, "label": label},
        }
        for mention, label in ent_mentions
    ]

# Take the next hit off the shared scan generator and display the lookup items built from it.
# NOTE: next() consumes that hit, so any later cell iterating `doc_generator` resumes after it.
doc = next(doc_generator)
get_lookup_expressions_from_doc(doc, entity_fields)

[{'id': 'https://collection.sciencemuseumgroup.org.uk/objects/co422089',
  'text': 'Notice. British Railways (Southern Region). Suspension of trains between Waterloo and Clapham Junction on the Hounslow (via Brentford Central or Richmond), Shepperton and Teddington (via Richmond) and Windsor and [[Weybridge]] (via Richmond) routes due to engineering works, 5 May 1957. Green text on white background. BR (SR) ref: AD7407/B1-1/4/9457. Printed by The Baynard Press. Format: double royal. Dimensions: 40 x 25 inches, 1016 x 635mm.',
  'metadata': {'mention': 'Weybridge', 'label': '@hc:entityPERSON'}},
 {'id': 'https://collection.sciencemuseumgroup.org.uk/objects/co422089',
  'text': 'Notice. British Railways (Southern Region). Suspension of trains between Waterloo and Clapham Junction on the Hounslow (via [[Brentford Central]] or Richmond), Shepperton and Teddington (via Richmond) and Windsor and Weybridge (via Richmond) routes due to engineering works, 5 May 1957. Green text on white backgro

## 2. Query these mentions against BLINK
- how good are the links?
- how fast is it?

In [4]:
def make_blink_request(query: dict, timeout=None):
    """POST a batch of lookup items to the BLINK endpoint and report timing.

    Prints how long the request took and, when the batch is non-empty, the
    per-item rate and the extrapolated time to process ``no_unlinked_mentions``
    items at that rate.

    Args:
        query: request payload; must contain an ``"items"`` list.
        timeout: seconds to wait for the server, passed straight to ``requests``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    request_len = len(query['items'])

    # perf_counter is monotonic, so the measured interval cannot be skewed by clock adjustments.
    start = time.perf_counter()
    response = requests.post(endpoint, headers=headers, data=json.dumps(query), timeout=timeout)
    # Fail on the HTTP status itself rather than on whatever the error body decodes to.
    response.raise_for_status()
    response_json = response.json()
    response_time = time.perf_counter() - start

    if request_len == 0:
        # No per-item rate can be derived from an empty batch (would divide by zero).
        print(f"{request_len} items processed in {round(response_time, 1)} seconds")
        return response_json

    seconds_per_item = response_time / request_len
    time_wholecorpus = seconds_per_item * no_unlinked_mentions

    print(f"{request_len} items processed in {round(response_time, 1)} seconds ({seconds_per_item} seconds/item)")
    print(f"Predicted time to attempt to link all mentions: {int(time_wholecorpus)} seconds = {round(time_wholecorpus/3600, 1)} hours")
    return response_json

In [16]:
# Service wrapper pointed at `endpoint`, told which description field and entity fields to use.
blink_service = BLINKServiceWrapper(
    endpoint,
    description_field="data.http://www.w3.org/2001/XMLSchema#description",
    entity_fields=entity_fields,
)

In [5]:
# Number of hits to draw from the scan generator for this trial batch.
no_docs = 10

# Flatten the lookup items of those hits into a single list.
items = []
for doc in islice(doc_generator, no_docs):
    items.extend(get_lookup_expressions_from_doc(doc, entity_fields))

In [6]:
# Request payload: the collected lookup items plus a threshold of 0.8.
query = {"items": items, "threshold": 0.8}

In [14]:
# Send the batch through the wrapper's own request method; the notebook-level
# `make_blink_request` function defined earlier is not the one being called here.
blink_service.make_blink_request(query)

2021-04-16 13:40:09,252 - heritageconnector.nlp.nel - INFO - 123 items processed in 70.5 seconds (0.5731584638114867 seconds/item)


{'items': [{'id': 'https://collection.sciencemuseumgroup.org.uk/objects/co226701',
   'metadata': {'mention': 'Yorks', 'label': '@hc:entityPERSON'},
   'text': 'Print. \'The New Elizabethan Era\' by Michael Turner. Mounted in folder  with accompanying text. A4 4-6-2 no 60022 Mallard hauling The Elizabethan passenger train in the cutting between Copenhagen  and Gasworks tunnels approaching Kings Cross. Other steam- hauled trains in the picture and carriages of another train passing out of view on overhead bridge. Factory chimney "Ebonite" in background. Published by Michael Benn & Associates   Ltd, Wetherby, [[Yorks]]. 255 x 310mm. Copyright: Michael Turner.  10 x 12.2047 in.',
   'links': [{'title': 'Yorkshire',
     'url': 'https://en.wikipedia.org/wiki/Yorkshire',
     'score': 0.999674916267395,
     'qid': 'Q163'}]},
  {'id': 'https://collection.sciencemuseumgroup.org.uk/objects/co226701',
   'metadata': {'mention': 'Kings Cross', 'label': '@hc:entityORG'},
   'text': 'Print. \'The

## 3. A full wrapper around unlinked entities and BLINK

In [10]:
# Re-create the wrapper, this time also passing wiki_link_threshold=0.8.
blink_service = BLINKServiceWrapper(
    endpoint,
    description_field="data.http://www.w3.org/2001/XMLSchema#description",
    entity_fields=entity_fields,
    wiki_link_threshold=0.8,
)

In [11]:
# Small trial run (limit=20, page_size=5) written to a local JSON-lines file.
blink_service.process_unlinked_entity_mentions(
    output_path="./test_output.jsonl",
    page_size=5,
    limit=20,
)

  0%|          | 0/4 [00:00<?, ?it/s]