In [1]:
# Elasticsearch access (heritageconnector datastore + scan helper)
import sys
sys.path.append("../..")
from heritageconnector import datastore
from elasticsearch import helpers
from itertools import islice

# BLINK entity-linking service (HTTP requests + timing)
import requests, json
import time

# Multi-item endpoint of the BLINK service, and the headers sent with every request to it.
endpoint = "http://54.154.40.169:8000/blink/multiple"
headers = {"Content-Type": "application/json"}

# Number of unlinked entity mentions across all categories except DATE, LANGUAGE and NORP,
# counted on 15.04.21 with the en_core_web_lg model. Used to extrapolate the time needed
# to link the whole corpus from the per-item time of a sample request.
no_unlinked_mentions = 462_921

2021-04-15 16:54:26,455 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


## 1. Load unlinked entity mentions from Elasticsearch

In [2]:
# Elasticsearch paths of the entity-mention fields to look up, one per entity type.
# DATE, LANGUAGE and NORP are deliberately left out.
entity_fields = [
    f"graph.@hc:entity{entity_type}.@value"
    for entity_type in ("PERSON", "ORG", "LOC", "FAC", "OBJECT")
]

# One `exists` clause per entity field.
fields_exist = list(map(lambda field: {"exists": {"field": field}}, entity_fields))

# The clauses are combined under `should`, i.e. as alternatives rather than joint requirements.
es_query = {"query": {"bool": {"should": fields_exist}}}


# Generator over every document in the `heritageconnector` index that matches `es_query`.
# It is shared by the cells below, each of which consumes documents from it.
doc_generator = helpers.scan(
    client=datastore.es,
    query=es_query,
    index="heritageconnector",
    preserve_order=True,
)

In [22]:
def get_lookup_expressions_from_doc(doc, field_names):
    """Build one lookup item per entity mention found in an Elasticsearch document.

    Each item carries the document's description with a single mention wrapped as
    ``[[mention]]``. Only the first occurrence of the mention is wrapped; if the
    mention text does not occur in the description, the text is left unchanged.

    Args:
        doc: Elasticsearch hit. Reads ``doc['_id']``, the description stored under
            ``doc['_source']['data']['http://www.w3.org/2001/XMLSchema#description']``
            and the entity fields under ``doc['_source']['graph']``.
        field_names: Entity fields to read, given either as graph keys
            (``"@hc:entityPERSON"``) or as full Elasticsearch paths
            (``"graph.@hc:entityPERSON.@value"``). Both forms may be mixed.

    Returns:
        A list of ``{"uid": ..., "text": ..., "metadata": {"mention": ..., "label": ...}}``
        dicts, in the order of ``field_names`` and then of the mentions within each field.
        The list is empty when ``field_names`` is empty or the document has none of the fields.

    Raises:
        KeyError: if the document has no ``_id``, no description or no ``graph`` section.
    """
    path_prefix, path_suffix = "graph.", ".@value"

    def _graph_key(name):
        # Reduce "graph.<key>.@value" to "<key>". Each name is normalised on its own,
        # so names that are already bare graph keys pass through untouched.
        if name.startswith(path_prefix):
            name = name[len(path_prefix):]
        if name.endswith(path_suffix):
            name = name[: -len(path_suffix)]
        return name

    # get description and entity mentions
    uri = doc['_id']
    description = doc['_source']['data']['http://www.w3.org/2001/XMLSchema#description']
    graph = doc['_source']['graph']
    ent_mentions = []

    for field_name in map(_graph_key, field_names):
        # The query only guarantees that *some* entity field exists, so any
        # particular field may be absent from this document.
        field_val = graph.get(field_name)

        if isinstance(field_val, dict):
            field_val = [field_val]
        elif not isinstance(field_val, list):
            continue

        ent_mentions.extend((value['@value'], field_name) for value in field_val)

    # create modified descriptions with each mention [[highlighted]].
    # only the first mention is highlighted, if the entity mention occurs more than once in the description
    return [
        {
            "uid": uri,
            "text": description.replace(mention, f"[[{mention}]]", 1),
            "metadata": {"mention": mention, "label": label},
        }
        for mention, label in ent_mentions
    ]

# Take one document off the scan and show the lookup items built from it.
# NOTE: this advances the shared `doc_generator`, so the document shown here is not
# included in anything read from the generator afterwards, and re-running this cell
# shows the next document rather than the same one.
doc = next(doc_generator)
get_lookup_expressions_from_doc(doc, entity_fields)

[{'uid': 'https://collection.sciencemuseumgroup.org.uk/people/cp39976',
  'text': 'Website - [[Zoltan Glass]] Speed and Spirit (http://www.intercult.org/2-ex-glass-e.htm) WIKI: http://en.wikipedia.org/wiki/Zolt%C3%A1n_Glass 4 October 2010 http://www.zoltanglass.com/zoltan.html accessed 12 September 2014 \n  \n In 1925 he began work as a cartoonist and retoucher, but moved into photography in the early 1930s; he took many photographs at the Nürburgring and Avus races, but subsequently  moved to London; his work also concentrated on fashion and naturist photography; established Zoltan Glass Studio at 183, Kings Road, Chelsea (later at Paradise Walk, SW3); sold studio in 1964 and retired to France, where he died in 1981',
  'metadata': {'mention': 'Zoltan Glass', 'label': '@hc:entityPERSON'}},
 {'uid': 'https://collection.sciencemuseumgroup.org.uk/people/cp39976',
  'text': 'Website - Zoltan Glass Speed and Spirit (http://www.intercult.org/2-ex-glass-e.htm) WIKI: http://en.wikipedia.org/w

## 2. Query these mentions against BLINK
- How good are the links that BLINK returns?
- How fast is it (seconds per mention, and projected time for the whole corpus)?

In [23]:
def make_blink_request(query: dict, timeout=None):
    """POST a batch of lookup items to the BLINK endpoint and report how long it took.

    Prints the number of items, the elapsed time, the time per item, and the time
    predicted for linking all ``no_unlinked_mentions`` mentions at that rate.

    Args:
        query: Request body; ``query['items']`` is the list of items to link.
            The whole dict is sent as JSON to the module-level ``endpoint``.
        timeout: Optional timeout in seconds passed to ``requests``. The default
            ``None`` waits indefinitely, as before.

    Returns:
        The decoded JSON body of the response.

    Raises:
        KeyError: if ``query`` has no ``'items'`` key.
        ValueError: if ``query['items']`` is empty (raised before any request is sent).
        requests.HTTPError: if the service answers with an error status.
    """
    request_len = len(query['items'])
    if request_len == 0:
        # A per-item rate is undefined for an empty batch; fail before the network call
        # instead of with a ZeroDivisionError after it.
        raise ValueError("query['items'] is empty: there is nothing to link")

    # perf_counter is monotonic, so the measured interval cannot be skewed by clock adjustments.
    start = time.perf_counter()
    response = requests.post(endpoint, headers=headers, data=json.dumps(query), timeout=timeout)
    # Do not report timings (or try to decode a body) for a failed request.
    response.raise_for_status()
    response_json = response.json()
    response_time = time.perf_counter() - start

    response_rate = response_time / request_len
    time_wholecorpus = response_rate * no_unlinked_mentions

    print(f"{request_len} items processed in {round(response_time, 1)} seconds ({response_rate} seconds/item)")
    print(f"Predicted time to attempt to link all mentions: {int(time_wholecorpus)} seconds = {round(time_wholecorpus/3600, 1)} hours")
    return response_json

In [24]:
# Build the request batch from the next `no_docs` documents of the shared scan:
# every entity mention of every document becomes one item.
no_docs = 10
items = []

for doc in islice(doc_generator, no_docs):
    items.extend(get_lookup_expressions_from_doc(doc, entity_fields))

In [25]:
# Request body for BLINK: the batch of items plus a threshold.
# NOTE(review): the threshold is presumably a minimum link score — confirm against the BLINK service.
query = dict(items=items, threshold=0.8)

In [26]:
# Send the batch to BLINK: the call prints the timing statistics and, as the last
# expression of the cell, its returned JSON is displayed below.
make_blink_request(query)

161 items processed in 73.3 seconds (0.4550112167500561 seconds/item)
Predicted time to attempt to link all mentions: 210634 seconds = 58.5 hours


{'items': [{'uid': 'https://collection.sciencemuseumgroup.org.uk/people/cp51512',
   'metadata': {'mention': 'Thomas D. Lockwood', 'label': '@hc:entityPERSON'},
   'text': "SCM Archive coll \n (1848-1927), Electrical engineer  Thomas Dixon Lockwood, born on the 30th December 1848 at Smethwick, Birmingham, left England for Canada with his father in 1865 and became the first operator at Port Hope, Ontario, for the Provincial Telegraph Co. He afterwards went to the United States and was successively engaged in making paper in Massachusetts, plate glass in Indiana, and teaching school in Arkansas. His early experience as a telegraph operator and electrical work exerted their influence, and other pursuits were abandoned for telegraphy.  He worked for the Gold and Stock and the American District Telegraph Companies, among others, and in 1879 was appointed as assistant general inspector in the Bell Telephone Co. His work went much beyond his titular office, for the organization was then in th