## Running inference on the entity linker where the source records are not directly pulled from the Elasticsearch index

e.g. we want to make them up for a demo!

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../../..")

from heritageconnector import datastore
from heritageconnector.config import field_mapping
from heritageconnector.best_spacy_pipeline import load_model
from heritageconnector.datastore import es, index
from smg_jobs.smg_loader import preprocess_text_for_ner

import entity_linker

import pandas as pd
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

2021-05-06 10:48:54,261 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


### Create NERLoader and train linker

In [2]:
source_description_field = target_description_field = "data.http://www.w3.org/2001/XMLSchema#description"
target_title_field = "graph.@rdfs:label.@value"
target_alias_field = "graph.@skos:altLabel.@value"
target_type_field = "graph.@skos:hasTopConcept.@value"

record_loader = datastore.RecordLoader("SMG", field_mapping)
ner_loader = datastore.NERLoader(
    record_loader,
    source_es_index = 'heritageconnector',
    target_es_index = 'heritageconnector',
    source_description_field = source_description_field, 
    target_title_field = target_title_field, 
    target_description_field = target_description_field, 
    target_type_field = target_type_field,
    target_alias_field = target_alias_field,
    entity_types_to_link={"PERSON", "OBJECT", "ORG"},
    text_preprocess_func=preprocess_text_for_ner
)

data_path = "../../../GITIGNORE_DATA/NEL/review_data_1103.xlsx"
clf = ner_loader.train_entity_linker(data_path)
ner_loader.has_trained_linker

2021-05-06 10:48:56,652 - heritageconnector.datastore - INFO - Training entity linker...
2021-05-06 10:48:59,270 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (1/2)
2021-05-06 10:49:02,485 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (2/2)


True

### Make up some entities and run inference on them

In [3]:
made_up_ents = [
    {
        "item_uri": "fake_uri",
        "item_description": "Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",
        "ent_label": "ORG",
        "ent_text": "Macy",
        "item_description_with_ent": "[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",
        "ent_sentence": "",
    },
    {
        "item_uri": "fake_uri_2",
        "item_description": "Reconstruction of the double helix model of DNA, using some of the original metal plates, by Francis Crick and James Watson, England, 1953",
        "ent_label": "PERSON",
        "ent_text": "Francis Crick",
        "item_description_with_ent": "Reconstruction of the double helix model of DNA, using some of the original metal plates, by [[Francis Crick]] and James Watson, England, 1953",
        "ent_sentence": "",
    }
]



In [4]:
# get link candidates - we have to set a value for 'alias' here otherwise it breaks
ner_loader._entity_list = made_up_ents
ner_loader.get_link_candidates(candidates_per_entity_mention=10)
ner_loader._entity_list
# ner_loader._entity_list[0]['link_candidates'][0]['alias'] = ""

# ent_df = ner_loader.entity_list_as_dataframe
# ent_df.head(2)

2021-05-06 10:49:41,854 - heritageconnector.datastore - INFO - Getting link candidates for each of 2 entities


  0%|          | 0/2 [00:00<?, ?it/s]

[{'item_uri': 'fake_uri',
  'item_description': 'Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization',
  'ent_label': 'ORG',
  'ent_text': 'Macy',
  'item_description_with_ent': '[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization',
  'ent_sentence': '',
  'link_candidates': [{'uri': 'https://collection.sciencemuseumgroup.org.uk/people/cp160978',
    'title': 'MACK',
    'description': 'accessed 10 February 2015   2 November 2014      Art and photography publishing house based London, established 2011 by Michael Mack, John Koh and Jean-Michel Dentand; MACK established the First Book Award in 2012 in collaboration with the National Media Museum and the Wilson Centre for Photography.',
    'type': 'ORGANISATION'},
   {'uri': 'https://collection.science

### Run inference

In [14]:
ent_df['pred_proba'] = clf.predict_proba(ent_df)[:,1]

2021-04-30 11:46:11,698 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (1/2)
2021-04-30 11:46:11,784 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (2/2)


In [15]:
ent_df

Unnamed: 0,item_uri,candidate_rank,item_description_with_ent,ent_label,ent_text,ent_sentence,candidate_title,candidate_type,candidate_uri,link_correct,candidate_alias,candidate_description,item_description,pred_proba
0,fake_uri,0.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,MACK,ORGANISATION,https://collection.sciencemuseumgroup.org.uk/people/cp160978,,,"accessed 10 February 2015 2 November 2014 Art and photography publishing house based London, established 2011 by Michael Mack, John Koh and Jean-Michel Dentand; MACK established the First Book Award in 2012 in collaboration with the National Media Museum and the Wilson Centre for Photography.","Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",0.027574
1,fake_uri,1.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Mary,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp93770,,,"duchess from 1477-1482; Duchess of Brabant, Limburg, Lothier, Luxemburg and Guelders; Margravine of Namur; Countess Palatine of Burgundy; Countess of Artois, Flanders, Charolais, Hainaut, Holland, Zeeland and Zutphen; daughter of Charles the Bold","Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",0.008676
2,fake_uri,2.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Mary II,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp97562,,,ODNB: co-regent with William III,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",0.000702
3,fake_uri,3.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Mary Chulkhurst,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp80519,,,,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",2.1e-05
4,fake_uri,4.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Ernst Mach,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp38783,,,,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",1.6e-05
5,fake_uri,5.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Mary Anning,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp39781,,,n 90652738,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",0.000165
6,fake_uri,6.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Mary Munton,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp28541,,,objects 1992-7789 and 2000-7659,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",0.000274
7,fake_uri,7.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,John Mace,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp29112,,,Union List of Artist Names Online : The J. Paul Getty Trust: ID: 500021631.,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",0.00016
8,fake_uri,8.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Wolfgang. Mack,PERSON,https://collection.sciencemuseumgroup.org.uk/people/cp91916,,,See A679186; (German): Wolfgang Mack (,"Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",6.5e-05
9,fake_uri,9.0,"[[Macy]]’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",ORG,Macy,,Mary Solbie,OBJECT,https://collection.sciencemuseumgroup.org.uk/objects/co8432646,,,"A black and white silver gelatin print entitled ""Mary Solbie"" by Huw Davies, taken in Wigan, UK, c 1984. From the Impressions Gallery exhibition 'Roads to Wigan Pier', 20th October - 1st December 1984. nan","Macy’s Day Bird is a collection of threadbare songs, recorded at home, that document the profundity of everyday interactions and chronicle moments of sublime realization",5e-06
