In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../../..")

from heritageconnector import datastore
from heritageconnector.config import field_mapping
from heritageconnector.best_spacy_pipeline import load_model
from heritageconnector.datastore import es, index
from smg_jobs.smg_loader import preprocess_text_for_ner

import entity_linker

import pandas as pd
# Lift pandas' display limits: None removes the cap on the number of rows shown
# and on the width of each cell, so long description text is not truncated.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

2021-03-23 16:21:31,935 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


## NERLoader

In [2]:
# Document fields handed to the NER loader. The source and target descriptions
# are read from one and the same field.
target_description_field = "data.http://www.w3.org/2001/XMLSchema#description"
source_description_field = target_description_field
target_title_field = "graph.@rdfs:label.@value"
target_alias_field = "graph.@skos:altLabel.@value"
target_type_field = "graph.@skos:hasTopConcept.@value"

record_loader = datastore.RecordLoader("SMG", field_mapping)

# Source and target are the same Elasticsearch index in this notebook.
ner_loader = datastore.NERLoader(
    record_loader,
    # where entity mentions are read from
    source_es_index="heritageconnector_test",
    source_description_field=source_description_field,
    # where link candidates are looked up
    target_es_index="heritageconnector_test",
    target_title_field=target_title_field,
    target_alias_field=target_alias_field,
    target_type_field=target_type_field,
    target_description_field=target_description_field,
    # linking options
    entity_types_to_link={"PERSON", "OBJECT", "ORG"},
    text_preprocess_func=preprocess_text_for_ner,
)

In [3]:
# Fetch documents and run NER over them with the "en_core_web_trf" model.
# NOTE(review): the positional 100 is presumably a cap on the number of
# documents sampled — confirm against the method signature.
entity_list = ner_loader.get_list_of_entities_from_es(
    "en_core_web_trf",
    100,
    random_seed=420,  # fixed seed so the same run can be repeated
)
# Number of entities returned.
len(entity_list)

2021-03-23 16:21:32,931 - heritageconnector.datastore - INFO - Fetching docs and running NER.
spacy tried to use GPU but failed
2021-03-23 16:21:36,923 - hc_nlp.pipeline - INFO - Loading thesaurus from ../../../heritageconnector/../GITIGNORE_DATA/labels_all_unambiguous_types_people_orgs.jsonl
2021-03-23 16:21:38,320 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 1s


0it [00:00, ?it/s]

463

In [4]:
# Ask the loader for link candidates, requesting 10 per entity mention.
entity_list_with_links = ner_loader.get_link_candidates(
    candidates_per_entity_mention=10,
)

2021-03-23 16:21:51,361 - heritageconnector.datastore - INFO - Getting link candidates for each of 463 entities


  0%|          | 0/463 [00:00<?, ?it/s]

In [5]:
# DataFrame view of the loader's entity list; display its columns and shape.
ent_df = ner_loader.entity_list_as_dataframe
(ent_df.columns, ent_df.shape)

(Index(['item_uri', 'candidate_rank', 'item_description_with_ent', 'ent_label',
        'ent_text', 'ent_sentence', 'candidate_title', 'candidate_type',
        'candidate_uri', 'link_correct', 'candidate_alias',
        'candidate_description', 'item_description'],
       dtype='object'),
 (1349, 13))

In [6]:
# Peek at the first row only; descriptions are long and display limits are lifted.
ent_df.head(n=1)

Unnamed: 0,item_uri,candidate_rank,item_description_with_ent,ent_label,ent_text,ent_sentence,candidate_title,candidate_type,candidate_uri,link_correct,candidate_alias,candidate_description,item_description
0,https://collection.sciencemuseumgroup.org.uk/people/cp11429,,"born [[1925]] , active 1940s-60s, merchant navy wireless, radar and radio telegraphy operator, British",DATE,1925,"born 1925 , active 1940s-60s, merchant navy wireless, radar and radio telegraphy operator, British",,,,,,,"born 1925 , active 1940s-60s, merchant navy wireless, radar and radio telegraphy operator, British"


In [7]:
# Build the table of link data that the loader prepares for review.
review_df = ner_loader.get_links_data_for_review()
# Optional steps, left disabled: preview the table, or export it as CSV.
# review_df.head()
# review_df.to_csv("./review_data.csv")

In [8]:
# Optional step, left disabled: export the review table to Excel.
# NOTE(review): a later cell reads review_data_1103.xlsx from GITIGNORE_DATA/NEL —
# presumably an annotated copy of this export; confirm before re-enabling.
# review_df.to_excel("./review_data_1103.xlsx")

## Train entity linker and push entities to Elasticsearch index

In [9]:
# Load the review spreadsheet; its first column becomes the index.
df = pd.read_excel("../../../GITIGNORE_DATA/NEL/review_data_1103.xlsx", index_col=0)

# Convert the filled-in link_correct values to int; blank rows stay blank.
df.loc[df["link_correct"].notnull(), "link_correct"] = (
    df["link_correct"].dropna().apply(int)
)

# Keep rows that have a link_correct value and whose candidate_rank is not -1.
df_annotated = df[df["link_correct"].notnull() & df["candidate_rank"].ne(-1)]



In [10]:
# Train the entity linker on the annotated rows and keep the returned object
# (presumably the fitted classifier — confirm against train_entity_linker).
clf = ner_loader.train_entity_linker(df_annotated)

2021-03-23 16:22:01,376 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-23 16:22:03,842 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (1/2)
2021-03-23 16:22:07,075 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (2/2)


In [11]:
# Sanity check: the loader should now report that it holds a trained linker.
ner_loader.has_trained_linker

True

In [13]:
# Write the entities to the Elasticsearch index, using 0.5 as the linking
# confidence threshold.
# NOTE(review): a linker has already been trained at this point, so
# force_load_without_linker=True looks redundant — confirm it is still needed.
ner_loader.load_entities_into_es(
    force_load_without_linker=True,
    linking_confidence_threshold=0.5,
)

2021-03-23 16:23:42,833 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-23 16:23:42,833 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-23 16:23:42,954 - heritageconnector.datastore - INFO - Loading 463 entities into heritageconnector_test
2021-03-23 16:23:42,954 - heritageconnector.datastore - INFO - Loading 463 entities into heritageconnector_test
2021-03-23 16:23:43,300 - heritageconnector.datastore - INFO - Loading entity mentions with no link candidates by type...
2021-03-23 16:23:43,300 - heritageconnector.datastore - INFO - Loading entity mentions with no link candidates by type...


  0%|          | 0/8 [00:00<?, ?ent type/s]

2021-03-23 16:23:44,006 - heritageconnector.datastore - INFO - Predicting links for entity mentions with link candidates and loading them in, in batches of 32768...
2021-03-23 16:23:44,006 - heritageconnector.datastore - INFO - Predicting links for entity mentions with link candidates and loading them in, in batches of 32768...


  0%|          | 0/1 [00:00<?, ?batch/s]

2021-03-23 16:23:54,052 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (1/2)
2021-03-23 16:23:56,744 - heritageconnector.nlp.nel - DEBUG - Calculating sBERT embeddings... (2/2)
