In [1]:
%load_ext autoreload
%autoreload 2
import sys

import numpy as np
import pandas as pd

# Put the directory three levels up on the import path before the
# project-local imports below are resolved.
sys.path.append("../../..")

from heritageconnector import datastore
from heritageconnector.config import field_mapping
from heritageconnector.best_spacy_pipeline import load_model
from heritageconnector.datastore import es, index
from smg_jobs.smg_loader import preprocess_text_for_ner

import entity_linker

# Lift pandas' display truncation: None means "no limit" for both the number of
# rows shown and the width of each cell's text.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

2021-03-18 10:03:43,928 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


## NERLoader: run NER and collect link candidates

In [2]:
# Field names passed to NERLoader below. The description field is the same on
# the source side and the target side.
source_description_field = "data.http://www.w3.org/2001/XMLSchema#description"
target_description_field = source_description_field
target_title_field = "graph.@rdfs:label.@value"
target_alias_field = "graph.@skos:altLabel.@value"
target_type_field = "graph.@skos:hasTopConcept.@value"

record_loader = datastore.RecordLoader("SMG", field_mapping)

# Source and target point at the same index in this notebook.
ner_loader = datastore.NERLoader(
    record_loader,
    source_es_index="heritageconnector",
    target_es_index="heritageconnector",
    source_description_field=source_description_field,
    target_title_field=target_title_field,
    target_description_field=target_description_field,
    target_type_field=target_type_field,
    target_alias_field=target_alias_field,
    entity_types_to_link={"PERSON", "OBJECT", "ORG"},
    text_preprocess_func=preprocess_text_for_ner,
)

In [3]:
# Fetch documents and run NER with the "en_core_web_trf" spaCy model.
# NOTE(review): 500 is presumably the number of documents to sample and
# random_seed fixes which ones are drawn — confirm against NERLoader.
entity_list = ner_loader.get_list_of_entities_from_es("en_core_web_trf", 500, random_seed=420)
len(entity_list)

2021-03-18 10:03:46,393 - heritageconnector.datastore - INFO - Fetching docs and running NER.
spacy tried to use GPU but failed
2021-03-18 10:03:51,249 - hc_nlp.pipeline - INFO - Loading thesaurus from ../../../heritageconnector/../GITIGNORE_DATA/labels_all_unambiguous_types_people_orgs.jsonl
2021-03-18 10:03:52,607 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 1s


0it [00:00, ?it/s]

1340

In [5]:
# Look up link candidates for each extracted entity, requesting 10 per mention.
entity_list_with_links = ner_loader.get_link_candidates(candidates_per_entity_mention=10)

2021-03-18 10:04:38,710 - heritageconnector.datastore - INFO - Getting link candidates for each of 1340 entities


  0%|          | 0/1340 [00:00<?, ?it/s]

In [97]:
# Tabular view of the entities and their candidates held on the loader;
# display its column names and shape.
ent_df = ner_loader.entity_list_as_dataframe
ent_df.columns, ent_df.shape

(Index(['item_uri', 'candidate_rank', 'item_description_with_ent', 'ent_label',
        'ent_text', 'ent_sentence', 'candidate_title', 'candidate_type',
        'candidate_uri', 'link_correct', 'candidate_alias',
        'candidate_description', 'item_description'],
       dtype='object'),
 (6042, 13))

In [102]:
# Build the table of candidate links intended for review.
review_df = ner_loader.get_links_data_for_review()
# Optional, left disabled: preview the table or write it to CSV.
# review_df.head()
# review_df.to_csv("./review_data.csv")

2021-03-18 11:40:34,039 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 11:40:34,039 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 11:40:34,039 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 11:40:34,039 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 11:40:34,039 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 11:40:34,039 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34

In [6]:
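# One-off export of the review table to Excel, left commented out (presumably so
# that re-running the notebook does not overwrite an existing spreadsheet).
# NOTE: the next section reads a file with the same name from
# ../../../GITIGNORE_DATA/NEL/, so the export appears to have been moved there
# after annotation.
# review_df.to_excel("./review_data_1103.xlsx")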

## Train entity linker and push entities to Elasticsearch index

In [24]:
# Load the reviewed link candidates; the first spreadsheet column is the index.
df = pd.read_excel("../../../GITIGNORE_DATA/NEL/review_data_1103.xlsx", index_col=0)

# Rows with a label in `link_correct` are the annotated ones.
is_annotated = df["link_correct"].notna()
# NOTE(review): candidate_rank == -1 presumably marks a placeholder row with no
# real candidate — confirm against NERLoader.get_links_data_for_review.
has_valid_rank = df["candidate_rank"] != -1

# Cast the labels to int on the filtered copy rather than writing them back
# into `df` with `.loc`: a column that still contains NaN stays float64 after a
# partial assignment, so the previous in-place cast left the labels as floats.
# Filtering first guarantees there is no NaN left when the cast happens, and
# `df` itself is left exactly as loaded.
df_annotated = df.loc[is_annotated & has_valid_rank].astype({"link_correct": int})


In [139]:
# Train the entity linker on the annotated rows and keep the returned object.
clf = ner_loader.train_entity_linker(df_annotated)

2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,065 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-18 13:22:35,0

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2021-03-18 13:22:40,817 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (2/2)
2021-03-18 13:22:40,817 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (2/2)
2021-03-18 13:22:40,817 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (2/2)


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

In [140]:
# Display the classifier pipeline held on the loader.
ner_loader.clf

Pipeline(steps=[('featgen', NELFeatureGenerator()),
                ('classifier', MLPClassifier(max_iter=1000, random_state=42))])

In [116]:
# Load the entities into the index, passing 0.5 as the linking confidence threshold.
# NOTE(review): presumably entities whose best candidate scores below the
# threshold are loaded unlinked — confirm against NERLoader.
ner_loader.load_entities_into_es(linking_confidence_threshold=0.5)

2021-03-18 12:18:13,060 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 12:18:13,060 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 12:18:13,060 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 12:18:13,060 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 12:18:13,060 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/
2021-03-18 12:18:13,060 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34

Batches:   0%|          | 0/164 [00:00<?, ?it/s]

2021-03-18 12:21:02,555 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (2/2)


Batches:   0%|          | 0/164 [00:00<?, ?it/s]

2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 linked and 1156 unlinked entities into heritageconnector
2021-03-18 12:23:34,061 - heritageconnector.datastore - INFO - Loading 261 l

In [128]:
# Sorted distinct item descriptions, plus for every row of `ent_df` the position
# of its description within `unique_vals` (so unique_vals[unique_indices]
# reconstructs the original column). Uses numpy, imported as `np`.
unique_vals, unique_indices = np.unique(
    ent_df["item_description"],
    return_inverse=True,
)

In [129]:
# Number of rows in the entity/candidate table.
len(ent_df)

6042