In [1]:
# %load_ext autoreload
# %autoreload 2
import sys
sys.path.append("../..")

from heritageconnector import datastore
from heritageconnector.config import field_mapping
from heritageconnector.best_spacy_pipeline import load_model
from heritageconnector.datastore import es, index
from smg_jobs.smg_loader import preprocess_text_for_ner

import numpy as np

from elasticsearch import helpers, Elasticsearch
from itertools import islice

from typing import Optional, Generator, List, Tuple

import pandas as pd
# Display dataframes without truncation: None removes the pandas limit on rows shown and on column width.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

2021-03-19 15:35:12,230 - heritageconnector.datastore - DEBUG - Connected to Elasticsearch cluster at https://c4241473f2f84e2ab15c5b7c81eb34a4.eu-west-1.aws.found.io:9243/


In [2]:
# Load the spaCy pipeline; it is used further down (`nlp.pipe`) to build the docs that displaCy renders.
nlp = load_model("en_core_web_trf")

spacy tried to use GPU but failed
2021-03-19 15:35:16,314 - hc_nlp.pipeline - INFO - Loading thesaurus from ../../heritageconnector/../GITIGNORE_DATA/labels_all_unambiguous_types_people_orgs.jsonl
2021-03-19 15:35:17,659 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 1s


In [3]:
# Demo category -> list of collection categories that are grouped under it.
category_mapping = {
    "Textiles": ["Textile Industry", "Textiles Machinery"],
    "Energy": [
        "Coal Mining",
        "Electricity Supply",
        "Environmental Science & Technology",
        "Gas Industry",
        "Heat Engines (non steam)",
        "Heating, Cooling and Ventilation",
        "James Watt's Garret Workshop",
        "Lighting",
        "Mining & Ore Dressing",
        "Motive Power",
        "Nuclear Physics",
        "Stationary Engines",
    ],
    "Communications": [
        "Cinematograph",
        "Electronic Component",
        # NOTE(review): the parenthesis is unbalanced — confirm this matches the stored category value
        "Pictorial Collection (Railway",
        "Printing & Writing",
        "Radio Communication",
        "Sound Reproduction",
        "Television",
    ],
    "General": ["Archive Collections", "Art", "Local History", "Photographs"],
}

# Inverse lookup: collection category -> demo category.
category_mapping_reversed = {}
for demo_category, collection_categories in category_mapping.items():
    for collection_category in collection_categories:
        category_mapping_reversed[collection_category] = demo_category

# Flat list of every collection category, in the order they are declared above.
categories = [
    collection_category
    for collection_categories in category_mapping.values()
    for collection_category in collection_categories
]

len(categories)

25

In [4]:
# the below class inherits from datastore.NERLoader but changes _get_doc_generator so that it only gets objects from within 
# the categories listed above

class NERLoader(datastore.NERLoader):
    """
    `datastore.NERLoader` with `_get_doc_generator` overridden so that only documents belonging to one of the
    collection categories in the module-level `categories` list are fetched.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _get_doc_generator(
        self,
        index: str,
        limit: Optional[int] = None,
        random_sample: bool = True,
        random_seed: int = 42,
    ) -> Generator[Tuple[str, str], None, None]:
        """
        Returns a generator of document IDs and preprocessed descriptions from the Elasticsearch index, limited
            according to `limit`. Only documents that have an XSD.description value and whose
            `graph.@sdo:isPartOf.@value.keyword` is one of the values in `categories` are returned.

        Args:
            index (str): name of the Elasticsearch index to read documents from.
            limit (Optional[int], optional): limit the number of documents to get and therefore load. A falsy value
                (None or 0) means no limit. Defaults to None.
            random_sample (bool, optional): whether to take documents at random. When False the documents are
                returned in the order Elasticsearch produces for the unscored query. Defaults to True.
            random_seed (int, optional): random seed to use if random sampling is enabled using the `random_sample` parameter. Defaults to 42.

        Returns:
            Generator[Tuple[str, str]]: generator of `(document ID, description)` tuples, one per document, where
                the description has been passed through `self.text_preprocess_func`. The documents are not batched.
        """

        description_field = "http://www.w3.org/2001/XMLSchema#description"

        # a document must have a description AND belong to one of the selected categories
        filter_query = {
            "bool": {
                "must": [
                    {"exists": {"field": f"data.{description_field}"}},
                    {"terms": {"graph.@sdo:isPartOf.@value.keyword": categories}},
                ]
            }
        }

        if random_sample:
            # a seeded random score gives a shuffled document order that is tied to `random_seed`
            es_query = {
                "query": {
                    "function_score": {
                        "query": filter_query,
                        "random_score": {"seed": random_seed, "field": "_seq_no"},
                    }
                }
            }
        else:
            es_query = {"query": filter_query}

        # preserve_order=True keeps the order defined by the query, so that `limit` takes the first documents
        # of the (optionally shuffled) result set
        hits = helpers.scan(
            client=es,
            index=index,
            query=es_query,
            preserve_order=True,
        )

        if limit:
            hits = islice(hits, limit)

        return (
            (
                hit["_id"],
                self.text_preprocess_func(hit["_source"]["data"][description_field]),
            )
            for hit in hits
        )



In [5]:
# Elasticsearch field paths used by the loader; source and target descriptions are read from the same field.
target_description_field = "data.http://www.w3.org/2001/XMLSchema#description"
source_description_field = target_description_field
target_title_field = "graph.@rdfs:label.@value"
target_alias_field = "graph.@skos:altLabel.@value"
target_type_field = "graph.@skos:hasTopConcept.@value"

record_loader = datastore.RecordLoader("SMG", field_mapping)

# The same index is used as both source and target.
ner_loader = NERLoader(
    record_loader=record_loader,
    source_es_index="heritageconnector",
    source_description_field=source_description_field,
    target_es_index="heritageconnector",
    target_title_field=target_title_field,
    target_alias_field=target_alias_field,
    target_description_field=target_description_field,
    target_type_field=target_type_field,
    entity_types_to_link={"PERSON", "OBJECT", "ORG"},
    text_preprocess_func=preprocess_text_for_ner,
)

In [6]:
# Run NER with the named spaCy model; the second argument (5000) is presumably the document limit —
# TODO confirm against datastore.NERLoader.
ner_loader.get_list_of_entities_from_es("en_core_web_trf", 5000)
# The argument (10) is presumably the number of candidates retrieved per entity — TODO confirm. The return
# value is discarded; later cells read the results from `ner_loader.entity_list_as_dataframe`.
_ = ner_loader.get_link_candidates(10)


2021-03-19 15:35:17,686 - heritageconnector.datastore - INFO - Fetching docs and running NER.
spacy tried to use GPU but failed
2021-03-19 15:35:25,288 - hc_nlp.pipeline - INFO - Loading thesaurus from ../../heritageconnector/../GITIGNORE_DATA/labels_all_unambiguous_types_people_orgs.jsonl
2021-03-19 15:35:26,773 - hc_nlp.pipeline - INFO - 17016 term thesaurus imported in 1s


0it [00:00, ?it/s]

2021-03-19 15:44:25,725 - heritageconnector.datastore - INFO - Getting link candidates for each of 15215 entities


  0%|          | 0/15215 [00:00<?, ?it/s]

In [7]:
def train_entity_linker(annotations_path: str = "../../GITIGNORE_DATA/NEL/review_data_1103.xlsx"):
    """
    Trains the entity linker of the module-level `ner_loader` on reviewed link candidates read from an Excel file.

    Args:
        annotations_path (str, optional): path of the Excel file with the reviewed candidates. Its first column
            is used as the index and it must contain `link_correct` and `candidate_rank` columns. Defaults to
            "../../GITIGNORE_DATA/NEL/review_data_1103.xlsx".

    Returns:
        The value returned by `ner_loader.train_entity_linker`.
    """
    review_df = pd.read_excel(annotations_path, index_col=0)

    # rows that a reviewer has labelled
    is_annotated = review_df["link_correct"].notna()
    review_df.loc[is_annotated, "link_correct"] = review_df.loc[is_annotated, "link_correct"].apply(int)

    # train on labelled rows only, leaving out rows whose candidate_rank is -1
    # NOTE(review): -1 presumably marks entities without a candidate — confirm
    df_annotated = review_df[is_annotated & (review_df["candidate_rank"] != -1)]

    return ner_loader.train_entity_linker(df_annotated)

clf = train_entity_linker()

2021-03-19 16:05:38,551 - heritageconnector.datastore - INFO - Training entity linker...
2021-03-19 16:05:41,226 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (1/2)


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2021-03-19 16:05:44,607 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (2/2)


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

In [8]:
# get entities and split by those with candidates and those without
ent_df = ner_loader.entity_list_as_dataframe
has_candidate = ent_df["candidate_rank"].notna()
# explicit copies: columns are added to these subsets later, which on a filtered slice of `ent_df` would
# trigger pandas' SettingWithCopyWarning
ents_with_candidates = ent_df[has_candidate].copy()
ents_without_candidates = ent_df[~has_candidate].copy()

In [9]:
# predict whether each link candidate is an actual link for the entity
# column 1 of predict_proba is taken as the score of the positive class — assumes `clf` follows the
# scikit-learn convention; TODO confirm
y_pred = clf.predict_proba(ents_with_candidates)[:, 1]
# `assign` returns a new dataframe, so the scores are never written into a filtered slice of `ent_df`
ents_with_candidates = ents_with_candidates.assign(y_pred=y_pred)

2021-03-19 16:08:26,319 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (1/2)


Batches:   0%|          | 0/104 [00:00<?, ?it/s]

2021-03-19 16:10:17,732 - heritageconnector.nlp.nel - INFO - Calculating sBERT embeddings... (2/2)


Batches:   0%|          | 0/788 [00:00<?, ?it/s]

In [10]:
# concatenate dataframes: entities with candidates (scored in `y_pred`) followed by entities without candidates,
# which carry no `y_pred` value
ents_df_with_candidates_and_preds = pd.concat([ents_with_candidates, ents_without_candidates])


In [11]:
# create one spaCy doc per distinct item description so that each description is only processed once;
# the docs themselves are not stored in the dataframe (see the disabled alternatives below)

# `desc_unique` holds the distinct descriptions and `desc_unique_indices` maps every dataframe row to the
# position of its description in `desc_unique`
desc_unique, desc_unique_indices = np.unique(ents_df_with_candidates_and_preds.item_description.values, return_inverse=True)
# dtype=object stores the Doc objects themselves as array elements, so the array can be indexed per row
docs_unique = np.array(list(nlp.pipe(desc_unique)), dtype=object)

# disabled alternative 1: spacy docs in dataframe approach
# ents_df_with_candidates_and_preds['item_description_spacy_doc'] = docs_unique[desc_unique_indices]

# disabled alternative 2: DocBin approach
# from spacy.tokens import DocBin
# doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], store_user_data=True)

# for doc in docs_unique[desc_unique_indices]:
#     doc_bin.add(doc)

# doc_bin.to_disk('docs.spacy')

In [15]:
from spacy import displacy
# render the entities of every unique doc to an HTML string; jupyter=False makes `render` return the markup
# instead of displaying it in the notebook
html_list = [displacy.render(doc, style='ent', jupyter=False) for doc in docs_unique]

In [16]:
# sanity check: there is one HTML string per unique doc; also preview the first rendered string
len(html_list) == len(docs_unique), html_list[0]

(True,
 '<div class="entities" style="line-height: 2.5; direction: ltr">&quot;A peep at the gas lights in \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Pall Mall\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">FAC</span>\n</mark>\n&quot; \n<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    1807\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>\n</mark>\n, \n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Rowlandson\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-le

In [20]:
# map the HTML of each unique description back onto every dataframe row.
# dtype=object keeps references to the existing Python strings; without it NumPy builds a fixed-width unicode
# array in which every element is padded to the length of the longest HTML string
ents_df_with_candidates_and_preds["ent_html"] = np.array(html_list, dtype=object)[desc_unique_indices]

In [22]:
# print the stored entity markup of the first row; despite its name, `doc` is the HTML string from the
# `ent_html` column and not a spaCy Doc
doc = ents_df_with_candidates_and_preds['ent_html'].iloc[0]
print(doc)


<div class="entities" style="line-height: 2.5; direction: ltr">A photograph of the crowd at an 
<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
    Arsenal
    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>
</mark>
 trial match, taken by 
<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
    James Jarché
    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
</mark>
 and published in 
<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
    Weekly Illustrated
    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertic

In [23]:
# preview a single row to check that the `y_pred` and `ent_html` columns are present
ents_df_with_candidates_and_preds.head(1)

Unnamed: 0,item_uri,candidate_rank,item_description_with_ent,ent_label,ent_text,ent_sentence,candidate_title,candidate_type,candidate_uri,link_correct,candidate_alias,candidate_description,item_description,y_pred,ent_html
0,https://collection.sciencemuseumgroup.org.uk/objects/co8223447,0.0,"A photograph of the crowd at an [[Arsenal]] trial match, taken by James Jarché and published in Weekly Illustrated.",ORG,Arsenal,"A photograph of the crowd at an Arsenal trial match, taken by James Jarché and published in Weekly Illustrated.",Arsenal De Rochefort,ORGANISATION,https://collection.sciencemuseumgroup.org.uk/people/cp136697,,,"UNESCO, World Heritage Sites, Tentative List, available online at: Wreck Site, available online at: Created ex nihilo in 1666 by Louis XIV on the advice of Colbert, to protect the French coastline from attack and to build ships for the French naval fleet. By 1926, it no longer met the demands of a modern naval fleet and was closed.","A photograph of the crowd at an Arsenal trial match, taken by James Jarché and published in Weekly Illustrated.",0.034337,"<div class=""entities"" style=""line-height: 2.5; direction: ltr"">A photograph of the crowd at an \n<mark class=""entity"" style=""background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"">\n Arsenal\n <span style=""font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem"">ORG</span>\n</mark>\n trial match, taken by \n<mark class=""entity"" style=""background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"">\n James Jarché\n <span style=""font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem"">PERSON</span>\n</mark>\n and published in \n<mark class=""entity"" style=""background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"">\n Weekly Illustrated\n <span style=""font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem"">ORG</span>\n</mark>\n.</div>"


In [24]:
# add the catalogue title and category of each item to the dataframe
catalogue_df = pd.read_csv("../../GITIGNORE_DATA/smg-datasets-private/mimsy-catalogue-export.csv", usecols=["MKEY", "CATEGORY1", "TITLE"])
# turn the catalogue key into the item URI so it can be joined on `item_uri`
catalogue_df["MKEY"] = "https://collection.sciencemuseumgroup.org.uk/objects/co" + catalogue_df["MKEY"].astype(str)
# keep the text between the first and second " - " separator. The `.str` accessor leaves missing values and
# values without a separator as NaN instead of raising, which the per-row `x.split(" - ")[1]` lambda did
catalogue_df["CATEGORY1"] = catalogue_df["CATEGORY1"].str.split(" - ").str[1].str.strip()

# merge
# NOTE(review): a left merge repeats entity rows if the catalogue contains duplicate MKEY values — confirm
# that MKEY is unique in the export
ents_df_with_candidates_and_preds = ents_df_with_candidates_and_preds.merge(catalogue_df, how='left', left_on='item_uri', right_on='MKEY')
ents_df_with_candidates_and_preds["demo_category"] = ents_df_with_candidates_and_preds["CATEGORY1"].map(category_mapping_reversed)

# number of entity rows per collection category (rows) and demo category (columns); only `item_uri` is counted
ents_df_with_candidates_and_preds.groupby(["demo_category", "CATEGORY1"])["item_uri"].count().unstack().fillna("-").T

demo_category,Communications,Energy,General,Textiles
CATEGORY1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Archive Collections,-,-,172.0,-
Art,-,-,21580.0,-
Coal Mining,-,759.0,-,-
Electricity Supply,-,1992.0,-,-
Environmental Science & Technology,-,216.0,-,-
Gas Industry,-,710.0,-,-
Heat Engines (non steam),-,160.0,-,-
"Heating, Cooling and Ventilation",-,101.0,-,-
James Watt's Garret Workshop,-,68.0,-,-
Lighting,-,676.0,-,-


In [25]:
# preview a single row to check that the MKEY, TITLE, CATEGORY1 and demo_category columns are present
ents_df_with_candidates_and_preds.head(1)

Unnamed: 0,item_uri,candidate_rank,item_description_with_ent,ent_label,ent_text,ent_sentence,candidate_title,candidate_type,candidate_uri,link_correct,candidate_alias,candidate_description,item_description,y_pred,ent_html,MKEY,TITLE,CATEGORY1,demo_category
0,https://collection.sciencemuseumgroup.org.uk/objects/co8223447,0.0,"A photograph of the crowd at an [[Arsenal]] trial match, taken by James Jarché and published in Weekly Illustrated.",ORG,Arsenal,"A photograph of the crowd at an Arsenal trial match, taken by James Jarché and published in Weekly Illustrated.",Arsenal De Rochefort,ORGANISATION,https://collection.sciencemuseumgroup.org.uk/people/cp136697,,,"UNESCO, World Heritage Sites, Tentative List, available online at: Wreck Site, available online at: Created ex nihilo in 1666 by Louis XIV on the advice of Colbert, to protect the French coastline from attack and to build ships for the French naval fleet. By 1926, it no longer met the demands of a modern naval fleet and was closed.","A photograph of the crowd at an Arsenal trial match, taken by James Jarché and published in Weekly Illustrated.",0.034337,"<div class=""entities"" style=""line-height: 2.5; direction: ltr"">A photograph of the crowd at an \n<mark class=""entity"" style=""background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"">\n Arsenal\n <span style=""font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem"">ORG</span>\n</mark>\n trial match, taken by \n<mark class=""entity"" style=""background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"">\n James Jarché\n <span style=""font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem"">PERSON</span>\n</mark>\n and published in \n<mark class=""entity"" style=""background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;"">\n Weekly Illustrated\n <span style=""font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 
0.5rem"">ORG</span>\n</mark>\n.</div>",https://collection.sciencemuseumgroup.org.uk/objects/co8223447,Crowd watching Arsenal trial match,Photographs,General


In [26]:
# export the final dataframe to pickle (written to the notebook's working directory)
# NOTE: pickle files should only ever be loaded from a trusted source
ents_df_with_candidates_and_preds.to_pickle("demo_data.pkl")

In [27]:
# also export the same dataframe to a gzip-compressed parquet file
ents_df_with_candidates_and_preds.to_parquet('demo_data.parquet.gzip', compression='gzip')