# Creating d3fc data 

Here we create a TSV of UMAP-projected embeddings, alongside different groupings of these embeddings, for a 2D projection visualisation. 

See [./create_colour_mappings_for_vis.ipynb](./create_colour_mappings_for_vis.ipynb) first.

In [93]:
import glob
from pathlib import Path
from typing import List, Optional, Iterable
import json
import re

import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm

tqdm.pandas()
pd.set_option('display.max_colwidth', None)

In [3]:
def paginate_list(l, page_size):
    """Split `l` into consecutive slices of at most `page_size` items each.

    The last slice holds the remainder and may be shorter. An empty `l` gives
    an empty list of pages.
    """
    pages = []
    for start in range(0, len(l), page_size):
        pages.append(l[start : start + page_size])

    return pages

def get_labels(uris: List[str], timeout: float = 120) -> dict:
    """Get labels for URIs using Heritage Connector API.

    Args:
        uris: URIs to look up, sent in a single POST request.
        timeout: seconds to wait for the API before giving up, so that an
            unresponsive endpoint cannot block the notebook indefinitely.

    Returns:
        The decoded JSON response body (the example call in this notebook
        returns a dict mapping each URI to its label).

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """

    hc_api_labels_endpoint = "https://d0rgkq.deta.dev/labels"
    headers = {'Content-Type': 'application/json'}
    body = json.dumps({"uris": uris})
    res = requests.post(hc_api_labels_endpoint, headers=headers, data=body, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON
    res.raise_for_status()

    return res.json()

# Quick check: look up the label of a single Wikidata entity
get_labels(["http://www.wikidata.org/entity/Q3568968"])

{'http://www.wikidata.org/entity/Q3568968': 'William Stanley'}

## 1. Import data 

We load the entity-to-index mapping created by DGL-KE, and the projected embeddings created by running the DGL-KE embeddings through UMAP.

In [4]:
ENT_MAPPING_PATH = "../data/processed/final_model_dglke/entities.tsv"
PROJECTED_EMBEDDINGS_PATH = "../data/processed/final_model_dglke/umap/best_projection_n_neighbours_10.npy"

# The TSV has no header row: the first column becomes the index and the second the entity value
ent_idx_mapping = pd.read_csv(
    ENT_MAPPING_PATH,
    sep="\t",
    index_col=0,
    header=None,
    names=["value"],
    # Entity values are arbitrary strings, so do not let pandas parse literals
    # such as "NA", "null" or "nan" as missing values (they would be blanked below)
    keep_default_na=False,
).fillna("")

# UMAP projections, stored as float32
projs = np.load(PROJECTED_EMBEDDINGS_PATH).astype('float32')

# Both should have the same number of rows (one per entity)
ent_idx_mapping.shape, projs.shape

((645565, 1), (645565, 2))

We also load the various mappings from entities to groups (which will be displayed in different colours in the visualisation). These are created by the notebook [./create_colour_mappings_for_vis.ipynb](./create_colour_mappings_for_vis.ipynb).

In [5]:
MAPPINGS_FOLDER = "../data/processed/embedding_colour_mappings/"

# One DataFrame per TSV in the folder, keyed by file name without extension
mappings = {}
tsv_pattern = MAPPINGS_FOLDER + "*.tsv"

for filename in glob.glob(tsv_pattern):
    cat_name = Path(filename).stem
    loaded_mapping = pd.read_csv(
        filename,
        sep="\t",
        index_col=0,
        names=["value", "group"],
    )
    mappings[cat_name] = loaded_mapping

    print(f"Loaded {filename} to mappings['{cat_name}']")

Loaded ../data/processed/embedding_colour_mappings/mapping_type.tsv to mappings['mapping_type']
Loaded ../data/processed/embedding_colour_mappings/mapping_collection_category.tsv to mappings['mapping_collection_category']
Loaded ../data/processed/embedding_colour_mappings/mapping_database.tsv to mappings['mapping_database']


## 2. Transform data

We want to make a DataFrame we can export as a TSV, with columns:

``` markdown
- id
- label
- collection_category
- type
- x
- y
- index
```

In [6]:
# create `id, index`
# reset_index() moves the entity index into an `index` column; `value` is renamed to `id`.
# Both calls return new DataFrames, so `ent_idx_mapping` itself is left unchanged.
transformed_data = ent_idx_mapping.reset_index().rename(columns={"value": "id"})

transformed_data.head()

Unnamed: 0,index,id
0,0,https://collection.sciencemuseumgroup.org.uk/people/cp28058
1,1,http://www.wikidata.org/entity/Q3568968
2,2,https://collection.sciencemuseumgroup.org.uk/objects/co138741
3,3,plastic
4,4,https://collection.sciencemuseumgroup.org.uk/people/cp28358


In [7]:
# create x, y
# Wrap the projections in a DataFrame and attach them column-wise (rows are matched on index)
projs_df = pd.DataFrame(data=projs, columns=["x", "y"])
transformed_data = pd.concat([transformed_data, projs_df], axis="columns")

transformed_data.head()

Unnamed: 0,index,id,x,y
0,0,https://collection.sciencemuseumgroup.org.uk/people/cp28058,4.982193,4.177696
1,1,http://www.wikidata.org/entity/Q3568968,5.086479,4.084801
2,2,https://collection.sciencemuseumgroup.org.uk/objects/co138741,-9.554541,-5.450275
3,3,plastic,-3.474861,-15.172668
4,4,https://collection.sciencemuseumgroup.org.uk/people/cp28358,2.950962,4.017635


In [8]:
# create mappings cols
MAPPINGS_TO_ADD = ["mapping_collection_category", "mapping_type"]

# Iterate over `mappings` (not MAPPINGS_TO_ADD) so columns are added in the order the files were loaded
for mapping_name, mapping_df in mappings.items():
    if mapping_name not in MAPPINGS_TO_ADD:
        continue

    # column name is the mapping name without its `mapping_` prefix
    new_col_name = mapping_name[len("mapping_"):]
    # assignment aligns the `group` values to `transformed_data` by index
    transformed_data[new_col_name] = mapping_df['group']

transformed_data.head()


Unnamed: 0,index,id,x,y,type,collection_category
0,0,https://collection.sciencemuseumgroup.org.uk/people/cp28058,4.982193,4.177696,Person,Person
1,1,http://www.wikidata.org/entity/Q3568968,5.086479,4.084801,Wikidata,Wikidata
2,2,https://collection.sciencemuseumgroup.org.uk/objects/co138741,-9.554541,-5.450275,Object,Category - Therapeutics
3,3,plastic,-3.474861,-15.172668,,
4,4,https://collection.sciencemuseumgroup.org.uk/people/cp28358,2.950962,4.017635,Organisation,Organisation


In [9]:
# create labels col

def has_probably_got_label(value):
    """Return True if `value` starts with one of the URL prefixes we look labels up for."""
    label_prefixes = (
        "https://collection.sciencemuseumgroup",
        "http://www.wikidata.org/entity",
        "https://blog.sciencemuseum.org.uk/",
        "http://journal.sciencemuseum.ac.uk/",
    )

    # str.startswith accepts a tuple and is True if any prefix matches
    return value.startswith(label_prefixes)

# Only look up ids that match a known URL prefix and belong to at least one group
matches_label_prefix = transformed_data['id'].apply(has_probably_got_label)
in_any_group = transformed_data['type'].notna() | transformed_data['collection_category'].notna()
ids_for_label_lookup = transformed_data.loc[matches_label_prefix & in_any_group, "id"].tolist()

# Query the labels API in pages of 5000 ids and merge the responses
id_label_mapping = {}

for page in tqdm(paginate_list(ids_for_label_lookup, 5000)):
    page_labels = get_labels(page)
    id_label_mapping.update(page_labels)

# ids missing from the mapping get NaN labels
transformed_data['label'] = transformed_data['id'].map(id_label_mapping)

transformed_data.head()

  0%|          | 0/87 [00:00<?, ?it/s]

KeyboardInterrupt: 

### 2.1 Add images 

In [87]:
def url_to_qid(url: str) -> Optional[str]:
    """Return the single Wikidata QID found in `url`, or None if there are zero or several."""
    matches = re.findall(r"(Q\d+)", url)

    return matches[0] if len(matches) == 1 else None
    
def qid_to_url(qid: str) -> str:
    """Build the Wikidata entity URL for a QID."""
    return "http://www.wikidata.org/entity/{}".format(qid)

def wikidata_image_name_to_url(imgname: str) -> str:
    """Return the Wikimedia Commons `File:` page URL for an image name, with spaces replaced by underscores."""
    return "https://commons.wikimedia.org/wiki/File:" + imgname.replace(" ", "_")
    
def _param_join(params: List[str]) -> str:
    """
    Joins list of parameters for the URL. ['a', 'b'] -> "a%7Cb"
    """

    return "%7C".join(params) if len(params) > 1 else params[0]

def get_image_link_wikidata_entities(urls: Iterable[str]) -> dict:
    """Look up the image (P18) claim for each Wikidata entity URL.

    QIDs are extracted from `urls` and requested from the Wikidata
    `wbgetentities` API in pages of 50.

    Args:
        urls: entity URLs; any URL that `url_to_qid` cannot resolve to a
            single QID is skipped.

    Returns:
        A dict keyed by entity URL (rebuilt from each QID in the response) with
        value: the Commons file page URL of the first P18 claim; None if the
        entity has no P18 claim; or the string "ERROR" if the claim could not
        be read.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if a request gets no response within 60 seconds.
    """
    urls_to_qids = {url: url_to_qid(url) for url in urls}
    qids_lookup = [qid for qid in urls_to_qids.values() if qid is not None]
    qids_lookup_paginated = paginate_list(qids_lookup, 50)
    results = {}

    for page in tqdm(qids_lookup_paginated):
        response = requests.get(
            f"https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids={_param_join(page)}&languages=en&languagefallback=1&formatversion=2",
            timeout=60,
        )
        # Surface HTTP errors directly rather than as a confusing JSON/KeyError below
        response.raise_for_status()
        response_json = response.json()

        for qid, data in response_json['entities'].items():
            ent_url = qid_to_url(qid)

            if "P18" not in data.get('claims', {}):
                results[ent_url] = None
                continue

            try:
                imgname = data['claims']['P18'][0]['mainsnak']['datavalue']['value']
                results[ent_url] = wikidata_image_name_to_url(imgname)
            except (KeyError, IndexError, TypeError, AttributeError):
                # Malformed/incomplete claim: record it and carry on. Catching only
                # these keeps Ctrl-C (KeyboardInterrupt) working during the long loop.
                print(f"{ent_url} failed")
                results[ent_url] = "ERROR"

    return results


In [88]:
# for Wikidata
print("getting images for Wikidata pages")
# regex=False: match the substring literally (otherwise `.` is a regex wildcard)
is_wikidata_entity = transformed_data['id'].str.contains('wikidata.org/entity', regex=False)
all_qids = transformed_data.loc[is_wikidata_entity, 'id'].tolist()
qid_image_mapping = get_image_link_wikidata_entities(all_qids)
# `na_action` only accepts None or 'ignore'; the string 'none' is rejected by newer pandas,
# so rely on the default (None). ids absent from the mapping get NaN.
transformed_data['image_link'] = transformed_data['id'].map(qid_image_mapping)

getting images for Wikidata pages


  0%|          | 0/1654 [00:00<?, ?it/s]

http://www.wikidata.org/entity/Q13780930 failed


In [97]:
# for SMG
# TODO: get these from a manually generated mapping instead
# NOTE: placeholder - this cell only prints a message; it does not add any SMG image links
print("getting images for SMG pages")



getting images for SMG pages


## 3. Export data

We remove all rows which have a NaN value for both the `type` and `collection_category` columns as these will never show on the plot. The javascript powering the visualisation will still need to check for NaN values.

In [98]:
# Keep a row if at least one of the two grouping columns is set; only rows where BOTH are
# NaN are dropped (the original `&` also dropped rows where just one of them was NaN).
has_any_group = transformed_data['collection_category'].notna() | transformed_data['type'].notna()
export_data = transformed_data[has_any_group]
len(transformed_data), len(export_data)

(645565, 433571)

In [99]:
# Write the visualisation data as a tab-separated file, without the DataFrame's index
export_data.to_csv(
    path_or_buf="../data/processed/final_model_dglke/umap/visualisation_data_n_neighbours_10_with_wikidata_image_links.tsv",
    index=False,
    sep="\t",
)