## Creating colour mappings for vis

Here, we use Python and SPARQL to create mappings from entities to their different properties, which we can use to colour an embedding visualisation.

In [20]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import os
import csv
from tqdm.auto import tqdm

def make_sparql_request(query, endpoint="http://63.33.68.17:3030/heritage-connector/sparql"):
    """Run a SPARQL query against an endpoint and return the converted JSON result.

    Args:
        query: SPARQL query string to execute.
        endpoint: URL of the SPARQL endpoint. Defaults to the Heritage
            Connector endpoint used throughout this notebook, so existing
            single-argument calls behave exactly as before.

    Returns:
        Whatever ``SPARQLWrapper.query().convert()`` yields when the return
        format is JSON. The cells below read ``['results']['bindings']``
        from it.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query)
    # The query is sent in a POST request rather than SPARQLWrapper's default method.
    sparql.setMethod("POST")
    sparql.setReturnFormat(JSON)

    return sparql.query().convert()

In [21]:
# Entity mapping file for each dataset this notebook can be run against.
ENT_MAPPING_PATHS = {
    "SMG": "../data/processed/final_model_dglke/entities.tsv",
    "V&A": "../data/processed/final_model_dglke_vanda/entities.tsv",
}

# Pick the dataset explicitly instead of relying on a later assignment
# silently overwriting an earlier one.
ent_mapping_path = ENT_MAPPING_PATHS["V&A"]

# The file has no header row: the first column becomes the index and the
# second column is named "value". Missing values are replaced with "".
ent_idx_mapping = pd.read_csv(
    ent_mapping_path,
    sep="\t",
    index_col=0,
    header=None,
    names=["value"],
    quoting=csv.QUOTE_NONE,
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="skip"` is its replacement.
    on_bad_lines="skip",
).fillna("")

ent_idx_mapping.shape



  exec(code_obj, self.user_global_ns, self.user_ns)


(1208256, 1)

In [22]:
# Preview the first five rows of the entity mapping.
ent_idx_mapping.head(5)

Unnamed: 0,value
0,http://collections.vam.ac.uk/item/O1149857
1,http://www.wikidata.org/entity/Q7338619
2,http://collections.vam.ac.uk/item/O1175446
3,https://api.vam.ac.uk/v2/objects/search?id_mat...
4,http://collections.vam.ac.uk/item/O1163824


## 1. By database

categories: SMG (collection), SMG blog, SMG journal, Wikidata, V&A

In [23]:
def map_to_database(l):
    """Return the source-database label for an entity URI.

    The URI is tested against groups of substrings in a fixed order, and the
    label of the first group with a match is returned.

    Args:
        l: Entity URI (any object supporting ``substring in l``).

    Returns:
        One of 'SMG', 'SMG blog', 'SMG journal', 'Wikidata', 'V&A', or
        ``None`` when no substring matches.
    """
    # (substrings, label) pairs; order matters because the first match wins.
    label_by_markers = (
        (('collection.sciencemuseumgroup',), 'SMG'),
        (('blog.sciencemuseum',), 'SMG blog'),
        (('journal.sciencemuseum',), 'SMG journal'),
        (('wikidata.org/entity',), 'Wikidata'),
        (
            (
                'https://api.vam.ac.uk/v2/objects/search',
                'http://collections.vam.ac.uk/item',
            ),
            'V&A',
        ),
    )

    for markers, label in label_by_markers:
        if any(marker in l for marker in markers):
            return label

    return None


# Label every entity with its source database; `assign` returns a new frame,
# so `ent_idx_mapping` itself is left unchanged.
mapping_database = ent_idx_mapping.assign(
    group=ent_idx_mapping['value'].map(map_to_database)
)

mapping_database.head()

Unnamed: 0,value,group
0,http://collections.vam.ac.uk/item/O1149857,V&A
1,http://www.wikidata.org/entity/Q7338619,Wikidata
2,http://collections.vam.ac.uk/item/O1175446,V&A
3,https://api.vam.ac.uk/v2/objects/search?id_mat...,V&A
4,http://collections.vam.ac.uk/item/O1163824,V&A


## 2. By type (org/person/object etc)

categories: SMG/V&A person, SMG/V&A organisation, SMG/V&A object, blog, journal, Wikidata entity

In [5]:
# Select every distinct (?s, ?o) pair linked by the skos:hasTopConcept predicate.
type_query = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT DISTINCT ?s ?o WHERE {        
  ?s skos:hasTopConcept ?o.
}
"""

# Run the query against the SPARQL endpoint; the result is consumed in the next cell.
res = make_sparql_request(type_query)

In [11]:
# TODO: use the above skos:hasTopConcept results as a starting point for this instead of the blank mappings

res_df = pd.json_normalize(res['results']['bindings'])[['s.value', 'o.value']]

# A subject with more than one top concept would be duplicated by the left
# merge, shifting the positional index that later cells (and the exported
# files) rely on to identify entities. Keep the first concept per subject so
# the merge yields exactly one row per entity.
top_concept_per_subject = res_df.drop_duplicates(subset='s.value')

mapping_type = pd.merge(
    left=ent_idx_mapping,
    right=top_concept_per_subject,
    left_on="value",
    right_on="s.value",
    how="left",
    validate="many_to_one",
)[['value', 'o.value']].rename(columns={'o.value': 'group'})

# Title-case the concept names and replace underscores with spaces; entities
# without a top concept stay missing (NaN).
mapping_type['group'] = mapping_type['group'].map(
    lambda concept: concept.title().replace("_", " "),
    na_action="ignore",
)

# Wikidata entities are always labelled 'Wikidata', whatever the merge produced.
is_wikidata = mapping_type['value'].str.startswith("http://www.wikidata.org/entity")
mapping_type.loc[is_wikidata, "group"] = 'Wikidata'

mapping_type.head()

Unnamed: 0,value,group
0,http://collections.vam.ac.uk/item/O1149857,Object
1,http://www.wikidata.org/entity/Q7338619,Wikidata
2,http://collections.vam.ac.uk/item/O1175446,Object
3,https://api.vam.ac.uk/v2/objects/search?id_mat...,
4,http://collections.vam.ac.uk/item/O1163824,Object


### 2.1 V&A - distinguish between V&A people/orgs/objects and SMG people/orgs/objects

In [14]:
# Rows whose type is one of the three collection record types; only these get
# the source database appended to their label.
mapping_type_tochange = mapping_type[mapping_type["group"].isin(["Object", "Organisation", "Person"])]
rows_to_relabel = mapping_type_tochange.index

# Vectorised replacement for a row-by-row `iterrows` loop: build every
# "<type> - <database>" label in one pass. `map(str)` renders a missing
# database the same way the original f-string did (e.g. "None").
mapping_type.loc[rows_to_relabel, "group"] = (
    mapping_type.loc[rows_to_relabel, "group"]
    + " - "
    + mapping_database.loc[rows_to_relabel, "group"].map(str)
)

  0%|          | 0/694576 [00:00<?, ?it/s]

## 3. By part of collection (SMG internal collection categories)

For this one we use SPARQL with the predicate `sdo:isPartOf`. We keep the `skos:hasTopConcept` value for any entity which doesn't have a `sdo:isPartOf` value (i.e. is not a collection object).

In [16]:
# Select every distinct (?s, ?o) pair linked by the sdo:isPartOf predicate.
collection_query = """PREFIX sdo: <https://schema.org/>
SELECT DISTINCT ?s ?o WHERE {        
  ?s sdo:isPartOf ?o.
}
"""

# Run the query against the SPARQL endpoint; the result is consumed in the next cell.
collection_res = make_sparql_request(collection_query)

In [17]:
collection_res_df = pd.json_normalize(collection_res['results']['bindings'])[['s.value', 'o.value']]

# An entity with several sdo:isPartOf values would be duplicated by the left
# merge, which would break the positional alignment with `mapping_type` used
# by `combine_first` below (and the index written on export). Keep the first
# collection per entity so the merge yields exactly one row per entity.
collection_per_entity = collection_res_df.drop_duplicates(subset='s.value')

mapping_collection_category = pd.merge(
    left=ent_idx_mapping,
    right=collection_per_entity,
    left_on="value",
    right_on="s.value",
    how="left",
    validate="many_to_one",
)[['value', 'o.value']].rename(columns={'o.value': 'group'})

# Prefix known collections with "Category - "; for entities without a
# collection, fall back to the type label from `mapping_type`.
mapping_collection_category['group'] = (
    mapping_collection_category['group']
    .map(lambda collection: "Category - " + str(collection), na_action="ignore")
    .combine_first(mapping_type['group'])
)

mapping_collection_category.head()

Unnamed: 0,value,group
0,http://collections.vam.ac.uk/item/O1149857,Category - THES48602 - Theatre and Performance...
1,http://www.wikidata.org/entity/Q7338619,Wikidata
2,http://collections.vam.ac.uk/item/O1175446,Category - THES48602 - Theatre and Performance...
3,https://api.vam.ac.uk/v2/objects/search?id_mat...,
4,http://collections.vam.ac.uk/item/O1163824,Category - THES48602 - Theatre and Performance...


## 4. Export mappings

In [18]:
export_dir = "../data/processed/embedding_colour_mappings_vanda"

# `makedirs` also creates any missing parent directories, and `exist_ok=True`
# makes the call a no-op when the directory already exists (avoiding the
# check-then-create race of `os.path.exists` + `os.mkdir`).
os.makedirs(export_dir, exist_ok=True)

In [19]:
def export_mapping(mapping_df, filename, out_dir=None):
    """Write a mapping DataFrame to a tab-separated file without a header row.

    The DataFrame index is written as the first column.

    Args:
        mapping_df: DataFrame to export.
        filename: Name of the file to create inside the output directory.
        out_dir: Directory to write into. Defaults to the notebook-level
            ``export_dir`` so existing two-argument calls are unchanged.
    """
    target_dir = export_dir if out_dir is None else out_dir
    # `header=False` is the documented way to omit the header row.
    mapping_df.to_csv(os.path.join(target_dir, filename), sep="\t", header=False)
    
# Output filename -> mapping frame, written in this order.
mappings_to_export = {
    'mapping_database.tsv': mapping_database,
    'mapping_type.tsv': mapping_type,
    'mapping_collection_category.tsv': mapping_collection_category,
}

for export_filename, export_frame in mappings_to_export.items():
    export_mapping(export_frame, export_filename)