# Searching collection items by name
For how many items can we get an unambiguous result just from searching the title with some basic SPARQL queries?

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append("..")

from heritageconnector.utils.sparql import get_sparql_results

import re
from tqdm import tqdm
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## open collection data

In [4]:
df = pd.read_csv("../GITIGNORE_DATA/mimsy-catalogue-export.csv")

# clean up title column so it's searchable in the SPARQL query
df["TITLE"] = df["TITLE"].str.replace("\"", "").replace("\n", "")

df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,TITLE,ITEM_NAME,CATEGORY1,COLLECTOR,PLACE_COLLECTED,DATE_COLLECTED,PLACE_MADE,CULTURE,DATE_MADE,MATERIALS,MEASUREMENTS,EXTENT,DESCRIPTION,ITEM_COUNT,PARENT_KEY,BROADER_TEXT,WHOLE_PART,ARRANGEMENT,LANGUAGE_OF_MATERIAL,EDITION,OPTION1,OPTION2,OPTION3,OPTION4,OPTION5,OPTION6,OPTION7,OPTION8,OPTION9,OPTION10,OPTION11,OPTION12,OPTION13,OPTION14,OPTION15,CREATE_DATE,UPDATE_DATE
0,Ansonia Sunwatch (pocket compass dial),Pocket horizontal sundial,SCM - Time Measurement,,,,"New York county, New York state, United States",,1922-1939,,,,Ansonia Sunwatch (pocket compass dial),1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,SMG00083125,,,One Collection,,12-MAR-96,19-JUN-19
1,Model of train of wheels used in a clock (full siz,spring-driven clock mechanism; fusee; model,SCM - Time Measurement,,,,,,,,,,Model of train of wheels used in a clock (full size) with pair of vanes and base,1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,,,,One Collection,,12-MAR-96,30-MAY-18


In [5]:
category_vc = df["CATEGORY1"].value_counts()
#category_vc[category_vc > 50][-40:]

## make table with query results for titles
using Wikidata EntitySearch, eliminating **humans, organisations, places, concepts (abstract ideas)**

we're only interested in *objects with unique titles* here

In [6]:
title_vc = df["TITLE"].value_counts()
unique_titles = title_vc[title_vc == 1].index.tolist()

df_unique = df[df['TITLE'].isin(unique_titles)]

len(df), len(df_unique), len(df_unique)/len(df)

(281899, 199053, 0.7061146013288447)

In [14]:
endpoint_url = "https://query.wikidata.org/sparql"

# human, organization, place, disambiguation page, ordinary matter, human disambiguation page, geographic entity, performance work, 
# scholarly article
types_exclude = ["Q5", "Q43229", "P276", "Q4167410", "Q28555911", "Q22808320", "Q27096213", "Q17538722", "Q13442814"]

map_ids = lambda ids: ",".join([f"wd:{i}" for i in ids])
    
def run_query(title):
    # NOTE: the LIMIT value at the end also tells the query how far up to go in the 'class of' chain
    
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?type ?typeLabel ?classTree ?classTreeLabel WHERE {{
      SERVICE wikibase:mwapi {{
          bd:serviceParam wikibase:api "EntitySearch" .
          bd:serviceParam wikibase:endpoint "www.wikidata.org" .
          bd:serviceParam mwapi:search "{title}" .
          bd:serviceParam mwapi:language "en" .
          ?item wikibase:apiOutputItem mwapi:item .
          ?num wikibase:apiOrdinal true .
      }}
      ?item (wdt:P279|wdt:P31) ?type .
      ?item (wdt:P31/wdt:P279*) ?classTree .
      FILTER ( ?type not in ( {map_ids(types_exclude)} ) )
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en"}}
    }} ORDER BY ASC(?num) LIMIT 20
    """
    
    res = get_sparql_results(endpoint_url, query)['results']['bindings']
    res_df = pd.json_normalize(res)
    
    if len(res_df) == 0:
        return pd.DataFrame()
    
    res_df['TITLE'] = title
    for col in ['item.value', 'type.value', 'classTree.value']:
        res_df[col] = res_df[col].apply(lambda item: re.findall(r"(Q\d+)", item)[0])
    res_df = res_df[['TITLE', 'item.value', 'itemLabel.value', 'type.value', 'typeLabel.value', 'classTree.value', 'classTreeLabel.value']]
    
    # TODO: work out how to put this in the SPARQL query
    titles_todrop = res_df.loc[res_df['classTree.value'].isin(types_exclude), 'TITLE']
    res_df = res_df[~res_df['TITLE'].isin(titles_todrop)]
    
    res_df = res_df[['TITLE', 'item.value', 'itemLabel.value', 'type.value', 'typeLabel.value']].drop_duplicates()
    
    return res_df

In [23]:
run_query("L'Aigle")

Unnamed: 0,TITLE,item.value,itemLabel.value,type.value,typeLabel.value
0,L'Aigle,Q500588,L'Aigle,Q484170,commune of France


In [18]:
categories = ["SCM - Art"]
#categories = category_vc[category_vc > 50][-30:].index.tolist() # 30 lowest populated categories
#categories = ["SIM - Television"]

# search on a subset of categories
df_tosearch = df_unique[df_unique["CATEGORY1"].isin(categories)]

len(df_tosearch), len(df_unique), len(df_tosearch)/len(df)

(5844, 199053, 0.020730829126743976)

In [19]:
searchres_df = pd.DataFrame()

for idx, row in tqdm(df_tosearch.iterrows(), total=df_tosearch.shape[0]):
    try:
        tempdf = run_query(row['TITLE'])
        tempdf["CATEGORY"] = row["CATEGORY1"]
        searchres_df = pd.concat([searchres_df, tempdf])
        
    except Exception:
        print(f"ERROR: {row['TITLE']}")

 63%|██████▎   | 3690/5844 [27:35<20:45,  1.73it/s]  

ERROR: Trade card: E. M. Clarke, 428 Strand, London.  
Op


100%|██████████| 5844/5844 [44:35<00:00,  2.18it/s]  


In [20]:
# EXPORT
searchres_df.to_pickle("../GITIGNORE_DATA/title_lookup/SCM-ART_v2.pkl")

In [21]:
title_vc = searchres_df.groupby("TITLE").count()['item.value']
unique_results = title_vc[title_vc == 1].index.tolist()
print(len(unique_results))

163


In [22]:
view_df = searchres_df
unique_results = view_df.loc[view_df['TITLE'].isin(unique_results), :]
unique_results

Unnamed: 0,TITLE,item.value,itemLabel.value,type.value,typeLabel.value,CATEGORY
0,Bleach Works at Llewenni,Q23832039,"Bleach works at Llewenni: as at first intended to be built for the honble Thos Fitzmaurice, Denbighshire",Q11060274,print,SCM - Art
0,Thomas Clark,Q23928704,Thomas Clark,Q3305213,painting,SCM - Art
0,William Hyde Wollaston,Q59546867,William Hyde Wollaston and his influence on early nineteenth-century science,Q1266946,thesis,SCM - Art
0,The Invention of Gunpowder,Q56827459,The invention of gunpowder,Q11835431,engraving,SCM - Art
0,The Laboratory,Q7745051,The Laboratory,Q5185279,poem,SCM - Art
0,Sanspareil,Q41781345,Sanspareil,Q15731356,apple cultivar,SCM - Art
0,Sovereign of the Seas,Q1032099,HMS Sovereign of the Seas,Q892367,first-rate,SCM - Art
0,HMS Vernon,Q5634827,HMS Vernon,Q11446,ship,SCM - Art
0,HMS Glasgow,Q1301596,HMS Glasgow,Q104843,cruiser,SCM - Art
0,Sources of energy,Q58859143,Sources of energy loss in dielectrics,Q187685,doctoral thesis,SCM - Art


In [24]:
df[df['TITLE'] == "Josiah Wedgwood"]

Unnamed: 0,TITLE,ITEM_NAME,CATEGORY1,COLLECTOR,PLACE_COLLECTED,DATE_COLLECTED,PLACE_MADE,CULTURE,DATE_MADE,MATERIALS,MEASUREMENTS,EXTENT,DESCRIPTION,ITEM_COUNT,PARENT_KEY,BROADER_TEXT,WHOLE_PART,ARRANGEMENT,LANGUAGE_OF_MATERIAL,EDITION,OPTION1,OPTION2,OPTION3,OPTION4,OPTION5,OPTION6,OPTION7,OPTION8,OPTION9,OPTION10,OPTION11,OPTION12,OPTION13,OPTION14,OPTION15,CREATE_DATE,UPDATE_DATE
40708,Josiah Wedgwood,oil painting; portrait,SCM - Art,,,,United Kingdom,,1830-1839,oil paint on canvas,overall: 765 mm x 635 mm,,"Painting. [Josiah Wedgwood] (1730-1795) by Charles Sibley (fl. 1826-1847), nd. [183-]. Oil on canvas, 30x25""/76.5x63.5cm. Inscription printed indistinctly on verso of former frame: 'After the oil by Reynolds, 1782, now in the Wedgwood Museum, Barlaston'. Portrait, HS to L with dark background.",1.0,,,WHOLE,,eng,,,,"Barrett, Katy",,,,,,,,SMG00004577,RECORD ACTIVE IN ASSET PANDA – EDIT WITH CAUTION,,One Collection,,12-MAR-96,06-OCT-18
