# disambiguating item names & materials with Getty

In [17]:
import sys
sys.path.append("..")

import re
from tqdm import tqdm
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from collections import Counter
import requests
from tqdm import tqdm
from fuzzywuzzy import fuzz

In [4]:
df = pd.read_csv("../GITIGNORE_DATA/mimsy-catalogue-export.csv")
df["ITEM_NAME_list"] = df['ITEM_NAME'].fillna("").astype(str).apply(lambda i: [x.strip().lower() for x in i.split(';')])
df["MATERIALS_list"] = df['MATERIALS'].fillna("").astype(str).apply(lambda i: [x.strip().lower() for x in i.split(';')])

df.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,TITLE,ITEM_NAME,CATEGORY1,COLLECTOR,PLACE_COLLECTED,DATE_COLLECTED,PLACE_MADE,CULTURE,DATE_MADE,MATERIALS,MEASUREMENTS,EXTENT,DESCRIPTION,ITEM_COUNT,PARENT_KEY,BROADER_TEXT,WHOLE_PART,ARRANGEMENT,LANGUAGE_OF_MATERIAL,EDITION,OPTION1,OPTION2,OPTION3,OPTION4,OPTION5,OPTION6,OPTION7,OPTION8,OPTION9,OPTION10,OPTION11,OPTION12,OPTION13,OPTION14,OPTION15,CREATE_DATE,UPDATE_DATE,ITEM_NAME_list,MATERIALS_list
0,Ansonia Sunwatch (pocket compass dial),Pocket horizontal sundial,SCM - Time Measurement,,,,"New York county, New York state, United States",,1922-1939,,,,Ansonia Sunwatch (pocket compass dial),1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,SMG00083125,,,One Collection,,12-MAR-96,19-JUN-19,[pocket horizontal sundial],[]
1,Model of train of wheels used in a clock (full siz,spring-driven clock mechanism; fusee; model,SCM - Time Measurement,,,,,,,,,,Model of train of wheels used in a clock (full size) with pair of vanes and base,1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,,,,One Collection,,12-MAR-96,30-MAY-18,"[spring-driven clock mechanism, fusee, model]",[]
2,Ship's log sandglass,log (nautical instrument); sandglass,SCM - Time Measurement,,,,,,,glass; sand; mounted; wood; timer,"overall: 140 mm 70 mm, 0.252 kg",,Ship's log-glass in wooden mount. 14 secs. Abbot Horne No.22. Type B,1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,,,,One Collection,,12-MAR-96,30-MAY-18,"[log (nautical instrument), sandglass]","[glass, sand, mounted, wood, timer]"
3,Watch with Chinese duplex escapement,pocket watch; duplex watch,SCM - Time Measurement,,,,,,,,,,Watch with Chinese duplex escapement,1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,,,,One Collection,,12-MAR-96,04-FEB-19,"[pocket watch, duplex watch]",[]
4,"""Ever Ready"" ceiling clock",clocks,SCM - Time Measurement,,,,,,,,"overall: 140 mm x 124 mm x 152 mm,",,"""Ever Ready"" ceiling clock",1.0,,,WHOLE,,eng,,,,"Desborough, Jane",,,,,,,,,RECORD ACTIVE IN ASSET PANDA – EDIT WITH CAUTION,,One Collection,,12-MAR-96,30-MAR-20,[clocks],[]


## 1. item names

### 1.1 look up top names

In [5]:
itemname_count = pd.Series(Counter(df["ITEM_NAME_list"].sum())).drop(index='')

In [6]:
# top item names and percentages
(itemname_count.sort_values(ascending=False) / len(df) * 100).head(20)

photograph              3.932969
poster                  3.548434
print                   1.941475
bottles                 1.646689
specimen                1.285922
tobacco pipe            0.985105
valve                   0.857045
specimen bottles        0.840727
bottle                  0.798868
specimen jars           0.777229
notice                  0.711248
commemorative medal     0.572545
printing block          0.542393
badge                   0.542038
painting                0.512595
silver gelatin print    0.494503
personal medals         0.488828
shop rounds             0.483507
spectacles              0.479959
tobacco pipes           0.444485
dtype: float64

In [7]:
itemname_count.to_csv("../GITIGNORE_DATA/object_itemnames_3.csv")

### 1.2 get dumps from Getty reconciliation endpoint
(SPARQL at vocab.getty.edu is down)
endpoint documentation: https://gist.github.com/workergnome/afe5b74cff8f1b4fb6490667cf6a4886

In [8]:
endpoint_url = "http://services.getty.edu/vocab/reconcile/?queries="

In [None]:
itemname_df = pd.DataFrame(itemname_count).reset_index().drop(columns=0).rename(columns={'index': 'query'})
itemname_df['type'] = '/aat'
itemname_df['query'] = itemname_df['query'].astype(str).str.replace("&", "") # & causes issues with URL

In [None]:
page_limit = 18
idx_list = itemname_df.index.tolist()
idx_paginated = [idx_list[i : i + page_limit] for i in range(0, len(idx_list), page_limit)]

response_df = pd.DataFrame()
failed_idx = []
for page in tqdm(idx_paginated):
    try:
        json_request = itemname_df.loc[page, :].to_json(orient="index")
        response = requests.post(endpoint_url + json_request).json()

        tempdf = pd.json_normalize(response).T
        tempdf = tempdf.rename(index = lambda x: x.strip('.result'), columns={0: 'response_dump'})

        response_df = response_df.append(tempdf)

    except:
        print(f"FAILED: {page[0]}:{page[-1]}")
        failed_idx.append(page)
        
response_df.index = response_df.index.astype(int)

In [None]:
itemnames_responses = pd.concat([itemname_df, response_df], axis=1)
itemnames_responses.head(1)

### 1.3 reconcile with names

In [None]:
itemnames_responses["match_name"] = ""
itemnames_responses["match_id"] = ""

for idx, row in tqdm(itemnames_responses.iterrows(), total=itemnames_responses.shape[0]):
    query = row['query']
    response = row['response_dump']
    
    if not isinstance(response, float):
        names = [item['name'] for item in response]
        ids = [item['id'] for item in response]
        idx_match = [idx for idx, item in enumerate(names) if fuzz.ratio(query, item) > 90]

        if len(idx_match) >= 1:
            itemnames_responses.loc[idx, "match_name"] = names[idx_match[0]]
            itemnames_responses.loc[idx, "match_id"] = ids[idx_match[0]]
        if len(idx_match) > 1:
            print(query, [names[i] for i in idx_match])

In [None]:
itemnames_responses.to_csv("../GITIGNORE_DATA/object_itemnames_matches_2.csv")

In [None]:
# number and % of item names matched§
len(itemnames_responses[itemnames_responses['match_name'] != ""]), len(itemnames_responses[itemnames_responses['match_name'] != ""])/len(itemnames_responses)

In [None]:
matched_itemnames = itemnames_responses.loc[itemnames_responses['match_name'] != "", 'query'].str.replace("&", "").tolist()
itemname_count.index = itemname_count.index.str.replace("&", "")

In [None]:
# % of collection items with one of the matched item names
itemname_count[matched_itemnames].sum() / len(df)

In [None]:
itemname_count.to_csv("../GITIGNORE_DATA/object_itemnames_count.csv")

## 2. Mapping Getty IDs to Wikidata
Resolving `match_id` column in below dataframe to Wikidata IDs, and storing these relationships in a CSV.

In [46]:
from heritageconnector.entity_matching.lookup import wikidata_id
from functools import partial

tqdm.pandas()

  from pandas import Panel


In [38]:
getty_ids = pd.read_csv('../GITIGNORE_DATA/itemnames_matches_final.csv')

for col in ['match_name', 'match_id']:
    getty_ids[col] = getty_ids[col].astype(str).str.replace("^0$", "")

getty_ids.head(5)

Unnamed: 0,item_name,count,match_name,match_id
0,photograph,11087.0,photographs,aat/300046300
1,poster,10003.0,posters,aat/300027221
2,print,5473.0,prints (visual works),aat/300041273
3,bottles,4642.0,bottles,aat/300045627
4,specimen,3625.0,specimens,aat/300235576


In [39]:
getty_ids["getty_aat_id"] = getty_ids["match_id"].apply(lambda i: re.findall(r"(300\d{6})", i)[0] if i.startswith("aat") else "")
getty_ids.head(5)

Unnamed: 0,item_name,count,match_name,match_id,getty_aat_id
0,photograph,11087.0,photographs,aat/300046300,300046300
1,poster,10003.0,posters,aat/300027221,300027221
2,print,5473.0,prints (visual works),aat/300041273,300041273
3,bottles,4642.0,bottles,aat/300045627,300045627
4,specimen,3625.0,specimens,aat/300235576,300235576


In [50]:
wid = wikidata_id()
lookup = partial(wid.lookup_wikidata_id, pid="P1014")
getty_ids_toscan = getty_ids.loc[getty_ids["match_id"] != ""]

getty_ids_toscan["wikidata_id"] = getty_ids_toscan["getty_aat_id"].progress_apply(lambda i: lookup(uid=i))
getty_ids['wikidata_id'] = getty_ids_toscan['wikidata_id']

100%|██████████| 4642/4642 [16:29<00:00,  4.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [65]:
perc_ids_linked = 100*len(getty_ids[getty_ids['wikidata_id'].astype(str).str.startswith("Q")])/len(getty_ids)
print(f"Percentage of IDs with Wikidata links: {perc_ids_linked}")

Percentage of IDs with Wikidata links: 8.322773167640927


In [66]:
getty_ids.to_csv("../GITIGNORE_DATA/itemnames_links.csv", index=False)