In [1]:
import sys
sys.path.append("..")

from heritageconnector.utils.generic import paginate_list

import geocoder
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm

tqdm.pandas()

  from pandas import Panel


In [13]:
def get_id_and_countryid(loc: str):
    g = geocoder.geonames(loc, key='heritageconnector')

    if g.error:
        raise Exception(f"Rate limit met. {g.error}")

    try:
        address = g.json.get('address', None)
        # found label must be in requested label
        # the API sometimes does weird things like 'sussex' -> 'eastbourne'
        assert address.lower() in loc.lower()
    except:
        # try disambiguating one level up e.g. "debden, essex, england" -> "essex, england"
        if "," in loc:
            loc = ", ".join(loc.split(", ")[1:])
            return get_id_and_countryid(loc)
        else:
            return {}
        
    return {"name": loc, "geonames_address":address, "geonames_id": g.geonames_id, "country_id": g.json.get('raw', {}).get('countryId', None)}

## 1. Create place names dataframe and save to disk

In [25]:
df = pd.read_pickle("../GITIGNORE_DATA/filtering_people_orgs_result.pkl")
df.columns

Index(['LINK_ID', 'PREFERRED_NAME', 'TITLE_NAME', 'FIRSTMID_NAME',
       'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX', 'GENDER',
       'BRIEF_BIO', 'DESCRIPTION', 'NOTE', 'BIRTH_DATE', 'BIRTH_PLACE',
       'DEATH_DATE', 'DEATH_PLACE', 'CAUSE_OF_DEATH', 'NATIONALITY',
       'OCCUPATION', 'WEBSITE', 'AFFILIATION', 'LINGUISTIC_GROUP', 'TYPE',
       'REFERENCE_NUMBER', 'SOURCE', 'CREATE_DATE', 'UPDATE_DATE',
       'res_ALL_NOTES', 'res_WIKIDATA_IDs', 'res_URLS', 'qcodes_filtered'],
      dtype='object')

In [26]:
df_objects = pd.read_pickle("../GITIGNORE_DATA/objects_with_types.pkl")
df_objects.columns

Index(['MKEY', 'TITLE', 'ITEM_NAME', 'CATEGORY1', 'COLLECTOR',
       'PLACE_COLLECTED', 'DATE_COLLECTED', 'PLACE_MADE', 'CULTURE',
       'DATE_MADE', 'MATERIALS', 'MEASUREMENTS', 'EXTENT', 'DESCRIPTION',
       'ITEM_COUNT', 'PARENT_KEY', 'BROADER_TEXT', 'WHOLE_PART', 'ARRANGEMENT',
       'LANGUAGE_OF_MATERIAL', 'EDITION', 'OPTION1', 'OPTION2', 'OPTION3',
       'OPTION4', 'OPTION5', 'OPTION6', 'OPTION7', 'OPTION8', 'OPTION9',
       'OPTION10', 'OPTION11', 'OPTION12', 'OPTION13', 'OPTION14', 'OPTION15',
       'CREATE_DATE', 'UPDATE_DATE', 'ITEM_NAME_resolved'],
      dtype='object')

In [29]:
def process_place_name(loc):
    """Split by semicolon and return unique values"""
    if isinstance(loc, str):
        loc = loc.lower()
    
    if str(loc) == "nan":
        return loc
    else:
        split = loc.split("; ")
        return list(set(split))

df["BIRTH_PLACE_list"] = df["BIRTH_PLACE"].apply(process_place_name)
df["DEATH_PLACE_list"] = df["DEATH_PLACE"].apply(process_place_name)
df_objects["PLACE_MADE_list"] = df_objects["PLACE_MADE"].apply(process_place_name)
df_objects["PLACE_COLLECTED_list"] = df_objects["PLACE_COLLECTED"].apply(process_place_name)

In [30]:
places_counts = Counter(df.BIRTH_PLACE_list.dropna().sum() + df.DEATH_PLACE_list.dropna().sum() + df_objects.PLACE_MADE_list.dropna().sum() + df_objects.PLACE_COLLECTED_list.dropna().sum())
places_unique = list(places_counts.keys())
len(places_counts)

6392

In [39]:
places_df = pd.DataFrame(pd.Series(places_counts).sort_values(ascending=False) / sum(places_counts.values()) * 100).reset_index().rename(columns={'index': 'place name', 0: '% collection'})
places_df.head()

Unnamed: 0,place name,% collection
0,"london, greater london, england, united kingdom",16.353221
1,"england, united kingdom",8.322525
2,united kingdom,4.79047
3,france,4.41835
4,"manchester, manchester urban district, greater...",4.233834


In [10]:
percentage_cumulative = places_df["% collection"].cumsum()

def no_records_to_reach_percent(percent):
    return percentage_cumulative[percentage_cumulative > percent].index.min()

for p in [50, 75, 85, 95, 99]:
    n = no_records_to_reach_percent(p)
    print(f"{n} ({int(n/len(places_df)*100)}%) records required to reach {p}%")

11 (0%) records required to reach 50%
119 (1%) records required to reach 75%
392 (6%) records required to reach 85%
1724 (26%) records required to reach 95%
5096 (79%) records required to reach 99%


## 2. Load place names dataframe from disk and fill
We have to this in chunks as the geonames API has a rate limit of 1000 queries per hour.

In [3]:
places_df = pd.read_pickle("s3://heritageconnector/places_disambiguation.pkl")
idx_paginated = paginate_list(places_df.index.tolist(), 300)

df_pages_list = [places_df.loc[page] for page in idx_paginated]

print(len(df_pages_list))
df_pages_list[0]

22


Unnamed: 0,place name,% collection
0,"london, greater london, england, united kingdom",16.353221
1,"england, united kingdom",8.322525
2,united kingdom,4.790470
3,france,4.418350
4,"manchester, manchester urban district, greater...",4.233834
...,...,...
295,"göttingen, braunschweig, lower saxony, germany",0.027793
296,"ealing, london, england, united kingdom",0.027021
297,"darmstadt, hesse, germany",0.027021
298,"bedford, bedfordshire, england, united kingdom",0.027021


In [43]:
PAGE_NO = 5

df_page = df_pages_list[PAGE_NO]
df_page['result'] = ""
df_page['result'] = df_page['result'].astype(object)

for idx, row in tqdm(df_page.iterrows(), total=len(df_page)):
    df_page.at[idx, "result"] = get_id_and_countryid(df_page.loc[idx, "place name"])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




In [44]:
df_page = pd.concat([df_page, df_page['result'].apply(pd.Series)], axis=1)

df_page.drop(columns='result')

Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id
1500,"south shields, south tyneside, tyne and wear, ...",0.003088,"south shields, south tyneside, tyne and wear, ...",South Shields,2637329.0,2635167
1501,"godalming, surrey, england, united kingdom",0.003088,"godalming, surrey, england, united kingdom",Godalming,2648372.0,2635167
1502,"dessau, halle, saxony-anhalt, germany",0.003088,"halle, saxony-anhalt, germany",Halle,2911522.0,2921044
1503,peterborough,0.003088,peterborough,Peterborough,2640354.0,2635167
1504,"devizes, wiltshire, england, united kingdom",0.003088,"devizes, wiltshire, england, united kingdom",Devizes,2651294.0,2635167
...,...,...,...,...,...,...
1795,"petersfield, hampshire, england, united kingdom",0.002316,"petersfield, hampshire, england, united kingdom",Petersfield,2640348.0,2635167
1796,"schrobenhausen, germany",0.002316,"schrobenhausen, germany",Schrobenhausen,2836084.0,2921044
1797,"springfield, sangamon county, illinois, united...",0.002316,"springfield, sangamon county, illinois, united...",Springfield,4250542.0,6252001
1798,"windemere, uk",0.002316,,,,


In [45]:
# df_page.drop(columns='result').to_csv("./places/5.csv")

## 3. Reconcile geonames IDs to Wikidata entities

In [68]:
from heritageconnector.utils.generic import paginate_list
from heritageconnector.utils.wikidata import get_sparql_results
from heritageconnector.config import config

In [52]:
results = pd.read_csv("./places/0_to_5.csv", index_col=0)
results[['geonames_id', 'country_id']] = results[['geonames_id', 'country_id']].applymap(lambda i: int(i) if not pd.isna(i) else i)
results.head()


Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id
0,"london, greater london, england, united kingdom",16.353221,"london, greater london, england, united kingdom",London,2643743.0,2635167.0
1,"england, united kingdom",8.322525,united kingdom,United Kingdom,2635167.0,2635167.0
2,united kingdom,4.79047,united kingdom,United Kingdom,2635167.0,2635167.0
3,france,4.41835,france,France,3017382.0,3017382.0
4,"manchester, manchester urban district, greater...",4.233834,"greater manchester, england, united kingdom",Greater Manchester,2648108.0,2635167.0


In [57]:
geonames_ids = list(set(results.geonames_id.dropna().tolist() + results.country_id.dropna().tolist()))
geonames_ids = [int(i) for i in geonames_ids]
len(geonames_ids)

1137

In [79]:
geonames_paginated = paginate_list(geonames_ids, 100)
geoname_wikidata_mapping = {}

In [80]:
make_geonames_list = lambda l: " ".join([f""" "{item}" """ for item in l])

for page in tqdm(geonames_paginated):
    query = f"""SELECT * WHERE {{
      VALUES ?geonames_id {{{make_geonames_list(page)}}}.
      ?qid wdt:P1566 ?geonames_id .  
    }}
    """

    for item in get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)['results']['bindings']:
        geoname_wikidata_mapping.update({item['geonames_id']['value']: item['qid']['value']}) 

    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [86]:
geoname_wikidata_mapping

{'1605651': 'http://www.wikidata.org/entity/Q869',
 '2652221': 'http://www.wikidata.org/entity/Q6225',
 '3489854': 'http://www.wikidata.org/entity/Q34692',
 '1257592': 'http://www.wikidata.org/entity/Q117196',
 '878675': 'http://www.wikidata.org/entity/Q954',
 '6457407': 'http://www.wikidata.org/entity/Q202174',
 '2168866': 'http://www.wikidata.org/entity/Q28224086',
 '149590': 'http://www.wikidata.org/entity/Q924',
 '2650228': 'http://www.wikidata.org/entity/Q149651',
 '614540': 'http://www.wikidata.org/entity/Q230',
 '163843': 'http://www.wikidata.org/entity/Q858',
 '3174530': 'http://www.wikidata.org/entity/Q3969986',
 '2654312': 'http://www.wikidata.org/entity/Q2019274',
 '2637918': 'http://www.wikidata.org/entity/Q1027127',
 '3033123': 'http://www.wikidata.org/entity/Q37776',
 '2838632': 'http://www.wikidata.org/entity/Q1194',
 '5101717': 'http://www.wikidata.org/entity/Q138338',
 '352260': 'http://www.wikidata.org/entity/Q463871',
 '2646057': 'http://www.wikidata.org/entity/Q1847

In [93]:
results = results.dropna(subset=['geonames_id', 'country_id'])
results[['geonames_id', 'country_id']] = results[['geonames_id', 'country_id']].astype(int).astype(str)

results['qid'] = results['geonames_id'].map(geoname_wikidata_mapping)
results['country_qid'] = results['country_id'].map(geoname_wikidata_mapping)

In [97]:
results.to_pickle("./places/placenames_to_qids.pkl")

In [98]:
results.head()

Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id,qid,country_qid
0,"london, greater london, england, united kingdom",16.353221,"london, greater london, england, united kingdom",London,2643743,2635167,http://www.wikidata.org/entity/Q84,http://www.wikidata.org/entity/Q145
1,"england, united kingdom",8.322525,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
2,united kingdom,4.79047,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
3,france,4.41835,france,France,3017382,3017382,http://www.wikidata.org/entity/Q142,http://www.wikidata.org/entity/Q142
4,"manchester, manchester urban district, greater...",4.233834,"greater manchester, england, united kingdom",Greater Manchester,2648108,2635167,http://www.wikidata.org/entity/Q23099,http://www.wikidata.org/entity/Q145


In [99]:
pd.read_pickle("./places/placenames_to_qids.pkl")

Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id,qid,country_qid
0,"london, greater london, england, united kingdom",16.353221,"london, greater london, england, united kingdom",London,2643743,2635167,http://www.wikidata.org/entity/Q84,http://www.wikidata.org/entity/Q145
1,"england, united kingdom",8.322525,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
2,united kingdom,4.790470,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
3,france,4.418350,france,France,3017382,3017382,http://www.wikidata.org/entity/Q142,http://www.wikidata.org/entity/Q142
4,"manchester, manchester urban district, greater...",4.233834,"greater manchester, england, united kingdom",Greater Manchester,2648108,2635167,http://www.wikidata.org/entity/Q23099,http://www.wikidata.org/entity/Q145
...,...,...,...,...,...,...,...,...
1794,grimsby,0.002316,grimsby,Grimsby,2647878,2635167,http://www.wikidata.org/entity/Q179406,http://www.wikidata.org/entity/Q145
1795,"petersfield, hampshire, england, united kingdom",0.002316,"petersfield, hampshire, england, united kingdom",Petersfield,2640348,2635167,http://www.wikidata.org/entity/Q1247815,http://www.wikidata.org/entity/Q145
1796,"schrobenhausen, germany",0.002316,"schrobenhausen, germany",Schrobenhausen,2836084,2921044,http://www.wikidata.org/entity/Q32289922,http://www.wikidata.org/entity/Q183
1797,"springfield, sangamon county, illinois, united...",0.002316,"springfield, sangamon county, illinois, united...",Springfield,4250542,6252001,http://www.wikidata.org/entity/Q28515,http://www.wikidata.org/entity/Q30


In [105]:
results.loc[results['place name'] == 'united kingdom', 'qid'].values[0]

'http://www.wikidata.org/entity/Q145'

In [110]:
results.loc[results['place name'] == str('united states').lower(), 'qid'].values

array([], dtype=object)