In [2]:
!pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 934 kB/s eta 0:00:011
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 1.3 MB/s eta 0:00:01
[?25hCollecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Building wheels for collected packages: future
  Building wheel for future (setup.py) ... [?25ldone
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491059 sha256=593e0236bb42dbf74ad8c12c62ca7e7d306c38f92dc6de42e4922183318a3b30
  Stored in directory: /Users/kalyan/Library/Caches/pip/wheels/2f/a0/d3/4030d9f80e6b3be787f19fc911b8e7aa462986a40ab1e4bb94
Successfully built future
Installing collected packages: ratelim, future, geocoder
Successfully installed future-0.18.2 geocoder-1.38.1 ratelim-0.1.6
You should consider upgrading via the '/Users/kalyan/.pyenv/versions/3.9.1/envs/hc/bin/python -m pip i

In [3]:
import sys
sys.path.append("..")

from heritageconnector.utils.generic import paginate_list

import geocoder
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm

tqdm.pandas()

  from pandas import Panel


In [4]:
def get_id_and_countryid(loc: str):
    g = geocoder.geonames(loc, key='heritageconnector')

    if g.error:
        raise Exception(f"Rate limit met. {g.error}")

    try:
        address = g.json.get('address', None)
        # found label must be in requested label
        # the API sometimes does weird things like 'sussex' -> 'eastbourne'
        assert address.lower() in loc.lower()
    except:
        # try disambiguating one level up e.g. "debden, essex, england" -> "essex, england"
        if "," in loc:
            loc = ", ".join(loc.split(", ")[1:])
            return get_id_and_countryid(loc)
        else:
            return {}
        
    return {"name": loc, "geonames_address":address, "geonames_id": g.geonames_id, "country_id": g.json.get('raw', {}).get('countryId', None)}

## 1. Create place names dataframe and save to disk

In [5]:
df = pd.read_csv("../GITIGNORE_DATA/adlib-people-dump.csv")
df.columns

Index(['status', 'description.0.type', 'description.0.value',
       'deduplication.0.value', 'deduplication.0.rule', 'admin.added',
       'admin.uid', 'admin.stream', 'admin.created', 'admin.modified',
       'admin.previous_status', 'admin.source', 'admin.version',
       'admin.processed', 'admin.id', 'admin.uuid', 'type.base', 'type.type',
       'conformance', 'summary_title', 'name.0.note.0.value', 'name.0.type',
       'name.0.primary', 'name.0.value', 'name.0.last_name',
       'address.0.street_address', 'gender', 'name.0.first_name',
       'nationality.0', 'lifecycle.death.0.date.0.earliest',
       'lifecycle.death.0.date.0.value', 'lifecycle.death.0.date.0.latest',
       'name.0.title_prefix', 'identifier.0.source', 'identifier.0.value',
       'lifecycle.birth.0.date.0.earliest', 'lifecycle.birth.0.date.0.value',
       'lifecycle.birth.0.date.0.latest', 'admin.analytics.count.total',
       'use.0.admin.uuid', 'use.0.admin.id', 'use.0.admin.uid',
       'use.0.@link.ty

In [6]:
def process_place_name(loc):
    """Split by semicolon and return unique values"""
    if isinstance(loc, str):
        loc = loc.lower()
    
    if str(loc) == "nan":
        return loc
    else:
        split = loc.split("; ")
        return list(set(split))

df["lifecycle.birth.0.place.0.summary_title_list"] = df["lifecycle.birth.0.place.0.summary_title"].apply(process_place_name)
df["lifecycle.death.0.place.0.summary_title_list"] = df["lifecycle.death.0.place.0.summary_title"].apply(process_place_name)


In [8]:
places_counts = Counter(df['lifecycle.birth.0.place.0.summary_title_list'].dropna().sum() + df['lifecycle.death.0.place.0.summary_title_list'].dropna().sum())
places_unique = list(places_counts.keys())
len(places_counts)

219

In [9]:
places_df = pd.DataFrame(pd.Series(places_counts).sort_values(ascending=False) / sum(places_counts.values()) * 100).reset_index().rename(columns={'index': 'place name', 0: '% collection'})
places_df.head()

Unnamed: 0,place name,% collection
0,london,18.0
1,manchester,7.333333
2,salford,2.0
3,glasgow,1.555556
4,surrey,1.555556


In [10]:
percentage_cumulative = places_df["% collection"].cumsum()

def no_records_to_reach_percent(percent):
    return percentage_cumulative[percentage_cumulative > percent].index.min()

for p in [50, 75, 85, 95, 99]:
    n = no_records_to_reach_percent(p)
    print(f"{n} ({int(n/len(places_df)*100)}%) records required to reach {p}%")

26 (11%) records required to reach 50%
106 (48%) records required to reach 75%
151 (68%) records required to reach 85%
196 (89%) records required to reach 95%
214 (97%) records required to reach 99%


In [11]:
places_df.to_pickle("../GITIGNORE_DATA/adlib_places_disambiguation.pkl")

## 2. Load place names dataframe from disk and fill
We have to this in chunks as the geonames API has a rate limit of 1000 queries per hour.

In [12]:
places_df = pd.read_pickle("../GITIGNORE_DATA/adlib_places_disambiguation.pkl")
idx_paginated = paginate_list(places_df.index.tolist(), 300)

df_pages_list = [places_df.loc[page] for page in idx_paginated]

print(len(df_pages_list))
df_pages_list[0]

1


Unnamed: 0,place name,% collection
0,london,18.000000
1,manchester,7.333333
2,salford,2.000000
3,glasgow,1.555556
4,surrey,1.555556
...,...,...
214,cairo,0.222222
215,new york,0.222222
216,bury,0.222222
217,chester,0.222222


In [15]:
places_df.to_csv("../GITIGNORE_DATA/adlib_placenames_to_qids.csv")

In [13]:
PAGE_NO = 0

df_page = df_pages_list[PAGE_NO]
df_page['result'] = ""
df_page['result'] = df_page['result'].astype(object)

for idx, row in tqdm(df_page.iterrows(), total=len(df_page)):
    df_page.at[idx, "result"] = get_id_and_countryid(df_page.loc[idx, "place name"])

  0%|          | 0/219 [00:00<?, ?it/s]

Status code Unknown from http://api.geonames.org/searchJSON: ERROR - HTTPConnectionPool(host='api.geonames.org', port=80): Read timed out. (read timeout=5.0)


Exception: Rate limit met. ERROR - HTTPConnectionPool(host='api.geonames.org', port=80): Read timed out. (read timeout=5.0)

In [14]:
df_page = pd.concat([df_page, df_page['result'].apply(pd.Series)], axis=1)

df_page.drop(columns='result')

Unnamed: 0,place name,% collection,0,country_id,geonames_address,geonames_id,name
0,london,18.000000,,2635167,London,2643743.0,london
1,manchester,7.333333,,,,,
2,salford,2.000000,,2635167,Salford,2638671.0,salford
3,glasgow,1.555556,,2635167,Glasgow,2648579.0,glasgow
4,surrey,1.555556,,,,,
...,...,...,...,...,...,...,...
214,cairo,0.222222,,,,,
215,new york,0.222222,,,,,
216,bury,0.222222,,,,,
217,chester,0.222222,,,,,


In [45]:
# df_page.drop(columns='result').to_csv("./places/5.csv")

## 3. Reconcile geonames IDs to Wikidata entities

In [68]:
from heritageconnector.utils.generic import paginate_list
from heritageconnector.utils.wikidata import get_sparql_results
from heritageconnector.config import config

In [52]:
results = pd.read_csv("./places/0_to_5.csv", index_col=0)
results[['geonames_id', 'country_id']] = results[['geonames_id', 'country_id']].applymap(lambda i: int(i) if not pd.isna(i) else i)
results.head()


Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id
0,"london, greater london, england, united kingdom",16.353221,"london, greater london, england, united kingdom",London,2643743.0,2635167.0
1,"england, united kingdom",8.322525,united kingdom,United Kingdom,2635167.0,2635167.0
2,united kingdom,4.79047,united kingdom,United Kingdom,2635167.0,2635167.0
3,france,4.41835,france,France,3017382.0,3017382.0
4,"manchester, manchester urban district, greater...",4.233834,"greater manchester, england, united kingdom",Greater Manchester,2648108.0,2635167.0


In [57]:
geonames_ids = list(set(results.geonames_id.dropna().tolist() + results.country_id.dropna().tolist()))
geonames_ids = [int(i) for i in geonames_ids]
len(geonames_ids)

1137

In [79]:
geonames_paginated = paginate_list(geonames_ids, 100)
geoname_wikidata_mapping = {}

In [80]:
make_geonames_list = lambda l: " ".join([f""" "{item}" """ for item in l])

for page in tqdm(geonames_paginated):
    query = f"""SELECT * WHERE {{
      VALUES ?geonames_id {{{make_geonames_list(page)}}}.
      ?qid wdt:P1566 ?geonames_id .  
    }}
    """

    for item in get_sparql_results(config.WIKIDATA_SPARQL_ENDPOINT, query)['results']['bindings']:
        geoname_wikidata_mapping.update({item['geonames_id']['value']: item['qid']['value']}) 

    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [86]:
geoname_wikidata_mapping

{'1605651': 'http://www.wikidata.org/entity/Q869',
 '2652221': 'http://www.wikidata.org/entity/Q6225',
 '3489854': 'http://www.wikidata.org/entity/Q34692',
 '1257592': 'http://www.wikidata.org/entity/Q117196',
 '878675': 'http://www.wikidata.org/entity/Q954',
 '6457407': 'http://www.wikidata.org/entity/Q202174',
 '2168866': 'http://www.wikidata.org/entity/Q28224086',
 '149590': 'http://www.wikidata.org/entity/Q924',
 '2650228': 'http://www.wikidata.org/entity/Q149651',
 '614540': 'http://www.wikidata.org/entity/Q230',
 '163843': 'http://www.wikidata.org/entity/Q858',
 '3174530': 'http://www.wikidata.org/entity/Q3969986',
 '2654312': 'http://www.wikidata.org/entity/Q2019274',
 '2637918': 'http://www.wikidata.org/entity/Q1027127',
 '3033123': 'http://www.wikidata.org/entity/Q37776',
 '2838632': 'http://www.wikidata.org/entity/Q1194',
 '5101717': 'http://www.wikidata.org/entity/Q138338',
 '352260': 'http://www.wikidata.org/entity/Q463871',
 '2646057': 'http://www.wikidata.org/entity/Q1847

In [93]:
results = results.dropna(subset=['geonames_id', 'country_id'])
results[['geonames_id', 'country_id']] = results[['geonames_id', 'country_id']].astype(int).astype(str)

results['qid'] = results['geonames_id'].map(geoname_wikidata_mapping)
results['country_qid'] = results['country_id'].map(geoname_wikidata_mapping)

In [97]:
results.to_pickle("./places/placenames_to_qids.pkl")

In [98]:
results.head()

Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id,qid,country_qid
0,"london, greater london, england, united kingdom",16.353221,"london, greater london, england, united kingdom",London,2643743,2635167,http://www.wikidata.org/entity/Q84,http://www.wikidata.org/entity/Q145
1,"england, united kingdom",8.322525,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
2,united kingdom,4.79047,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
3,france,4.41835,france,France,3017382,3017382,http://www.wikidata.org/entity/Q142,http://www.wikidata.org/entity/Q142
4,"manchester, manchester urban district, greater...",4.233834,"greater manchester, england, united kingdom",Greater Manchester,2648108,2635167,http://www.wikidata.org/entity/Q23099,http://www.wikidata.org/entity/Q145


In [99]:
pd.read_pickle("./places/placenames_to_qids.pkl")

Unnamed: 0,place name,% collection,name,geonames_address,geonames_id,country_id,qid,country_qid
0,"london, greater london, england, united kingdom",16.353221,"london, greater london, england, united kingdom",London,2643743,2635167,http://www.wikidata.org/entity/Q84,http://www.wikidata.org/entity/Q145
1,"england, united kingdom",8.322525,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
2,united kingdom,4.790470,united kingdom,United Kingdom,2635167,2635167,http://www.wikidata.org/entity/Q145,http://www.wikidata.org/entity/Q145
3,france,4.418350,france,France,3017382,3017382,http://www.wikidata.org/entity/Q142,http://www.wikidata.org/entity/Q142
4,"manchester, manchester urban district, greater...",4.233834,"greater manchester, england, united kingdom",Greater Manchester,2648108,2635167,http://www.wikidata.org/entity/Q23099,http://www.wikidata.org/entity/Q145
...,...,...,...,...,...,...,...,...
1794,grimsby,0.002316,grimsby,Grimsby,2647878,2635167,http://www.wikidata.org/entity/Q179406,http://www.wikidata.org/entity/Q145
1795,"petersfield, hampshire, england, united kingdom",0.002316,"petersfield, hampshire, england, united kingdom",Petersfield,2640348,2635167,http://www.wikidata.org/entity/Q1247815,http://www.wikidata.org/entity/Q145
1796,"schrobenhausen, germany",0.002316,"schrobenhausen, germany",Schrobenhausen,2836084,2921044,http://www.wikidata.org/entity/Q32289922,http://www.wikidata.org/entity/Q183
1797,"springfield, sangamon county, illinois, united...",0.002316,"springfield, sangamon county, illinois, united...",Springfield,4250542,6252001,http://www.wikidata.org/entity/Q28515,http://www.wikidata.org/entity/Q30


In [105]:
results.loc[results['place name'] == 'united kingdom', 'qid'].values[0]

'http://www.wikidata.org/entity/Q145'

In [110]:
results.loc[results['place name'] == str('united states').lower(), 'qid'].values

array([], dtype=object)