In [1]:
import geocoder
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm

In [2]:
# Load the people/organisations table (presumably the output of an earlier
# filtering step — confirm against the producing notebook).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
people_orgs_pickle = "../GITIGNORE_DATA/filtering_people_orgs_result.pkl"
df = pd.read_pickle(people_orgs_pickle)
df.columns

Index(['LINK_ID', 'PREFERRED_NAME', 'TITLE_NAME', 'FIRSTMID_NAME',
       'LASTSUFF_NAME', 'SUFFIX_NAME', 'HONORARY_SUFFIX', 'GENDER',
       'BRIEF_BIO', 'DESCRIPTION', 'NOTE', 'BIRTH_DATE', 'BIRTH_PLACE',
       'DEATH_DATE', 'DEATH_PLACE', 'CAUSE_OF_DEATH', 'NATIONALITY',
       'OCCUPATION', 'WEBSITE', 'AFFILIATION', 'LINGUISTIC_GROUP', 'TYPE',
       'REFERENCE_NUMBER', 'SOURCE', 'CREATE_DATE', 'UPDATE_DATE',
       'res_ALL_NOTES', 'res_WIKIDATA_IDs', 'res_URLS', 'qcodes_filtered'],
      dtype='object')

In [3]:
def process_place_name(loc):
    """Split a "; "-separated place string into its unique place names.

    Parameters
    ----------
    loc : str or missing value
        Place names joined by "; ". Any non-string value (NaN, None, pd.NA)
        is treated as missing.

    Returns
    -------
    list of str, or the input unchanged
        Unique place names in order of first appearance. Missing values are
        returned as-is so that pandas still recognises them as missing.
    """
    # Checking the type (rather than comparing str(loc) to "nan") also covers
    # None and pd.NA, which have no .split method.
    if not isinstance(loc, str):
        return loc

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # give an order that changes between interpreter runs.
    return list(dict.fromkeys(loc.split("; ")))

# Derive a de-duplicated list of place names for each of the two place columns
for place_column in ("BIRTH_PLACE", "DEATH_PLACE"):
    df[f"{place_column}_list"] = df[place_column].apply(process_place_name)


In [4]:
# Tally how often each place name occurs across birth and death places.
# The Counter is fed from a generator instead of `Series.sum()`: summing a
# column of lists concatenates them one at a time (quadratic), and an
# all-missing column sums to 0, which cannot be added to a list.
places_counts = Counter(
    place
    for places_column in (df.BIRTH_PLACE_list, df.DEATH_PLACE_list)
    for places in places_column.dropna()
    for place in places
)
# Counter keeps insertion order: birth places first, then death places
places_unique = list(places_counts.keys())
len(places_counts)

3058

In [5]:
# Percentage of all place mentions accounted for by each place, most frequent first
total_mentions = sum(places_counts.values())
places_by_frequency = pd.Series(places_counts).sort_values(ascending=False)
places_by_frequency / total_mentions * 100

London, Greater London, England, United Kingdom                         12.017631
England, United Kingdom                                                  4.075183
United Kingdom                                                           3.093812
Paris, Ville de Paris, Île-de-France, France                             2.278776
France                                                                   1.871257
                                                                          ...    
Larbert, Stirling, Scotland, United Kingdom                              0.008317
Beacon, Dutchess county, New York state, United States                   0.008317
Fowey, Cornwall, England, United Kingdom                                 0.008317
Finsbury, Islington, London, Greater London, England, United Kingdom     0.008317
Broadheath, Trafford, Greater Manchester, England, United Kingdom        0.008317
Length: 3058, dtype: float64

In [None]:
def get_id_and_countryid(loc: str, key: str = 'heritageconnector'):
    """Look up a place name on GeoNames, backing off to broader places if needed.

    The place is queried as given. If the API returns nothing usable, the
    leading component is dropped ("debden, essex, england" -> "essex, england")
    and the lookup is retried until a match is found or no components remain.

    Parameters
    ----------
    loc : str
        Place name, with components separated by ", ".
    key : str, optional
        GeoNames key passed to `geocoder.geonames`. Defaults to the value
        previously hard-coded in this function.

    Returns
    -------
    dict
        `{}` when no level of `loc` could be matched. Otherwise a dict with
        "name" (the level of `loc` that matched, which may be shorter than the
        requested place), "geonames_address", "geonames_id" and "country_id".

    Raises
    ------
    RuntimeError
        If geocoder reports an error for a request.
    """
    while True:
        g = geocoder.geonames(loc, key=key)

        # NOTE(review): the message assumes a rate limit, but any error that
        # geocoder reports ends up here — confirm before relying on the wording.
        if g.error:
            raise RuntimeError(f"Rate limit met. {g.error}")

        payload = g.json or {}
        address = payload.get('address')

        # The found label must be contained in the requested label: the API
        # sometimes does weird things like 'sussex' -> 'eastbourne'.
        if isinstance(address, str) and address in loc:
            return {
                "name": loc,
                "geonames_address": address,
                "geonames_id": g.geonames_id,
                "country_id": payload.get('raw', {}).get('countryId', None),
            }

        # Disambiguate one level up. Partitioning on the same ", " separator
        # that is tested avoids querying an empty string for input like "a,b".
        _, separator, parent = loc.partition(", ")
        if not separator or not parent:
            return {}
        loc = parent

In [None]:
# Geocode every unique place; results_list stays aligned with places_unique by position.
results_list = []

for place in tqdm(places_unique):
    try:
        results_list.append(get_id_and_countryid(place))
    except Exception:
        # Show which place failed, then re-raise with the original traceback.
        # Results collected so far remain available in results_list.
        print(place)
        raise

In [None]:
# Tabulate the geocoding results, one row per entry in results_list
geocoded_places = pd.DataFrame(results_list)
geocoded_places