# mapping demonyms

In [61]:
import pandas as pd
import re
import string

def split_list_string(l: list):
    """
    Splits string separated by either commas or semicolons into a lowercase list.
    """

    return [
        re.sub(r"\?():", "", x.strip().lower())
        for x in str(l).replace(";", ",").replace("?", "").replace("/", ",").split(",")
        if x.strip() != ""
    ]

In [62]:
df = pd.read_pickle("../GITIGNORE_DATA/filtering_people_orgs_result.pkl")
nationalities = list(set(df.NATIONALITY.apply(split_list_string).sum()))

len(nationalities), nationalities[0:10]

(243,
 ['argentinian',
  'scotland',
  'tasmanian',
  'babylonian',
  'zulu',
  'kenya',
  'nigerian',
  'syrian',
  'japanese',
  'indian'])

In [66]:
demonym_mapping = pd.read_csv("https://raw.githubusercontent.com/knowitall/chunkedextractor/master/src/main/resources/edu/knowitall/chunkedextractor/demonyms.csv", header=None, names=['people', 'country'])
demonym_mapping = demonym_mapping.applymap(lambda i: str(i).lower())
demonym_mapping

Unnamed: 0,people,country
0,aalborgenser,aalborg
1,aberdonian,aberdeen
2,abkhaz,abkhazia
3,abkhazian,abkhazia
4,abrenian,abra
...,...,...
2139,zimbabwean,zimbabwe
2140,zintani,zintan
2141,zulu,zululand
2142,zuricher,zurich


In [67]:
failed = []

def country_from_nationality(n):
    if n in demonym_mapping.country.tolist():
        return n
    elif n in demonym_mapping.people.tolist():
        return demonym_mapping.loc[demonym_mapping['people'] == n, 'country'].values.tolist()[0]
    else:
        failed.append(n)
        return None
    
demonym_dict = {n: country_from_nationality(n) for n in nationalities}

In [69]:
import sys
sys.path.append("..")

from heritageconnector.entity_matching.lookup import DenonymConverter

In [71]:
dc = DenonymConverter()
{n: dc.get_country_from_nationality(n) for n in nationalities}

{'argentinian': 'argentina',
 'scotland': 'scotland',
 'tasmanian': 'tasmania',
 'babylonian': 'babylonia',
 'zulu': 'zululand',
 'kenya': 'kenya',
 'nigerian': 'nigeria',
 'syrian': 'syria',
 'japanese': 'japan',
 'indian': 'india',
 'jamaican': 'jamaica',
 'hong kongese': None,
 'northern irish': 'northern ireland',
 'belarusian': 'belarus',
 'nepalese': None,
 'hindustani': None,
 'german ()': None,
 'european': 'europe',
 'austria': 'austria',
 'iraqi': 'iraq',
 'andorran': 'andorra',
 'japan': 'japan',
 'jerseyan': 'jersey',
 'brazilian': 'brazil',
 'fuegian': None,
 'irish': 'ireland',
 'french': 'france',
 'bavarian': 'bavaria',
 'australian-born': None,
 'britain': 'britain',
 'tanzanian': 'tanzania',
 'wesh': None,
 'breton': 'brittany',
 'ritish': None,
 'netherlands': 'netherlands',
 'briton': 'britain',
 'austrian-american': None,
 'multinational': None,
 'german & polish': None,
 'bechuanan': None,
 'englsih': None,
 'greek': 'greece',
 'norman': 'normandy',
 'macedonian':

In [84]:
def flatten_list_of_lists(l: list) -> list:
    """
    [[1, 2], [3]] -> [1, 2, 3]
    """
    
    res = []
    
    for item in l:
        if isinstance(item, list):
            res = res + item
        else:
            res.append(item)
            
    return res

In [88]:
flatten_list_of_lists(['abc'])

['abc']