In [None]:
import os
import json
import xmltodict
import numpy as np
import pandas as pd
from collections import defaultdict

# geonameid that the section 1 filter keeps in addition to the Noord-Brabant rows
# (presumably the Netherlands country record, per the name — confirm).
NETHERLANDS_COUNTRY_CODE = '2750405'
# Not referenced anywhere in the visible notebook.
# NOTE(review): presumably the geonameid of the Noord-Brabant ADM1 record — confirm.
NOORD_BRABANT_ADM1_CODE = '2749990'

# Input: semicolon-delimited places table read in section 1.
source_file = './source/NL/NL.csv'
# Output: the selected places, including the parentid column, written in section 2.
nb_out_file = './out/geonames_nb.csv'
# Input: tab-delimited alternate-names table read in section 3.
alt_names_source_file = './source/alternateNames/alternateNames.txt'
# Output: alternate names of the selected places, written in section 3.
alt_names_out_file = './out/alternate_names_nb.csv'
# Input: tab-delimited feature-code table read in section 4.
source_file_feature_codes = './source/featureCodes_en.txt'
# Output: feature-code lookup with class and code as separate columns (section 4).
out_file_feature_codes = './lookup/feature_codes.csv'


## Workflow

1. Select all Noord-Brabant places
2. Add a `parentid` column to each selected place
3. Extract the alternate names of the Noord-Brabant places
4. Create a lookup file for feature codes

## 1. Select Noord-Brabant (NB) places

In [None]:
%%time
# Identifier and code columns are read as text so values such as admin1_code '06'
# keep their leading zero.
geonames_df = pd.read_csv(
    source_file,
    delimiter=';',
    dtype={'geonameid': str, 'admin1_code': str, 'admin2_code': str, 'population': str},
)

# Noord-Brabant rows: country 'NL' with admin1_code '06'.
in_noord_brabant = (geonames_df['country_code'] == 'NL') & (geonames_df['admin1_code'] == '06')
# geonameid is loaded as str, so it must be compared with the string constant;
# comparing against the integer 2750405 never matches and silently drops the
# country row from the selection.
is_country_row = geonames_df['geonameid'] == NETHERLANDS_COUNTRY_CODE

# .copy() makes the selection an independent frame, so adding columns to it does
# not raise SettingWithCopyWarning.
geonames_nb_df = geonames_df.loc[in_noord_brabant | is_country_row].copy()
# Pair the coordinates column-wise instead of running a Python lambda per row.
geonames_nb_df['lat_long'] = list(zip(geonames_nb_df['latitude'], geonames_nb_df['longitude']))
geonames_nb_df

## 2. Add parentid

In [None]:
%%time

def find_parent_id(row, df):
    """Return the geonameid of the administrative parent of ``row``.

    The parent is searched level by level, from the closest one outwards:

    1. the ADM2 record with the same country, admin1 and admin2 codes
       (only when ``row`` has an admin2_code and is not itself an ADM2 record);
    2. the ADM1* record with the same country and admin1 code
       (skipped when ``row`` is itself an ADM1* record);
    3. the PCLI (country) record with the same country code.

    A row is never reported as its own parent.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'geonameid', 'feature_code', 'country_code',
        'admin1_code' and 'admin2_code'.
    df : pandas.DataFrame
        Table searched for the parent; must contain the same columns.

    Returns
    -------
    The 'geonameid' value of the first matching parent row, or None when
    ``row`` is a country (PCLI) or no parent exists in ``df``.
    """
    feature_code = row['feature_code']

    # Countries are the top of the hierarchy.
    if feature_code == 'PCLI':
        return None

    feature_codes = df['feature_code']
    same_country = df['country_code'] == row['country_code']
    same_admin1 = df['admin1_code'] == row['admin1_code']
    # Without this guard an administrative record matches its own codes and
    # ends up being its own parent.
    not_self = df['geonameid'] != row['geonameid']

    candidate_masks = []

    if pd.notnull(row['admin2_code']) and feature_code != 'ADM2':
        # Restrict to the ADM2 record itself: matching on the admin codes alone
        # also selects every sibling place that shares them.
        candidate_masks.append(
            same_country
            & same_admin1
            & (df['admin2_code'] == row['admin2_code'])
            & (feature_codes == 'ADM2')
        )

    row_is_admin1 = isinstance(feature_code, str) and feature_code.startswith('ADM1')
    if not row_is_admin1:
        # na=False keeps the mask strictly boolean when feature_code is missing.
        candidate_masks.append(
            same_country & same_admin1 & feature_codes.str.startswith('ADM1', na=False)
        )

    # Last resort (and the regular parent of an ADM1 record): the country row.
    candidate_masks.append(same_country & (feature_codes == 'PCLI'))

    for mask in candidate_masks:
        parent = df[mask & not_self]
        if not parent.empty:
            return parent.iloc[0]['geonameid']

    # No parent found at any level.
    return None

# Resolve a parent for every selected row. Parents are looked up in the full
# table (geonames_df), not only in the selection.
# assign() builds a new frame instead of writing a column into a filtered
# selection, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
geonames_nb_df = geonames_nb_df.assign(
    parentid=geonames_nb_df.apply(lambda row: find_parent_id(row, geonames_df), axis=1)
)
geonames_nb_df.to_csv(nb_out_file, index=False)
geonames_nb_df

## 3. Extract alternate names for Noord-Brabant

In [None]:
%%time

# Load the tab-separated alternate-names table; id and flag columns stay text.
alt_names_df = pd.read_csv(
    alt_names_source_file,
    sep='\t',
    dtype=dict.fromkeys(
        ['alternateNameId', 'geonameid', 'isPreferredName', 'isShortName', 'isColloquial', 'isHistoric'],
        str,
    ),
)

# An inner join on geonameid keeps only the names whose place is in geonames_nb_df.
merged_df = alt_names_df.merge(geonames_nb_df[['geonameid']], on='geonameid', how='inner')

# Remove the header-less column that pandas labelled 'Unnamed: 10'
# (presumably caused by a trailing delimiter in the header row — confirm).
merged_df = merged_df.drop(columns='Unnamed: 10')

# boolean -> one name_type column 
def determine_name_type(row):
    """Collapse the four flag columns of an alternate-name row into one label.

    The flags are checked in priority order (preferred, short, colloquial,
    historic); the first one whose value is the string "1" decides the label.

    Parameters
    ----------
    row : mapping or pandas.Series
        Provides 'isPreferredName', 'isShortName', 'isColloquial' and
        'isHistoric'.

    Returns
    -------
    str or None
        The label of the first set flag, or None when no flag equals "1".
    """
    labels_by_priority = (
        ('isPreferredName', 'Preferred name'),
        ('isShortName', 'Short name'),
        ('isColloquial', 'Colloquial name'),
        ('isHistoric', 'Historic name'),
    )
    for flag_column, label in labels_by_priority:
        if row[flag_column] == "1":
            return label
    return None

# Label the merged rows themselves. Applying the function to alt_names_df would
# process the whole unfiltered table and then align the result by index, which
# attaches the labels of unrelated rows to merged_df.
merged_df['name_type'] = merged_df.apply(determine_name_type, axis=1)

merged_df.to_csv(alt_names_out_file, index=False)
merged_df

## 4. Create lookup file for Feature Codes

In [None]:
%%time
# Parse the tab-separated feature-code table.
feature_codes_df = pd.read_csv(source_file_feature_codes, sep='\t')

# class_code is split on '.': piece 0 becomes feature_class, piece 1 becomes code.
feature_codes_split = feature_codes_df['class_code'].str.split('.', expand=True)
lookup_columns = ['code', 'concept', 'description', 'class_code', 'feature_class']
feature_codes_df = feature_codes_df.assign(
    feature_class=feature_codes_split[0],
    code=feature_codes_split[1],
)[lookup_columns]
feature_codes_df.to_csv(out_file_feature_codes, index=False)

feature_codes_df

In [None]:
# Clear all user-defined names from the kernel namespace; -f skips the confirmation prompt.
%reset -f