In [1]:
import os
import pandas as pd
# import seaborn as sns
from pathlib import Path
import numpy as np

def set_correct_working_dir(working_dir: str):
    """Change the process working directory to the nearest directory named `working_dir`.

    The current working directory is checked first, then each of its parents
    from the closest one outwards. The first match wins.

    Args:
        working_dir: Name (not path) of the directory to switch to.

    Raises:
        FileNotFoundError: If neither the current working directory nor any of
            its parents is named `working_dir`.
    """
    current_working_dir = Path.cwd()
    if current_working_dir.name == working_dir:
        print('WD correct')
        return
    for directory in current_working_dir.parents:
        if directory.name == working_dir:
            os.chdir(directory)
            print('New WD:', str(directory))
            return
    # Say what was searched for and from where, so the failure can be diagnosed.
    raise FileNotFoundError(
        f"No directory named {working_dir!r} found in {current_working_dir} or its parents"
    )

# Switch to the 'geo-locations' directory: the data paths in the following
# cells are built relative to os.getcwd().
set_correct_working_dir('geo-locations')

New WD: /home/avmo/src/projects/covidmap-sweden/geo-locations


### Reading in data

In [2]:
country_code = 'swe'
source_name = 'geonames'
country_path = Path.cwd() / 'data' / country_code
# `source` is the path to the raw tab-separated GeoNames postal code dump.
source = country_path / f'source_{source_name}' / 'SE.txt'

# column description to be found at: https://download.geonames.org/export/zip/
geonames_columns = [
    'country_code', 'postal_code', 'place_name',
    'admin_name_1', 'admin_code_1',
    'admin_name_2', 'admin_code_2',
    'admin_name_3', 'admin_code_3',
    'latitude', 'longitude', 'accuracy',
]
# Free-text columns in which a missing value should read as an empty string.
text_columns = ['place_name', 'admin_name_1', 'admin_name_2', 'admin_name_3']

geonames_df = pd.read_csv(
    source,
    sep='\t',
    names=geonames_columns,
    # Keep postal codes as text ("186 00"); they are normalised in a later cell.
    converters={'postal_code': str},
)
# Only blank out the text columns: filling the whole frame with '' would turn
# any missing latitude/longitude into a string and break numeric aggregation.
geonames_df[text_columns] = geonames_df[text_columns].fillna('')
display(geonames_df.head())

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
0,SE,186 00,Vallentuna,Stockholm,AB,Vallentuna,115,,,59.5344,18.0776,4
1,SE,186 01,Vallentuna,Stockholm,AB,Vallentuna,115,,,59.5344,18.0776,4
2,SE,186 03,Brottby,Stockholm,AB,Vallentuna,115,,,59.5632,18.2403,4
3,SE,186 21,Vallentuna,Stockholm,AB,Vallentuna,115,,,59.5344,18.0776,4
4,SE,186 22,Vallentuna,Stockholm,AB,Vallentuna,115,,,59.5344,18.0776,4


In [4]:
# Convert postal code to integers: "186 00" -> 18600.
# Going through `astype(str)` keeps this cell re-runnable: on a second run the
# column already holds integers, which simply have no whitespace to strip.
geonames_df['postal_code'] = (
    geonames_df['postal_code']
    .astype(str)
    .str.replace(r'\s+', '', regex=True)
    .astype('int64')
)

In [5]:
def _join_unique(values, sep=';'):
    """Join the distinct non-empty strings in `values`, keeping first-seen order.

    `dict.fromkeys` is used instead of `set` so the result does not depend on
    string hash ordering and is therefore identical from run to run.
    """
    return sep.join(dict.fromkeys(value for value in values if value))


# errors='ignore' keeps the cell re-runnable once these columns are already gone.
geonames_df = geonames_df.drop(
    columns=['admin_code_1', 'admin_code_2', 'admin_code_3', 'accuracy'],
    errors='ignore',
)

# Empty string for missing values in string columns, so they can be joined below.
for str_col in ['admin_name_1', 'admin_name_2', 'admin_name_3', 'place_name']:
    geonames_df[str_col] = geonames_df[str_col].fillna('')

# Blank admin_name_3 where it merely repeats admin_name_2.
geonames_df.loc[geonames_df['admin_name_2'] == geonames_df['admin_name_3'], 'admin_name_3'] = ''

# One row per postal code: admin names are de-duplicated and joined with ';',
# place names are joined with '||' (duplicates kept), coordinates are averaged.
geonames_df = (
    geonames_df
    .groupby('postal_code')
    .agg({
        'admin_name_1': _join_unique,
        'admin_name_2': _join_unique,
        'admin_name_3': _join_unique,
        'place_name': lambda col: '||'.join(col),
        'latitude': 'mean',
        'longitude': 'mean',
    })
    .reset_index()
)

# region_id is "admin_1::admin_2[::admin_3]::place_name"; the admin_3 segment
# is left out entirely when admin_name_3 is empty.
region_prefix = geonames_df['admin_name_1'] + '::' + geonames_df['admin_name_2']
has_admin_3 = geonames_df['admin_name_3'] != ''
geonames_df['region_id'] = region_prefix + '::' + geonames_df['place_name']
geonames_df.loc[has_admin_3, 'region_id'] = (
    region_prefix + '::' + geonames_df['admin_name_3'] + '::' + geonames_df['place_name']
)

geonames_df = (
    geonames_df
    .sort_values('postal_code')
    .reset_index(drop=True)
    .round({'latitude': 4, 'longitude': 4})
)
geonames_df.insert(loc=0, column='country_code', value=country_code)

In [6]:
# Inspect the first 50 rows of the aggregated frame.
geonames_df.head(50)

Unnamed: 0,country_code,postal_code,admin_name_1,admin_name_2,admin_name_3,place_name,latitude,longitude,region_id
0,swe,10005,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
1,swe,10012,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
2,swe,10026,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
3,swe,10028,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
4,swe,10029,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
5,swe,10031,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
6,swe,10040,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
7,swe,10041,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
8,swe,10044,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm
9,swe,10052,Stockholm,Stockholm,,Stockholm,59.3326,18.0649,Stockholm::Stockholm::Stockholm


In [7]:
# Inspect the last 50 rows of the aggregated frame.
geonames_df.tail(50)

Unnamed: 0,country_code,postal_code,admin_name_1,admin_name_2,admin_name_3,place_name,latitude,longitude,region_id
16353,swe,98203,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare
16354,swe,98204,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare
16355,swe,98205,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare
16356,swe,98206,Norrbotten,Gällivare,,Nattavaara,66.75,20.95,Norrbotten::Gällivare::Nattavaara
16357,swe,98207,Norrbotten,Gällivare,,Nattavaaraby,67.0571,20.7122,Norrbotten::Gällivare::Nattavaaraby
16358,swe,98220,Norrbotten,,,Gällivare,67.1339,20.6528,Norrbotten::::Gällivare
16359,swe,98221,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare
16360,swe,98222,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare
16361,swe,98228,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare
16362,swe,98231,Norrbotten,Gällivare,,Gällivare,67.1339,20.6528,Norrbotten::Gällivare::Gällivare


In [8]:
# Check the column dtypes before writing the frame to disk.
geonames_df.dtypes

country_code     object
postal_code       int64
admin_name_1     object
admin_name_2     object
admin_name_3     object
place_name       object
latitude        float64
longitude       float64
region_id        object
dtype: object

In [9]:
# Drop the human-readable name columns before export. errors='ignore' drops
# whichever of them are still present, so the cell can be re-run safely.
geonames_df = geonames_df.drop(
    columns=['admin_name_1', 'admin_name_2', 'admin_name_3', 'place_name'],
    errors='ignore',
)
# Written to data/<country_code>/<country_code>_geocoding.csv without the index.
file_path = country_path / f'{country_code}_geocoding.csv'
geonames_df.to_csv(file_path, index=False)