# Create SYB country fact builder

In [1]:
import csv
import json
import re
import statistics
import math
import os
import hashlib

### User parameters

In [2]:
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

wd_dir = r'../../data/unsd/countryProfiles'
syb_dir = r'../../data/unsd/UNSYB/output'
print('data inputs dir: ' + wd_dir)



# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\notebooks\unsdCountryProfiles
data inputs dir: ../../data/unsd/countryProfiles


## Utilities

#### Commercial rounding

In [3]:
def round_KFM(x, n):
    posneg = math.copysign(1, x)
    z = abs(x)*10**n
    z = z + 0.5
    z = math.trunc(z)
    z = z/10**n
    result = z * posneg
    return result
    
print(round_KFM(3.5123,2))
print(round_KFM(4.0,1))

3.51
4.0


#### Regular expression to capture numeric values (including those in scientific notation)
The regex is

```
-?      # an optional -
\d+     # a series of digits
(?:     # start non capturing group
  \.    # a dot
  \d+   # a series of digits
)?      
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?
```


In [4]:
def numeric_part(v):
    numeric_part_f = re.compile(r'-?\d+(?:\.\d+)?(?:e-?\d+)?')
    x = numeric_part_f.findall(v)
    if len(x) > 0:
        return float(x[0])
    else:
        return None


In [5]:
print(numeric_part('<4'))
print(numeric_part('abs'))
print(numeric_part('-23.2e2'))
print(numeric_part('-23.2e2 or more'))

print(numeric_part('0.01'))

4.0
None
-2320.0
-2320.0
0.01


#### Compute a hash of a dictionary

In [6]:
def dict_hash(d):
    out = hashlib.md5()
    for key, value in d.items():
        out.update(key.encode('utf-8'))
        out.update(str(value).encode('utf-8'))
    return out.hexdigest()


#### Get unique dictionaries in a list

In [7]:
def unique_dicts(dictionary_list):

    uniques_map = {}

    for d in dictionary_list:
        uniques_map[dict_hash(d)] = d

    return list(uniques_map.values())


#### Extract subset of key-value pairs from Python dictionary object

In [8]:
def subdict_list(dict_list, keys_list, exclude = False):
    sub_d_list = []
    if exclude:
        for d in dict_list:
            sub_d= {k: d[k] for k in d.keys() if k not in keys_list}
            sub_d_list.append(sub_d)
    else:
        for d in dict_list:
            sub_d= {k: d[k] for k in keys_list}
            sub_d_list.append(sub_d)
    
    return sub_d_list




#### Get a dict from a list based on something inside the dict

In [9]:
def select_dict(dict_list, k, v):
    selected = []
    for d in dict_list:
        if d[k] == v:
            selected.append(d)
    return selected

### Get SYB Series catalogue

In [18]:
syb_dir = r'..\..\data\unsd\UNSYB\output'

with open(r'..\..\data\unsd\UNSYB\output\SYB_Series_Catalog.json') as json_file:  
    syb_catalogue = json.load(json_file)
    
syb_catalogue[0]

{'topicId': 2,
 'topicNameEN': 'Population and migration',
 'topicNameFR': 'Population et migration',
 'tables': [{'tableCode': 'lifecbmort',
   'tableId': 8,
   'tableName': 'Population growth and indicators of fertility and mortality',
   'tableNote': None,
   'tableStatusId': 3,
   'tbBkCode': 'X03 ',
   'tbBkPrint': 'Y',
   'series': [{'newSeriesName': 'Infant mortality ratio',
     'units': 'Number of deaths per 1,000 live births',
     'multiplier': 0,
     'baseYear': None,
     'SELECT': None,
     'seriesCode': 'SYB011',
     'SYB_series': [253]},
    {'newSeriesName': 'Life expectancy at birth',
     'units': 'Number of years',
     'multiplier': 0,
     'baseYear': None,
     'SELECT': None,
     'seriesCode': 'SYB012',
     'SYB_series': [36, 268, 269]},
    {'newSeriesName': 'Maternal mortality ratio',
     'units': 'Number of deaths per 1,000 population',
     'multiplier': 0,
     'baseYear': None,
     'SELECT': None,
     'seriesCode': 'SYB013',
     'SYB_series': [115

### Get catalogue of SYB items on arcGIS online

In [23]:
with open(r'..\..\data\unsd\unsd_syb_catalogue.json') as json_file:  
    syb_arcgis_catalogue = json.load(json_file)

syb_arcgis_catalogue[0]

{'topic': 2,
 'id': '29deadbc8f8c4ffaa4e39a1309753967',
 'series': [{'seriesName': 'Asylum seekers  including pending cases',
   'publicationYear': '2018',
   'id': 'a1bc495b927c4873b6d314a0ce3b7bd1',
   'owner': 'unstats_admin',
   'created': 1561274445000,
   'modified': 1561310873000,
   'title': 'SYB indicator: Asylum seekers, including pending cases',
   'type': 'Feature Service',
   'tags': ['SYB'],
   'url': 'https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/SYB072_2018/FeatureServer',
   'seriesCode': 'SYB072'},
  {'seriesName': 'Capital city population',
   'publicationYear': '2018',
   'id': '7a7b7cc9beff4fd5b40f8d487a1e23b6',
   'owner': 'unstats_admin',
   'created': 1561274632000,
   'modified': 1561310942000,
   'title': 'SYB indicator: Capital city population',
   'type': 'Feature Service',
   'tags': ['SYB'],
   'url': 'https://services7.arcgis.com/gp50Ao2knMlOM89z/arcgis/rest/services/SYB080_2018/FeatureServer',
   'seriesCode': 'SYB080'},
  {'seriesNa

In [27]:
syb_fact_builder = []

for t1 in syb_catalogue:
    #print("- in topic " + str(t1['topicId']) + ': ' + t1['topicNameEN'])
    
    syb_arcgis_catalogue_s = select_dict(syb_arcgis_catalogue, 'topic', t1['topicId'])[0]['series']
    #display(syb_arcgis_catalogue_s)

    for t2 in t1['tables']:

        for s in t2['series']:
            
            #if s['seriesCode'] == 'SYB030':
            #    continue
            
            hub = ''
            syb_arcgis_catalogue_s = select_dict(syb_arcgis_catalogue_s, 'seriesCode', s['seriesCode'])
            if len(syb_arcgis_catalogue_s)>0:
                hub = syb_arcgis_catalogue_s[0]['id']
            
            display(hub)

            ##--- Read data for this series---#

            filename = 'wide_Topic' + str(t1['topicId']).zfill(2) + '_Table'+ str(t2['tableId']).zfill(3) + '_Series' + s['seriesCode'] + '.json'
            with open(r'../../data/unsd/UNSYB/output/' + filename) as json_file:  
                data = json.load(json_file)

            slices = []
            
            slice_keys = []
            
            all_dimensions = ['units','multiplier', 'baseYear', 'multiplierDesc'
                         'personnelTypeCode','personnelTypeDesc',
                         'activityCode','activityDesc',
                         'speciesCode','speciesDesc',
                         'ODACode','ODADesc',
                         'urbanizationCode','urbanizationDesc',
                         'ageCode','ageDesc',
                         'sexCode','sexDesc',
                         'educationLevCode','educationLevDesc']
            
            for k in list(data['data'][0]['data'][0].keys()):
                if k in all_dimensions:
                    slice_keys.append(k)
            
            s['seriesCode']
            slice_keys
            temp = []
                    
            for g in range(len(data['data'])):
                x = data['data'][g]['data']
                temp.extend(subdict_list(data['data'][g]['data'], slice_keys, exclude = False))

            slices =  unique_dicts(temp)
                
            for i in slices:
                slice = {}
                slice['topicId'] = t1['topicId']
                slice['topicName'] = t1['topicNameEN']
                slice['tableId'] = t2['tableId']
                slice['tableName'] = t2['tableName']
                slice['seriesCode'] = s['seriesCode']
                slice['seriesDesc'] = s['newSeriesName']
                slice['hub'] = hub
                slice['units'] = None
                slice['multiplier'] = None
                slice['baseYear'] = None
                slice['multiplierDesc'] = None
                slice['personnelTypeCode'] = None
                slice['personnelTypeDesc'] = None
                slice['activityCode'] = None
                slice['activityDesc'] = None
                slice['speciesCode'] = None
                slice['speciesDesc'] = None
                slice['ODACode'] = None
                slice['ODADesc'] = None
                slice['urbanizationCode'] = None
                slice['urbanizationDesc'] = None
                slice['ageCode'] = None
                slice['ageDesc'] = None
                slice['sexCode'] = None
                slice['sexDesc'] = None
                slice['educationLevCode'] = None
                slice['educationLevDesc'] = None
                for k,v in i.items():
                    slice[k] = v
                
                syb_fact_builder.append(slice)
                
keys = syb_fact_builder[0].keys()
file = 'syb_fact_builder.csv'

with open( r'../../data/unsd/countryProfiles/' + file, 'w', newline='', encoding='utf-8') as csv_file:  
    dict_writer = csv.DictWriter(csv_file, keys, delimiter='\t')
    dict_writer.writeheader()
    dict_writer.writerows(syb_fact_builder)
    

'b69e704c91974e69b5ae5763f1a49450'

'SYB011'

['units', 'multiplier', 'baseYear']

''

'SYB012'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

''

'SYB013'

['units', 'multiplier', 'baseYear']

''

'SYB014'

['units', 'multiplier', 'baseYear']

''

'SYB015'

['units', 'multiplier', 'baseYear']

''

'SYB072'

['units', 'multiplier', 'baseYear']

''

'SYB073'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

''

'SYB074'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

''

'SYB075'

['units', 'multiplier', 'baseYear']

''

'SYB076'

['units', 'multiplier', 'baseYear']

''

'SYB077'

['units', 'multiplier', 'baseYear']

''

'SYB078'

['units', 'multiplier', 'baseYear']

''

'SYB079'

['units', 'multiplier', 'baseYear']

''

'SYB080'

['units', 'multiplier', 'baseYear']

''

'SYB081'

['urbanizationCode', 'urbanizationDesc', 'units', 'multiplier', 'baseYear']

''

'SYB082'

['urbanizationCode', 'urbanizationDesc', 'units', 'multiplier', 'baseYear']

''

'SYB087'

['ageCode', 'ageDesc', 'units', 'multiplier', 'baseYear']

''

'SYB088'

['units', 'multiplier', 'baseYear']

''

'SYB089'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

''

'SYB090'

['units', 'multiplier', 'baseYear']

''

'SYB091'

['units', 'multiplier', 'baseYear']

'f4ebc92d8aea47bdac090dfa3da29bb2'

'SYB001'

['sexCode',
 'sexDesc',
 'educationLevCode',
 'educationLevDesc',
 'units',
 'multiplier',
 'baseYear']

''

'SYB002'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

''

'SYB108'

['units', 'multiplier', 'baseYear']

''

'SYB109'

['units', 'multiplier', 'baseYear']

''

'SYB110'

['units', 'multiplier', 'baseYear']

''

'SYB111'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

''

'SYB112'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

''

'SYB119'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

''

'SYB120'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

'5ab360904e1d44c88a11240eaaf76843'

'SYB113'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

''

'SYB118'

['units', 'multiplier', 'baseYear']

'16d3087c8d544007ac1aa01c2c755c42'

'SYB105'

['units', 'multiplier', 'baseYear']

''

'SYB106'

['units', 'multiplier', 'baseYear']

''

'SYB107'

['units', 'multiplier', 'baseYear']

'4ea3388567b64f33b3601bac26767f96'

'SYB022'

['activityCode', 'activityDesc', 'units', 'multiplier', 'baseYear']

''

'SYB023'

['units', 'multiplier', 'baseYear']

''

'SYB024'

['units', 'multiplier', 'baseYear']

''

'SYB025'

['units', 'multiplier', 'baseYear']

''

'SYB026'

['units', 'multiplier', 'baseYear']

'a201ac34ff6a47a6a978ca2b3b7f0dfd'

'SYB065'

['units', 'multiplier', 'baseYear']

''

'SYB066'

['units', 'multiplier', 'baseYear']

''

'SYB067'

['units', 'multiplier', 'baseYear']

''

'SYB068'

['units', 'multiplier', 'baseYear']

''

'SYB069'

['units', 'multiplier', 'baseYear']

'053bf43b5a724a5082efde98adebc4ca'

'SYB030'

['activityCode',
 'activityDesc',
 'sexCode',
 'sexDesc',
 'units',
 'multiplier',
 'baseYear']

KeyError: 'sexCode'