# Create SYB country fact builder

In [1]:
import csv
import json
import re
import statistics
import math
import os
import hashlib

### User parameters

In [2]:
# Directory the notebook is running from. '__file__' is a quoted string here,
# not the module attribute, so realpath() resolves it against the current
# working directory and dirname() then yields that directory.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Data locations, relative to the notebook's working directory.
# NOTE(review): the cells below spell these paths out literally instead of
# reading wd_dir / syb_dir - keep them in sync if the layout changes.
wd_dir = r'../../data/unsd/countryProfiles'
syb_dir = r'../../data/unsd/UNSYB/output'
print('data inputs dir: ' + wd_dir)



# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
# Show the value of every top-level expression statement in a cell, not only
# the last one. This is a kernel-wide setting and affects all later cells.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\notebooks\unsdCountryProfiles
data inputs dir: ../../data/unsd/countryProfiles


## Utilities

#### Commercial rounding

In [3]:
def round_KFM(x, n):
    """Round ``x`` to ``n`` decimal places, with halves going away from zero.

    The magnitude of ``x`` is scaled by ``10**n``, shifted by one half and
    truncated; the sign of ``x`` is re-applied at the end.

    Parameters:
        x: number to round.
        n: number of decimal places to keep.

    Returns:
        The rounded value as a float.
    """
    scale = 10**n
    sign = math.copysign(1, x)
    # Adding 0.5 before truncating rounds the scaled magnitude half-up.
    shifted = math.trunc(abs(x)*scale + 0.5)
    return shifted/scale * sign

# Quick sanity checks
for sample, places in ((3.5123, 2), (4.0, 1)):
    print(round_KFM(sample, places))

3.51
4.0


#### Regular expression to capture numeric values (including those in scientific notation)
The regex is

```
-?      # an optional -
\d+     # a series of digits
(?:     # start non capturing group
  \.    # a dot
  \d+   # a series of digits
)?      
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?
```


In [4]:
def numeric_part(v):
    """Return the first number found in the string ``v`` as a float.

    A number is an optional minus sign, digits, an optional decimal part and
    an optional lowercase ``e`` exponent (itself with an optional minus sign).

    Parameters:
        v: text to scan.

    Returns:
        The first match converted to float, or None when ``v`` holds no match.
    """
    first_match = re.search(r'-?\d+(?:\.\d+)?(?:e-?\d+)?', v)
    return float(first_match.group(0)) if first_match else None


In [5]:
# Exercise numeric_part on a qualifier-prefixed value, a non-numeric string,
# scientific notation (with and without trailing text) and a plain decimal.
for sample_text in ['<4', 'abs', '-23.2e2', '-23.2e2 or more', '0.01']:
    print(numeric_part(sample_text))

4.0
None
-2320.0
-2320.0
0.01


#### Compute a hash of a dictionary

In [6]:
def dict_hash(d):
    """Return an MD5 hex digest identifying the contents of dictionary ``d``.

    The digest is used as a fingerprint for de-duplication, not for security.
    Dictionaries holding the same key/value pairs produce the same digest
    regardless of the order in which the keys were inserted.

    Parameters:
        d: dictionary to fingerprint. Keys and values are converted with
           ``str()`` before hashing, so values with identical string forms
           (e.g. ``1`` and ``'1'``) are not told apart.

    Returns:
        A 32-character hexadecimal string.
    """
    out = hashlib.md5()
    # Sort by key so that insertion order does not influence the digest.
    for key, value in sorted(d.items(), key=lambda item: str(item[0])):
        for part in (str(key), str(value)):
            encoded = part.encode('utf-8')
            # Length-prefix every field: without a boundary marker,
            # {'ab': 'c'} and {'a': 'bc'} would feed identical bytes.
            out.update(str(len(encoded)).encode('ascii') + b':')
            out.update(encoded)
    return out.hexdigest()


#### Get unique dictionaries in a list

In [7]:
def unique_dicts(dictionary_list):
    """Return the distinct dictionaries of ``dictionary_list``.

    Dictionaries are considered equal when ``dict_hash`` gives them the same
    digest. Each distinct digest keeps the position of its first occurrence
    and the dictionary object of its last occurrence.
    """
    by_digest = {dict_hash(entry): entry for entry in dictionary_list}
    return list(by_digest.values())


#### Extract a subset of key-value pairs from each dictionary in a list

In [8]:
def subdict_list(dict_list, keys_list, exclude = False):
    """Project every dictionary in ``dict_list`` onto a subset of its keys.

    Parameters:
        dict_list: list of dictionaries to project.
        keys_list: keys to keep (default) or to drop (``exclude=True``).
        exclude: when True, keep every key that is NOT in ``keys_list``.

    Returns:
        A new list with one new dictionary per input dictionary.

    Raises:
        KeyError: in keep mode, when a dictionary lacks one of ``keys_list``.
    """
    if exclude:
        return [
            {name: val for name, val in record.items() if name not in keys_list}
            for record in dict_list
        ]
    return [{name: record[name] for name in keys_list} for record in dict_list]




#### Select the dictionaries in a list whose value for a given key matches

In [14]:
def select_dict(dict_list, k, v):
    """Return every dictionary in ``dict_list`` whose value under ``k`` equals ``v``.

    Parameters:
        dict_list: list of dictionaries to filter.
        k: key to look up in each dictionary.
        v: value the key must hold for the dictionary to be selected.

    Returns:
        A list (possibly empty) of the matching dictionaries, in input order.

    Raises:
        KeyError: when a dictionary does not contain ``k``.
    """
    return [candidate for candidate in dict_list if candidate[k] == v]

### Get SYB Series catalogue

In [29]:
# Load the SYB series catalogue (topics -> tables -> series).
# Forward slashes keep the relative path usable on every platform; the
# previous backslash-separated path only resolved on Windows.
with open(r'../../data/unsd/UNSYB/output/SYB_Series_Catalog.json') as json_file:
    syb_catalogue = json.load(json_file)
    
#syb_catalogue[0]

### Get catalogue of SYB items on ArcGIS Online

In [30]:

# Load the catalogue of SYB items published on ArcGIS Online.
# Forward slashes keep the relative path usable on every platform; the
# previous backslash-separated path only resolved on Windows.
with open(r'../../data/unsd/UNSYB/output/unsd_syb_catalogue_201907.json') as json_file:
    syb_arcgis_catalogue = json.load(json_file)

#syb_arcgis_catalogue[0]

In [31]:
# Build the SYB "fact builder": one row per series and per distinct
# combination of dimension values found in that series' data file, then write
# the rows to a tab-delimited file.

# Dimension columns that may qualify a series value.
# (A comma was missing after 'multiplierDesc', which silently merged it with
# 'personnelTypeCode' into one bogus name so neither was ever detected.)
all_dimensions = ['units', 'multiplier', 'baseYear', 'multiplierDesc',
                  'personnelTypeCode', 'personnelTypeDesc',
                  'activityCode', 'activityDesc',
                  'speciesCode', 'speciesDesc',
                  'ODACode', 'ODADesc',
                  'urbanizationCode', 'urbanizationDesc',
                  'ageCode', 'ageDesc',
                  'sexCode', 'sexDesc',
                  'educationLevCode', 'educationLevDesc']

# Output columns, in order: identification columns first, then every
# dimension (left as None when a series does not use it).
fact_columns = ['topicId', 'topicName', 'tableId', 'tableName',
                'seriesCode', 'seriesDesc', 'hub'] + all_dimensions

syb_fact_builder = []

for t1 in syb_catalogue:

    # Series published on ArcGIS Online for this topic. The [0] raises
    # IndexError when the topic is absent from the ArcGIS catalogue.
    syb_arcgis_catalogue_s = select_dict(syb_arcgis_catalogue, 'topic', t1['topicId'])[0]['series']

    for t2 in t1['tables']:

        for s in t2['series']:

            if s['seriesCode'] == 'SYB030':  #This series is missing 'sex code'
                continue
            if s['seriesCode'] in ['SYB083','SYB084']:  #This series is missing data
                continue

            print(s['seriesCode'])

            # ArcGIS item id of the series; stays empty when it is not published.
            hub = ''
            syb_arcgis_catalogue_s2 = select_dict(syb_arcgis_catalogue_s, 'seriesCode', s['seriesCode'])
            if len(syb_arcgis_catalogue_s2)>0:
                hub = syb_arcgis_catalogue_s2[0]['id']

            display(hub)

            ##--- Read data for this series---#

            filename = 'wide_Topic' + str(t1['topicId']).zfill(2) + '_Table'+ str(t2['tableId']).zfill(3) + '_Series' + s['seriesCode'] + '.json'
            with open(r'../../data/unsd/UNSYB/output/' + filename) as json_file:
                data = json.load(json_file)

            # Dimensions used by this series, detected from the first record
            # of the first group in the data file.
            slice_keys = [k for k in data['data'][0]['data'][0].keys() if k in all_dimensions]
            display(slice_keys)

            # Collect the dimension values of every record in every group,
            # then keep each distinct combination once.
            temp = []
            for group in data['data']:
                temp.extend(subdict_list(group['data'], slice_keys, exclude = False))

            slices = unique_dicts(temp)

            for i in slices:
                # Start from an all-None row so every output row has the same
                # columns, then fill in identification and dimension values.
                fact_row = dict.fromkeys(fact_columns)
                fact_row['topicId'] = t1['topicId']
                fact_row['topicName'] = t1['topicNameEN']
                fact_row['tableId'] = t2['tableId']
                fact_row['tableName'] = t2['tableName']
                fact_row['seriesCode'] = s['seriesCode']
                fact_row['seriesDesc'] = s['newSeriesName']
                fact_row['hub'] = hub
                fact_row.update(i)

                syb_fact_builder.append(fact_row)

# Header comes from the fixed column list, so it does not depend on at least
# one row having been produced.
keys = fact_columns
file = 'syb_fact_builder_201907.csv'

# The file is tab-delimited despite its .csv extension.
with open( r'../../data/unsd/countryProfiles/' + file, 'w', newline='', encoding='utf-8') as csv_file:
    dict_writer = csv.DictWriter(csv_file, keys, delimiter='\t')
    dict_writer.writeheader()
    dict_writer.writerows(syb_fact_builder)
    

SYB011


'b69e704c91974e69b5ae5763f1a49450'

'SYB011'

['units', 'multiplier', 'baseYear']

SYB012


'3aa919d7703a4ecfa9d55bac8694343f'

'SYB012'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB013


'0a5482be9b574d9294c03089fe52803f'

'SYB013'

['units', 'multiplier', 'baseYear']

SYB014


'f6f16a88b56840249b4439bbea05dba4'

'SYB014'

['units', 'multiplier', 'baseYear']

SYB015


'bbb7dbea77a9464c883e3605eca93d51'

'SYB015'

['units', 'multiplier', 'baseYear']

SYB072


'a1bc495b927c4873b6d314a0ce3b7bd1'

'SYB072'

['units', 'multiplier', 'baseYear']

SYB073


'd7f798f2a6cd46e2b6d9c09065f03de4'

'SYB073'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB074


'1b32a522b98748ac8c040c4567b4061a'

'SYB074'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB075


'a96325f096344e99a1b44d8a5c891b9e'

'SYB075'

['units', 'multiplier', 'baseYear']

SYB076


'772a41a4477f4b1b81409dbad8fbac7a'

'SYB076'

['units', 'multiplier', 'baseYear']

SYB077


'592ed636d538491593ed9de1e2c3f47b'

'SYB077'

['units', 'multiplier', 'baseYear']

SYB078


'133574278815466b886afee49f3206b9'

'SYB078'

['units', 'multiplier', 'baseYear']

SYB079


'966aa129360f4f9a8081dd4d0d4e219f'

'SYB079'

['units', 'multiplier', 'baseYear']

SYB080


'7a7b7cc9beff4fd5b40f8d487a1e23b6'

'SYB080'

['units', 'multiplier', 'baseYear']

SYB081


'b95285b0546841fa917c197a6dd73c1e'

'SYB081'

['urbanizationCode', 'urbanizationDesc', 'units', 'multiplier', 'baseYear']

SYB082


'c7e749b42fda4c84b39cf8fa5d038371'

'SYB082'

['urbanizationCode', 'urbanizationDesc', 'units', 'multiplier', 'baseYear']

SYB087


'493ee3f35221493fa582b807034fe488'

'SYB087'

['ageCode', 'ageDesc', 'units', 'multiplier', 'baseYear']

SYB088


'566287a1c8f442b389bcef3b2e9454d4'

'SYB088'

['units', 'multiplier', 'baseYear']

SYB089


'7e77f35a3ebb49c29d9fc8c10f825f33'

'SYB089'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB090


'1e28bd6f12a942deb823d4cf809b6756'

'SYB090'

['units', 'multiplier', 'baseYear']

SYB091


'875ce0c9115b46e6a9fe2471e312ea03'

'SYB091'

['units', 'multiplier', 'baseYear']

SYB001


'f4ebc92d8aea47bdac090dfa3da29bb2'

'SYB001'

['sexCode',
 'sexDesc',
 'educationLevCode',
 'educationLevDesc',
 'units',
 'multiplier',
 'baseYear']

SYB002


'4aa9ce6bd1cf48deab7dcf9152c404a1'

'SYB002'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

SYB108


'7a57cd1c85464609ab8af5ce418cdf03'

'SYB108'

['units', 'multiplier', 'baseYear']

SYB109


'49f827b7121940cf8a472e62f6612048'

'SYB109'

['units', 'multiplier', 'baseYear']

SYB110


'5f92a363a2a94684bd0c89361ba52ce9'

'SYB110'

['units', 'multiplier', 'baseYear']

SYB111


'f1c6471ca64b4740bc3f9046073381ef'

'SYB111'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

SYB112


'e3d44eb52a9d495c821786cfc5723681'

'SYB112'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

SYB119


'c6fd3b6933f54d54af3bc10e344f3f70'

'SYB119'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

SYB120


'e39123d0e03d4a5d8c29369e5ac2f732'

'SYB120'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

SYB113


'5ab360904e1d44c88a11240eaaf76843'

'SYB113'

['educationLevCode', 'educationLevDesc', 'units', 'multiplier', 'baseYear']

SYB118


'a4862bdf37334c79a9e3344f843ddac1'

'SYB118'

['units', 'multiplier', 'baseYear']

SYB105


'16d3087c8d544007ac1aa01c2c755c42'

'SYB105'

['units', 'multiplier', 'baseYear']

SYB106


''

'SYB106'

['units', 'multiplier', 'baseYear']

SYB107


''

'SYB107'

['units', 'multiplier', 'baseYear']

SYB022


'4ea3388567b64f33b3601bac26767f96'

'SYB022'

['activityCode', 'activityDesc', 'units', 'multiplier', 'baseYear']

SYB023


'5e962982b65c47e2adaba44f367c9845'

'SYB023'

['units', 'multiplier', 'baseYear']

SYB024


'77621dfd97e94273a1405e960582854f'

'SYB024'

['units', 'multiplier', 'baseYear']

SYB025


'5ec9f38def1247ebbcee2557fc0c3a9b'

'SYB025'

['units', 'multiplier', 'baseYear']

SYB026


'18378adffb914440b6f6ae8579b9706d'

'SYB026'

['units', 'multiplier', 'baseYear']

SYB065


'a201ac34ff6a47a6a978ca2b3b7f0dfd'

'SYB065'

['units', 'multiplier', 'baseYear']

SYB066


'12fb78721cdc4d678a74e58006b131cf'

'SYB066'

['units', 'multiplier', 'baseYear']

SYB067


'fab67e6783d14d1988ff7ed19bea77ef'

'SYB067'

['units', 'multiplier', 'baseYear']

SYB068


'9b697a11f8084ac682dea3f55b2b088f'

'SYB068'

['units', 'multiplier', 'baseYear']

SYB069


'2ab2182d60cc430c92c51ce791ac7f3d'

'SYB069'

['units', 'multiplier', 'baseYear']

SYB092


'646314aeef1b42bc9de3a49902eb5d98'

'SYB092'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB093


'4a000e35f2fb4699a79729f50a189b20'

'SYB093'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB027


'3e6c2522bf274ac1930b60e124ff46fd'

'SYB027'

['activityCode', 'activityDesc', 'units', 'multiplier', 'baseYear']

SYB028


'b980c23d110d443f82c8bfbf2d8a13e7'

'SYB028'

['units', 'multiplier', 'baseYear']

SYB029


'070efd18b12049899372bda29e3f6205'

'SYB029'

['units', 'multiplier', 'baseYear']

SYB094


'd0ed6d76f6df4b58855d99661319b0f0'

'SYB094'

['units', 'multiplier', 'baseYear']

SYB095


'a27b203621164e85857ffc85a5392703'

'SYB095'

['units', 'multiplier', 'baseYear']

SYB098


'4294a38ad217408b9c426d923359875c'

'SYB098'

['units', 'multiplier', 'baseYear']

SYB099


'09f5f5b86d41446dae119948a3974cfd'

'SYB099'

['units', 'multiplier', 'baseYear']

SYB100


'2f2aa9381c5b4e93984c31b38f56d785'

'SYB100'

['units', 'multiplier', 'baseYear']

SYB101


'b31ac79e3dc44162a9942a4d4bcb1183'

'SYB101'

['sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB102


'04047e5125f84d8c98aa50821ea64767'

'SYB102'

['units', 'multiplier', 'baseYear']

SYB103


'e64a546a527f4ecc96313cf0b4b8f792'

'SYB103'

['units', 'multiplier', 'baseYear']

SYB104


'cfa16e075f8a4886b799f67a2cfba067'

'SYB104'

['units', 'multiplier', 'baseYear']

SYB003


'5157cb78563b4da4936c39efda0be6c3'

'SYB003'

['speciesCode', 'speciesDesc', 'units', 'multiplier', 'baseYear']

SYB045


'ddf9aaf75c8049688634c59e4a47a53a'

'SYB045'

['units', 'multiplier', 'baseYear']

SYB046


'7038b4b16ad0487186524da1df58b413'

'SYB046'

['units', 'multiplier', 'baseYear']

SYB047


'686bb7cb792748a29b203842f553f249'

'SYB047'

['units', 'multiplier', 'baseYear']

SYB048


'6bc0bf63f61a45fdbb01ac2a5692ee40'

'SYB048'

['units', 'multiplier', 'baseYear']

SYB049


'48a4a84dbdd640f48dea6795bb6e0f1f'

'SYB049'

['units', 'multiplier', 'baseYear']

SYB050


'be9f976cde964d02b38168123b478d66'

'SYB050'

['units', 'multiplier', 'baseYear']

SYB051


'b6e78096d8024588a0cce59d729bc154'

'SYB051'

['units', 'multiplier', 'baseYear']

SYB052


'b03884cce5cd4c6a8d713fb95fe16846'

'SYB052'

['units', 'multiplier', 'baseYear']

SYB085


'a5a5fa906d2143919125fe9cbd1c669f'

'SYB085'

['urbanizationCode', 'urbanizationDesc', 'units', 'multiplier', 'baseYear']

SYB086


'8bd9d377cbec4f948ce64c2e3debde30'

'SYB086'

['urbanizationCode', 'urbanizationDesc', 'units', 'multiplier', 'baseYear']

SYB123


'd3581d327da343e8bfd54e32b41ac565'

'SYB123'

['units', 'multiplier', 'baseYear']

SYB124


'f3014b6e5da449cbb2ddee83cb8b0080'

'SYB124'

['units', 'multiplier', 'baseYear']

SYB004


'516b158641f445a2a982a46bfdb852a3'

'SYB004'

['units', 'multiplier', 'baseYear']

SYB005


'200f2498b38942d994f5acaf07628f02'

'SYB005'

['units', 'multiplier', 'baseYear']

SYB006


'5d99ab6eba2a477f90088daf76a88be8'

'SYB006'

['units', 'multiplier', 'baseYear']

SYB007


'c7854288ecb84eeabcf5f798a37d9bfa'

'SYB007'

['units', 'multiplier', 'baseYear']

SYB008


'5df5708e8bad4269b2b780ed038bcf74'

'SYB008'

['units', 'multiplier', 'baseYear']

SYB009


'da7613fc860f46c28a8a4fda55564890'

'SYB009'

['units', 'multiplier', 'baseYear']

SYB010


'aab8b5297c4b4d40bc24258e000a5f4b'

'SYB010'

['units', 'multiplier', 'baseYear']

SYB058


'0155a072f23c40639c9c1f65b9667d89'

'SYB058'

['personnelTypeDesc', 'sexCode', 'sexDesc', 'units', 'multiplier', 'baseYear']

SYB059


'e44a82d6b21e41d790ea221489e2e05a'

'SYB059'

['units', 'multiplier', 'baseYear']

SYB060


'926e211d51d74533a30b4be0f429969f'

'SYB060'

['units', 'multiplier', 'baseYear']

SYB061


'55e5021e5ed54ef693b0a3e83d463468'

'SYB061'

['units', 'multiplier', 'baseYear']

SYB016


'98217db641c4408ebbfbd466bc435ed1'

'SYB016'

['units', 'multiplier', 'baseYear']

SYB017


'5d20115697c64d96bfb5c5d8f93a9397'

'SYB017'

['units', 'multiplier', 'baseYear']

SYB018


'c40c433ca8bf42268a279d4edaa4a129'

'SYB018'

['units', 'multiplier', 'baseYear']

SYB019


'd15f3b7424a841d5b189f658cf1c868e'

'SYB019'

['units', 'multiplier', 'baseYear']

SYB020


'e7076265cb3e4f749beabd9013df9231'

'SYB020'

['units', 'multiplier', 'baseYear']

SYB021


'35980a3e474845119427797aac070971'

'SYB021'

['units', 'multiplier', 'baseYear']

SYB062


'af3fcb6e186d4cc1b8ff1aea91147496'

'SYB062'

['units', 'multiplier', 'baseYear']

SYB063


'853d6d5eab0a4176902118b65863a2c3'

'SYB063'

['units', 'multiplier', 'baseYear']

SYB064


'f88839ade39842a1906c88053823bc63'

'SYB064'

['units', 'multiplier', 'baseYear']

SYB031


'c7368c21d75947f88fed9801e6477a3c'

'SYB031'

['units', 'multiplier', 'baseYear']

SYB032


'39dc2a917748444493018eab251e5774'

'SYB032'

['units', 'multiplier', 'baseYear']

SYB033


'685f45c354964e199c6f7b043ffbff14'

'SYB033'

['units', 'multiplier', 'baseYear']

SYB034


'1c35d274546f460894ebb842389de646'

'SYB034'

['units', 'multiplier', 'baseYear']

SYB035


'9dacceba74a14ddeb991f2c4b5f8a084'

'SYB035'

['units', 'multiplier', 'baseYear']

SYB036


'67e7c4507f504944a1e95ed64da925f4'

'SYB036'

['units', 'multiplier', 'baseYear']

SYB037


'628af69abf17412c9ed41531cf5e43a0'

'SYB037'

['units', 'multiplier', 'baseYear']

SYB038


'daf06dca87624c4f9d07f9e99a740782'

'SYB038'

['units', 'multiplier', 'baseYear']

SYB039


'e41b385906304f4398f737e65ecc2395'

'SYB039'

['units', 'multiplier', 'baseYear']

SYB040


'562b628eb16b4241bab20acd1d1eae28'

'SYB040'

['units', 'multiplier', 'baseYear']

SYB041


'c9ba0efc5b13471f8de72bb83adec724'

'SYB041'

['units', 'multiplier', 'baseYear']

SYB042


'13179ba04be9443186c9f5edf9dad3f1'

'SYB042'

['units', 'multiplier', 'baseYear']

SYB043


'5a449c541b25476d9ba1deb3f7136038'

'SYB043'

['units', 'multiplier', 'baseYear']

SYB044


'1a2f203243f24fcf892254d87c8b3403'

'SYB044'

['units', 'multiplier', 'baseYear']

SYB114


'e5f6c8d46ef94732a08edb7f03811dee'

'SYB114'

['units', 'multiplier', 'baseYear']

SYB115


'4c2c3e6e76de44078804d07868347973'

'SYB115'

['units', 'multiplier', 'baseYear']

SYB070


'a95c6bddae19483f9ae696ee6473cfeb'

'SYB070'

['ODACode', 'ODADesc', 'units', 'multiplier', 'baseYear']

SYB071


'868fe94fbfef470889b0c51a2f919fcc'

'SYB071'

['ODACode', 'ODADesc', 'units', 'multiplier', 'baseYear']

SYB096


'79dc5d6c0e424fb2bcb19ec77a742e07'

'SYB096'

['units', 'multiplier', 'baseYear']

SYB097


'8065c12d5211488c8641024ba14688f1'

'SYB097'

['units', 'multiplier', 'baseYear']

SYB053


'c1e65b470517439aa20cc885f9c00a79'

'SYB053'

['units', 'multiplier', 'baseYear']

SYB054


'd78a993f8f804607be4955d1217b85ee'

'SYB054'

['units', 'multiplier', 'baseYear']

SYB055


'17d2e797402744f5957179e55ae8a53c'

'SYB055'

['units', 'multiplier', 'baseYear']

SYB056


'fbbfb40c9c764ddbae5c618639a99685'

'SYB056'

['units', 'multiplier', 'baseYear']

SYB057


'c8e641cb9791484f8c4a2228967d999d'

'SYB057'

['units', 'multiplier', 'baseYear']

SYB116


'565037320392406681529c7afd3bbeb4'

'SYB116'

['units', 'multiplier', 'baseYear']

SYB117


'79729a8c39d743c7bab4a10a842d4a12'

'SYB117'

['units', 'multiplier', 'baseYear']

SYB121


'46b0b285f2f5495d851cb23659ad0133'

'SYB121'

['personnelTypeDesc', 'units', 'multiplier', 'baseYear']

SYB122


'6ab9cedce0e5488bba16195f651edd5e'

'SYB122'

['personnelTypeDesc', 'units', 'multiplier', 'baseYear']