# Create country profiles

In [1]:
import csv
import decimal
import hashlib
import json
import math
import os
import re
import statistics

### User parameters

In [22]:
# '__file__' is a plain string here, so realpath() resolves it against the current
# working directory and dirname() then yields that directory.
dir_path = os.path.dirname(
    os.path.realpath('__file__')
)

# Relative locations of the input data used by the rest of the notebook.
syb_dir = r'../../data/unsd/UNSYB/output'
wd_dir = r'../../data/unsd/countryProfiles'

print(dir_path)
print('data inputs dir: ' + wd_dir)

# Display the value of every expression statement in a cell, not only the last one
# (tip from https://volderette.de/jupyter-notebook-tip-multiple-outputs/).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\notebooks\unsdCountryProfiles
data inputs dir: ../../data/unsd/countryProfiles


## Utilities

#### Commercial rounding

In [3]:
def round_KFM(x, n):
    """Round ``x`` to ``n`` decimal places using commercial (half-away-from-zero) rounding.

    The rounding is carried out on the shortest decimal representation of ``x``
    (``str(x)``) rather than on its binary floating-point value. Scaling a float
    by ``10**n`` and adding 0.5 can land just below the tie (e.g. 1.005 * 100 is
    100.49999999999999), which would round a displayed "…5" down instead of up.

    Parameters
    ----------
    x : int or float
        Value to round.
    n : int
        Number of decimal places to keep.

    Returns
    -------
    float
        The rounded value, carrying the sign of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is NaN.
    OverflowError
        If ``x`` is infinite.
    """
    if math.isnan(x):
        raise ValueError("cannot round NaN")
    if math.isinf(x):
        raise OverflowError("cannot round infinity")

    # 10**-n as a Decimal, e.g. n=2 -> Decimal('0.01'); this fixes the target exponent.
    quantum = decimal.Decimal(1).scaleb(-n)

    # A wide-precision context so that large magnitudes do not exceed the default
    # 28 significant digits while being quantized.
    wide_context = decimal.Context(prec=400, rounding=decimal.ROUND_HALF_UP)

    rounded = decimal.Decimal(str(x)).quantize(quantum, context=wide_context)
    return float(rounded)

print(round_KFM(3.5123,2))
print(round_KFM(4.0,1))

3.51
4.0


#### Regular expression to capture numeric values (including those in scientific notation)
The regex is

```
-?      # an optional -
\d+     # a series of digits
(?:     # start non capturing group
  \.    # a dot
  \d+   # a series of digits
)?      
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?
```


In [4]:
# Compiled once when the cell runs instead of on every call.
_NUMERIC_PART_RE = re.compile(r'''
    -?                   # an optional minus sign
    \d+                  # a series of digits
    (?:\.\d+)?           # optional fractional part: a dot and more digits
    (?:[eE][+-]?\d+)?    # optional exponent: "e" or "E", optional sign, digits
''', re.VERBOSE)


def numeric_part(v):
    """Return the first number found in ``v`` as a float, or ``None`` if there is none.

    Recognises an optional leading minus sign, an optional fractional part and an
    optional exponent. The exponent may be written with ``e`` or ``E`` and may
    carry an explicit ``+`` or ``-`` sign, so ``'1.5e+3'`` yields ``1500.0``.

    Parameters
    ----------
    v : str or None
        Text to scan (e.g. ``'<4'`` or ``'-23.2e2 or more'``). Values that are
        not strings are converted with ``str()`` before scanning.

    Returns
    -------
    float or None
        The first numeric token in ``v``; ``None`` when ``v`` is ``None`` or
        contains no digits.
    """
    if v is None:
        return None

    match = _NUMERIC_PART_RE.search(str(v))
    if match is None:
        return None
    return float(match.group(0))


In [5]:
# Quick sanity check of numeric_part on a few representative inputs.
for sample in ('<4', 'abs', '-23.2e2', '-23.2e2 or more', '0.01'):
    print(numeric_part(sample))

4.0
None
-2320.0
-2320.0
0.01


#### Compute a hash of a dictionary

In [6]:
def dict_hash(d):
    """Return the MD5 hex digest of a dictionary's keys and values.

    Each key and value is fed to the hash in the dictionary's iteration order,
    encoded as UTF-8. Keys and values that are not strings (numbers, ``None``,
    ...) are converted with ``str()`` first instead of raising
    ``AttributeError``.

    Parameters
    ----------
    d : dict
        Dictionary to fingerprint.

    Returns
    -------
    str
        Hexadecimal MD5 digest.

    Notes
    -----
    Keys and values are concatenated without separators, so dictionaries whose
    concatenated text is identical (e.g. ``{'ab': 'c'}`` and ``{'a': 'bc'}``)
    share a digest. Dictionaries with the same pairs in a different insertion
    order produce different digests.
    """
    digest = hashlib.md5()
    for key, value in d.items():
        digest.update(str(key).encode('utf-8'))
        digest.update(str(value).encode('utf-8'))
    return digest.hexdigest()


#### Get unique dictionaries in a list

In [7]:
def unique_dicts(dictionary_list):
    """Return the distinct dictionaries of ``dictionary_list``.

    Dictionaries are considered equal when ``dict_hash`` gives them the same
    digest. For each digest the last dictionary seen is kept, placed at the
    position where that digest first appeared.
    """
    by_digest = {dict_hash(entry): entry for entry in dictionary_list}
    return list(by_digest.values())


#### Extract a subset of key-value pairs from each dictionary in a list

In [8]:
def subdict_list(dict_list, keys_list, exclude = False):
    """Project every dictionary in ``dict_list`` onto a subset of its keys.

    Parameters
    ----------
    dict_list : list of dict
        Dictionaries to project.
    keys_list : collection
        Keys to keep (default) or to drop (when ``exclude`` is true).
    exclude : bool, optional
        If false, each result holds exactly the keys in ``keys_list`` and a
        missing key raises ``KeyError``. If true, each result holds every key
        of the source dictionary that is not in ``keys_list``.

    Returns
    -------
    list of dict
        One new dictionary per input dictionary, in the same order.
    """
    if exclude:
        return [
            {key: value for key, value in record.items() if key not in keys_list}
            for record in dict_list
        ]

    return [{key: record[key] for key in keys_list} for record in dict_list]




#### Select the dicts in a list whose value for a given key equals a given value

In [9]:
def select_dict(dict_list, k, v):
    """Return the dictionaries of ``dict_list`` whose entry for key ``k`` equals ``v``.

    The matching dictionaries are returned in their original order (the same
    objects, not copies). A dictionary that lacks key ``k`` raises ``KeyError``.
    """
    return [record for record in dict_list if record[k] == v]

### Create list of countries

In [10]:
# Read the tab-delimited reference-area file into a list of dicts, one per row.
# The path uses forward slashes (like the paths in the user-parameters cell) so it
# resolves on Windows as well as on Linux/macOS; backslashes only work on Windows.
countryArray = []
with open('../../globalResources/refAreas.txt', newline = '') as country_file:
    countryListXY = csv.DictReader(country_file, delimiter='\t')
    for row in countryListXY:
        countryArray.append(dict(row))

# Show one sample record and the number of rows read.
print(countryArray[1])

print("Total number or countries: ", len(countryArray))

{'X': '20.06660928', 'Y': '41.13897007', 'ISO3': 'ALB', 'M49': '8', 'areaName': 'Albania', 'UN_Member': '1', 'Country_Profile': '1'}
Total number or countries:  248


### Get catalogue of UN SYB series

In [25]:
# Folder holding the UN SYB outputs. Forward slashes keep the path usable on
# Windows as well as on Linux/macOS (backslashes only work on Windows).
syb_dir = '../../data/unsd/UNSYB/output'

# Build the file path from syb_dir rather than repeating the folder literal.
with open(os.path.join(syb_dir, 'SYB_Series_Catalog.json')) as json_file:
    syb_catalogue = json.load(json_file)

# Inspect the nesting of the catalogue: topic -> tables -> series.
syb_catalogue[0].keys()
syb_catalogue[0]['tables'][0].keys()
syb_catalogue[0]['tables'][0]['series'][0].keys()
syb_catalogue[0]['tables'][0]['series'][0]

dict_keys(['topicId', 'topicNameEN', 'topicNameFR', 'tables'])

dict_keys(['tableCode', 'tableId', 'tableName', 'tableNote', 'tableStatusId', 'tbBkCode', 'tbBkPrint', 'series'])

dict_keys(['newSeriesName', 'units', 'multiplier', 'baseYear', 'SELECT', 'seriesCode', 'SYB_series'])

{'newSeriesName': 'Infant mortality ratio',
 'units': 'Number of deaths per 1,000 live births',
 'multiplier': 0,
 'baseYear': None,
 'SELECT': None,
 'seriesCode': 'SYB011',
 'SYB_series': [253]}

### Get catalogue of UNSD items on SDG open data groups

In [26]:
# Load the catalogue of UNSD items. Forward slashes keep the relative path usable
# on Windows as well as on Linux/macOS (backslashes only work on Windows).
with open('../../data/unsd/unsd_catalogue.json') as json_file:
    hub_catalogue = json.load(json_file)

#hub_catalogue[0]['series'][0]

## Create SYB country facts

In [12]:
# Inspect the first entry of the country list: its field names, then the full record.
countryArray[0].keys()
countryArray[0]

{'X': '66.02688198',
 'Y': '33.83160199',
 'ISO3': 'AFG',
 'M49': '4',
 'areaName': 'Afghanistan',
 'UN_Member': '1',
 'Country_Profile': '1'}

In [35]:
# Walk the SYB catalogue (topic -> table -> series) for each selected country.
count_country = 0

for this_country in countryArray:
    # Only the area with M49 code '4' is processed, and only if it is flagged
    # as having a country profile.
    if this_country['M49'] != '4' or this_country['Country_Profile'] != '1':
        continue

    print('Building SYB country facts for: ' + this_country['M49'] + '-' + this_country['areaName'])

    count_country += 1

    country_code = str(this_country['M49'])
    country_name = this_country['areaName']

    # Header information for this country's fact sheet.
    country_syb_facts = {
        'country_code': country_code,
        'country_name': country_name,
        'UNMember': this_country['UN_Member'],
        'CountryProfile': this_country['Country_Profile'],
    }

    count_fact = 0
    topics = []

    for t1 in syb_catalogue:
        #print("- in topic " + str(t1['topicId']) + ': ' + t1['topicNameEN'])
        topic = {'topicId': t1['topicId'], 'topicName': t1['topicNameEN']}
        tables = []

        for t2 in t1['tables']:
            #print("- - in table " + str(t2['tableId']) + ': ' + t2['tableName'])
            table = {'tableId': t2['tableId'], 'tableName': t2['tableName']}

            for s in t2['series']:
                print("- - - in series " + str(s['SYB_series']) + ': ' + s['newSeriesName'])
               
    
        
    
    

Building SYB country facts for: 4-Afghanistan
- - - in series [253]: Infant mortality ratio
- - - in series [36, 268, 269]: Life expectancy at birth
- - - in series [1159]: Maternal mortality ratio
- - - in series [13]: Population annual rate of increase
- - - in series [392]: Total fertility rate
- - - in series [1617]: Asylum seekers, including pending cases
- - - in series [1614, 1616, 1615]: International migrant stock as percent of total population
- - - in series [1610]: International migrant stock
- - - in series [1618]: Other population of concern to UNHCR
- - - in series [1619]: Total population of concern to UNHCR
- - - in series [1613]: Total refugees and people in refugee-like situations
- - - in series [265]: Capital city population as percent of total population
- - - in series [266]: Capital city population as percent of total urban population
- - - in series [264]: Capital city population
- - - in series [329, 330]: Population growth rate per annum
- - - in series [328]

In [None]:
 
                # NOTE(review): this cell begins at a nested indentation level and reads names that
                # are not assigned anywhere in the visible notebook (`i`, `release`, `fact_builder`,
                # `prog_info`, `build_fact`, `indicators`, `target`/`targets`, `goal`/`goals`,
                # `country_profile`). It looks like the tail of a larger goal/target/indicator loop
                # nest whose loop headers are missing -- confirm before running.
                series = []
                
                indicator = {}
                indicator['indicatorCode'] = i['reference']
                
                facts = []
                


                if 'series' in i.keys():
                    for s in i['series']:
                        
                        #print("- - - - in series " + s['code'])
                        
                        # Only series belonging to the selected release are processed.
                        if s['release'] == release:
                            
                            #if s['code'] != 'GB_XPD_RSDV':
                            #    continue
                            
                            
                            for this_fact in fact_builder:
                                
                                
                                
                                # Keep only fact definitions flagged for country profiles that refer to this series.
                                if this_fact['countryProfile'] != '1' or this_fact['seriesCode'] != s['code']:
                                    continue
                                #print("- - - - - in fact " + this_fact['seriesCode'])
                                
                                count_fact += 1
                                
                                seriesCode = s['code']
                                seriesTitle = this_fact['sliceDesc']
                                
                                # Look up the hub item id of this series in the catalogue entry of the fact's goal.
                                # NOTE(review): `hub` is only assigned when a catalogue series matches; otherwise
                                # it keeps the value from an earlier fact, or is undefined at `fact['hub'] = hub` below.
                                for h in hub_catalogue:
                                    if h['goal'] != int(this_fact['goal']):
                                        continue

                                    for hs in h['series']:

                                        if hs['seriesCode'] != this_fact['seriesCode']:
                                            continue

                                        hub = hs['id']
                                        
                                #-----------------------------------------------------------
                                # Select dimensions values that are applicable for this fact
                                #-----------------------------------------------------------

                                # Keep only the listed dimension keys that are present in the fact definition.
                                slice_dimensions = {d: this_fact[d] for d in this_fact.keys() &
                                                    {'ageCode', 
                                                     'locationCode', 
                                                     'sexCode', 
                                                     'boundsCode', 
                                                     'educationLevelCode', 
                                                     'typeOfProductCode', 
                                                     'typeOfMobileTechnologyCode', 
                                                     'typeOfSpeedCode'}}

                                # Drop dimensions whose value is the empty string.
                                slice_dimensions = dict((k, v) for k, v in slice_dimensions.items() if v != '')

                                
                                #-----------------------------------------------------------
                                # Collect data for this fact
                                #-----------------------------------------------------------

                                ##--- Read data for this series---#

                                filename = 'Indicator_' + this_fact['indicator'] + '_Series_' + this_fact['seriesCode'] + '.json'
                                with open(r'../../data/unsd/' + release + '/' + filename) as json_file:  
                                    data = json.load(json_file)


                                ##--- Filter data for this_country['M49'] ---#

                                # The country code is zero-padded to three characters before matching.
                                data = select_dict(data['refAreas'], 'geoAreaCode', str(this_country['M49']).zfill(3))

                                # NOTE(review): select_dict returns a list; [0] raises IndexError when no
                                # reference area matches this country.
                                data = data[0]['data']
                                
                                # Narrow the records down to those matching every selected dimension value.
                                if len(slice_dimensions)>0:
                                    for k,v in slice_dimensions.items():
                                        data = select_dict(data, k, v)

                                #----------------------------------------------------------------
                                # Main fact calculation 
                                #----------------------------------------------------------------

                                values = []
                                years = []
                                values_is_censored = []
                                values_numeric_part = []

                                n = None      # number of observations available
                                y_min = None    # first year available
                                y_max = None    # most recent year available

                                value_y_min = None   # data value in the first year available
                                value_y_max = None   # data value in the most recent year available

                                value_y_min_num = None   # numeric part of the data value in the first year available
                                value_y_max_num = None   # numeric part of the data value in the most recent year available


                                value_median = None
                                dif_first_last = None
                                value_y_max_is_censored = None

                                prog = ""
                                prog_10 = ""
                                prog_12 = ""
                                prog_15 = ""
                                prog_mmr_min = ""
                                prog_mmr_max = ""

                                fact_text = ""

                                if len(data) > 0:

                                    # Collect parallel lists (value text, year, censoring flag, parsed number),
                                    # skipping records whose value is one of the missing-value markers.
                                    for d in data:
                                        if d['value_detail'] not in ['NA', 'N', 'NULL']:
                                            values.append(d['value_detail'])
                                            years.append(d['timePeriod'])
                                            values_is_censored.append(d['value_is_censored'])
                                            values_numeric_part.append(numeric_part(d['value_detail']))
                                        
                                 
                                    # Spell out the comparison signs in the display text.
                                    # NOTE(review): the loop index `i` rebinds the name used above for the
                                    # indicator (`i['reference']`, `i['series']`).
                                    for i in range(len(values)):
                                        values[i] = values[i].replace("<", "less than ")
                                        values[i] = values[i].replace(">", "more than ")

                                   # for i in range(len(values_numeric_part)):
                                   #     if values_numeric_part[i] is not None:
                                   #         if values_numeric_part[i] <1.009 :
                                   #             values_numeric_part[i] = round_KFM(values_numeric_part[i],2)
                                   #         if values_numeric_part[i] >= 1.009 and values_numeric_part[i] <= 10:
                                   #             values_numeric_part[i] = round_KFM(values_numeric_part[i],1)
                                   #         if values_numeric_part[i] >10 :
                                   #             values_numeric_part[i] = round_KFM(values_numeric_part[i],1)
                                                
                                    
                                    # Normalise the parsed numbers based on how the value text is written:
                                    # no '.' in the text -> integer; at least two characters after the '.'
                                    # -> rounded to two decimals.
                                    for i in range(len(values)):
                                        
                                        decimal_pos = values[i].find('.')
                                        
                                        if decimal_pos == -1 and values_numeric_part[i] is not None:
                                            values_numeric_part[i] = int(values_numeric_part[i])
                                            
                                        # NOTE(review): unlike the branch above, this one does not check for
                                        # None before calling round_KFM.
                                        if decimal_pos > 0:
                                            if len(values[i]) - decimal_pos > 2:
                                                values_numeric_part[i] = round_KFM(values_numeric_part[i],2)
                                                
                                    
                                    # Nothing numeric to report: move on to the next fact definition.
                                    if all(v is None for v in values_numeric_part):
                                        continue


                                    #----------------------------------------------------------------
                                    # Initial auxiliary variables
                                    #----------------------------------------------------------------


                                    # NOTE(review): n counts every record in `data`, including those skipped
                                    # above as 'NA'/'N'/'NULL', so it can exceed len(values).
                                    # NOTE(review): min()/max() compare `timePeriod` values as stored; if they
                                    # are strings the comparison is lexicographic -- TODO confirm type.
                                    n = len(data)      # number of observations available
                                    y_min = min(years)    # first year available
                                    y_max = max(years)    # most recent year available

                                    value_y_min = values[years.index(min(years))]   # data value in the first year available
                                    value_y_max = values[years.index(max(years))]   # data value in the most recent year available
                                    
                                    
                                    value_y_min_num = values_numeric_part[years.index(min(years))]   # numeric part of the value in the first year available
                                    value_y_max_num = values_numeric_part[years.index(max(years))]   # numeric part of the value in the most recent year available
                                   
                                    # Median over the values that could be parsed as numbers.
                                    x =  [i for i in values_numeric_part if i is not None]
                                    
                                    if len(x)>0:
                                        value_median = statistics.median(x)
                                    else:
                                        value_median = None
                                    #print(seriesCode)
                                    #display(values_numeric_part)
                                    #display()
                                    #display(x)
                                    #print("----------")

                                    
                                    # Absolute change between the first and the most recent year; stays None
                                    # when either endpoint has no numeric part.
                                    if(values_numeric_part[years.index(min(years))] is not None and values_numeric_part[years.index(max(years))] is not None):
                                        dif_first_last = abs(values_numeric_part[years.index(min(years))] \
                                                             - values_numeric_part[years.index(max(years))])


                                    value_y_min_is_censored = values_is_censored[years.index(min(years))]
                                    value_y_max_is_censored = values_is_censored[years.index(max(years))]
                                    
                                    fact_prog = prog_info (value_y_min_num, value_y_max_num, 
                                                           down = this_fact['Down'], 
                                                           up = this_fact['Up'], 
                                                           unit_1 = this_fact['unit1'])

                                    #--------------------------------------------------------------------

                                    fact_text = ""
                                    
                                    # All five conditions must hold for `conditions` to be True.
                                    # NOTE(review): the truthiness test treats a latest value or median of 0
                                    # the same as None.
                                    # NOTE(review): condition1 raises TypeError if dif_first_last is still None
                                    # (first-year value without a numeric part).
                                    if value_y_max_num and value_median:
                                        condition1 = dif_first_last >= 0.05 * abs(value_y_max_num)
                                        condition2 = not value_y_max_is_censored
                                        condition3 = value_y_max_num >= .25*value_median
                                        condition4 = int(y_min) < 2010
                                        condition5 = n > 1

                                        conditions = condition1 and condition2 and condition3 and condition4 and condition5

                                    else:
                                        
                                        conditions = False
                                    
                                    # Censored endpoints keep their display text; otherwise the parsed number is used.
                                    if value_y_min_is_censored:
                                        fact_value_y_min = value_y_min
                                    else:
                                        fact_value_y_min = str(value_y_min_num)
                                    
                                    if value_y_max_is_censored:
                                        fact_value_y_max = value_y_max
                                    else:
                                        fact_value_y_max = str(value_y_max_num)
                                    

                                    fact_elements = build_fact(text_type = this_fact['Text.type'], 
                                                               conditions = conditions, 
                                                               da2_1 = this_fact['DA2.1'], 
                                                               da2_2 = this_fact['DA2.2'], 
                                                               da3_1 = this_fact['DA3.1'], 
                                                               unit_1 = this_fact['unit1'], 
                                                               unit_2 = this_fact['unit2'], 
                                                               value_y_min = fact_value_y_min, 
                                                               y_min = y_min, 
                                                               value_y_max = fact_value_y_max, 
                                                               y_max = y_max, 
                                                               prog = fact_prog['prog'], 
                                                               prog_10 = fact_prog['prog_10'], 
                                                               prog_12 = fact_prog['prog_12'], 
                                                               prog_15 = fact_prog['prog_15'], 
                                                               prog_mmr_max = fact_prog['prog_mmr_max'])

                                    # Assemble the output record for this fact.
                                    fact = {}
                                    fact['seriesCode'] = seriesCode
                                    fact['seriesTitle'] = seriesTitle
                                    fact['hub'] = hub 
                                    fact['slice_dimensions'] = slice_dimensions
                                    fact['text_type'] = this_fact['Text.type']
                                    fact['fact_text'] = fact_elements['fact_text']
                                    fact['fact_values'] = fact_elements['fact_values']
                                    fact['fact_units'] = fact_elements['fact_units']
                                    fact['fact_years'] = fact_elements['fact_years']
                                    fact['data_years'] = years
                                    fact['data_values'] = values
                                    fact['data_is_censored'] = values_is_censored
                                    fact['data_numeric_part'] = values_numeric_part

                                    # Visualization hint: text types '10' and '12' get fixed kinds; otherwise
                                    # more than three values -> time series, else a single value.
                                    if this_fact['Text.type'] == '10':
                                        fact['preferred_visualization'] = 'threshold'
                                        fact['threshold'] = this_fact['Threshold']
                                    elif this_fact['Text.type'] == '12':
                                        fact['preferred_visualization'] = 'boolean'
                                    elif(len(values)>3):
                                        fact['preferred_visualization'] = 'time_series'
                                    else:
                                        fact['preferred_visualization'] = 'singleton'


                                    #=======================  
                                    #print('***** apending fact ******')
                                    #display(fact)
                                    facts.append(fact)

                    # Indicators, targets and goals are only kept when they ended up with content.
                    if len(facts)>0:
                        #print('**** apending facts ******')
                        #display(facts)
                        indicator['facts'] = facts
                        indicators.append(indicator)

                if len(indicators)>0:
                    #print('*** apending indicators **')
                    #display(indicators)
                    target['indicators'] = indicators
                    targets.append(target)

        if len(targets)>0:
            #print('** apending targets ******')
            #display(targets)
            goal['targets'] = targets
            goals.append(goal)
            
    country_profile['goals'] = goals
    #display(goals)
               
    #=======================

    #with open('resolutions.json', 'w') as outfile:
    # Write the profile to <wd_dir>/output/country_profileNNN.json, with the country code zero-padded to 3 characters.
    with open(wd_dir + r'/output/country_profile' + str(country_code).zfill(3) + ".json", 'w') as outfile:
        json.dump(country_profile, outfile, indent=4 )    

    #with open('resolutions.json', 'w') as outfile:



                