# Pre-processing script 2
- pivot into "wide" format

### Load necessary libraries

In [None]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import hashlib
import numpy as np
import pandas as pd
import re
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### User parameters

In [None]:

# NOTE: '__file__' is a string literal here, not the module attribute, so
# realpath() resolves it against the current working directory and dirname()
# then yields that working directory.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Relative locations of the input and output folders. Both end with a slash
# because later cells build file paths by plain string concatenation
# (e.g. `output_dir + file`).
data_dir = r'../../data/unsd/UNSYB/input/'
print('data inputs dir: ' + data_dir)

output_dir = r'../../data/unsd/UNSYB/output/'
print('outputs dir: ' + output_dir)



## Utilities

#### Convert string to camelCase

In [None]:
def camelCase(st):
    """
    Convert a string to lower camelCase.

    Each word is capitalised with ``str.title()``, every non-alphanumeric
    character is dropped, and the first remaining character is lower-cased.

    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    Parameters
    ----------
    st : str
        Text to convert.

    Returns
    -------
    str
        The camelCase form of `st`, or an empty string when `st` contains no
        alphanumeric character.
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    if not output:
        # Nothing alphanumeric survived the filter: indexing output[0] would
        # raise IndexError, so return the empty string instead.
        return output
    return output[0].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [None]:
# Silences urllib3's InsecureRequestWarning for the whole session.
# NOTE(review): no urllib3 request is made in the visible part of this
# notebook - confirm this is still needed before keeping it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Regular expression to capture numeric values (including those in scientific notation)
The regex is

```
-?      # an optional -
\d+     # a series of digits
(?:     # start non capturing group
  \.    # a dot
  \d+   # a series of digits
)?      
(?:     # start non capturing group
  e     # "e"
  -?    # an optional -
  \d+   # digits
)?
```


In [None]:
# Compiled once instead of on every call. The pattern is the one described in
# the markdown cell above: optional minus, digits, optional decimal part,
# optional exponent.
_NUMERIC_PART_RE = re.compile(r'-?\d+(?:\.\d+)?(?:e-?\d+)?')


def numeric_part(v):
    """
    Return the first number found in a string as a float.

    The exponent is only recognised when written with a lowercase ``e`` and
    an optional minus sign; for other spellings (e.g. ``1E5`` or ``1e+5``)
    only the mantissa is returned.

    Parameters
    ----------
    v : str or None
        Text that may contain a number.

    Returns
    -------
    float or None
        The first numeric match converted to float, or None when `v` is None
        or contains no number.
    """
    if v is None:
        return None

    # search() stops at the first match; findall() would scan the whole
    # string and build a list only to read its first element.
    match = _NUMERIC_PART_RE.search(v)
    if match is None:
        return None
    return float(match.group(0))


#### Compute a hash of a dictionary

In [None]:
def dict_hash(d):
    """
    Compute an MD5 hex digest that identifies the content of a dictionary.

    Two dictionaries holding the same key/value pairs produce the same digest
    regardless of insertion order. Keys and values are compared through their
    ``str()`` form, so e.g. ``1`` and ``'1'`` are treated as equal.

    Parameters
    ----------
    d : dict
        Dictionary to fingerprint.

    Returns
    -------
    str
        32-character hexadecimal digest.
    """
    out = hashlib.md5()
    # Sort by key so the digest does not depend on insertion order.
    for key, value in sorted(d.items(), key=lambda item: str(item[0])):
        out.update(str(key).encode('utf-8'))
        # Separators keep {'ab': 'c'} and {'a': 'bc'} from feeding the same
        # byte stream into the hash.
        out.update(b'\x1f')
        out.update(str(value).encode('utf-8'))
        out.update(b'\x1e')
    return out.hexdigest()


#### Get unique dictionaries in a list

In [None]:
def unique_dicts(dictionary_list):
    """
    Remove duplicate dictionaries from a list.

    Dictionaries are considered duplicates when `dict_hash` returns the same
    digest for them. For each group of duplicates the last occurrence is
    kept, placed where the first occurrence appeared.

    Parameters
    ----------
    dictionary_list : iterable of dict

    Returns
    -------
    list of dict
    """
    # Keying by digest collapses duplicates; later entries overwrite the
    # stored dictionary without moving the key.
    by_digest = {dict_hash(item): item for item in dictionary_list}
    return list(by_digest.values())


#### Extract a subset of key-value pairs from each dictionary in a list

In [None]:
def subdict_list(dict_list, keys_list, exclude = False):
    """
    Project every dictionary of a list onto a subset of its keys.

    Parameters
    ----------
    dict_list : iterable of dict
        Dictionaries to project.
    keys_list : list
        Keys to keep (default) or to drop (`exclude=True`).
    exclude : bool
        When True, each dictionary is returned without the keys in
        `keys_list`. When False, only dictionaries that contain every key of
        `keys_list` are returned, reduced to exactly those keys.

    Returns
    -------
    list of dict
        Newly built dictionaries; the inputs are not modified.
    """
    if exclude:
        return [
            {key: value for key, value in record.items() if key not in keys_list}
            for record in dict_list
        ]

    # Records missing any requested key are skipped entirely.
    return [
        {key: record[key] for key in keys_list}
        for record in dict_list
        if all(key in record for key in keys_list)
    ]




#### Select the dictionaries in a list whose value for a given key equals a given value

In [None]:
def select_dict(dict_list, k, v):
    """
    Return the dictionaries of `dict_list` in which key `k` equals `v`.

    Parameters
    ----------
    dict_list : iterable of dict
    k : hashable
        Key to look up.
    v : object
        Value the key must be equal to.

    Returns
    -------
    list of dict
        The matching dictionaries themselves (not copies), in input order.
        Dictionaries without key `k` never match.
    """
    return [entry for entry in dict_list if k in entry and entry[k] == v]

#### Find the coverage of an (unordered) list of years

In [None]:
def year_intervals (years_list):
    """
    Summarise an (unordered) list of years as comma-separated ranges.

    Consecutive years are collapsed into ``start-end``; isolated years are
    written on their own, e.g. ``['1995', '2000', '1996', '2001']`` gives
    ``'1995-1996,2000-2001'``.

    Parameters
    ----------
    years_list : iterable
        Years as integers or as strings accepted by ``int()``. Duplicates
        are allowed.

    Returns
    -------
    str
        The coverage string, or an empty string when `years_list` is empty.

    Raises
    ------
    ValueError
        If an element cannot be converted with ``int()``.
    """
    years = sorted({int(y) for y in years_list})

    if not years:
        # An empty input used to raise IndexError on years_list[0].
        return ''

    intervals = []
    start = previous = years[0]
    for year in years[1:]:
        if year - previous > 1:
            # Gap found: close the running interval and open a new one.
            intervals.append((start, previous))
            start = year
        previous = year
    intervals.append((start, previous))

    return ','.join(
        str(first) if first == last else str(first) + '-' + str(last)
        for first, last in intervals
    )


In [None]:
# Quick manual checks of year_intervals().
# Unordered input with two runs -> '1995-1996,2000-2004'
year_intervals(['1995','2000', '1996', '2001','2002','2003','2004'])
# Single year -> '1995'
year_intervals(['1995'])
# Two non-consecutive years -> '2000,2004'
year_intervals(['2000','2004'])

### Read data file

In [None]:
# Identifiers of the series to load; they are combined into the file name
# 'Topic<topic>_Table<table>_<series>.json'.
topic = '02'
table = '008'
series = 'SYB011'

file = 'Topic'+topic+'_Table'+table+'_'+series+'.json'

# The file is read from the *output* folder.
# NOTE(review): presumably it was written there by an earlier pre-processing
# step - confirm.
with open(output_dir + file) as json_file:  
    data = json.load(json_file)

In [None]:
def read_data_file(topic, table, series, dir_path):
    """
    Load the JSON file of one series.

    The file name is built as ``Topic<topic>_Table<table>_<series>.json``.

    Parameters
    ----------
    topic : str
        Topic code as it appears in the file name (e.g. '02').
    table : str
        Table code as it appears in the file name (e.g. '008').
    series : str
        Series code (e.g. 'SYB011').
    dir_path : str
        Folder containing the file, with or without a trailing separator.

    Returns
    -------
    object
        The parsed JSON content.

    Raises
    ------
    FileNotFoundError
        If the file does not exist in `dir_path`.
    """
    file = 'Topic'+topic+'_Table'+table+'_'+series+'.json'
    # os.path.join() works whether or not dir_path ends with a separator;
    # plain concatenation silently produced a wrong path without one.
    with open(os.path.join(dir_path, file)) as json_file:
        return json.load(json_file)

In [None]:
# Load one series and inspect its structure. Every bare expression below is
# displayed because ast_node_interactivity is set to "all" at the top of the
# notebook.
data = read_data_file(topic = '02', table = '008', series = 'SYB011', dir_path = output_dir)
data.keys()
data['newSeriesName']
data['refAreas'][0]

### List of countries to be plotted on a map (with XY coordinates)

In [None]:
def countryListXY(file):
    """
    Read a tab-delimited reference-area file into a DataFrame.

    The file is decoded as latin-1 and its first line is used as the header.

    Parameters
    ----------
    file : str
        Path of the tab-delimited text file. It must contain an 'M49' column.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with column 'M49' cast to string.
    """
    with open(file, newline = '', encoding='latin-1') as handle:
        rows = [dict(row) for row in csv.DictReader(handle, delimiter='\t')]

    frame = pd.DataFrame(rows)
    return frame.astype({'M49':'str'})

In [None]:
# Reference areas as a list of dictionaries (one per row of the file); the
# first entry is displayed to check the available columns.
country_dictXY = countryListXY('../../globalResources/refAreas.txt').to_dict(orient = 'records')
country_dictXY[0]

### Add coordinates to data file

In [None]:
# Attach country-profile information and XY coordinates to every reference
# area of the series, matching on the M49 code zero-padded to 3 characters.
data = read_data_file(topic = '02', table = '008', series = 'SYB011', dir_path = output_dir)
coordinates = countryListXY('../../globalResources/refAreas.txt').to_dict(orient = 'records')

# Index the coordinates once by padded M49 code instead of rescanning the
# whole list for every reference area. If a code occurs more than once the
# last row wins, as it did with the nested loop.
coordinates_by_m49 = {xy['M49'].zfill(3): xy for xy in coordinates}

for g in data['refAreas']:

    match = coordinates_by_m49.get(str(g['refAreaCode']).zfill(3))

    # Reference areas without a matching row get None in every added field.
    g['countryProfile'] = match['Country_Profile'] if match is not None else None
    g['ISO3'] = match['ISO3'] if match is not None else None
    g['UN_Member'] = match['UN_Member'] if match is not None else None
    g['X'] = match['X'] if match is not None else None
    g['Y'] = match['Y'] if match is not None else None


### Select `refAreas` that have coordinates

In [None]:
# Keep only the reference areas whose 'X' and 'Y' are both truthy, i.e.
# drop those left at None (or holding an empty string) by the previous cell.
ref_areas_publish = []
for d in data['refAreas']:
    if d['X'] and d['Y']:
        ref_areas_publish.append(d)


In [None]:
# Display the first selected reference area (raises IndexError if none was
# selected).
ref_areas_publish[0]

### Select distinct years among all data records:

In [None]:
# Collect {'year': ...} for every record of every selected reference area,
# de-duplicate them, and keep the sorted list of distinct years.
temp = []
timePeriods = []

for ra in ref_areas_publish:
    # Records without a 'year' key are skipped by subdict_list().
    temp.extend(subdict_list(ra['data'], ['year'], exclude = False))
    
for i in unique_dicts(temp):
    timePeriods.append(i['year'])

timePeriods.sort()
timePeriods


### Select distinct slices among all data records

In [None]:
# A "slice" is what remains of a record once the year, value, source and
# footnote keys are removed; the distinct remainders are collected across
# all selected reference areas.
temp = []

for ra in ref_areas_publish:
    temp.extend(subdict_list(ra['data'], ['year', 'value', 'sourceNameEN', 'sourceNameFR', 'footnoteTextEN', 'footnoteTextFR'
                                         ], exclude = True))
    
slices = unique_dicts(temp)

In [None]:
# Display the distinct slices found above.
slices

### Write dataset as "pivot" version

In [None]:
# Load the catalogue from the output folder and display its first entry.
# NOTE(review): entries appear to be topics holding 'tables', which in turn
# hold 'series' (see the loop in the next cell) - confirm against the file.
with open(output_dir + 'SYBcatalogue.json') as json_file:  
    catalogue = json.load(json_file)
catalogue[0]

In [None]:
with open(output_dir + 'SYBcatalogue.json') as json_file:
    catalogue = json.load(json_file)

# Abridged example of one catalogue entry, kept for reference only. It used
# to be pasted here as an unterminated dict literal, which made the whole
# cell a SyntaxError.
#
# {'topicId': 2,
#   'topicNameEN': 'Population and migration',
#   'topicNameFR': 'Population et migration',
#   'tables': [{'tableCode': 'lifecbmort',
#     'tableId': 8,
#     'tableName': 'Population growth and indicators of fertility and mortality',
#     'tableNote': None,
#     'tableStatusId': 3,
#     'tbBkCode': 'X03 ',
#     ...

# Record keys that vary within a slice; everything else defines the slice.
# NOTE(review): 'value_numeric_part' is listed so that, if the records carry
# it, it does not split every distinct value into its own slice - confirm
# against the record schema produced by the previous script.
non_slice_keys = ['year', 'value', 'value_numeric_part',
                  'sourceNameEN', 'sourceNameFR',
                  'footnoteTextEN', 'footnoteTextFR']


def _record_in_slice(record, slice_def):
    """Return True when `record` holds every key/value pair of `slice_def`."""
    # Compare key by key: comparing bare value sets would also match a value
    # found under an unrelated key.
    return all(k in record and record[k] == v for k, v in slice_def.items())


def _text_with_years(records, field):
    """Join the distinct texts found under `field`, prefixing each with the
    years it applies to when the records hold more than one distinct text."""
    texts = list(set(r[field] for r in records if field in r))
    labelled = []
    for text in texts:
        if not text:
            continue
        if len(texts) > 1:
            years = [r['year'] for r in records if r.get(field) == text]
            labelled.append('[' + year_intervals(years) + ']: ' + text)
        else:
            labelled.append(text)
    labelled.sort()
    return ' // '.join(labelled)


def _record_value(record):
    """Return the numeric value of a record, or None when it has none."""
    if 'value_numeric_part' in record:
        return record['value_numeric_part']
    # NOTE(review): falling back to 'value' is an assumption - the original
    # code only read 'value_numeric_part' and always produced None.
    value = record.get('value')
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        return numeric_part(value)
    return None


for t in catalogue:
    for t2 in t['tables']:
        for s in t2['series']:

            # Only this series is processed for now.
            if s['seriesCode'] != 'SYB122':
                continue

            topic_code = str(t['topicId']).zfill(2)
            # 'tableId' belongs to the table entry (t2), not to the topic.
            table_code = str(t2['tableId']).zfill(3)

            # Read data file:
            data = read_data_file(topic = topic_code, table = table_code, series = s['seriesCode'], dir_path = output_dir)

            # Select only refAreas that have coordinates:
            # NOTE(review): this reads 'X'/'Y' straight from the file; the
            # coordinates added in the earlier cell only exist in memory, so
            # confirm the files on disk already carry them.
            ref_areas_publish = []
            for d in data['refAreas']:
                if d['X'] and d['Y']:
                    ref_areas_publish.append(d)

            # Select unique time periods among all records:
            temp = []
            for ra in ref_areas_publish:
                temp.extend(subdict_list(ra['data'], ['year'], exclude = False))

            timePeriods = sorted(dt['year'] for dt in unique_dicts(temp))

            # Obtain unique slices:
            temp = []
            for ra in ref_areas_publish:
                temp.extend(subdict_list(ra['data'], non_slice_keys, exclude = True))

            slices = unique_dicts(temp)

            for ra in ref_areas_publish:

                data_new = []

                for j in slices:

                    # Select data corresponding to reference area ra and slice j:
                    slice_data = [record for record in ra['data'] if _record_in_slice(record, j)]

                    slice_data_wide = j.copy()
                    slice_data_wide['footnotes'] = _text_with_years(slice_data, 'footnoteTextEN')
                    slice_data_wide['sources'] = _text_with_years(slice_data, 'sourceNameEN')

                    # One 'value_<year>' column per time period; None when the
                    # slice has no record for that year.
                    for y in timePeriods:
                        value_y = None
                        for ry in select_dict(slice_data, 'year', y):
                            value_y = _record_value(ry)
                        slice_data_wide['value_'+str(y)] = value_y

                    slice_years = [r['year'] for r in slice_data if 'year' in r]
                    if slice_years:
                        max_year = max(slice_years)
                        slice_data_wide['value_latest_year'] = slice_data_wide['value_'+str(max_year)]
                        slice_data_wide['latest_year'] = max_year
                    else:
                        slice_data_wide['value_latest_year'] = None
                        slice_data_wide['latest_year'] = None

                    data_new.append(slice_data_wide)

                # Replace the long-format records with the wide ones.
                del ra['data']
                ra['data'] = data_new

            # Carry over every top-level field of the series except the
            # original reference-area list.
            # NOTE(review): the original copied 'goal', 'target', 'indicator',
            # 'seriesCode', 'seriesDesc' and 'release'; none of these is shown
            # to exist in these files - confirm the desired output fields.
            new_data = {k: v for k, v in data.items() if k != 'refAreas'}
            new_data['data'] = ref_areas_publish

            # NOTE(review): name and location mirror the input file convention;
            # the original used undefined names (`i`, `wd_dir`) and `s['code']`.
            file_name = 'wide_Topic'+topic_code+'_Table'+table_code+'_'+s['seriesCode']+'.json'

            with open(os.path.join(output_dir, file_name), 'w') as f:
                json.dump(new_data, f, indent=4)

            print(file_name)



In [None]:
# NOTE(review): `metadata` is not defined anywhere in the visible part of
# this notebook, so this cell raises NameError unless it is created
# elsewhere - confirm or remove.
metadata[0]['hex']