# Data availability analysis by country

## This is the plan:
1. Read metadata (catalogue of all series available)
2. Create a consolidated file with all the data 
3. Split the consolidated file by country -- this will be James' request
4. Compute availability statistics by country series
    - Number of "slices / disaggregations" available
    - Latest year available
    - Number of data points for the most aggregated slice
    - Average no. of data points for the rest of the slices

## Load necessary libraries

In [2]:
import os
import sys
import json
import urllib3  # allows to access a URL with python
import pandas as pd
import math
import os 
import hashlib
import csv
import copy
import unicodedata

In [3]:
# Show the absolute path of the current working directory.
os.path.abspath(os.curdir)

'C:\\Users\\L.GonzalezMorales\\Documents\\GitHub\\FIS4SDGs\\notebooks\\unsdDataAvailability'

## Setup interactive shell

In [4]:
# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Set the paths to the data and metadata directories in the local branch

In [5]:
# Data release to process; it names the sub-folder of the data directory.
release = '2019.Q2.G.01'

# '__file__' is a plain string here, so realpath resolves it against the
# current working directory and dirname yields that directory.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Relative locations of the release data and of the shared metadata files.
data_dir = '../../data/unsd/{}/'.format(release)
metadata_dir = '../../globalResources/'
metadata_dir  # bare expression, echoed as cell output

print('data inputs dir: {}'.format(data_dir))

C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\notebooks\unsdDataAvailability


'../../globalResources/'

data inputs dir: ../../data/unsd/2019.Q2.G.01/


## Utilities

#### Convert string to camelCase

In [None]:
def camelCase(st, toAscii = True, startLowercase = True):
    """
    Convert a string to camelCase (or PascalCase), dropping separators.

    Adapted from
    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    Parameters
    ----------
    st : str
        Text to convert.
    toAscii : bool
        When true, accented characters are reduced to their ASCII base
        letter and characters without an ASCII equivalent are dropped.
    startLowercase : bool
        When true, the first character of the result is lower-cased
        (camelCase); otherwise it is left as produced by title-casing.

    Returns
    -------
    str
        The converted string. It is empty when `st` is empty or contains
        no alphanumeric character.
    """
    # title() capitalises the start of every word; removing everything that
    # is not alphanumeric then glues the words together.
    output = ''.join(x for x in st.title() if x.isalnum())
    if startLowercase:
        # Slicing (not indexing) keeps an empty result empty instead of
        # raising IndexError.
        output = output[:1].lower() + output[1:]

    if toAscii:
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII encode with 'ignore' then discards the marks.
        output = unicodedata.normalize('NFD', output).encode('ascii', 'ignore').decode('utf-8')

    return output

### Get data from a json file

In [None]:

def get_json_data(file, print_first_element = True):
    """
    Load a JSON file and return its parsed content.

    Parameters
    ----------
    file : str
        Path of the JSON file to read.
    print_first_element : bool
        When true, element 0 of the parsed data is printed as an example.

    Returns
    -------
    The parsed JSON data, or None when the file cannot be read or parsed,
    or when printing the first element fails. In the failure case the
    exception type is printed.
    """
    try:
        # The context manager closes the file even when parsing fails.
        with open(file) as json_file:
            json_data = json.load(json_file)
        if print_first_element:
            print("\n----This is an example of a series_metadata element----")
            print(json_data[0])
        return json_data

    # Exception (not a bare except) lets KeyboardInterrupt / SystemExit through.
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        return None
    
    

#### Print the tree structure of nested dictionaries

In [None]:
def jsonTree(d, depth=0):
    """
    Print the key hierarchy of a nested dictionary, one key per line.

    Each key is prefixed with `depth` dashes. When a value is a list whose
    first element is a dictionary, that first element is taken as the
    template for the whole list and printed two levels deeper.

    Parameters
    ----------
    d : dict
        Dictionary with string keys.
    depth : int
        Number of dashes printed before each key of `d`.
    """
    for x, v in d.items():
        print('-'*depth + x)
        # Only non-empty lists are sampled: v[0] on an empty list would raise
        # IndexError.
        if isinstance(v, list) and v:
            v0 = v[0]
            if isinstance(v0, dict):
                jsonTree(v0, depth = depth+2)

#### Compute a hash of a dictionary

In [None]:
def dict_hash(d):
    """
    Return an MD5 hex digest identifying the content of a dictionary.

    Dictionaries with the same key/value pairs get the same digest
    regardless of insertion order. Values are compared through their
    `str()` form, so e.g. 1 and '1' are not distinguished.

    Parameters
    ----------
    d : dict
        Dictionary with string keys.

    Returns
    -------
    str
        32-character hexadecimal digest.
    """
    out = hashlib.md5()
    # Sorted keys make the digest independent of insertion order.
    for key in sorted(d):
        for field in (key, str(d[key])):
            encoded = field.encode('utf-8')
            # The length prefix marks where each field ends; without it
            # {'ab': 'c'} and {'a': 'bc'} would feed identical bytes.
            out.update(str(len(encoded)).encode('ascii') + b':')
            out.update(encoded)
    return out.hexdigest()


#### Get unique dictionaries in a list

In [None]:
def unique_dicts(dictionary_list):
    """
    Return the dictionaries of a list with duplicates removed.

    Two dictionaries count as duplicates when `dict_hash` gives them the
    same digest. For each digest the last matching dictionary is kept, at
    the position where that digest first appeared.
    """
    # A later dictionary overwrites an earlier one stored under the same hash.
    by_hash = {dict_hash(item): item for item in dictionary_list}
    return [*by_hash.values()]


#### Extract a subset of key-value pairs from each dictionary in a list

In [None]:
def subdict_list(dict_list, keys_list, exclude = False):
    """
    Reduce every dictionary of a list to a subset of its keys.

    Parameters
    ----------
    dict_list : list of dict
        Dictionaries to reduce; they are not modified.
    keys_list : list
        Keys to keep (default) or to leave out (`exclude=True`).
    exclude : bool
        Selects between keeping and dropping the listed keys.

    Returns
    -------
    list of dict
        One new dictionary per input dictionary, in the same order.
    """
    if exclude:
        # Keep every pair whose key is not listed, in each dict's own order.
        return [
            {key: value for key, value in entry.items() if key not in keys_list}
            for entry in dict_list
        ]

    # Keep exactly the listed keys, in the order given; a key missing from a
    # dictionary raises KeyError.
    return [{key: entry[key] for key in keys_list} for entry in dict_list]




#### Get the dicts in a list whose value for a given key equals a given value

In [None]:
def select_dict(dict_list, k, v):
    """
    Return the dictionaries of `dict_list` whose value under key `k` equals `v`.

    The result keeps the input order. A dictionary without key `k` raises
    KeyError.
    """
    return [entry for entry in dict_list if entry[k] == v]

### List of countries

In [None]:
def countryListXY(file):
    """
    Read a tab-delimited, latin-1 encoded text file into a DataFrame.

    The first line of the file supplies the column names. The file must
    contain an 'M49' column, which is returned as strings.

    Parameters
    ----------
    file : str
        Path of the tab-delimited file.

    Returns
    -------
    pandas.DataFrame
        One row per data line of the file.
    """
    with open(file, newline = '', encoding='latin-1') as source:
        records = [dict(record) for record in csv.DictReader(source, delimiter='\t')]

    table = pd.DataFrame(records)
    return table.astype({'M49': 'str'})

    #print(countryListXY[1])
    #for c in countryListXY:

In [None]:
# Load 'refAreas.txt' from the metadata directory into a DataFrame and
# preview its first 10 rows.
countries_df = countryListXY(metadata_dir + 'refAreas.txt')
countries_df.head(10)

# 1. Read metadata

## Get metadata file

In [None]:
# Load 'metadata.json' without printing its first element, then show the key
# hierarchy of the first entry and that entry's 'code'.
series_metadata = get_json_data(metadata_dir + 'metadata.json', False)
jsonTree(series_metadata[0])
series_metadata[0]['code']

# 2. Create a consolidated file with all the data

### Utilities

#### Re-arrange columns
- Keep all dimensions together
- Keep all value columns together
- Push `value_latest_year` and `latest_year` to the end

In [None]:
def rearrange(bigtable):
    """
    Return a copy of the table with its columns regrouped and its rows sorted.

    Column order of the result:
      1. fixed identification columns (goal ... Y),
      2. dimension columns, i.e. every column not listed elsewhere, sorted by name,
      3. fixed attribute columns (unitsCode ... nature),
      4. 'value_*' columns other than 'value_latest_year', sorted by name,
      5. 'value_latest_year' and 'latest_year'.

    Rows are sorted by goal, target, indicator, seriesCode, geoAreaName and
    then by the dimension columns. All fixed columns must be present in
    `bigtable`.
    """
    id_cols = ['goal', 'target', 'indicator', 'seriesCode', 'seriesDesc', 'release',
               'geoAreaCode', 'geoAreaName', 'parentCode', 'parentName', 'type',
               'ISO3', 'UN_Member', 'X', 'Y']

    attribute_cols = ['unitsCode', 'unitsDesc',
                      'unitmultiplierCode', 'unitmultiplierDesc', 'reportingTypeCode',
                      'reportingTypeDesc', 'valueDetails', 'footnotes', 'sources',
                      'timeDetails', 'nature']

    trailing_cols = ['value_latest_year', 'latest_year']

    value_cols = sorted(
        col for col in bigtable.columns
        if col.startswith('value_') and col != 'value_latest_year'
    )

    # Whatever is neither a fixed column nor a value column is a dimension.
    known_cols = id_cols + attribute_cols + trailing_cols + value_cols
    dimension_cols = sorted(col for col in bigtable.columns if col not in known_cols)

    ordered = bigtable[id_cols + dimension_cols + attribute_cols + value_cols + trailing_cols]

    sort_keys = ['goal', 'target', 'indicator', 'seriesCode', 'geoAreaName'] + dimension_cols
    return ordered.sort_values(by=sort_keys)

    

In [None]:
def split_by_country(bigtable, domain='', target_dir=''):
    """
    Write one Excel workbook per geographic area found in `bigtable`.

    Each workbook holds the rows of one 'geoAreaCode', without the columns
    that are entirely empty for that area, and is named
    'Country_<AreaName>_<geoAreaCode>_<domain>.xlsx' with the area name
    converted by `camelCase`.

    Parameters
    ----------
    bigtable : pandas.DataFrame
        Table with at least the columns 'geoAreaCode' and 'geoAreaName'.
    domain : str
        Suffix added to every file name.
    target_dir : str
        Prefix of every output path; it is concatenated as-is, so it has to
        end with a path separator.
    """
    areas = bigtable[['geoAreaCode', 'geoAreaName']].drop_duplicates()

    for area in areas.itertuples():
        code = area.geoAreaCode
        name = camelCase(area.geoAreaName, startLowercase=False)

        # Rows of this area only, minus the columns that hold no value for it.
        subset = bigtable.loc[bigtable['geoAreaCode'] == code].dropna(how='all', axis=1)

        out_file = ''.join([target_dir, 'Country_', name, '_', str(code), '_', domain, '.xlsx'])
        subset.to_excel(out_file, index=False)
      

## Merge all the data for each goal

In [None]:
# For every goal: gather the data files of all its series, merge them into one
# table, and export that table both as a whole and split by country.
for g in series_metadata:
    # Tables of every series of this goal. Starting from an empty list for each
    # goal guarantees that no data of a previously processed goal is carried over.
    goal_tables = []

    for t in g['targets']:
        for i in t['indicators']:
            # Indicators without a 'series' entry contribute nothing.
            for s in i.get('series', []):
                filename = 'csv_Indicator_' + i['reference'] + '_Series_' + s['code']+'.csv'

                # Series without a data file in this release are skipped.
                if not os.path.isfile(data_dir + filename):
                    continue

                small_table = pd.read_csv(data_dir + filename, sep='\t')
                # drop empty rows:
                small_table = small_table.dropna(subset=['value_latest_year'])
                goal_tables.append(small_table)

    if not goal_tables:
        # Without this guard the previous goal's table would be exported again
        # under this goal's name (or big_table would be undefined on the first goal).
        display('no data found for goal ' + g['code'])
        continue

    # A single concat avoids re-copying the accumulated table for every series.
    big_table = pd.concat(goal_tables, ignore_index=True, sort=False)
    big_table = rearrange(big_table)

    split_by_country(big_table, domain='Goal_' + g['code'], target_dir=data_dir)

    big_table.to_excel(data_dir + 'All_Countries_Goal_' + g['code']+'.xlsx', index=False)

    display('finished processing goal ' + g['code'])
    
    

## Get list of output files

In [6]:
import glob

# Every Excel workbook located directly in the data directory
# (sub-folders are not searched).
path = data_dir
files = glob.glob(path + '*.xlsx')

for f in files:
    print(f)

../../data/unsd/2019.Q2.G.01\All_Countries_Goal_1.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_10.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_11.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_12.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_13.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_14.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_15.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_16.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_17.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_2.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_3.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_4.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_5.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_6.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_7.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_8.xlsx
../../data/unsd/2019.Q2.G.01\All_Countries_Goal_9.xlsx
../../data/unsd/2019.Q2.G.01\Country_Afghanistan_4_Goal_1