# Pivoting script
This script pivots the data into "wide" format and splits regional and country data.

## Load necessary libraries

In [2]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import copy
import numpy as np
import pandas as pd
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## User parameters

In [3]:
# Data release to process -- make sure to have the correct release here
release = '2019.Q1.G.02'

# Notebooks do not define __file__, so the literal string '__file__' is resolved
# against the current working directory; its parent is that working directory.
notebook_marker = os.path.realpath('__file__')
dir_path = os.path.dirname(notebook_marker)
print(dir_path)

# Root of the working tree, expressed relative to the working directory
wd_dir = '../'
print(f'Main dir: {wd_dir}')

C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\unsd\notebooks
Main dir: ../


## Utilities

#### Convert string to camelCase

In [4]:
def camelCase(st):
    """
    Convert a string to camelCase.

    The string is title-cased, every non-alphanumeric character is dropped,
    and the first remaining character is lower-cased.

    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    Parameters
    ----------
    st : str
        Text to convert.

    Returns
    -------
    str
        The camelCase form of `st`; an empty string when `st` contains no
        alphanumeric characters.
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Guard against inputs such as '' or '---', which leave nothing to index
    if not output:
        return output
    return output[0].lower() + output[1:]

## Create `JSON` with metadata on each long data table

### Read list of excel files with data for each series

In [5]:
# Folder holding the Excel workbooks of the selected release
path = wd_dir + r'data/' + release + '/'

# os.listdir returns every directory entry in arbitrary order, so keep only the
# .xlsx workbooks (skipping Excel's '~$' lock files) and sort them so that the
# files are always processed in the same order.
files = sorted(
    entry for entry in os.listdir(path)
    if entry.lower().endswith('.xlsx') and not entry.startswith('~$')
)
files[0:10]

['Data_1.1.1_SI_POV_DAY1.xlsx',
 'Data_1.1.1_SI_POV_EMP1.xlsx',
 'Data_1.2.1_SI_POV_NAHC.xlsx',
 'Data_1.3.1_SI_COV_BENFTS.xlsx',
 'Data_1.3.1_SI_COV_CHLD.xlsx',
 'Data_1.3.1_SI_COV_DISAB.xlsx',
 'Data_1.3.1_SI_COV_LMKT.xlsx',
 'Data_1.3.1_SI_COV_LMKTPQ.xlsx',
 'Data_1.3.1_SI_COV_MATNL.xlsx',
 'Data_1.3.1_SI_COV_PENSN.xlsx']

### For a selected file:

#### 1)  Convert content of data file into a pandas DataFrame

In [6]:
# Workbook used as the worked example in the steps below
f = 'Data_1.5.2_VC_DSR_LSGP.xlsx'

# Open the workbook in a context manager so the file handle is released as soon
# as the sheet has been read
with pd.ExcelFile(path + f) as xlsx:
    x = pd.read_excel(xlsx, sheet_name='Sheet1')

x.shape
x.head(3)



(8060, 33)

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease,...,valueType,timeDetail,source,footnotes,natureCode,unitsCode,reportingTypeCode,natureDesc,unitsDesc,reportingTypeDesc
0,1,End poverty in all its forms everywhere,1.5,"By 2030, build the resilience of the poor and ...",1.5.2,Direct economic loss attributed to disasters i...,1,VC_DSR_LSGP,Direct economic loss attributed to disasters r...,2019.Q1.G.02,...,,,,,,,,,,
1,1,End poverty in all its forms everywhere,1.5,"By 2030, build the resilience of the poor and ...",1.5.2,Direct economic loss attributed to disasters i...,1,VC_DSR_LSGP,Direct economic loss attributed to disasters r...,2019.Q1.G.02,...,,,,,,,,,,
2,1,End poverty in all its forms everywhere,1.5,"By 2030, build the resilience of the poor and ...",1.5.2,Direct economic loss attributed to disasters i...,1,VC_DSR_LSGP,Direct economic loss attributed to disasters r...,2019.Q1.G.02,...,,,,,,,,,,


#### 2) Keep only rows with a non-null value, then cast every column to string and strip non-ASCII characters:

In [57]:
# Drop observations without a value and renumber the remaining rows from 0
x = x[x.value.notnull()]
x = x.reset_index(drop=True)

# Cast every column to string and strip non-ASCII characters. The result is
# assigned back to the column: calling replace(..., inplace=True) on x[c] acts
# on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
for c in x.columns:
    x[c] = x[c].astype(str).replace({r'[^\x00-\x7F]+':''}, regex=True)

x.shape
x.head(3)


(1714, 33)

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease,...,valueType,timeDetail,source,footnotes,natureCode,unitsCode,reportingTypeCode,natureDesc,unitsDesc,reportingTypeDesc
0,1,End poverty in all its forms everywhere,1.5,"By 2030, build the resilience of the poor and ...",1.5.2,Direct economic loss attributed to disasters i...,1,VC_DSR_LSGP,Direct economic loss attributed to disasters r...,2019.Q1.G.02,...,Float,,UNISDR (2018). Sendai Framework Monitor : Unit...,Disclaimer: the data being submitted by UNISDR...,C,PERCENT,G,Country data,Percentage,Global
1,1,End poverty in all its forms everywhere,1.5,"By 2030, build the resilience of the poor and ...",1.5.2,Direct economic loss attributed to disasters i...,1,VC_DSR_LSGP,Direct economic loss attributed to disasters r...,2019.Q1.G.02,...,Float,,UNISDR (2018). Sendai Framework Monitor : Unit...,Disclaimer: the data being submitted by UNISDR...,C,PERCENT,G,Country data,Percentage,Global
2,1,End poverty in all its forms everywhere,1.5,"By 2030, build the resilience of the poor and ...",1.5.2,Direct economic loss attributed to disasters i...,1,VC_DSR_LSGP,Direct economic loss attributed to disasters r...,2019.Q1.G.02,...,Float,,UNISDR (2018). Sendai Framework Monitor : Unit...,Disclaimer: the data being submitted by UNISDR...,C,PERCENT,G,Country data,Percentage,Global


In [58]:
def read_excel_file(path,fileName):
    """
    Read 'Sheet1' of an Excel workbook into a cleaned, all-string DataFrame.

    Rows whose `value` column is null are dropped, the index is renumbered
    from 0, every column is cast to `str`, and runs of non-ASCII characters
    are removed from every cell.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with `fileName`, so it
        must end with a path separator.
    fileName : str
        Name of the workbook inside `path`.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, with all columns holding strings.
    """
    # Passing the path straight to read_excel lets pandas open and close the
    # workbook itself, so no file handle is left open.
    x = pd.read_excel(path + fileName, sheet_name='Sheet1')
    x = x[x.value.notnull()].reset_index(drop=True)

    # Assign the cleaned column back rather than using inplace=True on x[c]:
    # the chained in-place call acts on a temporary Series and does not update
    # the frame under pandas Copy-on-Write.
    for c in x.columns:
        x[c] = x[c].astype(str).replace({r'[^\x00-\x7F]+':''}, regex=True)

    return x

#### Add series metadata

In [59]:
def add_series_metadata(dict,x):
    """
    Copy the series-level metadata of a data table into a dictionary.

    The goal, target, indicator and series fields are read from the first row
    of `x` and stored in `dict` under the same names.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table containing the series metadata columns.

    Returns
    -------
    dict
        The same dictionary object, with the metadata entries added.

    Raises
    ------
    ValueError
        If `x` has no rows.
    KeyError
        If one of the metadata columns is missing from `x`.
    """
    series_metadata = ['goalCode', 'goalDesc', 'targetCode', 'targetDesc', 'indicatorCode',
                  'indicatorDesc', 'indicatorTier', 'seriesCode', 'seriesDesc', 'seriesRelease']

    if len(x) == 0:
        raise ValueError('cannot read series metadata from an empty data table')

    for i in series_metadata:
        # Positional access: the first row is used whatever the index labels
        # are (x[i][0] would require a row labelled 0).
        dict[i] = x[i].iloc[0]
    return dict

    

#### Add time-coverage metadata

In [60]:
def add_time_coverage_metadata(dict,x):
    """
    Record the distinct time periods present in a data table.

    The unique values of the `timePeriod` column are sorted in ascending order
    and stored in `dict['time_coverage']`.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table with a `timePeriod` column.

    Returns
    -------
    dict
        The same dictionary object, with the `time_coverage` list added.
    """
    distinct_periods = pd.unique(x['timePeriod'])
    dict['time_coverage'] = sorted(distinct_periods)
    return dict

#### Add geographic coverage for groups, regions and countries

In [61]:
def add_geo_coverage(dict,x):
    """
    Record which groups, regions and countries appear in a data table.

    The distinct (`geoAreaCode`, `geoAreaName`, `type`) combinations are split
    by `type` and stored as lists of `{'geoAreaCode': ..., 'geoAreaName': ...}`
    records under `geo_coverage_groups`, `geo_coverage_regions` and
    `geo_coverage_countries`.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table with `geoAreaCode`, `geoAreaName` and `type` columns.

    Returns
    -------
    dict
        The same dictionary object, with the three coverage lists added.
    """
    id_columns = ['geoAreaCode', 'geoAreaName']
    geo = x[id_columns + ['type']].drop_duplicates()

    # (value of the `type` column, key under which the matches are stored)
    targets = (
        ('Group', 'geo_coverage_groups'),
        ('Region', 'geo_coverage_regions'),
        ('Country', 'geo_coverage_countries'),
    )
    for area_type, key in targets:
        matches = geo.loc[geo['type'] == area_type, id_columns]
        dict[key] = matches.to_dict('records')

    return dict


#### Add specification of slices included in the file

In [62]:
def add_slices_metadata(dict,x):
    """
    Record the distinct slices (disaggregation combinations) of a data table.

    Every column that is not one of the fixed series, time, geography, value
    or annotation columns listed below, and whose name does not end in
    'Desc', is treated as a slicing dimension. The distinct combinations of
    those columns are stored as a list of records in `dict['slices']`.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table to inspect.

    Returns
    -------
    dict
        The same dictionary object, with the `slices` list added.
    """
    # Columns that never count as a slicing dimension
    fixed_columns = [
        'goalCode', 'goalDesc',
        'targetCode', 'targetDesc',
        'indicatorCode', 'indicatorDesc', 'indicatorTier',
        'seriesRelease',
        'timePeriod',
        'geoAreaCode', 'geoAreaName', 'level', 'parentCode', 'parentName',
        'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
        'value', 'valueType',
        'timeDetail', 'source', 'footnotes',
        'natureCode', 'unitsCode', 'natureDesc', 'unitsDesc',
    ]

    # setdiff1d yields the remaining column names, sorted and de-duplicated
    candidates = np.setdiff1d(x.columns, fixed_columns)

    # Keep the code columns only; the matching '...Desc' columns are dropped
    slice_columns = [name for name in candidates if not name.endswith('Desc')]

    distinct_slices = x[slice_columns].drop_duplicates().reset_index(drop=True)
    dict['slices'] = distinct_slices.to_dict('records')

    return dict


    

#### Create file metadata dictionary

In [63]:
def file_metadata(path, fileName):
    """
    Build the metadata dictionary describing one data file.

    The file is loaded with `read_excel_file`, and the resulting table is
    passed in turn to the series, slices, time-coverage and geo-coverage
    helpers, each of which adds its entries to the dictionary.

    Parameters
    ----------
    path : str
        Directory prefix of the data files.
    fileName : str
        Name of the workbook to describe.

    Returns
    -------
    dict
        Metadata dictionary, starting with the `fileName` entry.
    """
    table = read_excel_file(path, fileName)

    metadata = {'fileName': fileName}
    # Order matters only for the key order of the resulting dictionary
    enrichers = (
        add_series_metadata,
        add_slices_metadata,
        add_time_coverage_metadata,
        add_geo_coverage,
    )
    for enrich in enrichers:
        metadata = enrich(metadata, table)
    return metadata


    

In [64]:
# Worked example: metadata entry for a single workbook
file_metadata(path=path, fileName='Data_1.5.2_VC_DSR_LSGP.xlsx')

{'fileName': 'Data_1.5.2_VC_DSR_LSGP.xlsx',
 'goalCode': '1',
 'goalDesc': 'End poverty in all its forms everywhere',
 'targetCode': '1.5',
 'targetDesc': 'By 2030, build the resilience of the poor and those in vulnerable situations and reduce their exposure and vulnerability to climate-related extreme events and other economic, social and environmental shocks and disasters',
 'indicatorCode': '1.5.2',
 'indicatorDesc': 'Direct economic loss attributed to disasters in relation to global gross domestic product (GDP)',
 'indicatorTier': '1',
 'seriesCode': 'VC_DSR_LSGP',
 'seriesDesc': 'Direct economic loss attributed to disasters relative to GDP (%)',
 'seriesRelease': '2019.Q1.G.02',
 'slices': [{'reportingTypeCode': 'G', 'seriesCode': 'VC_DSR_LSGP'}],
 'time_coverage': ['2005',
  '2006',
  '2007',
  '2008',
  '2009',
  '2010',
  '2011',
  '2012',
  '2013',
  '2014',
  '2015',
  '2016',
  '2017'],
 'geo_coverage_groups': [],
 'geo_coverage_regions': [],
 'geo_coverage_countries': [{'ge

In [65]:
# Build one metadata entry per data file, reporting progress along the way
file_metadata_set = []
total_files = len(files)
for counter_f, f in enumerate(files, start=1):
    print(f"creating metadata entry for file {f} ({counter_f} of {total_files})")
    file_metadata_set.append(file_metadata(path, f))
    


creating metadata entry for file Data_1.1.1_SI_POV_DAY1.xlsx (1 of 430)
creating metadata entry for file Data_1.1.1_SI_POV_EMP1.xlsx (2 of 430)
creating metadata entry for file Data_1.2.1_SI_POV_NAHC.xlsx (3 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_BENFTS.xlsx (4 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_CHLD.xlsx (5 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_DISAB.xlsx (6 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_LMKT.xlsx (7 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_LMKTPQ.xlsx (8 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_MATNL.xlsx (9 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_PENSN.xlsx (10 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_POOR.xlsx (11 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_SOCAST.xlsx (12 of 430)
creating metadata entry for file Data_1.3.1_SI_COV_SOCASTPQ.xlsx (13 of 430)
creating metadata entry for file Data_1.3.1_SI_

KeyboardInterrupt: 

#### Write metadata list to a `json` file

In [68]:
# Serialise the collected metadata entries as indented JSON
metadata_json_path = wd_dir + 'data/file_metadata.json'
with open(metadata_json_path, 'w') as out_handle:
    json.dump(file_metadata_set, out_handle, indent=4)