# Pivoting script
This script pivots the data into "wide" format and splits it into regional and country data.

## Load necessary libraries

In [9]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import numpy as np
import pandas as pd
import xlsxwriter

# Display the result of every top-level expression in a cell, not only the last one.
# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## User parameters

In [10]:
# Release tag of the data set -- make sure to have the correct release here
release = '2019.Q1.G.02'

# Directory the notebook runs from. '__file__' is deliberately a plain string:
# realpath() resolves it against the current working directory, and dirname()
# then strips that placeholder name again.
dir_path = os.path.realpath('__file__')
dir_path = os.path.dirname(dir_path)
print(dir_path)

# Main project directory, given as a relative path
wd_dir = r'../'
print('Main dir: ', wd_dir, sep='')

C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\unsd\notebooks
Main dir: ../


## Utilities

#### Convert string to camelCase

In [11]:
def camelCase(st):
    """
    Convert a string to camelCase.

    Each word is capitalised with ``str.title()``, every non-alphanumeric
    character (spaces, punctuation, underscores, ...) is dropped, and the
    first remaining character is lower-cased. Because ``str.title()``
    lower-cases the letters that follow the first one of each word, capitals
    inside a word are not preserved.

    Adapted from:
    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    Parameters
    ----------
    st : str
        Text to convert.

    Returns
    -------
    str
        The camelCase form of ``st``; an empty string when ``st`` contains
        no alphanumeric characters.
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Nothing left to lower-case: indexing output[0] would raise IndexError.
    if not output:
        return output
    return output[0].lower() + output[1:]

## Create `JSON` with metadata on each long data table

In [14]:
# Folder holding the Excel files of the selected release
path = wd_dir + r'data/' + release + '/'
# os.listdir() returns names in arbitrary order; sort them so that positional
# access such as files[0] gives the same file on every run and platform.
files = sorted(os.listdir(path))
files[0:10]

['Data_1.1.1_SI_POV_DAY1.xlsx',
 'Data_1.1.1_SI_POV_EMP1.xlsx',
 'Data_1.2.1_SI_POV_NAHC.xlsx',
 'Data_1.3.1_SI_COV_BENFTS.xlsx',
 'Data_1.3.1_SI_COV_CHLD.xlsx',
 'Data_1.3.1_SI_COV_DISAB.xlsx',
 'Data_1.3.1_SI_COV_LMKT.xlsx',
 'Data_1.3.1_SI_COV_LMKTPQ.xlsx',
 'Data_1.3.1_SI_COV_MATNL.xlsx',
 'Data_1.3.1_SI_COV_PENSN.xlsx']

In [15]:
# Open the first workbook of the listing; `path` already ends with a separator
first_workbook = path + files[0]
xlsx = pd.ExcelFile(first_workbook)

In [24]:
# Load the 'Sheet1' worksheet of the opened workbook into a DataFrame
x = xlsx.parse(sheet_name='Sheet1')

# Preview the first rows, then list the column labels
x.head()
x.columns

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease,...,valueType,timeDetail,source,footnotes,natureCode,unitsCode,reportingTypeCode,natureDesc,unitsDesc,reportingTypeDesc
0,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02,...,Float,,"World Bank, Development Research Group. Data a...",World aggregate.,G,PERCENT,G,Global monitoring data,Percentage,Global
1,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02,...,,,,,,,,,,
2,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02,...,,,,,,,,,,
3,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02,...,,,,,,,,,,
4,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02,...,Float,,"World Bank, Development Research Group. Data a...",,G,PERCENT,G,Global monitoring data,Percentage,Global


Index(['goalCode', 'goalDesc', 'targetCode', 'targetDesc', 'indicatorCode',
       'indicatorDesc', 'indicatorTier', 'seriesCode', 'seriesDesc',
       'seriesRelease', 'timePeriod', 'geoAreaCode', 'geoAreaName', 'level',
       'parentCode', 'parentName', 'type', 'CountryProfile', 'ISO3CD',
       'UNMember', 'X', 'Y', 'value', 'valueType', 'timeDetail', 'source',
       'footnotes', 'natureCode', 'unitsCode', 'reportingTypeCode',
       'natureDesc', 'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:

In [40]:
# Metadata describing the series held in `x`.
# NOTE(review): the name `dict` shadows the Python built-in. It is kept here
# because other cells may refer to it -- consider renaming it throughout.
dict = {}

# Series-level columns; the value of the first row is taken for each of them.
# NOTE(review): presumably these are constant within one file -- confirm.
series_metadata = ['goalCode', 'goalDesc', 'targetCode', 'targetDesc', 'indicatorCode',
              'indicatorDesc', 'indicatorTier', 'seriesCode', 'seriesDesc', 'seriesRelease']
for field in series_metadata:
    # .iloc[0] selects the first row by position, whatever the index labels are
    first_value = x[field].iloc[0]
    # Unwrap NumPy scalars (e.g. int64) into plain Python values so that the
    # result can be serialised with the json module
    dict[field] = first_value.item() if isinstance(first_value, np.generic) else first_value

# Sorted list of the distinct time periods, as plain Python values
time_coverage = sorted(x['timePeriod'].unique().tolist())
dict['time_coverage'] = time_coverage

dict



{'goalCode': 1,
 'goalDesc': 'End poverty in all its forms everywhere',
 'targetCode': 1.1,
 'targetDesc': 'By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than $1.25 a day',
 'indicatorCode': '1.1.1',
 'indicatorDesc': 'Proportion of population below the international poverty line, by sex, age, employment status and geographical location (urban/rural)',
 'indicatorTier': 1,
 'seriesCode': 'SI_POV_DAY1',
 'seriesDesc': 'Proportion of population below international poverty line (%)',
 'seriesRelease': '2019.Q1.G.02',
 'time_coverage': [1990,
  1991,
  1992,
  1993,
  1994,
  1995,
  1996,
  1997,
  1998,
  1999,
  2000,
  2001,
  2002,
  2003,
  2004,
  2005,
  2006,
  2007,
  2008,
  2009,
  2010,
  2011,
  2012,
  2013,
  2014,
  2015,
  2016,
  2017]}

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017], dtype=int64)