# Pivoting script
This script pivots the data into "wide" format and splits regional and country data.

## Load necessary libraries

In [None]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import copy
import numpy as np
import pandas as pd
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
# Setting IPython's `ast_node_interactivity` option to "all" makes a notebook
# cell display the value of every expression statement, not only the last one.
# The preview cells below (e.g. `x.shape` followed by `x.head(3)`) rely on it.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## User parameters

In [None]:
# Release tag of the data; it names the sub-folder of `data/` that is read below.
release = '2019.Q1.G.02' # Make sure to have the correct release here

# '__file__' is passed as a literal string, so it is resolved against the
# current working directory; the directory part of that path is kept.
dir_path = os.path.split(os.path.realpath('__file__'))[0]
print(dir_path)

# Main directory, relative to the working directory.
wd_dir = '../'
print(f'Main dir: {wd_dir}')

## Utilities

#### Convert string to camelCase

In [None]:
def camelCase(st):
    """
    Convert a string to lower camelCase.

    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    The string is title-cased, every non-alphanumeric character is dropped,
    and the first remaining character is lower-cased.

    Parameters
    ----------
    st : str
        Text to convert.

    Returns
    -------
    str
        The camelCase form of `st`; an empty string when `st` contains no
        alphanumeric characters.
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Slice instead of indexing so that an empty result does not raise IndexError.
    return output[:1].lower() + output[1:]

## Create `JSON` with metadata on each long data table

### Read list of excel files with data for each series

In [None]:
# Folder that holds the data files of the selected release.
path = f'{wd_dir}data/{release}/'
# Names of all entries found in that folder.
files = os.listdir(path)
# Preview the first ten names.
files[:10]

### For a selected file:

#### 1)  Convert content of data file into a pandas DataFrame

In [None]:
# File used to illustrate the individual processing steps.
f = 'Data_1.5.2_VC_DSR_LSGP.xlsx'

# Open the workbook and load its 'Sheet1' worksheet into a DataFrame.
xlsx = pd.ExcelFile(f'{path}{f}')
x = pd.read_excel(xlsx, sheet_name='Sheet1')
# Show the table dimensions and its first three rows.
x.shape
x.head(n=3)



#### 2) Keep only rows with non-null value:

In [None]:
# Drop the rows without a value and renumber the remaining rows from 0.
x = x.loc[x['value'].notnull()].reset_index(drop=True)
# Show the table dimensions and its first three rows after filtering.
x.shape
x.head(n=3)

In [None]:
def read_excel_file(path,fileName):
    """
    Load the 'Sheet1' worksheet of an Excel file, keeping only rows with a value.

    Parameters
    ----------
    path : str
        Folder containing the file; a trailing separator is optional.
    fileName : str
        Name of the Excel file inside `path`.

    Returns
    -------
    pandas.DataFrame
        Rows whose `value` column is not null, re-indexed from 0.
    """
    # os.path.join gives the same result as plain concatenation when `path`
    # already ends with a separator, and still works when it does not.
    full_name = os.path.join(path, fileName)
    # The context manager closes the workbook handle once the sheet is read.
    with pd.ExcelFile(full_name) as xlsx:
        x = pd.read_excel(xlsx, 'Sheet1')
    x = x[x.value.notnull()]
    return x.reset_index(drop=True)

#### Add series metadata

In [None]:
def add_series_metadata(dict,x):
    """
    Copy the series-level metadata of a data table into a dictionary.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place (the name shadows the builtin; it is
        kept so that existing callers keep working).
    x : pandas.DataFrame
        Data table holding the goal, target, indicator and series columns
        listed in `series_metadata`.

    Returns
    -------
    dict
        The same dictionary, with one entry per metadata column taken from
        the first row of `x`.

    Raises
    ------
    KeyError
        If one of the metadata columns is missing from `x`.
    IndexError
        If `x` has no rows.
    """
    series_metadata = ['goalCode', 'goalDesc', 'targetCode', 'targetDesc', 'indicatorCode',
                  'indicatorDesc', 'indicatorTier', 'seriesCode', 'seriesDesc', 'seriesRelease']

    for i in series_metadata:
        # Positional access: take the first row whatever the index labels are.
        dict[i] = x[i].iloc[0]
    return dict

    

#### Add time-coverage metadata

In [None]:
def add_time_coverage_metadata(dict,x):
    """
    Record the distinct time periods of a data table, in ascending order.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table with a `timePeriod` column.

    Returns
    -------
    dict
        The same dictionary, with the sorted list of distinct periods stored
        under the key 'time_coverage'.
    """
    distinct_periods = pd.unique(x['timePeriod'])
    dict['time_coverage'] = sorted(distinct_periods)
    return dict

#### Add geographic coverage for groups, regions and countries

In [None]:
def add_geo_coverage(dict,x):
    """
    Record which groups, regions and countries appear in a data table.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table with `geoAreaCode`, `geoAreaName` and `type` columns.

    Returns
    -------
    dict
        The same dictionary with three new entries, 'geo_coverage_groups',
        'geo_coverage_regions' and 'geo_coverage_countries'. Each one is a
        list of {'geoAreaCode': ..., 'geoAreaName': ...} records for the
        distinct areas whose `type` is 'Group', 'Region' or 'Country'.
    """
    area_columns = ['geoAreaCode', 'geoAreaName']
    areas = x[area_columns + ['type']].drop_duplicates()

    # (output key, value of the `type` column) pairs, in output order.
    coverage_keys = (
        ('geo_coverage_groups', 'Group'),
        ('geo_coverage_regions', 'Region'),
        ('geo_coverage_countries', 'Country'),
    )
    for key, area_type in coverage_keys:
        selected = areas.loc[areas['type'] == area_type, area_columns]
        dict[key] = selected.to_dict('records')

    return dict


#### Add specification of slices included in the file

In [None]:
def add_slices_metadata(dict,x):
    """
    Record the distinct slices (combinations of disaggregation codes) of a table.

    Every column of `x` that is not one of the fixed columns listed in
    `fixed_columns` and whose name does not end in 'Desc' is treated as a
    slice dimension.

    Parameters
    ----------
    dict : dict
        Dictionary to update in place.
    x : pandas.DataFrame
        Data table.

    Returns
    -------
    dict
        The same dictionary, with the list of distinct slice records stored
        under the key 'slices'.
    """
    fixed_columns = [
        'goalCode', 'goalDesc',
        'targetCode', 'targetDesc',
        'indicatorCode', 'indicatorDesc', 'indicatorTier',
        'seriesRelease',
        'timePeriod',
        'geoAreaCode', 'geoAreaName', 'level', 'parentCode', 'parentName',
        'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
        'value', 'valueType',
        'timeDetail', 'source', 'footnotes', 'natureCode', 'unitsCode',
        'natureDesc', 'unitsDesc',
    ]

    # np.setdiff1d returns the remaining column names sorted and without duplicates.
    remaining = np.setdiff1d(x.columns, fixed_columns)
    # Description columns repeat what the code columns say, so they are left out.
    slice_columns = [name for name in remaining if name[-4:] != 'Desc']

    distinct_slices = x[slice_columns].drop_duplicates().reset_index(drop=True)
    dict['slices'] = distinct_slices.to_dict('records')

    return dict


    

#### Create file metadata dictionary

In [None]:
def file_metadata(path, fileName):
    """
    Build the metadata dictionary of one data file.

    Parameters
    ----------
    path : str
        Folder containing the file.
    fileName : str
        Name of the Excel data file.

    Returns
    -------
    dict
        The file name under 'fileName', followed by the series, slices,
        time-coverage and geographic-coverage metadata of the file.
    """
    table = read_excel_file(path, fileName)

    metadata = {'fileName': fileName}
    # Each step adds its entries to the dictionary and hands it back.
    steps = (
        add_series_metadata,
        add_slices_metadata,
        add_time_coverage_metadata,
        add_geo_coverage,
    )
    for step in steps:
        metadata = step(metadata, table)
    return metadata


    

In [None]:
# Example: build and display the metadata dictionary of a single data file.
file_metadata( path , 'Data_1.5.2_VC_DSR_LSGP.xlsx')

In [None]:
# Collect one metadata dictionary per data file of the release.
# A list is needed here because entries are appended in file order
# (a dict has no `append` method).
file_metadata_set = []
# `counter_f` is the 1-based position of the file, used in the progress message.
for counter_f, f in enumerate(files, start=1):
    print("creating metadata entry for file " + f + " (" + str(counter_f) + " of " + str(len(files)) + ")")
    file_metadata_set.append(file_metadata( path , f))
    
