# Pre-processing script
This script pulls SDG data from the API and transforms it into csv files.
The steps are:
- Pull data from the [API](https://unstats.un.org/SDGAPI/swagger/)
- Join with geography
- Save as a "long" table
- Pivot into "wide" format and split regional and country data

### Load necessary libraries

In [None]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import numpy as np
import pandas as pd
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### User parameters

In [None]:
# Release code of the SDG database to pull -- make sure it is the correct one.
release = '2019.Q1.G.02'

# '__file__' is passed as a literal string, so it is resolved against the
# current working directory; the result is the directory the notebook runs in.
dir_path = os.path.split(os.path.realpath('__file__'))[0]
print(dir_path)

# Folder holding the input files, relative to the working directory.
wd_dir = '../'
print('data inputs dir: ' + wd_dir)

## Utilities

#### Convert string to camelCase

In [None]:
def camelCase(st):
    """
    Convert a string to camelCase.

    The string is title-cased, every non-alphanumeric character is removed,
    and the first remaining character is lower-cased.  Because `str.title()`
    lower-cases letters inside a word, existing interior capitals are lost
    (e.g. 'geoAreaCode' becomes 'geoareacode').

    An empty string, or a string without any alphanumeric character, yields
    an empty string instead of raising IndexError.

    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Slicing (rather than indexing) keeps this safe when `output` is empty.
    return output[:1].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [None]:
# Silence urllib3's InsecureRequestWarning for every request made afterwards.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Create array to catch errors

In [None]:
# Module-level list meant to collect errors.
# NOTE(review): nothing in the visible part of the notebook appends to it.
error_log = []

### List of countries to be plotted on a map (with XY coordinates)

In [None]:
def countryListXY(file):
    """
    Read a tab-delimited, latin-1 encoded text file into a DataFrame.

    Each data row becomes one DataFrame row, keyed by the header line of the
    file.  The `geoAreaCode` column is cast to `str`.

    Parameters
    ----------
    file : path of the tab-delimited file; it must contain a `geoAreaCode` column.

    Returns
    -------
    pandas.DataFrame with one row per line of the file.
    """
    with open(file, newline='', encoding='latin-1') as handle:
        records = [dict(entry) for entry in csv.DictReader(handle, delimiter='\t')]

    table = pd.DataFrame(records)
    return table.astype({'geoAreaCode': 'str'})

    #print(countryListXY[1])
    #for c in countryListXY:

In [None]:
# Preview the first 10 rows of the coordinates file found in the inputs folder.
countryListXY(wd_dir + 'CountryListXY.txt').head(10)

#### Call the endpoint of the SDG API that provides the list of hierarchical groupings of geographic Areas:

In [None]:
def geoAreaTree():
    """
    Download the geographic-area trees from the SDG API.

    Issues a GET request to the `GeoArea/Tree` endpoint, decodes the body as
    UTF-8 and returns the parsed JSON document.
    """
    endpoint = "https://unstats.un.org/SDGAPI/v1/sdg/GeoArea/Tree"
    raw_body = urllib3.PoolManager().request('GET', endpoint).data
    return json.loads(raw_body.decode('UTF-8'))

Note: The geoAreaTree object has various "trees" in it.  We usually use the "World" tree; however, some economic and geographic groupings are only in other trees.

In [None]:
# Download the trees once and reuse the result: calling geoAreaTree() both for
# the count and for the loop would request the same payload from the API twice.
geo_area_trees = geoAreaTree()

# Number of trees, followed by the name of the root area of each tree.
print(len(geo_area_trees))
for t in geo_area_trees:
    print('root='+t['geoAreaName'])

#### Traverse a hierarchical tree of geographic areas and convert it to a parent-child hierarchy table:

In [None]:
def flatten(tree):
    """
    Convert one geographic-area tree into a parent-child table.

    Resets the module-level `hierarchy` list and the `traverse.level` counter,
    lets `traverse` fill `hierarchy` with one record per node, and returns
    those records as a DataFrame.

    Parameters
    ----------
    tree : root node passed straight to `traverse`.

    Returns
    -------
    pandas.DataFrame built from the records collected in `hierarchy`.
    """
    
    # `hierarchy` is shared with `traverse`, which appends to it.
    global hierarchy
    
    # Start from a clean state so repeated calls do not accumulate rows.
    hierarchy = []
    # The root node is recorded as level 1.
    traverse.level = 1
    traverse(tree)
    
    return pd.DataFrame(hierarchy)

In [None]:
def traverse(tree, parentCode=None, parentName=None):
    """
    Walk a geographic-area tree depth-first and record every node.

    One dictionary per node (keys `level`, `type`, `parentCode`, `parentName`,
    `geoAreaCode`, `geoAreaName`) is appended to the module-level `hierarchy`
    list.  `geoAreaCode` and `parentCode` are stored as strings.

    The function relies on state prepared by the caller (see `flatten`):
    the global list `hierarchy` must exist, and `traverse.level` must hold the
    level number to assign to `tree`.  `traverse.level` is left unchanged.

    Parameters
    ----------
    tree : dict with keys `type`, `geoAreaCode`, `geoAreaName` and, optionally,
        `children` (a list of nodes of the same shape, or None / missing for a leaf).
    parentCode : code recorded as the parent of `tree` (None for a root).
    parentName : name recorded as the parent of `tree` (None for a root).

    Returns
    -------
    pandas.DataFrame built from the whole `hierarchy` list.
    """

    def _collect(node, level, parent_code, parent_name):
        # Record the node itself, then recurse into its children one level deeper.
        hierarchy.append({
            'level': level,
            'type': node['type'],
            'parentCode': parent_code,
            'parentName': parent_name,
            'geoAreaCode': str(node['geoAreaCode']),
            'geoAreaName': node['geoAreaName'],
        })
        # A leaf may carry an empty list, None, or no 'children' key at all.
        for child in node.get('children') or []:
            _collect(child, level + 1, str(node['geoAreaCode']), node['geoAreaName'])

    _collect(tree, traverse.level, parentCode, parentName)

    # Build the DataFrame once, not once per visited node.
    return pd.DataFrame(hierarchy)

#### `geoAreas` holds the flattened list of geographic areas under 'World':

In [None]:
# Flatten the first tree returned by the API (per the heading above, the
# 'World' tree -- TODO confirm it is always at position 0).
geoAreas = flatten(geoAreaTree()[0])
# Show the first and last rows of the flattened table.
geoAreas.head()
print('...')
geoAreas.tail()

##### Temporary Fix for missing regions: 

In [None]:
# Groupings that must be present in `geoAreas`; each is added as a level-1
# 'Group' without a parent when its code is not already in the table.
missing_regions = [
    ('910', 'High income economies (WB)'),
    ('911', 'Low income economies (WB)'),
    ('912', 'Lower middle economies (WB)'),
    ('913', 'Low and middle income economies (WB)'),
    ('914', 'Upper middle economies (WB)'),
    ('135', 'Caucasus and Central Asia'),
    ('127', 'Southern Asia (excluding India)'),
    ('199', 'Least Developed Countries (LDC)'),
    ('485', 'Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)'),
    ('514', 'Developed Regions'),
    ('515', 'Developing Regions'),
    ('518', 'Eastern Asia (excluding Japan)'),
    ('738', 'Sub-Saharan Africa (inc. Sudan)'),
    ('746', 'Northern Africa (exc. Sudan)'),
]

# `code in series` tests the index labels of a pandas Series, not its values,
# so membership is checked against the set of values instead.
known_codes = set(geoAreas['geoAreaCode'])

extra_rows = [
    {'level': 1,
     'type': 'Group',
     'parentCode': None,
     'parentName': None,
     'geoAreaCode': code,
     'geoAreaName': name}
    for code, name in missing_regions
    if code not in known_codes
]

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
if extra_rows:
    geoAreas = pd.concat([geoAreas, pd.DataFrame(extra_rows)], sort=True)

#==================================

geoAreas = geoAreas.reset_index(drop=True)

geoAreas.tail(20)


#### Merge coordinates and list of geographic areas in SDG database

In [None]:
# Display the table as a raw array of values.
geoAreas.values

In [None]:
def geoareasXY(geoareas,coordinates_file):
    """
    Attach the coordinates read from `coordinates_file` to a table of areas.

    The coordinates table is loaded with `countryListXY`, its `geoAreaName`
    column is left out, and the rest is outer-joined to `geoareas` on
    `geoAreaCode`.  Rows are then ordered by the numeric value of
    `geoAreaCode` and the index is renumbered from 0.

    Parameters
    ----------
    geoareas : DataFrame with a `geoAreaCode` column.
    coordinates_file : path handed to `countryListXY`.

    Returns
    -------
    pandas.DataFrame with the merged, sorted rows.
    """
    coordinates = countryListXY(coordinates_file)
    without_name = coordinates.columns != 'geoAreaName'

    merged = pd.merge(geoareas, coordinates.loc[:, without_name],
                      how='outer',
                      on=['geoAreaCode'])

    # Temporary numeric column: codes are strings, so sort on their float value.
    merged['order'] = merged['geoAreaCode'].astype(float)
    ordered = merged.sort_values('order')
    del ordered['order']

    return ordered.reset_index(drop=True)

# x.to_excel('test.xlsx', engine ='xlsxwriter')

In [None]:
# Display the geographic areas joined with the coordinates file.
geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')

### Get the list of goals, targets, indicators and series

In [None]:
def series_list(release):
    """
    List every series of a release together with its goal, target and indicator.

    Calls the `Goal/List?includechildren=true` endpoint of the SDG API and
    flattens the nested goals / targets / indicators / series into one row
    per series whose `release` equals the requested one.

    Parameters
    ----------
    release : release code to keep (compared with each series' `release`).

    Returns
    -------
    pandas.DataFrame with the columns goalCode, goalDesc, targetCode,
    targetDesc, indicatorCode, indicatorDesc, indicatorTier, seriesCode,
    seriesDesc and seriesRelease, in that order.  When no series matches,
    an empty DataFrame with those same columns is returned.
    """
    # Call the endpoint of the SDG API that provides the list of goals with all their children:
    http = urllib3.PoolManager()
    response = http.request('GET', "https://unstats.un.org/SDGAPI/v1/sdg/Goal/List?includechildren=true")
    responseData = json.loads(response.data.decode('UTF-8'))

    # Column names of the flat table, in output order:
    keys = ["goalCode",
            "goalDesc",
            "targetCode",
            "targetDesc",
            "indicatorCode",
            "indicatorDesc",
            "indicatorTier",
            "seriesCode",
            "seriesDesc",
            "seriesRelease"
           ]

    # Iterate over goals, targets, indicators, and series for the specified release:
    rows = []
    for g in responseData:
        for t in g['targets']:
            for i in t['indicators']:
                for s in i['series']:
                    if s['release'] != release:
                        continue
                    values = [g['code'], g['title'],
                              t['code'], t['description'],
                              i['code'], i['description'], i['tier'],
                              s['code'], s['description'], s['release']]
                    rows.append(dict(zip(keys, values)))

    # Passing `columns` fixes the column order and keeps the columns present
    # even when `rows` is empty, where selecting them afterwards would fail.
    return pd.DataFrame(rows, columns=keys)

In [None]:
# Preview the first 4 series of the configured release.
series_list(release).head(4)

#### Cross-Join geographic area catalogue (with coordinates) and series catalogue

In [None]:
def cross_geo_series(geoAreas, countryXY, seriesCode, release):
    """
    Cross-join the geographic areas (with coordinates) and one series' catalogue rows.

    Parameters
    ----------
    geoAreas : DataFrame of geographic areas, passed to `geoareasXY`.
    countryXY : path of the coordinates file, passed to `geoareasXY`.
    seriesCode : series to keep from the catalogue returned by `series_list`.
    release : release code passed to `series_list`.

    Returns
    -------
    pandas.DataFrame holding every combination of an area row and a
    catalogue row of the series.
    """
    # A constant helper column turns the merge into a cross join.  `assign`
    # returns new frames, so no column is ever set on a filtered slice
    # (which would trigger pandas' SettingWithCopyWarning).
    df1 = geoareasXY(geoAreas, countryXY).assign(key=1)

    catalogue = series_list(release)
    df2 = catalogue.loc[catalogue['seriesCode'] == seriesCode].assign(key=1)

    return pd.merge(df1, df2, on='key').drop(columns='key')

In [None]:
# Example: cross-join the areas with the catalogue rows of series SE_ACC_COMP
# and show the first and last rows of the result.
x = cross_geo_series(geoAreas, wd_dir + 'CountryListXY.txt','SE_ACC_COMP','2019.Q1.G.02')
x.head()
print('...')
x.tail()

### Get the data for each series

#### Verify how many pages need to be requested to get all the data for a specific series from the SDG API. 

In [None]:
def series_request_details(seriesCode,release):
    """
    Work out how many pages are needed to download a series.

    A minimal request (`pageSize=2`) is sent only to read `totalElements`
    from the response; the page count is then computed for pages of 500
    records.

    Parameters
    ----------
    seriesCode : code of the series to query.
    release : release code to query.

    Returns
    -------
    dict with the keys `series`, `totalElements`, `nPages` and `pageSize`.
    `nPages` is the exact number of pages needed: 0 when there is no data,
    and no extra empty page when `totalElements` is a multiple of `pageSize`.
    """
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"

    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))

    pageSize = 500
    totalElements = responseData['totalElements']
    # Ceiling division: floor(total / pageSize) + 1 would ask for one page too
    # many whenever the total is an exact multiple of the page size.
    nPages = math.ceil(totalElements / pageSize)

    return {'series' : seriesCode,
            'totalElements' : totalElements,
            'nPages' : nPages,
            'pageSize' : pageSize
           }


In [None]:
# Example: paging details for series SE_ACC_COMP.
series_request_details('SE_ACC_COMP', '2019.Q1.G.02')

#### Explore the code lists of the attributes and dimensions of a series
Describe each attribute or dimension as a simple dictionary made of a set of `code`-`description` pairs.  For the code, use the SDMX code, and not the internal code of the database.  Keep all labels in camelCase.

In [None]:
def series_code_lists(seriesCode, release):
    """
    Collect the code lists of a series' attributes and dimensions.

    A minimal data request (`pageSize=2`) is sent for the series, and the
    `attributes` and `dimensions` sections of the response are turned into
    `{code: description}` dictionaries, using each entry's `code` field.

    Returns
    -------
    dict with the entry `'seriesCode'` plus one entry per attribute and
    dimension, keyed by the camelCase form of the concept `id`.
    """
    url = ('https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode
           + '&releaseCode=' + release + "&pageSize=2")

    body = urllib3.PoolManager().request('GET', url).data
    payload = json.loads(body.decode('UTF-8'))

    code_lists = {'seriesCode': seriesCode}

    # Attributes first, then dimensions; a later concept overwrites an earlier
    # one that maps to the same camelCase name.
    for concepts in (payload['attributes'], payload['dimensions']):
        for concept in concepts:
            code_lists[camelCase(concept['id'])] = {
                entry['code']: entry['description'] for entry in concept['codes']
            }

    return code_lists


In [None]:
# Example: code lists of series SE_ACC_COMP as nested dictionaries.
series_code_lists('SE_ACC_COMP', '2019.Q1.G.02')

#### Simplify further by presenting all the codes and their descriptions in a single table:

In [None]:
def series_code_lists2(seriesCode, release):
    """
    Return the codes of a series' attributes and dimensions as a single table.

    A minimal data request (`pageSize=2`) is sent for the series; every code
    of every attribute and dimension in the response becomes one row.

    Returns
    -------
    pandas.DataFrame with one row per code and the fields `series`, `role`
    ('attribute' or 'dimension'), `concept` (camelCase concept id), `code`
    (the entry's `sdmx` value) and `description`.
    """
    url = ('https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode
           + '&releaseCode=' + release + "&pageSize=2")

    body = urllib3.PoolManager().request('GET', url).data
    payload = json.loads(body.decode('UTF-8'))

    groups = (('attribute', payload['attributes']),
              ('dimension', payload['dimensions']))

    rows = []
    for role, concepts in groups:
        for concept in concepts:
            for entry in concept['codes']:
                rows.append({
                    'series': seriesCode,
                    'role': role,
                    'concept': camelCase(concept['id']),
                    'code': entry['sdmx'],
                    'description': entry['description'],
                })

    return pd.DataFrame(rows)


In [None]:
# Example: code lists of series SE_ACC_COMP as one table.
series_code_lists2('SE_ACC_COMP', '2019.Q1.G.02')

#### Build query string to collect data for a specific series from the global SDG API

In [None]:
def series_query(seriesCode, release, page, pageSize):
    """
    Request one page of data for a series from the SDG API.

    Parameters
    ----------
    seriesCode : code of the series to query.
    release : release code to query.
    page : page number, converted to text for the query string.
    pageSize : number of records per page, converted to text for the query string.

    Returns
    -------
    The parsed JSON response.
    """
    url = ''.join([
        'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=', seriesCode,
        '&releaseCode=', release,
        '&page=', str(page),
        '&pageSize=', str(pageSize),
    ])
    body = urllib3.PoolManager().request('GET', url).data
    return json.loads(body.decode('UTF-8'))

#### Get data for a specific series from the API
*(!) Notice that a data point may appear more than once if it belongs to a "multi-purpose indicator"*

In [None]:
def series_data(seriesCode, release):
    """
    Download all records of a series, page by page.

    The page count and page size come from `series_request_details`; each
    page is fetched with `series_query` and its `data` list is accumulated.
    A progress line is printed for every page.

    Returns
    -------
    list of the records of all pages, or an empty list when the series
    reports no elements.
    """
    details = series_request_details(seriesCode,release)
    records = []

    if not details['totalElements'] > 0:
        return records

    total_pages = details['nPages']
    for page in range(1, total_pages + 1):
        print("---Series " + seriesCode + ": Processing page " + str(page) + " of " + str(total_pages))
        batch = series_query(seriesCode, release, page, details['pageSize'])['data']
        if len(batch) > 0:
            records.extend(batch)

    return records

In [None]:
# Example: first raw record of series SE_ACC_COMP.
series_data('SE_ACC_COMP','2019.Q1.G.02')[0]

#### Flatten the dictionary, extracting individual attributes and dimensions as key-value pairs in their own right.
Also convert the years (`timePeriod`) variable to `int`

In [None]:
def flat_series_data(seriesCode,release):
    """
    Flatten the records returned by `series_data` into plain dictionaries.

    For every field of a record:
    - a list value is joined into one comma-separated string;
    - a dict value is expanded into one entry per item, keyed by the
      camelCase form of '<item key> Code';
    - `time_detail` is stored under its camelCase name;
    - `timePeriodStart` is stored as `timePeriod`, converted to `int`;
    - `series` / `seriesDescription` are stored as `seriesCode` / `seriesDesc`;
    - `geoAreaCode` is converted to `str`;
    - anything else is copied unchanged.

    Returns
    -------
    list of flat dictionaries, one per record.
    """
    renamed_fields = {'series': 'seriesCode', 'seriesDescription': 'seriesDesc'}

    flattened = []
    for record in series_data(seriesCode,release):
        flat = {}
        for field, content in record.items():
            # The value type takes precedence over the field name.
            if type(content) is list:
                flat[field] = ', '.join(content)
                continue
            if type(content) is dict:
                flat.update({camelCase(name + ' Code'): code
                             for name, code in content.items()})
                continue

            if field == 'time_detail':
                flat[camelCase(field)] = content
            elif field == 'timePeriodStart':
                flat['timePeriod'] = int(content)
            elif field == 'geoAreaCode':
                flat['geoAreaCode'] = str(content)
            else:
                flat[renamed_fields.get(field, field)] = content
        flattened.append(flat)

    return flattened

In [None]:
# Example: first flattened record of series SE_ACC_COMP.
flat_series_data('SE_ACC_COMP','2019.Q1.G.02')[0]

#### Convert flattened dictionary of data series to pandas data frame, remove goal/target/indicator columns, and remove duplicate values

In [None]:
def dataframe_unique_series_data(seriesCode,release):
    """
    Return the flattened data of a series as a de-duplicated DataFrame.

    The columns are those of the first flattened record, in that record's
    key order.  The `goal`, `target`, `indicator` and `seriesCount` columns
    are removed before duplicate rows are dropped.
    """
    records = flat_series_data(seriesCode,release)
    # Only the keys of the first record are used as columns.
    frame = pd.DataFrame(records, columns=records[0].keys())

    without_catalogue = frame.drop(['goal','target','indicator', 'seriesCount'], axis=1)
    return without_catalogue.drop_duplicates()

In [None]:
# Example: de-duplicated data of series SE_ACC_COMP, also written to
# 'test2.xlsx' in the working directory for inspection.
x = dataframe_unique_series_data('SE_ACC_COMP','2019.Q1.G.02')
x.to_excel('test2.xlsx', engine ='xlsxwriter')
x.head()

#### Add descriptions to coded dimension and attributes

In [None]:
def annotated_data(seriesCode, release):
    """
    Add description columns to the coded attributes and dimensions of a series.

    For every concept returned by `series_code_lists` (except the
    `seriesCode` entry), a two-column lookup `<concept>Code` /
    `<concept>Desc` is merged into the series data on `<concept>Code`.
    The merge uses pandas' default join, so rows whose code has no match in
    the lookup are not kept.

    Returns
    -------
    pandas.DataFrame with the series data plus one `<concept>Desc` column
    per concept.
    """
    code_lists = series_code_lists(seriesCode, release)
    annotated = dataframe_unique_series_data(seriesCode,release)

    for concept, codes in code_lists.items():
        if concept == 'seriesCode':
            continue

        code_column = concept + 'Code'
        lookup = pd.DataFrame(list(codes.items()))
        lookup.columns = [code_column, concept + 'Desc']

        annotated = pd.merge(annotated, lookup, on=code_column).reset_index(drop=True)

    return annotated

In [None]:
# Example: annotated data of series SL_ISV_IFRM -- columns, shape and first rows.
x = annotated_data('SL_ISV_IFRM','2019.Q1.G.02')
print(x.columns)
print(x.shape)
x.head()

#### Produce 'long' files for each indicator/series combination
(Notice that multi-purpose indicators need to be split)

In [None]:
# Series and release used for this check; note that `release` overwrites the
# value set in the user parameters.
s = 'SL_ISV_IFRM'
release = '2019.Q1.G.02'

print(s)
print(release)

# Annotated data of the series: columns, shape and last 10 rows.
x = annotated_data(s,release)
print(x.columns)
print(x.shape)
x.tail(10)



In [None]:
# Two of the distinct series codes of the release (a set has no defined order,
# so which two are shown can vary).
list(set(series_list(release).seriesCode))[0:2]

In [None]:
# Build the 'long' table for a single series, as a walk-through of the steps
# applied to every series further below.
s = 'SG_HAZ_CMRBASEL'

data = annotated_data(s,release)
data = data.drop(['seriesDesc' , 'geoAreaName'] , axis='columns')

# Distinct time periods found in the data.
time_values = data[['timePeriod']].drop_duplicates()

time_values['key'] = 0

# Distinct combinations of the remaining columns, i.e. what is left once the
# area, time, value and listed code/description columns are removed.
data_key =  data.drop(['geoAreaCode', 'timePeriod', 'value', 'valueType', 'timeDetail', 'source', 'footnotes', 
                       'natureCode', 'unitsCode', 'reportingTypeCode','natureDesc', 'unitsDesc', 'reportingTypeDesc'] , axis='columns').drop_duplicates()

# Columns identifying one data point; computed before the helper 'key'
# column is added so that 'key' is not part of the join below.
data_dimensions = list(data_key.columns) + ['geoAreaCode','timePeriod']


data_dimensions 

data_key['key'] = 0

geo = geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')
geo['key'] = 0

# Cross-join (via the constant 'key') time periods x areas x combinations.
xx = pd.merge(pd.merge(time_values, geo, on='key', how = 'left'),
              data_key, on='key', how ='left')

# `axis` is passed by keyword: the positional form is rejected by pandas 2.x.
xx.drop('key', axis='columns', inplace=True)

# Attach the observed values; combinations without data keep missing values.
xx = pd.merge(xx, data, how='left', on=data_dimensions)

xx.columns


indicators = series_list(release)
indicators = indicators.loc[indicators['seriesCode']==s]

for i in indicators['indicatorCode']:
    print(i)
    # Keep only the catalogue row(s) of the current indicator, so that a
    # series shared by several indicators yields one table per indicator.
    y = pd.merge(indicators.loc[indicators['indicatorCode'] == i], xx, on='seriesCode')
    print(y.columns)
    print(y.shape)

    


In [None]:
#for s in list(set(series_list(release).seriesCode)):
# Inputs that do not depend on the series are fetched once, not on every
# iteration: the series catalogue (one API call) and the areas with coordinates.
series_catalogue = series_list(release)

geo = geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')
geo['key'] = 0

# Columns that do not identify a data point (measures, notes, and the listed
# code/description pairs).
value_columns = ['geoAreaCode', 'timePeriod', 'value', 'valueType', 'timeDetail', 'source', 'footnotes', 
                 'natureCode', 'unitsCode', 'reportingTypeCode','natureDesc', 'unitsDesc', 'reportingTypeDesc']

# Only the first 100 series codes are processed.  They are sorted so that the
# subset is the same on every run (a bare set has no stable order).
for s in sorted(set(series_catalogue.seriesCode))[0:100]:

    data = annotated_data(s,release)
    data = data.drop(['seriesDesc' , 'geoAreaName'] , axis='columns')

    # Distinct time periods found in the data.
    time_values = data[['timePeriod']].drop_duplicates()
    time_values['key'] = 0

    # Distinct combinations of the remaining columns.
    data_key = data.drop(value_columns, axis='columns').drop_duplicates()

    # Computed before the helper 'key' column is added to `data_key`.
    data_dimensions = list(data_key.columns) + ['geoAreaCode','timePeriod']

    data_key['key'] = 0

    # Cross-join (via the constant 'key') time periods x areas x combinations.
    xx = pd.merge(pd.merge(time_values, geo, on='key', how = 'left'),
                  data_key, on='key', how ='left')

    # `axis` is passed by keyword: the positional form is rejected by pandas 2.x.
    xx.drop('key', axis='columns', inplace=True)

    # Attach the observed values; combinations without data keep missing values.
    xx = pd.merge(xx, data, how='left', on=data_dimensions)

    indicators = series_catalogue.loc[series_catalogue['seriesCode']==s]

    for i in indicators['indicatorCode']:
        # Keep only the catalogue row(s) of the current indicator: each file
        # is named after one indicator and must not hold the rows of others.
        y = pd.merge(indicators.loc[indicators['indicatorCode'] == i], xx, on='seriesCode')
        y.to_excel(wd_dir + 'Data_' + i + '_' + s + '.xlsx', engine ='xlsxwriter', index=False)
        


In [None]:
# Display the raw records of series SL_ISV_IFRM for the current release.
s = 'SL_ISV_IFRM'
series_data(s, release)

In [None]:
# Display the full series catalogue of release 2019.Q1.G.02.
series_list('2019.Q1.G.02')

In [None]:
# Display `i`, the loop variable left over from the most recent `for i in ...` loop.
i