# Pre-processing script
This script pulls SDG data from the UN SDG API and transforms it into CSV files.
The steps are:
- Pull data from the [API](https://unstats.un.org/SDGAPI/swagger/)
- Join with geography
- Save as a "long" table
- Pivot into "wide" format and split regional and country data

### Load necessary libraries

In [1]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import numpy as np
import pandas as pd
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### User parameters

In [2]:
# Release code used to filter the series catalogue and to query series data.
release = '2019.Q1.G.02' # Make sure to have the correct release here

# NOTE: '__file__' is a quoted string literal here, not the module attribute,
# so realpath() resolves it against the current working directory and
# dir_path ends up being that directory.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Relative path prefix that is prepended to input file names.
wd_dir = r'../'
print('data inputs dir: ' + wd_dir)

/home/jovyan/FIS4SDGs/unsd/notebooks
data inputs dir: ../


## Utilities

#### Convert string to camelCase

In [3]:
def camelCase(st):
    """
    Convert a string to camelCase.

    The string is title-cased, every non-alphanumeric character is dropped,
    and the first remaining character is lower-cased. An input that contains
    no alphanumeric characters (including the empty string) yields ''.

    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library

    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Slicing instead of indexing keeps this safe when `output` is empty.
    return output[:1].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [4]:
# Silence urllib3's InsecureRequestWarning for the rest of the session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Create array to catch errors

In [5]:
# Module-level list meant to collect errors.
# NOTE(review): nothing in the visible part of the notebook appends to it.
error_log = []

### List of countries to be plotted on a map (with XY coordinates)

In [6]:
def countryListXY(file):
    """
    Load a tab-delimited, latin-1 encoded text file into a DataFrame.

    Each data row becomes one record keyed by the header row. Values are
    kept as read by ``csv.DictReader`` and the ``geoAreaCode`` column is
    explicitly cast to ``str``.
    """
    with open(file, newline='', encoding='latin-1') as handle:
        reader = csv.DictReader(handle, delimiter='\t')
        records = [dict(row) for row in reader]

    table = pd.DataFrame(records)

    return table.astype({'geoAreaCode': 'str'})

    #print(countryListXY[1])
    #for c in countryListXY:

In [7]:
# Preview the first 10 rows of the coordinates table.
countryListXY(wd_dir + 'CountryListXY.txt').head(10)

Unnamed: 0,CountryProfile,ISO3CD,UNMember,X,Y,geoAreaCode,geoAreaName
0,1,AFG,1,66.02688198,33.83160199,4,Afghanistan
1,1,ALB,1,20.06660928,41.13897007,8,Albania
2,1,ATA,0,21.47585697,-80.40897662,10,Antarctica
3,1,DZA,1,2.678164227,28.15940032,12,Algeria
4,1,ASM,0,-170.7187269,-14.30587306,16,American Samoa
5,1,AND,1,1.576257417,42.54548611,20,Andorra
6,1,AGO,1,17.57817062,-12.33724746,24,Angola
7,1,ATG,1,-61.7999755,17.07761471,28,Antigua and Barbuda
8,1,AZE,1,50.01064725,40.39229544,31,Azerbaijan
9,1,ARG,1,-65.14563274,-35.19446255,32,Argentina


#### Call the endpoint of the SDG API that provides the list of hierarchical groupings of geographic Areas:

In [8]:
def geoAreaTree():
    """
    Fetch the GeoArea/Tree endpoint of the SDG API.

    Returns the response body decoded as UTF-8 and parsed from JSON.
    """
    endpoint = "https://unstats.un.org/SDGAPI/v1/sdg/GeoArea/Tree"
    pool = urllib3.PoolManager()
    body = pool.request('GET', endpoint).data

    return json.loads(body.decode('UTF-8'))

Note: The geoAreaTree object has various "trees" in it.  We usually use the "World" tree; however, some economic and geographic groupings are only in other trees.

In [9]:
# Fetch the list of trees once and reuse it; calling geoAreaTree() for both
# the count and the loop would issue two identical HTTP requests.
geo_trees = geoAreaTree()
print(len(geo_trees))
for t in geo_trees:
    print('root='+t['geoAreaName'])

7
root=World
root=Least Developed Countries (LDC)
root=Land Locked Developing Countries (LLDC)
root=Small Island Developing States (SIDS)
root=Developed Regions
root=Developing Regions
root=Custom groupings of data providers


#### Traverse a hierarchical tree of geographic areas and convert it to a parent-child hierarchy table:

In [10]:
def flatten(tree):
    """
    Convert a nested geographic-area tree into a flat DataFrame.

    Resets the module-level ``hierarchy`` list and the ``traverse.level``
    counter, lets ``traverse`` walk the tree (it appends one record per
    node to ``hierarchy``), then returns those records as a DataFrame.
    """
    global hierarchy

    # Start from a clean slate: traverse() accumulates into these.
    hierarchy = list()
    traverse.level = 1

    traverse(tree)

    table = pd.DataFrame(hierarchy)
    return table

In [11]:
def traverse(tree, parentCode=None, parentName=None):
    """
    Recursively record `tree` and all its descendants in ``hierarchy``.

    One dict per node is appended to the module-level ``hierarchy`` list,
    holding the current depth (``traverse.level``), the node type, the
    parent's code and name, and the node's own code (as ``str``) and name.
    ``traverse.level`` is incremented around each recursive call and
    decremented afterwards. The caller is expected to have initialised
    both ``hierarchy`` and ``traverse.level``.

    Returns a DataFrame built from everything in ``hierarchy`` so far.
    """
    own_code = str(tree['geoAreaCode'])
    own_name = tree['geoAreaName']

    hierarchy.append({
        'level': traverse.level,
        'type': tree['type'],
        'parentCode': parentCode,
        'parentName': parentName,
        'geoAreaCode': own_code,
        'geoAreaName': own_name,
    })

    # A falsy 'children' value (None or an empty list) means a leaf node.
    for child in tree['children'] or []:
        traverse.level += 1
        traverse(child, own_code, own_name)
        traverse.level -= 1

    return pd.DataFrame(hierarchy)

#### `geoAreas` holds the flattened list of geographic areas under 'World':

In [12]:
# Flatten the first tree returned by the API into a parent-child table,
# then show its first and last rows.
geoAreas = flatten(geoAreaTree()[0])
geoAreas.head()
print('...')
geoAreas.tail()

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
0,1,World,1,,,Region
1,10,Antarctica,2,1.0,World,Country
2,2,Africa,2,1.0,World,Region
3,15,Northern Africa,3,2.0,Africa,Region
4,12,Algeria,4,15.0,Northern Africa,Country


...


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
291,543,Oceania (exc. Australia and New Zealand),3,9,Oceania,Region
292,62,Central and Southern Asia,2,1,World,Region
293,513,Europe and Northern America,2,1,World,Region
294,747,Northern Africa and Western Asia,2,1,World,Region
295,753,Eastern and South-Eastern Asia,2,1,World,Region


##### Temporary Fix for missing regions: 

In [13]:
# Groupings that are added by hand as top-level 'Group' rows when the
# flattened tree does not already contain them: (geoAreaCode, geoAreaName).
missing_groups = [
    ('910', 'High income economies (WB)'),
    ('911', 'Low income economies (WB)'),
    ('912', 'Lower middle economies (WB)'),
    ('913', 'Low and middle income economies (WB)'),
    ('914', 'Upper middle economies (WB)'),
    ('135', 'Caucasus and Central Asia'),
    ('127', 'Southern Asia (excluding India)'),
    ('199', 'Least Developed Countries (LDC)'),
    ('485', 'Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)'),
    ('514', 'Developed Regions'),
    ('515', 'Developing Regions'),
    ('518', 'Eastern Asia (excluding Japan)'),
    ('738', 'Sub-Saharan Africa (inc. Sudan)'),
    ('746', 'Northern Africa (exc. Sudan)'),
]

# `value in series` tests the index labels, not the column values, so the
# membership check must run against the actual codes.
existing_codes = set(geoAreas['geoAreaCode'])

new_rows = [
    {'level': 1,
     'type': 'Group',
     'parentCode': None,
     'parentName': None,
     'geoAreaCode': code,
     'geoAreaName': name}
    for code, name in missing_groups
    if code not in existing_codes
]

if new_rows:
    # pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
    geoAreas = pd.concat([geoAreas, pd.DataFrame(new_rows)], sort=True)

#==================================

geoAreas = geoAreas.reset_index(drop=True)

geoAreas.tail(20)


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
290,876,Wallis and Futuna Islands,4,61.0,Polynesia,Country
291,543,Oceania (exc. Australia and New Zealand),3,9.0,Oceania,Region
292,62,Central and Southern Asia,2,1.0,World,Region
293,513,Europe and Northern America,2,1.0,World,Region
294,747,Northern Africa and Western Asia,2,1.0,World,Region
295,753,Eastern and South-Eastern Asia,2,1.0,World,Region
296,910,High income economies (WB),1,,,Group
297,911,Low income economies (WB),1,,,Group
298,912,Lower middle economies (WB),1,,,Group
299,913,Low and middle income economies (WB),1,,,Group


#### Merge coordinates and list of geographic areas in SDG database

In [14]:
# Display the geoAreas table as a raw array of row values.
geoAreas.values

array([['1', 'World', 1, None, None, 'Region'],
       ['10', 'Antarctica', 2, '1', 'World', 'Country'],
       ['2', 'Africa', 2, '1', 'World', 'Region'],
       ...,
       ['518', 'Eastern Asia (excluding Japan)', 1, None, None, 'Group'],
       ['738', 'Sub-Saharan Africa (inc. Sudan)', 1, None, None, 'Group'],
       ['746', 'Northern Africa (exc. Sudan)', 1, None, None, 'Group']],
      dtype=object)

In [15]:
def geoareasXY(geoareas,coordinates_file):
    """
    Outer-join a geographic-area table with the coordinates file.

    The coordinates are read with ``countryListXY`` and joined on
    ``geoAreaCode``; the ``geoAreaName`` column of the coordinates table is
    left out of the join. The result is sorted by the numeric value of
    ``geoAreaCode`` and re-indexed from 0.
    """
    coords = countryListXY(coordinates_file)
    coords_without_name = coords.loc[:, coords.columns != 'geoAreaName']

    merged = pd.merge(geoareas, coords_without_name,
                      how='outer',
                      on=['geoAreaCode'])

    # geoAreaCode is text, so sort through a temporary numeric helper column.
    merged['order'] = merged['geoAreaCode'].astype(float)
    merged = merged.sort_values('order')
    del merged['order']

    return merged.reset_index(drop=True)

# x.to_excel('test.xlsx', engine ='xlsxwriter')

In [16]:
# Display the geographic areas joined with their coordinates.
geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,Y
0,1,World,1,,,Region,,,,,
1,2,Africa,2,1,World,Region,,,,,
2,4,Afghanistan,4,34,Southern Asia,Country,1,AFG,1,66.02688198,33.83160199
3,5,South America,4,419,Latin America and the Caribbean,Region,,,,,
4,8,Albania,4,39,Southern Europe,Country,1,ALB,1,20.06660928,41.13897007
5,9,Oceania,2,1,World,Region,,,,,
6,10,Antarctica,2,1,World,Country,1,ATA,0,21.47585697,-80.40897662
7,11,Western Africa,4,202,Sub-Saharan Africa,Region,,,,,
8,12,Algeria,4,15,Northern Africa,Country,1,DZA,1,2.678164227,28.15940032
9,13,Central America,4,419,Latin America and the Caribbean,Region,,,,,


### Get the list of goals, targets, indicators and series

In [17]:
def series_list(release):
    """
    Build the catalogue of series that belong to `release`.

    Calls the Goal/List endpoint of the SDG API (with children included)
    and walks goals -> targets -> indicators -> series, keeping one row per
    series whose ``release`` equals the requested one. Each row carries the
    codes and descriptions of its goal, target, indicator and series, plus
    the indicator tier and the series release.

    Returns a DataFrame with the columns listed in ``columns`` below, in
    that order.
    """
    # Call the endpoint of the SDG API that provides the list of goals with all their children:
    http = urllib3.PoolManager()
    response = http.request('GET', "https://unstats.un.org/SDGAPI/v1/sdg/Goal/List?includechildren=true")
    goals = json.loads(response.data.decode('UTF-8'))

    # Output columns, in the order the values are collected for each series:
    columns = ['goalCode', 'goalDesc',
               'targetCode', 'targetDesc',
               'indicatorCode', 'indicatorDesc', 'indicatorTier',
               'seriesCode', 'seriesDesc', 'seriesRelease']

    rows = []
    for goal in goals:
        for target in goal['targets']:
            for indicator in target['indicators']:
                for series in indicator['series']:
                    if series['release'] != release:
                        continue
                    values = (goal['code'], goal['title'],
                              target['code'], target['description'],
                              indicator['code'], indicator['description'], indicator['tier'],
                              series['code'], series['description'], series['release'])
                    rows.append(dict(zip(columns, values)))

    catalogue = pd.DataFrame(rows)

    return catalogue[columns]

In [18]:
# Preview the first 4 rows of the series catalogue for the chosen release.
series_list(release).head(4)

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02
1,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_EMP1,Employed population below international povert...,2019.Q1.G.02
2,1,End poverty in all its forms everywhere,1.2,"By 2030, reduce at least by half the proportio...",1.2.1,Proportion of population living below the nati...,1,SI_POV_NAHC,Proportion of population living below the nati...,2019.Q1.G.02
3,1,End poverty in all its forms everywhere,1.3,Implement nationally appropriate social protec...,1.3.1,Proportion of population covered by social pro...,1,SI_COV_MATNL,[ILO] Proportion of mothers with newborns rece...,2019.Q1.G.02


#### Cross-Join geographic area catalogue (with coordinates) and series catalogue

In [19]:
def cross_geo_series(geoAreas, countryXY, seriesCode, release):
    """
    Cross-join the geographic areas (with coordinates) and one series.

    Parameters:
        geoAreas: table of geographic areas, passed on to ``geoareasXY``.
        countryXY: path of the coordinates file, passed on to ``geoareasXY``.
        seriesCode: code of the series to keep from the series catalogue.
        release: release code, passed on to ``series_list``.

    Returns a DataFrame holding every geographic-area row combined with
    every catalogue row whose ``seriesCode`` matches.
    """
    areas = geoareasXY(geoAreas, countryXY)
    catalogue = series_list(release)

    # assign() returns new frames, so the constant join key is never written
    # into a .loc slice of the catalogue (which triggers SettingWithCopyWarning).
    areas = areas.assign(key=1)
    selected = catalogue.loc[catalogue['seriesCode'] == seriesCode].assign(key=1)

    # Joining on a constant key pairs every row with every row.
    crossed = pd.merge(areas, selected, on='key')

    return crossed.drop(['key'], axis=1)

In [20]:
# Example: cross-join all geographic areas with series SE_ACC_COMP and show
# the first and last rows of the result.
x = cross_geo_series(geoAreas, wd_dir + 'CountryListXY.txt','SE_ACC_COMP','2019.Q1.G.02')
x.head()
print('...')
x.tail()

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,...,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,World,1,,,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
1,2,Africa,2,1.0,World,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
2,4,Afghanistan,4,34.0,Southern Asia,Country,1.0,AFG,1.0,66.02688198,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
3,5,South America,4,419.0,Latin America and the Caribbean,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
4,8,Albania,4,39.0,Southern Europe,Country,1.0,ALB,1.0,20.06660928,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02


...


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,...,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
305,910,High income economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
306,911,Low income economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
307,912,Lower middle economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
308,913,Low and middle income economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
309,914,Upper middle economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02


### Get the data for each series

#### Verify how many pages need to be requested to get all the data for a specific series from the SDG API. 

In [21]:
def series_request_details(seriesCode,release):
    """
    Work out how many pages are needed to download a whole series.

    A small request (pageSize=2) is sent to the Series/Data endpoint only
    to read ``totalElements``; the page count is then computed for a fixed
    page size of 500.

    Returns a dict with keys ``series``, ``totalElements``, ``nPages`` and
    ``pageSize``. ``nPages`` is 0 when the series has no elements.
    """
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"

    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))

    pageSize = 500
    totalElements = responseData['totalElements']
    # Ceiling division: floor(n / size) + 1 asks for one page too many
    # whenever n is an exact multiple of the page size (including n == 0).
    nPages = math.ceil(totalElements / pageSize)

    return {'series' : seriesCode,
            'totalElements' : totalElements,
            'nPages' : nPages,
            'pageSize' : pageSize
           }


In [22]:
# Example: paging details for series SE_ACC_COMP.
series_request_details('SE_ACC_COMP', '2019.Q1.G.02')

{'series': 'SE_ACC_COMP', 'totalElements': 1414, 'nPages': 3, 'pageSize': 500}

#### Explore the code lists of the attributes and dimensions of a series
Describe each attribute or dimension as a simple dictionary made of a set of `code`-`description` pairs.  For the code, use the SDMX code, and not the internal code of the database.  Keep all labels in camelCase.

In [23]:
def series_code_lists(seriesCode, release):
    """
    Return the code lists of a series as nested dictionaries.

    A small request (pageSize=2) is sent to the Series/Data endpoint and
    its ``attributes`` and ``dimensions`` sections are read. The result
    maps ``'seriesCode'`` to the requested code and, for every attribute
    and dimension, the camelCased ``id`` to a ``{code: description}`` dict.
    """
    url = ''.join(['https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=',
                   seriesCode, '&releaseCode=', release, "&pageSize=2"])

    pool = urllib3.PoolManager()
    payload = json.loads(pool.request('GET', url).data.decode('UTF-8'))

    # Attributes first, then dimensions, so a dimension overrides an
    # attribute that happens to share the same camelCased id.
    concepts = payload['attributes'] + payload['dimensions']

    code_lists = {'seriesCode': seriesCode}
    for concept in concepts:
        code_lists[camelCase(concept['id'])] = {
            entry['code']: entry['description'] for entry in concept['codes']
        }

    return code_lists


In [24]:
# Example: code lists of series SE_ACC_COMP as nested dictionaries.
series_code_lists('SE_ACC_COMP', '2019.Q1.G.02')

{'seriesCode': 'SE_ACC_COMP',
 'nature': {'C': 'Country data',
  'CA': 'Country adjusted data',
  'E': 'Estimated data',
  'G': 'Global monitoring data',
  'M': 'Modeled data',
  'N': 'Non-relevant',
  'NA': 'Data nature not available'},
 'units': {'PERCENT': 'Percentage'},
 'educationLevel': {'GRAD23': 'Grades 2/3',
  'LOWSEC': 'Lower secondary',
  'PRIMAR': 'Primary',
  'PREPRI': 'Pre-primary',
  'SECOND': 'Secondary',
  'UPPSEC': 'Upper secondary'},
 'reportingType': {'N': 'National', 'G': 'Global'}}

#### Simplify further by presenting all the codes and their descriptions in a single table:

In [25]:
def series_code_lists2(seriesCode, release):
    """
    Return the code lists of a series as a single flat table.

    A small request (pageSize=2) is sent to the Series/Data endpoint. Every
    code of every attribute and dimension becomes one row with the keys
    ``series``, ``role`` ('attribute' or 'dimension'), ``concept`` (the
    camelCased ``id``), ``code`` (the ``sdmx`` field) and ``description``.
    """
    url = ''.join(['https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=',
                   seriesCode, '&releaseCode=', release, "&pageSize=2"])

    pool = urllib3.PoolManager()
    payload = json.loads(pool.request('GET', url).data.decode('UTF-8'))

    sections = (('attribute', payload['attributes']),
                ('dimension', payload['dimensions']))

    rows = []
    for role, concepts in sections:
        for concept in concepts:
            for entry in concept['codes']:
                rows.append({
                    'series': seriesCode,
                    'role': role,
                    'concept': camelCase(concept['id']),
                    'code': entry['sdmx'],
                    'description': entry['description'],
                })

    return pd.DataFrame(rows)


In [26]:
# Example: code lists of series SE_ACC_COMP as one flat table.
series_code_lists2('SE_ACC_COMP', '2019.Q1.G.02')

Unnamed: 0,code,concept,description,role,series
0,C,nature,Country data,attribute,SE_ACC_COMP
1,CA,nature,Country adjusted data,attribute,SE_ACC_COMP
2,E,nature,Estimated data,attribute,SE_ACC_COMP
3,G,nature,Global monitoring data,attribute,SE_ACC_COMP
4,M,nature,Modeled data,attribute,SE_ACC_COMP
5,N,nature,Non-relevant,attribute,SE_ACC_COMP
6,,nature,Data nature not available,attribute,SE_ACC_COMP
7,PERCENT,units,Percentage,attribute,SE_ACC_COMP
8,GRAD23,educationLevel,Grades 2/3,dimension,SE_ACC_COMP
9,LOWSEC,educationLevel,Lower secondary,dimension,SE_ACC_COMP


#### Build query string to collect data for a specific series from the global SDG API

In [27]:
def series_query(seriesCode, release, page, pageSize):
    """
    Request one page of data for a series from the Series/Data endpoint.

    `page` and `pageSize` are converted with ``str`` before being placed in
    the query string. Returns the response body decoded as UTF-8 and parsed
    from JSON.
    """
    url = ''.join([
        r'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=', seriesCode,
        '&releaseCode=', release,
        '&page=', str(page),
        '&pageSize=', str(pageSize),
    ])
    pool = urllib3.PoolManager()
    body = pool.request('GET', url).data
    return json.loads(body.decode('UTF-8'))

#### Get data for a specific series from the API
*(!) Notice that a data point may appear more than once if it belongs to a "multi-purpose indicator"*

In [28]:
def series_data(seriesCode, release):
    """
    Download every data record of a series, page by page.

    The page count and page size come from ``series_request_details``.
    Pages are requested starting at 1, a progress line is printed for each,
    and the ``data`` entries of all non-empty pages are collected in order.
    Returns an empty list when the series reports no elements.
    """
    details = series_request_details(seriesCode, release)
    records = []

    if not details['totalElements'] > 0:
        return records

    total_pages = details['nPages']
    for page in range(1, total_pages + 1):
        print("---Series " + seriesCode + ": Processing page " + str(page) + " of " + str(total_pages))
        chunk = series_query(seriesCode, release, page, details['pageSize'])['data']
        if len(chunk) > 0:
            records.extend(chunk)

    return records

In [29]:
# Example: first raw record of series SE_ACC_COMP.
series_data('SE_ACC_COMP','2019.Q1.G.02')[0]

---Series SE_ACC_COMP: Processing page 1 of 3
---Series SE_ACC_COMP: Processing page 2 of 3
---Series SE_ACC_COMP: Processing page 3 of 3


{'goal': ['4'],
 'target': ['4.a'],
 'indicator': ['4.a.1'],
 'series': 'SE_ACC_COMP',
 'seriesDescription': 'Schools with access to\xa0computers for pedagogical purposes, by education level (%)',
 'seriesCount': '1414',
 'geoAreaCode': '8',
 'geoAreaName': 'Albania',
 'timePeriodStart': 2017.0,
 'value': '83.13492',
 'valueType': 'Float',
 'time_detail': None,
 'source': 'UIS',
 'footnotes': [''],
 'attributes': {'Nature': 'C', 'Units': 'PERCENT'},
 'dimensions': {'Education level': 'UPPSEC', 'Reporting Type': 'G'}}

#### Flatten the dictionary, extracting individual attributes and dimensions as key-value pairs in their own right.
Also convert the years (`timePeriod`) variable to `int`

In [30]:
def flat_series_data(seriesCode,release):
    """
    Return the records of a series as flat dictionaries.

    Each record from ``series_data`` is rewritten as follows:
    - list values are joined into one string with ', ';
    - dict values are expanded, each inner key becoming camelCase(key + ' Code');
    - 'time_detail' is renamed through camelCase;
    - 'timePeriodStart' becomes 'timePeriod' converted with int();
    - 'series' / 'seriesDescription' become 'seriesCode' / 'seriesDesc';
    - 'geoAreaCode' is converted with str();
    - every other key is copied unchanged.
    """
    # Plain renames that leave the value untouched.
    renamed = {'series': 'seriesCode', 'seriesDescription': 'seriesDesc'}

    flat_records = []
    for record in series_data(seriesCode, release):
        flat = {}
        for key, value in record.items():
            # The value's container type takes precedence over the key name.
            if type(value) is list:
                flat[key] = ', '.join(value)
                continue
            if type(value) is dict:
                flat.update({camelCase(k + ' Code'): v for k, v in value.items()})
                continue

            if key == 'timePeriodStart':
                flat['timePeriod'] = int(value)
            elif key == 'geoAreaCode':
                flat['geoAreaCode'] = str(value)
            elif key == 'time_detail':
                flat[camelCase(key)] = value
            else:
                flat[renamed.get(key, key)] = value
        flat_records.append(flat)

    return flat_records

In [31]:
# Example: first flattened record of series SE_ACC_COMP.
flat_series_data('SE_ACC_COMP','2019.Q1.G.02')[0]

---Series SE_ACC_COMP: Processing page 1 of 3
---Series SE_ACC_COMP: Processing page 2 of 3
---Series SE_ACC_COMP: Processing page 3 of 3


{'goal': '4',
 'target': '4.a',
 'indicator': '4.a.1',
 'seriesCode': 'SE_ACC_COMP',
 'seriesDesc': 'Schools with access to\xa0computers for pedagogical purposes, by education level (%)',
 'seriesCount': '1414',
 'geoAreaCode': '8',
 'geoAreaName': 'Albania',
 'timePeriod': 2017,
 'value': '83.13492',
 'valueType': 'Float',
 'timeDetail': None,
 'source': 'UIS',
 'footnotes': '',
 'natureCode': 'C',
 'unitsCode': 'PERCENT',
 'educationLevelCode': 'UPPSEC',
 'reportingTypeCode': 'G'}

#### Convert flattened dictionary of data series to pandas data frame, remove goal/target/indicator columns, and remove duplicate values

In [32]:
def dataframe_unique_series_data(seriesCode,release):
    """
    Return the flattened records of a series as a de-duplicated DataFrame.

    The columns are taken from the keys of the first record. The 'goal',
    'target', 'indicator' and 'seriesCount' columns are removed before
    duplicate rows are dropped.
    """
    records = flat_series_data(seriesCode, release)

    # Column layout is dictated by the first record only.
    frame = pd.DataFrame(records, columns=records[0].keys())
    frame = frame.drop(['goal', 'target', 'indicator', 'seriesCount'], axis=1)

    return frame.drop_duplicates()

In [33]:
# Example: de-duplicated table for series SE_ACC_COMP, also written to
# 'test2.xlsx' in the current working directory.
x = dataframe_unique_series_data('SE_ACC_COMP','2019.Q1.G.02')
x.to_excel('test2.xlsx', engine ='xlsxwriter')
x.head()

---Series SE_ACC_COMP: Processing page 1 of 3
---Series SE_ACC_COMP: Processing page 2 of 3
---Series SE_ACC_COMP: Processing page 3 of 3


Unnamed: 0,seriesCode,seriesDesc,geoAreaCode,geoAreaName,timePeriod,value,valueType,timeDetail,source,footnotes,natureCode,unitsCode,educationLevelCode,reportingTypeCode
0,SE_ACC_COMP,Schools with access to computers for pedagogic...,8,Albania,2017,83.13492,Float,,UIS,,C,PERCENT,UPPSEC,G
1,SE_ACC_COMP,Schools with access to computers for pedagogic...,20,Andorra,2016,100.0,Float,,UIS,,C,PERCENT,PRIMAR,G
2,SE_ACC_COMP,Schools with access to computers for pedagogic...,20,Andorra,2016,100.0,Float,,UIS,,C,PERCENT,LOWSEC,G
3,SE_ACC_COMP,Schools with access to computers for pedagogic...,20,Andorra,2016,100.0,Float,,UIS,,C,PERCENT,UPPSEC,G
4,SE_ACC_COMP,Schools with access to computers for pedagogic...,20,Andorra,2017,100.0,Float,,UIS,,C,PERCENT,UPPSEC,G


#### Add descriptions to coded dimension and attributes

In [34]:
def annotated_data(seriesCode, release):
    """
    Return the unique records of a data series with a description column
    added for each coded column.

    For every concept returned by `series_code_lists(seriesCode, release)`
    other than 'seriesCode', the code/description pairs are turned into a
    two-column lookup table ('<concept>Code', '<concept>Desc') that is merged
    onto the data on '<concept>Code'.
    """
    lookups = series_code_lists(seriesCode, release)
    annotated = dataframe_unique_series_data(seriesCode, release)

    for concept, codes in lookups.items():
        if concept == 'seriesCode':
            continue

        code_column = concept + 'Code'
        lookup = pd.DataFrame(list(codes.items()))
        lookup.columns = [code_column, concept + 'Desc']

        # Default (inner) merge: rows whose code does not appear in the
        # lookup table are not kept.
        annotated = annotated.merge(lookup, on=code_column).reset_index(drop=True)

    return annotated

In [35]:
# Example: annotate one series, then inspect the resulting columns, the
# (rows, columns) shape and the first rows.
x = annotated_data('SL_ISV_IFRM','2019.Q1.G.02')
print(x.columns)
print(x.shape)
x.head()

---Series SL_ISV_IFRM: Processing page 1 of 2
---Series SL_ISV_IFRM: Processing page 2 of 2
Index(['seriesCode', 'seriesDesc', 'geoAreaCode', 'geoAreaName', 'timePeriod',
       'value', 'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'sexCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc', 'sexDesc'],
      dtype='object')
(783, 18)


Unnamed: 0,seriesCode,seriesDesc,geoAreaCode,geoAreaName,timePeriod,value,valueType,timeDetail,source,footnotes,natureCode,unitsCode,sexCode,reportingTypeCode,natureDesc,unitsDesc,reportingTypeDesc,sexDesc
0,SL_ISV_IFRM,Proportion of informal employment in non-agric...,8,Albania,2011,43.2,Float,,ILOSTAT - LFS - Labour Force Survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,MALE,G,Country data,Percentage,Global,Male
1,SL_ISV_IFRM,Proportion of informal employment in non-agric...,8,Albania,2012,43.8,Float,,ILOSTAT - LFS - Labour Force Survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,MALE,G,Country data,Percentage,Global,Male
2,SL_ISV_IFRM,Proportion of informal employment in non-agric...,8,Albania,2013,38.6,Float,,ILOSTAT - LFS - Labour Force Survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,MALE,G,Country data,Percentage,Global,Male
3,SL_ISV_IFRM,Proportion of informal employment in non-agric...,24,Angola,2004,59.8,Float,,ILOSTAT - HIES - Core Welfare Indicators Quest...,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,MALE,G,Country data,Percentage,Global,Male
4,SL_ISV_IFRM,Proportion of informal employment in non-agric...,24,Angola,2011,59.3,Float,,ILOSTAT - HIES - Core Welfare Indicators Quest...,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,MALE,G,Country data,Percentage,Global,Male


#### Produce 'long' files for each indicator/series combination
(Note that multi-purpose indicators need to be split)

In [36]:
# Series used for this example run.
s = 'SL_ISV_IFRM'
# NOTE: this re-assigns the `release` user parameter defined in the
# "User parameters" cell at the top of the notebook.
release = '2019.Q1.G.02'

print(s)
print(release)

# Annotated table for the series: show its columns, its shape and its
# last ten rows.
x = annotated_data(s,release)
print(x.columns)
print(x.shape)
x.tail(10)



SL_ISV_IFRM
2019.Q1.G.02
---Series SL_ISV_IFRM: Processing page 1 of 2
---Series SL_ISV_IFRM: Processing page 2 of 2
Index(['seriesCode', 'seriesDesc', 'geoAreaCode', 'geoAreaName', 'timePeriod',
       'value', 'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'sexCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc', 'sexDesc'],
      dtype='object')
(783, 18)


Unnamed: 0,seriesCode,seriesDesc,geoAreaCode,geoAreaName,timePeriod,value,valueType,timeDetail,source,footnotes,natureCode,unitsCode,sexCode,reportingTypeCode,natureDesc,unitsDesc,reportingTypeDesc,sexDesc
773,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2009,65.9,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
774,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2011,96.8,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
775,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2012,93.4,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
776,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2013,62.2,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
777,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2014,60.6,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
778,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2015,57.1,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
779,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2016,56.3,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
780,SL_ISV_IFRM,Proportion of informal employment in non-agric...,704,Viet Nam,2017,56.1,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
781,SL_ISV_IFRM,Proportion of informal employment in non-agric...,887,Yemen,2014,68.4,Float,,ILOSTAT - LFS - Labour force survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes
782,SL_ISV_IFRM,Proportion of informal employment in non-agric...,716,Zimbabwe,2011,62.5,Float,,ILOSTAT - LFS - Labour Force Survey,Repository: ILO-STATISTICS - Micro data proces...,C,PERCENT,BOTHSEX,G,Country data,Percentage,Global,Both sexes


In [37]:
# Show two of the distinct series codes of the release. The codes pass
# through a set, so which two are returned is not a guaranteed order.
list(set(series_list(release).seriesCode))[0:2]

['DC_TOF_HLTHL', 'SG_NHR_INTEXST']

In [52]:
# Worked example for a single series: build the complete grid of
# (time period x geographic area x remaining series columns), attach the
# observed values to it, and preview the table obtained for each indicator
# the series is listed under.
s = 'SG_HAZ_CMRBASEL'

data = annotated_data(s,release)
# 'seriesDesc' and 'geoAreaName' are removed from the data; the merged tables
# below get these columns from `indicators` and `geo` instead.
data = data.drop(['seriesDesc' , 'geoAreaName'] , axis='columns')

# Distinct time periods. The constant 'key' column added to each of the three
# tables turns the merges on 'key' below into a cross join.
time_values = data[['timePeriod']].drop_duplicates().assign(key=0)

# Distinct combinations of the columns that remain once the geography, time,
# value and attribute columns listed here are removed.
data_key =  data.drop(['geoAreaCode', 'timePeriod', 'value', 'valueType', 'timeDetail', 'source', 'footnotes', 
                       'natureCode', 'unitsCode', 'reportingTypeCode','natureDesc', 'unitsDesc', 'reportingTypeDesc'] , axis='columns').drop_duplicates()

# Columns used to join the observed values back onto the grid. Computed
# before 'key' is added so that the helper column is not part of the list.
data_dimensions = list(data_key.columns) + ['geoAreaCode','timePeriod']


data_dimensions 

data_key['key'] = 0

geo = geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')
geo['key'] = 0

xx = pd.merge(pd.merge(time_values, geo, on='key', how = 'left'),
              data_key, on='key', how ='left')

# Keyword form of drop: passing `axis` positionally (drop('key', 1)) was
# deprecated in pandas and is rejected by current releases.
xx = xx.drop(columns='key')

# Left join: grid cells without an observation are kept, with missing values.
xx = pd.merge(xx, data, how='left', on=data_dimensions)

xx.columns


indicators = series_list(release)
indicators = indicators.loc[indicators['seriesCode']==s]

for i in indicators['indicatorCode']:
    print(i)
    indicators.columns
    xx.columns
    # Join only the catalogue row(s) of the current indicator, so that a
    # series listed under several indicators yields one table per indicator
    # rather than the same combined table on every pass.
    y = pd.merge(indicators.loc[indicators['indicatorCode'] == i], xx, on='seriesCode')
    y.head() 
    print(y.columns)
    print(y.shape)

    


---Series SG_HAZ_CMRBASEL: Processing page 1 of 1


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

12.4.1


Index(['goalCode', 'goalDesc', 'targetCode', 'targetDesc', 'indicatorCode',
       'indicatorDesc', 'indicatorTier', 'seriesCode', 'seriesDesc',
       'seriesRelease'],
      dtype='object')

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease,...,valueType,timeDetail,source,footnotes,natureCode,unitsCode,reportingTypeCode,natureDesc,unitsDesc,reportingTypeDesc
0,12,Ensure sustainable consumption and production ...,12.4,"By 2020, achieve the environmentally sound man...",12.4.1,Number of parties to international multilatera...,1,SG_HAZ_CMRBASEL,Compliance with the Basel Convention on hazard...,2019.Q1.G.02,...,Float,,Environment Live,,C,PERCENT,G,Country data,Percentage,Global
1,12,Ensure sustainable consumption and production ...,12.4,"By 2020, achieve the environmentally sound man...",12.4.1,Number of parties to international multilatera...,1,SG_HAZ_CMRBASEL,Compliance with the Basel Convention on hazard...,2019.Q1.G.02,...,Float,,Environment Live,,C,PERCENT,G,Country data,Percentage,Global
2,12,Ensure sustainable consumption and production ...,12.4,"By 2020, achieve the environmentally sound man...",12.4.1,Number of parties to international multilatera...,1,SG_HAZ_CMRBASEL,Compliance with the Basel Convention on hazard...,2019.Q1.G.02,...,Float,,Environment Live,,,PERCENT,G,Data nature not available,Percentage,Global
3,12,Ensure sustainable consumption and production ...,12.4,"By 2020, achieve the environmentally sound man...",12.4.1,Number of parties to international multilatera...,1,SG_HAZ_CMRBASEL,Compliance with the Basel Convention on hazard...,2019.Q1.G.02,...,Float,,Environment Live,,C,PERCENT,G,Country data,Percentage,Global
4,12,Ensure sustainable consumption and production ...,12.4,"By 2020, achieve the environmentally sound man...",12.4.1,Number of parties to international multilatera...,1,SG_HAZ_CMRBASEL,Compliance with the Basel Convention on hazard...,2019.Q1.G.02,...,Float,,Environment Live,,,PERCENT,G,Data nature not available,Percentage,Global


Index(['goalCode', 'goalDesc', 'targetCode', 'targetDesc', 'indicatorCode',
       'indicatorDesc', 'indicatorTier', 'seriesCode', 'seriesDesc',
       'seriesRelease', 'timePeriod', 'geoAreaCode', 'geoAreaName', 'level',
       'parentCode', 'parentName', 'type', 'CountryProfile', 'ISO3CD',
       'UNMember', 'X', 'Y', 'value', 'valueType', 'timeDetail', 'source',
       'footnotes', 'natureCode', 'unitsCode', 'reportingTypeCode',
       'natureDesc', 'unitsDesc', 'reportingTypeDesc'],
      dtype='object')
(310, 33)


In [54]:
# Export one "long" Excel file per indicator/series combination.
#
# NOTE: the [0:2] slice below limits the run to two series (taken from a set,
# so which two is not a guaranteed order). Remove the slice to process every
# series of the release.

# The series catalogue and the geographic reference table do not depend on
# the series being processed, so they are built once, outside the loop.
all_series = series_list(release)

geo = geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')
geo['key'] = 0

for s in list(set(all_series.seriesCode))[0:2]:

    data = annotated_data(s,release)
    # 'seriesDesc' and 'geoAreaName' are removed from the data; the merged
    # tables below get these columns from `indicators` and `geo` instead.
    data = data.drop(['seriesDesc' , 'geoAreaName'] , axis='columns')

    # Distinct time periods. The constant 'key' column added to each of the
    # three tables turns the merges on 'key' below into a cross join.
    time_values = data[['timePeriod']].drop_duplicates().assign(key=0)

    # Distinct combinations of the columns that remain once the geography,
    # time, value and attribute columns listed here are removed.
    data_key =  data.drop(['geoAreaCode', 'timePeriod', 'value', 'valueType', 'timeDetail', 'source', 'footnotes', 
                           'natureCode', 'unitsCode', 'reportingTypeCode','natureDesc', 'unitsDesc', 'reportingTypeDesc'] , axis='columns').drop_duplicates()

    # Columns used to join the observed values back onto the grid. Computed
    # before 'key' is added so that the helper column is not part of the list.
    data_dimensions = list(data_key.columns) + ['geoAreaCode','timePeriod']

    # Bare expressions are kept on purpose: the notebook sets
    # ast_node_interactivity = "all", so they are shown in the cell output.
    data_dimensions 

    data_key['key'] = 0

    xx = pd.merge(pd.merge(time_values, geo, on='key', how = 'left'),
                  data_key, on='key', how ='left')

    # Keyword form of drop: passing `axis` positionally (drop('key', 1)) was
    # deprecated in pandas and is rejected by current releases.
    xx = xx.drop(columns='key')

    # Left join: grid cells without an observation are kept, with missing
    # values.
    xx = pd.merge(xx, data, how='left', on=data_dimensions)

    xx.columns

    indicators = all_series.loc[all_series['seriesCode']==s]

    for i in indicators['indicatorCode']:
        # Join only the catalogue row(s) of the current indicator, so that
        # the file named after indicator `i` holds that indicator's rows and
        # a series listed under several indicators is split across files.
        y = pd.merge(indicators.loc[indicators['indicatorCode'] == i], xx, on='seriesCode')
        y.to_excel(wd_dir + 'Data_' + i + '_' + s + '.xlsx', engine ='xlsxwriter')
        


---Series DC_TOF_HLTHL: Processing page 1 of 6
---Series DC_TOF_HLTHL: Processing page 2 of 6
---Series DC_TOF_HLTHL: Processing page 3 of 6
---Series DC_TOF_HLTHL: Processing page 4 of 6
---Series DC_TOF_HLTHL: Processing page 5 of 6
---Series DC_TOF_HLTHL: Processing page 6 of 6


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series SG_NHR_INTEXST: Processing page 1 of 1


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

In [None]:
# Scratch cell: call `series_data` for one example series and the current
# `release`; the returned value is shown as the cell output.
s = 'SL_ISV_IFRM'
series_data(s, release)

In [None]:
# Scratch cell: show the result of `series_list` for a hard-coded release
# identifier (not the `release` variable).
series_list('2019.Q1.G.02')

In [None]:
# Scratch cell: show the current value of `i` -- presumably the indicator
# code left over from the last `for i in indicators['indicatorCode']` loop.
i