# Pre-processing script
This script pulls SDG data from the UN SDG API and transforms it into CSV files.
The steps are:
- Pull data from [API](https://unstats.un.org/SDGAPI/swagger/) 
- Join with geography
- Save as a "long" table
- Pivot into "wide" format and split regional and country data

### Load necessary libraries

In [1]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import numpy as np
import pandas as pd
import xlsxwriter

# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### User parameters

In [2]:
release = '2019.Q1.G.02' # Make sure to have the correct release here

# '__file__' is passed as a literal string (not the __file__ variable), so
# realpath resolves it against the current working directory and dirname
# returns that directory.
dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

# Relative path (parent of the working directory) that is prefixed to the
# input file names read further below.
wd_dir = r'../'
print('data inputs dir: ' + wd_dir)

C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\unsd\notebooks
data inputs dir: ../


## Utilities

#### Convert string to camelCase

In [3]:
def camelCase(st):
    """
    Convert a string to camelCase.

    The words of `st` are title-cased, every character that is not
    alphanumeric is dropped, and the first remaining character is
    lower-cased.  An empty string is returned when no alphanumeric
    character is left (instead of raising `IndexError`).

    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    # Slice instead of index so that an empty result does not raise.
    return output[:1].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [4]:
# Silence urllib3's InsecureRequestWarning for the rest of the session.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Create array to catch errors

In [5]:
# Module-level list meant to collect errors; it starts out empty.
error_log = []

### List of countries to be plotted on a map (with XY coordinates)

In [6]:
def countryListXY(file):
    """
    Load the tab-delimited list of countries (with XY coordinates).

    The file is opened as latin-1 text and parsed with `csv.DictReader`, so
    values are kept as read from the file.  The rows are returned as a data
    frame whose `geoAreaCode` column is explicitly cast to `str`.
    """
    with open(file, newline='', encoding='latin-1') as handle:
        records = [dict(entry) for entry in csv.DictReader(handle, delimiter='\t')]

    table = pd.DataFrame(records)
    return table.astype({'geoAreaCode': 'str'})

    #print(countryListXY[1])
    #for c in countryListXY:

In [7]:
# Preview the first ten rows of the coordinates table.
countryListXY(wd_dir + 'CountryListXY.txt').head(10)

Unnamed: 0,CountryProfile,ISO3CD,UNMember,X,Y,geoAreaCode,geoAreaName
0,1,AFG,1,66.02688198,33.83160199,4,Afghanistan
1,1,ALB,1,20.06660928,41.13897007,8,Albania
2,1,ATA,0,21.47585697,-80.40897662,10,Antarctica
3,1,DZA,1,2.678164227,28.15940032,12,Algeria
4,1,ASM,0,-170.7187269,-14.30587306,16,American Samoa
5,1,AND,1,1.576257417,42.54548611,20,Andorra
6,1,AGO,1,17.57817062,-12.33724746,24,Angola
7,1,ATG,1,-61.7999755,17.07761471,28,Antigua and Barbuda
8,1,AZE,1,50.01064725,40.39229544,31,Azerbaijan
9,1,ARG,1,-65.14563274,-35.19446255,32,Argentina


#### Call the endpoint of the SDG API that provides the list of hierarchical groupings of geographic Areas:

In [8]:
def geoAreaTree():
    """
    Request the hierarchical groupings of geographic areas from the SDG API.

    Returns the decoded JSON payload of the `GeoArea/Tree` endpoint.
    """
    endpoint = "https://unstats.un.org/SDGAPI/v1/sdg/GeoArea/Tree"
    raw = urllib3.PoolManager().request('GET', endpoint).data
    return json.loads(raw.decode('UTF-8'))

Note: The geoAreaTree object has various "trees" in it.  We usually use the "World" tree; however, some economic and geographic groupings are only in other trees.

In [9]:
# Fetch the trees once and reuse the result, so that listing the roots does
# not send a second request to the API.
trees = geoAreaTree()
print(len(trees))
for t in trees:
    print('root='+t['geoAreaName'])

7
root=World
root=Least Developed Countries (LDC)
root=Land Locked Developing Countries (LLDC)
root=Small Island Developing States (SIDS)
root=Developed Regions
root=Developing Regions
root=Custom groupings of data providers


#### Traverse a hierarchical tree of geographic areas and convert it to a parent-child hierarchy table:

In [10]:
def flatten(tree):
    """
    Flatten a tree of geographic areas into a parent-child table.

    Resets the module-level `hierarchy` list and the `traverse.level`
    counter, walks `tree` with `traverse`, and returns the collected rows
    as a data frame.
    """
    global hierarchy

    # `traverse` appends to `hierarchy` and reads `traverse.level`, so both
    # are re-initialised before every walk.
    hierarchy = list()
    setattr(traverse, 'level', 1)

    traverse(tree)

    rows = pd.DataFrame(hierarchy)
    return rows

In [11]:
def traverse(tree, parentCode=None, parentName=None):
    """
    Append `tree` and all of its descendants to the module-level `hierarchy` list.

    Each node becomes a dictionary with the keys `level`, `type`,
    `parentCode`, `parentName`, `geoAreaCode` and `geoAreaName`; codes are
    stored as strings.  The level of `tree` itself is taken from
    `traverse.level` (1 when that attribute has not been set), and every
    generation of children is one level deeper.

    Parameters:
        tree: dictionary describing a node; `children` may be missing, empty
            or None for leaf nodes.
        parentCode: code recorded as the parent of `tree`.
        parentName: name recorded as the parent of `tree`.

    Returns a data frame built from `hierarchy` once the walk is complete.
    The module-level list `hierarchy` must exist before calling.
    """

    def _visit(node, level, parent_code, parent_name):
        # Record the node itself before descending into its children.
        hierarchy.append({
            'level': level,
            'type': node['type'],
            'parentCode': parent_code,
            'parentName': parent_name,
            'geoAreaCode': str(node['geoAreaCode']),
            'geoAreaName': node['geoAreaName'],
        })

        for child in node.get('children') or []:
            _visit(child, level + 1, str(node['geoAreaCode']), node['geoAreaName'])

    _visit(tree, getattr(traverse, 'level', 1), parentCode, parentName)

    # The table is built once here rather than once per visited node.
    return pd.DataFrame(hierarchy)

#### `geoAreas` holds the flattened list of geographic areas under 'World':

In [12]:
# Flatten the first tree returned by the API (the 'World' tree) into `geoAreas`,
# then show the first and last rows.
geoAreas = flatten(geoAreaTree()[0])
geoAreas.head()
print('...')
geoAreas.tail()

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
0,1,World,1,,,Region
1,10,Antarctica,2,1.0,World,Country
2,2,Africa,2,1.0,World,Region
3,15,Northern Africa,3,2.0,Africa,Region
4,12,Algeria,4,15.0,Northern Africa,Country


...


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
291,543,Oceania (exc. Australia and New Zealand),3,9,Oceania,Region
292,62,Central and Southern Asia,2,1,World,Region
293,513,Europe and Northern America,2,1,World,Region
294,747,Northern Africa and Western Asia,2,1,World,Region
295,753,Eastern and South-Eastern Asia,2,1,World,Region


##### Temporary Fix for missing regions: 

In [13]:
# Groupings that are added by hand to `geoAreas` when they are absent from
# the flattened tree: (geoAreaCode, geoAreaName).
missing_groups = [
    ('910', 'High income economies (WB)'),
    ('911', 'Low income economies (WB)'),
    ('912', 'Lower middle economies (WB)'),
    ('913', 'Low and middle income economies (WB)'),
    ('914', 'Upper middle economies (WB)'),
    ('135', 'Caucasus and Central Asia'),
    ('127', 'Southern Asia (excluding India)'),
    ('199', 'Least Developed Countries (LDC)'),
    ('485', 'Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)'),
    ('514', 'Developed Regions'),
    ('515', 'Developing Regions'),
    ('518', 'Eastern Asia (excluding Japan)'),
    ('738', 'Sub-Saharan Africa (inc. Sudan)'),
    ('746', 'Northern Africa (exc. Sudan)'),
]

# `code in series` tests the index labels of a pandas Series, not its values,
# so membership is checked against a set of the actual codes instead.
existing_codes = set(geoAreas['geoAreaCode'])

extra_rows = [
    {'level': 1,
     'type': 'Group',
     'parentCode': None,
     'parentName': None,
     'geoAreaCode': code,
     'geoAreaName': name}
    for code, name in missing_groups
    if code not in existing_codes
]

# `pd.concat` replaces the removed `DataFrame.append`; `sort=True` keeps the
# column ordering behaviour of the former `append(..., sort=True)` calls.
if extra_rows:
    geoAreas = pd.concat([geoAreas, pd.DataFrame(extra_rows)], sort=True)

#==================================

geoAreas = geoAreas.reset_index(drop=True)

geoAreas.tail(20)


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
290,876,Wallis and Futuna Islands,4,61.0,Polynesia,Country
291,543,Oceania (exc. Australia and New Zealand),3,9.0,Oceania,Region
292,62,Central and Southern Asia,2,1.0,World,Region
293,513,Europe and Northern America,2,1.0,World,Region
294,747,Northern Africa and Western Asia,2,1.0,World,Region
295,753,Eastern and South-Eastern Asia,2,1.0,World,Region
296,910,High income economies (WB),1,,,Group
297,911,Low income economies (WB),1,,,Group
298,912,Lower middle economies (WB),1,,,Group
299,913,Low and middle income economies (WB),1,,,Group


#### Merge coordinates and list of geographic areas in SDG database

In [14]:
# Inspect the table contents as a plain array.
geoAreas.values

array([['1', 'World', 1, None, None, 'Region'],
       ['10', 'Antarctica', 2, '1', 'World', 'Country'],
       ['2', 'Africa', 2, '1', 'World', 'Region'],
       ...,
       ['518', 'Eastern Asia (excluding Japan)', 1, None, None, 'Group'],
       ['738', 'Sub-Saharan Africa (inc. Sudan)', 1, None, None, 'Group'],
       ['746', 'Northern Africa (exc. Sudan)', 1, None, None, 'Group']],
      dtype=object)

In [15]:
def geoareasXY(geoareas, coordinates_file):
    """
    Attach coordinates to the catalogue of geographic areas.

    Outer-joins `geoareas` on `geoAreaCode` with the coordinates table read
    from `coordinates_file` (without that table's own `geoAreaName` column).
    The result is ordered by the numeric value of `geoAreaCode` and returned
    with a fresh index.
    """
    coordinates = countryListXY(coordinates_file)
    coordinates = coordinates.loc[:, coordinates.columns != 'geoAreaName']

    merged = pd.merge(geoareas, coordinates, how='outer', on=['geoAreaCode'])

    # Sort on a temporary float copy of the code so that the ordering is
    # numeric, then discard the helper column.
    merged['order'] = merged['geoAreaCode'].astype(float)
    merged = merged.sort_values('order')
    merged = merged.drop(columns='order')

    return merged.reset_index(drop=True)

# x.to_excel('test.xlsx', engine ='xlsxwriter')

In [16]:
# Show the geographic areas joined with their coordinates.
geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,Y
0,1,World,1,,,Region,,,,,
1,2,Africa,2,1,World,Region,,,,,
2,4,Afghanistan,4,34,Southern Asia,Country,1,AFG,1,66.02688198,33.83160199
3,5,South America,4,419,Latin America and the Caribbean,Region,,,,,
4,8,Albania,4,39,Southern Europe,Country,1,ALB,1,20.06660928,41.13897007
5,9,Oceania,2,1,World,Region,,,,,
6,10,Antarctica,2,1,World,Country,1,ATA,0,21.47585697,-80.40897662
7,11,Western Africa,4,202,Sub-Saharan Africa,Region,,,,,
8,12,Algeria,4,15,Northern Africa,Country,1,DZA,1,2.678164227,28.15940032
9,13,Central America,4,419,Latin America and the Caribbean,Region,,,,,


### Get the list of goals, targets, indicators and series

In [17]:
def series_list(release):
    """
    List the series of a release together with their goal, target and indicator.

    Calls the `Goal/List` endpoint of the SDG API (with all children) and
    keeps every series whose `release` equals the `release` argument.

    Returns a data frame with one row per matching series and the columns
    `goalCode`, `goalDesc`, `targetCode`, `targetDesc`, `indicatorCode`,
    `indicatorDesc`, `indicatorTier`, `seriesCode`, `seriesDesc` and
    `seriesRelease`.  When no series matches, an empty data frame with the
    same columns is returned.
    """
    # Call the endpoint of the SDG API that provides the list of goals with all their children:
    http = urllib3.PoolManager()
    response = http.request('GET', "https://unstats.un.org/SDGAPI/v1/sdg/Goal/List?includechildren=true")
    responseData = json.loads(response.data.decode('UTF-8'))

    # Column names of the flat table, in output order:
    keys = ["goalCode",
            "goalDesc",
            "targetCode",
            "targetDesc",
            "indicatorCode",
            "indicatorDesc",
            "indicatorTier",
            "seriesCode",
            "seriesDesc",
            "seriesRelease"
           ]

    # Iterate over goals, targets, indicators, and series for the specified release:
    rows = []
    for g in responseData:
        for t in g['targets']:
            for i in t['indicators']:
                for s in i['series']:
                    if s['release'] == release:
                        values = [g['code'], g['title'],
                                  t['code'], t['description'],
                                  i['code'], i['description'], i['tier'],
                                  s['code'], s['description'], s['release']]
                        rows.append(dict(zip(keys, values)))

    # Passing `columns` fixes the column order and keeps the columns present
    # even when `rows` is empty.
    return pd.DataFrame(rows, columns=keys)

In [18]:
# Preview the first four series of the configured release.
series_list(release).head(4)

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02
1,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_EMP1,Employed population below international povert...,2019.Q1.G.02
2,1,End poverty in all its forms everywhere,1.2,"By 2030, reduce at least by half the proportio...",1.2.1,Proportion of population living below the nati...,1,SI_POV_NAHC,Proportion of population living below the nati...,2019.Q1.G.02
3,1,End poverty in all its forms everywhere,1.3,Implement nationally appropriate social protec...,1.3.1,Proportion of population covered by social pro...,1,SI_COV_MATNL,[ILO] Proportion of mothers with newborns rece...,2019.Q1.G.02


#### Cross-Join geographic area catalogue (with coordinates) and series catalogue

In [19]:
def cross_geo_series(geoAreas, countryXY, seriesCode, release):
    """
    Cross-join the geographic areas (with coordinates) and one series.

    Parameters:
        geoAreas: data frame of geographic areas, passed to `geoareasXY`.
        countryXY: path of the coordinates file, passed to `geoareasXY`.
        seriesCode: code of the series to keep from `series_list(release)`.
        release: release code passed to `series_list`.

    Returns a data frame holding every combination of a geographic area row
    and a catalogue row whose `seriesCode` equals `seriesCode`.
    """
    df1 = geoareasXY(geoAreas, countryXY)
    df2 = series_list(release)

    # `.assign` returns new frames, so the constant join key is never written
    # into a filtered slice (which triggers SettingWithCopyWarning).
    df1 = df1.assign(key=1)
    df2 = df2.loc[df2['seriesCode'] == seriesCode].assign(key=1)

    # Joining on a constant key pairs every row of df1 with every row of df2.
    x = pd.merge(df1, df2, on='key')

    return x.drop(columns='key')

In [20]:
# Example: cross-join the geographic areas with series SE_ACC_COMP and show
# the first and last rows of the result.
x = cross_geo_series(geoAreas, wd_dir + 'CountryListXY.txt','SE_ACC_COMP','2019.Q1.G.02')
x.head()
print('...')
x.tail()

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,...,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,World,1,,,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
1,2,Africa,2,1.0,World,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
2,4,Afghanistan,4,34.0,Southern Asia,Country,1.0,AFG,1.0,66.02688198,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
3,5,South America,4,419.0,Latin America and the Caribbean,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
4,8,Albania,4,39.0,Southern Europe,Country,1.0,ALB,1.0,20.06660928,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02


...


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,...,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
305,910,High income economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
306,911,Low income economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
307,912,Lower middle economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
308,913,Low and middle income economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
309,914,Upper middle economies (WB),1,,,Group,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02


### Get the data for each series

#### Verify how many pages need to be requested to get all the data for a specific series from the SDG API. 

In [21]:
def series_request_details(seriesCode,release):
    """
    Work out how many pages are needed to download all the data of a series.

    Sends one request to the `Series/Data` endpoint and reads `totalElements`
    from the response.  The request itself asks for `pageSize=2`; only the
    total is used (presumably to keep this probe small).

    Returns a dictionary with the keys `series`, `totalElements`, `nPages`
    and `pageSize` (fixed at 500).  `nPages` is the number of 500-element
    pages needed to cover `totalElements`, and is never less than 1.
    """
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"

    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))

    pageSize = 500
    totalElements = responseData['totalElements']

    # Round up instead of `floor(...) + 1`, which asked for one page too many
    # whenever the total was an exact multiple of the page size.  At least
    # one page is still reported for an empty series, as before.
    nPages = max(1, math.ceil(totalElements / pageSize))

    return {'series' : seriesCode,
            'totalElements' : totalElements,
            'nPages' : nPages,
            'pageSize' : pageSize
           }


In [22]:
# Example: paging details for series SE_ACC_COMP.
series_request_details('SE_ACC_COMP', '2019.Q1.G.02')

{'series': 'SE_ACC_COMP', 'totalElements': 1414, 'nPages': 3, 'pageSize': 500}

#### Explore the code lists of the attributes and dimensions of a series
Describe each attribute or dimension as a simple dictionary made of a set of `code`-`description` pairs.  For the code, use the SDMX code, and not the internal code of the database.  Keep all labels in camelCase.

In [23]:
def series_code_lists(seriesCode, release):
    """
    Collect the code lists of the attributes and dimensions of a series.

    Requests the series from the `Series/Data` endpoint and returns a
    dictionary with the key `seriesCode` plus, for every attribute and
    dimension, an entry keyed by the camelCase form of its `id` that maps
    each `code` to its `description`.
    """
    url = ('https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode
           + '&releaseCode=' + release + "&pageSize=2")

    raw = urllib3.PoolManager().request('GET', url).data
    payload = json.loads(raw.decode('UTF-8'))

    attributes = payload['attributes']
    dimensions = payload['dimensions']

    code_lists = {'seriesCode': seriesCode}

    # Attributes are processed before dimensions, so a dimension replaces an
    # attribute that maps to the same camelCase key.
    for concept in list(attributes) + list(dimensions):
        code_lists[camelCase(concept['id'])] = {
            entry['code']: entry['description'] for entry in concept['codes']
        }

    return code_lists


In [24]:
# Example: code lists of series SE_ACC_COMP.
series_code_lists('SE_ACC_COMP', '2019.Q1.G.02')

{'seriesCode': 'SE_ACC_COMP',
 'nature': {'C': 'Country data',
  'CA': 'Country adjusted data',
  'E': 'Estimated data',
  'G': 'Global monitoring data',
  'M': 'Modeled data',
  'N': 'Non-relevant',
  'NA': 'Data nature not available'},
 'units': {'PERCENT': 'Percentage'},
 'educationLevel': {'GRAD23': 'Grades 2/3',
  'LOWSEC': 'Lower secondary',
  'PRIMAR': 'Primary',
  'PREPRI': 'Pre-primary',
  'SECOND': 'Secondary',
  'UPPSEC': 'Upper secondary'},
 'reportingType': {'N': 'National', 'G': 'Global'}}

#### Simplify further by presenting all the codes and their descriptions in a single table:

In [25]:
def series_code_lists2(seriesCode, release):
    """
    Present the codes of all attributes and dimensions of a series in one table.

    Requests the series from the `Series/Data` endpoint and returns a data
    frame with one row per code, holding `series`, `role` ('attribute' or
    'dimension'), `concept` (camelCase of the concept `id`), `code` (the
    `sdmx` value of the code) and `description`.
    """
    url = ('https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode
           + '&releaseCode=' + release + "&pageSize=2")

    raw = urllib3.PoolManager().request('GET', url).data
    payload = json.loads(raw.decode('UTF-8'))

    groups = (
        ('attribute', payload['attributes']),
        ('dimension', payload['dimensions']),
    )

    rows = []
    for role, concepts in groups:
        for concept in concepts:
            for entry in concept['codes']:
                rows.append({
                    'series': seriesCode,
                    'role': role,
                    'concept': camelCase(concept['id']),
                    'code': entry['sdmx'],
                    'description': entry['description'],
                })

    return pd.DataFrame(rows)


In [26]:
# Example: table of codes for series SE_ACC_COMP.
series_code_lists2('SE_ACC_COMP', '2019.Q1.G.02')

Unnamed: 0,code,concept,description,role,series
0,C,nature,Country data,attribute,SE_ACC_COMP
1,CA,nature,Country adjusted data,attribute,SE_ACC_COMP
2,E,nature,Estimated data,attribute,SE_ACC_COMP
3,G,nature,Global monitoring data,attribute,SE_ACC_COMP
4,M,nature,Modeled data,attribute,SE_ACC_COMP
5,N,nature,Non-relevant,attribute,SE_ACC_COMP
6,,nature,Data nature not available,attribute,SE_ACC_COMP
7,PERCENT,units,Percentage,attribute,SE_ACC_COMP
8,GRAD23,educationLevel,Grades 2/3,dimension,SE_ACC_COMP
9,LOWSEC,educationLevel,Lower secondary,dimension,SE_ACC_COMP


#### Build query string to collect data for a specific series from the global SDG API

In [27]:
def series_query(seriesCode, release, page, pageSize):
    """
    Request one page of data for a series from the global SDG API.

    Builds the `Series/Data` query string from the series code, release code,
    page number and page size, and returns the decoded JSON response.
    """
    query = ''.join([
        r'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=', seriesCode,
        '&releaseCode=', release,
        '&page=', str(page),
        '&pageSize=', str(pageSize),
    ])
    raw = urllib3.PoolManager().request('GET', query).data
    return json.loads(raw.decode('UTF-8'))

#### Get data for a specific series from the API
*(!) Notice that a data point may appear more than once if it belongs to a "multi-purpose indicator"*

In [28]:
def series_data(seriesCode, release):
    """
    Download all the data records of a series, page by page.

    Uses `series_request_details` to learn the number of pages and the page
    size, prints a progress line per page, and returns the concatenated
    `data` lists of all pages (an empty list when the series has no elements).
    """
    details = series_request_details(seriesCode, release)
    records = []

    if not details['totalElements'] > 0:
        return records

    for page in range(1, details['nPages'] + 1):
        print("---Series " + seriesCode + ": Processing page " + str(page) + " of " + str(details['nPages']))
        batch = series_query(seriesCode, release, page, details['pageSize'])['data']
        if len(batch) > 0:
            records.extend(batch)

    return records

In [31]:
# Example: download series EG_EGY_CLEAN and look at its first record.
# NOTE(review): the recorded output shows only the progress lines, so the
# expression inside the `if` does not appear to be echoed by the notebook.
x = series_data('EG_EGY_CLEAN','2019.Q1.G.02')
if len(x) > 0:
    x[0]

---Series EG_EGY_CLEAN: Processing page 1 of 3
---Series EG_EGY_CLEAN: Processing page 2 of 3
---Series EG_EGY_CLEAN: Processing page 3 of 3


#### Flatten the dictionary, extracting individual attributes and dimensions as key-value pairs in their own right.
Also convert the years (`timePeriod`) variable to `int`

In [32]:
def flat_series_data(seriesCode,release):
    """
    Flatten every data record of a series into a single-level dictionary.

    For each record returned by `series_data`:
      - list values are joined into one comma-separated string;
      - dictionary values are expanded, each entry stored under the
        camelCase form of its key followed by ' Code';
      - `time_detail` is stored under its camelCase name;
      - `timePeriodStart` is stored as `timePeriod`, converted to `int`;
      - `series` and `seriesDescription` are stored as `seriesCode` and
        `seriesDesc`;
      - `geoAreaCode` is converted to `str`;
      - everything else is copied unchanged.

    Returns the list of flattened dictionaries.
    """
    def unchanged(value):
        return value

    # Simple renames / conversions: source key -> (target key, converter).
    renames = {
        'timePeriodStart': ('timePeriod', int),
        'series': ('seriesCode', unchanged),
        'seriesDescription': ('seriesDesc', unchanged),
        'geoAreaCode': ('geoAreaCode', str),
    }

    flattened = []
    for record in series_data(seriesCode, release):
        flat = {}
        for field, content in record.items():
            # The container checks take precedence over the key-based rules.
            if type(content) is list:
                flat[field] = ', '.join(content)
            elif type(content) is dict:
                for name, code in content.items():
                    flat[camelCase(name + ' Code')] = code
            elif field == 'time_detail':
                flat[camelCase(field)] = content
            else:
                target, convert = renames.get(field, (field, unchanged))
                flat[target] = convert(content)
        flattened.append(flat)

    return flattened

In [36]:
# Example: flatten series EG_EGY_CLEAN and look at its first record.
# NOTE(review): the recorded output shows only the progress lines, so the
# expression inside the `if` does not appear to be echoed by the notebook.
x = flat_series_data('EG_EGY_CLEAN','2019.Q1.G.02')
if len(x) > 0: 
    x[0]

---Series EG_EGY_CLEAN: Processing page 1 of 3
---Series EG_EGY_CLEAN: Processing page 2 of 3
---Series EG_EGY_CLEAN: Processing page 3 of 3


#### Convert flattened dictionary of data series to pandas data frame, remove goal/target/indicator columns, and remove duplicate values

In [37]:
def dataframe_unique_series_data(seriesCode,release):
    """
    Build a de-duplicated data frame from the flattened records of a series.

    The columns are taken from the keys of the first flattened record; the
    `goal`, `target`, `indicator` and `seriesCount` columns are removed and
    duplicate rows are dropped.

    Returns the data frame, or None when the series has no records.
    """
    records = flat_series_data(seriesCode, release)
    if not len(records) > 0:
        return None

    frame = pd.DataFrame(records, columns=records[0].keys())
    frame = frame.drop(['goal','target','indicator', 'seriesCount'], axis=1)

    return frame.drop_duplicates()

In [40]:
# Example: build the de-duplicated table for series EG_EGY_CLEAN and, when
# there is data, write it to 'test2.xlsx' in the working directory.
x = dataframe_unique_series_data('EG_EGY_CLEAN','2019.Q1.G.02')
if x is not None:
    x.to_excel('test2.xlsx', engine ='xlsxwriter')
    x.head()

---Series EG_EGY_CLEAN: Processing page 1 of 3
---Series EG_EGY_CLEAN: Processing page 2 of 3
---Series EG_EGY_CLEAN: Processing page 3 of 3


#### Add descriptions to coded dimension and attributes

In [44]:
def annotated_data(seriesCode, release):
    """Attach a description column to every coded column of a series.

    For each concept in ``series_code_lists(seriesCode, release)`` other than
    ``seriesCode``, a two-column lookup (``<concept>Code``, ``<concept>Desc``)
    is built from the concept's code list and merged into the series data on
    ``<concept>Code``.

    Returns the annotated data frame, or ``None`` when the series has no data.
    """
    annotated = dataframe_unique_series_data(seriesCode, release)
    if annotated is None:
        return None

    code_lists = series_code_lists(seriesCode, release)

    for concept in code_lists.keys():
        if concept == 'seriesCode':
            continue

        code_column = concept + 'Code'

        # Lookup table: one row per (code, description) pair of this concept.
        lookup = pd.DataFrame(list(code_lists[concept].items()))
        lookup.columns = [code_column, concept + 'Desc']

        # Default (inner) merge: only rows whose code is in the lookup remain.
        annotated = pd.merge(annotated, lookup, on=code_column)
        annotated = annotated.reset_index(drop=True)

    return annotated

In [45]:
# Smoke test: annotate one series and report its columns, shape and first rows.
# Uses the `release` user parameter instead of a hard-coded copy so the
# check always runs against the release configured at the top of the notebook.
x = annotated_data('EG_EGY_CLEAN', release)
if x is not None:
    print(x.columns)
    print(x.shape)
    x.head()

---Series EG_EGY_CLEAN: Processing page 1 of 3
---Series EG_EGY_CLEAN: Processing page 2 of 3
---Series EG_EGY_CLEAN: Processing page 3 of 3


#### Produce 'long' files for each indicator/series combination
(Notice that multi-purpose indicators need to be split)

In [61]:
# Series codes to exclude from the current run: they are subtracted from
# `series_all` below to obtain `series_pending`.
# NOTE(review): presumably these were exported by earlier (interrupted) runs
# and the list is maintained by hand -- confirm before relying on it.
series_done = ['SI_COV_BENFTS', 'SI_COV_LMKT', 'SI_COV_MATNL', 'SI_COV_PENSN', 'SI_COV_POOR', 'SI_COV_SOCAST', 
               'SI_COV_SOCINS', 'SI_COV_SOCINSPQ', 'SI_COV_UEMP', 'SI_COV_VULN', 'SI_COV_WKINJRY', 'VC_DSR_AFFCT', 
               'VC_DSR_DADN', 'VC_DSR_DAFF', 'VC_DSR_DYDN', 'VC_DSR_DYHN', 'VC_DSR_IJILN', 'VC_DSR_MORT', 'VC_DSR_MTMN', 
               'VC_DSR_MTMP', 'VC_DSR_PDYN', 'VC_DSR_CILN', 'VC_DSR_GDPLS', 'VC_DSR_HOLN', 'SG_DSR_LEGREG', 'SG_DSR_LGRGSR', 
               'SG_DSR_SILN', 'SL_EMP_GTOTL', 'FI_FSI_FSANL', 'FI_FSI_FSERA', 'FI_FSI_FSKNL', 'FI_FSI_FSKRTC', 'FI_FSI_FSLS', 
               'FI_FSI_FSSNO', 'SG_INT_VRTDEV', 'EN_LND_SLUM', 'VC_DSR_BSDN', 'VC_DSR_CDAN', 'VC_DSR_HFDN', 'EN_ATM_PM25', 
               'SG_SCP_CNTRY', 'SG_SCP_CORMEC', 'SG_SCP_MACPOL', 'EN_MAT_DOMCMPG', 'EN_MAT_DOMCMPT', 'SG_HAZ_CMRMNTRL', 
               'ER_FFS_PRTSST', 'ER_H2O_FWTL', 'ER_MRN_MPA', 'ER_REG_UNFCIM', 'ER_REG_SSFRAR', 'AG_LND_FRSTN', 'AG_LND_TOTL', 
               'ER_PTD_FRWRT', 'AG_LND_FRSTBIOPHA', 'AG_LND_FRSTCERT', 'AG_LND_FRSTPRCT', 'ER_MTN_GRNCOV', 'ER_MTN_GRNCVI', 
               'ER_MTN_TOTL', 'ER_CBD_PTYPGRFA', 'VC_IHR_PSRCN', 'VC_VAW_MTUHRA', 'SG_INF_ACCSS', 'VC_VAW_PHYPYV', 'VC_VAW_DIST', 
               'VC_VAW_SXVLN', 'GF_XPD_GBPC', 'SG_NHR_IMPL', 'SG_NHR_IMPLN', 'SG_NHR_INTEXST', 'SG_NHR_INTEXSTN', 'SG_NHR_NOAPPLN', 
               'SG_NHR_NOSTUSN', 'TX_EXP_GBSVR', 'TX_IMP_GBMRCH', 'TX_IMP_GBSVR', 'SG_PLN_PRPOLRES', 'SG_PLN_RECRIMON', 'SG_STT_FPOS', 
               'SG_STT_NSDSFDDNR', 'SG_STT_NSDSFDGVT', 'SG_STT_CAPTY', 'SG_REG_BRTH90', 'SG_REG_DETH75', 'SG_REG_DETH75N', 'DC_ODA_LLDCG',
               'DC_ODA_SIDS', 'DC_ODA_TOTG', 'BX_TRF_PWKR', 'IT_NET_BBN', 'IT_NET_BBP', 'IT_USE_ii99', 'SN_ITK_DEFCN', 'AG_PRD_FIESSIN', 
               'SH_STA_STUNT', 'SH_STA_OVRWGT', 'SH_STA_WASTE', 'ER_GRF_ANIMKPT', 'ER_GRF_ANIMRCNT', 'ER_GRF_ANIMSTORN', 'ER_NRK_LBRED', 
               'ER_NRK_LBREDN', 'ER_RSK_LBRED', 'ER_UNK_LBRED', 'ER_UNK_LBREDN', 'AG_PRD_ORTIND', 'AG_XPD_AGSGB', 'DC_TOF_AGRL', 'AG_PRD_XSUBDY', 
               'SH_STA_MMR', 'SH_STA_BRTC', 'SH_DYN_MORT', 'SH_DYN_MORTN', 'SH_DYN_NMRTN', 'SH_HIV_INCD', 'SH_HAP_HBSAG', 'SH_DTH_RNCOM', 
               'SH_STA_SCIDE', 'SP_DYN_ADKL', 'SH_ACS_UNHC', 'SH_XPD_EARN10', 'SH_XPD_EARN25', 'SH_HAP_ASMORT', 'SH_STA_AIRP', 'SH_STA_POISN', 
               'SH_PRV_SMOK', 'SH_ACS_DTP3', 'DC_TOF_HLTHL', 'SE_ADT_ACTS', 'SE_GPI_FUNPROF', 'SE_GPI_ICTS', 'SE_GPI_MATACH', 'SE_GPI_TRATEA', 
               'SE_LGP_ACHIMA', 'SE_LGP_ACHIRE', 'SE_NAP_ACHIMA', 'SE_PRE_GPIPARTN', 'SE_SEP_MATACH', 'SE_SEP_REAACH', 'SE_URP_REAACH', 'SE_ADT_FUNS', 
               'SE_ACC_ELEC', 'SE_ACC_HNWA', 'SE_ACC_SANI', 'SE_INF_DSBL', 'SE_TRA_GRDL', 'VC_VAW_MARR', 'SP_DYN_MRBF18', 'SH_STA_FGMS', 'SL_DOM_TSPDCW',
               'SL_DOM_TSPDDC', 'SG_GEN_PARL', 'SG_GEN_PARLNT', 'IC_GEN_MGTL', 'IC_GEN_MGTN', 'SH_FPL_INFM', 'SH_FPL_INFMCU', 'SH_FPL_INFMRH', 'SH_H2O_SAFE',
               'SH_SAN_DEFECT', 'SH_SAN_SAFE', 'EN_WWT_WWDS', 'EN_H2O_RVAMBQ', 'EN_H2O_WBAMBQ', 'ER_H2O_WUEYST', 'ER_H2O_IWRMP', 'EG_TBA_H2CO', 'EN_WBE_NDOPW', 
               'EN_WBE_NDQLOPW', 'EN_WBE_NDQLRVR', 'EN_WBE_NDQLTOT', 'EN_WBE_NDQTGRW', 'EN_WBE_NDQTOPW', 'EN_WBE_NDQTRVR', 'EN_WBE_PMPN', 'EN_WBE_PMPR',
               'ER_H2O_PARTIC', 'ER_H2O_RURP', 'ER_WAT_PARTIC', 'ER_WAT_PRDU', 'ER_WAT_PROCED', 'EG_ELC_ACCS', 'EG_FEC_RNEW', 'FB_ATM_TOTL', 'FB_BNK_ACCSS', 
               'SL_ISV_IFRM', 'SL_EMP_AEARN', 'SL_TLF_UEMDIS', 'SL_TLF_NEET', 'SL_TLF_CHLDEC', 'SL_EMP_FTLINJUR', 'DC_TOF_TRDCMDL', 'DC_TOF_TRDCML', 'DC_TOF_TRDDBML', 
               'IS_RDP_FRGVOL', 'IS_RDP_PFVOL', 'IS_TRP_MAILTKM', 'SL_TLF_MANF', 'NV_IND_SSIS', 'EN_ATM_CO2', 'GB_XPD_RSDV', 'IT_MOB_NTWK',
               'AG_FPA_CFPI', 'AG_FPA_COMM', 'AG_LND_FRST', 'AG_LND_FRSTCHG', 'AG_LND_FRSTMGT', 'AG_PRD_AGVAS', 'AG_PRD_FIESSI', 'DC_FTA_TOTAL', 'DC_ODA_BDVDL', 
               'DC_ODA_BDVL', 'DC_ODA_LDCG', 'DC_ODA_LDCS', 'DC_ODA_LLDC', 'DC_ODA_SIDSG', 'DC_ODA_TOTL', 'DC_TOF_HLTHNT']

# Complete work list of series codes; `series_done` is subtracted from it
# below to obtain `series_pending`.
# NOTE(review): presumably a snapshot of every series code in the release --
# verify against series_list(release) when the release changes.
series_all = ['AG_FPA_CFPI', 'AG_FPA_COMM', 'AG_LND_FRST', 'AG_LND_FRSTBIOPHA', 'AG_LND_FRSTCERT', 'AG_LND_FRSTCHG', 'AG_LND_FRSTMGT', 'AG_LND_FRSTN',
              'AG_LND_FRSTPRCT', 'AG_LND_TOTL', 'AG_PRD_AGVAS', 'AG_PRD_FIESSI', 'AG_PRD_FIESSIN', 'AG_PRD_ORTIND', 'AG_PRD_XSUBDY', 'AG_XPD_AGSGB', 
              'BX_TRF_PWKR', 'DC_FTA_TOTAL', 'DC_ODA_BDVDL', 'DC_ODA_BDVL', 'DC_ODA_LDCG', 'DC_ODA_LDCS', 'DC_ODA_LLDC', 'DC_ODA_LLDCG', 'DC_ODA_SIDS', 
              'DC_ODA_SIDSG', 'DC_ODA_TOTG', 'DC_ODA_TOTL', 'DC_TOF_AGRL', 'DC_TOF_HLTHL', 'DC_TOF_HLTHNT', 'DC_TOF_INFRAL', 'DC_TOF_SCHIPSL', 'DC_TOF_TRDCMDL', 
              'DC_TOF_TRDCML', 'DC_TOF_TRDDBMDL', 'DC_TOF_TRDDBML', 'DC_TOF_WASHL', 'DC_TRF_TFDV', 'DC_TRF_TOTDL', 'DC_TRF_TOTL', 'DT_TDS_DECT', 'EG_EGY_CLEAN', 
              'EG_EGY_PRIM', 'EG_ELC_ACCS', 'EG_FEC_RNEW', 'EG_TBA_H2CO', 'EG_TBA_H2COAQ', 'EG_TBA_H2CORL', 'EN_ATM_CO2', 'EN_ATM_CO2GDP', 'EN_ATM_CO2MVA', 
              'EN_ATM_PM25', 'EN_H2O_GRAMBQ', 'EN_H2O_OPAMBQ', 'EN_H2O_RVAMBQ', 'EN_H2O_WBAMBQ', 'EN_LND_SLUM', 'EN_MAT_DOMCMPC', 'EN_MAT_DOMCMPG',
              'EN_MAT_DOMCMPT', 'EN_REF_WASCOL', 'EN_WBE_NDETOT', 'EN_WBE_NDOPW', 'EN_WBE_NDQLGRW', 'EN_WBE_NDQLOPW', 'EN_WBE_NDQLRVR', 'EN_WBE_NDQLTOT', 
              'EN_WBE_NDQTGRW', 'EN_WBE_NDQTOPW', 'EN_WBE_NDQTRVR', 'EN_WBE_NDQTTOT', 'EN_WBE_NDRV', 'EN_WBE_NDWTL', 'EN_WBE_PMNR', 'EN_WBE_PMPN', 'EN_WBE_PMPP',
              'EN_WBE_PMPR', 'EN_WWT_WWDS', 'ER_CBD_ABSCLRHS', 'ER_CBD_NAGOYA', 'ER_CBD_ORSPGRFA', 'ER_CBD_PTYPGRFA', 'ER_FFS_PRTSPC', 'ER_FFS_PRTSPR', 
              'ER_FFS_PRTSST', 'ER_GRF_ANIMKPT', 'ER_GRF_ANIMRCNT', 'ER_GRF_ANIMRCNTN', 'ER_GRF_ANIMSTOR', 'ER_GRF_ANIMSTORN', 'ER_GRF_PLNTSTOR', 'ER_H2O_FWTL', 
              'ER_H2O_IWRMD', 'ER_H2O_IWRMP', 'ER_H2O_PARTIC', 'ER_H2O_PRDU', 'ER_H2O_RURP', 'ER_H2O_STRESS', 'ER_H2O_WUEYST', 'ER_MRN_MARIN', 'ER_MRN_MARINT', 
              'ER_MRN_MPA', 'ER_MTN_GRNCOV', 'ER_MTN_GRNCVI', 'ER_MTN_TOTL', 'ER_NRK_LBRED', 'ER_NRK_LBREDN', 'ER_PTD_FRWRT', 'ER_PTD_MOTN', 'ER_PTD_TERRS',
              'ER_RDE_OSEX', 'ER_REG_SSFRAR', 'ER_REG_UNFCIM', 'ER_RSK_LBRED', 'ER_RSK_LBREDN', 'ER_RSK_LSTI', 'ER_UNK_LBRED', 'ER_UNK_LBREDN', 'ER_WAT_PART', 
              'ER_WAT_PARTIC', 'ER_WAT_PRDU', 'ER_WAT_PROCED', 'FB_ATM_TOTL', 'FB_BNK_ACCSS', 'FB_CBK_BRCH', 'FC_ACC_SSID', 'FI_FSI_FSANL', 'FI_FSI_FSERA', 
              'FI_FSI_FSKA', 'FI_FSI_FSKNL', 'FI_FSI_FSKRTC', 'FI_FSI_FSLS', 'FI_FSI_FSSNO', 'GB_POP_SCIERD', 'GB_XPD_RSDV', 'GF_XPD_GBPC', 'IC_FRM_BRIB', 
              'IC_GEN_MGTL', 'IC_GEN_MGTN', 'IS_RDP_FRGVOL', 'IS_RDP_PFVOL', 'IS_TRP_MAILTKM', 'IT_MOB_NTWK', 'IT_MOB_OWN', 'IT_NET_BBN', 'IT_NET_BBP', 
              'IT_USE_ii99', 'NV_IND_MANF', 'NV_IND_MANFPC', 'NV_IND_SSIS', 'NV_IND_TECH', 'NY_GDP_PCAP', 'SD_XPD_ESED', 'SE_ACC_COMP', 'SE_ACC_DWAT', 'SE_ACC_ELEC', 
              'SE_ACC_HNWA', 'SE_ACC_INTN', 'SE_ACC_SANI', 'SE_ADT_ACTS', 'SE_ADT_EDUCTRN', 'SE_ADT_FUNS', 'SE_DEV_ONTRK', 'SE_GPI_FUNPROF', 'SE_GPI_ICTS', 'SE_GPI_MATACH',
              'SE_GPI_PART', 'SE_GPI_REAACH', 'SE_GPI_TRATEA', 'SE_IMP_FPOF', 'SE_INF_DSBL', 'SE_LGP_ACHIMA', 'SE_LGP_ACHIRE', 'SE_MAT_PROF', 'SE_NAP_ACHIMA', 'SE_NAP_ACHIRE', 
              'SE_PRE_GPIPARTN', 'SE_PRE_PARTN', 'SE_REA_PROF', 'SE_SEP_FUNPROF', 'SE_SEP_MATACH', 'SE_SEP_REAACH', 'SE_TRA_GRDL', 'SE_URP_MATACH', 'SE_URP_REAACH', 
              'SG_DSR_LEGREG', 'SG_DSR_LGRGSR', 'SG_DSR_SILN', 'SG_DSR_SILS', 'SG_GEN_PARL', 'SG_GEN_PARLN', 'SG_GEN_PARLNT', 'SG_GOV_LOGV', 'SG_HAZ_CMRBASEL', 
              'SG_HAZ_CMRMNTRL', 'SG_HAZ_CMRROTDAM', 'SG_HAZ_CMRSTHOLM', 'SG_INF_ACCSS', 'SG_INT_MBRDEV', 'SG_INT_VRTDEV', 'SG_NHR_IMPL', 'SG_NHR_IMPLN', 'SG_NHR_INTEXST', 
              'SG_NHR_INTEXSTN', 'SG_NHR_NOAPPLN', 'SG_NHR_NOSTUSN', 'SG_PLN_MSTKSDG', 'SG_PLN_PRPOLRES', 'SG_PLN_PRVNDI', 'SG_PLN_PRVRICTRY', 'SG_PLN_PRVRIMON', 
              'SG_PLN_RECNDI', 'SG_PLN_RECRICTRY', 'SG_PLN_RECRIMON', 'SG_PLN_REPOLRES', 'SG_REG_BRTH', 'SG_REG_BRTH90', 'SG_REG_BRTH90N', 'SG_REG_CENSUS', 'SG_REG_CENSUSN', 
              'SG_REG_DETH75', 'SG_REG_DETH75N', 'SG_SCP_CNTRY', 'SG_SCP_CORMEC', 'SG_SCP_MACPOL', 'SG_SCP_POLINS', 'SG_STT_CAPTY', 'SG_STT_FPOS', 'SG_STT_NSDSFDDNR', 
              'SG_STT_NSDSFDGVT', 'SG_STT_NSDSFDOTHR', 'SG_STT_NSDSFND', 'SG_STT_NSDSIMPL', 'SH_AAP_ASMORT', 'SH_AAP_MORT', 'SH_ACS_DTP3', 'SH_ACS_MCV2', 'SH_ACS_PCV3',
              'SH_ACS_UNHC', 'SH_ALC_CONSPT', 'SH_DTH_NCOM', 'SH_DTH_RNCOM', 'SH_DYN_IMRT', 'SH_DYN_IMRTN', 'SH_DYN_MORT', 'SH_DYN_MORTN', 'SH_DYN_NMRT', 'SH_DYN_NMRTN', 
              'SH_FPL_INFM', 'SH_FPL_INFMCU', 'SH_FPL_INFMRH', 'SH_FPL_INFMSR', 'SH_FPL_MTMM', 'SH_H2O_SAFE', 'SH_HAP_ASMORT', 'SH_HAP_HBSAG', 'SH_HAP_MORT', 'SH_HIV_INCD', 
              'SH_IHR_CAPPRD', 'SH_IHR_CAPS', 'SH_MED_HEAWOR', 'SH_PRV_SMOK', 'SH_SAN_DEFECT', 'SH_SAN_HNDWSH', 'SH_SAN_SAFE', 'SH_STA_AIRP', 'SH_STA_ASAIRP', 'SH_STA_BRTC',
              'SH_STA_FGMS', 'SH_STA_MALR', 'SH_STA_MMR', 'SH_STA_OVRWGT', 'SH_STA_OVRWGTN', 'SH_STA_POISN', 'SH_STA_SCIDE', 'SH_STA_SCIDEN', 'SH_STA_STUNT', 'SH_STA_STUNTN',
              'SH_STA_TRAF', 'SH_STA_WASH', 'SH_STA_WASTE', 'SH_STA_WASTEN', 'SH_TBS_INCID', 'SH_TRP_INTVN', 'SH_XPD_EARN10', 'SH_XPD_EARN25', 'SI_COV_BENFTS', 'SI_COV_CHLD', 
              'SI_COV_DISAB', 'SI_COV_LMKT', 'SI_COV_LMKTPQ', 'SI_COV_MATNL', 'SI_COV_PENSN', 'SI_COV_POOR', 'SI_COV_SOCAST', 'SI_COV_SOCASTPQ', 'SI_COV_SOCINS', 'SI_COV_SOCINSPQ', 
              'SI_COV_UEMP', 'SI_COV_VULN', 'SI_COV_WKINJRY', 'SI_HEI_BTN40', 'SI_HEI_TOTL', 'SI_POV_DAY1', 'SI_POV_EMP1', 'SI_POV_NAHC', 'SI_RMT_COST', 'SL_DOM_TSPD', 
              'SL_DOM_TSPDCW', 'SL_DOM_TSPDDC', 'SL_EMP_AEARN', 'SL_EMP_FTLINJUR', 'SL_EMP_GTOTL', 'SL_EMP_INJUR', 'SL_EMP_PCAP', 'SL_ISV_IFRM', 'SL_TLF_CHLDEA', 'SL_TLF_CHLDEC', 
              'SL_TLF_MANF', 'SL_TLF_NEET', 'SL_TLF_UEM', 'SL_TLF_UEMDIS', 'SN_ITK_DEFC', 'SN_ITK_DEFCN', 'SP_DYN_ADKL', 'SP_DYN_MRBF15', 'SP_DYN_MRBF18', 'TM_TAX_ATRFD', 
              'TM_TAX_WWTAV', 'TM_TRF_ZERO', 'TX_EXP_GBMRCH', 'TX_EXP_GBSVR', 'TX_IMP_GBMRCH', 'TX_IMP_GBSVR', 'VC_DSR_AFFCT', 'VC_DSR_AGLH', 'VC_DSR_AGLN', 'VC_DSR_BSDN',
              'VC_DSR_CDAN', 'VC_DSR_CDYN', 'VC_DSR_CHLN', 'VC_DSR_CILN', 'VC_DSR_DADN', 'VC_DSR_DAFF', 'VC_DSR_DDHN', 'VC_DSR_DYDN', 'VC_DSR_DYHN', 'VC_DSR_EFDN',
              'VC_DSR_ESDN', 'VC_DSR_GDPLS', 'VC_DSR_HFDN', 'VC_DSR_HOLH', 'VC_DSR_HOLN', 'VC_DSR_HSDN', 'VC_DSR_IJILN', 'VC_DSR_LSGP', 'VC_DSR_MISS', 'VC_DSR_MMHN',
              'VC_DSR_MORT', 'VC_DSR_MTMN', 'VC_DSR_MTMP', 'VC_DSR_OBDN', 'VC_DSR_PDAN', 'VC_DSR_PDLN', 'VC_DSR_PDYN', 'VC_IHR_PSRC', 'VC_IHR_PSRCN', 'VC_PRS_UNSEC', 
              'VC_VAW_DIST', 'VC_VAW_MARR', 'VC_VAW_MTUHRA', 'VC_VAW_PHYPYV', 'VC_VAW_SXVLN', 'ER_H2O_PROCED']

# Series still to be exported: the codes in series_all that are not in
# series_done. np.setdiff1d returns them as a sorted array of unique values.
series_pending = np.setdiff1d(series_all,series_done)


# Sanity check of the three list sizes.
# NOTE(review): these bare expressions are presumably echoed only because the
# notebook sets ast_node_interactivity = "all" -- confirm.
len(series_done)
len(series_all)
len(series_pending)
 


219

375

156

In [59]:
# Export one "long" workbook per indicator/series combination.
#
# For every pending series the annotated data is expanded onto the full grid
# (time periods x geographic areas x disaggregation keys), so that
# combinations without an observation appear as rows with empty values, and
# the result is written to `Data_<indicatorCode>_<seriesCode>.xlsx`.
series_completed = []

# Neither look-up depends on the series being processed, so each is fetched
# once instead of once per series.
geo = geoareasXY(geoAreas, wd_dir + 'CountryListXY.txt')
geo['key'] = 0   # constant key used below to build cross joins
all_indicators = series_list(release)

# Columns of the annotated data that are not disaggregation dimensions.
non_key_columns = ['geoAreaCode', 'timePeriod', 'value', 'valueType', 'timeDetail', 'source', 'footnotes', 
                   'natureCode', 'unitsCode', 'reportingTypeCode','natureDesc', 'unitsDesc', 'reportingTypeDesc']

# Alternative: process every series of the release instead of the pending ones
#for s in list(set(series_list(release).seriesCode)):
for s in series_pending:

    data = annotated_data(s,release)
    if data is None:
        # annotated_data returns None when the series has no records;
        # skip it instead of failing on data.drop below.
        print('---Series ' + s + ': no data returned, skipping')
        continue
    data = data.drop(['seriesDesc' , 'geoAreaName'] , axis='columns')

    # Distinct time periods observed for this series.
    time_values = data[['timePeriod']].drop_duplicates().assign(key=0)

    # Distinct combinations of the remaining (disaggregation) columns.
    data_key = data.drop(non_key_columns, axis='columns').drop_duplicates()

    data_dimensions = list(data_key.columns) + ['geoAreaCode','timePeriod']

    # NOTE(review): bare expression kept on purpose; presumably echoed by the
    # notebook because of ast_node_interactivity = "all" -- confirm.
    data_dimensions 

    data_key = data_key.assign(key=0)

    # Cross join (through the constant key) of periods, areas and keys.
    xx = pd.merge(pd.merge(time_values, geo, on='key', how = 'left'),
                  data_key, on='key', how ='left')

    # Keyword form: the positional `axis` argument is not accepted by
    # recent pandas versions.
    xx = xx.drop(columns='key')

    # Attach the observations; grid cells without data keep empty values.
    xx = pd.merge(xx, data, how='left', on=data_dimensions)

    # NOTE(review): bare expression kept for the same reason as above.
    xx.columns

    indicators = all_indicators.loc[all_indicators['seriesCode']==s]

    for i in indicators['indicatorCode']:
        # Restrict to the indicator being written so that a multi-purpose
        # series is split into one workbook per indicator, instead of every
        # workbook containing the rows of all its indicators.
        y = pd.merge(indicators.loc[indicators['indicatorCode']==i], xx, on='seriesCode')
        y.to_excel(wd_dir + 'Data_' + i + '_' + s + '.xlsx', engine ='xlsxwriter', index=False)
        
    series_completed.append(s)
    
    # Progress report: lets an interrupted run be resumed via series_done.
    print(series_completed)
        


---Series AG_FPA_CFPI: Processing page 1 of 1


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series AG_FPA_COMM: Processing page 1 of 5
---Series AG_FPA_COMM: Processing page 2 of 5
---Series AG_FPA_COMM: Processing page 3 of 5
---Series AG_FPA_COMM: Processing page 4 of 5
---Series AG_FPA_COMM: Processing page 5 of 5


['seriesCode',
 'typeOfProductCode',
 'typeOfProductDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'typeOfProductCode', 'typeOfProductDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series AG_LND_FRST: Processing page 1 of 3
---Series AG_LND_FRST: Processing page 2 of 3
---Series AG_LND_FRST: Processing page 3 of 3


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series AG_LND_FRSTCHG: Processing page 1 of 2
---Series AG_LND_FRSTCHG: Processing page 2 of 2


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series AG_LND_FRSTMGT: Processing page 1 of 2
---Series AG_LND_FRSTMGT: Processing page 2 of 2


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series AG_PRD_AGVAS: Processing page 1 of 11
---Series AG_PRD_AGVAS: Processing page 2 of 11
---Series AG_PRD_AGVAS: Processing page 3 of 11
---Series AG_PRD_AGVAS: Processing page 4 of 11
---Series AG_PRD_AGVAS: Processing page 5 of 11
---Series AG_PRD_AGVAS: Processing page 6 of 11
---Series AG_PRD_AGVAS: Processing page 7 of 11
---Series AG_PRD_AGVAS: Processing page 8 of 11
---Series AG_PRD_AGVAS: Processing page 9 of 11
---Series AG_PRD_AGVAS: Processing page 10 of 11
---Series AG_PRD_AGVAS: Processing page 11 of 11


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series AG_PRD_FIESSI: Processing page 1 of 2
---Series AG_PRD_FIESSI: Processing page 2 of 2


['seriesCode',
 'ageCode',
 'boundsCode',
 'ageDesc',
 'boundsDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'ageCode', 'boundsCode', 'ageDesc', 'boundsDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_FTA_TOTAL: Processing page 1 of 7
---Series DC_FTA_TOTAL: Processing page 2 of 7
---Series DC_FTA_TOTAL: Processing page 3 of 7
---Series DC_FTA_TOTAL: Processing page 4 of 7
---Series DC_FTA_TOTAL: Processing page 5 of 7
---Series DC_FTA_TOTAL: Processing page 6 of 7
---Series DC_FTA_TOTAL: Processing page 7 of 7


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_BDVDL: Processing page 1 of 2
---Series DC_ODA_BDVDL: Processing page 2 of 2


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_BDVL: Processing page 1 of 11
---Series DC_ODA_BDVL: Processing page 2 of 11
---Series DC_ODA_BDVL: Processing page 3 of 11
---Series DC_ODA_BDVL: Processing page 4 of 11
---Series DC_ODA_BDVL: Processing page 5 of 11
---Series DC_ODA_BDVL: Processing page 6 of 11
---Series DC_ODA_BDVL: Processing page 7 of 11
---Series DC_ODA_BDVL: Processing page 8 of 11
---Series DC_ODA_BDVL: Processing page 9 of 11
---Series DC_ODA_BDVL: Processing page 10 of 11
---Series DC_ODA_BDVL: Processing page 11 of 11


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_LDCG: Processing page 1 of 1


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_LDCS: Processing page 1 of 1


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_LLDC: Processing page 1 of 1


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_SIDSG: Processing page 1 of 1


['seriesCode', 'geoAreaCode', 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'value', 'valueType', 'timeDetail', 'source', 'footnotes',
       'natureCode', 'unitsCode', 'reportingTypeCode', 'natureDesc',
       'unitsDesc', 'reportingTypeDesc'],
      dtype='object')

---Series DC_ODA_TOTL: Processing page 1 of 2
---Series DC_ODA_TOTL: Processing page 2 of 2


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_TOF_HLTHNT: Processing page 1 of 3
---Series DC_TOF_HLTHNT: Processing page 2 of 3
---Series DC_TOF_HLTHNT: Processing page 3 of 3


['seriesCode',
 'unitmultiplierCode',
 'unitmultiplierDesc',
 'geoAreaCode',
 'timePeriod']

Index(['timePeriod', 'geoAreaCode', 'geoAreaName', 'level', 'parentCode',
       'parentName', 'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y',
       'seriesCode', 'unitmultiplierCode', 'unitmultiplierDesc', 'value',
       'valueType', 'timeDetail', 'source', 'footnotes', 'natureCode',
       'unitsCode', 'reportingTypeCode', 'natureDesc', 'unitsDesc',
       'reportingTypeDesc'],
      dtype='object')

---Series DC_TOF_INFRAL: Processing page 1 of 7
---Series DC_TOF_INFRAL: Processing page 2 of 7


KeyboardInterrupt: 

In [None]:
# Scratch cell: inspect the raw records of a single series.
# Note that this rebinds the global `s` also used as the export loop variable.
s = 'SL_ISV_IFRM'
series_data(s, release)

In [None]:
# Scratch cell: list the series of the configured release (uses the `release`
# user parameter rather than a hard-coded copy that could go stale).
series_list(release)

In [None]:
# Scratch cell: show the current value of `i`, which the export loop above
# uses as its indicator-code loop variable.
i