# Pre-processing script
This script pulls SDG data from API and transforms it into csv files.
The steps are:
- Pull data from [API](https://unstats.un.org/SDGAPI/swagger/) 
- Join with geography
- save as "long" table
- pivot into "wide" format and split regional and country data 

### Load necessary libraries

In [3]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import io
import collections
import numpy as np
import pandas as pd
import xlsxwriter


### User parameters

In [4]:
release = '2019.Q1.G.02' # Make sure to have the correct release here

dir_path = os.path.dirname(os.path.realpath('__file__'))
print(dir_path)

wd_dir = r'../'
print('data inputs dir: ' + wd_dir)

/home/jovyan/FIS4SDGs/unsd/notebooks
data inputs dir: ../


## Utilities

#### Convert string to camelCase

In [5]:
def camelCase(st):
    """
    https://stackoverflow.com/questions/8347048/camelcase-every-string-any-standard-library
    
    """
    output = ''.join(x for x in st.title() if x.isalnum())
    return output[0].lower() + output[1:]

#### Disable insecure request warnings when using `urllib3`.

In [6]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#### Create array to catch errors

In [7]:
error_log = []

### List of countries to be plotted on a map (with XY coordinates)

In [8]:
def countryListXY(file):
    
    countryListXY = []
    
    with open(file, newline = '', encoding='latin-1') as countryList:                                                                                          
        countryList = csv.DictReader(countryList, delimiter='\t')
        for row in countryList:
            countryListXY.append(dict(row))
            
    countryListXY = pd.DataFrame(countryListXY).astype({'geoAreaCode':'str'})
    
    return(countryListXY)

    #print(countryListXY[1])
    #for c in countryListXY:

In [16]:
countryListXY(wd_dir + 'CountryListXY.txt').head(10)

Unnamed: 0,CountryProfile,ISO3CD,UNMember,X,Y,geoAreaCode,geoAreaName
0,1,AFG,1,66.02688198,33.83160199,4,Afghanistan
1,1,ALB,1,20.06660928,41.13897007,8,Albania
2,1,ATA,0,21.47585697,-80.40897662,10,Antarctica
3,1,DZA,1,2.678164227,28.15940032,12,Algeria
4,1,ASM,0,-170.7187269,-14.30587306,16,American Samoa
5,1,AND,1,1.576257417,42.54548611,20,Andorra
6,1,AGO,1,17.57817062,-12.33724746,24,Angola
7,1,ATG,1,-61.7999755,17.07761471,28,Antigua and Barbuda
8,1,AZE,1,50.01064725,40.39229544,31,Azerbaijan
9,1,ARG,1,-65.14563274,-35.19446255,32,Argentina


#### Call the endpoint of the SDG API that provides the list of hierarchical groupings of geographic Areas:

In [9]:
def geoAreaTree():
    http = urllib3.PoolManager()
    response = http.request('GET', "https://unstats.un.org/SDGAPI/v1/sdg/GeoArea/Tree")
    responseData = json.loads(response.data.decode('UTF-8'))
    
    return responseData

In [18]:
print(len(geoAreaTree()))
for t in geoAreaTree():
    print('root='+t['geoAreaName'])

7
root=World
root=Least Developed Countries (LDC)
root=Land Locked Developing Countries (LLDC)
root=Small Island Developing States (SIDS)
root=Developed Regions
root=Developing Regions
root=Custom groupings of data providers


#### Traverse a hierarchical tree of geographic areas and convert it to a parent-child hierarchy table:

In [10]:
def traverse(tree, parentCode=None, parentName=None):
    
    # print(' ' * traverse.level + 'Level: ' + str(traverse.level) + ', ' + tree['type'] + ', ' + str(tree['geoAreaCode']) + '-' + tree['geoAreaName'] )
    
    d = {}
    
    d['level'] = traverse.level
    d['type'] = tree['type']
    d['parentCode'] = parentCode
    d['parentName'] = parentName
    d['geoAreaCode'] = str(tree['geoAreaCode'])
    
    if tree['geoAreaCode'] == 807:
        d['geoAreaName'] = 'North Macedonia'
    else:
        d['geoAreaName'] = tree['geoAreaName']
    
    hierarchy.append(d)
        
    if tree['children']:
        for child in tree['children']:
            traverse.level += 1
            traverse(child, str(tree['geoAreaCode']), tree['geoAreaName'])
            traverse.level -= 1
            
    

    

            
def flatten(tree):
    global hierarchy
    hierarchy = []
    traverse.level = 1
    traverse(tree)
   
    #----Temporary Fix for missing countries----
    
    d_DRC = {'level' : 5,
             'type' : 'Country',
             'parentCode' : '17',
             'parentName' : 'Middle Africa',
             'geoAreaCode' : '180',
             'geoAreaName' : 'Democratic Republic of the Congo'
            }
    
    
    d_Tanzania = {'level' : 5,
                  'type' : 'Country',
                  'parentCode' : '14',
                  'parentName' : 'Eastern Africa',
                  'geoAreaCode' : '834',
                  'geoAreaName' : 'Tanzania, United Republic of'
                 }
    
   
    #----Temporary Fix for missing economic groups----
   
    d_910 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '910',
              'geoAreaName' : 'High income economies (WB)'
             }
    
    d_911 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '911',
              'geoAreaName' : 'Low income economies (WB)'
             }
    d_912 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '912',
              'geoAreaName' : 'Lower middle economies (WB)'
             }
    d_913 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '913',
              'geoAreaName' : 'Low and middle income economies (WB)'
             }
    d_914 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '914',
              'geoAreaName' : 'Upper middle economies (WB)'
             }
    
    d_135 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '135',
              'geoAreaName' : 'Caucasus and Central Asia'
            }
    
    d_127 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '127',
              'geoAreaName' : 'Southern Asia (excluding India)'
            }
            
    
    d_199 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '199',
              'geoAreaName' : 'Least Developed Countries (LDC)'
            }
    
    d_485 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '485',
              'geoAreaName' : 'Western Asia (exc. Armenia, Azerbaijan, Cyprus, Israel and Georgia)'
            }
    
    d_514 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '514',
              'geoAreaName' : 'Developed Regions'
            }
    
    d_515 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '515',
              'geoAreaName' : 'Developing Regions'
            }
    
    d_518 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '518',
              'geoAreaName' : 'Eastern Asia (excluding Japan)'
            }
    
    d_738 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '738',
              'geoAreaName' : 'Sub-Saharan Africa (inc. Sudan)'
            }
    
    d_746 = {'level' : 1,
              'type' : 'Group',
              'parentCode' : None,
              'parentName' : None,
              'geoAreaCode' : '746',
              'geoAreaName' : 'Northern Africa (exc. Sudan)'
            }
             
             
             
    
    
    hierarchy.append(d_DRC)
    hierarchy.append(d_Tanzania)
    hierarchy.append(d_910)
    hierarchy.append(d_911)
    hierarchy.append(d_912)
    hierarchy.append(d_913)
    hierarchy.append(d_914)
    hierarchy.append(d_135)
    hierarchy.append(d_127)
    hierarchy.append(d_199)
    hierarchy.append(d_485)
    hierarchy.append(d_514)
    hierarchy.append(d_515)
    hierarchy.append(d_518)
    hierarchy.append(d_738)
    hierarchy.append(d_746)

    
    return pd.DataFrame(hierarchy)

In [20]:
flatten(geoAreaTree()[0])

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type
0,1,World,1,,,Region
1,10,Antarctica,2,1,World,Country
2,2,Africa,2,1,World,Region
3,15,Northern Africa,3,2,Africa,Region
4,12,Algeria,4,15,Northern Africa,Country
5,818,Egypt,4,15,Northern Africa,Country
6,434,Libya,4,15,Northern Africa,Country
7,504,Morocco,4,15,Northern Africa,Country
8,729,Sudan,4,15,Northern Africa,Country
9,788,Tunisia,4,15,Northern Africa,Country


#### Merge coordinates and list of geographic areas in SDG database

In [11]:
def geoareas(tree,coordinates_file):
    x = pd.merge(flatten(tree),
         countryListXY(coordinates_file),
         how='outer',
         on=['geoAreaCode', 'geoAreaName'])
    
    return(x)

# x.to_excel('test.xlsx', engine ='xlsxwriter')

In [12]:
geoareas(geoAreaTree()[0], wd_dir + 'CountryListXY.txt')

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,Y
0,1,World,1,,,Region,,,,,
1,10,Antarctica,2,1,World,Country,1,ATA,0,21.47585697,-80.40897662
2,2,Africa,2,1,World,Region,,,,,
3,15,Northern Africa,3,2,Africa,Region,,,,,
4,12,Algeria,4,15,Northern Africa,Country,1,DZA,1,2.678164227,28.15940032
5,818,Egypt,4,15,Northern Africa,Country,1,EGY,1,29.77468534,26.57438204
6,434,Libya,4,15,Northern Africa,Country,1,LBY,1,18.0295985,27.04042819
7,504,Morocco,4,15,Northern Africa,Country,1,MAR,1,-6.281942841,31.8440131
8,729,Sudan,4,15,Northern Africa,Country,1,SDN,0,29.95650159,16.02643066
9,788,Tunisia,4,15,Northern Africa,Country,1,TUN,1,9.572737411,34.11439971


### Get the list of goals, targets, indicators and series

In [13]:
def series_list(release):
    
    series_list = []

    # Call the endpoint of the SDG API that provides the list of goals with all their children:
    http = urllib3.PoolManager()
    response = http.request('GET', "https://unstats.un.org/SDGAPI/v1/sdg/Goal/List?includechildren=true")
    responseData = json.loads(response.data.decode('UTF-8'))
    
    # Define the list of 'keys' to be extracted into a flat dictionary from the response:
    keys = ["goalCode", 
            "goalDesc",
            "targetCode",
            "targetDesc",
            "indicatorCode",
            "indicatorDesc",
            "indicatorTier",
            "seriesCode",
            "seriesDesc",
            "seriesRelease"
           ]    
    
    # Iterate over goals, targets, indicators, and series for the specified release:
    for g in responseData:
        for t in g['targets']:
            for i in t['indicators']:
                for s in i['series']:
                    if s['release'] == release:
                        values = [g['code'], g['title'],
                                  t['code'], t['description'], 
                                  i['code'], i['description'], i['tier'], 
                                  s['code'], s['description'], s['release']]

                        keys_and_values = zip(keys, values)
                        serie_dic = {}
                        for key, value in keys_and_values:
                            serie_dic[key] = value
                        series_list.append(serie_dic)
                        
    series_list = pd.DataFrame(series_list)
    
    return series_list[['goalCode', 'goalDesc',
                       'targetCode', 'targetDesc',
                       'indicatorCode', 'indicatorDesc','indicatorTier', 
                       'seriesCode', 'seriesDesc', 'seriesRelease']]

In [24]:
series_list(release).head(4)

Unnamed: 0,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_DAY1,Proportion of population below international p...,2019.Q1.G.02
1,1,End poverty in all its forms everywhere,1.1,"By 2030, eradicate extreme poverty for all peo...",1.1.1,Proportion of population below the internation...,1,SI_POV_EMP1,Employed population below international povert...,2019.Q1.G.02
2,1,End poverty in all its forms everywhere,1.2,"By 2030, reduce at least by half the proportio...",1.2.1,Proportion of population living below the nati...,1,SI_POV_NAHC,Proportion of population living below the nati...,2019.Q1.G.02
3,1,End poverty in all its forms everywhere,1.3,Implement nationally appropriate social protec...,1.3.1,Proportion of population covered by social pro...,1,SI_COV_MATNL,[ILO] Proportion of mothers with newborns rece...,2019.Q1.G.02


#### Cross-Join geographic area catalogue (with coordinates) and series catalogue

In [14]:
def cross_geo_series(geoTree, countryXY, seriesCode, release):
    df1 = geoareas(geoTree, countryXY)
    df2 = series_list(release)
    
    df2 = df2.loc[df2['seriesCode']==seriesCode]

    df1['key'] = 1
    df2['key'] = 1

    x = pd.merge(df1, df2,on='key')
    x.drop(['key'], axis=1, inplace=True)
    
    return x

In [15]:
cross_geo_series(geoAreaTree()[0], wd_dir + 'CountryListXY.txt','SE_ACC_COMP','2019.Q1.G.02')

Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,...,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,World,1,,,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
1,10,Antarctica,2,1,World,Country,1,ATA,0,21.47585697,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
2,2,Africa,2,1,World,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
3,15,Northern Africa,3,2,Africa,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
4,12,Algeria,4,15,Northern Africa,Country,1,DZA,1,2.678164227,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
5,818,Egypt,4,15,Northern Africa,Country,1,EGY,1,29.77468534,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
6,434,Libya,4,15,Northern Africa,Country,1,LBY,1,18.0295985,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
7,504,Morocco,4,15,Northern Africa,Country,1,MAR,1,-6.281942841,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
8,729,Sudan,4,15,Northern Africa,Country,1,SDN,0,29.95650159,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
9,788,Tunisia,4,15,Northern Africa,Country,1,TUN,1,9.572737411,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02


### Get the data for each series

#### Verify how many pages need to be requested to get all the data for a specific series from the SDG API. 

In [16]:
def series_request_details(seriesCode,release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    pageSize = 500
    nPages = math.floor(responseData['totalElements'] / pageSize) + 1
    totalElements = responseData['totalElements']
    
    return {'series' : seriesCode,
            'totalElements' : totalElements,
            'nPages' : nPages, 
            'pageSize' : pageSize
           }


In [28]:
series_request_details('SE_ACC_COMP', '2019.Q1.G.02')

{'series': 'SE_ACC_COMP', 'totalElements': 1414, 'nPages': 3, 'pageSize': 500}

#### Explore the code lists of the attributes and dimensions of a series
Describe each attribute or dimension as a simple dictionary made of a set of `code`-`description` pairs.  For the code, use the SDMX code, and not the internal codeof the database.  Keep all labels in camelCase.

In [17]:
def series_code_lists(seriesCode, release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2"
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    series_attributes = responseData['attributes']
    series_dimensions = responseData['dimensions']
    
    new_dict = {}
    
    new_dict['seriesCode'] = seriesCode
    
    for a in series_attributes:
        codelist_dict = {}
        for c in a['codes']:
            codelist_dict[c['sdmx']] = c['description']
        new_dict[camelCase(a['id'])] = codelist_dict
    
    for d in series_dimensions:
        codelist_dict = {}
        for c in d['codes']:
            codelist_dict[c['sdmx']] = c['description']
        new_dict[camelCase(d['id'])] = codelist_dict
        
    return new_dict


In [30]:
series_code_lists('SE_ACC_COMP', '2019.Q1.G.02')

{'seriesCode': 'SE_ACC_COMP',
 'nature': {'C': 'Country data',
  'CA': 'Country adjusted data',
  'E': 'Estimated data',
  'G': 'Global monitoring data',
  'M': 'Modeled data',
  'N': 'Non-relevant',
  'NA': 'Data nature not available'},
 'units': {'PERCENT': 'Percentage'},
 'educationLevel': {'GRAD23': 'Grades 2/3',
  'LOWSEC': 'Lower secondary',
  'PRIMAR': 'Primary',
  'PREPRI': 'Pre-primary',
  'SECOND': 'Secondary',
  'UPPSEC': 'Upper secondary'},
 'reportingType': {'N': 'National', 'G': 'Global'}}

#### Simplify further by presenting all the codes and their descriptions in a single table:

In [18]:
def series_code_lists2(seriesCode, release):
    
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + "&pageSize=2" 
    
    http = urllib3.PoolManager()
    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))
    
    series_attributes = responseData['attributes']
    series_dimensions = responseData['dimensions']
    
    code_list = []
    
    for a in series_attributes:
       
        for c in a['codes']:
            new_dict = {}
            new_dict['series'] = seriesCode
            new_dict['role'] = 'attribute'
            new_dict['concept'] = camelCase(a['id'])
            new_dict['code'] = c['sdmx']
            new_dict['description'] = c['description']
            code_list.append(new_dict)
        
    for d in series_dimensions:
        for c in d['codes']:
            new_dict = {}
            new_dict['series'] = seriesCode
            new_dict['role'] = 'dimension'
            new_dict['concept'] = camelCase(d['id'])
            new_dict['code'] = c['sdmx']
            new_dict['description'] = c['description']
            code_list.append(new_dict)
        
    return pd.DataFrame(code_list)


In [32]:
series_code_lists2('SE_ACC_COMP', '2019.Q1.G.02')

Unnamed: 0,code,concept,description,role,series
0,C,nature,Country data,attribute,SE_ACC_COMP
1,CA,nature,Country adjusted data,attribute,SE_ACC_COMP
2,E,nature,Estimated data,attribute,SE_ACC_COMP
3,G,nature,Global monitoring data,attribute,SE_ACC_COMP
4,M,nature,Modeled data,attribute,SE_ACC_COMP
5,N,nature,Non-relevant,attribute,SE_ACC_COMP
6,,nature,Data nature not available,attribute,SE_ACC_COMP
7,PERCENT,units,Percentage,attribute,SE_ACC_COMP
8,GRAD23,educationLevel,Grades 2/3,dimension,SE_ACC_COMP
9,LOWSEC,educationLevel,Lower secondary,dimension,SE_ACC_COMP


#### Build query string to collect data for a specific series from the global SDG API

In [19]:
def series_query(seriesCode, release, page, pageSize):
    queryString =  r'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + '&releaseCode=' + release + '&page=' + str(page) + '&pageSize=' + str(pageSize)
    http = urllib3.PoolManager()
    response = http.request('GET', queryString)
    responseData =  json.loads(response.data.decode('UTF-8'))
    return(responseData)

#### Get data for a specific series from the API
*(!) Notice that a data point may appear more than once if it belongs to a "multi-purpose indicator"*

In [20]:
def series_data(seriesCode, release):
    x = series_request_details(seriesCode,release)
    series_data = []
    if x['totalElements'] > 0:
        for p in range(x['nPages']):
            print("---Series " + seriesCode + ": Processing page " + str(p+1) + " of " + str(x['nPages']))
            responseData =  series_query(seriesCode, release, p+1, x['pageSize'])
            if len(responseData['data'])>0:
                series_data = series_data + responseData['data'] 
    return series_data

In [35]:
series_data('SE_ACC_COMP','2019.Q1.G.02')[0]

---Series SE_ACC_COMP: Processing page 1 of 3
---Series SE_ACC_COMP: Processing page 2 of 3
---Series SE_ACC_COMP: Processing page 3 of 3


{'goal': ['4'],
 'target': ['4.a'],
 'indicator': ['4.a.1'],
 'series': 'SE_ACC_COMP',
 'seriesDescription': 'Schools with access to\xa0computers for pedagogical purposes, by education level (%)',
 'seriesCount': '1414',
 'geoAreaCode': '8',
 'geoAreaName': 'Albania',
 'timePeriodStart': 2017.0,
 'value': '83.13492',
 'valueType': 'Float',
 'time_detail': None,
 'source': 'UIS',
 'footnotes': [''],
 'attributes': {'Nature': 'C', 'Units': 'PERCENT'},
 'dimensions': {'Education level': 'UPPSEC', 'Reporting Type': 'G'}}

#### Flatten the dictionary, extracting individual attributes and dimensions as key-value pairs in their own right.
Also convert the years (`timePeriod`) variable to `int`

In [25]:
def flat_series_data(seriesCode,release):
    new_x = []
    for d in series_data(seriesCode,release):
        new_d = {}
        for key, value in d.items():
            if type(value) is list:
                new_d[key] = ', '.join(value)
            elif type(value) is dict:
                for k, v in value.items():
                    new_d[camelCase(k+' Code')] = v
            elif key == 'time_detail':
                new_d[camelCase(key)] = value
            elif key == 'timePeriodStart':
                new_d['timePeriod'] = int(value)
            elif key == 'series':
                new_d['seriesCode'] = value
            elif key == 'seriesDescription':
                new_d['seriesDesc'] = value
            elif key == 'geoAreaCode':
                new_d['geoAreaCode'] = str(value)
            else:
                new_d[key] = value
        new_x.append(new_d)
    return new_x

In [37]:
flat_series_data('SE_ACC_COMP','2019.Q1.G.02')[0]

---Series SE_ACC_COMP: Processing page 1 of 3
---Series SE_ACC_COMP: Processing page 2 of 3
---Series SE_ACC_COMP: Processing page 3 of 3


{'goal': '4',
 'target': '4.a',
 'indicator': '4.a.1',
 'seriesCode': 'SE_ACC_COMP',
 'seriesDesc': 'Schools with access to\xa0computers for pedagogical purposes, by education level (%)',
 'seriesCount': '1414',
 'geoAreaCode': '8',
 'geoAreaName': 'Albania',
 'timePeriod': 2017,
 'value': '83.13492',
 'valueType': 'Float',
 'timeDetail': None,
 'source': 'UIS',
 'footnotes': '',
 'natureCode': 'C',
 'unitsCode': 'PERCENT',
 'educationLevelCode': 'UPPSEC',
 'reportingTypeCode': 'G'}

#### Convert flattened dictionary of data series to pandas data frame, remove goal/target/indicator columns, and remove duplicate values

In [21]:
def dataframe_unique_series_data(seriesCode,release):
    x = pd.DataFrame(flat_series_data(seriesCode,release))
    x.drop(['goal','target','indicator', 'seriesCount'], axis=1, inplace=True)
    x.drop_duplicates(inplace=True)

    return x

In [None]:
x = dataframe_unique_series_data('SE_ACC_COMP','2019.Q1.G.02')
x.to_excel('test2.xlsx', engine ='xlsxwriter')

#### Add descriptions to coded dimension and attributes

In [22]:
geo_series = cross_geo_series(geoAreaTree()[0], wd_dir + 'CountryListXY.txt','SE_ACC_COMP','2019.Q1.G.02')
geo_series.head(10)


Unnamed: 0,geoAreaCode,geoAreaName,level,parentCode,parentName,type,CountryProfile,ISO3CD,UNMember,X,...,goalCode,goalDesc,targetCode,targetDesc,indicatorCode,indicatorDesc,indicatorTier,seriesCode,seriesDesc,seriesRelease
0,1,World,1,,,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
1,10,Antarctica,2,1.0,World,Country,1.0,ATA,0.0,21.47585697,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
2,2,Africa,2,1.0,World,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
3,15,Northern Africa,3,2.0,Africa,Region,,,,,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
4,12,Algeria,4,15.0,Northern Africa,Country,1.0,DZA,1.0,2.678164227,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
5,818,Egypt,4,15.0,Northern Africa,Country,1.0,EGY,1.0,29.77468534,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
6,434,Libya,4,15.0,Northern Africa,Country,1.0,LBY,1.0,18.0295985,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
7,504,Morocco,4,15.0,Northern Africa,Country,1.0,MAR,1.0,-6.281942841,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
8,729,Sudan,4,15.0,Northern Africa,Country,1.0,SDN,0.0,29.95650159,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02
9,788,Tunisia,4,15.0,Northern Africa,Country,1.0,TUN,1.0,9.572737411,...,4,Ensure inclusive and equitable quality educati...,4.a,Build and upgrade education facilities that ar...,4.a.1,Proportion of schools with access to: (a) elec...,1,SE_ACC_COMP,Schools with access to computers for pedagogic...,2019.Q1.G.02


In [32]:
set(geo_series['indicatorCode'])

{'4.a.1'}

In [23]:
def annotated_data(geoTree, countryXY, seriesCode, release):
    
    geo_series = cross_geo_series(geoTree, countryXY, seriesCode, release)
    
    codes = series_code_lists2(seriesCode, release)
    data = dataframe_unique_series_data(seriesCode, release)
    
    concepts = set(codes.concept)
    
    for c in concepts:
        code_list = codes.loc[codes['concept']==c][['code','description']]
        code_list.rename(columns={'code': c+'Code', 'description': c+'Desc'}, inplace=True)
        
        if c+'Code' in data.columns:    #This accounts for missing dimension in API (it's a workaround)
            data = pd.merge(data,
                         code_list,
                         how='left',
                         on=[c+'Code'])
    
    data.drop(['seriesDesc'], axis=1, inplace=True)
    data.drop(['geoAreaName'], axis=1, inplace=True)
         
    data = pd.merge(geo_series,
                    data,
                    how = 'right',
                    on=['seriesCode', 'geoAreaCode']
                   )
    
    c_series = ['goalCode', 'goalDesc', 
                'targetCode', 'targetDesc', 
                'indicatorCode', 'indicatorDesc', 'indicatorTier', 
                'seriesCode', 'seriesDesc', 'seriesRelease']
    
    c_geo  = ['geoAreaCode', 'geoAreaName',
              'level', 'parentCode', 'parentName',
              'type', 'CountryProfile', 'ISO3CD', 'UNMember', 'X', 'Y']
    
    
    c_time = ['timePeriod', 'timeDetail']
    
    c_values = ['value', 'valueType', 'unitsCode', 'unitsDesc']
    
    c_fn = ['natureCode', 'natureDesc', 'footnotes', 'source']
    
    fixed_columns = c_series + c_geo + c_time + c_values + c_fn
    
    c_dim = [c for c in data.columns if c not in fixed_columns]
    
    
    return data[c_series + c_geo + c_dim + c_time + c_values + c_fn]
    

In [None]:
x = annotated_data(geoAreaTree()[0], wd_dir + 'CountryListXY.txt','ER_REG_UNFCIM','2019.Q1.G.02')
print(x.columns)
print(x.shape)
x.head(4)

In [35]:
data['indicatorCode']

0       6.4.1
1       6.4.1
2       6.4.1
3       6.4.1
4       6.4.1
5       6.4.1
6       6.4.1
7       6.4.1
8       6.4.1
9       6.4.1
10      6.4.1
11      6.4.1
12      6.4.1
13      6.4.1
14      6.4.1
15      6.4.1
16      6.4.1
17      6.4.1
18      6.4.1
19      6.4.1
20      6.4.1
21      6.4.1
22      6.4.1
23      6.4.1
24      6.4.1
25      6.4.1
26      6.4.1
27      6.4.1
28      6.4.1
29      6.4.1
        ...  
1054    6.4.1
1055    6.4.1
1056    6.4.1
1057    6.4.1
1058    6.4.1
1059    6.4.1
1060    6.4.1
1061    6.4.1
1062    6.4.1
1063    6.4.1
1064    6.4.1
1065    6.4.1
1066    6.4.1
1067    6.4.1
1068    6.4.1
1069    6.4.1
1070    6.4.1
1071    6.4.1
1072    6.4.1
1073    6.4.1
1074    6.4.1
1075    6.4.1
1076      NaN
1077      NaN
1078      NaN
1079      NaN
1080      NaN
1081      NaN
1082      NaN
1083      NaN
Name: indicatorCode, Length: 1084, dtype: object

#### Produce 'long' files for each indicator/series combination
(Notice that multi-purpose indicators need to be split)

In [26]:
for s in list(set(series_list('2019.Q1.G.02').seriesCode)):
    data = annotated_data(geoAreaTree()[0], wd_dir + 'CountryListXY.txt',s,'2019.Q1.G.02')
    
    indicators = list(set(data.indicatorCode))
    
    for i in indicators:
        x = data.loc[data.indicatorCode == i]
        x.to_excel(wd_dir+'data/csv/long_'+i+ '_' + s + '.xlsx', engine ='xlsxwriter')

---Series ER_H2O_WUEYST: Processing page 1 of 3
---Series ER_H2O_WUEYST: Processing page 2 of 3
---Series ER_H2O_WUEYST: Processing page 3 of 3


TypeError: can only concatenate str (not "float") to str

In [None]:
series_list('2019.Q1.G.02')

In [28]:
i

nan