# Pre-processing script
This script pulls SDG data from the UN SDG API and transforms it into CSV files.
The steps are:
- Pull data from [API](https://unstats.un.org/SDGAPI/swagger/) 
- Join with geography
- Save as a "long" table
- Pivot into "wide" format and split regional and country data

### Load necessary libraries

In [1]:
import csv
import json
import urllib3  # allows to access a URL with python
import math
import os
import collections
import numpy as np
import pandas as pd


### User parameters

In [2]:
# SDG database release to extract; make sure this is the correct release.
release = '2019.Q1.G.01'

# NOTE: '__file__' is a plain string literal here (not the module attribute),
# so the path is resolved against the current working directory.
dir_path = os.path.split(os.path.realpath('__file__'))[0]
print(dir_path)

# Folder that holds the input files, relative to the working directory.
wd_dir = '../'
print('data inputs dir: ', wd_dir, sep='')

/home/jovyan/FIS4SDGs/unsd/notebooks
data inputs dir: ../


### List to collect the codes of series that could not be written to file

In [3]:
# Collects the code of every series whose CSV file could not be written.
error_log = []

### List of countries to be plotted on a map (with XY coordinates)

In [4]:
# Read the tab-delimited country list; every row becomes a plain dict keyed by
# the column names found in the header line of the file.
countryListXY = []
with open(wd_dir + 'CountryListXY.txt', newline='', encoding='latin-1') as countryFile:
    countryList = csv.DictReader(countryFile, delimiter='\t')
    countryListXY.extend(dict(row) for row in countryList)
        
#print(countryListXY[1])
#for c in countryListXY:

In [5]:
# Display the first record to check how the file was parsed.
countryListXY[0]

{'X': '66.02688198',
 'Y': '33.83160199',
 'ISO3CD': 'AFG',
 'geoAreaCode': '4',
 'geoAreaName': 'Afghanistan',
 'UNMember': '1',
 'CountryProfile': '1'}

### Get the list of goals, targets, indicators and series

Start by creating a `PoolManager()` object using `urllib3`.

In [6]:
# Module-level pool manager; the request helpers in this notebook call http.request().
http = urllib3.PoolManager()

Call the endpoint of the SDG API that provides the list of goals with all their children

In [7]:
# One request returns every goal together with its nested children.
goals_url = "https://unstats.un.org/SDGAPI/v1/sdg/Goal/List?includechildren=true"
response = http.request('GET', goals_url)
responseData = json.loads(response.data.decode('UTF-8'))
#responseData[0]



Define the list of 'keys' to be extracted into a flat dictionary from the response

In [8]:
# Field names of one flat series record, in the order the values are collected.
keys = [
    "goalCode", "goalDesc",
    "targetCode", "targetDesc",
    "indicatorCode", "indicatorDesc", "indicatorTier",
    "seriesCode", "seriesDesc", "seriesRelease",
]

# Receives one dictionary per series of the selected release.
series_list = []

Traverse all goals, targets, indicators and series:

In [9]:
# Walk goals -> targets -> indicators -> series and keep one flat record for
# every series that belongs to the selected release.
for g in responseData:
    for t in g['targets']:
        for i in t['indicators']:
            for s in i['series']:
                if s['release'] != release:
                    continue

                values = [
                    g['code'], g['title'],
                    t['code'], t['description'],
                    i['code'], i['description'], i['tier'],
                    s['code'], s['description'], s['release'],
                ]
                # Pair every field name with the value at the same position.
                serie_dic = dict(zip(keys, values))
                series_list.append(serie_dic)

In [10]:
# Display the first two flat series records.
series_list[0:2]

[{'goalCode': '1',
  'goalDesc': 'End poverty in all its forms everywhere',
  'targetCode': '1.1',
  'targetDesc': 'By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than $1.25 a day',
  'indicatorCode': '1.1.1',
  'indicatorDesc': 'Proportion of population below the international poverty line, by sex, age, employment status and geographical location (urban/rural)',
  'indicatorTier': '1',
  'seriesCode': 'SI_POV_DAY1',
  'seriesDesc': 'Proportion of population below international poverty line (%)',
  'seriesRelease': '2019.Q1.G.01'},
 {'goalCode': '1',
  'goalDesc': 'End poverty in all its forms everywhere',
  'targetCode': '1.1',
  'targetDesc': 'By 2030, eradicate extreme poverty for all people everywhere, currently measured as people living on less than $1.25 a day',
  'indicatorCode': '1.1.1',
  'indicatorDesc': 'Proportion of population below the international poverty line, by sex, age, employment status and geographical loc

### Get the data for each series

#### Verify how many pages need to be requested to get all the data for a specific series from the SDG API. 

In [11]:
def series_request_details(seriesCode):
    """Return the paging details needed to download all data of a series.

    A small request (two records) is sent to the ``Series/Data`` endpoint only
    to read ``totalElements``; the number of pages is then derived for the page
    size used by the actual download.

    Args:
        seriesCode: Code of the series, e.g. ``'SI_POV_DAY1'``.

    Returns:
        A dictionary with the keys ``series``, ``totalElements``, ``nPages``
        and ``pageSize``.
    """
    seriesRequest = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + "&pageSize=2"

    response = http.request('GET', seriesRequest)
    responseData = json.loads(response.data.decode('UTF-8'))

    pageSize = 500
    totalElements = responseData['totalElements']

    # Ceiling division. The former floor(total / pageSize) + 1 requested one
    # page too many whenever the total was an exact multiple of the page size
    # (and reported one page for an empty series).
    nPages = math.ceil(totalElements / pageSize)

    return {'series': seriesCode,
            'totalElements': totalElements,
            'nPages': nPages,
            'pageSize': pageSize
            }


In [12]:
# Try the helper on the first series of series_list and display the result.
x = series_request_details(series_list[0]['seriesCode'])
x



{'series': 'SI_POV_DAY1', 'totalElements': 1393, 'nPages': 3, 'pageSize': 500}

#### Get the list of attributes and dimensions for the series, with their respective code lists:

In [13]:
# Small request (two records) for one sample series; only the attribute and
# dimension definitions of the response are kept, as module-level variables.
seriesRequest = ('https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode='
                 'SI_POV_DAY1'
                 "&pageSize=2")

response = http.request('GET', seriesRequest)
responseData = json.loads(response.data.decode('UTF-8'))

series_attributes, series_dimensions = responseData['attributes'], responseData['dimensions']

series_attributes




[{'id': 'Nature',
  'codes': [{'code': 'G',
    'description': 'Global monitoring data',
    'sdmx': 'G'}]},
 {'id': 'Units',
  'codes': [{'code': 'PERCENT',
    'description': 'Percentage',
    'sdmx': 'PERCENT'}]}]

In [14]:
def series_code_lists(seriesCode):
    """Collect the code lists of every attribute and dimension of a series.

    Returns a dictionary whose ``'series'`` entry is the series code and whose
    remaining entries map each attribute/dimension id to a
    ``{sdmx code: description}`` dictionary.
    """
    url = 'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=' + seriesCode + "&pageSize=2"
    payload = json.loads(http.request('GET', url).data.decode('UTF-8'))

    attribute_defs = payload['attributes']
    dimension_defs = payload['dimensions']

    code_lists = {'series': seriesCode}

    # Attributes are stored first, so a dimension with the same id replaces them.
    for concept_defs in (attribute_defs, dimension_defs):
        for concept in concept_defs:
            code_lists[concept['id']] = {
                code['sdmx']: code['description'] for code in concept['codes']
            }

    return code_lists


In [15]:
# Try the helper on the first series of series_list and display the result.
x = series_code_lists(series_list[0]['seriesCode'])
x



{'series': 'SI_POV_DAY1',
 'Nature': {'G': 'Global monitoring data'},
 'Units': {'PERCENT': 'Percentage'},
 'Reporting Type': {'N': 'National', 'G': 'Global'}}

#### Build query string to collect data for a specific series from the global SDG API

In [16]:
def series_query(seriesCode, page, pageSize):
    """Request one page of a series from the SDG API and return the decoded JSON."""
    queryString = ''.join([
        r'https://unstats.un.org/SDGAPI/v1/sdg/Series/Data?seriesCode=', seriesCode,
        '&page=', str(page),
        '&pageSize=', str(pageSize),
    ])
    raw = http.request('GET', queryString).data
    return json.loads(raw.decode('UTF-8'))

#### Get data for a specific series from the API
*(!) Notice that a data point may appear more than once if it belongs to a "multi-purpose indicator"*

In [17]:
def series_data(seriesCode):
    """Download all pages of a series and return its records as one list.

    An empty list is returned when the API reports no elements for the series.
    """
    details = series_request_details(seriesCode)
    records = []

    if not (details['totalElements'] > 0):
        return records

    total_pages = details['nPages']
    # The pages requested from the API are numbered starting at 1.
    for page in range(1, total_pages + 1):
        print("---Series " + seriesCode + ": Processing page " + str(page) + " of " + str(total_pages))
        page_rows = series_query(seriesCode, page, details['pageSize'])['data']
        if len(page_rows) > 0:
            records = records + page_rows

    return records

In [18]:
#x = series_data(series_list[0]['seriesCode'])
# Download every page of one series as a quick check of the helper.
x = series_data('SI_POV_DAY1')




---Series SI_POV_DAY1: Processing page 1 of 3




---Series SI_POV_DAY1: Processing page 2 of 3




---Series SI_POV_DAY1: Processing page 3 of 3




#### Flatten the dictionary, extracting individual attributes and dimensions as key-value pairs in their own right.

In [19]:
def flat_series_data(seriesCode):
    """Return the records of a series as flat dictionaries.

    List values are joined into one comma-separated string, the entries of
    nested dictionaries are promoted to top-level keys, and every other value
    is copied unchanged.
    """
    flattened = []
    for observation in series_data(seriesCode):
        row = {}
        for field, content in observation.items():
            kind = type(content)
            if kind is list:
                row[field] = ', '.join(content)
            elif kind is dict:
                # The nested keys replace the parent key in the flat record.
                row.update(content)
            else:
                row[field] = content
        flattened.append(row)
    return flattened

In [20]:
# Display the first flattened record of one series.
flat_series_data('SI_POV_DAY1')[0]



---Series SI_POV_DAY1: Processing page 1 of 3




---Series SI_POV_DAY1: Processing page 2 of 3




---Series SI_POV_DAY1: Processing page 3 of 3




{'goal': '1',
 'target': '1.1',
 'indicator': '1.1.1',
 'series': 'SI_POV_DAY1',
 'seriesDescription': 'Proportion of population below international poverty line (%)',
 'seriesCount': '1393',
 'geoAreaCode': '8',
 'geoAreaName': 'Albania',
 'timePeriodStart': 1996.0,
 'value': '1.1',
 'valueType': 'Float',
 'time_detail': None,
 'source': 'World Bank, Development Research Group. Data are based on primary household survey data obtained from government statistical agencies and World Bank country departments. Data for high-income economies are from the Luxembourg Income Study database. For more information and methodology, please see PovcalNet (http://iresearch.worldbank.org/PovcalNet/index.htm). Accessed 01 November 2018, World Development Indicators Database.',
 'footnotes': '',
 'Nature': 'G',
 'Units': 'PERCENT',
 'Reporting Type': 'G'}

In [None]:
def _code_descriptions(concepts):
    """Index code lists as ``{concept id: {code: description}}``."""
    return {
        concept['id']: {c['code']: c['description'] for c in concept['codes']}
        for concept in concepts
    }


def _column_prefix(name):
    """Title-case *name* and drop all whitespace ('Reporting Type' -> 'ReportingType')."""
    return ''.join(ch for ch in name.title() if not ch.isspace())


def series_dataset(seriesCode, indicatorCode):
    """Build the "long" table of one series/indicator pair and write it to CSV.

    Every observation becomes an ordered record that holds, in this order:

    * the series metadata found in ``series_list`` (first letter of each key
      upper-cased, e.g. ``SeriesCode``);
    * one ``<Name>_Code`` / ``<Name>_Desc`` pair per dimension and attribute;
    * the observation fields ``GeoArea_Code``, ``GeoArea_Desc``, ``Year``,
      ``TimeDetail``, ``Value``, ``ValueType``, ``Source`` and ``Footnotes``;
    * ``ISO3CD``, ``X`` and ``Y`` of the matching ``countryListXY`` entry
      (``None`` when the area is not in that list).

    The records are written to ``data/csv/<indicatorCode>-<seriesCode>_long.csv``
    relative to the working directory. When the file cannot be written the
    series code is appended to ``error_log``.

    Args:
        seriesCode: Code of the series to download.
        indicatorCode: Only observations whose first indicator code equals
            this value are kept.

    Returns:
        None.
    """
    # Keep only the observations reported under the requested indicator; this
    # avoids duplicates for series shared by several ("multipurpose") indicators.
    observations = [d for d in series_data(seriesCode) if d['indicator'][0] == indicatorCode]

    if not observations:
        # Nothing to write; previously this case ended in a NameError.
        print('=====SERIES ' + seriesCode + ' HAS NO DATA FOR INDICATOR ' + indicatorCode + '=====')
        return

    # Metadata of the requested series. The former code read a leaked global
    # loop variable, which is unrelated to the arguments of this call.
    series_info = next(
        (entry for entry in series_list
         if entry['seriesCode'] == seriesCode and entry['indicatorCode'] == indicatorCode),
        {'indicatorCode': indicatorCode, 'seriesCode': seriesCode},
    )

    # Code lists of THIS series (the module-level series_attributes and
    # series_dimensions only describe the sample series they were read from).
    code_lists = series_query(seriesCode, 1, 2)
    dimension_descs = _code_descriptions(code_lists['dimensions'])
    attribute_descs = _code_descriptions(code_lists['attributes'])

    # One dictionary lookup per record instead of a scan of the whole list.
    geo_lookup = {xy['geoAreaCode']: xy for xy in countryListXY}

    # (field of the API record, column name in the output table)
    data_items = (
        ('geoAreaCode', 'GeoArea_Code'),
        ('geoAreaName', 'GeoArea_Desc'),
        ('timePeriodStart', 'Year'),
        ('time_detail', 'TimeDetail'),
        ('value', 'Value'),
        ('valueType', 'ValueType'),
        ('source', 'Source'),
        ('footnotes', 'Footnotes'),
    )

    records = []
    for obs in observations:
        record = collections.OrderedDict()

        for key, value in series_info.items():
            record[key[0].upper() + key[1:]] = value

        for descs, coded_values in ((dimension_descs, obs['dimensions']),
                                    (attribute_descs, obs['attributes'])):
            for name, code in coded_values.items():
                prefix = _column_prefix(name)
                record[prefix + "_Code"] = code
                # None when the code is not in the code list (the description
                # was previously left unbound or stale in that case).
                record[prefix + "_Desc"] = descs.get(name, {}).get(code)

        for item, column in data_items:
            value = obs[item]
            if item == 'footnotes':
                # Join into a new string; the downloaded record is not modified.
                value = "; ".join(value)
            record[column] = value

        # Always emit the three geography columns so that every record has the
        # same keys, whether or not the area is part of the country list.
        geo = geo_lookup.get(record['GeoArea_Code'], {})
        for column in ('ISO3CD', 'X', 'Y'):
            record[column] = geo.get(column)

        records.append(record)

    # Union of all keys in first-seen order, in case records differ in their
    # dimensions or attributes.
    columns = collections.OrderedDict()
    for record in records:
        columns.update(dict.fromkeys(record))

    # os.path.join keeps the 'data\csv\...' location on Windows and also yields
    # a valid path on other platforms.
    out_path = os.path.join('data', 'csv', indicatorCode + "-" + seriesCode + '_long.csv')

    try:
        with open(out_path, 'w', newline='') as outfile:
            fp = csv.DictWriter(outfile, list(columns), quoting=csv.QUOTE_NONNUMERIC)
            fp.writeheader()
            fp.writerows(records)
    except Exception as err:
        # Deliberately broad: a failing series is logged and must not abort a
        # run over many series.
        error_log.append(seriesCode)
        print('=====SERIES ' + seriesCode + ' COULD NOT BE WRITTEN TO FILE=====')
        print('     reason: ' + repr(err))
    else:
        print('=====FINISHED WRITING SERIES ' + seriesCode + ' TO FILE=====')
        
