# Data from UNESCO

Download CSV files from UNESCO's SDMX API.

## Load required libraries

In [38]:
import xmltodict
import json
import requests 
import os
from bs4 import BeautifulSoup
import re
import pandas as pd

import io

## Interactive shell

In [2]:
# Display the value of every top-level expression in a cell, not only the last one.
# Source: https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Use indicator mapping and dictionary to create new indicators

In [3]:
# Build the paths with os.path.join so the notebook also runs outside Windows
# (hard-coded '..\\data\\...' separators only resolve on Windows).
uis_indicators_dir = os.path.join('..', 'data', 'UNESCO_UIS_indicators')

# UIS data dictionary, sheet 'Students and Teachers'
unesco_data_dictionary = pd.read_excel(
    os.path.join(uis_indicators_dir, 'uis-data-dictionary-education-statistics.xlsx'),
    sheet_name='Students and Teachers')

# Mapping between the series of interest and the UIS indicator codes
indicator_mapping = pd.read_excel(os.path.join(uis_indicators_dir, 'MGSI - unesco.xlsx'))
indicator_mapping.head(5)


Unnamed: 0,Series,IndicatorDesc,Disaggregation,Code
0,20,"Youth literacy rate of persons (15-24 years), ...",Both_sexes,LR_Ag15t24
1,20,"Youth literacy rate of persons (15-24 years), ...",Female,LR_Ag15t24_F
2,20,"Youth literacy rate of persons (15-24 years), ...",Male,LR_Ag15t24_M
3,20gpi,Gender partity index of youth literacy rate of...,GPI,LR_Ag15t24_gpi
4,21,Adjusted net enrolment rate in primary educati...,Both_sexes,NERA_1_cp


## Query builder for selected indicators and disaggregations

In [4]:
# Align the mapping codes with the data dictionary's indicator ids:
# dots instead of underscores, all lower case (vectorised string operations).
indicator_mapping['Code'] = (
    indicator_mapping['Code']
    .str.replace('_', '.', regex=False)
    .str.lower()
)
indicator_mapping.head(5)

indicator_id = unesco_data_dictionary['Indicator ID'].str.lower()

New_list = []

print('--This will be the query parameters:')
unesco_data_dictionary.columns[6:-2]

for code in indicator_mapping['Code']:

    matches = unesco_data_dictionary[indicator_id == code]
    if matches.empty:
        # Name the offending code instead of failing with an anonymous out-of-bounds error
        raise IndexError('Indicator code not found in the data dictionary: ' + str(code))

    # Values of the query-parameter columns for the first matching dictionary row
    query_parameters = matches.iloc[0, 6:-2]

    # Dataflow reference followed by every parameter value, each terminated by a dot
    query = 'UNESCO,EDU_NON_FINANCE,3.0/' + ''.join(str(value) + '.' for value in query_parameters)

    New_list.append(query)

indicator_mapping['Query'] = New_list

print('--This is an example of a query construction:')
indicator_mapping['Query'][0]


Unnamed: 0,Series,IndicatorDesc,Disaggregation,Code
0,20,"Youth literacy rate of persons (15-24 years), ...",Both_sexes,lr.ag15t24
1,20,"Youth literacy rate of persons (15-24 years), ...",Female,lr.ag15t24.f
2,20,"Youth literacy rate of persons (15-24 years), ...",Male,lr.ag15t24.m
3,20gpi,Gender partity index of youth literacy rate of...,GPI,lr.ag15t24.gpi
4,21,Adjusted net enrolment rate in primary educati...,Both_sexes,nera.1.cp


--This will be the query parameters:


Index(['Statistical unit', 'Unit of measure', 'Level of education',
       'Orientation', 'Sex', 'Age', 'Grade', 'Type of institution',
       'Level of educational attainment', 'Wealth quintile', 'Location',
       'Type of education', 'Field of education', 'School subject',
       'Infrastructure', 'Socioeconomic background', 'Teaching experience',
       'Type of contract', 'Country/Region of origin', 'Destination region',
       'Immigration status'],
      dtype='object')

--This is an example of a query construction:


'UNESCO,EDU_NON_FINANCE,3.0/LR.PT._Z._Z._T.Y15T24._Z._Z._Z._Z._T._Z._Z._Z._Z._Z._Z._Z.W00.W00._Z.'

## Input API key

In [5]:
# Read the subscription key from the environment so that the credential is never
# stored in the notebook or in its version history.
# NOTE(review): the key that used to be hard-coded here has been exposed and should be rotated.
key = os.environ.get('UNESCO_UIS_API_KEY')
if not key:
    raise RuntimeError('Set the UNESCO_UIS_API_KEY environment variable to your UIS API subscription key.')

# Header carrying the subscription key on every request
headers = {'Ocp-Apim-Subscription-key': key}

### 1. Get `EDU_NON_FINANCE` data structure definition from API endpoint, save as xml file on disk, and parse using `xmltodict.parse`

In [83]:
def get_dsd(dataflow, headers):
    """Download, save and parse the data structure definition (DSD) of a dataflow.

    The structure message is requested from the UIS SDMX API, written to
    ``DSD__<dataflow>.xml`` in the current working directory and parsed with
    ``xmltodict``.

    Parameters
    ----------
    dataflow : str
        Identifier inserted in the request URL, e.g. ``'EDU_NON_FINANCE'``.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    dict
        The parsed XML document as returned by ``xmltodict.parse``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status; nothing is written to disk then.
    """
    baseURL = 'https://api.uis.unesco.org/sdmx/datastructure/UNESCO/'
    # url of the API endpoint returning the latest data structure of the dataflow,
    # in full detail and together with all the artefacts it references
    url = baseURL + dataflow + '/latest/?format=sdmx-2.1&detail=full&references=all&locale=en'

    # creating HTTP response object from given url
    resp = requests.get(url, headers=headers)
    # Stop on 4xx/5xx so an error message is not saved and parsed as if it were a DSD
    resp.raise_for_status()

    # saving the xml message into an xml file
    with open('DSD__' + dataflow + '.xml', 'wb') as f:
        f.write(resp.content)

    # Parse the bytes just received instead of reading the file back from disk
    return xmltodict.parse(resp.content)

In [84]:
# Parsed structure definition of the 'EDU_NON_FINANCE' dataflow, reused by the cells below
dsd = get_dsd(dataflow='EDU_NON_FINANCE', headers=headers)

#### Explore the contents of the first levels of the xml file

In [85]:
# Top-level keys of the parsed document (the root element of the message)
dsd.keys()

odict_keys(['mes:Structure'])

In [86]:
# Children of the root element: namespace attributes, header and structures
dsd['mes:Structure'].keys()

odict_keys(['@xmlns:xsi', '@xmlns:xml', '@xmlns:mes', '@xmlns:str', '@xmlns:com', 'mes:Header', 'mes:Structures'])

In [87]:
# Kinds of structures delivered in the message
dsd['mes:Structure']['mes:Structures'].keys()

odict_keys(['str:OrganisationSchemes', 'str:Dataflows', 'str:Codelists', 'str:Concepts', 'str:DataStructures'])

In [88]:
# Content of the data structures container
dsd['mes:Structure']['mes:Structures']['str:DataStructures'].keys()

odict_keys(['str:DataStructure'])

In [89]:
# Attributes and child elements of the data structure itself
dsd['mes:Structure']['mes:Structures']['str:DataStructures']['str:DataStructure'].keys()

odict_keys(['@urn', '@isExternalReference', '@agencyID', '@id', '@isFinal', '@version', 'com:Name', 'str:DataStructureComponents'])

### 2. Get the metadata (id, agency and name) of the dataflow

In [90]:
def dataflow_info(dsd):
    """Return the id, agency and name of the dataflow found in a parsed DSD.

    ``dsd`` is the nested mapping of a parsed structure message; the dataflow
    is read from ``mes:Structure > mes:Structures > str:Dataflows > str:Dataflow``.
    The result is a dictionary with the keys ``dataflow_id``,
    ``dataflow_agencyID`` and ``dataflow_name``.
    """
    flow = dsd['mes:Structure']['mes:Structures']['str:Dataflows']['str:Dataflow']

    return {
        'dataflow_id': flow['@id'],
        'dataflow_agencyID': flow['@agencyID'],
        'dataflow_name': flow['com:Name']['#text'],
    }

In [91]:
# Reuse the structure definition already held in `dsd` instead of downloading it again
dataflow_info(dsd)

{'dataflow_id': 'EDU_NON_FINANCE',
 'dataflow_agencyID': 'UNESCO',
 'dataflow_name': 'Education: Students and Teachers'}

### 3. Get the code lists pertaining to the dataflow

In [98]:
# Attributes and child elements of the first code list in the message
dsd['mes:Structure']['mes:Structures']['str:Codelists']['str:Codelist'][0].keys()


odict_keys(['@urn', '@isExternalReference', '@agencyID', '@id', '@isFinal', '@version', 'com:Name', 'com:Description', 'str:Code'])

In [109]:
def codelists(dsd):
    """Extract the code lists maintained by UNESCO from a parsed DSD.

    Parameters
    ----------
    dsd : dict
        Parsed structure message (nested mapping as produced by ``xmltodict.parse``).

    Returns
    -------
    list of dict
        One dictionary per code list whose agency is ``'UNESCO'``, with the keys
        ``id``, ``agency``, ``name``, ``desc`` (``None`` when the code list has
        no description) and ``codes``, a list of
        ``{'code': ..., 'description': ...}`` dictionaries.
    """

    def _as_list(node):
        # A single child element is parsed as one dictionary, repeated elements
        # as a list of dictionaries; normalise both (and a missing element) to a list.
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    result = []

    for cl in _as_list(dsd['mes:Structure']['mes:Structures']['str:Codelists']['str:Codelist']):
        # Code lists maintained by other agencies are skipped
        if cl['@agencyID'] != 'UNESCO':
            continue

        description = cl['com:Description']['#text'] if 'com:Description' in cl else None

        result.append({
            'id': cl['@id'],
            'agency': cl['@agencyID'],
            'name': cl['com:Name']['#text'],
            'desc': description,
            # Same keys ('str:Code', 'com:Name') whether the list holds one code or many
            'codes': [{'code': c['@id'], 'description': c['com:Name']['#text']}
                      for c in _as_list(cl.get('str:Code'))],
        })

    return result


In [111]:
# Show the first extracted code list as an example
codelists(dsd)[0]

{'id': 'CL_AGE',
 'agency': 'UNESCO',
 'name': 'Age',
 'desc': 'Age of the statistical units',
 'codes': [{'code': 'Y_LT15', 'description': 'Less than 15 years'},
  {'code': 'Y_GE65', 'description': '65 years and over'},
  {'code': 'Y0T1', 'description': '0-1 years'},
  {'code': '_T', 'description': 'Total'},
  {'code': 'Y15T24', 'description': '15-24 years'},
  {'code': 'Y25T64', 'description': '25-64 years'},
  {'code': 'Y_GE15', 'description': '15 years and over'},
  {'code': 'Y_GE25', 'description': '25 years and over'},
  {'code': 'Y_GE50', 'description': '50 years and over'},
  {'code': 'TH_ENTRY_AGE', 'description': 'Official entrance age'},
  {'code': 'SCH_AGE_GROUP', 'description': 'School-age population'},
  {'code': 'UNDER_AGE', 'description': 'Under age'},
  {'code': 'OVER_AGE', 'description': 'Over age'},
  {'code': '_Z', 'description': 'Not applicable'},
  {'code': 'UNDER1_AGE',
   'description': 'One year younger than official entry age'},
  {'code': 'OVER1_AGE',
   'des

### Get data from API endpoint

In [32]:
def buildUrl(indicatorCode, start = '2000', end = '2018'):
    """Build the URL that requests the data of one query as CSV.

    Parameters
    ----------
    indicatorCode : str
        Query string appended to the data endpoint (dataflow reference and series key).
    start, end : str or int
        First and last period requested, sent as ``startPeriod`` and ``endPeriod``.

    Returns
    -------
    str
        The complete request URL.
    """
    baseURL = 'https://api.uis.unesco.org/sdmx/data/'
    # `end` is used as given: no extra literal is appended to it
    return (baseURL + indicatorCode
            + '?startPeriod=' + str(start)
            + '&endPeriod=' + str(end)
            + '&format=csv&locale=en')

In [35]:
def getcsv(url, file):
    """Send a GET request to ``url`` and return the HTTP response object.

    The request is sent with the notebook-level ``headers``. ``file`` is
    accepted but not used by the current implementation.
    """
    return requests.get(url, headers=headers)
#    file_loc = '..\\data\\UNESCO_UIS_indicators\\' + file
#    # saving the xml message into an xml file 
#    with open(file_loc, 'wb') as f: 
#        f.write(resp.content) 
#    display('finished downloading ' + file)
        

In [36]:
def parse_data(indicatorCode, start = '2000', end = '2018'):
    """Download the data of one query as CSV and return it as a DataFrame.

    Parameters
    ----------
    indicatorCode : str
        Query string appended to the data endpoint (dataflow reference and series key).
    start, end : str or int
        First and last period requested, sent as ``startPeriod`` and ``endPeriod``.

    Returns
    -------
    pandas.DataFrame
        The CSV content of the response, read with ``pandas.read_csv``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    The request is sent with the notebook-level ``headers``.
    """
    baseURL = 'https://api.uis.unesco.org/sdmx/data/'
    url = (baseURL + indicatorCode
           + '?startPeriod=' + str(start)
           + '&endPeriod=' + str(end)
           + '&format=csv&locale=en')

    resp = requests.get(url, headers = headers)
    # Fail on 4xx/5xx instead of trying to read an error message as CSV
    resp.raise_for_status()

    return pd.read_csv(io.StringIO(resp.content.decode('utf-8')))

In [41]:
# Query of the first row of the mapping, used as a worked example
ic = indicator_mapping['Query'].loc[0]

# Download its observations and preview the first rows
parse_data(ic).head(5)

Unnamed: 0,STAT_UNIT,UNIT_MEASURE,EDU_LEVEL,EDU_CAT,SEX,AGE,GRADE,SECTOR_EDU,EDU_ATTAIN,WEALTH_QUINTILE,...,COUNTRY_ORIGIN,REGION_DEST,IMM_STATUS,REF_AREA,TIME_PERIOD,OBS_VALUE,UNIT_MULT,OBS_STATUS,FREQ,DECIMALS
0,LR,PT,_Z,_Z,_T,Y15T24,_Z,_Z,_Z,_Z,...,W00,W00,_Z,AE,2005,95.00645,0,A,A,5
1,LR,PT,_Z,_Z,_T,Y15T24,_Z,_Z,_Z,_Z,...,W00,W00,_Z,AF,2011,46.99005,0,A,A,5
2,LR,PT,_Z,_Z,_T,Y15T24,_Z,_Z,_Z,_Z,...,W00,W00,_Z,AF,2018,65.42055,0,E,A,5
3,LR,PT,_Z,_Z,_T,Y15T24,_Z,_Z,_Z,_Z,...,W00,W00,_Z,AIMS_ASIA_PAC,2000,86.68584,0,A,A,5
4,LR,PT,_Z,_Z,_T,Y15T24,_Z,_Z,_Z,_Z,...,W00,W00,_Z,AIMS_ASIA_PAC,2001,86.59168,0,A,A,5
