# Parse an ILO data structure definition

### Load required libraries

In [1]:
import xmltodict
import json
import requests 
import xml.etree.ElementTree as ET 

### Interactive shell

In [2]:
# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Get the DSD from the API endpoint, save it as an XML file on disk, and parse it using `xmltodict.parse`

In [3]:
def get_dsd(dataflow, timeout=60):
    """Download the data structure definition (DSD) of an ILO dataflow and parse it.

    The XML message is saved to ``DSD__<dataflow>.xml`` in the current working
    directory (overwriting any previous copy) and then parsed with
    ``xmltodict.parse``.

    Parameters
    ----------
    dataflow : str
        Identifier appended to the ILO ``datastructure`` endpoint,
        e.g. ``'YI_ALL_EAP_DWAP_SEX_AGE_GEO_RT'``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The parsed document as returned by ``xmltodict.parse``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    base_url = 'https://www.ilo.org/sdmx/rest/datastructure/ILO/'

    # URL of the API endpoint returning the data structure definition;
    # `references=all` is passed so that related artefacts are included too.
    url = base_url + dataflow + '?references=all'

    # Create the HTTP response object from the given URL.
    resp = requests.get(url, timeout=timeout)

    # Fail loudly on HTTP errors instead of saving and parsing an error page.
    resp.raise_for_status()

    xml_path = 'DSD__' + dataflow + '.xml'

    # Save the XML message into a file on disk.
    with open(xml_path, 'wb') as f:
        f.write(resp.content)

    with open(xml_path, encoding="utf-8") as fd:
        doc = xmltodict.parse(fd.read())

    return doc

In [4]:

# Download and parse the DSD of one dataflow; the cells below explore its contents.
test_dsd = get_dsd('YI_ALL_EAP_DWAP_SEX_AGE_GEO_RT')


## Explore the contents of the first levels of the xml file

In [5]:
# Root level of the parsed document.
test_dsd.keys()

odict_keys(['message:Structure'])

In [6]:
# Children of the root element: namespace attributes, header and structures.
test_dsd['message:Structure'].keys()

odict_keys(['@xmlns:message', '@xmlns:structure', '@xmlns:common', 'message:Header', 'message:Structures'])

In [7]:
# Artefact groups contained in the structures element.
test_dsd['message:Structure']['message:Structures'].keys()

odict_keys(['structure:OrganisationSchemes', 'structure:Dataflows', 'structure:Codelists', 'structure:Concepts', 'structure:DataStructures'])

### 1. Get the data pertaining to the dataflow

In [8]:
def dataflow_info(dsd):
    """Extract the identifying information of the dataflow from a parsed DSD.

    Parameters
    ----------
    dsd : dict
        Parsed document as returned by ``get_dsd``.

    Returns
    -------
    dict
        Keys ``dataflow_id``, ``dataflow_agencyID``, ``dataflow_name`` and
        ``dataflow_description``. Name and description are the English
        (``xml:lang == 'en'``) texts, or ``None`` when no English text exists.
    """

    def english_text(entries):
        # xmltodict returns a single dict when an element occurs once and a
        # list of dicts when it is repeated; normalise both to a list.
        if entries is None:
            return None
        if not isinstance(entries, list):
            entries = [entries]
        text = None
        for entry in entries:
            if isinstance(entry, dict) and entry.get('@xml:lang') == 'en':
                text = entry.get('#text')
        return text

    dataflow = dsd['message:Structure']['message:Structures']['structure:Dataflows']['structure:Dataflow']

    return {'dataflow_id': dataflow['@id'],
            'dataflow_agencyID': dataflow['@agencyID'],
            'dataflow_name': english_text(dataflow.get('common:Name')),
            'dataflow_description': english_text(dataflow.get('common:Description'))}

In [9]:
# Reuse the DSD parsed above instead of downloading it again.
dataflow_info(test_dsd)

{'dataflow_id': 'DF_YI_ALL_EAP_DWAP_SEX_AGE_GEO_RT',
 'dataflow_agencyID': 'ILO',
 'dataflow_name': 'Labour force participation rate by sex, age and rural / urban areas',
 'dataflow_description': 'The labour force participation rate is the labour force as a percent of the working age population.'}

### 2. Get the code lists pertaining to the dataflow

In [10]:
def codelists(dsd):
    """Extract the code lists contained in a parsed DSD.

    Parameters
    ----------
    dsd : dict
        Parsed document as returned by ``get_dsd``.

    Returns
    -------
    list of dict
        One dictionary per code list with keys ``id``, ``name`` (English name,
        or ``None`` when there is none) and ``codes``. ``codes`` is a list of
        ``{'code': ..., 'description': ...}`` dictionaries holding the English
        name of each code; codes without an English name are left out.
    """

    def as_list(node):
        # xmltodict returns a single dict when an element occurs once and a
        # list of dicts when it is repeated; normalise both (and a missing
        # element) to a list.
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    def english_text(entries):
        text = None
        for entry in as_list(entries):
            if isinstance(entry, dict) and entry.get('@xml:lang') == 'en':
                text = entry.get('#text')
        return text

    raw_codelists = dsd['message:Structure']['message:Structures']['structure:Codelists']['structure:Codelist']

    result = []

    for cl in as_list(raw_codelists):

        codes = []
        for code in as_list(cl.get('structure:Code')):
            description = english_text(code.get('common:Name'))
            if description is not None:
                codes.append({'code': code['@id'], 'description': description})

        # The name is resolved per code list, so a list without an English
        # name gets None rather than the name of the previous list.
        result.append({'id': cl['@id'],
                       'name': english_text(cl.get('common:Name')),
                       'codes': codes})

    return result


In [11]:
# Show the first code list; reuse the DSD parsed above instead of downloading it again.
codelists(test_dsd)[0]

{'id': 'CL_AGE',
 'name': 'Classification: AGE',
 'codes': [{'code': 'AGE_AGGREGATE', 'description': 'Aggregate age bands'},
  {'code': 'AGE_YTHADULT', 'description': 'Youth and adults'},
  {'code': 'AGE_AGGREGATE_TOTAL', 'description': 'Total'},
  {'code': 'AGE_AGGREGATE_YLT15', 'description': '<15'},
  {'code': 'AGE_AGGREGATE_Y15-24', 'description': '15-24'},
  {'code': 'AGE_AGGREGATE_Y25-54', 'description': '25-54'},
  {'code': 'AGE_AGGREGATE_Y55-64', 'description': '55-64'},
  {'code': 'AGE_AGGREGATE_YGE65', 'description': '65+'},
  {'code': 'AGE_YTHADULT_YGE15', 'description': '15+'},
  {'code': 'AGE_YTHADULT_Y15-64', 'description': '15-64'},
  {'code': 'AGE_YTHADULT_Y15-24', 'description': '15-24'},
  {'code': 'AGE_YTHADULT_YGE25', 'description': '25+'}]}

### 3. Get the concepts (and their respective code list ids) pertaining to the dataflow

In [12]:
def concepts(dsd):
    """Extract the concepts of all concept schemes contained in a parsed DSD.

    Parameters
    ----------
    dsd : dict
        Parsed document as returned by ``get_dsd``.

    Returns
    -------
    list of dict
        One dictionary per concept with keys ``id``, ``name`` (English name,
        or ``None`` when there is none) and ``code_list`` (id of the code list
        referenced by the concept's core representation, or ``None`` when the
        concept has no enumerated representation).
    """

    def as_list(node):
        # xmltodict returns a single dict when an element occurs once and a
        # list of dicts when it is repeated; normalise both (and a missing
        # element) to a list.
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    def english_text(entries):
        text = None
        for entry in as_list(entries):
            if isinstance(entry, dict) and entry.get('@xml:lang') == 'en':
                text = entry.get('#text')
        return text

    concept_schemes = dsd['message:Structure']['message:Structures']['structure:Concepts']['structure:ConceptScheme']

    result = []

    for scheme in as_list(concept_schemes):

        for c in as_list(scheme.get('structure:Concept')):

            # A core representation may be absent, empty, or lack an
            # enumeration; only an enumeration points to a code list.
            representation = c.get('structure:CoreRepresentation') or {}
            enumeration = representation.get('structure:Enumeration')

            result.append({'id': c['@id'],
                           'name': english_text(c.get('common:Name')),
                           'code_list': enumeration['Ref']['@id'] if enumeration else None})

    return result


In [13]:
# Reuse the DSD parsed above instead of downloading it again.
concepts(test_dsd)

[{'id': 'SEX', 'name': 'Sex', 'code_list': 'CL_SEX'},
 {'id': 'AGE', 'name': 'Age', 'code_list': 'CL_AGE'},
 {'id': 'GEO', 'name': 'Area type', 'code_list': 'CL_GEO'},
 {'id': 'COLLECTION', 'name': 'Data collection', 'code_list': 'CL_COLLECTION'},
 {'id': 'REF_AREA', 'name': 'Reference area', 'code_list': 'CL_AREA'},
 {'id': 'FREE_TEXT_NOTE',
  'name': 'Free text note; should only be used when a note cannot be coded or in the case of multiple values for one note type',
  'code_list': None},
 {'id': 'FREQ', 'name': 'Frequency', 'code_list': 'CL_FREQ'},
 {'id': 'OBS_VALUE', 'name': 'Observation value', 'code_list': None},
 {'id': 'TIME_PERIOD', 'name': 'Time period', 'code_list': None},
 {'id': 'DECIMALS', 'name': 'Decimals', 'code_list': None},
 {'id': 'SURVEY', 'name': 'Survey', 'code_list': 'CL_SURVEY'},
 {'id': 'OBS_STATUS',
  'name': 'Observation status',
  'code_list': 'CL_OBS_STATUS'},
 {'id': 'UNIT_MEASURE_TYPE',
  'name': 'Unit of measure type',
  'code_list': 'CL_UNIT_MEASURE_T

### 4. Get data structures

In [14]:
def structure(dsd):
    """List the component ids of the data structure contained in a parsed DSD.

    Parameters
    ----------
    dsd : dict
        Parsed document as returned by ``get_dsd``.

    Returns
    -------
    dict
        Keys ``dimensions``, ``attributes`` and ``measures``, each mapping to a
        list of component ids. Only ``structure:Dimension``,
        ``structure:Attribute`` and ``structure:PrimaryMeasure`` elements are
        collected; a component list that is absent yields an empty list.
    """

    def component_ids(component_list, tag):
        if not component_list:
            return []
        components = component_list.get(tag)
        if components is None:
            return []
        # xmltodict returns a single dict when an element occurs once and a
        # list of dicts when it is repeated; normalise both to a list.
        if not isinstance(components, list):
            components = [components]
        return [component['@id'] for component in components]

    components = dsd['message:Structure']['message:Structures']['structure:DataStructures']['structure:DataStructure']['structure:DataStructureComponents']

    return {'dimensions': component_ids(components.get('structure:DimensionList'), 'structure:Dimension'),
            'attributes': component_ids(components.get('structure:AttributeList'), 'structure:Attribute'),
            'measures': component_ids(components.get('structure:MeasureList'), 'structure:PrimaryMeasure')}

In [15]:
# Reuse the DSD parsed above instead of downloading it again.
structure(test_dsd)

{'dimensions': ['COLLECTION',
  'REF_AREA',
  'FREQ',
  'SURVEY',
  'MEASURE',
  'SEX',
  'AGE',
  'GEO'],
 'attributes': ['S3',
  'T3',
  'C12',
  'I13',
  'S4',
  'R1',
  'T5',
  'C6',
  'S14',
  'I11',
  'S5',
  'T2',
  'I8',
  'OBS_STATUS',
  'UNIT_MEASURE_TYPE',
  'UNIT_MEASURE',
  'UNIT_MULT',
  'FREE_TEXT_NOTE',
  'DECIMALS',
  'SOURCE',
  'INDICATOR'],
 'measures': ['OBS_VALUE']}

### 5. Pack everything together

In [16]:
def parse_dsd(dsd):
    """Combine every extracted part of a parsed DSD into one dictionary.

    The result starts from the dictionary returned by ``dataflow_info`` and
    gains the keys ``concepts``, ``structure`` and ``codelists``, each holding
    the output of the function of the same name.
    """
    parsed = dataflow_info(dsd)
    parsed.update(
        concepts=concepts(dsd),
        structure=structure(dsd),
        codelists=codelists(dsd),
    )
    return parsed



In [17]:
# Reuse the DSD parsed above instead of downloading it again.
parse_dsd(test_dsd)

{'dataflow_id': 'DF_YI_ALL_EAP_DWAP_SEX_AGE_GEO_RT',
 'dataflow_agencyID': 'ILO',
 'dataflow_name': 'Labour force participation rate by sex, age and rural / urban areas',
 'dataflow_description': 'The labour force participation rate is the labour force as a percent of the working age population.',
 'concepts': [{'id': 'SEX', 'name': 'Sex', 'code_list': 'CL_SEX'},
  {'id': 'AGE', 'name': 'Age', 'code_list': 'CL_AGE'},
  {'id': 'GEO', 'name': 'Area type', 'code_list': 'CL_GEO'},
  {'id': 'COLLECTION',
   'name': 'Data collection',
   'code_list': 'CL_COLLECTION'},
  {'id': 'REF_AREA', 'name': 'Reference area', 'code_list': 'CL_AREA'},
  {'id': 'FREE_TEXT_NOTE',
   'name': 'Free text note; should only be used when a note cannot be coded or in the case of multiple values for one note type',
   'code_list': None},
  {'id': 'FREQ', 'name': 'Frequency', 'code_list': 'CL_FREQ'},
  {'id': 'OBS_VALUE', 'name': 'Observation value', 'code_list': None},
  {'id': 'TIME_PERIOD', 'name': 'Time period'