# Parse an ILO's SDMX dataflow and data structure definition

### Load required libraries

In [1]:
import xmltodict
import json
import requests 
import xml.etree.ElementTree as ET 

import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import io

import math

### Interactive shell

In [2]:
# https://volderette.de/jupyter-notebook-tip-multiple-outputs/
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### 1. Get data from API endpoint, save as xml file on disk, and parse using `xmltodict.parse`

In [3]:
def get_dsd(dataflow):

    baseURL = 'https://www.ilo.org/sdmx/rest/datastructure/ILO/'

    # url of API endpoint returning all the dissemination dataflows (i.e. indicators) with data available
    url = baseURL + dataflow + '?references=all' # 

    # creating HTTP response object from given url 
    resp = requests.get(url) 

    # saving the xml message into an xml file 
    with open('DSD__'+dataflow+'.xml', 'wb') as f: 
        f.write(resp.content) 
        
    with open('DSD__'+dataflow+'.xml', encoding="utf-8") as fd:
        doc = xmltodict.parse(fd.read())

    return doc

In [4]:

#test_dsd = get_dsd('YI_ALL_EAP_DWAP_SEX_AGE_GEO_RT')
#test_dsd = get_dsd('YI_ALL_EMP_TEMP_SEX_STE_DT')


#### Explore the contents of the first levels of the xml file

In [5]:
#test_dsd.keys()

In [6]:
#test_dsd['message:Structure'].keys()

In [7]:
#test_dsd['message:Structure']['message:Structures'].keys()

### 2. Get the data pertaining to the dataflow

In [8]:
def dataflow_info(dsd):

    dataflow = dsd['message:Structure']['message:Structures']['structure:Dataflows']['structure:Dataflow']
    #dataflow
    dataflow_id = dataflow['@id']
    dataflow_agencyID = dataflow['@agencyID']
    for dfn in dataflow['common:Name']:
        if dfn['@xml:lang'] == 'en':
            dataflow_name = (dfn['#text'])
    for dfn in dataflow['common:Description']:
        if dfn['@xml:lang'] == 'en':
            dataflow_description = (dfn['#text'])

    return {'dataflow_id': dataflow_id,
            'dataflow_agencyID' : dataflow_agencyID,
            'dataflow_name': dataflow_name,
            'dataflow_description' : dataflow_description}

In [9]:
#dataflow_info(get_dsd('YI_ALL_EMP_TEMP_SEX_STE_DT'))

### 3. Get the code lists pertaining to the dataflow

In [10]:
def codelists(dsd):

    codelists = []

    for cl in dsd['message:Structure']['message:Structures']['structure:Codelists']['structure:Codelist']:

        codelist = {}
        codelist_id = cl['@id']

        # Get the code list name in English:
        for cln in cl['common:Name']:
            if cln['@xml:lang'] == 'en':
                codelist_name = (cln['#text'])

        codelist['id'] = codelist_id
        codelist['name'] = codelist_name
        codelist['codes'] = []

        # Some code lists have only one code; in this case, they are returned as a single dictionary. Other code
        # lists have many codes, and in this case they are returned as lists of dictionaries. 

        if isinstance(cl['structure:Code'], list):
            for c in cl['structure:Code']:
                for cn in c['common:Name']:
                    if cn['@xml:lang'] == 'en':
                        code_desc = (cn['#text'])

                        codelist['codes'].append({'code' : c['@id'], 'description' : code_desc})
        else:
            for cn in cl['structure:Code']['common:Name']:
                if cn['@xml:lang'] == 'en':
                    code_desc = (cn['#text'])

                    codelist['codes'].append({'code': cl['structure:Code']['@id'], 'description' : code_desc})

        codelists.append(codelist)

    # Display first four code lists
    return codelists


In [37]:
#codelists(get_dsd('YI_ALL_EMP_PTER_SEX_RT'))[0]

### 4. Get the concepts (and their respective code list ids) pertaining to the data flow

In [54]:
dsd_test = get_dsd('YI_ALL_EMP_PTER_SEX_RT')

concept_schemes = dsd_test['message:Structure']['message:Structures']['structure:Concepts']['structure:ConceptScheme']

concepts = []

for cs in concept_schemes:
    
    if isinstance(cs['structure:Concept'], list):
        
        for c in cs['structure:Concept']:
            concept = {}

            concept['id'] = c['@id']

            # Get the concept scheme name in English:
            for cn in c['common:Name']:
                if cn['@xml:lang'] == 'en':
                    concept['name'] = (cn['#text'])
                    concept['scheme'] =  cs['@id']



            # Check if c contains any entry with key 'structure:CoreRepresentation' 
            if 'structure:CoreRepresentation' in c:
                concept['code_list'] = c['structure:CoreRepresentation']['structure:Enumeration']['Ref']['@id']
            else:
                concept['code_list'] = None

            concepts.append(concept)
        
    else:
        
        c = cs['structure:Concept']
        
        concept = {}

        concept['id'] = c['@id']

        # Get the concept scheme name in English:
        for cn in c['common:Name']:
            if cn['@xml:lang'] == 'en':
                concept['name'] = (cn['#text'])
                concept['scheme'] =  cs['@id']



        # Check if c contains any entry with key 'structure:CoreRepresentation' 
        if 'structure:CoreRepresentation' in c:
            concept['code_list'] = c['structure:CoreRepresentation']['structure:Enumeration']['Ref']['@id']
        else:
            concept['code_list'] = None

        concepts.append(concept)
        
concepts
        


[{'id': 'SEX',
  'name': 'Sex',
  'scheme': 'CS_CLASSIF_TYPE',
  'code_list': 'CL_SEX'},
 {'id': 'COLLECTION',
  'name': 'Data collection',
  'scheme': 'CS_ILOSTAT',
  'code_list': 'CL_COLLECTION'},
 {'id': 'REF_AREA',
  'name': 'Reference area',
  'scheme': 'CS_ILOSTAT',
  'code_list': 'CL_AREA'},
 {'id': 'FREE_TEXT_NOTE',
  'name': 'Free text note; should only be used when a note cannot be coded or in the case of multiple values for one note type',
  'scheme': 'CS_ILOSTAT',
  'code_list': None},
 {'id': 'FREQ',
  'name': 'Frequency',
  'scheme': 'CS_ILOSTAT',
  'code_list': 'CL_FREQ'},
 {'id': 'OBS_VALUE',
  'name': 'Observation value',
  'scheme': 'CS_ILOSTAT',
  'code_list': None},
 {'id': 'TIME_PERIOD',
  'name': 'Time period',
  'scheme': 'CS_ILOSTAT',
  'code_list': None},
 {'id': 'DECIMALS',
  'name': 'Decimals',
  'scheme': 'CS_ILOSTAT',
  'code_list': None},
 {'id': 'SURVEY',
  'name': 'Survey',
  'scheme': 'CS_ILOSTAT',
  'code_list': 'CL_SURVEY'},
 {'id': 'OBS_STATUS',
  'n

In [55]:
def concepts(dsd):

    concept_schemes = dsd['message:Structure']['message:Structures']['structure:Concepts']['structure:ConceptScheme']

    concepts = []

    for cs in concept_schemes:

        if isinstance(cs['structure:Concept'], list):

            for c in cs['structure:Concept']:
                concept = {}

                concept['id'] = c['@id']

                # Get the concept scheme name in English:
                for cn in c['common:Name']:
                    if cn['@xml:lang'] == 'en':
                        concept['name'] = (cn['#text'])
                        concept['scheme'] =  cs['@id']

                # Check if c contains any entry with key 'structure:CoreRepresentation' 
                if 'structure:CoreRepresentation' in c:
                    concept['code_list'] = c['structure:CoreRepresentation']['structure:Enumeration']['Ref']['@id']
                else:
                    concept['code_list'] = None

                concepts.append(concept)

        else:

            c = cs['structure:Concept']

            concept = {}

            concept['id'] = c['@id']

            # Get the concept scheme name in English:
            for cn in c['common:Name']:
                if cn['@xml:lang'] == 'en':
                    concept['name'] = (cn['#text'])
                    concept['scheme'] =  cs['@id']
                    
            # Check if c contains any entry with key 'structure:CoreRepresentation' 
            if 'structure:CoreRepresentation' in c:
                concept['code_list'] = c['structure:CoreRepresentation']['structure:Enumeration']['Ref']['@id']
            else:
                concept['code_list'] = None

            concepts.append(concept)

    return concepts

In [57]:
#concepts(get_dsd('YI_ALL_EMP_PTER_SEX_RT'))

### 5. Get data structures

In [58]:
def structure(dsd):
        
    dimensionList =  dsd['message:Structure']['message:Structures']['structure:DataStructures']['structure:DataStructure']['structure:DataStructureComponents']['structure:DimensionList']
    attributeList =  dsd['message:Structure']['message:Structures']['structure:DataStructures']['structure:DataStructure']['structure:DataStructureComponents']['structure:AttributeList']
    measureList =  dsd['message:Structure']['message:Structures']['structure:DataStructures']['structure:DataStructure']['structure:DataStructureComponents']['structure:MeasureList']

    dimensions = []
    for d in dimensionList['structure:Dimension']:
        dimensions.append(d['@id'])                      
    #display(dimensions)


    attributes = []
    for a in attributeList['structure:Attribute']:
        attributes.append(a['@id'])                      
    #display(attributes)

    measures = []
    measures.append(measureList['structure:PrimaryMeasure']['@id'])
    #display(measures)

    structure = {'dimensions' : dimensions,
                 'attributes' : attributes,
                 'measures' : measures}

    return structure

In [59]:
#structure(get_dsd('YI_ALL_EMP_TEMP_SEX_STE_DT'))

### 6. Pack everything together

In [60]:
def parse_dsd(dsd):

    dsd2 = dataflow_info(dsd)

    dsd2['concepts'] = concepts(dsd)
    dsd2['structure'] = structure(dsd)
    dsd2['codelists'] = codelists(dsd)

    return(dsd2)



In [61]:
#parse_dsd(get_dsd('YI_ALL_EMP_TEMP_SEX_STE_DT'))

## Read data in csv format

In [62]:
def parse_data(dataflow,start,end):
    
    baseURL = 'https://www.ilo.org/sdmx/rest/data/ILO,DF_'
    url = baseURL + dataflow + '/?format=csv&startPeriod=' + start + '&endPeriod=' + end# 
    s=requests.get(url).content
    
    c=pd.read_csv(io.StringIO(s.decode('utf-8')))
    
    c['FREE_TEXT_NOTE'] = c['FREE_TEXT_NOTE'].astype(str)
    
    return c

In [63]:
#start = '2000-01-01'
#end = '2019-12-31'
#dataflow = 'YI_ALL_EMP_TEMP_SEX_STE_DT'

#x = parse_data(dataflow, start, end)
#x.head(5)
#x.columns

## Verify existence of duplicates due to multiple surveys, and remove duplicates if found

In [64]:
def remove_duplicates(dataflow,start,end):
    
    dimensions = parse_dsd(get_dsd(dataflow))['structure']['dimensions']
    dimensions.remove('SURVEY')
    dimensions.append('TIME_PERIOD')
    
    x = parse_data(dataflow, start, end)
    
    test1 = x[dimensions]
    display(test1.shape)
    test2 = x[dimensions].drop_duplicates()
    display(test2.shape)
    
    if test1.shape != test2.shape:
        display("There are instances of multiple surveys in the same year")
        idx = x.groupby(dimensions)['SURVEY'].transform(max) == x['SURVEY']
        x = x[idx]
    else:
        display("There are no instances of multiple surveys in the same year")
        
    return x

In [65]:
#start = '2000-01-01'
#end = '2019-12-31'
#dataflow = 'YI_ALL_EMP_TEMP_SEX_STE_DT'
#
#x = remove_duplicates(dataflow,start,end)
#x.shape
#x.columns
#x.head(10)

### Parse DSD for the dataflow

In [66]:
#dsd2 = parse_dsd(get_dsd(dataflow))
#dsd2.keys()

### Add code description to each coded dimension 

In [107]:
def add_descriptions(dataflow,start,end):

    x = remove_duplicates(dataflow,start,end)
    
    dsd = parse_dsd(get_dsd(dataflow))
                
    #----------------------------------------------
    # Add code description to each dimension
    #----------------------------------------------

    for d in dsd['structure']['dimensions']:
        for c in dsd['concepts']:
            if d != c['id']:
                continue
            codelist_id = c['code_list']

            #print('-----------------')
            #print(d)
            #print(codelist_id)
            #print('-----------------')

            for cl in dsd['codelists']:
                if codelist_id != cl['id']:
                    continue

                # if dimension is numerit, to str

                cl_dict = pd.DataFrame.from_dict(cl['codes'])
                cl_dict.columns = [d, d+'_DESC']

                if is_numeric_dtype(x[d]):
                    cl_dict[d] = cl_dict[d].astype(float)

                #cl_dict.head(5)

                x = pd.merge(x,
                     cl_dict,
                     on=d, 
                     how='left')
                
                   
    #----------------------------------------------
    # Add code description to each coded attribute
    #----------------------------------------------

    for a in dsd['structure']['attributes']:
        for c in dsd['concepts']:
            if a != c['id']:
                continue

            codelist_id =c['code_list']

            for cl in dsd['codelists']:

                if codelist_id == cl['id']:
                    cl_dict = pd.DataFrame.from_dict(cl['codes'])
                    cl_dict.columns = [a, a+'_DESC']

                    if is_numeric_dtype(x[a]):
                        try:
                            cl_dict[a] = cl_dict[a].astype(float)
                        except ValueError:
                            x[a].astype(str)
                            cl_dict[a].astype(str)

                    #cl_dict.head(5)
                    try:
                        x = pd.merge(x,
                             cl_dict,
                             on=a, 
                             how='left')
                    except ValueError:
                        print('did not merge')

    return x


In [108]:
start = '2000-01-01'
end = '2019-12-31'
dataflow = 'YI_ALL_EMP_PTER_SEX_RT'

x = add_descriptions(dataflow,start,end)
x.columns
x.head(5)


(3929, 6)

(3818, 6)

'There are instances of multiple surveys in the same year'

did not merge


Index(['DATAFLOW', 'COLLECTION', 'REF_AREA', 'FREQ', 'SURVEY', 'MEASURE',
       'SEX', 'TIME_PERIOD', 'OBS_VALUE', 'S3', 'T3', 'I13', 'T12', 'S4', 'R1',
       'T33', 'T5', 'S8', 'S14', 'I11', 'S7', 'T2', 'S5', 'OBS_STATUS',
       'UNIT_MEASURE_TYPE', 'UNIT_MEASURE', 'UNIT_MULT', 'FREE_TEXT_NOTE',
       'DECIMALS', 'SOURCE', 'INDICATOR', 'COLLECTION_DESC', 'REF_AREA_DESC',
       'FREQ_DESC', 'SURVEY_DESC', 'MEASURE_DESC', 'SEX_DESC', 'S3_DESC',
       'T3_DESC', 'I13_DESC', 'T12_DESC', 'S4_DESC', 'R1_DESC', 'T33_DESC',
       'T5_DESC', 'S8_DESC', 'S14_DESC', 'I11_DESC', 'S7_DESC', 'T2_DESC',
       'S5_DESC', 'UNIT_MEASURE_TYPE_DESC', 'UNIT_MEASURE_DESC',
       'UNIT_MULT_DESC', 'SOURCE_DESC', 'INDICATOR_DESC'],
      dtype='object')

Unnamed: 0,DATAFLOW,COLLECTION,REF_AREA,FREQ,SURVEY,MEASURE,SEX,TIME_PERIOD,OBS_VALUE,S3,...,S14_DESC,I11_DESC,S7_DESC,T2_DESC,S5_DESC,UNIT_MEASURE_TYPE_DESC,UNIT_MEASURE_DESC,UNIT_MULT_DESC,SOURCE_DESC,INDICATOR_DESC
0,ILO:DF_YI_ALL_EMP_PTER_SEX_RT(1.0),YI,USA,A,453,EMP_PTER_RT,SEX_T,2002,22.84,,...,,,,16 years old,,Rate,Percentage,Units,Labour force survey,Incidence of part-time employment by sex -- Co...
1,ILO:DF_YI_ALL_EMP_PTER_SEX_RT(1.0),YI,USA,A,453,EMP_PTER_RT,SEX_T,2003,23.14,,...,,,,16 years old,,Rate,Percentage,Units,Labour force survey,Incidence of part-time employment by sex -- Co...
2,ILO:DF_YI_ALL_EMP_PTER_SEX_RT(1.0),YI,USA,A,453,EMP_PTER_RT,SEX_T,2004,23.25,,...,,,,16 years old,,Rate,Percentage,Units,Labour force survey,Incidence of part-time employment by sex -- Co...
3,ILO:DF_YI_ALL_EMP_PTER_SEX_RT(1.0),YI,USA,A,453,EMP_PTER_RT,SEX_T,2005,22.77,,...,,,,16 years old,,Rate,Percentage,Units,Labour force survey,Incidence of part-time employment by sex -- Co...
4,ILO:DF_YI_ALL_EMP_PTER_SEX_RT(1.0),YI,USA,A,453,EMP_PTER_RT,SEX_T,2006,22.45,,...,,,,16 years old,,Rate,Percentage,Units,Labour force survey,Incidence of part-time employment by sex -- Co...


### Concatenate coded notes and free text footnote

In [109]:
def consolidate_footnotes(dataflow,start,end):
    
    x = add_descriptions(dataflow,start,end)
    dsd = parse_dsd(get_dsd(dataflow))
    
    for index, row in x.iterrows():
        footnote = ''
        for c in dsd['concepts']:
            if c['scheme'] != 'CS_NOTE_TYPE':
                continue
            # access data using column names
            if not math.isnan(row[c['id']]):
                footnote = footnote + c['name']+': '+row[c['id']+'_DESC'] + '. '

        if x.at[index, 'FREE_TEXT_NOTE'] == 'nan':
            x.at[index, 'FREE_TEXT_NOTE'] = footnote
        else:
             x.at[index, 'FREE_TEXT_NOTE'] = footnote + x.at[index, 'FREE_TEXT_NOTE']
                
    return x


In [110]:
#start = '2000-01-01'
#end = '2019-12-31'
#dataflow = 'YI_ALL_EMP_TEMP_SEX_STE_DT'
#
#x = consolidate_footnotes(dataflow,start,end)
#x.columns
#x.head(5)

### Select columns to save into excel

In [111]:
def transform(dataflow,start,end,sortv):
    
    x = consolidate_footnotes(dataflow,start,end)
    
    dsd = parse_dsd(get_dsd(dataflow))

    final_columns = ['DATAFLOW']

    # Add dimension columns:
    for d in dsd['structure']['dimensions']:
        final_columns.append(d)
        final_columns.append(d+'_DESC')

    # Add time dimension column:
    final_columns.append('TIME_PERIOD')

    # Add primary measure column:
    final_columns = final_columns + dsd['structure']['measures']

    # Add attribute columns (except coded notes):
    for a in dsd['structure']['attributes']:
        for c in dsd['concepts']:
            if c['scheme'] == 'CS_NOTE_TYPE':
                continue
            if a == c['id']:
                final_columns.append(a)
                if a+'_DESC' in x.columns:
                    final_columns.append(a+'_DESC')
    
    sort_vector = ['REF_AREA'] + sortv + ['TIME_PERIOD']

    x = x.sort_values(by=sort_vector)
    
    return x[final_columns]

In [112]:
start = '2000-01-01'
end = '2019-12-31'


parameters = [{'dataflow' : 'YI_ALL_EAP_DWAP_SEX_AGE_GEO_RT',
              'sorting' : ['SEX', 'AGE', 'GEO']},
              {'dataflow' : 'YI_ALL_EMP_TEMP_SEX_ECO_DT',
              'sorting' :  ['SEX','ECO']},
              {'dataflow' : 'YI_ALL_EMP_PTER_SEX_RT',
              'sorting' : ['SEX']},
              {'dataflow' : 'YI_ALL_EMP_TEMP_SEX_STE_DT',
              'sorting' : ['SEX', 'STE']}
             ]

                                    
for p in parameters:
    print(p['dataflow'])
    print(p['sorting'])
    x = transform(p['dataflow'],start,end, p['sorting'])
    print("---finished transforming")
    x.to_excel(p['dataflow'] + '_final' + '.xlsx', index=False)
    print("---finished writing to excel")




YI_ALL_EMP_PTER_SEX_RT
['SEX']


(3929, 6)

(3818, 6)

'There are instances of multiple surveys in the same year'

did not merge
---finished transforming
---finished writing to excel
YI_ALL_EMP_TEMP_SEX_STE_DT
['SEX', 'STE']


(47675, 7)

(45584, 7)

'There are instances of multiple surveys in the same year'

---finished transforming
---finished writing to excel
