# Set up

## Check configuration
The cell below should print the path to the Python interpreter of the project's virtual environment (`.venv`).

In [17]:
import sys
# Print the interpreter running this kernel so it can be checked against the
# project's virtual environment.
print(sys.executable)
# Optional debugging aid: uncomment to also list the first entries of the
# module search path.
# print('\n'.join(sys.path[:6]))

c:\Users\WB499754\wb-projects\my_packages\tidysdmx\.venv\Scripts\python.exe


## Load libraries

In [41]:
# Automatically reload modules before execution of each cell
# so when you edit src/mypackage/*.py in your editor and rerun cells, 
# changes appear immediately.
%reload_ext autoreload
%autoreload 2

# python
from __future__ import annotations

# Standard library
from pathlib import Path

# Third-party
import pandas as pd
import pysdmx as px
import pickle as pkl

# Custom
## Functions
from tidysdmx import filter_tidy_raw, validate_dataset_local, map_structures, infer_schema, infer_role_dimension


## Define globals

In [5]:
# --- TLS toggle -------------------------------------------------------------
# CAUTION! FOR TESTING ONLY. DO NOT USE IN PRODUCTION.
# Presumably meant to switch off HTTPS certificate verification; note that
# `os` is not imported by the imports cell, so uncommenting also needs
# `import os`.
# os.environ["PYTHONHTTPSVERIFY"] = "0"

# --- FMR endpoint -----------------------------------------------------------
fmr_url = "https://fmrqa.worldbank.org/FMR/sdmx/v2"

# --- Artefact coordinates, given as (agency, id, version) -------------------
# Raw schema: describes the tidy data as loaded from the source.
raw_structure_agency, raw_structure_id, raw_structure_version = "WB", "IFPRI_ASTI", "1.0"
# Dissemination schema: describes the data after mapping.
dis_structure_agency, dis_structure_id, dis_structure_version = "WB.DATA360", "DS_DATA360", "1.3"

# Identifier of the structure map going from the raw to the dissemination schema.
raw_structure_map = "SM_IFPRI_ASTI_TO_DATA360"

# --- Input data -------------------------------------------------------------
# Relative path to the raw tidy CSV sample.
path_to_data = Path("../../tests/fixtures/data/ifpri_asti_sample.csv")

## Initiate API client

In [4]:
# Echo the registry endpoint so the cell output records which FMR instance is used.
print(fmr_url)
# Create the pysdmx registry client; later cells reuse it to fetch the schemas
# and the structure map.
client = px.api.fmr.RegistryClient(fmr_url)
# Bare last expression: shows the client's repr as the cell output.
client

https://fmrqa.worldbank.org/FMR/sdmx/v2


<pysdmx.api.fmr.RegistryClient at 0x2217963d210>

# STEP 1 - Load raw data (in tidy format)


We start this notebook with tidy data that has already been fetched from the source and gone through basic cleaning and reshaping. This part (fetching, cleaning, reshaping) will always be specific to a given dataset. It is therefore not possible to standardize this part of a pipeline, and it will not be covered in this notebook.

In [6]:
# Read the raw tidy CSV referenced by `path_to_data` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path_to_data)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,INDICATOR,NOTE,AREA,TIME_PERIOD,OBS_VALUE
0,EXP_CAP_TOT_SHRE,Shares are based on data for DARSS only.,SWZ,2012,0.0
1,EXP_CAP_TOT_SHRE,Shares are based on data for IRAF only.,GAB,2012,0.0
2,EXP_CAP_TOT_SHRE,Shares are based on data for DAR only.,LSO,2012,0.0
3,EXP_CAP_TOT_SHRE,Shares are based on data for INERA only. Capit...,BFA,2012,0.0
4,EXP_CAP_TOT_SHRE,Shares are based on data for IER only. Salarie...,MLI,2012,0.0


# STEP 2 - Get metadata artefacts from FMR

## Get raw dataset schema
Information about the expected tidy data schema is stored in the FMR. We will fetch this information using the pysdmx library. This information will be used for early validation of the tidy data. It could also be used to detect non-structural changes in the source data over time, for instance the addition or removal of indicators. This is not yet covered in this notebook.

In [8]:
# Fetch the schema of the raw data structure from the FMR, using the
# coordinates defined in the globals cell.
raw_schema = client.get_schema(
    "datastructure",
    agency=raw_structure_agency,
    id=raw_structure_id,
    version=raw_structure_version,
)
raw_schema

Schema(context='datastructure', agency='WB', id='IFPRI_ASTI', components=Components(data=[Component(id='AREA', required=True, role=Role.DIMENSION, concept=Concept(id='AREA', urn='urn:sdmx:org.sdmx.infomodel.conceptscheme.Concept=WB:CS_WB(1.0.0).AREA', name='Area', dtype=DataType.STRING, enum_ref='urn:sdmx:org.sdmx.infomodel.codelist.Codelist=WB:CL_AREA(1.0)'), local_dtype=DataType.STRING, name='Area', local_codes=Codelist(id='CL_AREA', name='World Bank Reference Area Code List', version='2.0', agency='WB', items=[Code(id='NZL', name='New Zealand', description='New Zealand'), Code(id='FJI', name='Fiji', description='Fiji'), Code(id='PNG', name='Papua New Guinea', description='Papua New Guinea'), Code(id='GLP', name='Guadeloupe', description='Guadeloupe'), Code(id='STP', name='Sao Tome and Principe', description='Sao Tome and Principe'), Code(id='MHL', name='Marshall Islands', description='Marshall Islands'), Code(id='WLF', name='Wallis-et-Futuna (Fr.)', description='Wallis and Futuna'),

## Get structure map
Mapping information from the raw data schema to the dissemination schema is also stored in the FMR. That information can easily be retrieved from the FMR API using `pysdmx`.

In [10]:
# Fetch the structure map that translates the raw schema into the
# dissemination schema. The identifiers come from the "Define globals" cell
# instead of being repeated here, so pointing the notebook at another dataset
# only requires editing that cell.
# NOTE(review): assumes the structure map is maintained by the same agency as
# the raw structure ("WB" today) — confirm if another agency ever owns it.
sm = client.get_mapping(raw_structure_agency, raw_structure_map)
sm

StructureMap(id='SM_IFPRI_ASTI_TO_DATA360', name='Structure map from IFPRI_ASTI to DATA360', version='1.0', agency='WB', source='urn:sdmx:org.sdmx.infomodel.datastructure.DataStructure=WB:IFPRI_ASTI(1.0)', target='urn:sdmx:org.sdmx.infomodel.datastructure.DataStructure=WB.DATA360:DS_DATA360(1.3)', maps=(ImplicitComponentMap(source='TIME_PERIOD', target='TIME_PERIOD'), ImplicitComponentMap(source='OBS_VALUE', target='OBS_VALUE'), ComponentMap(source='INDICATOR', target='SEX', values=RepresentationMap(id='RM_IFPRI_ASTI_TO_DATA360_SEX', name='Sex mapping for IFPRI_ASTI to DATA360', version='1.0', agency='WB', source='urn:sdmx:org.sdmx.infomodel.codelist.Codelist=WB:CL_IFPRI_ASTI_INDICATORS(1.0)', target='urn:sdmx:org.sdmx.infomodel.codelist.Codelist=SDMX:CL_SEX(2.1)', maps=[ValueMap(source='RES_FEMALE_TOT_FTE', target='F'), ValueMap(source='RES_MALE_TOT_FTE', target='M'), ValueMap(source='RES_TOT_FTE', target='_T')])), ImplicitComponentMap(source='AREA', target='REF_AREA'), MultiComponent

## Get dissemination dataset schema

In [11]:
# Fetch the schema of the dissemination data structure from the FMR, using
# the coordinates defined in the globals cell.
dis_schema = client.get_schema(
    "datastructure",
    agency=dis_structure_agency,
    id=dis_structure_id,
    version=dis_structure_version,
)
dis_schema



# STEP 3 - Check, filter, validate raw data

Information contained in the SDMX artefacts (schema + content constraints) can be used to validate raw data, flag new / removed indicators, and filter rows to be further processed. 

## Filter out rows that are not needed

In [12]:
# Filter the raw frame against the raw schema. `df` is rebound to the
# filtered result, so every later cell works on the filtered rows.
# NOTE: because the input and output share the name `df`, re-running this
# cell filters data that was already filtered; re-run the load cell first
# when a clean start is needed.
df = filter_tidy_raw(
    df=df,
    schema=raw_schema,
)
df.head(n=5)

Unnamed: 0,INDICATOR,NOTE,AREA,TIME_PERIOD,OBS_VALUE
11029,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,1992,108.960606
11030,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,1982,74.49
11031,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,2020,
11032,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,2021,
11033,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,2002,145.604849


## Validate cleaned-up raw data before mapping

In [13]:
# Validate the filtered raw data against the raw schema fetched from the FMR.
# NOTE(review): `sdmx_cols=[]` is passed through unchanged; its exact meaning
# is defined by tidysdmx — confirm.
errors = validate_dataset_local(df, schema=raw_schema, sdmx_cols=[])
# Display the result so validation problems are visible before mapping,
# instead of being computed and silently ignored.
errors

# STEP 4: Map data from raw to dissemination schema

In [14]:
# Apply the structure map to the filtered raw data to obtain the frame in
# the dissemination layout.
out = map_structures(
    df=df,
    structure_map=sm,
)
out.head(n=5)

Unnamed: 0,INDICATOR,NOTE,AREA,TIME_PERIOD,OBS_VALUE,COMP_BREAKDOWN_1,COMP_BREAKDOWN_2,COMP_BREAKDOWN_3,REF_AREA,SEX,URBANISATION
11029,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,1992,108.960606,_Z,_Z,_Z,MUS,_T,_Z
11030,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,1982,74.49,_Z,_Z,_Z,MUS,_T,_Z
11031,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,2020,,_Z,_Z,_Z,MUS,_T,_Z
11032,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,2021,,_Z,_Z,_Z,MUS,_T,_Z
11033,RES_TOT_FTE,Includes estimates for expatriate researchers ...,MUS,2002,145.604849,_Z,_Z,_Z,MUS,_T,_Z


# STEP 5: Validate output

In [15]:
# Validate the mapped data against the dissemination schema and show the
# findings as the cell output.
dis_errors = validate_dataset_local(
    df=out,
    schema=dis_schema,
)
dis_errors

Unnamed: 0,Validation,Error
0,columns,Found unexpected column: NOTE
1,mandatory_columns,"Missing mandatory columns: {'DATABASE_ID', 'UN..."


In [None]:
# TEMPORARY: uncomment the line below to export the mapped data to CSV for manual inspection.
# out.to_csv('./data/formatted-data.csv', index=False)

# TESTING

In [43]:
# Experiment: build a schema from the DataFrame itself and display it.
my_schema = infer_schema(
    df,
    agency="WB",
    id="INFERRED_SCHEMA",
)
my_schema

# Scratch example kept for reference (not executed):
# df2 = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# tst=infer_role_dimension(df2, "value")
# tst

Schema(context='datastructure', agency='WB', id='INFERRED_SCHEMA', components=Components(data=[Component(id='INDICATOR', required=True, role=Role.DIMENSION, concept=Concept(id='INDICATOR', name='Indicator', dtype=DataType.STRING), local_dtype=DataType.STRING, name='Indicator', local_codes=Codelist(id='CL_INDICATOR', name='INDICATOR Codes', agency='WB', items=[Code(id='EXP_CAP_TOT_SHRE', name='EXP_CAP_TOT_SHRE'), Code(id='EXP_OPERAT_TOT_SHRE', name='EXP_OPERAT_TOT_SHRE'), Code(id='EXP_SALARIES_TOT_SHRE', name='EXP_SALARIES_TOT_SHRE'), Code(id='EXP_TOT_ARI_AGGDP', name='EXP_TOT_ARI_AGGDP'), Code(id='EXP_TOT_CONSTLCU_FTE', name='EXP_TOT_CONSTLCU_FTE'), Code(id='EXP_TOT_PPP_FTE', name='EXP_TOT_PPP_FTE'), Code(id='EXP_TOT_USD_FTE', name='EXP_TOT_USD_FTE'), Code(id='RES_31_40_TOT_SHRE', name='RES_31_40_TOT_SHRE'), Code(id='RES_41_50_TOT_SHRE', name='RES_41_50_TOT_SHRE'), Code(id='RES_51_60_TOT_SHRE', name='RES_51_60_TOT_SHRE'), Code(id='RES_BSC_TOT_FTE', name='RES_BSC_TOT_FTE'), Code(id='RES