# DFx ETL Pipeline

## unicef.org

An ETL pipeline for the [UNICEF Indicator Data Warehouse](https://sdmx.data.unicef.org) published by the United Nations Children's Fund (UNICEF).

### Libraries

In [None]:
import pandas as pd
from tqdm import tqdm

from dfpp.sources import unicef_org as source
from dfpp.storage import AzureStorage as Storage

# Dataflow reference handed to the source's metadata and data requests.
# NOTE(review): looks like an SDMX "agency,dataflow,version" triple — confirm.
DATAFLOW = "UNICEF,GLOBAL_DATAFLOW,1.0"
# Storage client used by the Load step to publish the transformed datasets.
# NOTE(review): presumably reads its connection settings from the environment — confirm.
storage = Storage()

### Extract

In [2]:
# Fetch the catalogue of series available in the dataflow. Its "series_id"
# column drives the per-series download in the following cell.
df_metadata = source.get_series_metadata(DATAFLOW)
# Quick sanity check: dimensions plus the first few rows.
print("Shape:", df_metadata.shape)
display(df_metadata.head())

Shape: (354, 3)


Unnamed: 0,series_id,series_name,series_description
0,HVA_PED_EID_NUM,Reported number of infants born to pregnant wo...,Reported number of infants born to pregnant wo...
1,MG_RFGS_CNTRY_ASYLM_PER1000,"Refugees by host country, per 1000 population",
2,DM_POP_CHILD_PROP,Children population as proportion of total pop...,
3,ECON_GVT_EDU_EXP_PTEXP,government expenditure on education (% governm...,
4,MNCH_INSTDEL,Institutional deliveries - percentage of deliv...,Institutional deliveries - percentage of deliv...


In [None]:
# Download the observations for every series listed in the metadata, one
# request per series, and stack them into a single raw frame.
#
# The extraction is deliberately best-effort: a series that fails to download
# is reported and skipped so one bad indicator does not abort the whole run.
# Failures are also collected in `failed` so they can be inspected or retried.
#
# NOTE(review): the saved output shows only 5 iterations although the metadata
# frame has 354 rows — presumably it was subset in a cell that no longer
# exists; confirm the notebook reproduces under Restart & Run All.
data = []
failed = {}  # series_id -> repr of the error raised while downloading it
for series_id in tqdm(df_metadata["series_id"]):
    try:
        df = source.get_series_data(DATAFLOW, indicator=series_id)
        # Tag each row with the requested id so it stays attributable once
        # all series are concatenated.
        df["series_id"] = series_id
    except Exception as error:
        print(series_id, error)
        failed[series_id] = repr(error)
        continue
    data.append(df)

if failed:
    print(f"Skipped {len(failed)} of {len(df_metadata)} series because of errors.")
if not data:
    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with the same exception type but an actionable message instead.
    raise ValueError("No series could be downloaded; nothing to concatenate.")

df_raw = pd.concat(data, axis=0, ignore_index=True)
print("Shape:", df_raw.shape)
display(df_raw.head())

100%|██████████| 5/5 [00:07<00:00,  1.41s/it]

Shape: (29287, 31)





Unnamed: 0,REF_AREA,Geographic area,INDICATOR,Indicator,SEX,Sex,TIME_PERIOD,OBS_VALUE,UNIT_MULTIPLIER,Unit multiplier,...,DATA_SOURCE,SOURCE_LINK,CUSTODIAN,TIME_PERIOD_METHOD,Time period activity related to when the data are collected,REF_PERIOD,COVERAGE_TIME,AGE,Current age,series_id
0,SDGRC_ECA_CA,(SDGRC) Central Africa,WS_HCF_WM-B,Proportion of health care facilities with basi...,_T,Total,2010,39.278759,,,...,WHO/UNICEF Joint Monitoring Programme () WASH ...,,,,,,,_T,Total,WS_HCF_WM-B
1,SDGRC_ECA_CA,(SDGRC) Central Africa,WS_HCF_WM-B,Proportion of health care facilities with basi...,_T,Total,2011,40.183521,,,...,WHO/UNICEF Joint Monitoring Programme () WASH ...,,,,,,,_T,Total,WS_HCF_WM-B
2,SDGRC_ECA_CA,(SDGRC) Central Africa,WS_HCF_WM-B,Proportion of health care facilities with basi...,_T,Total,2012,41.115047,,,...,WHO/UNICEF Joint Monitoring Programme () WASH ...,,,,,,,_T,Total,WS_HCF_WM-B
3,SDGRC_ECA_CA,(SDGRC) Central Africa,WS_HCF_WM-B,Proportion of health care facilities with basi...,_T,Total,2013,39.322906,,,...,WHO/UNICEF Joint Monitoring Programme () WASH ...,,,,,,,_T,Total,WS_HCF_WM-B
4,SDGRC_ECA_CA,(SDGRC) Central Africa,WS_HCF_WM-B,Proportion of health care facilities with basi...,_T,Total,2014,37.164085,,,...,WHO/UNICEF Joint Monitoring Programme () WASH ...,,,,,,,_T,Total,WS_HCF_WM-B


### Transform

In [4]:
# Apply the source-specific transform to the raw download.
# NOTE(review): the saved outputs show fewer rows and columns after this step
# (29287 x 31 -> 23460 x 11); presumably non-country rows are dropped and the
# columns are mapped to a standard schema — confirm in source.transform.
df_transformed = source.transform(df_raw)
# Quick sanity check: dimensions plus the first few rows.
print("Shape:", df_transformed.shape)
display(df_transformed.head())





Shape: (23460, 11)


Unnamed: 0,source,series_id,series_name,disagr_sex,disagr_age,alpha_3_code,prop_unit,prop_observation_type,year,value,prop_value_label
0,https://sdmx.data.unicef.org,WS_HCF_WM-B,Proportion of health care facilities with basi...,Total,Total,AFG,%,,2016,82.0,
1,https://sdmx.data.unicef.org,WS_HCF_WM-B,Proportion of health care facilities with basi...,Total,Total,AFG,%,,2017,82.0,
2,https://sdmx.data.unicef.org,WS_HCF_WM-B,Proportion of health care facilities with basi...,Total,Total,AFG,%,,2018,82.0,
3,https://sdmx.data.unicef.org,WS_HCF_WM-B,Proportion of health care facilities with basi...,Total,Total,AFG,%,,2019,82.0,
4,https://sdmx.data.unicef.org,WS_HCF_WM-B,Proportion of health care facilities with basi...,Total,Total,AFG,%,,2020,82.0,


### Load

In [5]:
# Publish one dataset per series, all using source.SOURCE_NAME as folder path.
destination = source.SOURCE_NAME
for series_id, df in tqdm(df_transformed.groupby("series_id")):
    # NOTE(review): presumably publish_dataset reads the frame's `name`
    # attribute to decide what the stored dataset is called — confirm.
    df.name = series_id
    storage.publish_dataset(df, folder_path=destination)

100%|██████████| 5/5 [00:00<00:00, 112.15it/s]
