In [1]:
# this is intended to process ROFST for flagship only!
import pandas as pd
import numpy as np
# import from TMEE ETL
from transformation.destination import Destination
from transformation.dataflow import Dataflow, define_maps
from utils import api_request
import os

  from pandas.util.testing import assert_frame_equal


In [2]:
# folder holding the UIS data package (CSV extracts)
path = "./tests/uis_bdss/"
# load the four extracts as strings, in the order data / metadata / countries / labels
edu_data, meta_data, country_labels, edu_labels = (
    pd.read_csv(f"{path}{csv_name}", dtype=str)
    for csv_name in (
        "SDG_DATA_NATIONAL.csv",
        "SDG_METADATA.csv",
        "SDG_COUNTRY.csv",
        "SDG_LABEL.csv",
    )
)


In [None]:
# Boolean masks over edu_labels, built from the lower-cased indicator IDs.
# Patterns are matched literally (regex=False): with the default regex matching
# the "." in 'rofst.h.' and 'ger.' would match ANY character.
# na=False keeps the masks strictly boolean if an ID is missing.
indicator_ids_lower = edu_labels.INDICATOR_ID.str.lower()
# identify indicators: ROFST (out-of-school children rate)
logic_rofst = indicator_ids_lower.str.contains('rofst', regex=False, na=False)
# identify indicators: ROFST IDs containing 'rofst.h.' (the household survey
# variants, per the original note); negate this mask to exclude them
logic_hhs = indicator_ids_lower.str.contains('rofst.h.', regex=False, na=False)
# identify indicators: GER (SDG)
logic_ger = indicator_ids_lower.str.contains('ger.', regex=False, na=False)

In [26]:
# identify indicators for sarah 14 april (learning outcome indicators)
# lower-case the English labels once and reuse them for both keyword masks
labels_en_lower = edu_labels["INDICATOR_LABEL_EN"].str.lower()
logic_web = labels_en_lower.str.contains('internet')
logic_dig = labels_en_lower.str.contains('digit')

In [None]:
# Show both label subsets. A notebook cell renders only its LAST bare expression,
# so each frame is passed to display() explicitly (display is provided by the
# IPython kernel).
# NOTE: logic_rofst is re-assigned further down against edu_data; run this cell
# while it still holds the edu_labels mask.
# ROFST indicators, excluding the rows flagged by logic_hhs
display(edu_labels[logic_rofst & ~logic_hhs])
# GER indicators
display(edu_labels[logic_ger])

In [27]:
# Show both keyword subsets. A notebook cell renders only its LAST bare
# expression, so each frame is passed to display() explicitly (display is
# provided by the IPython kernel).
# labels mentioning 'internet'
display(edu_labels[logic_web])
# labels mentioning 'digit'
display(edu_labels[logic_dig])

Unnamed: 0,INDICATOR_ID,INDICATOR_LABEL_EN


In [14]:
# row labels of the ROFST indicators inside edu_labels
# (three contiguous runs: 922-929, 934-937, 942-949)
rofst_index = np.r_[922:930, 934:938, 942:950]
# alternative kept for reference: reduce list to include only sex total
# rofst_index = [922, 924, 926, 928, 934, 936, 946, 948]
# double check: show the full indicator labels for the selected rows
edu_labels.loc[rofst_index, "INDICATOR_LABEL_EN"].values

array(['Out-of-school rate for children of primary school age, both sexes (%)',
       'Out-of-school rate for children of primary school age, female (%)',
       'Out-of-school rate for children of primary school age, adjusted gender parity index (GPIA)',
       'Out-of-school rate for children of primary school age, male (%)',
       'Out-of-school rate for children and adolescents of primary and lower secondary school age, both sexes (%)',
       'Out-of-school rate for children and adolescents of primary and lower secondary school age, female (%)',
       'Out-of-school rate for children and adolescents of primary and lower secondary school age, adjusted gender parity index (GPIA)',
       'Out-of-school rate for children and adolescents of primary and lower secondary school age, male (%)',
       'Out-of-school rate for adolescents of lower secondary school age, both sexes (%)',
       'Out-of-school rate for adolescents of lower secondary school age, female (%)',
       'Out-of-s

In [10]:
# row labels in edu_labels of the internet indicators (L1, 2 and 3)
web_index = [
    1251,
    1259,
    1269,
]

In [15]:
# indicator array (INDICATOR_ID values of the rows selected by rofst_index)
ind_array = edu_labels.loc[rofst_index, "INDICATOR_ID"].values
# country list (based on TM + FLAGSHIP), ISO3 codes
country_list = [
    "ALB", "ARM", "AZE", "BLR", "BIH", "BGR", "HRV", "CZE", "EST", "GEO",
    "HUN", "KAZ", "KGZ", "LVA", "LTU", "MNE", "MKD", "POL", "MDA", "ROU",
    "RUS", "SRB", "SVK", "SVN", "TJK", "TUR", "TKM", "UKR", "UZB", "AND",
    "AUT", "BEL", "CYP", "DNK", "FIN", "FRA", "DEU", "GRC", "VAT", "ISL",
    "IRL", "ITA", "LIE", "LUX", "MLT", "MCO", "NLD", "NOR", "PRT", "SMR",
    "ESP", "SWE", "CHE", "GBR", "XKX",
]
# spot check Flagship 5 missing: "VAT", "LIE", "AND", "SMR", "MCO"

In [16]:
# indicator array for the internet indicators (INDICATOR_ID values at web_index)
ind_array_web = edu_labels.loc[web_index, "INDICATOR_ID"].values

In [17]:
# countries of country_list with no entry at all in the UIS country table
# (only kosovo, per the recorded output)
uis_country_ids = country_labels["COUNTRY_ID"].values
np.setdiff1d(country_list, uis_country_ids)

array(['XKX'], dtype='<U3')

In [None]:
# flagship subset: rows of edu_data matching both the ROFST indicator codes
# and the country list (all years kept)
logic_rofst = edu_data["INDICATOR_ID"].isin(ind_array)
logic_ecaro = edu_data["COUNTRY_ID"].isin(country_list)
# independent copy so later column additions do not touch edu_data
edu_rofst_ecaro = edu_data.loc[logic_rofst & logic_ecaro].copy()

In [18]:
# internet access subset: rows of edu_data matching both the internet
# indicator codes and the country list (all years kept)
logic_web = edu_data["INDICATOR_ID"].isin(ind_array_web)
logic_ecaro = edu_data["COUNTRY_ID"].isin(country_list)
# independent copy of the selected rows
edu_web_ecaro = edu_data.loc[logic_web & logic_ecaro].copy()

In [None]:
# countries of country_list with no row in the OOSC (ROFST) subset
countries_with_rofst = edu_rofst_ecaro["COUNTRY_ID"].unique()
np.setdiff1d(country_list, countries_with_rofst)

In [19]:
# countries of country_list with no row in the internet access subset
countries_with_web = edu_web_ecaro["COUNTRY_ID"].unique()
np.setdiff1d(country_list, countries_with_web)

array(['AUT', 'BGR', 'BIH', 'CYP', 'CZE', 'DEU', 'GBR', 'GRC', 'HRV',
       'IRL', 'ISL', 'KAZ', 'LIE', 'LTU', 'LUX', 'MKD', 'MLT', 'MNE',
       'ROU', 'RUS', 'SRB', 'SWE', 'TJK', 'TUR', 'VAT', 'XKX'],
      dtype='<U3')

In [20]:
# keep only the metadata rows whose TYPE is the data-source entry
meta_data_type = "Source:Data sources"
logic_type = meta_data["TYPE"] == meta_data_type
# TYPE is constant after filtering, so the column is dropped; drop() returns
# a new frame and leaves meta_data untouched
meta_data_source = meta_data.loc[logic_type].drop(columns="TYPE")

In [None]:
# source metadata restricted to the ROFST indicators and the country list
logic_rofst = meta_data_source["INDICATOR_ID"].isin(ind_array)
logic_ecaro = meta_data_source["COUNTRY_ID"].isin(country_list)
meta_data_rofst_ecaro = meta_data_source.loc[logic_rofst & logic_ecaro]
# check: meta_data_source for ecaro oosc is empty
meta_data_rofst_ecaro

In [21]:
# source metadata restricted to the internet indicators and the country list
logic_web_meta = meta_data_source["INDICATOR_ID"].isin(ind_array_web)
logic_ecaro_meta = meta_data_source["COUNTRY_ID"].isin(country_list)
meta_data_web_ecaro = meta_data_source.loc[logic_web_meta & logic_ecaro_meta]
# meta_data_source check: empty
meta_data_web_ecaro

Unnamed: 0,INDICATOR_ID,COUNTRY_ID,YEAR,METADATA


In [None]:
# add dummy columns to the data set for the transform; they hold placeholder
# values that the code mapping step is expected to overwrite per indicator
edu_rofst_ecaro = edu_rofst_ecaro.assign(
    SEX="_T",
    AGE="_T",
    UNIT="PCNT",
    SOURCE="",
    NATURE="",
)

In [None]:
# raw data destination folder and file stem for the ROFST extract
raw_path, raw_file_write = "./tests/uis_bdss/", "bdss_rofst_raw"
# write bdss raw data (all selected ROFST indicators) to csv, without the index
edu_rofst_ecaro.to_csv(raw_path + raw_file_write + ".csv", index=False)

In [22]:
# raw data destination folder and file stem for the internet extract
raw_path, raw_file_write = "./tests/uis_bdss/", "bdss_internet_raw"
# write bdss raw data (all selected internet indicators) to csv, without the index
edu_web_ecaro.to_csv(raw_path + raw_file_write + ".csv", index=False)

In [None]:
# destination folder and file stem for the transformed data
trans_path, trans_file_write = "./tests/uis_bdss/", "bdss_rofst_transf"
# TMEE DSD (data structure definition) used as the transformation target
dest_dsd = Destination("TMEE")

In [None]:
# transform into TMEE data structure
# one row per destination column: (destination, type, role, value)
#   type "col"   -> value names a source column
#   type "const" -> value is a constant (empty here)
_bdss_column_spec = (
    ("REF_AREA", "col", "dim", "COUNTRY_ID"),
    ("INDICATOR", "col", "dim", "INDICATOR_ID"),
    ("SEX", "col", "dim", "SEX"),
    ("AGE", "col", "dim", "AGE"),
    ("WEALTH_QUINTILE", "const", "dim", ""),
    ("RESIDENCE", "const", "dim", ""),
    ("TIME_PERIOD", "col", "time", "YEAR"),
    ("OBS_VALUE", "col", "obs", "VALUE"),
    ("COVERAGE_TIME", "const", "attrib", ""),
    ("UNIT_MEASURE", "col", "attrib", "UNIT"),
    ("OBS_FOOTNOTE", "col", "attrib", "QUALIFIER"),
    ("FREQ", "const", "attrib", ""),
    ("DATA_SOURCE", "col", "attrib", "SOURCE"),
    ("UNIT_MULTIPLIER", "const", "attrib", ""),
    ("OBS_STATUS", "col", "attrib", "NATURE"),
)
# column map keyed by dataflow name; insertion order follows the spec above
dflow_col_map = {
    "BDSS": {
        destination: {"type": kind, "role": role, "value": value}
        for destination, kind, role, value in _bdss_column_spec
    }
}

In [None]:
# metadata for bdss to sdmx mapping, written one indicator per row:
#   (BDSS indicator id, SDMX indicator code, sex, age, unit)
# OFST rows cover total/female/male; the ROFST rows added for SDG cover only
# the total and the GPIA series.
_bdss_sdmx_rows = [
    ("OFST.1.CP", "EDUNF_OFST_L1", "_T", "SCHOOL_AGE", "PS"),
    ("OFST.1.F.CP", "EDUNF_OFST_L1", "F", "SCHOOL_AGE", "PS"),
    ("OFST.1.M.CP", "EDUNF_OFST_L1", "M", "SCHOOL_AGE", "PS"),
    ("OFST.2.CP", "EDUNF_OFST_L2", "_T", "SCHOOL_AGE", "PS"),
    ("OFST.2.F.CP", "EDUNF_OFST_L2", "F", "SCHOOL_AGE", "PS"),
    ("OFST.2.M.CP", "EDUNF_OFST_L2", "M", "SCHOOL_AGE", "PS"),
    ("OFST.AGM1.CP", "EDUNF_OFST_L1_UNDER1", "_T", "UNDER1_SCHOOL_ENTRY", "PS"),
    ("OFST.AGM1.F.CP", "EDUNF_OFST_L1_UNDER1", "F", "UNDER1_SCHOOL_ENTRY", "PS"),
    ("OFST.AGM1.M.CP", "EDUNF_OFST_L1_UNDER1", "M", "UNDER1_SCHOOL_ENTRY", "PS"),
    ("ROFST.1.CP", "EDUNF_ROFST_L1", "_T", "SCHOOL_AGE", "PCNT"),
    ("ROFST.1.GPIA.CP", "EDUNF_ROFST_L1_GPIA", "_T", "SCHOOL_AGE", "GPIA"),
    ("ROFST.2.CP", "EDUNF_ROFST_L2", "_T", "SCHOOL_AGE", "PCNT"),
    ("ROFST.2.GPIA.CP", "EDUNF_ROFST_L2_GPIA", "_T", "SCHOOL_AGE", "GPIA"),
    ("ROFST.1T2.CP", "EDUNF_ROFST_L1T2", "_T", "SCHOOL_AGE", "PCNT"),
    ("ROFST.1T2.GPIA.CP", "EDUNF_ROFST_L1T2_GPIA", "_T", "SCHOOL_AGE", "GPIA"),
    ("ROFST.AGM1.CP", "EDUNF_ROFST_L1_UNDER1", "_T", "UNDER1_SCHOOL_ENTRY", "PCNT"),
    ("ROFST.AGM1.GPIA.CP", "EDUNF_ROFST_L1_UNDER1_GPIA", "_T", "UNDER1_SCHOOL_ENTRY", "GPIA"),
]
# column-oriented view of the rows; the source label is "UIS: " + SDMX code
bdss_sdmx_map = {
    'ind_id': [row[0] for row in _bdss_sdmx_rows],
    'code': [row[1] for row in _bdss_sdmx_rows],
    'sex': [row[2] for row in _bdss_sdmx_rows],
    'age': [row[3] for row in _bdss_sdmx_rows],
    'unit': [row[4] for row in _bdss_sdmx_rows],
    'source': ["UIS: " + row[1] for row in _bdss_sdmx_rows],
}
# bdss to sdmx mapping dataframe (one row per BDSS indicator id)
bdss_sdmx_df = pd.DataFrame(bdss_sdmx_map)

In [None]:
# Code mapping for the BDSS dataflow.
# SEX, AGE, SOURCE and UNIT are each looked up from INDICATOR_ID, using the
# matching column of bdss_sdmx_df as the lookup table.
_bdss_codes = {
    target: {
        "depends": "INDICATOR_ID",
        "map": dict(zip(bdss_sdmx_df["ind_id"], bdss_sdmx_df[lookup_column])),
    }
    for target, lookup_column in (
        ("SEX", "sex"),
        ("AGE", "age"),
        ("SOURCE", "source"),
        ("UNIT", "unit"),
    )
}
# "INDICATOR_ID" mapping goes AFTER the entries above, which depend on the
# original indicator ids (note previous dependence) !!!
_bdss_codes["INDICATOR_ID"] = dict(zip(bdss_sdmx_df["ind_id"], bdss_sdmx_df["code"]))
# trick to fill NATURE: derive it from the QUALIFIER column
_bdss_codes["NATURE"] = {
    "depends": "QUALIFIER",
    "map": {"UIS_EST": "E", "NAT_EST": "E"},
}
code_mapping = {"BDSS": _bdss_codes}

In [None]:
# register the local BDSS mappings in the imported define_maps module:
# first the column map, then the code mapping
for shared_map, local_map in (
    (define_maps.dflow_col_map, dflow_col_map),
    (define_maps.code_mapping, code_mapping),
):
    shared_map.update(local_map)

In [None]:
# dataflow to process is BDSS
dataflow_key = "BDSS"
# instantiate the dataflow class with that key
dflow_actual = Dataflow(dataflow_key)
# preview: warn when the dataflow's duplicate check flags the BDSS data
has_duplicates = dflow_actual.check_duplicates(edu_rofst_ecaro)
if has_duplicates:
    print("BDSS data contains duplicates")

In [None]:
# map the codes - normalization
# Runs the dataflow's code mapping on the ROFST subset. The return value is not
# captured, so this presumably rewrites edu_rofst_ecaro in place using the
# registered BDSS code_mapping - TODO confirm against Dataflow.map_codes.
# NOTE(review): if so, re-running this cell would map already-mapped codes.
dflow_actual.map_codes(edu_rofst_ecaro)

In [None]:
# no constants are supplied for BDSS, so an empty dictionary is passed
constants = dict()
# map the source columns onto the destination columns
data_map = dflow_actual.map_dataframe(edu_rofst_ecaro, constants)

In [None]:
# Build the transformed indicators frame: an empty string-typed frame with the
# destination DSD columns, followed by the mapped rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# NOTE(review): assumes data_map is a DataFrame - confirm against
# Dataflow.map_dataframe.
data_trans = pd.concat(
    [pd.DataFrame(columns=dest_dsd.get_csv_columns(), dtype=str), data_map]
)

In [None]:
# drop observations without a value
data_trans = data_trans.dropna(subset=["OBS_VALUE"])
# flag observations that cannot be parsed as numbers
filter_non_num = pd.to_numeric(data_trans.OBS_VALUE, errors="coerce").isnull()
# eliminate non-numerics, reporting which values were discarded
if filter_non_num.any():
    not_num_array = data_trans.OBS_VALUE[filter_non_num].unique()
    print(f"Non-numeric observations discarded in BDSS data:\n{not_num_array}")
    # filter by boolean mask rather than by index label: a label-based drop
    # would also remove valid rows that share a label with a discarded row
    # whenever the index is not unique
    data_trans = data_trans[~filter_non_num]
# save file
data_trans.to_csv(f"{trans_path}{trans_file_write}.csv", index=False)

In [None]:
# provide sdmx codelists
# Each codelist is a dict of two equal-length columns: the SDMX code column
# (upper-case key) and its human-readable label column.
ind_cl = {
    "INDICATOR": [
        "EDUNF_OFST_L1",
        "EDUNF_OFST_L2",
        "EDUNF_OFST_L1_UNDER1",
        "EDUNF_ROFST_L1",
        "EDUNF_ROFST_L2",
        "EDUNF_ROFST_L1T2",
        "EDUNF_ROFST_L1_UNDER1",
        "EDUNF_ROFST_L1_GPIA",
        "EDUNF_ROFST_L2_GPIA",
        "EDUNF_ROFST_L1T2_GPIA",
        "EDUNF_ROFST_L1_UNDER1_GPIA"
    ],
    "Indicator": [
        "Number of out-of-school children of primary school age by sex",
        "Number of out-of-school adolescents of lower secondary school age by sex",
        "Number of out-of-school children one year younger than the official entry age to primary education by sex",
        "Out-of-school rate for children of primary school age by sex (%)",
        "Out-of-school rate for adolescents of lower secondary school age by sex (%)",
        "Out-of-school rate for children and adolescents of primary and lower secondary school age (%, by sex)",
        "Out-of-school rate for children one year younger than the official entry age to primary education by sex (%)",
        "Out-of-school rate for children of primary school age, adjusted gender parity index (GPIA)",
        "Out-of-school rate for adolescents of lower secondary school age, adjusted gender parity index (GPIA)",
        "Out-of-school rate for children and adolescents of primary and lower secondary school age, adjusted gender parity index (GPIA)",
        "Out-of-school rate for children one year younger than the official entry age to primary education, adjusted gender parity index (GPIA)"
    ]
}
sex_cl = {
    "SEX": ["_T", "F", "M"],
    "Sex": ["Total", "Female", "Male"]
}
age_cl = {
    "AGE": ["SCHOOL_AGE", "UNDER1_SCHOOL_ENTRY"],
    "Age": ["School age", "One year younger than official school entrance age"]
}
unit_cl = {
    "UNIT_MEASURE": ["PS", "PCNT", "GPIA"],
    "Unit of measure": ["Persons", "%", "Adjusted gender parity index"]
}
# UNICEF's REST API endpoint for codelists
url_endpoint = "https://sdmx.data.unicef.org/ws/public/sdmxapi/rest/codelist/"
codelist = "UNICEF/CL_COUNTRY"
# address and parameters for codelist request
api_address = url_endpoint + codelist
api_params = {"format": "sdmx-json"}
# API codelist request (network call), parsed as SDMX-JSON
country_cl_json = api_request(api_address, api_params).json()
# Country Codelist from SDMX-JSON: code id -> name, from the first codelist returned
country_map = {
    elem["id"]: elem["name"] for elem in country_cl_json["data"]["codelists"][0]["codes"]
}
# Materialise the dict views as lists: dict_keys/dict_values are not sequences
# (dict_keys is a Set), and the DataFrame constructor does not handle them
# reliably across pandas versions.
country_cl = {
    "REF_AREA": list(country_map.keys()),
    "Geographic area": list(country_map.values()),
}
os_cl = {
    "OBS_STATUS": ["E"],
    "Observation Status": ["Estimated"]
}

In [None]:
# checkout data: attach the sdmx label descriptions to data_trans by
# left-merging each codelist on its code column, in this fixed order
sdmx_data = data_trans
for code_column, code_list in (
    ("INDICATOR", ind_cl),
    ("SEX", sex_cl),
    ("AGE", age_cl),
    ("UNIT_MEASURE", unit_cl),
    ("REF_AREA", country_cl),
    ("OBS_STATUS", os_cl),
):
    sdmx_data = sdmx_data.merge(
        pd.DataFrame.from_dict(code_list), on=code_column, how="left", sort=False
    )

In [None]:
# checkout trivials for BDSS: label columns that take one constant value
# (residence and wealth are always the total; frequency and multiplier are blank)
sdmx_data = sdmx_data.assign(
    **{
        "Residence": "Total",
        "Wealth Quintile": "Total",
        "Frequency": "",
        "Unit multiplier": "",
    }
)

In [None]:
# column order of the SDMX-format csv: each code column is followed by its
# label column where one exists
col_ind = [
    "REF_AREA", "Geographic area",
    "INDICATOR", "Indicator",
    "SEX", "Sex",
    "AGE", "Age",
    "RESIDENCE", "Residence",
    "WEALTH_QUINTILE", "Wealth Quintile",
    "TIME_PERIOD",
    "OBS_VALUE",
    "COVERAGE_TIME",
    "UNIT_MEASURE", "Unit of measure",
    "OBS_FOOTNOTE",
    "FREQ", "Frequency",
    "DATA_SOURCE",
    "UNIT_MULTIPLIER", "Unit multiplier",
    "OBS_STATUS", "Observation Status",
]
sdmx_write_file = "ROFST_L1_L2_L1T2_TMEE"
# reorder and checkout! (reindex adds any missing column as empty)
sdmx_ordered = sdmx_data.reindex(columns=col_ind)
sdmx_ordered.to_csv(f"{trans_path}{sdmx_write_file}.csv", index=False)

In [None]:
# csv files with data transformed (number and rate outputs)
files_trans = ["OFST_L1_L2_L1T2_TMEE.csv", "ROFST_L1_L2_L1T2_TMEE.csv"]

# read each file as strings, then stack them with pandas concat
frames_trans = []
for csv_name in files_trans:
    frames_trans.append(pd.read_csv(f"{trans_path}{csv_name}", dtype=str))
dest_dsd_df = pd.concat(frames_trans)

In [None]:
# write the combined output only when it is not already in the folder,
# so an existing file is never overwritten
etl_out_file = "OFST_ROFST_L1_L2_L1T2_TMEE"

if f"{etl_out_file}.csv" in os.listdir(trans_path):
    print(f"{etl_out_file} file not re-written, please first delete it to update.")
else:
    dest_dsd_df.to_csv(f"{trans_path}{etl_out_file}.csv", index=False)