#### Objective: feed TMEE indicators into the UNICEF RDM

In [None]:
import pandas as pd
from utils import get_API_code_address_etc
import numpy as np
import re
# get_codelist_API_legacy

##### RDM bulk upload (indicators and attributes): https://rdm.unicef.org/Documentation (as of 03/Feb/21)

In [None]:
# UNICEF RDM reference tables: hierarchy (agencies, sectors, domains, subdomains) and attributes
rdm_attributes_file = "./rdm/RDM_agency_sector_domain_attribute.xlsx"
# one table per sheet, in workbook order; each entry is the Excel column span
# holding that table (the column names sit on the second row, header=1)
agency_df, sector_df, domain_df, attrib_df = [
    pd.read_excel(rdm_attributes_file, sheet_name=sheet_pos, header=1, usecols=col_span)
    for sheet_pos, col_span in enumerate(("B:E", "B:G", "B:G", "B:E"))
]

##### Data from TMEE ETL (Data Dictionary + Legacy metadata)

In [None]:
# Excel data dictionary of the TMEE indicators
data_dict_file = "./data_in/data_dictionary/indicator_dictionary_TM_v5.xlsx"
# get the indicators that are extracted by API (helper imported from utils)
api_code_addr_df = get_API_code_address_etc(data_dict_file)

In [None]:
# path to the file with legacy indicators metadata (age, sex, code, units)
path_legacy = "./data_in/legacy_data/content_legacy_codes_v3.csv"
# dtype=str: every column is read as text
legacy_metadata = pd.read_csv(path_legacy, dtype=str)

In [None]:
# legacy metadata is processed up to codes (for SDMX load)
# the indicators codelist still contains some manual inputs (file below)
ind_codes_file = "./data_out/codelists/CL_TMEE_INDICATORS.csv"
# dtype=str: every column is read as text
ind_codelist = pd.read_csv(ind_codes_file, dtype=str)

#### Build the first bulk upload file: indicators

In [None]:
# column layout of the indicators bulk upload file
indicator_columns = [
    "HelixCode", "English Name",
    "Agency Code", "Sector Name", "Domain Name",
    "is Published",
]
# empty table with that layout; rows are added in the following cells
indicator_upload = pd.DataFrame(columns=indicator_columns)

In [None]:
# add indicators retrieved by API
# align the data-dictionary column names with the bulk upload layout
api_code_addr_df.rename(columns={"Code": "HelixCode", "Indicator_name": "English Name"}, inplace=True)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
indicator_upload = pd.concat(
    [indicator_upload, api_code_addr_df[["HelixCode", "English Name"]]],
    ignore_index=True,
)

In [None]:
# make sure we don't bring nan's into indicator_upload.HelixCode
logic_not_nan = indicator_upload.HelixCode.notna()
# indicators from codelist that are not from API (legacy)
legacy_ind = np.setdiff1d(ind_codelist.Code, indicator_upload.HelixCode[logic_not_nan])
# boolean mask (numpy array) of the codelist rows holding a legacy code;
# isin yields an all-False mask when there is no legacy code, where reducing
# an empty list of comparisons would yield a scalar that cannot select rows
red_leg_logics = ind_codelist.Code.isin(legacy_ind).to_numpy()
# rename for the concatenation in the next cell
ind_codelist.rename(columns={"Code": "HelixCode", "Indicator_name": "English Name"}, inplace=True)

In [None]:
# add indicators processed from Excel TMEE dissemination (or "legacy")
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
indicator_upload = pd.concat(
    [indicator_upload, ind_codelist[red_leg_logics]],
    ignore_index=True,
)
# easy set: all are published
indicator_upload["is Published"] = True

##### Agency ECARO for those not Helix (else UNICEF)

In [None]:
# get Helix from Data_Source (na=False: a missing source counts as not Helix)
source_helix = api_code_addr_df.Data_Source.str.contains("Helix", na=False)
# get codes from indicators originally in helix
org_code_helix = api_code_addr_df.HelixCode[source_helix].values
# boolean mask (numpy array) of the upload rows whose code comes from Helix;
# nan codes are dropped first so that rows without a code never match, and
# isin stays a proper all-False mask when there is no Helix code at all
red_helix_logics = indicator_upload.HelixCode.isin(pd.Series(org_code_helix).dropna()).to_numpy()

In [None]:
# single .loc assignments: chained indexing (df[col][mask] = value) may write
# to a temporary object and is a no-op under pandas Copy-on-Write
# set indicators from Helix as UNICEF Agency
indicator_upload.loc[red_helix_logics, "Agency Code"] = "UNICEF"
# set remaining indicators as ECARO Agency
indicator_upload.loc[~red_helix_logics, "Agency Code"] = "ECARO"

##### Assign Sector and Domain for ECARO indicators: Education

In [None]:
# start with the easy ones, Sector Education (Domain Education)
# get Education from Theme
edu_theme = api_code_addr_df.Theme == "Education"
# get codes from indicators with Education Theme
edu_codes = api_code_addr_df.HelixCode[edu_theme].values
# boolean mask (numpy array) of the upload rows with an Education code;
# nan codes are dropped so that rows without a code never match, and isin
# stays a proper all-False mask when there is no Education code
red_edu_logics = indicator_upload.HelixCode.isin(pd.Series(edu_codes).dropna()).to_numpy()
# get only edu that are ecaro
edu_and_ecaro = red_edu_logics & (~red_helix_logics)

In [None]:
# set indicators Sector and Domain Education
# one .loc assignment: chained indexing (df[col][mask] = value) is a no-op
# under pandas Copy-on-Write
indicator_upload.loc[edu_and_ecaro, ["Sector Name", "Domain Name"]] = "Education"

In [None]:
# add disabilities in Education from TM legacy (not in dictionary)
# get EDU_ from legacy metadata (na=False: rows without a code do not match)
edu_in_legacy = legacy_metadata.code.str.contains("EDU_", na=False)
# get codes from indicators EDU in legacy
edu_leg_codes = legacy_metadata.code[edu_in_legacy].unique()
# boolean mask (numpy array) of the upload rows with a legacy Education code;
# isin stays a proper all-False mask when no legacy code matches
red_edu_leg_logics = indicator_upload.HelixCode.isin(edu_leg_codes).to_numpy()

In [None]:
# set indicators Sector and Domain Education for Legacy Education
# one .loc assignment: chained indexing (df[col][mask] = value) is a no-op
# under pandas Copy-on-Write
indicator_upload.loc[red_edu_leg_logics, ["Sector Name", "Domain Name"]] = "Education"

##### Assign Sector and Domain for ECARO indicators: Demography

In [None]:
# Population Theme should go to Sector Demography (Domain Demography)
# get Population from Theme
pop_theme = api_code_addr_df.Theme == "Population"
# get codes from indicators with Population Theme
pop_codes = api_code_addr_df.HelixCode[pop_theme].values
# boolean mask (numpy array) of the upload rows with a Population code;
# nan codes are dropped so that rows without a code never match, and isin
# stays a proper all-False mask when there is no Population code
red_pop_logics = indicator_upload.HelixCode.isin(pd.Series(pop_codes).dropna()).to_numpy()
# get only pop that are ecaro - none as of 05 Feb 2021 (may come in future updates)
pop_and_ecaro = red_pop_logics & (~red_helix_logics)

In [None]:
# set indicators Sector and Domain Demography
# one .loc assignment: chained indexing (df[col][mask] = value) is a no-op
# under pandas Copy-on-Write
indicator_upload.loc[pop_and_ecaro, ["Sector Name", "Domain Name"]] = "Demography"

In [None]:
# add Demography from TM legacy (not in dictionary)
# get DM_ from legacy metadata (na=False: rows without a code do not match)
dm_in_legacy = legacy_metadata.code.str.contains("DM_", na=False)
# get codes from indicators DM in legacy
dm_leg_codes = legacy_metadata.code[dm_in_legacy].unique()
# boolean mask (numpy array) of the upload rows with a legacy Demography code;
# isin stays a proper all-False mask when no legacy code matches
red_dm_leg_logics = indicator_upload.HelixCode.isin(dm_leg_codes).to_numpy()

In [None]:
# set indicators Sector and Domain Demography for Legacy Demography
# one .loc assignment: chained indexing (df[col][mask] = value) is a no-op
# under pandas Copy-on-Write
indicator_upload.loc[red_dm_leg_logics, ["Sector Name", "Domain Name"]] = "Demography"

##### Assign Sector and Domain for ECARO indicators - Remaining Fertility, Mortality and Health

In [None]:
# TBDev (presumably "to be developed"): no Sector/Domain assignment is made in this cell yet
# api_code_addr_df.Theme.unique()
# last expression of the cell: evaluates to the data-dictionary column labels
api_code_addr_df.columns

#### Work to deliver status to Deepak - Indicators source in SDMX

In [None]:
# start from the indicators codelist and attach Theme and Data_Source from the
# data dictionary; codelist rows without a dictionary entry keep nan in both
etl_status_api = pd.merge(
    ind_codelist,
    api_code_addr_df[["HelixCode", "Theme", "Data_Source"]],
    how="left",
    on="HelixCode",
    sort=False,
)

In [None]:
# function to assign theme to legacy
def theme_2_legacy(legacy_codes):
    """
    Assign a Theme to each legacy indicator code based on patterns in the code.

    param: legacy_codes (pandas series of indicator code strings)
    return: pandas dataframe with columns 'HelixCode' (the input codes) and
        'Theme', on the same index as legacy_codes; 'Theme' is left as nan
        for codes that match no rule
    """
    # (regex pattern, theme) rules, applied in order: a later rule overrides
    # an earlier one, so the manual assignments at the end take precedence
    theme_rules = [
        ('DM_', 'Population'),
        ('FT_', 'Fertility'),
        ('EDU_', 'Education'),
        ('PT_', 'Child Protection'),
        ('SP_', 'Social Protection'),
        ('JJ_', 'Justice for Children'),
        # Manual assignments for now
        ('EDUNF_GECER', 'Education'),
        ('GN_MTNTY|GN_PTNTY', 'Social Protection'),
        ('U5_LFT-ALN', 'Child Protection'),
    ]
    # object dtype, all nan to start with, so theme labels can be written in
    themes = pd.Series(index=legacy_codes.index, dtype=object)
    for pattern, theme in theme_rules:
        # na=False: a missing code matches no rule instead of breaking the mask
        matches = legacy_codes.str.contains(pattern, na=False).to_numpy()
        # direct assignment on the series; the chained form
        # df['Theme'][mask] = value is a no-op under pandas Copy-on-Write
        themes[matches] = theme
    return pd.DataFrame(
        {'HelixCode': legacy_codes, 'Theme': themes},
        columns=['HelixCode', 'Theme'],
    )

In [None]:
# rows without a Theme are the legacy codes (not found in the data dictionary)
logic_legacy = etl_status_api.Theme.isna()
legacy_themes = theme_2_legacy(etl_status_api.HelixCode[logic_legacy])
# fill nan's with legacy_themes (aligned on the row index); the result is
# assigned back because an inplace fillna on the column accessor does not
# update the dataframe under pandas Copy-on-Write
etl_status_api["Theme"] = etl_status_api["Theme"].fillna(legacy_themes["Theme"])

In [None]:
# fill nan's in source as NSI (legacy source); assigned back because an
# inplace fillna on the column accessor does not update the dataframe under
# pandas Copy-on-Write
etl_status_api["Data_Source"] = etl_status_api["Data_Source"].fillna("NSI:")
# particular example for GECER (mismatch with data dictionary v5 in dev)
logic_man1 = etl_status_api.HelixCode.str.contains('EDUNF_GECER', na=False)
# .loc instead of chained indexing, which may write to a temporary object
etl_status_api.loc[logic_man1, "Data_Source"] = 'UIS:'
# replace data source with regex extraction: keep the text before the first colon
pattern = r"(.*?):"
# a value without a colon is kept unchanged instead of raising IndexError,
# which also makes this cell safe to run twice
etl_status_api["Data_Source"] = (
    etl_status_api["Data_Source"]
    .str.extract(pattern, expand=False)
    .fillna(etl_status_api["Data_Source"])
)

In [None]:
# sort by Theme (then index?): own function (might not be the simplest way)
def sort_etl_by_theme(etl_status_api):
    """
    Return the rows of etl_status_api grouped by Theme in a fixed order.

    param: etl_status_api (pandas dataframe with a 'Theme' column)
    return: new dataframe with the rows of each listed theme stacked in the
        order below (original row order kept within a theme) and a fresh
        0..n-1 index; rows whose Theme is not listed are left out
    """
    theme_order = (
        "Population",
        "Fertility",
        "Mortality",
        "Health",
        "Education",
        "Child Protection",
        "Justice for Children",
        "Social Protection",
    )
    # one slice per theme, stacked in the order above
    rows_per_theme = [
        etl_status_api[etl_status_api.Theme == theme] for theme in theme_order
    ]
    return pd.concat(rows_per_theme).reset_index(drop=True)

In [None]:
# column order of the delivered status table
col_sort_list = ['Theme', 'HelixCode', 'English Name', 'Data_Source']
# group the rows by theme first, then put the columns in delivery order
etl_status_api = sort_etl_by_theme(etl_status_api)
etl_status_api = etl_status_api.reindex(columns=col_sort_list)

# write the table next to the codelists, without the row index
cl_path = "./data_out/codelists/"
etl_theme_source_file = "ETL_TM_theme_source"
etl_status_api.to_csv(cl_path + etl_theme_source_file + ".csv", index=False)
