In [11]:
from utils import get_API_code_address_etc
from transformation.destination import Destination
from transformation.dataflow import Dataflow
from transformation import define_maps
import pandas as pd
import re
import os

#### Read all API sources from the data dictionary (so far only two are populated across all indicators)

In [12]:
# Path to the Excel data dictionary in the repo (relative to the current working directory).
# It is passed to `get_API_code_address_etc` to list the indicators extracted by API.
data_dict_file = './data_dictionary/indicator_dictionary_TM_v1.xlsx'

In [13]:
# Get the indicators that are extracted by API (code and address in a dataframe).
# NOTE(review): the rows are later read with the keys 'Code', 'Data_Source' and
# 'Obs_Footnote' — confirm the helper always returns those columns.
api_code_addr_df = get_API_code_address_etc(data_dict_file)
# Number of entries returned (shown as the cell output).
len(api_code_addr_df)

26

#### Perform transformations on raw data extracted from API sources
I could do the transformations in the same loop as the extraction. For the sake of clarity in this notebook, I will repeat the loop below, assuming the data has already been extracted.

*Output:* One big CSV with all the input to the warehouse AND its equivalent pieces split by indicator into separate CSVs

In [14]:
# Folder holding the raw data, one `<indicator code>.csv` file per indicator.
# The trailing '/' matters: file paths are built by plain string concatenation.
raw_path = './data_raw/'
# Name of the TransMonEE dataflow in the UNICEF warehouse; written to the
# 'Dataflow' column of every transformed output.
dataflow_out = "ECARO:TRANSMONEE(1.0)"
# Folder where the transformed data is written, one `<indicator code>.csv` per indicator.
# The trailing '/' matters here as well.
trans_path = './data_transformed/'

In [15]:
# Transform every downloaded indicator into the destination (TMEE) structure.
#
# Outputs:
#   * one CSV per indicator in `trans_path`
#   * `dest_dsd_df`: all transformed indicators stacked together (saved in a later cell)

# Empty destination dataframe built from the TMEE DSD (data structure definition);
# it fixes the leading column order of every output.
dest_dsd = Destination('TMEE')
dest_template = pd.DataFrame(columns=dest_dsd.get_csv_columns(), dtype=str)

# Transformed frames are collected here and concatenated ONCE after the loop:
# `DataFrame.append` was removed in pandas 2.0, and growing a dataframe inside
# a loop copies all the accumulated rows on every iteration.
transformed_frames = []

# The dataflow key is the text between ':' and '(' in the raw-data anchor cell.
# Raw string: '\(' is an invalid escape sequence in a normal string literal.
dataflow_pattern = re.compile(r':(.+?)\(')

# `to_csv` does not create missing folders.
os.makedirs(trans_path, exist_ok=True)

# Loop again over the `api_code_addr_df` dataframe (one row per API indicator)
for _, row in api_code_addr_df.iterrows():

    indicator_code = row['Code']

    # "metadata" from the data dictionary retained as dataflow constants.
    # If any of these are already dataflow columns, then they won't be used.
    # Development NOTE: redefine this logic if we want data dictionary predominance
    constants = {
        'UNICEF_INDICATOR': indicator_code,
        'DATA_SOURCE': row['Data_Source'],
        'OBS_FOOTNOTE': row['Obs_Footnote'],
    }

    # Just for prototype --> skip indicators not downloaded
    raw_file = f"{raw_path}{indicator_code}.csv"
    if not os.path.exists(raw_file):
        continue

    # build dataframe with the indicator raw data (everything kept as text)
    data_raw = pd.read_csv(raw_file, dtype=str)

    # A file with headers only has no anchor cell and nothing to transform.
    if data_raw.empty:
        print(f"Skip indicator: {indicator_code}, raw file has no data rows")
        continue

    # retain only the codes from the csv headers, i.e. the part before ':'
    # (recall HTTP header: application/vnd.sdmx.data+csv;version=1.0.0)
    data_raw = data_raw.rename(
        columns={name: name.split(':')[0] for name in data_raw.columns}
    )

    # get the dataflow key from the raw-data anchor [0,0]
    anchor = str(data_raw.iloc[0, 0])
    anchor_match = dataflow_pattern.search(anchor)
    if anchor_match is None:
        raise ValueError(
            f"Cannot find a dataflow key in {raw_file!r}: anchor cell is {anchor!r}"
        )
    dataflow_key = anchor_match.group(1)

    print(f"Transform indicator: {indicator_code}, from dataflow: {dataflow_key}")

    # instantiate the dataflow class with the actual one
    dflow_actual = Dataflow(dataflow_key)
    if dflow_actual.cod_map:
        # map the codes - normalization - works 'inplace'
        dflow_actual.map_codes(data_raw)
    # map the columns
    # NOTE(review): assumes `map_dataframe` returns a pandas DataFrame — confirm.
    data_map = dflow_actual.map_dataframe(data_raw, constants)
    transformed_frames.append(data_map)

    # save the transformed indicator independently (through pandas);
    # concatenating onto the empty template keeps the destination column order
    data_trans = pd.concat([dest_template, data_map])
    # destination Dataflow: corresponding UNICEF Warehouse DSD name
    data_trans['Dataflow'] = dataflow_out
    # save file
    data_trans.to_csv(f"{trans_path}{indicator_code}.csv", index=False)

# destination dataframe: the template followed by every transformed indicator
dest_dsd_df = pd.concat([dest_template, *transformed_frames])
    

Transform indicator: DM_BRTS, from dataflow: DM
Transform indicator: DM_POP_URBN, from dataflow: DM
Transform indicator: DM_FRATE_TOT, from dataflow: DM
Transform indicator: NT_BW_LBW, from dataflow: NUTRITION
Transform indicator: CME_MRY0, from dataflow: CME
Transform indicator: CME_MRY0T4, from dataflow: CME
Transform indicator: CME_MRM0, from dataflow: CME
Transform indicator: MNCH_MMR, from dataflow: MNCH
Transform indicator: HVA_PMTCT_ARV_CVG, from dataflow: HIV_AIDS
Transform indicator: MNCH_PNCMOM, from dataflow: MNCH
Transform indicator: MNCH_PNEUCARE, from dataflow: MNCH
Transform indicator: IM_DTP3, from dataflow: IMMUNISATION
Transform indicator: IM_MCV2, from dataflow: IMMUNISATION
Transform indicator: NT_BF_EIBF, from dataflow: NUTRITION
Transform indicator: NT_BF_EXBF, from dataflow: NUTRITION
Transform indicator: NT_CF_MAD, from dataflow: NUTRITION
Transform indicator: NT_ANT_WHZ_PO2, from dataflow: NUTRITION
Transform indicator: NT_ANT_HAZ_NE2, from dataflow: NUTRITION


In [16]:
# All transformations done: add the destination DSD name to the destination dataframe
dest_dsd_df['Dataflow'] = dataflow_out

# Folder and base name (without extension) of the file to load into the warehouse.
# The trailing '/' matters: the path is built by plain string concatenation.
load_path = './data_2_load/'
load_file = 'TMEE_2_load'

# `to_csv` does not create missing folders, so make sure the target exists first.
os.makedirs(load_path, exist_ok=True)
# save file (the dataframe index is not written)
dest_dsd_df.to_csv(f"{load_path}{load_file}.csv", index=False)