<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Preliminaries

## Import packages

In [1]:
# Required standard libraries
import pandas as pd
import json
import urllib
import requests
import os
import re
import numpy as np
import bs4 as bs
import selenium
import html5lib
import nltk
import datetime
from selenium import webdriver

# Extractors (cluster specific)
import extract
# from extract.unesco_extractor import extract_unesco_api_data
# from extract.ilo_extractor import extract_ilo_api_data
# from extract.sdg_extractor import extract_sdg_api_data
# from extract.who_extractor import extract_who_api_data
# from extract.un_treaty_extractor import extract_un_treaties_data
# from extract.ilo_normlex_extractor import extract_ilo_normlex_data

# from extract import save_raw_data

# Cleansers (cluster specific)
import cleanse
# from cleanse.unesco_cleanser import cleanse_unesco_api_data
# from cleanse.ilo_cleanser import cleanse_ilo_api_data
# from cleanse.sdg_cleanser import cleanse_sdg_api_data
# from cleanse.who_cleanser import cleanse_who_api_num_data
# from cleanse.un_treaty_cleanser import cleanse_un_treaty_data
# from cleanse.wpac_cleanser import cleanse_wpac_data

# from cleanse.save_cleansed_data import save_cleansed_data 

# Normalizer (generalised across all clusters)
from normalize import scaler
# from normalize import save_normalized_data

# Utils
from utils import utils

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Define filepaths

In [2]:
# Define the export path for all data exports
from pathlib import Path

# CUrrent working directory
cwd = Path('.')

# Folder with data-in artifacts, quired to run this script
data_in = cwd / 'data_in'

# Folder to export raw data
data_sources_raw = cwd / 'data_out' / 'data_raw'
data_sources_raw.mkdir(parents=True, exist_ok=True)

# Folder to export cleansed data
data_sources_cleansed = cwd / 'data_out' / 'data_cleansed'
data_sources_cleansed.mkdir(parents=True, exist_ok=True)

# Folder to export normalized data
data_sources_normalized = cwd / 'data_out' / 'data_normalized'
data_sources_normalized.mkdir(parents=True, exist_ok=True)

## Load country list and mapping dictionary

In [3]:
# Load the list of countries which contains all different variations of country names 
country_full_list = pd.read_excel(
    data_in / 'all_countrynames_list.xlsx',
    keep_default_na = False).drop_duplicates()

# Create a version of the list with unique ISO2 and ISO3 codes
country_iso_list = country_full_list.drop_duplicates(subset = 'COUNTRY_ISO_2')

# Country CRBA list, this is the list of the countries that should be in the final CRBA indicator list
country_crba_list = pd.read_excel(
    data_in / 'crba_country_list.xlsx',
    header = None,
    usecols = [0, 1], 
    names = ['COUNTRY_ISO_3', 'COUNTRY_NAME']).merge(
        right = country_iso_list[['COUNTRY_ISO_2', 'COUNTRY_ISO_3']],
        how = 'left',
        on='COUNTRY_ISO_3',
        validate = 'one_to_one')

# Run the column mapper script to load the mapping dictionary
with open(data_in / 'column_mapping.py') as file:
    exec(file.read())

# Run the column mapper script to load the mapping dictionary
with open(data_in / 'value_mapping.py') as file:
    exec(file.read())

## Read data dictionary

In [4]:
# sources sheet
crba_data_dictionary_source = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx',
    sheet_name = "Source",
    keep_default_na = False
)

# snapshot sheet
crba_data_dictionary_snapshot = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx',
    sheet_name = "Snapshot",
    keep_default_na = False
)

# indicator sheet
crba_data_dictionary_indicator = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx',
    sheet_name = "Indicator",
    keep_default_na = False
)

# Input lists
crba_data_dictionary_input_list = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx',
    sheet_name = "Input_Lists",
    keep_default_na = False
)

# Add 2-digit shortcodes of index, issue and category to indicators sheet
crba_data_dictionary_indicator = crba_data_dictionary_indicator.merge(
    right=crba_data_dictionary_input_list[['INDEX', 'INDEX_CODE']],
    left_on='INDEX',
    right_on='INDEX'
).merge(
    right=crba_data_dictionary_input_list[['ISSUE', 'ISSUE_CODE']],
    left_on='ISSUE',
    right_on='ISSUE'
).merge(
    right=crba_data_dictionary_input_list[['CATEGORY', 'CATEGORY_CODE']],
    left_on='CATEGORY',
    right_on='CATEGORY'
)

# Create indicator code prefix (INDEX-ISSUE_CAEGORY CODE)
crba_data_dictionary_indicator = crba_data_dictionary_indicator.assign(
    INDICATOR_CODE_PREFIX = crba_data_dictionary_indicator.INDEX_CODE +
    "_" +
    crba_data_dictionary_indicator.ISSUE_CODE+
    "_"+
    crba_data_dictionary_indicator.CATEGORY_CODE+
    "_")

# Create indicator code
crba_data_dictionary_indicator = crba_data_dictionary_indicator.assign(
    INDICATOR_CODE = crba_data_dictionary_indicator.INDICATOR_CODE_PREFIX + crba_data_dictionary_indicator.INDICATOR_NAME.apply(
    lambda x: utils.create_ind_code(x)
))

In [5]:
import importlib, inspect

extractors = { 
    cls.type: cls for name, cls in inspect.getmembers(
        importlib.import_module("extract"), 
        inspect.isclass
    ) if hasattr(cls, 'type')
}

# Extract (JAMES LOOK INTO THIS CODE BIT)
## API sources
### CSV API sources

In [5]:
# CSV sources
api_sources = crba_data_dictionary_source[
    (crba_data_dictionary_source["SOURCE_TYPE"] == "API (ILO)") | 
    (crba_data_dictionary_source["SOURCE_TYPE"] == "API (UNESCO)") | 
    (crba_data_dictionary_source["SOURCE_TYPE"] == "API (WHO)")
].merge(
    right = crba_data_dictionary_snapshot,
    on = "SOURCE_ID"
).merge(
    right = crba_data_dictionary_indicator,
    on = 'INDICATOR_ID'
)

# define emty dataframe
combined_cleansed_csv = pd.DataFrame()

# Loop to extract data from API sources
for index, row in api_sources.iterrows():
    # Log
    print("\n - - - - - \n Extracting source {} \n".format(row["SOURCE_ID"]))
    
    # Extraction section
    try:
        # Extract data
        dataframe = extract.CSVExtractor.extract(url = row["ENDPOINT_URL"])
        
        # Save raw data
        dataframe.to_csv(
            data_sources_raw / str(row["SOURCE_ID"] + "_raw.csv"),
            sep = ";"
            )
    
    except:
       print("There was a problem with extraction of source {} \n".format(row["SOURCE_ID"]))
    
    # Log that we are entering cleasning
    print("\n - - - - - \n Cleansing source {} \n".format(row["SOURCE_ID"]))
    
    # Cleansing section
    try: 
        print(row["VALUE_LABELS"])
        dataframe_cleansed = cleanse.Cleanser().cleanse(
            raw_data = dataframe,
            mapping_dictionary = mapping_dict,
            final_sdmx_col_list = sdmx_df_columns_all,
            dim_cols = sdmx_df_columns_dims,
            country_cols = sdmx_df_columns_country,
            time_cols = sdmx_df_columns_time,
            country_list_full = country_full_list,
            crba_country_list = country_crba_list,
            variable_type = row["VALUE_LABELS"]
            )

        # Map column values
        dataframe_cleansed = cleanse.Cleanser().map_values(
            cleansed_data = dataframe_cleansed,
            value_mapping_dict = value_mapper)

        # Add columns
        # Indicator name
        dataframe_cleansed["INDICATOR_NAME"] = row["INDICATOR_NAME_x"]

        # Index name
        dataframe_cleansed["INDICATOR_INDEX"] = row["INDEX"]

        # Issue name
        dataframe_cleansed["INDICATOR_ISSUE"] = row["ISSUE"]

        # Category name
        dataframe_cleansed["INDICATOR_CATEGORY"] = row["CATEGORY"]

        # YEAR_CRBA_RELEASE with current year
        dataframe_cleansed["CRBA_RELEASE_YEAR"] = datetime.datetime.now().year

        # Create column indicator code
        dataframe_cleansed["INDICATOR_CODE"] = row["INDICATOR_CODE"]

        # Save cleansed data
        dataframe_cleansed.to_csv(
            data_sources_cleansed / str(row["SOURCE_ID"] + "_cleansed.csv"),
            sep = ";")

    # Log if there is an error
    except:
        print("There was a problem with cleansing of source {}".format(row["SOURCE_ID"]))
    
    
    # Normalizing section
    #try:
    dataframe_normalized = scaler.normalizer(
        cleansed_data = dataframe_cleansed,
        sql_subset_query_string=row["DIMENSION_VALUES_NORMALIZATION"],
        variable_type = row["VALUE_LABELS"],
        is_inverted = row["INVERT_NORMALIZATION"],
        whisker_factor=1.5,
        raw_data_col="RAW_OBS_VALUE"
        )
    #except:
    #    print("There is an issue with normalizing this source")
    
    
    # Append dataframe to combined dataframe
    combined_cleansed_csv = combined_cleansed_csv.append(
        other = dataframe_cleansed
    )


 - - - - - 
 Extracting source S-51 

The following columns are present in the datasets, and this is the number of unique values they have. 
The column DATAFLOW has 1 unique values.
The column COLLECTION has 1 unique values.
The column REF_AREA has 112 unique values.
The column FREQ has 1 unique values.
The column MEASURE has 1 unique values.
The column SEX has 3 unique values.
The column TIME_PERIOD has 10 unique values.
The column OBS_VALUE has 763 unique values.
The column OBS_STATUS has 2 unique values.
The column UNIT_MEASURE_TYPE has 1 unique values.
The column UNIT_MEASURE has 1 unique values.
The column UNIT_MULT has 1 unique values.
The column SOURCE_NOTE has 47 unique values.
The column INDICATOR_NOTE has 48 unique values.
The column CLASSIFICATION_NOTE has 1 unique values.
The column CURRENCY_NOTE has 1 unique values.
The column DECIMALS has 1 unique values.
The column UPPER_BOUND has 1 unique values.
The column LOWER_BOUND has 1 unique values.

 - - - - - 
 Cleansing sourc

KeyError: 'SCALED_OBS_VALUE'

### JSON API sources

In [8]:
# JSON sources
api_sources = crba_data_dictionary_source[
    (crba_data_dictionary_source["SOURCE_TYPE"] == "API (SDG)")
].merge(
    right = crba_data_dictionary_snapshot,
    on = "SOURCE_ID"
).merge(
    right = crba_data_dictionary_indicator,
    on = 'INDICATOR_ID'
)

# Loop to extract data from API sources
for index, row in api_sources.iterrows():
    print("\n - - - - - \n Extracting source {} \n".format(row["SOURCE_ID"]))
    
    # Exraction section
    try:
        dataframe = extract.JSONExtractor.extract(url = row["ENDPOINT_URL"])
        dataframe.to_csv(
            data_sources_raw / str(row["SOURCE_ID"] + "_raw.csv"),
            sep = ";")
    except:
        print("There was an issue with source {}".format(row["SOURCE_ID"]))

    # Log that we are entering cleasning
    print("\n - - - - - \n Cleansing source {} \n".format(row["SOURCE_ID"]))
    
    # Cleansing section 
    try:
        dataframe_cleansed = cleanse.Cleanser().cleanse(
            raw_data = dataframe,
            mapping_dictionary = mapping_dict,
            final_sdmx_col_list = sdmx_df_columns_all,
            dim_cols = sdmx_df_columns_dims,
            country_cols = sdmx_df_columns_country,
            time_cols = sdmx_df_columns_time,
            country_list_full = country_full_list,
            crba_country_list = country_crba_list,
            variable_type=row["VALUE_LABELS"]
            )
        
        # Map column values
        dataframe_cleansed = cleanse.Cleanser().map_values(
            cleansed_data = dataframe_cleansed,
            value_mapping_dict = value_mapper)

        # Add additional columns
        # Indicator name
        dataframe_cleansed["INDICATOR_NAME"] = row["INDICATOR_NAME_x"]

        # Index name
        dataframe_cleansed["INDICATOR_INDEX"] = row["INDEX"]

        # Issue name
        dataframe_cleansed["INDICATOR_ISSUE"] = row["ISSUE"]

        # Category name
        dataframe_cleansed["INDICATOR_CATEGORY"] = row["CATEGORY"]

        # YEAR_CRBA_RELEASE with current year
        dataframe_cleansed["CRBA_RELEASE_YEAR"] = datetime.datetime.now().year

        # Create column indicator code
        dataframe_cleansed["INDICATOR_CODE"] = row["INDICATOR_CODE"]

        # Save dataframe
        dataframe_cleansed.to_csv(
            data_sources_cleansed / str(row["SOURCE_ID"] + "_cleansed.csv"),
            sep = ";")
    
    except:
       print("There was an issue with cleansing of source {}".format(row["SOURCE_ID"]))

    """
    # Normalizing section
    try:
        dataframe_normalized = normalizer(
            cleansed_data = dataframe_cleansed,
            cat_var = row["VALUE_LABELS"],
            is_inverted = row["INVERT_NORMALIZATION"],
            row["DIMENSION_VALUES_NORMALIZATION"] 
        )

    except:
        print("There was an issue with normalizing this source")
    """
    
    # Append dataframe to combined dataframe
    combined_cleansed_csv = combined_cleansed_csv.append(
        other = dataframe_cleansed
    )

# TO DO: Also include JSON and HTML as extractor --> No other way to put it into the loop than eval()?


 - - - - - 
 Extracting source S-23 

The following columns are present in the datasets, and this is the number of unique values they have. 
The column goal has 1 unique values.
The column target has 1 unique values.
The column indicator has 1 unique values.
The column series has 1 unique values.
The column seriesDescription has 1 unique values.
The column seriesCount has 1 unique values.
The column geoAreaCode has 86 unique values.
The column geoAreaName has 86 unique values.
The column timePeriodStart has 20 unique values.
The column value has 3190 unique values.
The column valueType has 1 unique values.
The column time_detail has 1 unique values.
The column timeCoverage has 1 unique values.
The column upperBound has 1 unique values.
The column lowerBound has 1 unique values.
The column basePeriod has 1 unique values.
The column source has 51 unique values.
The column geoInfoUrl has 1 unique values.
The column footnotes has 17 unique values.
The column attributes.Nature has 2 unique

### Export the combined dataframe from CSV and JSON loop

In [9]:
# Idenify all dimension columns in combined dataframe
available_dim_cols = []
for col in combined_cleansed_csv.columns:
    dim_col = re.findall("DIM_.+", col)
    # print(dim_col)
    if len(dim_col) == 1:
        available_dim_cols += dim_col

# Fill _T for all NA values of dimension columns
# 5b Fill in current year for time variable
combined_cleansed_csv[available_dim_cols] = combined_cleansed_csv[
    available_dim_cols
].fillna(value="_T")

# Export combined cleansed dataframe as a sample
combined_cleansed_csv.to_csv(
    path_or_buf = cwd / 'data_out' / 'combined_cleansed.csv',
    sep = ";"
)

# DEVELOPMENT AND TRASH AREA

## integrate query rather than eval() in the function

In [19]:
s103_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-103_cleansed.csv",
    sep = ";"
)

s103_cleansed.query("DIM_SEX == 'BOTH_SEXES' & DIM_AGE_GROUP == '5-19 YEARS'")

Unnamed: 0.1,Unnamed: 0,TIME_PERIOD,COUNTRY_ISO_3,DIM_AGE_GROUP,DIM_SEX,RAW_OBS_VALUE,ATTR_SOURCE_COMMENTS,_merge,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE
0,0,2016.0,AFG,5-19 YEARS,BOTH_SEXES,9.4,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
17,17,2016.0,ALB,5-19 YEARS,BOTH_SEXES,25.0,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
18,18,2016.0,AND,5-19 YEARS,BOTH_SEXES,35.8,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
27,27,2016.0,DZA,5-19 YEARS,BOTH_SEXES,31.0,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
36,36,2016.0,AGO,5-19 YEARS,BOTH_SEXES,11.0,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,1686,2016.0,VEN,5-19 YEARS,BOTH_SEXES,34.1,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
1695,1695,2016.0,VNM,5-19 YEARS,BOTH_SEXES,9.7,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
1704,1704,2016.0,YEM,5-19 YEARS,BOTH_SEXES,20.0,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO
1713,1713,2016.0,ZMB,5-19 YEARS,BOTH_SEXES,12.7,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO


In [28]:
def normalizer(
    cleansed_data,
    sql_subset_query_string,
    variable_type="Continuous variable",
    is_inverted="not inverted",
    whisker_factor=1.5,
    raw_data_col="RAW_OBS_VALUE",
):
    """Normalize the RAW_OBS_VALUES into indicator scores

    TO DO

    Parameters:
    TO DO
    **dimensions (mapping type): Define the present dimension variables as keys
    along with the dimension value that is supposed to be taken for the normalization.

    """

    # Define the dimension subgroup for which normalization is done:
    # normalization_subset = cleansed_data

    # Empty string which will be filled with subset conditions
    # subset = ""

    # Run loop to get dimensions vaues specified in **dimensions
    # for key in dimensions:
    #    subset += "(cleansed_data['{}'] == '{}')&".format(key, dimensions[key])

    # Get rid of the "&-sign" at the end
    # subset = subset.rstrip("& ")

    # Subset the actual dataframe
    #print(sql_subset_query_string)
    cleansed_data_subset = cleansed_data.query(sql_subset_query_string)
    #print(cleansed_data.query('{}'.format(sql_subset_query_string)))

    if variable_type != "Continuous variable":
        print("\n Categorical variable, still have to develop this section")

    elif variable_type == "Continuous variable":

        # Determine basic descriptive statistics of the distribution that are required for the normalization
        min_val = np.nanmin(cleansed_data_subset[raw_data_col].astype("float"))
        max_val = np.nanmax(cleansed_data_subset[raw_data_col].astype("float"))
        q1 = cleansed_data_subset[raw_data_col].astype("float").quantile(q=0.25)
        q2 = cleansed_data_subset[raw_data_col].astype("float").quantile(q=0.50)
        q3 = cleansed_data_subset[raw_data_col].astype("float").quantile(q=0.75)
        iqr = q3 - q1

        # Define what max value to use for the normalization
        if max_val > q3 + whisker_factor * iqr:
            max_to_use = q3 + whisker_factor * iqr
            print(
                "The distribution of the raw data values this subgroup contains outliers or is too skewed on the upper end. The maximum value to be used for the normalisation is: 3rd quartile or distribution + {} * IQR. It is: {} \n See histogram printed below for info. \n".format(
                    whisker_factor, max_to_use
                )
            )
        else:
            max_to_use = max_val
            print(
                "The distribution of the raw data for this subgroup does not contain outliers on the upper end. It is also not too skewed on the upper end. The maximum value used for the normalisation is the maximum value in the dataset, which is {}. This value corresponds to country: {} \n".format(
                    max_to_use,
                    cleansed_data_subset[
                        cleansed_data_subset[raw_data_col].astype("float") == max_val
                    ],
                )
            )

        # Define what min value to use for the normalization
        if min_val < q1 - whisker_factor * iqr:
            min_to_use = q1 - whisker_factor * iqr
            print(
                "The distribution of the raw data values for this subgroup contains outliers or is too skewed on the lower end. The minimum value to be used for the normalisation is 1st quartile or distribution - {} * IQR. It is: {} \n See histogram printed below for info. \n".format(
                    whisker_factor, min_to_use
                )
            )
        else:
            min_to_use = min_val
            print(
                "The distribution of the raw data for this subgroup does not contain outliers or is too skewed on the lower end. The minimum value used for the normalisation is the minimum value in the dataset, which is {}. This value corresponds to country: {} \n".format(
                    min_to_use,
                    cleansed_data_subset[
                        cleansed_data_subset[raw_data_col].astype("float") == min_val
                    ],
                )
            )

        """
        # If there are outliers or a skewed distribution, print the distribution for the user.
        if (min_val < q1 - whisker_factor * iqr) or (
            max_val > q3 + whisker_factor * iqr
        ):
            print(
                "\n This is the distribution of the raw data of the indicator."
            )
            print(
                pd.to_numeric(s55_cleansed["RAW_OBS_VALUE"]).hist(
                    bins=30
                )
            )"""

        # Define the value range that is used for the scaling (normalization)
        tot_range = max_val - min_val

        # Compute the normalized value of the raw data in the column "SCALED"
        # Distinguish between indicators, whose value must be inverted
        if is_inverted == "inverted":
            cleansed_data_subset["SCALED_OBS_VALUE"] = round(
                10
                - 10
                * (cleansed_data_subset[raw_data_col].astype("float") - min_val)
                / tot_range,
                2,
            )
        else:
            cleansed_data_subset["SCALED_OBS_VALUE"] = round(
                10
                * (cleansed_data_subset[raw_data_col].astype("float") - min_val)
                / tot_range,
                2,
            )

    # join normalized data to original dataframe
    cleansed_data = cleansed_data.merge(right=cleansed_data_subset, how="outer")

    # cleansed_data = pd.concat([cleansed_data, cleansed_data_subset], axis=1, copy = False)

    # insert column to indicate OBS status
    result = cleansed_data.assign(
        OBS_STATUS=np.where(cleansed_data["SCALED_OBS_VALUE"].isnull(), np.nan, "O")
    )

    # Return result
    return result

s103_normalized = normalizer(
    cleansed_data = s103_cleansed,
    sql_subset_query_string="DIM_SEX=='BOTH_SEXES' & DIM_AGE_GROUP=='5-19 YEARS'"
)

s103_normalized

The distribution of the raw data values this subgroup contains outliers or is too skewed on the upper end. The maximum value to be used for the normalisation is: 3rd quartile or distribution + 1.5 * IQR. It is: 56.437499999999986 
 See histogram printed below for info. 

The distribution of the raw data for this subgroup does not contain outliers or is too skewed on the lower end. The minimum value used for the normalisation is the minimum value in the dataset, which is 6.8. This value corresponds to country:      Unnamed: 0  TIME_PERIOD COUNTRY_ISO_3 DIM_AGE_GROUP     DIM_SEX  \
684         684       2016.0           IND    5-19 YEARS  BOTH_SEXES   

     RAW_OBS_VALUE ATTR_SOURCE_COMMENTS _merge  \
684            6.8                  NaN   both   

                              INDICATOR_NAME INDICATOR_INDEX  \
684  Older children and teenagers overweight     Marketplace   

               INDICATOR_ISSUE INDICATOR_CATEGORY  CRBA_RELEASE_YEAR  \
684  Marketing and Advertising        

Unnamed: 0.1,Unnamed: 0,TIME_PERIOD,COUNTRY_ISO_3,DIM_AGE_GROUP,DIM_SEX,RAW_OBS_VALUE,ATTR_SOURCE_COMMENTS,_merge,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE,SCALED_OBS_VALUE,OBS_STATUS
0,0,2016.0,AFG,5-19 YEARS,BOTH_SEXES,9.4,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,0.45,O
1,1,2016.0,AFG,5-09 YEARS,BOTH_SEXES,10.6,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
2,2,2016.0,AFG,10-19 YEARS,BOTH_SEXES,8.8,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
3,3,2016.0,AFG,5-19 YEARS,FEMALE,9.9,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
4,4,2016.0,AFG,5-09 YEARS,FEMALE,10.7,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,1726,2016.0,ZWE,5-09 YEARS,FEMALE,23.6,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
1727,1727,2016.0,ZWE,10-19 YEARS,FEMALE,21.7,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
1728,1728,2016.0,ZWE,5-19 YEARS,MALE,6.6,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,
1729,1729,2016.0,ZWE,5-09 YEARS,MALE,7.5,,both,Older children and teenagers overweight,Marketplace,Marketing and Advertising,Outcome,2020,MP_MA_OC_OLCHTO,,


In [10]:
s82_raw = pd.read_csv(
    filepath_or_buffer = data_sources_raw / "S-82_raw.csv",
    sep = ";"
)

if "Display Value" in s82_raw.columns:
    s82_raw["Display Value"] = s82_raw["Display Value"].astype(str)
    s82_raw["Display Value"] = s82_raw["Display Value"].apply(
            lambda x: re.sub(" \[.*\]", "", x)
    )

"""
s82_cleansed = cleanse.Cleanser().cleanse(
    raw_data = s82_raw,
    mapping_dictionary = mapping_dict,
    final_sdmx_col_list = sdmx_df_columns_all,
    dim_cols = sdmx_df_columns_dims,
    country_cols = sdmx_df_columns_country,
    time_cols = sdmx_df_columns_time,
    country_list_full = country_full_list,
    crba_country_list = country_crba_list
    )
"""


s82_raw

Unnamed: 0.1,Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,Display Value,Numeric,Low,High,Comments
0,0,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,EMR,OMN,Limited,,,,
1,1,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,AMR,PER,Larger scale,,,,
2,2,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,EUR,PRT,Larger scale,,,,
3,3,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,EUR,ROU,Larger scale,,,,
4,4,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,WPR,WSM,Larger scale,,,,
...,...,...,...,...,...,...,...,...,...,...,...
127,127,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,AFR,BDI,Limited,,,,
128,128,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,AFR,BWA,Limited,,,,
129,129,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,AMR,BOL,Limited,,,,
130,130,VIOLENCE_EXTENTIMP_CHILDPROTECTION,PUBLISHED,2012-2014,EUR,BLR,Larger scale,,,,


In [60]:
s88_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-88_cleansed.csv",
    sep = ";"
)

norm_values = ["No", "Yes", "No data"]
contains_zeros = False

conditions = [
    (s88_cleansed["RAW_OBS_VALUE"] == index)
    for index, value in enumerate(norm_values, start=0 if contains_zeros else 1)
]

# s88_cleansed["RAW_OBS_VALUE"] == index
# print(index)

# conditions.size_of

conditions = 

--> Get unique values (not necessary, just do manually and then insert it)
--> Then do a mapping ("No" = 0, "Yes" = 1) --> This must be indicator specific --> CLEANSER, INFO FOR THAT IN THE data dictionary --> Create two columns "RAW_VALUES" and "ENCODEDE VALUES"
--> Then convert these into scores --> SCLAER (Global and )

In [62]:
# dir(conditions)
#conditions.size_of()
# conditions[1].sum()
enumerate(norm_values, start=0 if contains_zeros else 1)[0]

TypeError: 'enumerate' object is not subscriptable

In [24]:


"""
conditions = [
    (cleansed_data[indicator_raw_value] == index)
    for index, value in enumerate(norm_values, start=0 if contains_zeros else 1)
]

# create a new column and assign values to it using our lists
cleansed_data["SCALED"] = np.select(conditions, norm_values)

# Right join country list
cleansed_data_full = cleansed_data.merge(
    right=crba_final_country_list,
    how="right",
    left_on=cleansed_df_iso2_col,
    right_on=crba_final_country_list_iso_col,
    indicator="RJ_CRBA_FULL_LIST",
)
"""

'\nconditions = [\n    (cleansed_data[indicator_raw_value] == index)\n    for index, value in enumerate(norm_values, start=0 if contains_zeros else 1)\n]\n\n# create a new column and assign values to it using our lists\ncleansed_data["SCALED"] = np.select(conditions, norm_values)\n\n# Right join country list\ncleansed_data_full = cleansed_data.merge(\n    right=crba_final_country_list,\n    how="right",\n    left_on=cleansed_df_iso2_col,\n    right_on=crba_final_country_list_iso_col,\n    indicator="RJ_CRBA_FULL_LIST",\n)\n'

In [47]:
def test_func(**kwargs):
    for key in kwargs:
        print(key, kwargs[key])

test_func(a="Real", b="Python", c="Is", d="Great", e="!")

a Real
b Python
c Is
d Great
e !


In [67]:
# s55_cleansed.SCALED_OBS_VALUE.describe()

# s55_cleansed[s55_cleansed.SCALED_OBS_VALUE > 9.2][["COUNTRY_ISO_2", "RAW_OBS_VALUE", "SCALED_OBS_VALUE"]]

### Develop normalier function

In [107]:
# # # # # # # # # 

s103_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-103_cleansed.csv",
    sep = ";"
)

print(s103_cleansed["RAW_OBS_VALUE"][10])
print(s103_cleansed["RAW_OBS_VALUE"].isnull().sum())



# print(s103_cleansed["RAW_OBS_VALUE"].dtypes== object) 

if s103_cleansed["RAW_OBS_VALUE"].dtypes== object:
    #s103_cleansed = s103_cleansed[s103_cleansed["RAW_OBS_VALUE"].dropna()]
    s103_cleansed["RAW_OBS_VALUE"] = s103_cleansed["RAW_OBS_VALUE"].astype(str)
    s103_cleansed["RAW_OBS_VALUE"] = pd.to_numeric(s103_cleansed["RAW_OBS_VALUE"].apply(
          lambda x: re.sub(
                "No data", "", re.sub(
                         " \[.*\]", '', x
                    ))),
            errors = "coerce"
    )  

    # s103_cleansed = s103_cleansed["RAW_OBS_VALUE"].apply(
    #      lambda x: re.sub(
    #         " \[.*\]", "", re.sub(
    #            "No data", None, x))) 

s103_cleansed["RAW_OBS_VALUE"].describe()




23.8 [18.1-29.9]
3


count    1692.000000
mean       24.318617
std        11.999569
min         4.300000
25%        14.800000
50%        24.800000
75%        31.225000
max        70.400000
Name: RAW_OBS_VALUE, dtype: float64

In [82]:


s55_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-55_cleansed.csv",
    sep = ";"
)

def normalizer(
    cleansed_data,
    cat_var = False,
    inverted = False,
    whisker_factor = 1.5,
    raw_data_col = "RAW_OBS_VALUE",
    **dimensions
):
    """Normalize the RAW_OBS_VALUES into indicator scores

    TO DO

    Parameters: 
    TO DO
    **dimensions (mapping type): Define the present dimension variables as keys
    along with the dimension value that is supposed to be taken for the normalization.

    """

    # Define the dimension subgroup for which normalization is done:
    # normalization_subset = cleansed_data

    # Empty string which will be filled with subset conditions
    subset = ""

    # Run loop to get dimensions vaues specified in **dimensions
    for key in dimensions:
        subset += "(cleansed_data['{}'] == '{}')&".format(
            key,
            dimensions[key]
        )
    
    # Get rid of the "&-sign" at the end
    subset = subset.rstrip("& ")

    # Subset the actual dataframe
    cleansed_data_subset = cleansed_data[eval(subset)]

    if cat_var == False: 
        # Determine basic descriptive statistics of the distribution that are required for the normalization
        min_val = np.nanmin(
            cleansed_data_subset[raw_data_col].astype("float")
        )
        max_val = np.nanmax(
            cleansed_data_subset[raw_data_col].astype("float")
        )
        q1 = (
            cleansed_data_subset[raw_data_col]
            .astype("float")
            .quantile(q=0.25)
        )
        q2 = (
            cleansed_data_subset[raw_data_col]
            .astype("float")
            .quantile(q=0.50)
        )
        q3 = (
            cleansed_data_subset[raw_data_col]
            .astype("float")
            .quantile(q=0.75)
        )
        iqr = q3 - q1

        # Define what max value to use for the normalization
        if max_val > q3 + whisker_factor * iqr:
            max_to_use = q3 + whisker_factor * iqr
            print(
                "The distribution of the raw data values this subgroup contains outliers or is too skewed on the upper end. The maximum value to be used for the normalisation is: 3rd quartile or distribution + {} * IQR. It is: {} \n See histogram printed below for info. \n".format(
                    whisker_factor, max_to_use
                )
            )
        else:
            max_to_use = max_val
            print(
                "The distribution of the raw data for this subgroup does not contain outliers on the upper end. It is also not too skewed on the upper end. The maximum value used for the normalisation is the maximum value in the dataset, which is {}. This value corresponds to country: {} \n".format(
                    max_to_use,
                    cleansed_data_subset[
                        cleansed_data_subset[raw_data_col].astype("float")
                        == max_val
                    ],
                )
            )

        # Define what min value to use for the normalization
        if min_val < q1 - whisker_factor * iqr:
            min_to_use = q1 - whisker_factor * iqr
            print(
                "The distribution of the raw data values for this subgroup contains outliers or is too skewed on the lower end. The minimum value to be used for the normalisation is 1st quartile or distribution - {} * IQR. It is: {} \n See histogram printed below for info. \n".format(
                    whisker_factor, min_to_use
                )
            )
        else:
            min_to_use = min_val
            print(
                "The distribution of the raw data for this subgroup does not contain outliers or is too skewed on the lower end. The minimum value used for the normalisation is the minimum value in the dataset, which is {}. This value corresponds to country: {} \n".format(
                    min_to_use,
                    cleansed_data_subset[
                        cleansed_data_subset[raw_data_col].astype("float")
                        == min_val
                    ],
                )
            )

        """
        # If there are outliers or a skewed distribution, print the distribution for the user.
        if (min_val < q1 - whisker_factor * iqr) or (
            max_val > q3 + whisker_factor * iqr
        ):
            print(
                "\n This is the distribution of the raw data of the indicator."
            )
            print(
                pd.to_numeric(s55_cleansed["RAW_OBS_VALUE"]).hist(
                    bins=30
                )
            )"""

        # Define the value range that is used for the scaling (normalization)
        tot_range = max_val - min_val

        # Compute the normalized value of the raw data in the column "SCALED"
        # Distinguish between indicators, whose value must be inverted
        if inverted == True:
            cleansed_data_subset["SCALED_OBS_VALUE"] = round(
                10
                - 10
                * (
                    cleansed_data_subset[raw_data_col].astype("float")
                    - min_val
                )
                / tot_range,
                2,
            )
        else:
            cleansed_data_subset["SCALED_OBS_VALUE"] = round(
                10
                * (
                    cleansed_data_subset[raw_data_col].astype("float")
                    - min_val
                )
                / tot_range,
                2,
            )
    
    # join normalized data to original dataframe
    cleansed_data = cleansed_data.merge(
        right=cleansed_data_subset,
        how="outer"
    )
    

    # cleansed_data = pd.concat([cleansed_data, cleansed_data_subset], axis=1, copy = False)

    # insert column to indicate OBS status
    result = cleansed_data.assign(
        OBS_STATUS=np.where(
            cleansed_data["SCALED_OBS_VALUE"].isnull(), np.nan, "O"
        ))

    # Return result
    return result

s55_normalized = normalizer(
    cleansed_data=s55_cleansed,
    DIM_SEX = "BOTH_SEXES",
    DIM_EDU_LEVEL = "LOWER SECONDARY EDUCATION",
    DIM_AGE = "SCHOOL_AGE_POPULATION"
)

print(s55_cleansed.shape)
print(s55_normalized.shape)
print(s55_normalized.columns)
s55_normalized


SyntaxError: invalid syntax (<ipython-input-82-6e04ca9c2250>, line 170)

In [31]:
s55_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-55_cleansed.csv",
    sep = ";"
)

s55_cleansed_scaler = s55_cleansed[(s55_cleansed.DIM_EDU_LEVEL == "LOWER SECONDARY EDUCATION") &
    (s55_cleansed.DIM_AGE == "SCHOOL_AGE_POPULATION") &
    (s55_cleansed.DIM_SEX == "BOTH_SEXES")]

whisker_factor = 1.5
inverted = True
numeric 

# iii) Determine basic descriptive statistics of the distribution that are required for the normalization
min_val = np.nanmin(
    s55_cleansed["RAW_OBS_VALUE"].astype("float")
)
max_val = np.nanmax(
    s55_cleansed["RAW_OBS_VALUE"].astype("float")
)
q1 = (
    s55_cleansed["RAW_OBS_VALUE"]
    .astype("float")
    .quantile(q=0.25)
)
q2 = (
    s55_cleansed["RAW_OBS_VALUE"]
    .astype("float")
    .quantile(q=0.50)
)
q3 = (
    s55_cleansed["RAW_OBS_VALUE"]
    .astype("float")
    .quantile(q=0.75)
)
iqr = q3 - q1

# Define what max value to use for the normalization
if max_val > q3 + whisker_factor * iqr:
    max_to_use = q3 + whisker_factor * iqr
    print(
        "The distribution of the raw data values this subgroup contains outliers or is too skewed on the upper end. The maximum value to be used for the normalisation is: 3rd quartile or distribution + {} * IQR. It is: {} \n See histogram printed below for info. \n".format(
            whisker_factor, max_to_use
        )
    )
else:
    max_to_use = max_val
    """
    print(
        "The distribution of the raw data for this subgroup does not contain outliers or is too skewed on the upper end. The maximum value used for the normalisation is the maximum value in the dataset, which is {}. This value corresponds to country: {} \n".format(
            max_to_use,
            s55_cleansed[
                s55_cleansed["RAW_OBS_VALUE"].astype("float")
                == max_val
            ].COUNTRY_NAME,
        )
    )"""

# Define what min value to use for the normalization
if min_val < q1 - whisker_factor * iqr:
    min_to_use = q1 - whisker_factor * iqr
    print(
        "The distribution of the raw data values for this subgroup contains outliers or is too skewed on the lower end. The minimum value to be used for the normalisation is 1st quartile or distribution - {} * IQR. It is: {} \n See histogram printed below for info. \n".format(
            whisker_factor, min_to_use
        )
    )
else:
    min_to_use = min_val
    """
    print(
        "The distribution of the raw data for this subgroup does not contain outliers or is too skewed on the lower end. The minimum value used for the normalisation is the minimum value in the dataset, which is {}. This value corresponds to country: {} \n".format(
            min_to_use,
            cleansed_data[
                cleansed_data[indicator_raw_value].astype("float")
                == min_val
            ].COUNTRY_NAME,
        )
    )

# If there are outliers or a skewed distribution, print the distribution for the user.
if (min_val < q1 - whisker_factor * iqr) or (
    max_val > q3 + whisker_factor * iqr
):
    print(
        "\n This is the distribution of the raw data of the indicator."
    )
    print(
        pd.to_numeric(s55_cleansed["RAW_OBS_VALUE"]).hist(
            bins=30
        )
    )"""

# Define the value range that is used for the scaling (normalization)
tot_range = max_val - min_val

# Compute the normalized value of the raw data in the column "SCALED"
# Distinguish between indicators, whose value must be inverted
if inverted == True:
    s55_cleansed["SCALED_OBS_VALUE"] = round(
        10
        - 10
        * (
            s55_cleansed["RAW_OBS_VALUE"].astype("float")
            - min_val
        )
        / tot_range,
        2,
    )
else:
    s55_cleansed["SCALED_OBS_VALUE"] = round(
        10
        * (
            s55_cleansed["RAW_OBS_VALUE"].astype("float")
            - min_val
        )
        / tot_range,
        2,
    )

"""

# iv) Append the subset including its scaled value to the final returned dataframe
# Right join to have all countries from the final crba master list
# cleansed_data_subset_rj = cleansed_data_subset.merge(
#     right=crba_final_country_list,
#     how="right",
#    left_on=cleansed_df_iso2_col,
#    right_on=crba_final_country_list_iso_col,
#    indicator="RJ_CRBA_FULL_LIST",
#)

# Append the values
# cleansed_data_full = cleansed_data_full.append(cleansed_data_subset_rj)


# For debugging, include
print(
    "\n The shape of the dataframe should be 195 x X. It is:  {} \n ".format(
        cleansed_data_subset_rj.shape
    )
)


except:
print("Dataframe is empty. There are no values to append.")

# Log: print information that this loop run is terminated
print(" \n This is the end of loop #{}. \n - \n ".format(j + 1))

# Sanity Check: The resulting dataframe should always have 195 rows. NB: if you put the line of code before the above "Append the values" bit, Python throws and error
assert (
pd.to_numeric(cleansed_data_full.shape[0]) % 195 == 0
), "Number of rows should be a multiple of 195, but it is not. Check if all columns which should be part of the group by statement are listed"
print(
"The number of rows of the final dataframe (before the conversion from wide to long format is) is divisible by 195. It is: {}".format(
cleansed_data_full.shape
)
)

"""




The distribution of the raw data values this subgroup contains outliers or is too skewed on the upper end. The maximum value to be used for the normalisation is: 3rd quartile or distribution + 1.5 * IQR. It is: 32.41972750000001 
 See histogram printed below for info. 



'\n\n# iv) Append the subset including its scaled value to the final returned dataframe\n# Right join to have all countries from the final crba master list\n# cleansed_data_subset_rj = cleansed_data_subset.merge(\n#     right=crba_final_country_list,\n#     how="right",\n#    left_on=cleansed_df_iso2_col,\n#    right_on=crba_final_country_list_iso_col,\n#    indicator="RJ_CRBA_FULL_LIST",\n#)\n\n# Append the values\n# cleansed_data_full = cleansed_data_full.append(cleansed_data_subset_rj)\n\n\n# For debugging, include\nprint(\n    "\n The shape of the dataframe should be 195 x X. It is:  {} \n ".format(\n        cleansed_data_subset_rj.shape\n    )\n)\n\n\nexcept:\nprint("Dataframe is empty. There are no values to append.")\n\n# Log: print information that this loop run is terminated\nprint(" \n This is the end of loop #{}. \n - \n ".format(j + 1))\n\n# Sanity Check: The resulting dataframe should always have 195 rows. NB: if you put the line of code before the above "Append the values

In [42]:
# s55_cleansed.DIM_SEX.unique()

# s55_cleansed.DIM_AGE.unique()

# s55_cleansed[(s55_cleansed.DIM_EDU_LEVEL == "LOWER SECONDARY EDUCATION") &
    #  (s55_cleansed.DIM_AGE == "SCHOOL_AGE_POPULATION") &
    # (s55_cleansed.DIM_SEX == "BOTH_SEXES")
# ]

s55_cleansed

Unnamed: 0.1,Unnamed: 0,ATTR_UNIT_MEASURE,DIM_EDU_LEVEL,DIM_SEX,DIM_AGE,COUNTRY_ISO_2,TIME_PERIOD,RAW_OBS_VALUE,ATTR_SOURCE_OBS_STATUS,_merge,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE,SCALED_OBS_VALUE
1,1,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AL,2018.0,3.05680,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.53
7,7,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AG,2018.0,1.43781,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.78
10,10,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AR,2017.0,0.00779,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,10.00
11,11,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AM,2018.0,8.21479,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,8.74
14,14,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AU,2017.0,1.88277,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,365,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,GB,2017.0,0.12779,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.98
366,366,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,US,2017.0,0.22621,E,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.97
367,367,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,UY,2017.0,0.26134,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.96
368,368,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,UZ,2018.0,3.66859,E,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS,9.44


In [50]:
s55_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-55_cleansed.csv",
    sep = ";"
)
s55_cleansed

Unnamed: 0.1,Unnamed: 0,ATTR_UNIT_MEASURE,DIM_EDU_LEVEL,DIM_SEX,DIM_AGE,COUNTRY_ISO_2,TIME_PERIOD,RAW_OBS_VALUE,ATTR_SOURCE_OBS_STATUS,_merge,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE
0,0,,0,BOTH_SEXES,_T,AF,2020.0,,,right_only,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
1,1,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AL,2018.0,3.05680,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
2,2,PT,LOWER SECONDARY EDUCATION,FEMALE,SCHOOL_AGE_POPULATION,AL,2018.0,0.55400,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
3,3,PT,LOWER SECONDARY EDUCATION,MALE,SCHOOL_AGE_POPULATION,AL,2018.0,5.22058,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
4,4,,0,BOTH_SEXES,_T,AD,2020.0,,,right_only,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,374,PT,LOWER SECONDARY EDUCATION,MALE,SCHOOL_AGE_POPULATION,VE,2017.0,14.91150,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
375,375,,0,BOTH_SEXES,_T,VN,2020.0,,,right_only,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
376,376,,0,BOTH_SEXES,_T,YE,2020.0,,,right_only,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
377,377,,0,BOTH_SEXES,_T,ZM,2020.0,,,right_only,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS


In [62]:
def subset_creator(
    cleansed_data,
    **dimensions
):
    # Empty string which will be filled with subset conditions
    subset = ""

    # Run loop to get dimensions vaues specified in **dimensions
    for key in dimensions:
        subset += "(cleansed_data['{}'] == '{}')&".format(
            key,
            dimensions[key]
        )
    
    # Get rid of the "&-sign" at the end
    subset = subset.rstrip("& ")

    # Just for dev:
    print(subset)

    # Subset
    cleansed_data_subset = cleansed_data[eval(subset)]

    # 
    # print(cleansed_data_subset)

    # Return cleansed data
    return cleansed_data_subset


s55_cleansed = pd.read_csv(
    filepath_or_buffer = data_sources_cleansed / "S-55_cleansed.csv",
    sep = ";"
)

s55_subset = subset_creator(
    cleansed_data = s55_cleansed,
    DIM_SEX = "BOTH_SEXES",
    DIM_EDU_LEVEL = "LOWER SECONDARY EDUCATION",
    DIM_AGE = "SCHOOL_AGE_POPULATION"
)

s55_subset


(cleansed_data['DIM_SEX'] == 'BOTH_SEXES')&(cleansed_data['DIM_EDU_LEVEL'] == 'LOWER SECONDARY EDUCATION')&(cleansed_data['DIM_AGE'] == 'SCHOOL_AGE_POPULATION')


Unnamed: 0.1,Unnamed: 0,ATTR_UNIT_MEASURE,DIM_EDU_LEVEL,DIM_SEX,DIM_AGE,COUNTRY_ISO_2,TIME_PERIOD,RAW_OBS_VALUE,ATTR_SOURCE_OBS_STATUS,_merge,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE
1,1,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AL,2018.0,3.05680,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
7,7,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AG,2018.0,1.43781,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
10,10,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AR,2017.0,0.00779,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
11,11,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AM,2018.0,8.21479,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
14,14,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,AU,2017.0,1.88277,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,365,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,GB,2017.0,0.12779,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
366,366,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,US,2017.0,0.22621,E,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
367,367,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,UY,2017.0,0.26134,A,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
368,368,PT,LOWER SECONDARY EDUCATION,BOTH_SEXES,SCHOOL_AGE_POPULATION,UZ,2018.0,3.66859,E,both,Freedom of association.,Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS


In [9]:
# 

def map_values(
    dataframe,
    value_mapping_dict
):

    for key in value_mapping_dict:
        # try:
        # Define emtpy lists to be mapped to each other
        original_values = []
        mapped_values = []

        # Loop obtain all possible original/ mapped value variations mappings
        for sub_key in value_mapper[key]:
            original_values += value_mapping_dict[key][sub_key]
            mapped_values += len(value_mapping_dict[key][sub_key]) * [sub_key]
        
        print(original_values)
        print(mapped_values)

        # Convert the values
        dataframe[key] = np.select(
            original_values, mapped_values
        )
    
        # log info for user
        print("Successfully mapped value of column: {}".format(
            key
        ))
        #except:
        #    print("The following dimension is not present in the raw dataframe {}. There are thus no values to be mapped.".format(
        #        key
        #    ))
    
    return dataframe


In [24]:

s55_raw["REF_AREA"].apply(lambda x: len(str(x))).quantile(q=0.25)




2.0

In [14]:
# Extract data
from statistics import median, mean

s55_raw = extract.CSVExtractor.extract(url =
    'https://api.uis.unesco.org/sdmx/data/UNESCO,SDG4,2.0/ROFST.PT.L2+L2_3+L3._T._T+F+M.SCH_AGE_GROUP._T.INST_T._Z._T._Z._Z._Z._T._T._Z._Z._Z.?startPeriod=2005&endPeriod=2018&format=csv-sdmx&locale=en&subscription-key=460ab272abdd43c892bb59c218c22c09'
)

lenght = s55_raw["REF_AREA"].apply(lambda x: len(x))

# s55_raw.to_csv(data_sources_raw / "S_55_raw.csv")

The following columns are present in the datasets, and this is the number of unique values they have. 
The column Dataflow has 1 unique values.
The column STAT_UNIT has 1 unique values.
The column UNIT_MEASURE has 1 unique values.
The column EDU_LEVEL has 3 unique values.
The column EDU_CAT has 1 unique values.
The column SEX has 3 unique values.
The column AGE has 1 unique values.
The column GRADE has 1 unique values.
The column SECTOR_EDU has 1 unique values.
The column EDU_ATTAIN has 1 unique values.
The column SUBJECT has 1 unique values.
The column WEALTH_QUINTILE has 1 unique values.
The column INFRASTR has 1 unique values.
The column LOCATION has 1 unique values.
The column EDU_TYPE has 1 unique values.
The column SE_BKGRD has 1 unique values.
The column SOURCE_FUND has 1 unique values.
The column FUND_FLOW has 1 unique values.
The column IMM_STATUS has 1 unique values.
The column REF_AREA has 326 unique values.
The column TIME_PERIOD has 14 unique values.
The column OBS_VALUE h

TypeError: object of type 'float' has no len()

# Cleansing


< STOPPED HERE , the below code runs (but have to define tha mapping_dict first) --> Next step is to Bring this thing into a loop and take care of the exceptions

In [11]:
s98_raw = pd.read_csv(
    filepath_or_buffer = data_sources_raw / "S-98_raw.csv"
)

s98_cleansed = cleanse.Cleanser().cleanse(
    raw_data = s98_raw,
    mapping_dictionary = mapping_dict,
    final_sdmx_col_list = sdmx_df_columns_all,
    dim_cols = sdmx_df_columns_dims,
    country_cols = sdmx_df_columns_country,
    time_cols = sdmx_df_columns_time,
    country_list_full = country_full_list,
    crba_country_list = country_crba_list
)

"""
s98_cleansed_mapped = map_values(
    dataframe = s98_cleansed,
    value_mapping_dict = value_mapper
)
"""


# Loop through all possible columns as defined for the final SDMX structure
for key in value_mapper:
    try:
        # Define emtpy lists to be mapped to each other
        original_values = []
        mapped_values = []

        # Loop obtain all possible original/ mapped value variations mappings
        for sub_key in value_mapper[key]:
            # Obtain boolean arrays for each possible original value
            for list_element in range(len(value_mapper[key][sub_key])):
                original_values += [s98_cleansed[key] == value_mapper[key][sub_key][list_element]]
            
            # Define the target value if original_values evaluates to true
            mapped_values += len(value_mapper[key][sub_key]) * [sub_key]
        
        # Convert (map) the values
        s98_cleansed[key] = np.select(
            original_values, mapped_values
        )

        # log info for user
        print("\n Successfully mapped value of column: {}".format(
            key
        ))

    # If column is not presnt (or if there are other issues)
    except:
        print("Values of column: {} couldn't be mapped. If column {} is present, there is an error with the code. ".format(
            key,
            key
        ))

Cleansing done. There are 569 rows in the dataframe and 1.41% have a NA-value in the column 'OBS_RAW_VALUE

 Successfully mapped value of column: DIM_SEX
Values of column: DIM_EDU_LEVEL couldn't be mapped. If column DIM_EDU_LEVEL is present, there is an error with the code. 
Values of column: DIM_AGE couldn't be mapped. If column DIM_AGE is present, there is an error with the code. 
Values of column: DIM_AGE_GROUP couldn't be mapped. If column DIM_AGE_GROUP is present, there is an error with the code. 
Values of column: DIM_MANAGEMENT_LEVEL couldn't be mapped. If column DIM_MANAGEMENT_LEVEL is present, there is an error with the code. 
Values of column: DIM_AREA_TYPE couldn't be mapped. If column DIM_AREA_TYPE is present, there is an error with the code. 
Values of column: DIM_SECTOR couldn't be mapped. If column DIM_SECTOR is present, there is an error with the code. 


In [12]:
import cleanse
import pandas as pd

s102_raw = pd.read_csv(
    filepath_or_buffer = data_sources_raw / "S-102_raw.csv"
)

s102_cleansed = cleanse.Cleanser().cleanse(
    raw_data = s102_raw,
    mapping_dictionary = mapping_dict,
    final_sdmx_col_list = sdmx_df_columns_all,
    dim_cols = sdmx_df_columns_dims,
    country_cols = sdmx_df_columns_country,
    time_cols = sdmx_df_columns_time,
    country_list_full = country_full_list,
    crba_country_list = country_crba_list
)

# s102_cleansed["REF_AREA"].apply(lambda x: mean(len(x)))
s102_cleansed.head(30)

s102_cleansed_mapped = map_values(
    dataframe = s102_cleansed,
    value_mapping_dict = value_mapper
)

s102_cleansed_mapped


Cleansing done. There are 195 rows in the dataframe and 26.15% have a NA-value in the column 'OBS_RAW_VALUE
['_T', 'BOTHSEX', 'BTSX', 'SEX_T', 'M', 'MALE', 'MLE', 'SEX_M', 'F', 'FEMALE', 'FMLE', 'SEX_F']
['BOTH_SEXES', 'BOTH_SEXES', 'BOTH_SEXES', 'BOTH_SEXES', 'MALE', 'MALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE']


TypeError: invalid entry 0 in condlist: should be boolean ndarray

In [9]:
import cleanse
import pandas as pd

s52_raw = pd.read_csv(
    filepath_or_buffer = data_sources_raw / "S-52_raw.csv"
)

s52_cleansed = cleanse.Cleanser().cleanse(
    raw_data = s52_raw,
    mapping_dictionary = mapping_dict,
    final_sdmx_col_list = sdmx_df_columns_all,
    dim_cols = sdmx_df_columns_dims,
    country_cols = sdmx_df_columns_country,
    time_cols = sdmx_df_columns_time,
    country_list_full = country_full_list,
    crba_country_list = country_crba_list
)

# s102_cleansed["REF_AREA"].apply(lambda x: mean(len(x)))
s52_cleansed

The column REF_AREA has been renamed into COUNTRY_ISO_3, but should be COUNTRY_ISO_2. Now renaming it into COUNTRY_ISO_2
COUNTRY_ISO_2
2
Cleansing done. There are 1051 rows in the dataframe and 1.52% have a NA-value in the column 'OBS_RAW_VALUE


Unnamed: 0,ATTR_UNIT_MEASURE,DIM_EDU_LEVEL,DIM_SEX,DIM_AGE,COUNTRY_ISO_2,TIME_PERIOD,RAW_OBS_VALUE,ATTR_SOURCE_OBS_STATUS,_merge
0,PT,L01,_T,_T,AF,2018.0,0.00000,Z,both
1,PT,L01,F,_T,AF,2018.0,0.00000,Z,both
2,PT,L01,M,_T,AF,2018.0,0.00000,Z,both
3,PT,L01,_T,_T,AL,2012.0,0.00000,Z,both
4,PT,L01,F,_T,AL,2012.0,0.00000,Z,both
...,...,...,...,...,...,...,...,...,...
1046,PT,L01,F,_T,ZW,2013.0,0.00000,Z,both
1047,PT,L01,M,_T,ZW,2013.0,0.00000,Z,both
1048,PT,L02,_T,_T,ZW,2013.0,46.95562,A,both
1049,PT,L02,F,_T,ZW,2013.0,47.51739,A,both


In [13]:
# from utils import mapping_dictionary
# %run utils.mapping_dictionary.py

# %run "D:\Documents\2020\28_UNICEF\10_working_repo\data-etl\utils\mapping_dictionary.py"

country_tuple = ("REF_AREA", "COUNTRY")
country_mapper = {key: "REF_AREA" for key in country_tuple}


year_tuple = (
    "TIME_PERIOD",
    "YEAR",
)
year_mapper = {key: "TIME_PERIOD" for key in year_tuple}


obs_value_tuple = ("OBS_VALUE", "Display Value")
obs_value_mapper = {key: "OBS_VALUE" for key in obs_value_tuple}


dim_sex_tuple = "SEX"
dim_sex_mapper = {key: "OBS_VALUE" for key in obs_value_tuple}

"""
dim_edu_tuple = (
    ""
)

dim_age_tuple = (
    "SEX"
)
"""

# Create list of all mapper dictionaries
mapper_tuple_list = [country_mapper, year_mapper, obs_value_mapper, dim_sex_mapper]

# Define the mapping dictionary
mapping_dict = {}

for mapper_tuple in mapper_tuple_list:
    mapping_dict.update(mapper_tuple)

with open("mapping_dict.json", "w") as fp:
    json.dump(mapping_dict, fp)

In [21]:
year_tuple = (
    "TIME_PERIOD",
    "YEAR", 
)

x = {key: "xxx" for key in country_tuple}
y = {key: "yyy" for key in year_tuple}

x.update(y)
x}
y = {key: "yyy" for key in year_tuple}

x.update(y)
x

{'REF_AREA': 'xxx', 'COUNTRY': 'xxx', 'TIME_PERIOD': 'yyy', 'YEAR': 'yyy'}

In [8]:
s55_cleansed = cleanse.Cleanser.cleanse(
    raw_data = s55_raw,
    raw_data_iso_2_col = 'REF_AREA',
    country_df = country_crba_list,
    country_df_iso2_col = 'COUNTRY_ISO_2',
    non_dim_cols = ['OBS_VALUE', 'TIME_PERIOD', 'OBS_STATUS']
)

s55_cleansed.to_csv(data_sources_raw / "S_55_cleansed.csv")

In [9]:
from normalize.scaler import normalizer

s55_normalized = normalizer(
    cleansed_data = s55_cleansed,
    indicator_raw_value = 'OBS_VALUE',
    indicator_code = 'WP_DW_OC_FREASS',
    indicator_name = 'Out-of-school adolescents (lower secondary)',
    indicator_index = 'Workplace',
    indicator_issue = 'Decent working conditions',
    indicator_category = 'Outcome',
    cleansed_df_iso2_col = 'REF_AREA',
    crba_final_country_list = country_crba_list,
    crba_final_country_list_iso_col = 'COUNTRY_ISO_2',
    inverted = True,
    non_dim_cols = [
        'TIME_PERIOD', 
        'REF_AREA', 
        'OBS_VALUE', 
        'OBS_STATUS', 
        'COUNTRY_ISO_3', 
        'COUNTRY_NAME', 
        'COUNTRY_ISO_2', 
        '_merge'
    ])

s55_normalized

You have a selected a few columns, which will not be regarded as dimensions.These are the remaining columns in the dataset, along with the number of values they take in the dataset.
The column Dataflow has 1 unique values.
The column STAT_UNIT has 1 unique values.
The column UNIT_MEASURE has 1 unique values.
The column EDU_LEVEL has 3 unique values.
The column EDU_CAT has 1 unique values.
The column SEX has 3 unique values.
The column AGE has 1 unique values.
The column GRADE has 1 unique values.
The column SECTOR_EDU has 1 unique values.
The column EDU_ATTAIN has 1 unique values.
The column SUBJECT has 1 unique values.
The column WEALTH_QUINTILE has 1 unique values.
The column INFRASTR has 1 unique values.
The column LOCATION has 1 unique values.
The column EDU_TYPE has 1 unique values.
The column SE_BKGRD has 1 unique values.
The column SOURCE_FUND has 1 unique values.
The column FUND_FLOW has 1 unique values.
The column IMM_STATUS has 1 unique values.
The column UNIT_MULT has 1 uniq

Unnamed: 0,Dataflow,STAT_UNIT,UNIT_MEASURE,EDU_LEVEL,EDU_CAT,SEX,AGE,GRADE,SECTOR_EDU,EDU_ATTAIN,...,COUNTRY_ISO_3_y,COUNTRY_NAME_y,COUNTRY_ISO_2_y,RJ_CRBA_FULL_LIST,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE
0,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,M,SCH_AGE_GROUP,_T,INST_T,_Z,...,AFG,Afghanistan,AF,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
1,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,M,SCH_AGE_GROUP,_T,INST_T,_Z,...,ALB,Albania,AL,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
2,,,,,,,,,,,...,AND,Andorra,AD,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
3,,,,,,,,,,,...,DZA,Algeria,DZ,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
4,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,M,SCH_AGE_GROUP,_T,INST_T,_Z,...,AGO,Angola,AO,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,_T,SCH_AGE_GROUP,_T,INST_T,_Z,...,VEN,Venezuela,VE,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
191,,,,,,,,,,,...,VNM,Vietnam,VN,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
192,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,_T,SCH_AGE_GROUP,_T,INST_T,_Z,...,YEM,Yemen,YE,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
193,,,,,,,,,,,...,ZMB,Zambia,ZM,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
