In [None]:
import pandas as pd

In [None]:
# UNICEF data warehouse (WH): base URL of the SDMX REST data endpoint
wh_url = "https://sdmx.data.unicef.org/ws/public/sdmxapi/rest/data/"
# dataflow path segment, appended to the base URL when the query is built
dataflow = "TRANSMONEE/"

In [None]:
# indicator codes pulled from the warehouse in a single request
ind_list = [
    # L1T3 aggregate series (OFST / SAP are the numerator / denominator of the
    # first worked example below)
    "EDUNF_OFST_L1T3",
    "EDUNF_ROFST_L1T3",
    "EDUNF_SAP_L1T3",
    # out-of-school series used by the participation cards
    "EDUNF_OFST_L1_UNDER1",
    "EDUNF_OFST_L1",
    "EDUNF_OFST_L2",
    "EDUNF_OFST_L3",
    # rates (NER, CR), each followed by the SAP series used as its weight
    "EDUNF_NER_L02",
    "EDUNF_SAP_L02",
    "EDUNF_CR_L2",
    "EDUNF_SAP_L2",
    # early school leavers and repeaters (quality cards)
    "EDUNF_ESL_L1",
    "EDUNF_RPTR_L1",
    "EDUNF_RPTR_L2",
    # learning assessments: administrative exams and PISA (quality cards)
    "EDUNF_ADMIN_L1_GLAST_REA",
    "EDUNF_ADMIN_L1_GLAST_MAT",
    "EDU_PISA_MAT",
    "EDU_PISA_REA",
    "EDU_PISA_SCI",
    # free education, private enrolment and teachers (governance cards)
    "EDU_SDG_FREE_EDU_L02",
    "EDUNF_STU_L1_PRV",
    "EDUNF_STU_L2_PRV",
    "EDUNF_STU_L3_PRV",
    "EDUNF_TEACH_L1",
    "EDUNF_TEACH_L2",
    "EDUNF_TEACH_L3",
]

In [None]:
# Download the observations as CSV straight into a DataFrame.
# The key puts the "+"-joined indicator codes in the second dot-separated
# position and leaves every other position empty
# (presumably "no filter" on those dimensions — TODO confirm against the SDMX API).
query_key = "." + "+".join(ind_list) + "...."
data = pd.read_csv(f"{wh_url}{dataflow}{query_key}?format=csv")

In [None]:
# countries ("Geographic area" values) and period used by the queries below
countries = ["Armenia", "Azerbaijan", "Georgia"]
# years 2008..2020 inclusive (range() excludes its upper bound)
time = list(range(2008,2021))
# code required on SEX / RESIDENCE / WEALTH_QUINTILE by the queries below
# (presumably the "total" disaggregation — TODO confirm)
total_code = ["_T"]

In [None]:
# single-numerator calculation: indicator codes for the two sides of the ratio
numerator = ["EDUNF_OFST_L1T3"]
denominator = ["EDUNF_SAP_L1T3"]

In [None]:
# DataFrame.query expression shared by the cells below; the "@name" references
# are resolved when the expression is evaluated, so re-pointing ind_to_query
# changes which indicators are selected.
query = " & ".join(
    [
        "INDICATOR in @ind_to_query",
        "TIME_PERIOD in @time",
        "`Geographic area` in @countries",
        "SEX in @total_code",
        "RESIDENCE in @total_code",
        "WEALTH_QUINTILE in @total_code",
    ]
)

In [None]:
# latest available observation per country for the selected numerator
ind_to_query = numerator
numerators_df = (
    data.query(query)
    # "last" skips NaN column by column: drop missing values first so that
    # TIME_PERIOD and OBS_VALUE are always taken from the same row
    .dropna(subset=["OBS_VALUE"])
    # do not rely on the row order of the download to find the latest year
    .sort_values("TIME_PERIOD", kind="mergesort")
    .groupby(["INDICATOR", "Indicator", "Geographic area"])
    .agg({"TIME_PERIOD": "last", "OBS_VALUE": "last"})
    .reset_index()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerators_df

In [None]:
# denominators: same filters as the numerators, indexed by (country, year)
ind_to_query = denominator
denominators_df = data.query(query).set_index(["Geographic area", "TIME_PERIOD"])
# keep only the (country, year) pairs that also exist among the numerators
den_match_df = denominators_df.loc[
    denominators_df.index.isin(numerators_df.index),
    ["INDICATOR", "Indicator", "OBS_VALUE"],
]
den_match_df

In [None]:
# left-join the denominators onto the numerators, then discard the
# numerators left without a matching denominator (NaN after the join)
num_den_not_nan = pd.merge(
    numerators_df,
    den_match_df,
    how="left",
    on=["Geographic area", "TIME_PERIOD"],
    sort=False,
).dropna(subset=["OBS_VALUE_y"])
num_den_not_nan

In [None]:
# hand-computed ratio of totals, as a percentage
# (presumably the values displayed by the join above — TODO confirm)
print((29523.0+32589.0+11266.0) / (413525.0 + 1637373.0 + 574212.0)*100)

In [None]:
# regional percentage from the joined frame:
# sum of numerators (_x) over sum of denominators (_y)
round(
    num_den_not_nan["OBS_VALUE_x"].sum()
    / num_den_not_nan["OBS_VALUE_y"].sum()
    * 100,
    1,
)

In [None]:
# (country, year) pairs present in both the denominators and the numerators
ind_intersect = denominators_df.index.intersection(numerators_df.index)
ind_intersect

In [None]:
# same regional percentage without a join: restrict both frames to the
# shared (country, year) pairs and divide the totals
round(
    numerators_df.loc[ind_intersect]["OBS_VALUE"].sum()
    / denominators_df.loc[ind_intersect]["OBS_VALUE"].sum()
    * 100,
    1,
)

In [None]:
# single numerator: latest available observation per country
ind_to_query = numerator = ["EDUNF_OFST_L1_UNDER1"]
numerators_df = (
    data.query(query)
    # "last" skips NaN column by column: drop missing values first so that
    # TIME_PERIOD and OBS_VALUE are always taken from the same row
    .dropna(subset=["OBS_VALUE"])
    # do not rely on the row order of the download to find the latest year
    .sort_values("TIME_PERIOD", kind="mergesort")
    .groupby(["INDICATOR", "Indicator", "Geographic area"])
    .agg({"TIME_PERIOD": "last", "OBS_VALUE": "last"})
    .reset_index()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerators_df

In [None]:
# single numerator: latest available observation per country
ind_to_query = numerator = ["EDUNF_ESL_L1"]
numerators_df = (
    data.query(query)
    # "last" skips NaN column by column: drop missing values first so that
    # TIME_PERIOD and OBS_VALUE are always taken from the same row
    .dropna(subset=["OBS_VALUE"])
    # do not rely on the row order of the download to find the latest year
    .sort_values("TIME_PERIOD", kind="mergesort")
    .groupby(["INDICATOR", "Indicator", "Geographic area"])
    .agg({"TIME_PERIOD": "last", "OBS_VALUE": "last"})
    .reset_index()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerators_df

In [None]:
# single numerator: latest available observation per country
ind_to_query = numerator = ["EDUNF_ADMIN_L1_GLAST_REA"]
numerators_df = (
    data.query(query)
    # "last" skips NaN column by column: drop missing values first so that
    # TIME_PERIOD and OBS_VALUE are always taken from the same row
    .dropna(subset=["OBS_VALUE"])
    # do not rely on the row order of the download to find the latest year
    .sort_values("TIME_PERIOD", kind="mergesort")
    .groupby(["INDICATOR", "Indicator", "Geographic area"])
    .agg({"TIME_PERIOD": "last", "OBS_VALUE": "last"})
    .reset_index()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerators_df

In [None]:
# single numerator: latest available observation per country
ind_to_query = numerator = ["EDUNF_ADMIN_L1_GLAST_MAT"]
numerators_df = (
    data.query(query)
    # "last" skips NaN column by column: drop missing values first so that
    # TIME_PERIOD and OBS_VALUE are always taken from the same row
    .dropna(subset=["OBS_VALUE"])
    # do not rely on the row order of the download to find the latest year
    .sort_values("TIME_PERIOD", kind="mergesort")
    .groupby(["INDICATOR", "Indicator", "Geographic area"])
    .agg({"TIME_PERIOD": "last", "OBS_VALUE": "last"})
    .reset_index()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerators_df

In [None]:
# single numerator: latest available observation per country
ind_to_query = numerator = ["EDU_SDG_FREE_EDU_L02"]
numerators_df = (
    data.query(query)
    # "last" skips NaN column by column: drop missing values first so that
    # TIME_PERIOD and OBS_VALUE are always taken from the same row
    .dropna(subset=["OBS_VALUE"])
    # do not rely on the row order of the download to find the latest year
    .sort_values("TIME_PERIOD", kind="mergesort")
    .groupby(["INDICATOR", "Indicator", "Geographic area"])
    .agg({"TIME_PERIOD": "last", "OBS_VALUE": "last"})
    .reset_index()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerators_df

In [None]:
# "pack" of numerators: add the indicators up per (country, year) and count
# how many of them reported
ind_to_query = numerator = ["EDUNF_OFST_L1", "EDUNF_OFST_L2", "EDUNF_OFST_L3"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# keep the complete packs only, then the last (country, year) row per country
numerator_pairs = (
    numerators_df.loc[numerators_df["DATA_SOURCE"].eq(len(ind_to_query))]
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
# regional average using "pack" of numerators and one denominator
# The ind_intersect computed earlier was built from a different numerators
# frame, so its (country, year) pairs need not exist in numerator_pairs.
# Recompute the shared pairs from the frames actually used here.
pack_intersect = numerator_pairs.index.intersection(denominators_df.index)
round(
    numerator_pairs.loc[pack_intersect]["OBS_VALUE"].sum()
    / denominators_df.loc[pack_intersect]["OBS_VALUE"].sum()
    * 100,
    1,
)

In [None]:
# "pack" of numerators again, this time for SEX == "F".
# The shared query is rebuilt to read @sex_code instead of @total_code for SEX;
# it stays that way for every later cell that uses `query`.
sex_code = ["F"]
query = " & ".join(
    [
        "INDICATOR in @ind_to_query",
        "TIME_PERIOD in @time",
        "`Geographic area` in @countries",
        "SEX in @sex_code",
        "RESIDENCE in @total_code",
        "WEALTH_QUINTILE in @total_code",
    ]
)
ind_to_query = numerator = ["EDUNF_OFST_L1", "EDUNF_OFST_L2", "EDUNF_OFST_L3"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# complete packs only, then the last (country, year) row per country
numerator_pairs = (
    numerators_df.loc[numerators_df["DATA_SOURCE"].eq(len(ind_to_query))]
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
# "pack" of numerators (repeaters); SEX filter set back to "_T"
sex_code = ["_T"]
ind_to_query = numerator = ["EDUNF_RPTR_L1","EDUNF_RPTR_L2"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# complete packs only, then the last (country, year) row per country
numerator_pairs = (
    numerators_df.loc[numerators_df["DATA_SOURCE"].eq(len(ind_to_query))]
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
# "pack" of numerators (PISA)
ind_to_query = numerator = ["EDU_PISA_MAT","EDU_PISA_REA","EDU_PISA_SCI"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# no completeness filter here: every country with any PISA value is kept,
# not only those reporting the whole pack
numerator_pairs = (
    numerators_df
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
# "pack" of numerators (private enrolments)
ind_to_query = numerator = ["EDUNF_STU_L1_PRV","EDUNF_STU_L2_PRV","EDUNF_STU_L3_PRV"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# no completeness filter here: every country with any value of the pack is
# kept, not only those reporting all the indicators
numerator_pairs = (
    numerators_df
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
# "pack" of numerators (teachers)
ind_to_query = numerator = ["EDUNF_TEACH_L1", "EDUNF_TEACH_L2", "EDUNF_TEACH_L3"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# no completeness filter here: every country with any value of the pack is
# kept, not only those reporting all the indicators
numerator_pairs = (
    numerators_df
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
# same pipeline applied to a rate, with a single code in the numerator list
ind_to_query = numerator = ["EDUNF_NER_L02"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# complete packs only (one indicator here), then the last row per country
numerator_pairs = (
    numerators_df.loc[numerators_df["DATA_SOURCE"].eq(len(ind_to_query))]
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
denominator = ["EDUNF_SAP_L02"]
# select the denominators and index them like the numerators: (country, year)
# `query` reads ind_to_query when it is evaluated, so point it at the denominator
ind_to_query = denominator
denominators_df = data.query(query).set_index(["Geographic area", "TIME_PERIOD"])

In [None]:
# select only those denominators that match the available numerators:
# (country, year) pairs present in both indexes
ind_intersect = numerator_pairs.index.intersection(
    denominators_df.index)

In [None]:
absolute = False
denominators = denominators_df.loc[ind_intersect]["OBS_VALUE"]
if absolute:
    # counts: ratio of the two totals over the shared pairs, as a percentage
    indicator_sum = (
        numerator_pairs.loc[ind_intersect]["OBS_VALUE"].to_numpy().sum() * 100
        / denominators.to_numpy().sum()
    )
else:
    # rates: average weighted by each pair's share of the denominator total;
    # pairs missing on either side become NaN in the aligned product
    weighted = numerator_pairs["OBS_VALUE"] * denominators / denominators.to_numpy().sum()
    # will drop missing countries
    indicator_sum = weighted.dropna().to_numpy().sum()
round(indicator_sum,1)

In [None]:
# display the matched denominators for inspection
# (original reviewer note: "check it out BETO!")
denominators

In [None]:
# hand-computed weighted averages (value * weight summed, over the summed
# weights): first with two (value, weight) pairs, then with a third one added
# (presumably a cross-check of round(indicator_sum, 1) above — TODO confirm)
print((35.14269*129438.0 + 35.36377*511204.0) / (129438.0 + 511204.0))
print((35.14269*129438.0 + 35.36377*511204.0 + 40.53383*137347.0) / (129438.0 + 511204.0 + 137347.0))

In [None]:
# same pipeline applied to a rate, with a single code in the numerator list
ind_to_query = numerator = ["EDUNF_CR_L2"]
numerators_df = (
    data.query(query)
    .groupby(["Geographic area", "TIME_PERIOD"])
    .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
    .reset_index()
)
# complete packs only (one indicator here), then the last row per country
numerator_pairs = (
    numerators_df.loc[numerators_df["DATA_SOURCE"].eq(len(ind_to_query))]
    .groupby("Geographic area", as_index=False)
    .last()
    .set_index(["Geographic area", "TIME_PERIOD"])
)
numerator_pairs

In [None]:
denominator = ["EDUNF_SAP_L2"]
# select the denominators and index them like the numerators: (country, year)
# `query` reads ind_to_query when it is evaluated, so point it at the denominator
ind_to_query = denominator
denominators_df = data.query(query).set_index(["Geographic area", "TIME_PERIOD"])

In [None]:
# select only those denominators that match the available numerators:
# (country, year) pairs present in both indexes
ind_intersect = numerator_pairs.index.intersection(
    denominators_df.index)

In [None]:
absolute = False
denominators = denominators_df.loc[ind_intersect]["OBS_VALUE"]
if absolute:
    # counts: ratio of the two totals over the shared pairs, as a percentage
    indicator_sum = (
        numerator_pairs.loc[ind_intersect]["OBS_VALUE"].to_numpy().sum() * 100
        / denominators.to_numpy().sum()
    )
else:
    # rates: average weighted by each pair's share of the denominator total;
    # pairs missing on either side become NaN in the aligned product
    weighted = numerator_pairs["OBS_VALUE"] * denominators / denominators.to_numpy().sum()
    # will drop missing countries
    indicator_sum = weighted.dropna().to_numpy().sum()
round(indicator_sum,4)

In [None]:
# display the matched denominators for inspection
# (original reviewer note: "check it out BETO!")
denominators

In [None]:
# hand-computed weighted averages (value * weight summed, over the summed
# weights): first with two (value, weight) pairs, then with a third one added
# (presumably a cross-check of round(indicator_sum, 4) above — TODO confirm)
print((96.96779*173593.0 + 98.65000*142350.0) / (173593.0 + 142350.0))
print((96.96779*173593.0 + 98.65000*142350.0 + 91.65569*868438.0) / (173593.0 + 142350.0 + 868438.0))

In [None]:
# actual function implemented in base page DASH
def indicator_card(
    time,
    countries,
    numerator,
    suffix,
    denominator=None,
    absolute=False,
    sex_code=None,
    df=None,
):
    """Compute the headline figure of an indicator card.

    Observations are restricted to the requested years, countries and sex
    codes, with RESIDENCE and WEALTH_QUINTILE fixed to "_T". The numerator
    indicators are added up per (country, year); unless stated otherwise only
    pairs for which every numerator indicator reported are kept, and the
    latest such year is used for each country.

    Args:
        time: list-like of TIME_PERIOD values to consider.
        countries: list-like of "Geographic area" values to consider.
        numerator: one indicator code, or several separated by commas.
        suffix: text appended to the figure. "countries" (any letter case)
            switches the card from summing values to counting countries.
        denominator: optional indicator code. With ``absolute=True`` the
            figure is 100 * sum(numerators) / sum(denominators) over the
            shared (country, year) pairs; with ``absolute=False`` it is the
            average of the numerators weighted by the denominators.
        absolute: see ``denominator``. On a "countries" card without "FREE"
            in the numerator it counts the countries whose last reported
            year equals the latest year found, without requiring all the
            numerator indicators to be present.
        sex_code: SEX codes to keep, as a list or a single code string.
            Defaults to ``["_T"]``.
        df: frame holding the observations (columns CODE, TIME_PERIOD,
            "Geographic area", SEX, RESIDENCE, WEALTH_QUINTILE, OBS_VALUE,
            DATA_SOURCE). Defaults to the module-level ``data``.

    Returns:
        tuple: ``("<figure with no decimals> <suffix>", sources)`` where
        ``sources`` is the list of (country, year) pairs behind the figure.
    """
    source = data if df is None else df
    if sex_code is None:
        sex_code = ["_T"]
    elif isinstance(sex_code, str):
        # the card configuration stores a single code, e.g. "F"
        sex_code = [sex_code]
    total_code = ["_T"]  # potentially move to this config
    numors = numerator.split(",")
    pair_keys = ["Geographic area", "TIME_PERIOD"]

    # filters shared by the numerator and the denominator selections
    in_scope = (
        source["TIME_PERIOD"].isin(time)
        & source["Geographic area"].isin(countries)
        & source["SEX"].isin(sex_code)
        & source["RESIDENCE"].isin(total_code)
        & source["WEALTH_QUINTILE"].isin(total_code)
    )

    # per (country, year): sum of the numerators and how many of them reported
    indicator_values = (
        source[in_scope & source["CODE"].isin(numors)]
        .groupby(pair_keys)
        .agg({"OBS_VALUE": "sum", "DATA_SOURCE": "count"})
        .reset_index()
    )

    # complete packs only, then the last (i.e. latest, groupby sorts its keys)
    # year for each country
    numerator_pairs = (
        indicator_values[indicator_values.DATA_SOURCE == len(numors)]
        .groupby("Geographic area", as_index=False)
        .last()
        .set_index(pair_keys)
    )

    if denominator:
        # available denominators for the selected countries and years
        denominator_values = source[
            in_scope & source["CODE"].isin([denominator])
        ].set_index(pair_keys)
        # only the denominators that match an available numerator pair
        index_intersect = numerator_pairs.index.intersection(denominator_values.index)
        denominators = denominator_values.loc[index_intersect]["OBS_VALUE"]
        denominator_total = denominators.to_numpy().sum()

        if absolute:
            indicator_sum = (
                numerator_pairs.loc[index_intersect]["OBS_VALUE"].to_numpy().sum()
                / denominator_total
                * 100
            )
        else:
            weighted = numerator_pairs["OBS_VALUE"] * denominators / denominator_total
            # pairs without a denominator are NaN after alignment: drop them
            indicator_sum = weighted.dropna().to_numpy().sum()
        sources = index_intersect.tolist()

    elif str(suffix).lower() == "countries":
        # this is a hack to accommodate small cases (to discuss with James);
        # the comparison ignores case because the card configuration spells
        # the suffix "countries"
        if "FREE" in numerator:
            # countries with at least one year of free education
            indicator_sum = (numerator_pairs.OBS_VALUE >= 1).to_numpy().sum()
            sources = numerator_pairs.index.tolist()
        elif absolute:
            # data availability within the group of indicators at the latest
            # time period; does not require count == len(numors)
            latest = indicator_values.groupby("Geographic area", as_index=False).last()
            latest = latest[latest.TIME_PERIOD == latest.TIME_PERIOD.max()]
            numerator_pairs = latest.set_index(pair_keys)
            sources = numerator_pairs.index.tolist()
            indicator_sum = len(sources)
        else:
            # admin exams (and boolean indicators): count the countries whose
            # summed value equals the number of indicators
            indicator_sum = (numerator_pairs.OBS_VALUE == len(numors)).to_numpy().sum()
            sources = numerator_pairs.index.tolist()

    else:
        indicator_sum = numerator_pairs["OBS_VALUE"].to_numpy().sum()
        sources = numerator_pairs.index.tolist()

    return ("{:.0f} {}".format(indicator_sum, suffix), sources)

In [None]:
# indicator_card filters on a CODE column (as in DASH init.py), so rename
# INDICATOR in place; cells above that query INDICATOR no longer work after this
data.rename(columns=dict(INDICATOR="CODE"), inplace=True)
# compare the function with the step-by-step results above
countries = ["Armenia", "Azerbaijan", "Georgia"]
time = [*range(2008, 2021)]
numerator = "EDUNF_OFST_L1,EDUNF_OFST_L2,EDUNF_OFST_L3"
# numerator = "EDUNF_OFST_L1T3"
denominator = "EDUNF_SAP_L1T3"
# other numerator / denominator pairs to try:
# numerator = "EDUNF_CR_L2"
# denominator = "EDUNF_SAP_L2"
# numerator = "EDUNF_NER_L02"
# denominator = "EDUNF_SAP_L02"
suffix = "%"
absolute = True
a, b = indicator_card(
    time, countries, numerator, suffix, denominator=denominator, absolute=absolute
)
print(a, b, sep="\n")

In [None]:
# Test of Cards Version 2: card definitions for the participation section
participation = [
    dict(
        name="who are out-of-school",
        indicator="EDUNF_OFST_L1,EDUNF_OFST_L2,EDUNF_OFST_L3",
        suffix="primary-to-upper-secondary-aged children and adolescents",
    ),
    dict(
        name="who are out-of-school",
        indicator="EDUNF_OFST_L1,EDUNF_OFST_L2,EDUNF_OFST_L3",
        suffix="primary-to-upper-secondary-aged girls",
        sex="F",
    ),
    dict(
        name="who are out-of-school",
        indicator="EDUNF_OFST_L1_UNDER1",
        suffix="children one year younger than the official primary entry age",
    ),
]

In [None]:
# out-of-school card over the full country list
countries = [
    "Armenia", "Azerbaijan", "Georgia", "Albania", "Bosnia and Herzegovina",
    "Croatia", "Kosovo (UN SC resolution 1244)", "North Macedonia", "Montenegro",
    "Serbia", "Kazakhstan", "Kyrgyzstan", "Tajikistan", "Turkmenistan",
    "Uzbekistan", "Bulgaria", "Belarus", "Republic of Moldova", "Romania",
    "Russian Federation", "Turkey", "Ukraine", "Andorra", "Austria", "Belgium",
    "Cyprus", "Czechia", "Denmark", "Estonia", "Finland", "France", "Germany",
    "Greece", "Holy See", "Hungary", "Iceland", "Ireland", "Italy", "Latvia",
    "Liechtenstein", "Lithuania", "Luxembourg", "Malta", "Monaco", "Netherlands",
    "Norway", "Poland", "Portugal", "San Marino", "Slovakia", "Slovenia", "Spain",
    "Sweden", "Switzerland", "United Kingdom",
]
numerator = "EDUNF_OFST_L1,EDUNF_OFST_L2,EDUNF_OFST_L3"
suffix = "Children, Adolescents and Youth"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
# NOTE(review): the third addend is 11265.0 here but 11266.0 in the earlier
# hand-computed ratio
29523.0	+ 32589.0 + 11265.0

In [None]:
# out-of-school card restricted to SEX == "F"
numerator = "EDUNF_OFST_L1,EDUNF_OFST_L2,EDUNF_OFST_L3"
suffix = "Children, Adolescents and Youth"
sex = ["F"]
a, b = indicator_card(
    time=time, countries=countries, numerator=numerator, suffix=suffix, sex_code=sex
)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
12799.0 + 8205.0

In [None]:
# single-indicator card: EDUNF_OFST_L1_UNDER1
numerator = "EDUNF_OFST_L1_UNDER1"
suffix = "Children"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
22591.0 + 53196.0

In [None]:
# card definitions for the quality section
quality = [
    dict(
        name="(enrolled in the same grade for a second or further year) in primary and lower secondary education",
        indicator="EDUNF_RPTR_L1,EDUNF_RPTR_L2",
        suffix="children and adolescent repeaters",
    ),
    dict(
        name="from primary education",
        indicator="EDUNF_ESL_L1",
        suffix="early school leavers",
    ),
    dict(
        name="administering nationally representative learning assessment in both reading and math at the end of primary education",
        indicator="EDUNF_ADMIN_L1_GLAST_REA,EDUNF_ADMIN_L1_GLAST_MAT",
        suffix="countries",
    ),
    dict(
        name="participating in the latest round of PISA",
        indicator="EDU_PISA_MAT,EDU_PISA_REA,EDU_PISA_SCI",
        suffix="countries",
        absolute=True,
    ),
]

In [None]:
# repeaters card: two indicators added together
numerator = "EDUNF_RPTR_L1,EDUNF_RPTR_L2"
suffix = "Children and Adolescents"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
763.0 + 2588.0 + 2736.0

In [None]:
# single-indicator card: EDUNF_ESL_L1
numerator = "EDUNF_ESL_L1"
suffix = "Children"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
2176.58087 + 9209.62321 + 591.04149

In [None]:
# "Countries" card: the function counts countries instead of summing values
numerator = "EDUNF_ADMIN_L1_GLAST_REA,EDUNF_ADMIN_L1_GLAST_MAT"
suffix = "Countries"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# "Countries" card with absolute=True: countries reporting in the latest period
numerator = "EDU_PISA_MAT,EDU_PISA_REA,EDU_PISA_SCI"
suffix = "Countries"
absolute = True
a, b = indicator_card(
    time=time, countries=countries, numerator=numerator, suffix=suffix, absolute=absolute
)
print(a, b, sep="\n")

In [None]:
# card definitions for the governance section
governance = [
    dict(
        name="guaranteeing at least one year of free pre-primary education in their legal frameworks",
        indicator="EDU_SDG_FREE_EDU_L02",
        suffix="countries",
    ),
    dict(
        name="enrolled in private institutions (primary, lower secondary and upper secondary education)",
        indicator="EDUNF_STU_L1_PRV,EDUNF_STU_L2_PRV,EDUNF_STU_L3_PRV",
        suffix="children and adolescents",
    ),
    dict(
        name="total in primary, lower secondary and upper secondary education",
        indicator="EDUNF_TEACH_L1,EDUNF_TEACH_L2,EDUNF_TEACH_L3",
        suffix="classroom teachers",
    ),
]

In [None]:
# "Countries" card on a "FREE" indicator: counts countries with a value >= 1
numerator = "EDU_SDG_FREE_EDU_L02"
suffix = "Countries"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# private-enrolment card: three indicators added together
numerator = "EDUNF_STU_L1_PRV,EDUNF_STU_L2_PRV,EDUNF_STU_L3_PRV"
suffix = "Children, Adolescents and Youth"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
9353.0 + 126300.0 + 60062.0

In [None]:
# teachers card: three indicators added together
numerator = "EDUNF_TEACH_L1,EDUNF_TEACH_L2,EDUNF_TEACH_L3"
# the figure counts teachers, so label it as the governance card definition
# does ("classroom teachers") instead of the suffix copied from the
# enrolment cards
suffix = "Classroom Teachers"
a, b = indicator_card(time=time, countries=countries, numerator=numerator, suffix=suffix)
print(a, b, sep="\n")

In [None]:
# hand addition (presumably cross-checking the card printed above — TODO confirm)
39261.0 + 165375.0 + 71016.0