#### Validation Engine Prototyping

In [None]:
import pandas as pd
import pandera as pa
import warnings
import importlib
import test_functions.def_val_func

# importlib reload to be used only in debugging phase!
importlib.reload(test_functions.def_val_func)
from test_functions.def_val_func import (
    valid_empty_data,
    valid_data_compilation,
    valid_flag_compilation,
    valid_flag_correspondence,
    valid_partial_disaggregation,
    valid_partial_total,
    valid_p_or_e_flag,
    valid_disaggregation,
    valid_disabilities,
    valid_stock_and_flows,
    valid_category_other,
    valid_formula,
    get_totals_by_sex,
    get_totals_and_disag,
    get_stock_and_flows,
    get_totals_and_others,
    match_population_and_disable,
    is_entry_row,
)

#### Reference to Data Validation rules

In [None]:
# Load the "validation_rules" sheet of the rules workbook, requesting string
# dtype for every column.
path_read_file = "validation_rules.xlsx"
df_rules = pd.read_excel(
    io=path_read_file,
    sheet_name="validation_rules",
    dtype=str,
)

#### Reference to Excel Data Collection Template

In [None]:
# Load the "Template" sheet of the data collection workbook as strings.
# `read_rows_offset` is the number of leading sheet rows skipped on read; it is
# kept in a variable because the rule parsers receive it as well.
path_read_file = "Transmonee - Data Collection Template - 24 June.xlsx"
read_rows_offset = 12
df_collection = pd.read_excel(
    io=path_read_file,
    sheet_name="Template",
    dtype=str,
    skiprows=read_rows_offset,
)

#### Dev Note: functions below `get_totals_and_disag`, `match_population_and_disable`, `get_stock_and_flows`
They could probably be gathered into a single function that takes the split separator as input.

In [None]:
# Column of the rules sheet whose cell text is handed to the parsers below.
_FORMULA_COLUMN = "Formulas test failed"


def _rows_for_rule(parser, rule_index):
    """Apply *parser* to the formula cell of the rule labelled *rule_index*.

    The cell is read from `df_rules` at call time and `read_rows_offset` is
    forwarded as the parser's second argument.
    """
    return parser(df_rules.loc[rule_index, _FORMULA_COLUMN], read_rows_offset)


# --- Totals vs. disaggregation rows -----------------------------------------
# Sex disaggregation
totals_by_sex = _rows_for_rule(get_totals_and_disag, 8)
# Age disaggregation
totals_by_age = _rows_for_rule(get_totals_and_disag, 9)
# Reasons for placement - Child Formal Alternative Care
totals_by_placement = _rows_for_rule(get_totals_and_disag, 10)
# Reasons for leaving - Child Formal Alternative Care
totals_by_leaving = _rows_for_rule(get_totals_and_disag, 11)
# Category of offence - Child Access to Justice
totals_by_offence = _rows_for_rule(get_totals_and_disag, 12)
# Type of school - Children with disabilities in Education
totals_by_school = _rows_for_rule(get_totals_and_disag, 13)
# Level of ISCED - Children with disabilities in Education
totals_by_ISCED = _rows_for_rule(get_totals_and_disag, 14)

# --- Sample population vs. with disabilities (Formal Residential Care) ------
# Children in care
disable_child_in_care = _rows_for_rule(match_population_and_disable, 15)
# Young people in care
disable_young_in_care = _rows_for_rule(match_population_and_disable, 16)
# Children who entered care
disable_child_care_enter = _rows_for_rule(match_population_and_disable, 17)
# Children who left care
disable_child_care_left = _rows_for_rule(match_population_and_disable, 18)
# Young people who left care
disable_young_care_left = _rows_for_rule(match_population_and_disable, 19)

# --- Child Protection --------------------------------------------------------
# Alternative care stock and flows (entrants/exits)
cp_stock_and_flows = _rows_for_rule(get_stock_and_flows, 20)
# Category "other" in disaggregation
cp_others_in_disag = _rows_for_rule(get_totals_and_others, 23)

# --- Sample population vs. with disabilities (Formal Family Care) -----------
# Children in family care
disable_child_in_family_care = _rows_for_rule(match_population_and_disable, 24)
# Young people in family care
disable_young_in_family_care = _rows_for_rule(match_population_and_disable, 25)

# Children in formal family care by type (reuses the totals-and-others parser)
cp_family_care_by_type = _rows_for_rule(get_totals_and_others, 26)

#### Validate data collection using template

In [None]:
# Reference year T whose data column is validated.
input_year = 2017

template_columns = df_collection.columns
# Position of the year column among the template columns.
col_index = template_columns.get_loc(input_year)
# The flags column is taken to be the one immediately after the year column.
col_flag = template_columns[col_index + 1]

# Mask computed from the "CODE" column: rows expected to have data entered.
data_entry_rows = is_entry_row(df_collection["CODE"])


#### Pandera check list of callables: Validation functions at a DataFrame Level
##### Operate on the whole year data collection column (where entries are expected: `data_entry_rows`)

In [None]:
# DataFrame-level checks over the whole year column. All three share the same
# keyword arguments, so only the callable and the display name vary.
# raise_warning=True asks pandera to warn rather than raise when a check fails.
check_list_df = [
    pa.Check(
        check_fn,
        name=check_name,
        col_ind=col_index,
        data_rows=data_entry_rows,
        raise_warning=True,
        ignore_na=False,
    )
    for check_fn, check_name in (
        (valid_data_compilation, "Availability between years"),
        (valid_flag_compilation, "Flag availability between years"),
        (valid_flag_correspondence, "Flag correspondence between years"),
    )
]

##### Operate on column fractions only (Sex Disaggregation)

In [None]:
# One check per sex-disaggregation row group, numbered from 1: totals must be
# consistent with their disaggregation.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Sex Disaggregation {i}",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_sex, 1)
]

# TODO: evaluate ONLY if total[i] equals sum(disag)[i] for i in totals_by_sex
check_list_df += [
    pa.Check(
        valid_partial_disaggregation,
        name=f"Consistent Flags Partial: Sex Disaggregation {i}",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_sex, 1)
]

# TODO: evaluate ONLY if total[i] equals sum(disag)[i] for i in totals_by_sex
check_list_df += [
    pa.Check(
        valid_partial_total,
        name=f"Consistent Totals flagged Partial: Sex Disaggregation {i}",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_sex, 1)
]

##### Operate on column fractions only (Age Disaggregation)

In [None]:
# One disaggregation-consistency check per age row group, numbered from 1.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Age Disaggregation {i}",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_age, 1)
]
# TODO: add valid_partial_disaggregation and valid_partial_total for
# totals_by_age, evaluated ONLY if total[i] equals sum(disag)[i].

##### Operate on column fractions only (Child Formal Alternative Care: Reasons for Placement Disaggregation)

In [None]:
# One disaggregation-consistency check per reasons-for-placement row group,
# numbered from 1.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Reasons for Placement Disaggregation {i} - Child Formal Alternative Care",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_placement, 1)
]
# TODO: add valid_partial_disaggregation and valid_partial_total for
# totals_by_placement, evaluated ONLY if total[i] equals sum(disag)[i].

##### Operate on column fractions only (Child Formal Alternative Care: Reasons for Leaving Disaggregation)

In [None]:
# One disaggregation-consistency check per reasons-for-leaving row group,
# numbered from 1.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Reasons for Leaving Disaggregation {i} - Child Formal Alternative Care",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_leaving, 1)
]
# TODO: add valid_partial_disaggregation and valid_partial_total for
# totals_by_leaving, evaluated ONLY if total[i] equals sum(disag)[i].

##### Operate on column fractions only (Child Access to Justice: Category of Offence Disaggregation)

In [None]:
# One disaggregation-consistency check per category-of-offence row group,
# numbered from 1.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Category of Offence Disaggregation {i} - Child Access to Justice",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_offence, 1)
]
# TODO: add valid_partial_disaggregation and valid_partial_total for
# totals_by_offence, evaluated ONLY if total[i] equals sum(disag)[i].

##### Operate on column fractions only (Child with disabilities in Education: Type of School Disaggregation)

In [None]:
# One disaggregation-consistency check per type-of-school row group,
# numbered from 1.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Type of School Disaggregation {i} - Children with disabilities in Education",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_school, 1)
]
# TODO: add valid_partial_disaggregation and valid_partial_total for
# totals_by_school, evaluated ONLY if total[i] equals sum(disag)[i].

##### Operate on column fractions only (Child with disabilities in Education: Type of ISCED level)

In [None]:
# One disaggregation-consistency check per ISCED-level row group,
# numbered from 1.
check_list_df += [
    pa.Check(
        valid_disaggregation,
        name=f"Consistent Level of ISCED Disaggregation {i} - Children with disabilities in Education",
        col_ind=col_index,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_ISCED, 1)
]
# TODO: add valid_partial_disaggregation and valid_partial_total for
# totals_by_ISCED, evaluated ONLY if total[i] equals sum(disag)[i].

#### Pandera check list of callables: Validation functions at a Series (column) Level
##### Operate on the whole year data collection column (where entries are expected: `data_entry_rows`)

In [None]:
# Column-level checks start with the completeness check on the data-entry rows.
incomplete_data_check = pa.Check(
    valid_empty_data,
    name="Incomplete Data",
    data_rows=data_entry_rows,
    raise_warning=True,
    ignore_na=False,
)
check_list_col = [incomplete_data_check]

##### Operate on column fractions only (disabilities)

In [None]:
# One valid_disabilities check per population group; only the display name and
# the matched rows differ. The last two (family-based care) come later in the
# TMEE validation flow from Flavio but are grouped here with the others.
check_list_col.extend(
    pa.Check(
        valid_disabilities,
        name=check_name,
        df_rows=group_rows,
        raise_warning=True,
        ignore_na=False,
    )
    for check_name, group_rows in (
        (
            "Consistent Disabilities - Children in Formal Residential Care",
            disable_child_in_care,
        ),
        (
            "Consistent Disabilities - Young people in Formal Residential Care",
            disable_young_in_care,
        ),
        (
            "Consistent Disabilities - Children who entered Formal Residential Care",
            disable_child_care_enter,
        ),
        (
            "Consistent Disabilities - Children who left Formal Residential Care",
            disable_child_care_left,
        ),
        (
            "Consistent Disabilities - Young who left Formal Residential Care",
            disable_young_care_left,
        ),
        (
            "Consistent Disabilities - Children in Formal Family-based Care",
            disable_child_in_family_care,
        ),
        (
            "Consistent Disabilities - Young people in Formal Family-based Care",
            disable_young_in_family_care,
        ),
    )
)

##### Operate on column fractions only (Child Protection Stock and Flows)

In [None]:
# One stock-and-flows consistency check per row group, numbered from 1.
check_list_col += [
    pa.Check(
        valid_stock_and_flows,
        name=f"Consistency Alert Stock and Flows {i} - Child Protection Alternative Care",
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(cp_stock_and_flows, 1)
]

##### Operate on column fractions only (disaggregation with category "other")

In [None]:
# Single check covering the category "other" rows parsed from the rules.
category_other_check = pa.Check(
    valid_category_other,
    name="Consistent 'Other' disaggregation - Child Protection reason for placement/of leaving and offence",
    df_rows=cp_others_in_disag,
    raise_warning=True,
    ignore_na=False,
)
check_list_col.append(category_other_check)

##### Operate on the year data collection flag column (fraction Sex Disaggregations)

In [None]:
# Checks applied to the flags column of the reference year: one
# valid_p_or_e_flag check per sex-disaggregation row group.
# The name must be an f-string so that each check carries its own 1-based
# group number; without the `f` prefix every check was named with the literal
# text "{i}" and failures could not be told apart.
check_list_col_flag = [
    pa.Check(
        valid_p_or_e_flag,
        name=f"Consistent Provisional Flags - Sex Disaggregation {i}",
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_sex, 1)
]

##### Operate on the year data collection flag column (fraction Age Disaggregations)

In [None]:
# Here valid_p_or_e_flag must be extended for totals_by_age

##### Operate on the year data collection flag column (all other disaggregations)

In [None]:
# Here valid_p_or_e_flag must be extended for totals_by_placement
# Here valid_p_or_e_flag must be extended for totals_by_leaving
# Here valid_p_or_e_flag must be extended for totals_by_offence
# Here valid_p_or_e_flag must be extended for totals_by_school
# Here valid_p_or_e_flag must be extended for totals_by_ISCED

#### Define Pandera Schema and Check Functions to be applied!

In [None]:
# Schema for one reference year: column-level checks on the year's data column,
# flag checks on its flags column, and DataFrame-level checks on the whole
# collection.
# The data column is keyed by `input_year` (not a hard-coded 2017) so that it
# always matches `col_flag`, which is derived from the same `input_year`.
schema = pa.DataFrameSchema(
    {
        input_year: pa.Column(None, checks=check_list_col, nullable=True),
        col_flag: pa.Column(None, checks=check_list_col_flag, nullable=True),
    },
    checks=check_list_df,
)

#### Leverage warning package for error handling

In [None]:
# Run the schema while recording every warning, then print each message.
with warnings.catch_warnings(record=True) as caught_warnings:
    # "always" disables de-duplication so repeated warnings are all recorded.
    warnings.simplefilter(action="always")
    validated_df = schema.validate(df_collection)
    for warning in caught_warnings:
        print(warning.message)

#### Commented code

In [None]:
# check_list = [pa.Check(valid_data_compilation, name="Availability between years",
#         col_ind=col_index, data_rows=data_entry_rows, raise_warning=True, ignore_na=False)]
# check_list.append(
#     pa.Check(valid_flag_compilation, name="Flag availability between years",
#         col_ind=col_index, data_rows=data_entry_rows, raise_warning=True, ignore_na=False)
# )
# check_list.append(
#     pa.Check(valid_flag_correspondence, name="Flag correspondence between years",
#         col_ind=col_index, data_rows=data_entry_rows, raise_warning=True, ignore_na=False)
# )
# check_list.extend(
#     pa.Check(
#         valid_disaggregation, name=f"Consistent Sex Disaggregation {i}", col_ind=col_index,
#         df_rows=rows, raise_warning=True, ignore_na=False) for i, rows in enumerate(totals_by_sex, 1)
# )

# schema = pa.DataFrameSchema({
#     2017:pa.Column(None, pa.Check(valid_empty_data, name="Incomplete Data",
#     data_rows=data_entry_rows, raise_warning=True, ignore_na=False), nullable=True),
#     col_flag:pa.Column(None, nullable=True)
# },
#     checks=check_list
# )