#### Validation Engine Prototyping

In [12]:
import pandas as pd
import pandera as pa
import warnings
import importlib
import test_functions.def_val_func

# importlib.reload is only needed during the debugging phase!
importlib.reload(test_functions.def_val_func)
from test_functions.def_val_func import (
    valid_empty_data,
    valid_data_compilation,
    valid_flag_compilation,
    valid_flag_correspondence,
    valid_partial_disaggregation,
    valid_partial_total,
    valid_p_or_e_flag,
    valid_disaggregation,
    get_totals_by_sex,
    is_entry_row,
)

#### Reference to Data Validation rules

In [13]:
# Workbook holding the validation rules; every cell is read as text (dtype=str).
path_read_file = "validation_rules.xlsx"
df_rules = pd.read_excel(
    path_read_file,
    dtype=str,
    sheet_name="validation_rules",
)

#### Reference to Excel Data Collection Template

In [14]:
# Data collection workbook: the first `read_rows_offset` rows of the "Template"
# sheet are skipped on read, and all cells are loaded as text.
path_read_file = "Transmonee - Data Collection Template - 24 June.xlsx"
read_rows_offset = 12
df_collection = pd.read_excel(
    path_read_file,
    skiprows=read_rows_offset,
    dtype=str,
    sheet_name="Template",
)

In [15]:
# Sex disaggregation rows, derived from the "Formulas test failed" cell of rule
# row 8 and the read offset used for the template.
# NOTE(review): row label 8 is hard-coded -- confirm it is the sex-disaggregation rule.
totals_by_sex = get_totals_by_sex(
    df_rules["Formulas test failed"].loc[8],
    read_rows_offset,
)
totals_by_sex

[{'total': 3, 'sex1': 5, 'sex2': 6},
 {'total': 26, 'sex1': 28, 'sex2': 29},
 {'total': 49, 'sex1': 52, 'sex2': 53},
 {'total': 54, 'sex1': 56, 'sex2': 57},
 {'total': 84, 'sex1': 86, 'sex2': 87},
 {'total': 107, 'sex1': 109, 'sex2': 110},
 {'total': 137, 'sex1': 139, 'sex2': 140},
 {'total': 160, 'sex1': 163, 'sex2': 164},
 {'total': 166, 'sex1': 169, 'sex2': 170},
 {'total': 190, 'sex1': 193, 'sex2': 194},
 {'total': 214, 'sex1': 217, 'sex2': 218},
 {'total': 238, 'sex1': 240, 'sex2': 241},
 {'total': 261, 'sex1': 264, 'sex2': 265},
 {'total': 266, 'sex1': 268, 'sex2': 269},
 {'total': 296, 'sex1': 298, 'sex2': 299},
 {'total': 319, 'sex1': 322, 'sex2': 323},
 {'total': 343, 'sex1': 346, 'sex2': 347},
 {'total': 367, 'sex1': 370, 'sex2': 371},
 {'total': 391, 'sex1': 393, 'sex2': 394},
 {'total': 421, 'sex1': 423, 'sex2': 424},
 {'total': 444, 'sex1': 447, 'sex2': 448},
 {'total': 449, 'sex1': 452, 'sex2': 453},
 {'total': 481, 'sex1': 484, 'sex2': 485},
 {'total': 513, 'sex1': 516, 

#### Validate data collection using template

In [None]:
# Reference year T whose entries are validated
input_year = 2017
# Mask of the rows where data is expected to be entered (built from the CODE column)
data_entry_rows = is_entry_row(df_collection["CODE"])
# Position of the reference-year column in the template ...
col_ind = df_collection.columns.get_loc(input_year)
# ... and the name of the column right after it, which holds that year's flags
col_flag = df_collection.columns[col_ind + 1]


#### Pandera check list of callables: Validation functions at a DataFrame Level
##### Operate on the whole year data collection column (where entries are expected: `data_entry_rows`)

In [None]:
# DataFrame-level checks over the whole reference-year column. All three share
# the same keyword arguments, so only the validation function and its display
# name vary; the list order matches the order of the pairs below.
check_list_df = [
    pa.Check(
        check_fn,
        name=check_name,
        col_val=col_ind,
        data_rows=data_entry_rows,
        raise_warning=True,
        ignore_na=False,
    )
    for check_fn, check_name in (
        (valid_data_compilation, "Availability between years"),
        (valid_flag_compilation, "Flag availability between years"),
        (valid_flag_correspondence, "Flag correspondence between years"),
    )
]

##### Operate on column fractions only (disaggregations)

In [None]:
# Append one check per (validation function, total/sex row group). The outer
# loop runs over the functions, so all checks of one kind are appended before
# the next kind starts; groups are numbered from 1 in the check name.
check_list_df.extend(
    pa.Check(
        check_fn,
        name=f"{label} {i}",
        col_val=col_ind,
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for check_fn, label in (
        (valid_disaggregation, "Consistent Sex Disaggregation"),
        (valid_partial_disaggregation, "Consistent Flags Partial: Sex Disaggregation"),
        (valid_partial_total, "Consistent Totals flagged Partial: Sex Disaggregation"),
    )
    for i, rows in enumerate(totals_by_sex, 1)
)

#### Pandera check list of callables: Validation functions at a Series (column) Level
##### Operate on the whole year data collection column (where entries are expected: `data_entry_rows`)

In [None]:
# Column-level checks for the reference-year data column; currently a single
# check restricted to the rows where data entry is expected.
check_list_col = []
check_list_col.append(
    pa.Check(
        valid_empty_data,
        name="Incomplete Data",
        data_rows=data_entry_rows,
        raise_warning=True,
        ignore_na=False,
    )
)

##### Operate on column fractions only (disaggregations)

In [None]:
# Column-level checks for the flags column: one check per total/sex row group.
# The name must be an f-string so each check carries its group number (from 1);
# without the `f` prefix every check was literally named "... {i}" and the
# groups could not be told apart.
check_list_col_flag = [
    pa.Check(
        valid_p_or_e_flag,
        name=f"Consistent Provisional Flags - Sex Disaggregation {i}",
        df_rows=rows,
        raise_warning=True,
        ignore_na=False,
    )
    for i, rows in enumerate(totals_by_sex, 1)
]

#### Define Pandera Schema and Check Functions to be applied!

In [None]:
# Schema for the reference year: column-level checks are attached to the data
# column and to its flags column, DataFrame-level checks go in `checks`.
# The data column is keyed on `input_year` rather than a literal year so the
# schema always targets the same column that `col_ind` / `col_flag` were
# derived from when the reference year is changed.
schema = pa.DataFrameSchema(
    {
        input_year: pa.Column(None, checks=check_list_col, nullable=True),
        col_flag: pa.Column(None, checks=check_list_col_flag, nullable=True),
    },
    checks=check_list_df,
)

#### Leverage the `warnings` package for error handling

In [None]:
# Run the schema while recording every warning raised during validation
# ("always" disables the default once-per-location de-duplication).
with warnings.catch_warnings(record=True) as caught_warnings:
    warnings.simplefilter("always")
    validated_df = schema(df_collection)

# Report the recorded warnings, one message per line, in the order raised.
for warning in caught_warnings:
    print(warning.message)

#### Commented code

In [None]:
# check_list = [pa.Check(valid_data_compilation, name="Availability between years",
#         col_val=col_ind, data_rows=data_entry_rows, raise_warning=True, ignore_na=False)]
# check_list.append(
#     pa.Check(valid_flag_compilation, name="Flag availability between years",
#         col_val=col_ind, data_rows=data_entry_rows, raise_warning=True, ignore_na=False)
# )
# check_list.append(
#     pa.Check(valid_flag_correspondence, name="Flag correspondence between years",
#         col_val=col_ind, data_rows=data_entry_rows, raise_warning=True, ignore_na=False)
# )
# check_list.extend(
#     pa.Check(
#         valid_disaggregation, name=f"Consistent Sex Disaggregation {i}", col_val=col_ind,
#         df_rows=rows, raise_warning=True, ignore_na=False) for i, rows in enumerate(totals_by_sex, 1)
# )

# schema = pa.DataFrameSchema({
#     2017:pa.Column(None, pa.Check(valid_empty_data, name="Incomplete Data",
#     data_rows=data_entry_rows, raise_warning=True, ignore_na=False), nullable=True),
#     col_flag:pa.Column(None, nullable=True)
# },
#     checks=check_list
# )