# Validation

Each version of `openfisca-uk-data` is checked against UKMOD input data to ensure consistency of variables where comparable.

## Quantiles

In [16]:
from openfisca_uk_data.tests.frs.test_against_ukmod import (
    test_quantile,
    test_aggregate,
    test_average_error_among_nonzero,
    test_ukmod_nonzero_agreement,
    test_nonzero_count,
    metadata,
)
import pandas as pd
import numpy as np
from itertools import product

quantiles = [
    (variable, quantile, *test_quantile(variable, quantile))
    for variable, quantile in product(
        metadata.keys(), np.linspace(0.1, 0.9, 9).round(1)
    )
]
df = pd.DataFrame(
    quantiles, columns=("Variable", "Quantile", "OpenFisca-UK-Data", "UKMOD")
).set_index("Variable")
df["Abs. Error"] = ((df["OpenFisca-UK-Data"] - df["UKMOD"]).abs() * 100).round(
    1
)
df["Rel. Error (%)"] = (
    (df["OpenFisca-UK-Data"] / df["UKMOD"] - 1).abs() * 100
).round(1)
df["OpenFisca-UK-Data"] = df["OpenFisca-UK-Data"].round(1)
df["UKMOD"] = df["UKMOD"].round(1)
df = df.fillna(0)
df

Unnamed: 0_level_0,Quantile,OpenFisca-UK-Data,UKMOD,Abs. Error,Rel. Error (%)
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age,0.1,9.0,9.0,0.0,0.0
age,0.2,18.0,18.0,0.0,0.0
age,0.3,25.0,25.0,0.0,0.0
age,0.4,32.0,32.0,0.0,0.0
age,0.5,40.0,40.0,0.0,0.0
...,...,...,...,...,...
housing_service_charges,0.5,359.0,359.0,0.4,0.0
housing_service_charges,0.6,748.0,747.9,0.5,0.0
housing_service_charges,0.7,1196.7,1196.7,0.2,0.0
housing_service_charges,0.8,1695.3,1695.3,0.3,0.0


## Aggregates

In [17]:
aggregates = [
    (variable, *test_aggregate(variable)) for variable in metadata.keys()
]
df = pd.DataFrame(
    aggregates, columns=("Variable", "OpenFisca-UK-Data (£bn)", "UKMOD (£bn)")
).set_index("Variable")
df["Abs. Error (£bn)"] = (
    (df["OpenFisca-UK-Data (£bn)"] - df["UKMOD (£bn)"]).abs() / 1e9
).round(1)
df["Rel. Error (%)"] = (
    (df["OpenFisca-UK-Data (£bn)"] / df["UKMOD (£bn)"] - 1).abs() * 100
).round(1)
df["OpenFisca-UK-Data (£bn)"] = (df["OpenFisca-UK-Data (£bn)"] / 1e9).round(1)
df["UKMOD (£bn)"] = (df["UKMOD (£bn)"] / 1e9).round(1)
df = df.fillna(0)
df

Unnamed: 0_level_0,OpenFisca-UK-Data (£bn),UKMOD (£bn),Abs. Error (£bn),Rel. Error (%)
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
age,2.6,2.6,0.0,0.0
employment_income,785.1,785.1,0.0,0.0
self_employment_income,115.9,115.9,0.0,0.0
tax_free_savings_income,3.3,3.3,0.0,0.2
capital_income,17.1,17.2,0.1,0.6
property_income,12.7,12.6,0.2,1.3
maintenance_income,2.4,2.4,0.0,0.0
miscellaneous_income,3.2,3.2,0.0,0.0
private_transfer_income,8.8,8.9,0.1,0.9
lump_sum_income,5.0,5.0,0.0,0.0


## Non-zero counts

Figures given in millions.

In [24]:
aggregates = [
    (
        variable,
        *test_nonzero_count(variable),
        test_ukmod_nonzero_agreement(variable),
    )
    for variable in metadata.keys()
]
df = pd.DataFrame(
    aggregates,
    columns=(
        "Variable",
        "OpenFisca-UK-Data (m)",
        "UKMOD (m)",
        "Non-zero agreement",
    ),
).set_index("Variable")
df["Non-zero agreement"] = (100 - df["Non-zero agreement"] * 100).round(1)
df["Abs. Error (m)"] = (
    (df["OpenFisca-UK-Data (m)"] - df["UKMOD (m)"]).abs() / 1e6
).round(1)
df["Rel. Error (%)"] = (
    (df["OpenFisca-UK-Data (m)"] / df["UKMOD (m)"] - 1).abs() * 100
).round(1)
df["OpenFisca-UK-Data (m)"] = (df["OpenFisca-UK-Data (m)"] / 1e6).round(1)
df["UKMOD (m)"] = (df["UKMOD (m)"] / 1e6).round(1)
df = df.fillna(0)
df

Unnamed: 0_level_0,OpenFisca-UK-Data (m),UKMOD (m),Non-zero agreement,Abs. Error (m),Rel. Error (%)
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age,64.7,64.7,100.0,0.0,0.0
employment_income,26.7,26.7,100.0,0.0,0.0
self_employment_income,4.2,4.2,99.8,0.0,0.0
tax_free_savings_income,9.1,9.1,100.0,0.0,0.1
capital_income,21.8,21.9,99.9,0.0,0.2
property_income,2.1,2.0,100.0,0.1,2.7
maintenance_income,0.7,0.7,100.0,0.0,0.4
miscellaneous_income,0.9,0.9,100.0,0.0,0.0
private_transfer_income,1.2,1.3,99.9,0.0,1.6
lump_sum_income,0.2,0.2,100.0,0.0,0.0


## Average errors

Mean relative error among households which have a non-zero value in UKMOD.

In [26]:
aggregates = [
    (variable, test_average_error_among_nonzero(variable))
    for variable in metadata.keys()
]
df = pd.DataFrame(
    aggregates, columns=("Variable", "Mean Rel. Error (%)")
).set_index("Variable")
df["Mean Rel. Error (%)"] = (df["Mean Rel. Error (%)"] * 100).round(1)
df = df.fillna(0)
df

Unnamed: 0_level_0,Mean Rel. Error (%)
Variable,Unnamed: 1_level_1
age,0.0
employment_income,0.0
self_employment_income,0.0
tax_free_savings_income,0.1
capital_income,0.8
property_income,0.9
maintenance_income,0.0
miscellaneous_income,0.0
private_transfer_income,3.1
lump_sum_income,0.0
