In [1]:
from survey_enhance.experiment.initialisation import (
    dataset, 
    calibration_parameters,
    Loss,
    create_frs_dataset,
    household_weights,
)

## Experiment 1: Evaluate percentile matching method

In [2]:
# Loss object built from the dataset and the calibration parameters.
# NOTE(review): `initial_loss` is not referenced again in the visible cells, and
# a second Loss is constructed from the same arguments later as `loss` —
# confirm whether both are needed.
initial_loss = Loss(dataset, calibration_parameters)

from survey_enhance.percentile_matching import match_percentiles_df, match_percentiles

# Income columns that are (a) requested from the SPI simulation, (b) selected
# from the FRS person-level frame, and (c) passed one at a time through
# `match_percentiles` in the loop below.
VARIABLES_TO_ADJUST = [
    "employment_income",
    "self_employment_income",
    "pension_income",
    "savings_interest_income",
    "dividend_income",
]

from policyengine_uk import Microsimulation, SPI

# SPI side: run a 2019 microsimulation and pull out the income columns.
spi_simulation = Microsimulation(dataset=SPI, dataset_year=2019)
spi_df = spi_simulation.calculate_dataframe(VARIABLES_TO_ADJUST, period=2019)

# FRS side: the same columns from the person-level frame of the base dataset.
frs_df = dataset.person_df[VARIABLES_TO_ADJUST]

# Adjust a copy so `frs_df` keeps the unadjusted values for later comparison.
percentile_adjusted_frs_person_df = frs_df.copy()
for variable in VARIABLES_TO_ADJUST:
    frs_values, spi_values = frs_df[variable], spi_df[variable]
    # presumably the SPI column is the reference distribution for the FRS
    # column — TODO confirm against `match_percentiles`.
    percentile_adjusted_frs_person_df[variable] = match_percentiles(
        frs_values,
        spi_values,
        percentile_threshold=0.97,
        num_groups=12,
    )

# Rebuild a dataset from the adjusted person-level incomes.
percentile_adjusted_dataset = create_frs_dataset(
    percentile_adjusted_frs_person_df
)

In [3]:
# Column totals of the unadjusted FRS incomes, scaled to billions.
frs_totals = frs_df.sum()
frs_totals / 1e9

employment_income          892.257181
self_employment_income     114.199002
pension_income             112.636594
savings_interest_income      6.824097
dividend_income              9.667614
dtype: float64

In [4]:
# Column totals of the percentile-adjusted incomes, scaled to billions.
percentile_adjusted_totals = percentile_adjusted_frs_person_df.sum()
percentile_adjusted_totals / 1e9

employment_income          932.617010
self_employment_income     130.953874
pension_income             115.528538
savings_interest_income     14.877669
dividend_income             54.361771
dtype: float64

In [5]:
# Evaluate the loss on the original and on the percentile-adjusted dataset,
# using the same household weights for both, then print the two values.
# NOTE(review): the output recorded for this cell is
# "TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'". Which
# call raises it cannot be determined from the notebook alone — investigate
# before relying on the later cells, which read this `loss` object's log.
loss = Loss(dataset, calibration_parameters)

# presumably each call adds entries to the comparison log collected later (that
# log has an `epoch` column) — keep this call order; TODO confirm.
frs_loss = loss(household_weights, dataset)
percentile_adjusted_loss = loss(household_weights, percentile_adjusted_dataset)

print(f"FRS loss: {frs_loss}")
print(f"Percentile adjusted loss: {percentile_adjusted_loss}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [None]:
# Fetch the loss object's comparison log and move its index into ordinary
# columns, so later cells can filter rows by `epoch` and `name`.
comparison_log = loss.collect_comparison_log()
df = comparison_log.reset_index()

In [None]:
# Change in every logged column between epoch 3 and epoch 4, per `name`:
# the epoch-3 rows are subtracted from the epoch-4 rows, aligned on `name`.
# The result keeps the name `df_4` because the plotting cell below reads it.
# NOTE(review): the original note said rows come in groups of five —
# presumably five epochs per name; confirm, since alignment relies on `name`.
df_3 = df[df.epoch == 3].set_index("name")
df_4 = (
    df[df.epoch == 4]
    .set_index("name")
    .subtract(df_3)
    .reset_index()
)

In [None]:
# Show every comparison-log row whose `name` is "IncomeTax".
df.loc[df["name"] == "IncomeTax"]

Unnamed: 0,index,epoch,name,y_true,y_pred,loss
1847,18,0,IncomeTax,0.0,0.0,0.052632
1866,37,0,IncomeTax,0.0,0.0,0.052632
1885,56,0,IncomeTax,0.0,0.0,0.052632
1904,75,0,IncomeTax,0.0,0.0,0.104175


In [None]:
import plotly.express as px
import numpy as np

# Bar chart of the `loss` column of `df_4`, one bar per `name`, ordered by
# ascending `loss`; every column of `df_4` is exposed in the hover tooltip.
loss_sorted = df_4.sort_values("loss")
px.bar(
    loss_sorted,
    x="name",
    y="loss",
    hover_data=df_4.columns,
)