# Percentile matching

A common method for correcting income under-reporting is to match the survey's income distribution to that of an administrative tax dataset.

In [1]:
from loss.loss import Loss, calibration_parameters
from datasets.frs import FRS_2019_20, PercentileMatchedFRS
from datasets.output_dataset import OutputDataset
import torch
import yaml

# Baseline: the unmodified FRS wrapped as an output dataset.
# NOTE(review): 2019 and 2022 are presumably the first and last years the
# output dataset covers — confirm against OutputDataset.from_dataset.
original_frs = OutputDataset.from_dataset(FRS_2019_20, 2019, 2022)()

# Comparison: the same FRS with only dividend income percentile-matched, built
# over the same year arguments so the two datasets differ in that respect only.
# NOTE(review): force_generate=True presumably rebuilds the matched dataset
# instead of reusing a stored copy — confirm.
percentile_matched_frs = OutputDataset.from_dataset(
    PercentileMatchedFRS.from_dataset(
        FRS_2019_20,
        percentile_matched_variables=["dividend_income"],
        force_generate=True,
    ),
    2019,
    2022,
)()

# The date is a plain string literal: the previous f-string had no
# placeholders, so the `f` prefix did nothing.
# NOTE(review): static_dataset=False is presumably needed because the loss is
# evaluated on two different datasets below — confirm against Loss.
loss = Loss(
    original_frs,
    calibration_parameters("2022-01-01"),
    static_dataset=False,
)

# Use the original FRS household weights for both evaluations, so any change
# in loss comes from the dataset rather than from reweighting.
weights = torch.tensor(original_frs.household.household_weight.values)

frs_loss = loss(weights, original_frs)

percentile_matched_frs_loss = loss(weights, percentile_matched_frs)

# Print the loss computation tree for the percentile-matched dataset as YAML.
print(yaml.dump(loss.computation_tree(weights, percentile_matched_frs)))

Loss.Programs:
  1_loss: 1.0213157906954613
  2_weight: 1
  3_children:
    Loss.Programs.DividendIncome:
      1_loss: 0.773819055140283
      2_weight: 65.2
      3_children:
        Loss.Programs.DividendIncome.dividend_income_budgetary_impact:
          1_loss: 0.5476381102805661
          2_weight: 1
          3_children:
            dividend_income_budgetary_impact_ENGLAND:
              1_loss: 2.2063085119521027e-06
              2_loss_0: 0.6773687680202487
              3_y_pred: 53,320,681,506.21
              4_y_0_pred: 9,450,501,623.28
              5_y_true: 53,400,000,000.00
            dividend_income_budgetary_impact_NORTHERN_IRELAND:
              1_loss: 0.9537625500042562
              2_loss_0: 0.9537625500042562
              3_y_pred: 32,739,489.54
              4_y_0_pred: 32,739,489.54
              5_y_true: 1,400,000,000.00
            dividend_income_budgetary_impact_SCOTLAND:
              1_loss: 0.37513747584301366
              2_loss_0: 0.6671235232756