In [1]:
from policyengine_uk.data import SPI_2020_21

# Instantiate SPI_2020_21 and call its generate() method.
# NOTE(review): presumably this builds the dataset that the cells below read
# (via Microsimulation and SPI_2020_21.spi_data_file_path) — confirm. It runs
# unconditionally every time this cell is executed.
SPI_2020_21().generate()

In [2]:
from policyengine_uk import Microsimulation

# Build the simulation used by every later cell. Note that the dataset class
# itself (not an instance) is passed as `dataset`.
sim = Microsimulation(dataset=SPI_2020_21)

In [3]:
# Sum of the simulated "income_tax" variable, scaled by 1e9 (i.e. in billions).
income_tax_total = sim.calculate("income_tax").sum()
income_tax_total / 1e9

195.06868217001394

In [4]:
import pandas as pd

# Read the file at SPI_2020_21.spi_data_file_path as tab-separated values.
spi_path = SPI_2020_21.spi_data_file_path
df = pd.read_csv(spi_path, sep="\t")

In [5]:
# Simulation variables to compute and attach to the SPI frame as extra columns.
variables_to_add = [
    "income_tax",
    "adjusted_net_income",
    "other_tax_credits",
    "gift_aid",
    "received_allowances",
    "personal_allowance",
    "taxed_income",
    "taxed_dividend_income",
    "taxed_savings_income",
    "earned_taxable_income",
    "taxable_pension_income",
    "savings_allowance",
    "dividend_allowance",
    "received_allowances_earned_income",
    "received_allowances_dividend_income",
    "basic_rate_savings_income",
    "higher_rate_savings_income",
    "add_rate_savings_income",
    "received_allowances_savings_income",
    "savings_starter_rate_income",
]

# Each result is stored in a "zzz_policyengine_<variable>" column of df.
# NOTE(review): the "zzz_" prefix presumably keeps these rows at the bottom of
# the alphabetically sorted tables shown below — confirm.
# NOTE(review): this presumably relies on the simulation returning one value
# per SPI row, in the same order as the file — confirm.
for variable in variables_to_add:
    column_name = "zzz_policyengine_" + variable
    df[column_name] = sim.calculate(variable)

In [6]:
import numpy as np

# Signed and absolute difference between the simulated income tax and the
# TOTTAX_DEVO_TXP column of the SPI file.
df["error"] = df["zzz_policyengine_income_tax"] - df["TOTTAX_DEVO_TXP"]
df["abs_error"] = df["error"].abs()

# Add the EMPINC column: (PAY + EPB - EXPS) floored at zero, plus the remaining
# income components.
# This is computed with column arithmetic rather than a row-wise
# `df.apply(..., axis=1)`, which calls a Python lambda once per record and does
# not return a Series for an empty frame. `where(net_pay > 0, 0)` keeps the
# value only when it is strictly positive and substitutes 0 otherwise, which
# matches `max(0, x)` exactly (including mapping NaN to 0).
net_pay = df["PAY"] + df["EPB"] - df["EXPS"]
df["EMPINC"] = (
    net_pay.where(net_pay > 0, 0)
    + df["INCPBEN"]
    + df["OSSBEN"]
    + df["TAXTERM"]
    + df["UBISJA"]
    + df["MOTHINC"]
)

# pandas show all rows. This is a global display setting: it affects every
# frame rendered after this cell, so avoid displaying the full `df` directly.
pd.set_option('display.max_rows', None)

In [7]:
# Records with an absolute error of at least 100, sorted ascending by that
# error; the first five are shown transposed (one row per original column)
# with the row labels ordered case-insensitively.
(
    df.loc[df["abs_error"] >= 100]
    .sort_values("abs_error")
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)



Unnamed: 0,53466,509728,712504,132332,509407
abs_error,100.0,100.0,100.0,100.0,100.0
AGERANGE,5,4,2,3,5
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,36000,0,0,0
DSHIPS,3,1,3,3,3
EIDF,0,0,2870,5680,4500
EMPINC,0,14400,30900,42980,0


In [8]:
def error_category(error):
    """Return the magnitude bucket label for an absolute error.

    Labels, checked in order:
      "<=10"   - error is at most 10
      "<100"   - error is above 10 and below 100
      "<1000"  - error is at least 100 and below 1000
      ">=1000" - everything else (this is also where a value that fails
                 every comparison, such as NaN, ends up)
    """
    # The lowest bucket is the only one with an inclusive upper bound.
    if error <= 10:
        return "<=10"
    # Remaining buckets use exclusive upper bounds, tested smallest first.
    for upper_bound, label in ((100, "<100"), (1000, "<1000")):
        if error < upper_bound:
            return label
    return ">=1000"
# Percentage of records (unweighted row counts) in each absolute-error bucket.
(
    df["abs_error"]
    .apply(error_category)
    .value_counts(normalize=True)
    .mul(100)
)


#495286

abs_error
<=10      84.148543
<1000      8.200767
<100       4.079463
>=1000     3.571227
Name: proportion, dtype: float64

In [9]:
# Records whose absolute error is exactly 250, ordered by the simulated earned
# taxable income (lowest first); the first five are transposed and their row
# labels sorted case-insensitively.
result_df = (
    df.loc[df["abs_error"].eq(250)]
    .sort_values("zzz_policyengine_earned_taxable_income")
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)

# Display the result
display(result_df)

Unnamed: 0,79574,208098,208759,501707,520288
abs_error,250.0,250.0,250.0,250.0,250.0
AGERANGE,4,3,5,2,5
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,0,0,92600
DSHIPS,3,3,3,2,3
EIDF,0,2210,0,0,0
EMPINC,12500,14700,0,12500,8780


In [10]:
# Records where the simulated personal allowance is greater than the PAS
# column, largest absolute error first; the first five are transposed and
# their row labels sorted case-insensitively.
allowance_above_pas = df["zzz_policyengine_personal_allowance"] > df["PAS"]
result_df = (
    df.loc[allowance_above_pas]
    .sort_values("abs_error", ascending=False)
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)

display(result_df)

Unnamed: 0,484154,426391,709684,577471,597837
abs_error,20095.0,8721.0,7159.0,7000.0,6564.5
AGERANGE,4,4,2,3,4
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,0,0,67300
DSHIPS,3,3,3,3,3
EIDF,0,0,0,0,0
EMPINC,110300,108635,99490,60300,0
