In [1]:
from policyengine_uk.data import SPI_2020_21

# Instantiate the SPI_2020_21 dataset class and call generate() on it.
# NOTE(review): presumably this builds the dataset that the Microsimulation
# below consumes — confirm against the policyengine_uk dataset API.
SPI_2020_21().generate()

In [2]:
from policyengine_uk import Microsimulation

# Simulation over the SPI_2020_21 dataset (the class itself is passed, not an
# instance); `sim` is reused by the later cells that call sim.calculate(...).
sim = Microsimulation(dataset=SPI_2020_21)

In [3]:
# Sum of the simulated "income_tax" variable, divided by 1e9 (i.e. shown in billions).
sim.calculate("income_tax").sum()/1e9

195.48206357797875

In [4]:
import pandas as pd

# Load the raw, tab-separated SPI file referenced by the dataset class.
df = pd.read_csv(SPI_2020_21.spi_data_file_path, sep="\t")

In [5]:
# Simulated variables to copy onto the SPI frame for side-by-side comparison.
# Each name appears exactly once, so every variable is calculated a single time.
variables_to_add = [
    "income_tax",
    "adjusted_net_income",
    "other_tax_credits",
    "received_allowances",
    "personal_allowance",
    "taxed_income",
    "taxed_dividend_income",
    "taxed_savings_income",
    "earned_taxable_income",
    "taxable_employment_income",
    "employment_deductions",
    "gift_aid",
    "savings_allowance",
    "taxable_savings_interest_income",
    "marriage_allowance",
    "received_allowances_earned_income",
    "received_allowances_savings_income",
    "received_allowances_dividend_income",
]

# The "zzz_policyengine_" prefix keeps the simulated columns grouped together
# (and after the SPI columns) when the index is sorted case-insensitively.
for variable in variables_to_add:
    df[f"zzz_policyengine_{variable}"] = sim.calculate(variable)

In [6]:
import numpy as np

# Signed and absolute gap between the simulated income tax and the SPI
# column TOTTAX_DEVO_TXP.
df["error"] = df.zzz_policyengine_income_tax - df.TOTTAX_DEVO_TXP
df["abs_error"] = df.error.abs()

# Add the EMPINC column: (PAY + EPB - EXPS) floored at zero, plus the other
# listed components. Computed column-wise instead of with a row-wise
# df.apply(axis=1), which runs a Python lambda once per record.
net_pay = df["PAY"] + df["EPB"] - df["EXPS"]
df["EMPINC"] = (
    # where(...) keeps positive values and substitutes 0 otherwise, matching
    # max(0, value) — including when value is NaN.
    net_pay.where(net_pay > 0, 0)
    + df["INCPBEN"]
    + df["OSSBEN"]
    + df["TAXTERM"]
    + df["UBISJA"]
    + df["MOTHINC"]
)

# pandas show all rows
pd.set_option('display.max_rows', None)

In [7]:
# Five records with the smallest absolute error of at least 100, shown one
# column per record with the fields ordered case-insensitively.
(
    df.loc[df.abs_error >= 100]
    .sort_values("abs_error")
    .head(5)
    .transpose()
    .sort_index(key=lambda labels: labels.str.lower())
)



Unnamed: 0,204501,668506,769908,770166,591136
abs_error,100.0,100.0,100.0,100.0,100.0
AGERANGE,4,7,5,7,7
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,18600,350,29600
DSHIPS,3,3,1,3,3
EIDF,315,0,0,0,0
EMPINC,12060,0,41300,0,2150


In [8]:
def error_category(error):
    """Bucket an absolute error into one of four labelled bands.

    Returns "<=10" for values up to and including 10, "<100" for values
    below 100, "<1000" for values below 1000, and ">=1000" for everything
    else (which includes any value that fails all three comparisons).
    """
    if error <= 10:
        return "<=10"
    # Remaining bands are half-open: the first upper bound the value is
    # strictly below determines the label.
    for upper_bound, label in ((100, "<100"), (1000, "<1000")):
        if error < upper_bound:
            return label
    return ">=1000"
# Percentage of records (excluding MAIND == -1) falling in each error band.
(
    df.loc[df.MAIND != -1, "abs_error"]
    .map(error_category)
    .value_counts(normalize=True)
    .mul(100)
)


#495286

abs_error
<=10      92.685731
<1000      4.806584
>=1000     1.671160
<100       0.836525
Name: proportion, dtype: float64

In [9]:
# Records whose absolute error is exactly 250: the five with the lowest
# simulated earned taxable income, one column per record, fields ordered
# case-insensitively.
result_df = (
    df.loc[df["abs_error"].eq(250)]
    .sort_values("zzz_policyengine_earned_taxable_income")
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)

# Display the result
display(result_df)

Unnamed: 0,79574,201397,200975,520288,524966
abs_error,250.0,250.0,250.0,250.0,250.0
AGERANGE,4,5,5,5,7
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,0,92600,55
DSHIPS,3,3,3,3,3
EIDF,0,0,0,0,0
EMPINC,12500,3500,0,8780,30


In [10]:
# Columns that must all equal zero for a record to be selected.
zero_columns = [
    "EMPINC",
    "DIVIDENDS",
    "INCBBS",
    "PROFITS",
    "CAPALL",
    "PAY",
    "PENSION",
    "MOTHDED",
    "GIFTAID",
]

# Non-zero INCPROP and PAS, absolute error under 250, and MAIND other than -1.
selection = (
    df[zero_columns].eq(0).all(axis=1)
    & df["INCPROP"].ne(0)
    & df["PAS"].ne(0)
    & df["abs_error"].lt(250)
    & df["MAIND"].ne(-1)
)

# Five largest remaining errors, one column per record, fields ordered
# case-insensitively.
result_df = (
    df[selection]
    .sort_values("abs_error", ascending=False)
    .head()
    .T
    .sort_index(key=lambda labels: labels.str.lower())
)

display(result_df)

Unnamed: 0,742750,677745,425233,591040,431699
abs_error,249.0,249.0,248.0,237.800003,130.0
AGERANGE,6,6,6,6,6
BPADUE,0,0,0,0,0
CAPALL,0,0,0,0,0
COVNTS,0,0,0,0,0
DEFICIEN,0,0,0,0,0
DIVIDENDS,0,0,0,0,0
DSHIPS,3,3,3,3,3
EIDF,0,0,0,0,0
EMPINC,0,0,0,0,0
