In [None]:
import pandas as pd
from microdf import MicroDataFrame
import numpy as np
from policyengine_us import Microsimulation
from microimpute.models import QRF

def train_tip_model():
    cols = ['SSUID','PNUM','MONTHCODE','ERESIDENCEID','ERELRPE','SPANEL','SWAVE',
        
        'WPFINWGT',
        
        'ESEX','TAGE','TAGE_EHC','ERACE','EORIGIN','EEDUC', 'EDEPCLM', 'EMS', 'EFSTATUS',
        
        'TJB1_TXAMT', 'TJB1_MSUM', 'TJB1_OCC', 'TJB1_IND', 'AJB1_TXAMT', 'EJB1_TYPPAY3',
        'TJB2_TXAMT', 'TJB2_MSUM', 'TJB2_OCC', 'TJB2_IND', 'AJB2_TXAMT', 'EJB2_TYPPAY3',
        'TJB3_TXAMT', 'TJB3_MSUM', 'TJB3_OCC', 'TJB3_IND', 'AJB3_TXAMT', 'EJB3_TYPPAY3',
        'TJB4_TXAMT', 'TJB4_MSUM', 'TJB4_OCC', 'TJB4_IND', 'AJB4_TXAMT', 'EJB4_TYPPAY3',
        
        'TPTOTINC']

    for col in cols:
        if "JB1" in col:
            for i in range(2, 8):
                cols.append(col.replace("JB1", f"JB{i}"))

    df = pd.read_csv("~/Downloads/pu2022.csv", delimiter="|", usecols=cols)

    df["tip_income"] = df[df.columns[df.columns.str.contains("TXAMT")]].fillna(0).sum(axis=1) * 12
    df["employment_income"] = df.TPTOTINC * 12
    df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12)
    df["is_under_6"] = (df.TAGE < 6) & (df.MONTHCODE == 12)
    df["count_under_18"] = df.groupby("SSUID")["is_under_18"].sum().loc[df.SSUID.values].values
    df["count_under_6"] = df.groupby("SSUID")["is_under_6"].sum().loc[df.SSUID.values].values
    df["household_weight"] = df.WPFINWGT / 12
    df["household_id"] = df.SSUID
    df["age"] = df.TAGE

    sipp = df[["household_id", "employment_income", "tip_income", "count_under_18", "count_under_6", "age", "household_weight"]]


    sipp = sipp[~sipp.isna().any(axis=1)]

    sipp = sipp.loc[np.random.choice(sipp.index, size=len(sipp), replace=True, p=sipp.household_weight/sipp.household_weight.sum())]

    model = QRF()

    model = model.fit(X_train=sipp, predictors=["employment_income", "age", "count_under_18", "count_under_6"], imputed_variables=["tip_income"])

In [None]:



sim = Microsimulation(dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5")

cps = sim.calculate_dataframe([
    "person_id",
    "household_id",
    "employment_income",
    "age",
    "household_weight",
], 2025)

cps["is_under_18"] = cps.age < 18
cps["is_under_6"] = cps.age < 6
cps["count_under_18"] = cps.groupby("household_id")["is_under_18"].sum().loc[cps.household_id.values].values
cps["count_under_6"] = cps.groupby("household_id")["is_under_6"].sum().loc[cps.household_id.values].values
cps = pd.DataFrame(cps)

In [230]:
x = x[x.age >= 17]
x = x[x.employment_income > 0]

x = MicroDataFrame(x, weights="household_weight")
from policyengine_core.charts import format_fig

fig = px.line(
    (x.tip_income > 0).groupby((x.employment_income - x.tip_income).decile_rank()).mean(),
).update_layout(
    title="Share of workers with tip income by employment income (less tip income) decile"
)

format_fig(fig)

In [231]:
from microimpute.comparisons.autoimpute import autoimpute
from microimpute.models import QRF

model = QRF()

model = model.fit(X_train=sipp, predictors=["employment_income", "age", "count_under_18", "count_under_6"], imputed_variables=["tip_income"])

KeyboardInterrupt: 

In [185]:
results = model.predict(X_test=cps, mean_quantile=0.5)[0.5].tip_income.values

In [186]:
from policyengine_core.reforms import Reform
from policyengine_us.model_api import *

class tip_income(Variable):
    entity = Person
    label = "tip income"
    value_type = float
    definition_period = YEAR

class taxable_income(Variable):
    value_type = float
    entity = TaxUnit
    label = "IRS taxable income"
    unit = USD
    definition_period = YEAR

    def formula(tax_unit, period, parameters):
        agi = tax_unit("adjusted_gross_income", period)
        exemptions = tax_unit("exemptions", period)
        deductions = tax_unit("taxable_income_deductions", period) + add(tax_unit, period, ["tip_income"])
        return max_(0, agi - exemptions - deductions)


class no_tax_on_tips(Reform):
    def apply(self):
        self.update_variable(tip_income)
        self.update_variable(taxable_income)

baseline = Microsimulation(dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5")
reformed = Microsimulation(dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", reform=no_tax_on_tips)
reformed.set_input("tip_income", 2025, results)

In [187]:
reformed.calculate("income_tax", 2025).sum()/1e9 - baseline.calculate("income_tax", 2025).sum()/1e9

-8.680291319645221

In [190]:
reformed.calculate("tip_income", 2026).sum()/1e9

68.16808059492298

In [191]:
from policyengine import Simulation

sim = Simulation(
    country="us",
    scope="macro",
    reform=no_tax_on_tips,
    time_period=2025,
    data="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
)
sim.reform_simulation.set_input("tip_income", 2025, results)

Using Hugging Face for download.


In [192]:
result = sim.calculate_economy_comparison()

In [196]:
result.decile.average

{1: 0.00348574407993841,
 2: 18.45026810639836,
 3: 9.328634744125484,
 4: 6.549008282319208,
 5: 57.18697246691306,
 6: 20.656704530053158,
 7: 64.86318000585555,
 8: 65.33898222134735,
 9: 108.83844173569874,
 10: 388.8275585288143}

In [203]:
x = reformed.calculate_dataframe(["tip_income", "employment_income", "age"], 2025)
x = x[x.age >= 17][x.employment_income > 0]
px.bar(
    (x.tip_income > 0).groupby((x.employment_income - x.tip_income).decile_rank()).mean()
)

In [216]:
result.intra_decile.all

{'Lose more than 5%': 0.0,
 'Lose less than 5%': 0.0,
 'No change': 0.9513252863321602,
 'Gain less than 5%': 0.039617839288410686,
 'Gain more than 5%': 0.009056874379429248}

In [215]:

px.bar((np.array(result.intra_decile.deciles["Gain less than 5%"]) + np.array(result.intra_decile.deciles["Gain less than 5%"]))[::2])