In [1]:
from policyengine_us import Microsimulation
import pandas as pd
import microdf as mdf

In [2]:
# Build the baseline microsimulation from the enhanced CPS 2024 dataset file
# referenced by the hf:// URI below. Every later cell reads from `baseline`.
baseline = Microsimulation(
    dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
)

enhanced_cps_2024.h5:   0%|          | 0.00/107M [00:00<?, ?B/s]

In [3]:
# Deduction variables that are summed into `non_salt_deductions` below.
OTHER_DEDS = [
    "charitable_deduction",
    "interest_deduction",
    "medical_expense_deduction",
    "casualty_loss_deduction",
    "misc_deduction",
]

# Pull wages, child count, filing status, weight and each deduction component
# for period 2026, mapped to the tax-unit level.
df = pd.DataFrame(
    baseline.calculate_dataframe(
        [
            "employment_income",
            "tax_unit_children",
            "filing_status",
            "tax_unit_weight",
            *OTHER_DEDS,
        ],
        map_to="tax_unit",
        period=2026,
    )
)

# Row-wise total of the deduction components listed in OTHER_DEDS.
df["non_salt_deductions"] = df[OTHER_DEDS].sum(axis=1)

In [4]:
# Sanity check: sum of the tax-unit weights, expressed in millions.
df.tax_unit_weight.sum() / 1e6

205.288608

In [5]:
# Weighted median of non-SALT deductions across every row of `df`
# (no wage or filing-status filter), using tax_unit_weight as the weights.
other_ded = mdf.MicroSeries(df.non_salt_deductions, weights=df.tax_unit_weight)
other_ded.median()

0.0

In [6]:
def get_median_other_deductions(min_wages, max_wages, filing_status, df_data=None):
    """Weighted median of non-SALT deductions for one wage band and filing status.

    Args:
        min_wages: Lower bound on ``employment_income`` (bounds are inclusive,
            per the default of ``Series.between``).
        max_wages: Upper bound on ``employment_income``.
        filing_status: Value compared for equality against ``filing_status``.
        df_data: Optional DataFrame holding ``employment_income``,
            ``filing_status``, ``non_salt_deductions`` and ``tax_unit_weight``.
            When omitted, the notebook-level ``df`` is used, so existing
            three-argument calls keep working.

    Returns:
        The median reported by ``mdf.MicroSeries(...).median()``, weighted by
        ``tax_unit_weight``; NaN when the selection has zero total weight
        (which includes the case where no rows match).
    """
    if df_data is None:
        # Backward-compatible default: read the frame built earlier in the notebook.
        df_data = df

    in_band = df_data.employment_income.between(min_wages, max_wages)
    filtered = df_data[in_band & (df_data.filing_status == filing_status)]

    # A median over zero total weight is undefined; report NaN instead of
    # handing an empty/weightless series to MicroSeries.
    if filtered.tax_unit_weight.sum() == 0:
        return float("nan")

    return mdf.MicroSeries(filtered.non_salt_deductions, weights=filtered.tax_unit_weight).median()

In [7]:
# Median non-SALT deductions for joint filers with wages of $250,000 +/- 10%.
get_median_other_deductions(225_000, 275_000, "JOINT")

0.0

In [8]:
# Median non-SALT deductions for joint filers with wages of $500,000 +/- 10%.
get_median_other_deductions(450_000, 550_000, "JOINT")

0.0

In [9]:
# Median non-SALT deductions for joint filers with wages of $1,000,000 +/- 10%.
get_median_other_deductions(900_000, 1_100_000, "JOINT")

9007.257646209178

In [10]:
def get_average_other_deductions(min_wages, max_wages, filing_status, df_data=None):
    """Weighted mean of non-SALT deductions for one wage band and filing status.

    Also prints the weighted total, the weighted count and the number of
    matching records so the sample size behind the mean is visible.

    Args:
        min_wages: Lower bound on ``employment_income`` (bounds are inclusive,
            per the default of ``Series.between``).
        max_wages: Upper bound on ``employment_income``.
        filing_status: Value compared for equality against ``filing_status``.
        df_data: Optional DataFrame holding ``employment_income``,
            ``filing_status``, ``non_salt_deductions`` and ``tax_unit_weight``.
            When omitted, the notebook-level ``df`` is used, so existing
            three-argument calls keep working.

    Returns:
        Sum of ``non_salt_deductions * tax_unit_weight`` divided by the sum of
        ``tax_unit_weight`` over the matching rows; NaN when that weight sum is
        zero (which includes the case where no rows match).
    """
    if df_data is None:
        # Backward-compatible default: read the frame built earlier in the notebook.
        df_data = df

    in_band = df_data.employment_income.between(min_wages, max_wages)
    filtered = df_data[in_band & (df_data.filing_status == filing_status)]

    total = (filtered.non_salt_deductions * filtered.tax_unit_weight).sum()
    count = filtered.tax_unit_weight.sum()
    print(f"Total: {total}, Count: {count}, Records: {len(filtered)}")

    # Avoid the 0/0 division (and its RuntimeWarning) when nothing carries weight.
    if count == 0:
        return float("nan")
    return total / count

In [11]:
# Weighted mean non-SALT deductions for joint filers with wages of $250,000 +/- 10%.
get_average_other_deductions(225_000, 275_000, "JOINT")

Total: 9307375616.0, Count: 821040.375, Records: 982


11336.075

In [12]:
# Weighted mean non-SALT deductions for joint filers with wages of $500,000 +/- 10%.
get_average_other_deductions(450_000, 550_000, "JOINT")

Total: 43151908864.0, Count: 695217.75, Records: 469


62069.63

In [13]:
# Weighted mean non-SALT deductions for joint filers with wages of $1,000,000 +/- 10%.
get_average_other_deductions(900_000, 1_100_000, "JOINT")

Total: 7530016768.0, Count: 299692.8125, Records: 193


25125.783

In [14]:
import pandas as pd

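# PREREQUISITES (both must already exist in the notebook namespace before this cell runs):
# 1. `df`: the Pandas DataFrame holding the ECPS data. In this notebook it is built in an
#    earlier cell from `baseline.calculate_dataframe(...)` and given a 'non_salt_deductions'
#    column. An equivalent frame loaded another way also works, for example:
#    # df = pd.read_csv('your_ecps_data.csv')
#
# 2. `mdf`: the module that provides the MicroSeries class. In this notebook it is `microdf`,
#    imported in the first cell (`import microdf as mdf`).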

def get_stats_for_row(min_wages, max_wages, filing_status_val, children_filter_val, df_data, mdf_module):
    """
    Summarise non-SALT deductions for one (wage band, filing status, children) cell.

    Args:
        min_wages (float): Lower bound on 'employment_income' used for filtering.
        max_wages (float): Upper bound on 'employment_income' used for filtering.
        filing_status_val (str/int): Value matched against 'filing_status'.
        children_filter_val (str/int): Value matched against 'tax_unit_children';
                                       the string "All" disables this filter.
        df_data (pd.DataFrame): Input frame. Reads 'employment_income', 'filing_status',
                                'non_salt_deductions', 'tax_unit_weight' and, when a
                                children filter is requested, 'tax_unit_children'.
        mdf_module: Object exposing a MicroSeries class, used for the weighted median.

    Returns:
        tuple: (mean_deductions, median_deductions, total_filers, num_records).
               (0, 0, 0, 0) when no rows match; (nan, nan, nan, 0) when a children
               filter is requested but the children column is missing; mean and
               median are 0 when the matching rows have no positive total weight.
    """
    children_col = 'tax_unit_children'

    # Wage band and filing status are applied together as a single mask.
    in_band = df_data['employment_income'].between(min_wages, max_wages)
    status_match = df_data['filing_status'] == filing_status_val
    subset = df_data[in_band & status_match]

    # Optional children filter; bail out with NaNs if the column is unavailable.
    if children_filter_val != "All":
        if children_col not in subset.columns:
            print(f"Warning: Column '{children_col}' not found. Cannot filter by children for this row.")
            return float('nan'), float('nan'), float('nan'), 0
        subset = subset[subset[children_col] == children_filter_val]

    if subset.empty:
        return 0, 0, 0, 0  # Mean, Median, Filers, Records

    deductions = subset['non_salt_deductions']
    weights = subset['tax_unit_weight']
    total_filers = weights.sum()

    # Both statistics are only meaningful when some weight is present.
    if total_filers > 0:
        mean_deductions = (deductions * weights).sum() / total_filers
        median_deductions = mdf_module.MicroSeries(deductions, weights=weights).median()
    else:
        mean_deductions = 0
        median_deductions = 0

    return mean_deductions, median_deductions, total_filers, len(subset)

def generate_deductions_csv(output_filename="non_salt_deductions_2026_joint_filers.csv"):
    """
    Build the non-SALT deductions summary table and write it to a CSV file.

    Reads the notebook-level names 'df' (Pandas DataFrame) and 'mdf' (module exposing
    MicroSeries) and calls get_stats_for_row once per (wage band, children) combination.
    Problems (missing columns, missing 'mdf', CSV write failure) are reported with
    print() and the function returns None; nothing is raised for them here.

    Args:
        output_filename (str): Path of the CSV file to create.
    """
    # --- User Configuration ---
    # Column names expected in 'df'.
    income_col = 'employment_income'
    status_col = 'filing_status'
    children_col = 'tax_unit_children'
    deductions_col = 'non_salt_deductions'
    weight_col = 'tax_unit_weight'

    # Value in the filing-status column that identifies joint filers.
    joint_status = 'JOINT'
    year = 2026  # Context only; not written to the CSV.
    # --- End User Configuration ---

    # Refuse to run if 'df' lacks any column the statistics depend on.
    required = [income_col, status_col, children_col, deductions_col, weight_col]
    absent = [name for name in required if name not in df.columns]
    if absent:
        print(f"Error: The following required columns are missing from your DataFrame 'df': {', '.join(absent)}")
        print("Please ensure your DataFrame 'df' is loaded correctly and column names match before calling this function.")
        return

    # Likewise require a usable 'mdf' with a MicroSeries attribute.
    if not ('mdf' in globals() and hasattr(mdf, 'MicroSeries')):
        print("Error: The 'mdf' module or 'mdf.MicroSeries' is not available.")
        print("Please ensure your 'mdf' module is correctly imported and accessible.")
        return

    # Each entry is (band centre, fractional half-width of the band).
    band_specs = [
        (250000, 0.10),
        (500000, 0.10),
        (1000000, 0.10),
    ]
    child_filters = ["All", 0, 2]  # "All" means no filter on children

    # Output columns, in CSV order.
    csv_headers = [
        "Wages and salaries (+/- 10%)",
        "Children",
        "Mean Deductions",
        "Median Deductions",
        "Filers (Weighted Count)",
        "Records in ECPS data",
    ]

    rows = []
    for base_wage, pct_range in band_specs:
        lower = base_wage * (1 - pct_range)
        upper = base_wage * (1 + pct_range)
        band_label = f"${base_wage:,.0f}"

        for child_filter in child_filters:
            mean_val, median_val, filers_val, records_val = get_stats_for_row(
                lower, upper, joint_status, child_filter, df, mdf
            )
            cells = [band_label, str(child_filter), mean_val, median_val, filers_val, records_val]
            rows.append(dict(zip(csv_headers, cells)))

    results_df = pd.DataFrame(rows)

    # Pin the column order to the header list (skipped for an empty frame).
    if not results_df.empty:
        results_df = results_df[csv_headers]

    try:
        # Floats are written with two decimal places.
        results_df.to_csv(output_filename, index=False, float_format='%.2f')
        print(f"Successfully generated CSV: {output_filename}")
    except Exception as e:
        print(f"Error writing to CSV file {output_filename}: {e}")

if __name__ == '__main__':
    # --- SCRIPT EXECUTION STARTS HERE ---
    #
    # Prerequisites, to be satisfied before this block runs:
    #   1. A DataFrame named 'df' holding the ECPS data, e.g.
    #        df = pd.read_csv('path_to_your_ecps_data.csv')
    #   2. A module (or object) named 'mdf' that exposes MicroSeries.
    #
    # Only the presence of the two names is checked here; column and attribute
    # checks happen inside generate_deductions_csv.
    if 'df' in globals() and 'mdf' in globals():
        print("DataFrame 'df' and module 'mdf' are assumed to be loaded. Proceeding with CSV generation...")
        generate_deductions_csv(output_filename="non_salt_itemized_deductions_2026_joint_filers.csv")
    else:
        # Report the problem and skip CSV generation.
        print("CRITICAL ERROR: DataFrame 'df' or module 'mdf' is not loaded or defined.")
        print("Please load your data into 'df' and import/define 'mdf' at the beginning of the script execution block.")
        print("Script will not run without 'df' and 'mdf'.")



DataFrame 'df' and module 'mdf' are assumed to be loaded. Proceeding with CSV generation...
Successfully generated CSV: non_salt_itemized_deductions_2026_joint_filers.csv


In [16]:
import pandas as pd

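# PREREQUISITES (both must already exist in the notebook namespace before this cell runs):
# 1. `df`: the Pandas DataFrame holding the ECPS data. In this notebook it is built in an
#    earlier cell from `baseline.calculate_dataframe(...)` and given a 'non_salt_deductions'
#    column. An equivalent frame loaded another way also works, for example:
#    # df = pd.read_csv('your_ecps_data.csv')
#
# 2. `mdf`: the module that provides the MicroSeries class. In this notebook it is `microdf`,
#    imported in the first cell (`import microdf as mdf`).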


def get_stats_for_row(min_wages, max_wages, filing_status_val, children_filter_val, df_data, mdf_module):
    """
    Summarise non-SALT deductions for one (wage band, filing status, children) cell,
    including the 90th percentile.

    Args:
        min_wages (float): Lower bound on 'employment_income' used for filtering.
        max_wages (float): Upper bound on 'employment_income' used for filtering.
        filing_status_val (str | int): Value matched against 'filing_status'.
        children_filter_val (str | int): Value matched against 'tax_unit_children';
            the string "All" disables this filter.
        df_data (pd.DataFrame): Input frame. Reads 'employment_income', 'filing_status',
            'non_salt_deductions', 'tax_unit_weight' and, when a children filter is
            requested, 'tax_unit_children'.
        mdf_module: Object exposing a MicroSeries class for weighted statistics.

    Returns:
        tuple: (mean, median, p90, total_filers, num_records). All five are 0 when no
        rows match; mean, median and p90 are 0 when the matching rows have no positive
        total weight.
    """
    # Income & filing-status filter
    in_band = df_data['employment_income'].between(min_wages, max_wages)
    status_match = df_data['filing_status'] == filing_status_val
    subset = df_data.loc[in_band & status_match]

    # Children filter (if requested)
    if children_filter_val != "All":
        subset = subset.loc[subset['tax_unit_children'] == children_filter_val]

    if subset.empty:
        return 0, 0, 0, 0, 0

    deductions = subset['non_salt_deductions']
    weights = subset['tax_unit_weight']
    total_filers = weights.sum()
    num_records = len(subset)

    # Without positive weight none of the weighted statistics are computed.
    if not total_filers > 0:
        return 0, 0, 0, total_filers, num_records

    mean_val = (deductions * weights).sum() / total_filers

    ms = mdf_module.MicroSeries(deductions, weights=weights)
    median_val = ms.median()

    if hasattr(ms, 'quantile'):
        p90_val = ms.quantile(0.9)
    else:
        # Fallback: smallest deduction whose cumulative weight reaches 90% of the total.
        ordered = subset[['non_salt_deductions', 'tax_unit_weight']].sort_values('non_salt_deductions')
        reached = ordered['tax_unit_weight'].cumsum() >= 0.9 * total_filers
        p90_val = ordered.loc[reached, 'non_salt_deductions'].iloc[0]

    return mean_val, median_val, p90_val, total_filers, num_records


def generate_deductions_csv(output_filename="non_salt_deductions_2026_joint_filers.csv"):
    """
    Write a CSV of non-SALT itemised deduction statistics (mean, median, 90th percentile,
    weighted filer count, record count) for each wage band and child category.

    Reads the notebook-level names 'df' (DataFrame) and 'mdf' (module with MicroSeries)
    and calls get_stats_for_row once per (wage band, children) combination.

    Args:
        output_filename (str): Path of the CSV file to create.

    Raises:
        ValueError: If 'df' lacks any required column.
        ImportError: If 'mdf' is not defined or has no MicroSeries attribute.
    """
    joint_status = 'JOINT'  # Adjust if different in your data

    # Columns the statistics depend on.
    needed = [
        'employment_income',
        'filing_status',
        'tax_unit_children',
        'non_salt_deductions',
        'tax_unit_weight',
    ]
    absent = [name for name in needed if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required columns in df: {', '.join(absent)}")

    if not ('mdf' in globals() and hasattr(mdf, 'MicroSeries')):
        raise ImportError("'mdf.MicroSeries' is not available. Ensure the module is imported.")

    # Each entry is (band centre, fractional half-width): ±10 % around 250 k, 500 k, 1 M.
    band_specs = [
        (250000, 0.10),
        (500000, 0.10),
        (1000000, 0.10),
    ]
    child_filters = ["All", 0, 2]

    # Output columns, in CSV order.
    headers = [
        "Wages and salaries (+/- 10%)",
        "Children",
        "Mean Deductions",
        "Median Deductions",
        "90th Percentile Deductions",
        "Filers (Weighted Count)",
        "Records in ECPS data",
    ]

    rows = []
    for base_w, pct in band_specs:
        lower = base_w * (1 - pct)
        upper = base_w * (1 + pct)
        band_label = f"${base_w:,.0f}"

        for child_filter in child_filters:
            mean_v, med_v, p90_v, filers_v, rec_v = get_stats_for_row(
                lower, upper, joint_status, child_filter, df, mdf
            )
            cells = [band_label, str(child_filter), mean_v, med_v, p90_v, filers_v, rec_v]
            rows.append(dict(zip(headers, cells)))

    # Column order is pinned to `headers`; floats are written with two decimals.
    table = pd.DataFrame(rows)[headers]
    table.to_csv(output_filename, index=False, float_format='%.2f')
    print(f"CSV written to {output_filename}")


if __name__ == "__main__":
    # Both 'df' and 'mdf' must already be defined in this namespace; fail fast otherwise.
    if not ('df' in globals() and 'mdf' in globals()):
        raise RuntimeError("Load DataFrame 'df' and module 'mdf' before running.")

    # Writes the CSV under the function's default file name.
    generate_deductions_csv()


CSV written to non_salt_deductions_2026_joint_filers.csv
