# Imputation

This page shows how, when microdata variables are present in one dataset but not in another, we impute their values across datasets to obtain a single dataset for microsimulation that contains all the necessary variables. In particular, we explore the imputation of the wealth variable "net_worth" from the Survey of Consumer Finances (SCF) onto the Current Population Survey (CPS).

In [1]:
from policyengine_us_data.datasets.cps.cps import CPS_2022
from policyengine_us_data.datasets.scf.scf import SCF_2022
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Instantiate each dataset class and load its contents, SCF first and then CPS.
# NOTE(review): `require=True` presumably makes sure the underlying file is
# available (building or downloading it if needed) — confirm against the
# dataset class.
scf = SCF_2022(require=True)
scf_data = scf.load_dataset()
cps = CPS_2022(require=True)
cps_data = cps.load_dataset()

# The loaded datasets are indexed by variable name. Note that the two surveys
# spell the net-worth variable differently: "networth" (SCF) vs. "net_worth"
# (CPS).
scf_networth_raw = scf_data["networth"]
cps_networth_raw = cps_data["net_worth"]

def safe_log10(x: np.ndarray) -> np.ndarray:
    """Return the signed base-10 logarithm ``sign(x) * log10(max(|x|, 1))``.

    Positive inputs map to ``+log10(x)``, negative inputs to ``-log10(|x|)``
    and zero maps to zero, so the result keeps the sign and ordering of the
    input and can be used as a symmetric log axis.

    Magnitudes are floored at 1 before taking the logarithm. With a smaller
    floor, any value with 0 < |x| < 1 has a negative log10, and multiplying
    by the sign would flip it to the wrong side of zero (a tiny positive
    amount would be plotted as a large negative one).

    Args:
        x: Scalar or array of values.

    Returns:
        The signed log10 of ``x``, with the same shape as ``x``.
    """
    sign = np.sign(x)
    # Floor at 1 so that log10 is never negative and the sign is preserved.
    mags = np.maximum(np.abs(x), 1.0)
    return sign * np.log10(mags)

# Put both net-worth series on the signed log10 scale used for the x-axis.
scf_net_log = safe_log10(scf_networth_raw)
cps_net_log = safe_log10(cps_networth_raw)

num_bins = 150
# Bin edges on the signed-log axis: -6 stands for -$10⁶ and 10 for +$10¹⁰.
# np.histogram does not count values that fall outside the outermost edges,
# so the percentages below are shares of the in-range observations.
bin_edges = np.linspace(-6, 10, num_bins + 1)

scf_counts, _ = np.histogram(scf_net_log, bins=bin_edges)
scf_perc = scf_counts / scf_counts.sum() * 100

cps_counts, _ = np.histogram(cps_net_log, bins=bin_edges)
cps_perc = cps_counts / cps_counts.sum() * 100

# Take the median and mean of the dollar amounts first and only then move
# them onto the log axis. Averaging the log values instead would make
# 10**mean a geometric-style mean, which must not be labelled as "Mean: $…".
scf_median = safe_log10(np.median(scf_networth_raw))
scf_mean = safe_log10(np.mean(scf_networth_raw))

cps_median = safe_log10(np.median(cps_networth_raw))
cps_mean = safe_log10(np.mean(cps_networth_raw))

# Geometry shared by both histograms: bar centres and a common bar width.
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
bin_width = bin_edges[1] - bin_edges[0]

SCF_COLOR = "#1f77b4"  # blue
CPS_COLOR = "#9467bd"  # purple

fig = go.Figure()

# Overlaid histograms: SCF first, CPS second.
for share, legend_name, color, hover in (
    (
        scf_perc,
        "SCF_2022 Networth",
        SCF_COLOR,
        "%{y:.2f}%<br>log₁₀(networth)=%{x:.2f}<extra></extra>",
    ),
    (
        cps_perc,
        "CPS_2022 Net Worth",
        CPS_COLOR,
        "%{y:.2f}%<br>log₁₀(net_worth)=%{x:.2f}<extra></extra>",
    ),
):
    fig.add_trace(
        go.Bar(
            x=bin_centers,
            y=share,
            width=bin_width,
            name=legend_name,
            marker_color=color,
            opacity=0.7,
            hovertemplate=hover,
        )
    )

# Vertical reference lines reach 5% above the tallest bar.
max_pct = max(scf_perc.max(), cps_perc.max()) * 1.05

# One vertical line per statistic; the legend entry converts the log-axis
# position back to dollars with 10**position.
for position, label, color, dash in (
    (scf_median, "SCF Median", SCF_COLOR, "dash"),
    (scf_mean, "SCF Mean", SCF_COLOR, "dot"),
    (cps_median, "CPS Median", CPS_COLOR, "dash"),
    (cps_mean, "CPS Mean", CPS_COLOR, "dot"),
):
    fig.add_trace(
        go.Scatter(
            x=[position, position],
            y=[0, max_pct],
            mode="lines",
            line={"color": color, "width": 2, "dash": dash},
            name=f"{label}: ${10**position:,.0f}",
            hoverinfo="skip",
        )
    )

layout_options = {
    "title": "SCF_2022 vs. CPS_2022 Net‐Worth Distributions (Log₁₀ Scale)",
    "xaxis_title": "Net Worth (log₁₀ scale)",
    "yaxis_title": "Percentage of sample (%)",
    "barmode": "overlay",
    "bargap": 0.1,
    "width": 900,
    "height": 600,
    "paper_bgcolor": "#F0F0F0",
    "plot_bgcolor": "#F0F0F0",
    "font": {
        "family": "Open Sans, Verdana, Arial, sans-serif",
        "size": 12,
    },
    # Legend sits inside the plot area, pinned to the top-left corner.
    "legend": {
        "x": 0.01,
        "y": 0.99,
        "bgcolor": "rgba(255,255,255,0.8)",
        "bordercolor": "rgba(0,0,0,0.3)",
        "borderwidth": 1,
        "xanchor": "left",
        "yanchor": "top",
    },
}
fig.update_layout(**layout_options)

# Label every second power of ten in dollars; negative axis positions are
# rendered as negative dollar amounts of the same magnitude.
tick_values = list(range(-6, 11, 2))
tick_labels = [
    f"${10**v:,.0f}" if v >= 0 else f"-${10**abs(v):,.0f}"
    for v in tick_values
]

fig.update_xaxes(tickvals=tick_values, ticktext=tick_labels, showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

INFO:root:Adding ID variables
INFO:root:Adding personal variables
INFO:root:Adding personal income variables
INFO:root:Adding previous year income variables
INFO:root:Adding SSN card type
INFO:root:Adding family variables
INFO:root:Adding household variables
INFO:root:Adding rent


Training imputation model for rent and real estate taxes.
Imputing rent and real estate taxes.
Imputation complete.


INFO:root:Adding auto loan interest
INFO:rpy2.situation:cffi mode is CFFI_MODE.ANY
INFO:rpy2.situation:R home found: /opt/homebrew/Cellar/r/4.5.0/lib/R
INFO:rpy2.situation:R library path: 
INFO:rpy2.situation:LD_LIBRARY_PATH: 
INFO:rpy2.rinterface_lib.embedded:Default options to initialize R: rpy2, --quiet, --no-save
INFO:rpy2.rinterface_lib.embedded:R is already initialized. No need to initialize.
INFO:root:Adding tips
INFO:root:Adding wealth
INFO:root:Added all variables
INFO:root:Adding takeup
INFO:root:Downsampling


In [2]:
scf_net = np.asarray(scf_data["networth"])
cps_net = np.asarray(cps_data["net_worth"])

# 21 quantile edges of the SCF distribution delimit 20 bands of 5% each.
pct_edges = np.quantile(scf_net, q=np.linspace(0, 1, 21))
labels = list(range(1, 21))  # 1…20


def _assign_band(values, edges):
    """Return the 1-based band index of each value for the given edges.

    Bands are right-closed, ``(edges[i-1], edges[i]]``, and the first band
    also contains ``edges[0]``. Values outside the outermost edges are
    assigned to the nearest end band instead of becoming missing, so the
    result is always an integer array. Repeated edges simply leave a band
    empty rather than raising.

    Args:
        values: Array of values to classify.
        edges: Sorted array of band edges (n + 1 edges for n bands).

    Returns:
        Integer array of band indices in ``1 … len(edges) - 1``.
    """
    # side="left" returns j such that edges[j-1] < v <= edges[j].
    band = np.searchsorted(edges, values, side="left")
    # j == 0 means v <= edges[0]; j == len(edges) means v > edges[-1].
    return np.clip(band, 1, len(edges) - 1)


# Both surveys are classified with the SCF-derived edges so that the same
# band number refers to the same dollar range in each of them.
scf_percentile = _assign_band(scf_net, pct_edges)
cps_percentile = _assign_band(cps_net, pct_edges)

def safe_signed_log(x):
    """Signed log10 of a scalar.

    Returns ``log10(x)`` for positive ``x``, ``-log10(|x|)`` for negative
    ``x``, and ``0.0`` when ``x`` is zero or missing (NaN/None).
    """
    missing_or_zero = pd.isna(x) or x == 0
    if not missing_or_zero:
        direction = np.sign(x)
        return direction * np.log10(abs(x))
    return 0.0

def _summarise_band(band):
    """Build the summary row for one band.

    Computes the mean net worth of the SCF and CPS records assigned to the
    band (NaN when a survey has no records there), their signed and absolute
    difference, and the signed-log version of each of the four numbers.
    """
    in_scf = scf_net[scf_percentile == band]
    in_cps = cps_net[cps_percentile == band]

    mean_scf = np.mean(in_scf) if len(in_scf) > 0 else np.nan
    mean_cps = np.mean(in_cps) if len(in_cps) > 0 else np.nan

    gap = mean_cps - mean_scf
    gap_size = np.abs(gap)

    return {
        "pctile": band,
        "scf_mean": mean_scf,
        "cps_mean": mean_cps,
        "signed_diff": gap,
        "abs_diff": gap_size,
        "scf_mean_log": safe_signed_log(mean_scf),
        "cps_mean_log": safe_signed_log(mean_cps),
        "abs_diff_log": safe_signed_log(gap_size),
        "signed_diff_log": safe_signed_log(gap),
    }


rows = [_summarise_band(band) for band in labels]
summary_df = pd.DataFrame(rows)

# Human-readable dollar range of each band, built from consecutive edges.
band_labels = [
    f"${lower:,.0f} to ${upper:,.0f}"
    for lower, upper in zip(pct_edges[:-1], pct_edges[1:])
]

# The y-axis ticks must span every logged quantity that is plotted.
log_columns = ("scf_mean_log", "cps_mean_log", "abs_diff_log", "signed_diff_log")
all_logs = np.concatenate([summary_df[name].values for name in log_columns])
vmin = np.nanmin(all_logs)
vmax = np.nanmax(all_logs)

# One tick per whole power of ten, rounded outwards to cover [vmin, vmax].
tick_min = int(np.floor(vmin))
tick_max = int(np.ceil(vmax))
tickvals = list(range(tick_min, tick_max + 1))

def format_tick_label(k):
    """Turn a signed log10 exponent into a dollar tick label.

    Zero gives "$1". A positive k gives "$" followed by 10**k with thousands
    separators. A negative k gives the same label for 10**|k| with a leading
    minus sign.
    """
    if k == 0:
        return "$1"
    dollars = f"${10 ** abs(k):,.0f}"
    if k < 0:
        return "-" + dollars
    return dollars

# Dollar labels for the y-axis ticks computed above.
ticktext = list(map(format_tick_label, tickvals))

bar_width = 0.08
x_vals = summary_df["pctile"].astype(str)

fig = go.Figure()

# Four bars per band, placed side by side by hand: each spec gives the
# summary column, the legend name, the colour and the bar's offset expressed
# in bar widths.
for column, legend_name, color, slot in (
    ("scf_mean_log", "SCF 2022 (log₁₀ mean)", "#1f77b4", -1.5),
    ("cps_mean_log", "CPS 2022 (log₁₀ mean)", "#ff7f0e", -0.5),
    ("abs_diff_log", "|Δ| (log₁₀ abs diff)", "#2ca02c", 0.5),
    ("signed_diff_log", "Δ (CPS − SCF; signed log₁₀)", "#d62728", 1.5),
):
    fig.add_trace(
        go.Bar(
            x=x_vals,
            y=summary_df[column],
            name=legend_name,
            marker_color=color,
            width=bar_width,
            offset=slot * bar_width,
        )
    )

layout_options = {
    "title": "SCF 2022 vs CPS 2022 Net Worth Means",
    "barmode": "overlay",
    # Band numbers on the x-axis are displayed as their dollar ranges.
    "xaxis": {
        "title": "SCF net worth bands ($)",
        "tickmode": "array",
        "tickvals": list(map(str, labels)),
        "ticktext": band_labels,
        "tickangle": -45,
        "tickfont": {"size": 10},
        "showgrid": False,
    },
    "yaxis": {
        "title": "Value (signed log₁₀ $)",
        "tickmode": "array",
        "tickvals": tickvals,
        "ticktext": ticktext,
        "showgrid": True,
        "gridcolor": "rgba(0,0,0,0.05)",
    },
    # Legend is placed just outside the right edge of the plot area.
    "legend": {
        "x": 1.01,
        "y": 0.6,
        "bgcolor": "rgba(255,255,255,0.8)",
        "bordercolor": "rgba(0,0,0,0.3)",
        "borderwidth": 1,
        "xanchor": "left",
        "yanchor": "top",
    },
    "width": 1000,
    "height": 600,
    "paper_bgcolor": "#F0F0F0",
    "plot_bgcolor": "#F0F0F0",
    "font": {"family": "Open Sans, Verdana, Arial, sans-serif", "size": 12},
    "margin": {"l": 40, "r": 40, "t": 80, "b": 80},
}
fig.update_layout(**layout_options)

fig.show()

In [3]:
from policyengine_us import Microsimulation
# Build a microsimulation on top of the CPS_2022 dataset class.
sim = Microsimulation(dataset=CPS_2022)
# Seed NumPy's global random number generator.
# NOTE(review): the seed is set after the simulation is constructed;
# presumably the random draws it is meant to pin happen inside calculate() —
# confirm.
np.random.seed(42)  # For reproducibility
# Sum "net_worth" over the simulation and scale by 1e12. This bare
# expression is the last statement of the cell, so its value is what the
# notebook displays as the cell output.
sim.calculate("net_worth").sum() / 1e12  # Total net worth in trillions of dollars

183.85709905522282