### Example Implementation
This file goes through every step of the process, using features available to HotDeckImputer.

In [2]:
from hot_deck_imputer import HotDeckImputer
import polars as pl

#### Generate Data

In [3]:
donor_data = {
    'donor_assets': [50000, 20000, 300000, 2000, 
                     10000, 10000, 200, 2000, 4000, 500000],
    'race_cell': ['Black','Black','Black','White','White',
                     'White','Black','White','Black','Black'],
    'sex_cell': ['M','F','F','M','F',
                     'M','F','F','M','F'],
    'work_cell': [1,0,1,0,1,
                     0,1,1,1,0],
    'weight': [1, 2, 1, 2, 1,
               2, 1, 2, 1, 2]
}

donor_data = pl.DataFrame(donor_data)

recipient_data = {
    'race_cell': ['Black','Black','Black','White','White',
                     'White','Black','White','Black','Black','Black','Black','White','White'],
    'sex_cell': ['M','F','F','M','F',
                     'M','F','F','M','F', 'F', 'M', 'M', 'F'],
    'work_cell': [1,0,1,0,1,
                     0,1,1,1,0,0,1,0,1],
    'weight': [1, 3, 2, 3, 2,
               1, 4, 2, 1, 3, 4, 2, 1, 1]
}

recipient_data = pl.DataFrame(recipient_data)

#### Create Imputer + Define Cells

In [4]:
imputer = HotDeckImputer(donor_data = donor_data, 
                         imputation_var = 'donor_assets', 
                         weight_var = 'weight', 
                         recipient_data = recipient_data)

In [20]:
# If you are imputing from one dataset to another across years, this will adjust the donor data to the recipient year
imputer.age_dollar_amounts(donor_year_cpi = 223.1, imp_year_cpi = 322.1)

Summary of donor_assets pre CPI aging:
{'mean': 129677.37337516811, 'median': 14437.471985656657, 'min': 288.74943971313314, 'max': 721873.5992828329, 'std_dev': 246542.2179142546, 'count': 10, 'missing_values': 0}
Summary of donor_assets post CPI aging:
{'mean': 187221.3445277528, 'median': 20844.059733662078, 'min': 416.88119467324157, 'max': 1042202.9866831041, 'std_dev': 355944.63644187094, 'count': 10, 'missing_values': 0}


In [6]:
# The variables create cells held in imputer.donor_cells, a dictionary.
# These cells divide the data according to every unique combination of the variables. 
# Key: combination of variables defining cell, Value: pl.DataFrame of data in that cell
variables = ['race_cell','sex_cell']

imputer.define_cells(variables)

# Take a look at cell definitions
imputer.cell_definitions

["race_cell == 'Black' & sex_cell == 'M'",
 "race_cell == 'Black' & sex_cell == 'F'",
 "race_cell == 'White' & sex_cell == 'M'",
 "race_cell == 'White' & sex_cell == 'F'"]

In [7]:
# Create the cells after defining them, assuming above looks right
imputer.generate_cells()

[(col("race_cell")) == (String(Black))]
[(col("sex_cell")) == (String(M))]
[(col("race_cell")) == (String(Black))]
[(col("sex_cell")) == (String(F))]
[(col("race_cell")) == (String(White))]
[(col("sex_cell")) == (String(M))]
[(col("race_cell")) == (String(White))]
[(col("sex_cell")) == (String(F))]


In [9]:
# Take a look at the recipient cells
imputer.recipient_cells

{"race_cell == 'Black' & sex_cell == 'M'": shape: (3, 4)
 ┌───────────┬──────────┬───────────┬────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════╪══════════╪═══════════╪════════╡
 │ Black     ┆ M        ┆ 1         ┆ 1      │
 │ Black     ┆ M        ┆ 1         ┆ 1      │
 │ Black     ┆ M        ┆ 1         ┆ 2      │
 └───────────┴──────────┴───────────┴────────┘,
 "race_cell == 'Black' & sex_cell == 'F'": shape: (5, 4)
 ┌───────────┬──────────┬───────────┬────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════╪══════════╪═══════════╪════════╡
 │ Black     ┆ F        ┆ 0         ┆ 3      │
 │ Black     ┆ F        ┆ 1         ┆ 2      │
 │ Black     ┆ F        ┆ 1         ┆ 4      │
 │ Black     ┆ F        ┆ 0         ┆ 3      │
 │ Black     ┆ F        ┆ 0         ┆ 4

In [10]:
# Take a look at donor cells
imputer.donor_cells

{"race_cell == 'Black' & sex_cell == 'M'": shape: (2, 5)
 ┌──────────────┬───────────┬──────────┬───────────┬────────┐
 │ donor_assets ┆ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---          ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ f64          ┆ str       ┆ str      ┆ i64       ┆ i64    │
 ╞══════════════╪═══════════╪══════════╪═══════════╪════════╡
 │ 72187.359928 ┆ Black     ┆ M        ┆ 1         ┆ 1      │
 │ 5774.988794  ┆ Black     ┆ M        ┆ 1         ┆ 1      │
 └──────────────┴───────────┴──────────┴───────────┴────────┘,
 "race_cell == 'Black' & sex_cell == 'F'": shape: (4, 5)
 ┌───────────────┬───────────┬──────────┬───────────┬────────┐
 │ donor_assets  ┆ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---           ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ f64           ┆ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════════╪═══════════╪══════════╪═══════════╪════════╡
 │ 28874.943971  ┆ Black     ┆ F        ┆ 0         ┆ 2      │
 │ 433124.1

In [21]:
# After deciding that you want to split on work_cell for Black Women, you can do the following
imputer.split_cell("race_cell == 'Black' & sex_cell == 'F'", "work_cell")


KeyError: "race_cell == 'Black' & sex_cell == 'F'"

In [12]:
# Cell definitions have changed, as have donor cells
imputer.cell_definitions

["race_cell == 'Black' & sex_cell == 'M'",
 "race_cell == 'White' & sex_cell == 'M'",
 "race_cell == 'White' & sex_cell == 'F'",
 "race_cell == 'Black' & sex_cell == 'F' & work_cell == 0",
 "race_cell == 'Black' & sex_cell == 'F' & work_cell == 1"]

In [None]:
imputer.donor_cells[]

In [13]:
imputer.impute()

In [14]:
imputer.recipient_cells

{"race_cell == 'Black' & sex_cell == 'M'": shape: (3, 5)
 ┌───────────┬──────────┬───────────┬────────┬──────────────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight ┆ imp_donor_assets │
 │ ---       ┆ ---      ┆ ---       ┆ ---    ┆ ---              │
 │ str       ┆ str      ┆ i64       ┆ i64    ┆ f64              │
 ╞═══════════╪══════════╪═══════════╪════════╪══════════════════╡
 │ Black     ┆ M        ┆ 1         ┆ 1      ┆ 72187.359928     │
 │ Black     ┆ M        ┆ 1         ┆ 1      ┆ 72187.359928     │
 │ Black     ┆ M        ┆ 1         ┆ 2      ┆ 72187.359928     │
 └───────────┴──────────┴───────────┴────────┴──────────────────┘,
 "race_cell == 'White' & sex_cell == 'M'": shape: (3, 5)
 ┌───────────┬──────────┬───────────┬────────┬──────────────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight ┆ imp_donor_assets │
 │ ---       ┆ ---      ┆ ---       ┆ ---    ┆ ---              │
 │ str       ┆ str      ┆ i64       ┆ i64    ┆ f64              │
 ╞═══════════╪══════════╪══

In [15]:
imputer.recipient_data

race_cell,sex_cell,work_cell,weight,imp_donor_assets
str,str,i64,i64,f64
"""Black""","""M""",1,1,72187.359928
"""Black""","""M""",1,1,72187.359928
"""Black""","""M""",1,2,72187.359928
"""White""","""M""",0,3,2887.494397
"""White""","""M""",0,1,2887.494397
…,…,…,…,…
"""Black""","""F""",0,3,721873.599283
"""Black""","""F""",0,3,721873.599283
"""Black""","""F""",0,4,721873.599283
"""Black""","""F""",1,2,433124.15957


In [16]:
imputer.apply_random_noise(variation_stdev = (1/6), floor_noise = 1.5)

In [17]:
imputer.random_noise

0.16666666666666666

In [18]:
imputer.gen_analysis_file('hot_deck_stats', '')

Cell data written to '.\hot_deck_stats.xlsx'.


In [19]:
imputer.gen_analysis_file('hot_deck_stats')

Cell data written to '.\hot_deck_stats.xlsx'.
