In [808]:
import polars as pl

In [809]:
"""
Class defining DYNASIM-FEH file reader.

DYNASIM FEH produces three files:

    - header file,
    - family file, and
    - person file.


read_feh and save_feh modules define functionality for accessing, processing, and writing files.
This module defines classes that can be used to edit those functionalities.
"""
import numpy as np
import polars as pl
import itertools
from statsmodels.stats.weightstats import DescrStatsW
from xlsxwriter import Workbook
import os

class HotDeckImputer:
    def __init__(self, donor_data:pl.DataFrame, 
                 imputation_var:str, weight_var:str,
                 recipient_data:pl.DataFrame):
        """
        Initialize with the dataset. Donor data is the source for the hot deck.
        Recipient data is the dataset that will receive the imputation.
        """
        self.donor_data = donor_data.clone()
        self.imputation_var = imputation_var
        self.weight_var = weight_var
        self.recipient_data = recipient_data.clone()

        # Cell definition attributes to be defined in methods
        self.cell_definitions = None
        self.donor_cells = None
        self.recipient_cells = None

        # Random noise attributes defined in methods
        self.random_noise = None

        # Validate input data
        self._validate_data()

    def _validate_data(self):
            """
            Validate the input data and parameters.
            """
            # Check for non-empty DataFrames
            if self.donor_data.is_empty():
                raise ValueError("Donor data is empty")
            if self.recipient_data.is_empty():
                raise ValueError("Recipient data is empty")
            
            # Check for imputation variable's presence on both sides
            if self.imputation_var not in self.donor_data.columns:
                raise ValueError(f"Column '{self.imputation_var}' is missing from donor data")
            if self.imputation_var in self.recipient_data.columns:
                raise ValueError(f"Column '{self.imputation_var}' is already in recipient data, does not need to be imputed")
            
            # Check for weight variables + missingness if weighted imputation is requiured
            if self.weight_var is not None:
                if self.weight_var not in self.donor_data.columns:
                    raise ValueError(f"Column '{self.weight_var}' is missing from donor data")
                if self.weight_var not in self.recipient_data.columns:
                    raise ValueError(f"Column '{self.weight_var}' is missing from recipient data")

                # Check for missing values in required columns
                if self.donor_data[self.weight_var].null_count() > 0:
                    raise ValueError(f"Column '{self.weight_var}' in donor data contains {self.donor_data[self.weight_var].null_count()} missing values")
                if self.recipient_data[self.weight_var].null_count() > 0:
                    raise ValueError(f"Column '{self.weight_var}' in recipient data contains {self.recipient_data[self.weight_var].null_count()} missing values")
                
            return
            
    def _parse_condition(self, condition):
        """
        Parse a condition string and return a Polars expression.

        Args:
            condition (str): The condition string to parse.

        Returns:
            pl.Expr: The Polars expression.
        """
        # Remove outer parentheses
        condition = condition.strip("()")
        
        # Split the condition into individual criteria
        criteria = condition.split(" & ")
        
        # Initialize combined expression with a default "true" condition
        combined_expression = pl.lit(True)
        
        # Parse each criterion and combine them using logical AND
        for criterion in criteria:
            # Remove any extra parentheses and spaces
            criterion = criterion.strip("()").strip()
            column, value = criterion.split("==")
            column = column.strip()
            value = value.strip().strip("'")
            # Detect the type of the value
            if value.isdigit():
                value = int(value)
            elif value.replace('.', '', 1).isdigit():
                value = float(value)
            expr = pl.col(column) == value
            # Combine the expressions
            print(expr)
            combined_expression &= expr
        
        return combined_expression

    def generate_cells(self):
        """
        Method to generate cells based on cell definitions.
        It splits the data according to the conditions provided in the cell_definitions.
        """
        if not self.cell_definitions:
            raise ValueError("Cell definitions are not provided")

        # Create empty dictionary to store the partitions
        donor_cells = {}
        recipient_cells = {}
        
        for i, condition in enumerate(self.cell_definitions):
            # Create cell based on condition
            filter_expr = self._parse_condition(condition)
            # Filter the donor and recipient data based on the condition
            donor_cells[f'{condition}'] = self.donor_data.filter(filter_expr)
            recipient_cells[f'{condition}'] = self.recipient_data.filter(filter_expr)

        self.donor_cells = donor_cells
        self.recipient_cells = recipient_cells
        return
    
    def _check_variable_consistency(self, variables):
        """
        Non-callable method to check if the unique values and types of the variables
        are the same in donor and recipient datasets.
        :param variables: List of variables to check
        :raises TypeError: If data types do not match between donor and recipient
        :raises ValueError: If unique values do not match between donor and recipient
        """
        for var in variables:
            donor_unique = self.donor_data[var].unique()
            recipient_unique = self.recipient_data[var].unique()

            # Check if the types match
            if self.donor_data[var].dtype != self.recipient_data[var].dtype:
                raise TypeError(f"Data types for variable '{var}' do not match between donor and recipient datasets.")
            
            # Check if the unique values match
            if set(donor_unique) != set(recipient_unique):
                raise ValueError(f"Unique values for variable '{var}' do not match between donor and recipient datasets.")

    def define_cells(self, variables):
        """
        Method to define all possible cell definitions given a list of input variables.
        :param variables: A list of column names (variables) from the data to partition by.
        For example: ['homeowner_hh_flag', 'member_over_60']
        :return: A list of strings representing all possible conditions
        """
        # First, check if the variables are consistent across donor and recipient datasets
        self._check_variable_consistency(variables)

        # Extract unique values from the donor data for each variable
        var_values = {var: self.donor_data[var].unique() for var in variables}

        # Generate all possible combinations of variable values
        var_combinations = list(itertools.product(*var_values.values()))

        # Create the condition strings
        cell_definitions = []
        for combination in var_combinations:
            conditions = [
            f"{variables[i]} == '{combination[i]}'" if isinstance(combination[i], str) else f"{variables[i]} == {combination[i]}"
            for i in range(len(combination))
            ]
            cell_definitions.append(' & '.join(conditions))

        self.cell_definitions = cell_definitions
        return 

    def split_cell(self, cell_condition, split_column):
        """
        Method to split an individual cell further based on a new condition.
        :param cell_condition: A condition string representing the cell to be split.
        :param split_column: The column to check for unique values to split the cell.
        :return: None
        """
        # Get the data for the cell that is going to be split
        split_donor = self.donor_cells[cell_condition]
        split_recipient = self.recipient_cells[cell_condition]
        
        # Get unique values in the split column
        unique_values = split_donor.select(split_column).unique().to_series().to_list()
        
        # Remove the original cell from the donor and recipient cell dictionaries
        del self.donor_cells[cell_condition]
        del self.recipient_cells[cell_condition]

        # Split the cell based on unique values in the split column
        for value in unique_values:
            split_condition = f"{split_column} == {value}"
            combined_condition = f"{cell_condition} & {split_condition}"
            split_expr = self._parse_condition(combined_condition)
            
            # Add the newly split cells into the dictionaries
            self.donor_cells[combined_condition] = split_donor.filter(split_expr)
            self.recipient_cells[combined_condition] = split_recipient.filter(split_expr)
            
            # Update cell definitions
            self.cell_definitions.append(combined_condition)
        
        # Remove the original cell condition from cell definitions
        self.cell_definitions.remove(cell_condition)
        return
    
    def summarize_cells(self):
        results = {}
        for i, recipient_cell in self.recipient_cells.items():

            # Donor stat generation
            donor_cell = self.donor_cells.get(i)
            source_var = donor_cell[self.imputation_var]

            if self.weight_var in donor_cell.columns:
                donor_stats = DescrStatsW(source_var, weights=donor_cell[self.weight_var], ddof=0)
            else:
                donor_stats = DescrStatsW(source_var, ddof=0)

            # Recipient stat generation
            source_var = recipient_cell[f'imp_{self.imputation_var}']
            if self.weight_var in recipient_cell.columns:
                recipient_stats = DescrStatsW(source_var, weights=recipient_cell[self.weight_var], ddof=0)
            else:
                recipient_stats = DescrStatsW(source_var, ddof=0)

            data = {
                'statistic': [
                    '95int_low', 'mean', '95int_high', 'stddev', 'var', 'stderr', 'sum', 'obs'
                ],
                'donor': [
                    donor_stats.mean - 1.96 * donor_stats.std_mean,  # 95% CI low for donor
                    donor_stats.mean,                                # Mean for donor
                    donor_stats.mean + 1.96 * donor_stats.std_mean,  # 95% CI high for donor
                    donor_stats.std,                                 # Stddev for donor
                    donor_stats.var,                                 # Variance for donor
                    donor_stats.std_mean,                            # Std error for donor
                    donor_stats.sum_weights,                         # Weighted sum for donor
                    np.float64(donor_cell.shape[0])                  # Observations for donor
                ],

                'imp': [
                    recipient_stats.mean - 1.96 * recipient_stats.std_mean,  # 95% CI low for imp
                    recipient_stats.mean,                                    # Mean for imp
                    recipient_stats.mean + 1.96 * recipient_stats.std_mean,  # 95% CI high for imp
                    recipient_stats.std,                                     # Stddev for imp
                    recipient_stats.var,                                     # Variance for imp
                    recipient_stats.std_mean,                                # Std error for imp
                    recipient_stats.sum_weights,                             # Weighted sum for imp
                    np.float64(recipient_cell.shape[0])                      # Observations for imp
                ]
            }

            # Convert dictionary to DataFrame
            stats_df = pl.DataFrame(data)
            stats_df = stats_df.with_columns((stats_df['imp'] - stats_df['donor']).alias('diff'))
            stats_df = stats_df.with_columns((stats_df['imp']/stats_df['donor']).alias('imp_to_donor_ratio'))

            results[i] = stats_df

        return results

    def gen_analysis_file(self, out_file:str, out_path:str =''):
        """
        Generate an analysis file summarizing the imputation results.
        :param out_file (str): Name of the output file.
        :param out_path (str): Path to save the output file.
        :return: None
        """
        if out_path == '':
            out_path = '.'
        # Ensure the output directory exists
        if not os.path.exists(out_path):
            raise FileNotFoundError(f"The directory '{out_path}' does not exist.")

        # Construct the full file path
        full_path = os.path.join(out_path, f'{out_file}.xlsx')

        # Get dictionary of DFs for each cell
        data = self.summarize_cells()
        
        # Get iterator for worksheet locations
        row = 1
        col = 0

        # Iterate through each cell's data
        with Workbook(full_path) as wb:  
            ws = wb.add_worksheet('Summary')
            for key, df in data.items():
                ws.write(row-1, col, key)
                # Write table to excel  
                df.write_excel(workbook = wb, 
                               worksheet = ws,
                               position = (row, col),
                               table_style="Table Style Light 1",
                               autofit = True)
            
                # 2 row gap between each DF's results
                row = row + df.shape[0] + 3
        print(f"Cell data written to '{out_path}\\{out_file}.xlsx'.")

    def apply_random_noise(self, variation_stdev, floor_noise = None):
        """
        Add random noise to smooth out issue of clustering
            * Within each cell, sort by asset value in donor data 
            * Get a lagged variable for each row showing asset value of next neighbor
            * Compute for the whole cell, the average distance between asset values and their neighbors.
            * Add noise to every recipient- a RV with mean 0 and standard deviation of 1/6th of the mean distance for that cell
        """
        imputed_recipient_cells = []

        for condition, donor_cell in self.donor_cells.items():
            # Sort donor cell by asset value
            donor_cell = donor_cell.sort(by=self.imputation_var)

            # Calculate the next neighbor values
            donor_cell = donor_cell.with_columns(
                donor_cell[self.imputation_var].shift(-1).alias('next_val')
            )

            # Compute the distance to prior and next neighbor
            donor_cell = donor_cell.with_columns(
                (donor_cell['next_val'] - donor_cell[self.imputation_var]).alias('next_distance')
            )
            
            # Calculate the average neighbor distance for the cell, ignoring NaN values
            ## First get mean of each row and then get mean of the result
            mean_distance = donor_cell['next_distance'].mean()

            # Calculate noise level as a proportion of the mean distance between neighbors
            noise_stdev = mean_distance * variation_stdev

            # Calculate the threshold value based on relevant floor for asset tests
            if floor_noise is not None:
                threshold = floor_noise
            else:
                threshold = self.donor_data[f'{self.imputation_var}'].min()

            # Generate random noise for each recipient in the cell
            # Only apply this random noise for those who are less than some factor of the standard deviation of neighboring distances
            # i.e. if floor_stdev_multiplier = 2, observations with <2x the standard deviation of neighboring distances are left alone
            recipient_cell = self.recipient_cells[condition]

            # identify the observations who are above the threshold, who will have random noise added
            # when there is no thresholding by floor_stdev_multiplier, this is handled by the minimum identified above
            ge_thresh = recipient_cell[f'imp_{self.imputation_var}'] >= threshold
            noise = np.random.normal(loc=0, scale=noise_stdev, size=recipient_cell.shape[0])
            
            # Indicate to user that noise was not generated if all values are below the threshold
            if ge_thresh.sum() == 0:
                print(f'\nCell:\n{condition}')
                print(f'NO NOISE GENERATED for cell due to thresholding.\n' 
                        f'All values are below the threshold of {threshold}\n'
                        f'Mean value of cell observations for imp_{self.imputation_var}: ' 
                        f'{recipient_cell[f'imp_{self.imputation_var}'].mean()}')

            # Apply noise to the imputed liquid assets in the recipient cell
            recipient_cell = recipient_cell.with_columns(
                pl.when(ge_thresh)
                .then(pl.col(f'imp_{self.imputation_var}') + noise)
                .otherwise(pl.col(f'imp_{self.imputation_var}'))
                .alias(f'imp_{self.imputation_var}')
            )

            # Ensure that values that have noise applied are not below the minimum donor value
            min_donor_val = donor_cell[self.imputation_var].min()
            recipient_cell = recipient_cell.with_columns(
                pl.col(f'imp_{self.imputation_var}')
                .clip(lower_bound = min_donor_val)
                .alias(f'imp_{self.imputation_var}')
            )

            # Update recipient data with noisy values
            self.recipient_cells[condition] = recipient_cell.with_columns(
                pl.col(f'imp_{self.imputation_var}')
            )
            imputed_recipient_cells.append(recipient_cell)
        # Store the variation standard deviation parameter
        self.random_noise = variation_stdev
        self.recipient_data = pl.concat(imputed_recipient_cells)
        
        return

    def summarize_column(self, data, column_name):
        """
        Summarize a column in data, returning basic statistics.
        :param column_name: The column to summarize
        :return: A dictionary with summary statistics
        """
        # Check if the column exists in the DataFrame
        if column_name not in data.columns:
            raise ValueError(f"Column '{column_name}' does not exist in donor_data.")
        
        # Calculate summary statistics
        summary_stats = {
            'mean': data[column_name].mean(),
            'median': data[column_name].median(),
            'min': data[column_name].min(),
            'max': data[column_name].max(),
            'std_dev': data[column_name].std(),
            'count': data[column_name].count(),
            'missing_values': data[column_name].is_null().sum()
        }

        return summary_stats

    def age_dollar_amounts(self, donor_year_cpi, imp_year_cpi):
        """
        Age the imputed values to the target year. Relevant when the source data and target data differ.
        https://www.cbo.gov/data/budget-economic-data#4 for CPI indexes
        """
        
        print(f'Summary of {self.imputation_var} pre CPI aging:\n{self.summarize_column(self.donor_data, self.imputation_var)}')
        scaling_factor = imp_year_cpi / donor_year_cpi

        self.donor_data = self.donor_data.with_columns(
            (pl.col(self.imputation_var) * scaling_factor).alias(self.imputation_var)
        )
        print(f'Summary of {self.imputation_var} post CPI aging:\n{self.summarize_column(self.donor_data, self.imputation_var)}')

        return

    def impute(self):
        """
        Impute the missing values in the recipient data using the donor data for corresponding cells.
        This method assumes that both donor and recipient data have been partitioned using generate_cells.
        """
        if not self.cell_definitions:
            raise ValueError("Cell definitions are not provided")
        
        # List to hold imputed recipient cells
        imputed_recipient_cells = []

        # For each recipient cell, find the corresponding donor cell and perform imputation
        for condition, recipient_cell in self.recipient_cells.items():
            donor_cell = self.donor_cells.get(condition)
            
            if donor_cell is not None and not donor_cell.shape[0] == 0:
                # Perform weighted random selection for the required number of values
                if self.weight_var:
                    weights = donor_cell[self.weight_var]
                    donor_values = donor_cell[self.imputation_var].drop_nulls()

                    # Randomly select `missing_count` values from the donor set using the weights
                    # Using weighted selection according to probability proportional to weights https://documentation.sas.com/doc/en/statcdc/14.2/statug/statug_surveyimpute_details25.htm#statug.surveyimpute.weightedDet
                    selected_values = np.random.choice(donor_values, size=len(recipient_cell), replace=True, p=weights / weights.sum())
                else:
                    # Without weights, simply sample donor values
                    donor_values = donor_cell[self.imputation_var].drop_nulls()
                    selected_values = np.random.choice(donor_values, size=len(recipient_cell), replace=True)

                # Add the imputed values to the recipient cell
                recipient_cell = recipient_cell.with_columns(
                    pl.Series(f'imp_{self.imputation_var}', selected_values)
                )
                # Add the imputed recipient cell to the list
                imputed_recipient_cells.append(recipient_cell)
                self.recipient_cells[condition] = recipient_cell.clone()

            else:
                # If no donors are available, imputation is not performed (or can apply other fallback logic here)
                print(f"No donors available for {condition}, global mean applied")
                recipient_cell[f'imp_{self.imputation_var}'] = np.average(self.donor_data[self.imputation_var], 
                                                                          self.donor_data[self.weight_var])

                # Add the imputed recipient cell to the list
                imputed_recipient_cells.append(recipient_cell)
                self.recipient_cell = recipient_cell.clone()

        # Combine all the imputed recipient cells into one DataFrame
        self.recipient_data = pl.concat(imputed_recipient_cells)

        return

In [810]:
donor_data = {
    'donor_assets': [5000000, 20000, 300000, 200000, 
                     100000, 10000, 200, 200000, 4000, 500000],
    'race_cell': ['Black','Black','Black','White','White',
                     'White','Black','White','Black','Black'],
    'sex_cell': ['M','F','F','M','F',
                     'M','F','F','M','F'],
    'work_cell': [1,0,1,0,1,
                     0,1,1,1,0],
    'weight': [1, 2, 1, 2, 1,
               2, 1, 2, 1, 2]
}

donor_data = pl.DataFrame(donor_data)

recipient_data = {
    'race_cell': ['Black','Black','Black','White','White',
                     'White','Black','White','Black','Black','Black','Black','White','White'],
    'sex_cell': ['M','F','F','M','F',
                     'M','F','F','M','F', 'F', 'M', 'M', 'F'],
    'work_cell': [1,0,1,0,1,
                     0,1,1,1,0,0,1,0,1],
    'weight': [1, 3, 2, 3, 2,
               1, 4, 2, 1, 3, 4, 2, 1, 1]
}

recipient_data = pl.DataFrame(recipient_data)

In [811]:
imputer = HotDeckImputer(donor_data = donor_data, 
                         imputation_var = 'donor_assets', 
                         weight_var = 'weight', 
                         recipient_data = recipient_data)

In [812]:
imputer.age_dollar_amounts(donor_year_cpi = 223.1, imp_year_cpi = 322.1)


Summary of donor_assets pre CPI aging:
{'mean': 633420.0, 'median': 150000.0, 'min': 200, 'max': 5000000, 'std_dev': 1542663.896425055, 'count': 10, 'missing_values': 0}
Summary of donor_assets post CPI aging:
{'mean': 914498.3505154641, 'median': 216562.07978484986, 'min': 288.74943971313314, 'max': 7218735.992828329, 'std_dev': 2227216.678792068, 'count': 10, 'missing_values': 0}


In [813]:
variables = ['race_cell','sex_cell']

imputer.define_cells(variables)
imputer.cell_definitions

["race_cell == 'White' & sex_cell == 'M'",
 "race_cell == 'White' & sex_cell == 'F'",
 "race_cell == 'Black' & sex_cell == 'M'",
 "race_cell == 'Black' & sex_cell == 'F'"]

In [814]:
imputer.generate_cells()

[(col("race_cell")) == (String(White))]
[(col("sex_cell")) == (String(M))]
[(col("race_cell")) == (String(White))]
[(col("sex_cell")) == (String(F))]
[(col("race_cell")) == (String(Black))]
[(col("sex_cell")) == (String(M))]
[(col("race_cell")) == (String(Black))]
[(col("sex_cell")) == (String(F))]


In [815]:
imputer.cell_definitions

["race_cell == 'White' & sex_cell == 'M'",
 "race_cell == 'White' & sex_cell == 'F'",
 "race_cell == 'Black' & sex_cell == 'M'",
 "race_cell == 'Black' & sex_cell == 'F'"]

In [816]:
imputer.recipient_cells

{"race_cell == 'White' & sex_cell == 'M'": shape: (3, 4)
 ┌───────────┬──────────┬───────────┬────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════╪══════════╪═══════════╪════════╡
 │ White     ┆ M        ┆ 0         ┆ 3      │
 │ White     ┆ M        ┆ 0         ┆ 1      │
 │ White     ┆ M        ┆ 0         ┆ 1      │
 └───────────┴──────────┴───────────┴────────┘,
 "race_cell == 'White' & sex_cell == 'F'": shape: (3, 4)
 ┌───────────┬──────────┬───────────┬────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════╪══════════╪═══════════╪════════╡
 │ White     ┆ F        ┆ 1         ┆ 2      │
 │ White     ┆ F        ┆ 1         ┆ 2      │
 │ White     ┆ F        ┆ 1         ┆ 1      │
 └───────────┴──────────┴───────────┴────────┘,
 "race_cell == 'Black' & sex_cell == '

In [817]:
imputer.donor_cells

{"race_cell == 'White' & sex_cell == 'M'": shape: (2, 5)
 ┌───────────────┬───────────┬──────────┬───────────┬────────┐
 │ donor_assets  ┆ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---           ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ f64           ┆ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════════╪═══════════╪══════════╪═══════════╪════════╡
 │ 288749.439713 ┆ White     ┆ M        ┆ 0         ┆ 2      │
 │ 14437.471986  ┆ White     ┆ M        ┆ 0         ┆ 2      │
 └───────────────┴───────────┴──────────┴───────────┴────────┘,
 "race_cell == 'White' & sex_cell == 'F'": shape: (2, 5)
 ┌───────────────┬───────────┬──────────┬───────────┬────────┐
 │ donor_assets  ┆ race_cell ┆ sex_cell ┆ work_cell ┆ weight │
 │ ---           ┆ ---       ┆ ---      ┆ ---       ┆ ---    │
 │ f64           ┆ str       ┆ str      ┆ i64       ┆ i64    │
 ╞═══════════════╪═══════════╪══════════╪═══════════╪════════╡
 │ 144374.719857 ┆ White     ┆ F        ┆ 1         ┆ 1      │
 │ 

In [818]:
imputer.split_cell("race_cell == 'Black' & sex_cell == 'F'", "work_cell")


[(col("race_cell")) == (String(Black))]
[(col("sex_cell")) == (String(F))]
[(col("work_cell")) == (dyn int: 0)]
[(col("race_cell")) == (String(Black))]
[(col("sex_cell")) == (String(F))]
[(col("work_cell")) == (dyn int: 1)]


In [819]:
imputer.cell_definitions

["race_cell == 'White' & sex_cell == 'M'",
 "race_cell == 'White' & sex_cell == 'F'",
 "race_cell == 'Black' & sex_cell == 'M'",
 "race_cell == 'Black' & sex_cell == 'F' & work_cell == 0",
 "race_cell == 'Black' & sex_cell == 'F' & work_cell == 1"]

In [820]:
imputer.impute()

In [821]:
imputer.recipient_cells

{"race_cell == 'White' & sex_cell == 'M'": shape: (3, 5)
 ┌───────────┬──────────┬───────────┬────────┬──────────────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight ┆ imp_donor_assets │
 │ ---       ┆ ---      ┆ ---       ┆ ---    ┆ ---              │
 │ str       ┆ str      ┆ i64       ┆ i64    ┆ f64              │
 ╞═══════════╪══════════╪═══════════╪════════╪══════════════════╡
 │ White     ┆ M        ┆ 0         ┆ 3      ┆ 288749.439713    │
 │ White     ┆ M        ┆ 0         ┆ 1      ┆ 14437.471986     │
 │ White     ┆ M        ┆ 0         ┆ 1      ┆ 288749.439713    │
 └───────────┴──────────┴───────────┴────────┴──────────────────┘,
 "race_cell == 'White' & sex_cell == 'F'": shape: (3, 5)
 ┌───────────┬──────────┬───────────┬────────┬──────────────────┐
 │ race_cell ┆ sex_cell ┆ work_cell ┆ weight ┆ imp_donor_assets │
 │ ---       ┆ ---      ┆ ---       ┆ ---    ┆ ---              │
 │ str       ┆ str      ┆ i64       ┆ i64    ┆ f64              │
 ╞═══════════╪══════════╪══

In [822]:
imputer.recipient_data

race_cell,sex_cell,work_cell,weight,imp_donor_assets
str,str,i64,i64,f64
"""White""","""M""",0,3,288749.439713
"""White""","""M""",0,1,14437.471986
"""White""","""M""",0,1,288749.439713
"""White""","""F""",1,2,288749.439713
"""White""","""F""",1,2,288749.439713
…,…,…,…,…
"""Black""","""F""",0,3,721873.599283
"""Black""","""F""",0,3,28874.943971
"""Black""","""F""",0,4,721873.599283
"""Black""","""F""",1,2,288.74944


In [823]:
imputer.apply_random_noise(variation_stdev = (1/6), floor_noise = 1.5)

In [824]:
imputer.random_noise

0.16666666666666666

In [827]:
imputer.gen_analysis_file('hot_deck_stats', '')

FileNotFoundError: The directory '' does not exist.

In [None]:
imputer.gen_analysis_file()