From bdcae37f414a5fa502d01018374d902293807481 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 17:28:48 -0600 Subject: [PATCH 01/35] Formatting the core stochtree python code with ruff --- stochtree/__init__.py | 30 +- stochtree/bart.py | 925 ++++++++++++++------- stochtree/bcf.py | 1590 +++++++++++++++++++++++++----------- stochtree/calibration.py | 45 +- stochtree/data.py | 34 +- stochtree/forest.py | 528 +++++++----- stochtree/preprocessing.py | 530 +++++++----- stochtree/sampler.py | 206 +++-- stochtree/serialization.py | 126 ++- stochtree/utils.py | 4 +- 10 files changed, 2732 insertions(+), 1286 deletions(-) diff --git a/stochtree/__init__.py b/stochtree/__init__.py index 8e3cc643..5c68ccdb 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -9,18 +9,18 @@ from .utils import NotSampledError __all__ = [ - 'BARTModel', - 'BCFModel', - 'Dataset', - 'Residual', - 'ForestContainer', - 'Forest', - 'CovariatePreprocessor', - 'RNG', - 'ForestSampler', - 'GlobalVarianceModel', - 'LeafVarianceModel', - 'JSONSerializer', - 'NotSampledError', - 'calibrate_global_error_variance' -] \ No newline at end of file + "BARTModel", + "BCFModel", + "Dataset", + "Residual", + "ForestContainer", + "Forest", + "CovariatePreprocessor", + "RNG", + "ForestSampler", + "GlobalVarianceModel", + "LeafVarianceModel", + "JSONSerializer", + "NotSampledError", + "calibrate_global_error_variance", +] diff --git a/stochtree/bart.py b/stochtree/bart.py index 0159fc92..3d1a00c1 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -1,6 +1,7 @@ """ Bayesian Additive Regression Trees (BART) module """ + import warnings from numbers import Number, Integral from math import log @@ -14,6 +15,7 @@ from .serialization import JSONSerializer from .utils import NotSampledError + class BARTModel: r""" Class that handles sampling, storage, and serialization of stochastic forest models for supervised learning. @@ -48,17 +50,28 @@ class BARTModel: - Leaf Regression: Rather than letting `f(X)` define a standard decision tree ensemble, in which each tree uses `X` to partition the data and then serve up constant predictions, we allow for models `f(X,Z)` in which `X` and `Z` together define a partitioned linear model (`X` partitions the data and `Z` serves as the basis for regression models). This model can be run by specifying `basis_train` in the `sample` method. - Heteroskedasticity: Rather than define $\epsilon$ parameterically, we can let a forest $\sigma^2(X)$ model a conditional error variance function. This can be done by setting `num_trees_variance > 0` in the `params` dictionary passed to the `sample` method. """ + def __init__(self) -> None: # Internal flag for whether the sample() method has been run self.sampled = False self.rng = np.random.default_rng() - - def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basis_train: np.array = None, - X_test: Union[np.array, pd.DataFrame] = None, basis_test: np.array = None, - num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, general_params: Optional[Dict[str, Any]] = None, - mean_forest_params: Optional[Dict[str, Any]] = None, variance_forest_params: Optional[Dict[str, Any]] = None) -> None: - """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set. - Does not require a leaf regression basis. 
+ + def sample( + self, + X_train: Union[np.array, pd.DataFrame], + y_train: np.array, + basis_train: np.array = None, + X_test: Union[np.array, pd.DataFrame] = None, + basis_test: np.array = None, + num_gfr: int = 5, + num_burnin: int = 0, + num_mcmc: int = 100, + general_params: Optional[Dict[str, Any]] = None, + mean_forest_params: Optional[Dict[str, Any]] = None, + variance_forest_params: Optional[Dict[str, Any]] = None, + ) -> None: + """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set. + Does not require a leaf regression basis. Parameters ---------- @@ -71,7 +84,7 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi X_test : np.array, optional Optional test set covariates. basis_test : np.array, optional - Optional test set basis vector used to define a regression to be run in the leaves of each tree. + Optional test set basis vector used to define a regression to be run in the leaves of each tree. Must be included / omitted consistently (i.e. if basis_train is provided, then basis_test must be provided alongside X_test). num_gfr : int, optional Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to `5`. @@ -94,7 +107,7 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi * `keep_gfr` (`bool`): Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to `False`. Ignored if `num_mcmc == 0`. * `keep_every` (`int`): How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Defaults to `1`. Setting `keep_every = k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples. * `num_chains` (`int`): How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Defaults to `1`. - + mean_forest_params : dict, optional Dictionary of mean forest model parameters, each of which has a default value processed internally, so this argument is optional. 
@@ -132,18 +145,18 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi """ # Update general BART parameters general_params_default = { - 'cutpoint_grid_size' : 100, - 'standardize' : True, - 'sample_sigma2_global' : True, - 'sigma2_init' : None, - 'sigma2_global_shape' : 0, - 'sigma2_global_scale' : 0, - 'variable_weights' : None, - 'random_seed' : -1, - 'keep_burnin' : False, - 'keep_gfr' : False, - 'keep_every' : 1, - 'num_chains' : 1 + "cutpoint_grid_size": 100, + "standardize": True, + "sample_sigma2_global": True, + "sigma2_init": None, + "sigma2_global_shape": 0, + "sigma2_global_scale": 0, + "variable_weights": None, + "random_seed": -1, + "keep_burnin": False, + "keep_gfr": False, + "keep_every": 1, + "num_chains": 1, } general_params_updated = _preprocess_params( general_params_default, general_params @@ -151,81 +164,83 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Update mean forest BART parameters mean_forest_params_default = { - 'num_trees' : 200, - 'alpha' : 0.95, - 'beta' : 2.0, - 'min_samples_leaf' : 5, - 'max_depth' : 10, - 'sample_sigma2_leaf' : True, - 'sigma2_leaf_init' : None, - 'sigma2_leaf_shape' : 3, - 'sigma2_leaf_scale' : None, - 'keep_vars' : None, - 'drop_vars' : None + "num_trees": 200, + "alpha": 0.95, + "beta": 2.0, + "min_samples_leaf": 5, + "max_depth": 10, + "sample_sigma2_leaf": True, + "sigma2_leaf_init": None, + "sigma2_leaf_shape": 3, + "sigma2_leaf_scale": None, + "keep_vars": None, + "drop_vars": None, } mean_forest_params_updated = _preprocess_params( mean_forest_params_default, mean_forest_params ) - + # Update variance forest BART parameters variance_forest_params_default = { - 'num_trees' : 0, - 'alpha' : 0.95, - 'beta' : 2.0, - 'min_samples_leaf' : 5, - 'max_depth' : 10, - 'leaf_prior_calibration_param': 1.5, - 'var_forest_leaf_init' : None, - 'var_forest_prior_shape' : None, - 'var_forest_prior_scale' : None, - 'keep_vars' : None, - 'drop_vars' : None + "num_trees": 0, + "alpha": 0.95, + "beta": 2.0, + "min_samples_leaf": 5, + "max_depth": 10, + "leaf_prior_calibration_param": 1.5, + "var_forest_leaf_init": None, + "var_forest_prior_shape": None, + "var_forest_prior_scale": None, + "keep_vars": None, + "drop_vars": None, } variance_forest_params_updated = _preprocess_params( variance_forest_params_default, variance_forest_params ) - + ### Unpack all parameter values # 1. 
General parameters - cutpoint_grid_size = general_params_updated['cutpoint_grid_size'] - self.standardize = general_params_updated['standardize'] - sample_sigma_global = general_params_updated['sample_sigma2_global'] - sigma2_init = general_params_updated['sigma2_init'] - a_global = general_params_updated['sigma2_global_shape'] - b_global = general_params_updated['sigma2_global_scale'] - variable_weights = general_params_updated['variable_weights'] - random_seed = general_params_updated['random_seed'] - keep_burnin = general_params_updated['keep_burnin'] - keep_gfr = general_params_updated['keep_gfr'] - keep_every = general_params_updated['keep_every'] - num_chains = general_params_updated['num_chains'] + cutpoint_grid_size = general_params_updated["cutpoint_grid_size"] + self.standardize = general_params_updated["standardize"] + sample_sigma_global = general_params_updated["sample_sigma2_global"] + sigma2_init = general_params_updated["sigma2_init"] + a_global = general_params_updated["sigma2_global_shape"] + b_global = general_params_updated["sigma2_global_scale"] + variable_weights = general_params_updated["variable_weights"] + random_seed = general_params_updated["random_seed"] + keep_burnin = general_params_updated["keep_burnin"] + keep_gfr = general_params_updated["keep_gfr"] + keep_every = general_params_updated["keep_every"] + num_chains = general_params_updated["num_chains"] # 2. Mean forest parameters - num_trees_mean = mean_forest_params_updated['num_trees'] - alpha_mean = mean_forest_params_updated['alpha'] - beta_mean = mean_forest_params_updated['beta'] - min_samples_leaf_mean = mean_forest_params_updated['min_samples_leaf'] - max_depth_mean = mean_forest_params_updated['max_depth'] - sample_sigma_leaf = mean_forest_params_updated['sample_sigma2_leaf'] - sigma_leaf = mean_forest_params_updated['sigma2_leaf_init'] - a_leaf = mean_forest_params_updated['sigma2_leaf_shape'] - b_leaf = mean_forest_params_updated['sigma2_leaf_scale'] - keep_vars_mean = mean_forest_params_updated['keep_vars'] - drop_vars_mean = mean_forest_params_updated['drop_vars'] + num_trees_mean = mean_forest_params_updated["num_trees"] + alpha_mean = mean_forest_params_updated["alpha"] + beta_mean = mean_forest_params_updated["beta"] + min_samples_leaf_mean = mean_forest_params_updated["min_samples_leaf"] + max_depth_mean = mean_forest_params_updated["max_depth"] + sample_sigma_leaf = mean_forest_params_updated["sample_sigma2_leaf"] + sigma_leaf = mean_forest_params_updated["sigma2_leaf_init"] + a_leaf = mean_forest_params_updated["sigma2_leaf_shape"] + b_leaf = mean_forest_params_updated["sigma2_leaf_scale"] + keep_vars_mean = mean_forest_params_updated["keep_vars"] + drop_vars_mean = mean_forest_params_updated["drop_vars"] # 3. 
Variance forest parameters - num_trees_variance = variance_forest_params_updated['num_trees'] - alpha_variance = variance_forest_params_updated['alpha'] - beta_variance = variance_forest_params_updated['beta'] - min_samples_leaf_variance = variance_forest_params_updated['min_samples_leaf'] - max_depth_variance = variance_forest_params_updated['max_depth'] - a_0 = variance_forest_params_updated['leaf_prior_calibration_param'] - variance_forest_leaf_init = variance_forest_params_updated['var_forest_leaf_init'] - a_forest = variance_forest_params_updated['var_forest_prior_shape'] - b_forest = variance_forest_params_updated['var_forest_prior_scale'] - keep_vars_variance = variance_forest_params_updated['keep_vars'] - drop_vars_variance = variance_forest_params_updated['drop_vars'] - + num_trees_variance = variance_forest_params_updated["num_trees"] + alpha_variance = variance_forest_params_updated["alpha"] + beta_variance = variance_forest_params_updated["beta"] + min_samples_leaf_variance = variance_forest_params_updated["min_samples_leaf"] + max_depth_variance = variance_forest_params_updated["max_depth"] + a_0 = variance_forest_params_updated["leaf_prior_calibration_param"] + variance_forest_leaf_init = variance_forest_params_updated[ + "var_forest_leaf_init" + ] + a_forest = variance_forest_params_updated["var_forest_prior_shape"] + b_forest = variance_forest_params_updated["var_forest_prior_scale"] + keep_vars_variance = variance_forest_params_updated["keep_vars"] + drop_vars_variance = variance_forest_params_updated["drop_vars"] + # Check that num_chains >= 1 if not isinstance(num_chains, Integral) or num_chains < 1: raise ValueError("num_chains must be an integer greater than 0") @@ -233,17 +248,23 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Check if there are enough GFR samples to seed num_chains samplers if num_gfr > 0: if num_chains > num_gfr: - raise ValueError("num_chains > num_gfr, meaning we do not have enough GFR samples to seed num_chains distinct MCMC chains") - + raise ValueError( + "num_chains > num_gfr, meaning we do not have enough GFR samples to seed num_chains distinct MCMC chains" + ) + # Determine which models (conditional mean, conditional variance, or both) we will fit self.include_mean_forest = True if num_trees_mean > 0 else False self.include_variance_forest = True if num_trees_variance > 0 else False - + # Check data inputs - if not isinstance(X_train, pd.DataFrame) and not isinstance(X_train, np.ndarray): + if not isinstance(X_train, pd.DataFrame) and not isinstance( + X_train, np.ndarray + ): raise ValueError("X_train must be a pandas dataframe or numpy array") if X_test is not None: - if not isinstance(X_test, pd.DataFrame) and not isinstance(X_test, np.ndarray): + if not isinstance(X_test, pd.DataFrame) and not isinstance( + X_test, np.ndarray + ): raise ValueError("X_test must be a pandas dataframe or numpy array") if not isinstance(y_train, np.ndarray): raise ValueError("y_train must be a numpy array") @@ -253,7 +274,7 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi if basis_test is not None: if not isinstance(basis_test, np.ndarray): raise ValueError("X_test must be a numpy array") - + # Convert everything to standard shape (2-dimensional) if isinstance(X_train, np.ndarray): if X_train.ndim == 1: @@ -270,46 +291,58 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi if basis_test is not None: if basis_test.ndim == 1: basis_test = 
np.expand_dims(basis_test, 1) - + # Data checks if X_test is not None: if X_test.shape[1] != X_train.shape[1]: - raise ValueError("X_train and X_test must have the same number of columns") + raise ValueError( + "X_train and X_test must have the same number of columns" + ) if basis_test is not None: if basis_train is not None: if basis_test.shape[1] != basis_train.shape[1]: - raise ValueError("basis_train and basis_test must have the same number of columns") + raise ValueError( + "basis_train and basis_test must have the same number of columns" + ) else: raise ValueError("basis_test provided but basis_train was not") if basis_train is not None: if basis_train.shape[0] != X_train.shape[0]: - raise ValueError("basis_train and Z_train must have the same number of rows") + raise ValueError( + "basis_train and Z_train must have the same number of rows" + ) if y_train.shape[0] != X_train.shape[0]: raise ValueError("X_train and y_train must have the same number of rows") if X_test is not None and basis_test is not None: if X_test.shape[0] != basis_test.shape[0]: - raise ValueError("X_test and basis_test must have the same number of rows") + raise ValueError( + "X_test and basis_test must have the same number of rows" + ) # Variable weight preprocessing (and initialization if necessary) p = X_train.shape[1] if variable_weights is None: if X_train.ndim > 1: - variable_weights = np.repeat(1.0/p, p) + variable_weights = np.repeat(1.0 / p, p) else: - variable_weights = np.repeat(1., 1) + variable_weights = np.repeat(1.0, 1) if np.any(variable_weights < 0): raise ValueError("variable_weights cannot have any negative weights") variable_weights_mean = variable_weights variable_weights_variance = variable_weights - + # Covariate preprocessing self._covariate_preprocessor = CovariatePreprocessor() self._covariate_preprocessor.fit(X_train) X_train_processed = self._covariate_preprocessor.transform(X_train) if X_test is not None: X_test_processed = self._covariate_preprocessor.transform(X_test) - feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types) - original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices() + feature_types = np.asarray( + self._covariate_preprocessor._processed_feature_types + ) + original_var_indices = ( + self._covariate_preprocessor.fetch_original_feature_indices() + ) # Determine whether a test set is provided self.has_test = X_test is not None @@ -328,26 +361,48 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi if isinstance(keep_vars_mean, list): if all(isinstance(i, str) for i in keep_vars_mean): if not np.all(np.isin(keep_vars_mean, X_train.columns)): - raise ValueError("keep_vars_mean includes some variable names that are not in X_train") - variable_subset_mean = [i for i in X_train.shape[1] if keep_vars_mean.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_mean includes some variable names that are not in X_train" + ) + variable_subset_mean = [ + i + for i in X_train.shape[1] + if keep_vars_mean.count(X_train.columns.array[i]) > 0 + ] elif all(isinstance(i, int) for i in keep_vars_mean): if any(i >= X_train.shape[1] for i in keep_vars_mean): - raise ValueError("keep_vars_mean includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "keep_vars_mean includes some variable indices that exceed the number of columns in X_train" + ) if any(i < 0 for i in keep_vars_mean): - raise ValueError("keep_vars_mean includes some negative variable 
indices") + raise ValueError( + "keep_vars_mean includes some negative variable indices" + ) variable_subset_mean = keep_vars_mean else: - raise ValueError("keep_vars_mean must be a list of variable names (str) or column indices (int)") + raise ValueError( + "keep_vars_mean must be a list of variable names (str) or column indices (int)" + ) elif isinstance(keep_vars_mean, np.ndarray): if keep_vars_mean.dtype == np.str_: if not np.all(np.isin(keep_vars_mean, X_train.columns)): - raise ValueError("keep_vars_mean includes some variable names that are not in X_train") - variable_subset_mean = [i for i in X_train.shape[1] if keep_vars_mean.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_mean includes some variable names that are not in X_train" + ) + variable_subset_mean = [ + i + for i in X_train.shape[1] + if keep_vars_mean.count(X_train.columns.array[i]) > 0 + ] else: if np.any(keep_vars_mean >= X_train.shape[1]): - raise ValueError("keep_vars_mean includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "keep_vars_mean includes some variable indices that exceed the number of columns in X_train" + ) if np.any(keep_vars_mean < 0): - raise ValueError("keep_vars_mean includes some negative variable indices") + raise ValueError( + "keep_vars_mean includes some negative variable indices" + ) variable_subset_mean = [i for i in keep_vars_mean] else: raise ValueError("keep_vars_mean must be a list or np.array") @@ -355,27 +410,49 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi if isinstance(drop_vars_mean, list): if all(isinstance(i, str) for i in drop_vars_mean): if not np.all(np.isin(drop_vars_mean, X_train.columns)): - raise ValueError("drop_vars_mean includes some variable names that are not in X_train") - variable_subset_mean = [i for i in range(X_train.shape[1]) if drop_vars_mean.count(X_train.columns.array[i]) == 0] + raise ValueError( + "drop_vars_mean includes some variable names that are not in X_train" + ) + variable_subset_mean = [ + i + for i in range(X_train.shape[1]) + if drop_vars_mean.count(X_train.columns.array[i]) == 0 + ] elif all(isinstance(i, int) for i in drop_vars_mean): if any(i >= X_train.shape[1] for i in drop_vars_mean): - raise ValueError("drop_vars_mean includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "drop_vars_mean includes some variable indices that exceed the number of columns in X_train" + ) if any(i < 0 for i in drop_vars_mean): - raise ValueError("drop_vars_mean includes some negative variable indices") - variable_subset_mean = [i for i in range(X_train.shape[1]) if drop_vars_mean.count(i) == 0] + raise ValueError( + "drop_vars_mean includes some negative variable indices" + ) + variable_subset_mean = [ + i + for i in range(X_train.shape[1]) + if drop_vars_mean.count(i) == 0 + ] else: - raise ValueError("drop_vars_mean must be a list of variable names (str) or column indices (int)") + raise ValueError( + "drop_vars_mean must be a list of variable names (str) or column indices (int)" + ) elif isinstance(drop_vars_mean, np.ndarray): if drop_vars_mean.dtype == np.str_: if not np.all(np.isin(drop_vars_mean, X_train.columns)): - raise ValueError("drop_vars_mean includes some variable names that are not in X_train") + raise ValueError( + "drop_vars_mean includes some variable names that are not in X_train" + ) keep_inds = ~np.isin(X_train.columns.array, drop_vars_mean) variable_subset_mean = [i for i in keep_inds] else: 
if np.any(drop_vars_mean >= X_train.shape[1]): - raise ValueError("drop_vars_mean includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "drop_vars_mean includes some variable indices that exceed the number of columns in X_train" + ) if np.any(drop_vars_mean < 0): - raise ValueError("drop_vars_mean includes some negative variable indices") + raise ValueError( + "drop_vars_mean includes some negative variable indices" + ) keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_mean) variable_subset_mean = [i for i in keep_inds] else: @@ -386,26 +463,48 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi if isinstance(keep_vars_variance, list): if all(isinstance(i, str) for i in keep_vars_variance): if not np.all(np.isin(keep_vars_variance, X_train.columns)): - raise ValueError("keep_vars_variance includes some variable names that are not in X_train") - variable_subset_variance = [i for i in X_train.shape[1] if keep_vars_variance.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_variance includes some variable names that are not in X_train" + ) + variable_subset_variance = [ + i + for i in X_train.shape[1] + if keep_vars_variance.count(X_train.columns.array[i]) > 0 + ] elif all(isinstance(i, int) for i in keep_vars_variance): if any(i >= X_train.shape[1] for i in keep_vars_variance): - raise ValueError("keep_vars_variance includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "keep_vars_variance includes some variable indices that exceed the number of columns in X_train" + ) if any(i < 0 for i in keep_vars_variance): - raise ValueError("keep_vars_variance includes some negative variable indices") + raise ValueError( + "keep_vars_variance includes some negative variable indices" + ) variable_subset_variance = keep_vars_variance else: - raise ValueError("keep_vars_variance must be a list of variable names (str) or column indices (int)") + raise ValueError( + "keep_vars_variance must be a list of variable names (str) or column indices (int)" + ) elif isinstance(keep_vars_variance, np.ndarray): if keep_vars_variance.dtype == np.str_: if not np.all(np.isin(keep_vars_variance, X_train.columns)): - raise ValueError("keep_vars_variance includes some variable names that are not in X_train") - variable_subset_variance = [i for i in X_train.shape[1] if keep_vars_variance.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_variance includes some variable names that are not in X_train" + ) + variable_subset_variance = [ + i + for i in X_train.shape[1] + if keep_vars_variance.count(X_train.columns.array[i]) > 0 + ] else: if np.any(keep_vars_variance >= X_train.shape[1]): - raise ValueError("keep_vars_variance includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "keep_vars_variance includes some variable indices that exceed the number of columns in X_train" + ) if np.any(keep_vars_variance < 0): - raise ValueError("keep_vars_variance includes some negative variable indices") + raise ValueError( + "keep_vars_variance includes some negative variable indices" + ) variable_subset_variance = [i for i in keep_vars_variance] else: raise ValueError("keep_vars_variance must be a list or np.array") @@ -413,47 +512,82 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi if isinstance(drop_vars_variance, list): if all(isinstance(i, str) for i in drop_vars_variance): if not 
np.all(np.isin(drop_vars_variance, X_train.columns)): - raise ValueError("drop_vars_variance includes some variable names that are not in X_train") - variable_subset_variance = [i for i in range(X_train.shape[1]) if drop_vars_variance.count(X_train.columns.array[i]) == 0] + raise ValueError( + "drop_vars_variance includes some variable names that are not in X_train" + ) + variable_subset_variance = [ + i + for i in range(X_train.shape[1]) + if drop_vars_variance.count(X_train.columns.array[i]) == 0 + ] elif all(isinstance(i, int) for i in drop_vars_variance): if any(i >= X_train.shape[1] for i in drop_vars_variance): - raise ValueError("drop_vars_variance includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "drop_vars_variance includes some variable indices that exceed the number of columns in X_train" + ) if any(i < 0 for i in drop_vars_variance): - raise ValueError("drop_vars_variance includes some negative variable indices") - variable_subset_variance = [i for i in range(X_train.shape[1]) if drop_vars_variance.count(i) == 0] + raise ValueError( + "drop_vars_variance includes some negative variable indices" + ) + variable_subset_variance = [ + i + for i in range(X_train.shape[1]) + if drop_vars_variance.count(i) == 0 + ] else: - raise ValueError("drop_vars_variance must be a list of variable names (str) or column indices (int)") + raise ValueError( + "drop_vars_variance must be a list of variable names (str) or column indices (int)" + ) elif isinstance(drop_vars_variance, np.ndarray): if drop_vars_variance.dtype == np.str_: if not np.all(np.isin(drop_vars_variance, X_train.columns)): - raise ValueError("drop_vars_variance includes some variable names that are not in X_train") + raise ValueError( + "drop_vars_variance includes some variable names that are not in X_train" + ) keep_inds = ~np.isin(X_train.columns.array, drop_vars_variance) variable_subset_variance = [i for i in keep_inds] else: if np.any(drop_vars_variance >= X_train.shape[1]): - raise ValueError("drop_vars_variance includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "drop_vars_variance includes some variable indices that exceed the number of columns in X_train" + ) if np.any(drop_vars_variance < 0): - raise ValueError("drop_vars_variance includes some negative variable indices") - keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_variance) + raise ValueError( + "drop_vars_variance includes some negative variable indices" + ) + keep_inds = ~np.isin( + np.arange(X_train.shape[1]), drop_vars_variance + ) variable_subset_variance = [i for i in keep_inds] else: raise ValueError("drop_vars_variance must be a list or np.array") else: variable_subset_variance = [i for i in range(X_train.shape[1])] - + # Update variable weights if the covariates have been resized (by e.g. 
one-hot encoding) if X_train_processed.shape[1] != X_train.shape[1]: - variable_counts = [original_var_indices.count(i) for i in original_var_indices] - variable_weights_adj = np.array([1/i for i in variable_counts]) + variable_counts = [ + original_var_indices.count(i) for i in original_var_indices + ] + variable_weights_adj = np.array([1 / i for i in variable_counts]) if self.include_mean_forest: - variable_weights_mean = variable_weights_mean[original_var_indices]*variable_weights_adj + variable_weights_mean = ( + variable_weights_mean[original_var_indices] * variable_weights_adj + ) if self.include_variance_forest: - variable_weights_variance = variable_weights_variance[original_var_indices]*variable_weights_adj - + variable_weights_variance = ( + variable_weights_variance[original_var_indices] + * variable_weights_adj + ) + # Zero out weights for excluded variables - variable_weights_mean[[variable_subset_mean.count(i) == 0 for i in original_var_indices]] = 0 - variable_weights_variance[[variable_subset_variance.count(i) == 0 for i in original_var_indices]] = 0 - + variable_weights_mean[ + [variable_subset_mean.count(i) == 0 for i in original_var_indices] + ] = 0 + variable_weights_variance[ + [variable_subset_variance.count(i) == 0 for i in original_var_indices] + ] = 0 + # Scale outcome if self.standardize: self.y_bar = np.squeeze(np.mean(y_train)) @@ -461,21 +595,29 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi else: self.y_bar = 0 self.y_std = 1 - resid_train = (y_train-self.y_bar)/self.y_std + resid_train = (y_train - self.y_bar) / self.y_std # Calibrate priors for global sigma^2 and sigma_leaf (don't use regression initializer for warm-start or XBART) if not sigma2_init: - sigma2_init = 1.0*np.var(resid_train) + sigma2_init = 1.0 * np.var(resid_train) if not variance_forest_leaf_init: - variance_forest_leaf_init = 0.6*np.var(resid_train) + variance_forest_leaf_init = 0.6 * np.var(resid_train) current_sigma2 = sigma2_init self.sigma2_init = sigma2_init if self.include_mean_forest: - b_leaf = np.squeeze(np.var(resid_train)) / num_trees_mean if b_leaf is None else b_leaf - sigma_leaf = np.squeeze(np.var(resid_train)) / num_trees_mean if sigma_leaf is None else sigma_leaf + b_leaf = ( + np.squeeze(np.var(resid_train)) / num_trees_mean + if b_leaf is None + else b_leaf + ) + sigma_leaf = ( + np.squeeze(np.var(resid_train)) / num_trees_mean + if sigma_leaf is None + else sigma_leaf + ) current_leaf_scale = np.array([[sigma_leaf]]) else: - current_leaf_scale = np.array([[1.]]) + current_leaf_scale = np.array([[1.0]]) if self.include_variance_forest: if not a_forest: a_forest = num_trees_variance / a_0**2 + 0.5 @@ -483,9 +625,9 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi b_forest = num_trees_variance / a_0**2 else: if not a_forest: - a_forest = 1. + a_forest = 1.0 if not b_forest: - b_forest = 1. 
+ b_forest = 1.0 # Container of variance parameter samples self.num_gfr = num_gfr @@ -503,11 +645,11 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi self.sample_sigma_global = sample_sigma_global self.sample_sigma_leaf = sample_sigma_leaf if sample_sigma_global: - self.global_var_samples = np.empty(self.num_samples, dtype = np.float64) + self.global_var_samples = np.empty(self.num_samples, dtype=np.float64) if sample_sigma_leaf: - self.leaf_scale_samples = np.empty(self.num_samples, dtype = np.float64) + self.leaf_scale_samples = np.empty(self.num_samples, dtype=np.float64) sample_counter = -1 - + # Forest Dataset (covariates and optional basis) forest_dataset_train = Dataset() forest_dataset_train.add_covariates(X_train_processed) @@ -523,16 +665,34 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi residual_train = Residual(resid_train) # C++ random number generator - if random_seed is None: + if random_seed is None: cpp_rng = RNG(-1) else: cpp_rng = RNG(random_seed) - + # Sampling data structures if self.include_mean_forest: - forest_sampler_mean = ForestSampler(forest_dataset_train, feature_types, num_trees_mean, self.n_train, alpha_mean, beta_mean, min_samples_leaf_mean, max_depth_mean) + forest_sampler_mean = ForestSampler( + forest_dataset_train, + feature_types, + num_trees_mean, + self.n_train, + alpha_mean, + beta_mean, + min_samples_leaf_mean, + max_depth_mean, + ) if self.include_variance_forest: - forest_sampler_variance = ForestSampler(forest_dataset_train, feature_types, num_trees_variance, self.n_train, alpha_variance, beta_variance, min_samples_leaf_variance, max_depth_variance) + forest_sampler_variance = ForestSampler( + forest_dataset_train, + feature_types, + num_trees_variance, + self.n_train, + alpha_variance, + beta_variance, + min_samples_leaf_variance, + max_depth_variance, + ) # Set variance leaf model type (currently only one option) leaf_model_variance_forest = 3 @@ -547,12 +707,22 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Container of forest samples if self.include_mean_forest: - self.forest_container_mean = ForestContainer(num_trees_mean, 1, True, False) if not self.has_basis else ForestContainer(num_trees_mean, self.num_basis, False, False) - active_forest_mean = Forest(num_trees_mean, 1, True, False) if not self.has_basis else Forest(num_trees_mean, self.num_basis, False, False) + self.forest_container_mean = ( + ForestContainer(num_trees_mean, 1, True, False) + if not self.has_basis + else ForestContainer(num_trees_mean, self.num_basis, False, False) + ) + active_forest_mean = ( + Forest(num_trees_mean, 1, True, False) + if not self.has_basis + else Forest(num_trees_mean, self.num_basis, False, False) + ) if self.include_variance_forest: - self.forest_container_variance = ForestContainer(num_trees_variance, 1, True, True) + self.forest_container_variance = ForestContainer( + num_trees_variance, 1, True, True + ) active_forest_variance = Forest(num_trees_variance, 1, True, True) - + # Variance samplers if self.sample_sigma_global: global_var_model = GlobalVarianceModel() @@ -562,15 +732,27 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Initialize the leaves of each tree in the mean forest if self.include_mean_forest: if self.has_basis: - init_val_mean = np.repeat(0., basis_train.shape[1]) + init_val_mean = np.repeat(0.0, basis_train.shape[1]) else: - init_val_mean = np.array([0.]) - 
forest_sampler_mean.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_mean, leaf_model_mean_forest, init_val_mean) + init_val_mean = np.array([0.0]) + forest_sampler_mean.prepare_for_sampler( + forest_dataset_train, + residual_train, + active_forest_mean, + leaf_model_mean_forest, + init_val_mean, + ) # Initialize the leaves of each tree in the variance forest if self.include_variance_forest: init_val_variance = np.array([variance_forest_leaf_init]) - forest_sampler_variance.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_variance, leaf_model_variance_forest, init_val_variance) + forest_sampler_variance.prepare_for_sampler( + forest_dataset_train, + residual_train, + active_forest_variance, + leaf_model_variance_forest, + init_val_variance, + ) # Run GFR (warm start) if specified if self.num_gfr > 0: @@ -583,29 +765,61 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Sample the mean forest if self.include_mean_forest: forest_sampler_mean.sample_one_iteration( - self.forest_container_mean, active_forest_mean, forest_dataset_train, residual_train, - cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale, variable_weights_mean, a_forest, b_forest, - current_sigma2, leaf_model_mean_forest, keep_sample, True, True + self.forest_container_mean, + active_forest_mean, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale, + variable_weights_mean, + a_forest, + b_forest, + current_sigma2, + leaf_model_mean_forest, + keep_sample, + True, + True, ) - + # Sample the variance forest if self.include_variance_forest: forest_sampler_variance.sample_one_iteration( - self.forest_container_variance, active_forest_variance, forest_dataset_train, residual_train, - cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale, variable_weights_variance, a_forest, b_forest, - current_sigma2, leaf_model_variance_forest, keep_sample, True, True + self.forest_container_variance, + active_forest_variance, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale, + variable_weights_variance, + a_forest, + b_forest, + current_sigma2, + leaf_model_variance_forest, + keep_sample, + True, + True, ) # Sample variance parameters (if requested) if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + current_sigma2 = global_var_model.sample_one_iteration( + residual_train, cpp_rng, a_global, b_global + ) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf: - current_leaf_scale[0,0] = leaf_var_model.sample_one_iteration(active_forest_mean, cpp_rng, a_leaf, b_leaf) + current_leaf_scale[0, 0] = leaf_var_model.sample_one_iteration( + active_forest_mean, cpp_rng, a_leaf, b_leaf + ) if keep_sample: - self.leaf_scale_samples[sample_counter] = current_leaf_scale[0,0] - + self.leaf_scale_samples[sample_counter] = current_leaf_scale[ + 0, 0 + ] + # Run MCMC if self.num_burnin + self.num_mcmc > 0: for chain_num in range(num_chains): @@ -613,30 +827,58 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi forest_ind = num_gfr - chain_num - 1 if self.include_mean_forest: active_forest_mean.reset(self.forest_container_mean, forest_ind) - forest_sampler_mean.reconstitute_from_forest(active_forest_mean, forest_dataset_train, residual_train, True) + 
forest_sampler_mean.reconstitute_from_forest( + active_forest_mean, + forest_dataset_train, + residual_train, + True, + ) if self.include_variance_forest: - active_forest_variance.reset(self.forest_container_variance, forest_ind) - forest_sampler_variance.reconstitute_from_forest(active_forest_variance, forest_dataset_train, residual_train, False) + active_forest_variance.reset( + self.forest_container_variance, forest_ind + ) + forest_sampler_variance.reconstitute_from_forest( + active_forest_variance, + forest_dataset_train, + residual_train, + False, + ) if sample_sigma_global: current_sigma2 = self.global_var_samples[forest_ind] else: if self.include_mean_forest: active_forest_mean.reset_root() if init_val_mean.shape[0] == 1: - active_forest_mean.set_root_leaves(init_val_mean[0] / num_trees_mean) + active_forest_mean.set_root_leaves( + init_val_mean[0] / num_trees_mean + ) else: - active_forest_mean.set_root_leaves(init_val_mean / num_trees_mean) - forest_sampler_mean.reconstitute_from_forest(active_forest_mean, forest_dataset_train, residual_train, True) + active_forest_mean.set_root_leaves( + init_val_mean / num_trees_mean + ) + forest_sampler_mean.reconstitute_from_forest( + active_forest_mean, + forest_dataset_train, + residual_train, + True, + ) if self.include_variance_forest: active_forest_variance.reset_root() - active_forest_variance.set_root_leaves(log(variance_forest_leaf_init) / num_trees_mean) - forest_sampler_variance.reconstitute_from_forest(active_forest_variance, forest_dataset_train, residual_train, False) - + active_forest_variance.set_root_leaves( + log(variance_forest_leaf_init) / num_trees_mean + ) + forest_sampler_variance.reconstitute_from_forest( + active_forest_variance, + forest_dataset_train, + residual_train, + False, + ) + for i in range(self.num_gfr, num_temp_samples): is_mcmc = i + 1 > num_gfr + num_burnin if is_mcmc: mcmc_counter = i - num_gfr - num_burnin + 1 - if (mcmc_counter % keep_every == 0): + if mcmc_counter % keep_every == 0: keep_sample = True else: keep_sample = False @@ -650,29 +892,61 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Sample the mean forest if self.include_mean_forest: forest_sampler_mean.sample_one_iteration( - self.forest_container_mean, active_forest_mean, forest_dataset_train, residual_train, - cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale, variable_weights_mean, a_forest, b_forest, - current_sigma2, leaf_model_mean_forest, keep_sample, False, True + self.forest_container_mean, + active_forest_mean, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale, + variable_weights_mean, + a_forest, + b_forest, + current_sigma2, + leaf_model_mean_forest, + keep_sample, + False, + True, ) - + # Sample the variance forest if self.include_variance_forest: forest_sampler_variance.sample_one_iteration( - self.forest_container_variance, active_forest_variance, forest_dataset_train, residual_train, - cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale, variable_weights_variance, a_forest, b_forest, - current_sigma2, leaf_model_variance_forest, keep_sample, False, True + self.forest_container_variance, + active_forest_variance, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale, + variable_weights_variance, + a_forest, + b_forest, + current_sigma2, + leaf_model_variance_forest, + keep_sample, + False, + True, ) # Sample variance parameters (if requested) if 
self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + current_sigma2 = global_var_model.sample_one_iteration( + residual_train, cpp_rng, a_global, b_global + ) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf: - current_leaf_scale[0,0] = leaf_var_model.sample_one_iteration(active_forest_mean, cpp_rng, a_leaf, b_leaf) + current_leaf_scale[0, 0] = leaf_var_model.sample_one_iteration( + active_forest_mean, cpp_rng, a_leaf, b_leaf + ) if keep_sample: - self.leaf_scale_samples[sample_counter] = current_leaf_scale[0,0] - + self.leaf_scale_samples[sample_counter] = ( + current_leaf_scale[0, 0] + ) + # Mark the model as sampled self.sampled = True @@ -691,38 +965,60 @@ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basi # Store predictions if self.sample_sigma_global: - self.global_var_samples = self.global_var_samples*self.y_std*self.y_std + self.global_var_samples = self.global_var_samples * self.y_std * self.y_std if self.sample_sigma_leaf: self.leaf_scale_samples = self.leaf_scale_samples - + if self.include_mean_forest: - yhat_train_raw = self.forest_container_mean.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) - self.y_hat_train = yhat_train_raw*self.y_std + self.y_bar + yhat_train_raw = self.forest_container_mean.forest_container_cpp.Predict( + forest_dataset_train.dataset_cpp + ) + self.y_hat_train = yhat_train_raw * self.y_std + self.y_bar if self.has_test: - yhat_test_raw = self.forest_container_mean.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) - self.y_hat_test = yhat_test_raw*self.y_std + self.y_bar - + yhat_test_raw = self.forest_container_mean.forest_container_cpp.Predict( + forest_dataset_test.dataset_cpp + ) + self.y_hat_test = yhat_test_raw * self.y_std + self.y_bar + if self.include_variance_forest: - sigma_x_train_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) + sigma_x_train_raw = ( + self.forest_container_variance.forest_container_cpp.Predict( + forest_dataset_train.dataset_cpp + ) + ) if self.sample_sigma_global: self.sigma2_x_train = sigma_x_train_raw for i in range(self.num_samples): - self.sigma2_x_train[:,i] = sigma_x_train_raw[:,i]*self.global_var_samples[i] + self.sigma2_x_train[:, i] = ( + sigma_x_train_raw[:, i] * self.global_var_samples[i] + ) else: - self.sigma2_x_train = sigma_x_train_raw*self.sigma2_init*self.y_std*self.y_std + self.sigma2_x_train = ( + sigma_x_train_raw * self.sigma2_init * self.y_std * self.y_std + ) if self.has_test: - sigma_x_test_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) + sigma_x_test_raw = ( + self.forest_container_variance.forest_container_cpp.Predict( + forest_dataset_test.dataset_cpp + ) + ) if self.sample_sigma_global: self.sigma2_x_test = sigma_x_test_raw for i in range(self.num_samples): - self.sigma2_x_test[:,i] = sigma_x_test_raw[:,i]*self.global_var_samples[i] + self.sigma2_x_test[:, i] = ( + sigma_x_test_raw[:, i] * self.global_var_samples[i] + ) else: - self.sigma2_x_test = sigma_x_test_raw*self.sigma2_init*self.y_std*self.y_std + self.sigma2_x_test = ( + sigma_x_test_raw * self.sigma2_init * self.y_std * self.y_std + ) - def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None) -> Union[np.array, tuple]: - """Return predictions from every forest sampled (either / both of mean and variance). 
- Return type is either a single array of predictions, if a BART model only includes a + def predict( + self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None + ) -> Union[np.array, tuple]: + """Return predictions from every forest sampled (either / both of mean and variance). + Return type is either a single array of predictions, if a BART model only includes a mean or variance term, or a tuple of prediction arrays, if a BART model includes both. Parameters @@ -731,7 +1027,7 @@ def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = N Test set covariates. basis : np.array, optional Optional test set basis vector, must be provided if the model was trained with a leaf regression basis. - + Returns ------- mu_x : np.array, optional @@ -745,16 +1041,20 @@ def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = N "appropriate arguments before using this model." ) raise NotSampledError(msg) - + # Data checks - if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + if not isinstance(covariates, pd.DataFrame) and not isinstance( + covariates, np.ndarray + ): raise ValueError("covariates must be a pandas dataframe or numpy array") if basis is not None: if not isinstance(basis, np.ndarray): raise ValueError("basis must be a numpy array") if basis.shape[0] != covariates.shape[0]: - raise ValueError("covariates and basis must have the same number of rows") - + raise ValueError( + "covariates and basis must have the same number of rows" + ) + # Convert everything to standard shape (2-dimensional) if isinstance(covariates, np.ndarray): if covariates.ndim == 1: @@ -762,15 +1062,24 @@ def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = N if basis is not None: if basis.ndim == 1: basis = np.expand_dims(basis, 1) - + # Covariate preprocessing if not self._covariate_preprocessor._check_is_fitted(): if not isinstance(covariates, np.ndarray): - raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.") + raise ValueError( + "Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe." + ) else: - warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning) - if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer): - raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.") + warnings.warn( + "This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. 
Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", + RuntimeWarning, + ) + if not np.issubdtype( + covariates.dtype, np.floating + ) and not np.issubdtype(covariates.dtype, np.integer): + raise ValueError( + "Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe." + ) covariates_processed = covariates else: self._covariate_preprocessor = CovariatePreprocessor() @@ -782,26 +1091,36 @@ def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = N pred_dataset.add_covariates(covariates_processed) if basis is not None: pred_dataset.add_basis(basis) - + # Forest predictions if self.include_mean_forest: - mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp) - mean_pred = mean_pred_raw*self.y_std + self.y_bar + mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict( + pred_dataset.dataset_cpp + ) + mean_pred = mean_pred_raw * self.y_std + self.y_bar if self.include_variance_forest: - variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp) + variance_pred_raw = ( + self.forest_container_variance.forest_container_cpp.Predict( + pred_dataset.dataset_cpp + ) + ) if self.sample_sigma_global: variance_pred = variance_pred_raw for i in range(self.num_samples): - variance_pred[:,i] = np.sqrt(variance_pred_raw[:,i]*self.global_var_samples[i]) + variance_pred[:, i] = np.sqrt( + variance_pred_raw[:, i] * self.global_var_samples[i] + ) else: - variance_pred = np.sqrt(variance_pred_raw*self.sigma2_init)*self.y_std + variance_pred = ( + np.sqrt(variance_pred_raw * self.sigma2_init) * self.y_std + ) if self.include_mean_forest and self.include_variance_forest: return (mean_pred, variance_pred) elif self.include_mean_forest and not self.include_variance_forest: - return (mean_pred) + return mean_pred elif not self.include_mean_forest and self.include_variance_forest: - return (variance_pred) + return variance_pred def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array: """Predict expected conditional outcome from a BART model. @@ -812,7 +1131,7 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array Test set covariates. basis : np.array, optional Optional test set basis vector, must be provided if the model was trained with a leaf regression basis. - + Returns ------- np.array @@ -831,16 +1150,20 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array "Call 'fit' with appropriate arguments before using this model." 
) raise NotSampledError(msg) - + # Data checks - if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + if not isinstance(covariates, pd.DataFrame) and not isinstance( + covariates, np.ndarray + ): raise ValueError("covariates must be a pandas dataframe or numpy array") if basis is not None: if not isinstance(basis, np.ndarray): raise ValueError("basis must be a numpy array") if basis.shape[0] != covariates.shape[0]: - raise ValueError("covariates and basis must have the same number of rows") - + raise ValueError( + "covariates and basis must have the same number of rows" + ) + # Convert everything to standard shape (2-dimensional) if isinstance(covariates, np.ndarray): if covariates.ndim == 1: @@ -848,15 +1171,24 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array if basis is not None: if basis.ndim == 1: basis = np.expand_dims(basis, 1) - + # Covariate preprocessing if not self._covariate_preprocessor._check_is_fitted(): if not isinstance(covariates, np.ndarray): - raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.") + raise ValueError( + "Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe." + ) else: - warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning) - if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer): - raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.") + warnings.warn( + "This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", + RuntimeWarning, + ) + if not np.issubdtype( + covariates.dtype, np.floating + ) and not np.issubdtype(covariates.dtype, np.integer): + raise ValueError( + "Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe." 
+ ) covariates_processed = covariates else: self._covariate_preprocessor = CovariatePreprocessor() @@ -868,10 +1200,12 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array pred_dataset.add_covariates(covariates_processed) if basis is not None: pred_dataset.add_basis(basis) - + # Mean forest predictions - mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp) - mean_pred = mean_pred_raw*self.y_std + self.y_bar + mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict( + pred_dataset.dataset_cpp + ) + mean_pred = mean_pred_raw * self.y_std + self.y_bar return mean_pred @@ -882,7 +1216,7 @@ def predict_variance(self, covariates: np.array) -> np.array: ---------- covariates : np.array Test set covariates. - + Returns ------- np.array @@ -901,16 +1235,20 @@ def predict_variance(self, covariates: np.array) -> np.array: "Call 'fit' with appropriate arguments before using this model." ) raise NotSampledError(msg) - + # Data checks - if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + if not isinstance(covariates, pd.DataFrame) and not isinstance( + covariates, np.ndarray + ): raise ValueError("covariates must be a pandas dataframe or numpy array") if basis is not None: if not isinstance(basis, np.ndarray): raise ValueError("basis must be a numpy array") if basis.shape[0] != covariates.shape[0]: - raise ValueError("covariates and basis must have the same number of rows") - + raise ValueError( + "covariates and basis must have the same number of rows" + ) + # Convert everything to standard shape (2-dimensional) if isinstance(covariates, np.ndarray): if covariates.ndim == 1: @@ -918,39 +1256,54 @@ def predict_variance(self, covariates: np.array) -> np.array: if basis is not None: if basis.ndim == 1: basis = np.expand_dims(basis, 1) - + # Covariate preprocessing if not self._covariate_preprocessor._check_is_fitted(): if not isinstance(covariates, np.ndarray): - raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.") + raise ValueError( + "Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe." + ) else: - warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning) - if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer): - raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.") + warnings.warn( + "This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. 
Please refit your model by passing non-numeric covariate data as a Pandas dataframe.",
+                    RuntimeWarning,
+                )
+                if not np.issubdtype(
+                    covariates.dtype, np.floating
+                ) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError(
+                        "Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe."
+                    )
             covariates_processed = covariates
         else:
             self._covariate_preprocessor = CovariatePreprocessor()
             self._covariate_preprocessor.fit(covariates)
             covariates_processed = self._covariate_preprocessor.transform(covariates)
-        
+
         # Dataset construction
         pred_dataset = Dataset()
         pred_dataset.add_covariates(covariates_processed)
-        
+
         # Variance forest predictions
-        variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
+        variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(
+            pred_dataset.dataset_cpp
+        )
         if self.sample_sigma_global:
             variance_pred = variance_pred_raw
             for i in range(self.num_samples):
-                variance_pred[:,i] = variance_pred_raw[:,i]*self.global_var_samples[i]
+                variance_pred[:, i] = (
+                    variance_pred_raw[:, i] * self.global_var_samples[i]
+                )
         else:
-            variance_pred = variance_pred_raw*self.sigma2_init*self.y_std*self.y_std
+            variance_pred = (
+                variance_pred_raw * self.sigma2_init * self.y_std * self.y_std
+            )

         return variance_pred
-    
+
     def to_json(self) -> str:
         """
-        Converts a sampled BART model to JSON string representation (which can then be saved to a file or 
+        Converts a sampled BART model to JSON string representation (which can then be saved to a file or
         processed using the `json` library)

         Returns
@@ -964,16 +1317,16 @@ def to_json(self) -> str:
                 "Call 'fit' with appropriate arguments before using this model."
) raise NotSampledError(msg) - + # Initialize JSONSerializer object bart_json = JSONSerializer() - + # Add the forests if self.include_mean_forest: bart_json.add_forest(self.forest_container_mean) if self.include_variance_forest: bart_json.add_forest(self.forest_container_variance) - + # Add global parameters bart_json.add_scalar("outcome_scale", self.y_std) bart_json.add_scalar("outcome_mean", self.y_bar) @@ -989,17 +1342,21 @@ def to_json(self) -> str: bart_json.add_scalar("num_samples", self.num_samples) bart_json.add_scalar("num_basis", self.num_basis) bart_json.add_boolean("requires_basis", self.has_basis) - + # Add parameter samples if self.sample_sigma_global: - bart_json.add_numeric_vector("sigma2_global_samples", self.global_var_samples, "parameters") + bart_json.add_numeric_vector( + "sigma2_global_samples", self.global_var_samples, "parameters" + ) if self.sample_sigma_leaf: - bart_json.add_numeric_vector("sigma2_leaf_samples", self.leaf_scale_samples, "parameters") - + bart_json.add_numeric_vector( + "sigma2_leaf_samples", self.leaf_scale_samples, "parameters" + ) + # Add covariate preprocessor covariate_preprocessor_string = self._covariate_preprocessor.to_json() bart_json.add_string("covariate_preprocessor", covariate_preprocessor_string) - + return bart_json.return_json_string() def from_json(self, json_string: str) -> None: @@ -1014,23 +1371,29 @@ def from_json(self, json_string: str) -> None: # Parse string to a JSON object in C++ bart_json = JSONSerializer() bart_json.load_from_json_string(json_string) - + # Unpack forests self.include_mean_forest = bart_json.get_boolean("include_mean_forest") self.include_variance_forest = bart_json.get_boolean("include_variance_forest") if self.include_mean_forest: # TODO: don't just make this a placeholder that we overwrite self.forest_container_mean = ForestContainer(0, 0, False, False) - self.forest_container_mean.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_0") + self.forest_container_mean.forest_container_cpp.LoadFromJson( + bart_json.json_cpp, "forest_0" + ) if self.include_variance_forest: # TODO: don't just make this a placeholder that we overwrite self.forest_container_variance = ForestContainer(0, 0, False, False) - self.forest_container_variance.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_1") + self.forest_container_variance.forest_container_cpp.LoadFromJson( + bart_json.json_cpp, "forest_1" + ) else: # TODO: don't just make this a placeholder that we overwrite self.forest_container_variance = ForestContainer(0, 0, False, False) - self.forest_container_variance.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_0") - + self.forest_container_variance.forest_container_cpp.LoadFromJson( + bart_json.json_cpp, "forest_0" + ) + # Unpack global parameters self.y_std = bart_json.get_scalar("outcome_scale") self.y_bar = bart_json.get_scalar("outcome_mean") @@ -1047,18 +1410,22 @@ def from_json(self, json_string: str) -> None: # Unpack parameter samples if self.sample_sigma_global: - self.global_var_samples = bart_json.get_numeric_vector("sigma2_global_samples", "parameters") + self.global_var_samples = bart_json.get_numeric_vector( + "sigma2_global_samples", "parameters" + ) if self.sample_sigma_leaf: - self.leaf_scale_samples = bart_json.get_numeric_vector("sigma2_leaf_samples", "parameters") - + self.leaf_scale_samples = bart_json.get_numeric_vector( + "sigma2_leaf_samples", "parameters" + ) + # Unpack covariate preprocessor covariate_preprocessor_string = 
bart_json.get_string("covariate_preprocessor")
         self._covariate_preprocessor = CovariatePreprocessor()
         self._covariate_preprocessor.from_json(covariate_preprocessor_string)
-        
+
         # Mark the deserialized model as "sampled"
         self.sampled = True
-    
+
     def is_sampled(self) -> bool:
         """Whether or not a BART model has been sampled.

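A usage note on the serialization methods above: the following is a minimal, illustrative sketch (not part of the patch) of the `to_json` / `from_json` round trip. The synthetic data, seed, and sample sizes are invented for the example.

    import numpy as np
    from stochtree import BARTModel

    rng = np.random.default_rng(1234)
    X = rng.uniform(size=(200, 5))
    y = np.sin(4.0 * X[:, 0]) + rng.normal(scale=0.5, size=200)

    # Fit a small model, serialize it to a JSON string, then rehydrate it
    model = BARTModel()
    model.sample(X_train=X, y_train=y, num_gfr=10, num_mcmc=50)
    json_string = model.to_json()

    model_restored = BARTModel()
    model_restored.from_json(json_string)
    assert model_restored.is_sampled()

    # The restored model predicts on the original scale of y
    mean_draws = model_restored.predict_mean(X)
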
diff --git a/stochtree/bcf.py b/stochtree/bcf.py
index 4f24234b..3697a6ce 100644
--- a/stochtree/bcf.py
+++ b/stochtree/bcf.py
@@ -1,6 +1,7 @@
 """
 Bayesian Causal Forests (BCF) module
 """
+
 import numpy as np
 import pandas as pd
 from sklearn.utils import check_scalar
@@ -13,6 +14,7 @@
 from .serialization import JSONSerializer
 from .utils import NotSampledError

+
 class BCFModel:
     r"""
     Class that handles sampling, storage, and serialization of stochastic forest models for causal effect estimation.
@@ -56,17 +58,30 @@ class BCFModel:
     - Continuous Treatment: If $Z$ is continuous rather than binary, we define $b_z(X) = \tau(X, Z) = Z \tau(X)$, where the "leaf model" for the $\tau$ forest is essentially a regression on continuous $Z$.
     - Heteroskedasticity: Rather than define $\epsilon$ parametrically, we can let a forest $\sigma^2(X)$ model a conditional error variance function. This can be done by setting `num_trees_variance > 0` in the `params` dictionary passed to the `sample` method.
     """
+
     def __init__(self) -> None:
         # Internal flag for whether the sample() method has been run
         self.sampled = False
         self.rng = np.random.default_rng()
-    
-    def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_train: np.array, pi_train: np.array = None,
-               X_test: Union[pd.DataFrame, np.array] = None, Z_test: np.array = None, pi_test: np.array = None,
-               num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, general_params: Optional[Dict[str, Any]] = None,
-               mu_forest_params: Optional[Dict[str, Any]] = None, tau_forest_params: Optional[Dict[str, Any]] = None,
-               variance_forest_params: Optional[Dict[str, Any]] = None) -> None:
-        """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions
+
+    def sample(
+        self,
+        X_train: Union[pd.DataFrame, np.array],
+        Z_train: np.array,
+        y_train: np.array,
+        pi_train: np.array = None,
+        X_test: Union[pd.DataFrame, np.array] = None,
+        Z_test: np.array = None,
+        pi_test: np.array = None,
+        num_gfr: int = 5,
+        num_burnin: int = 0,
+        num_mcmc: int = 100,
+        general_params: Optional[Dict[str, Any]] = None,
+        mu_forest_params: Optional[Dict[str, Any]] = None,
+        tau_forest_params: Optional[Dict[str, Any]] = None,
+        variance_forest_params: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions
         will be cached for the training set and (if provided) the test set.

         Parameters
@@ -102,9 +117,9 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             * `sigma2_global_shape` (`float`): Shape parameter in the `IG(sigma2_global_shape, sigma2_global_scale)` global error variance model. Defaults to `0`.
             * `sigma2_global_scale` (`float`): Scale parameter in the `IG(sigma2_global_shape, sigma2_global_scale)` global error variance model. Defaults to `0`.
             * `variable_weights` (`np.array`): Numeric weights reflecting the relative probability of splitting on each variable in each of the forests. Does not need to sum to 1 but cannot be negative. Defaults to `np.repeat(1/X_train.shape[1], X_train.shape[1])` if not set here.
Note that if the propensity score is included as a covariate in either forest, its weight will default to `1/X_train.shape[1]`. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in `X_train` and then set `propensity_covariate` to `'none'` and adjust `keep_vars` accordingly for the mu or tau forests.
-            * `propensity_covariate` (`str`): Whether to include the propensity score as a covariate in either or both of the forests. Enter `"none"` for neither, `"mu"` for the prognostic forest, `"tau"` for the treatment forest, and `"both"` for both forests. 
+            * `propensity_covariate` (`str`): Whether to include the propensity score as a covariate in either or both of the forests. Enter `"none"` for neither, `"mu"` for the prognostic forest, `"tau"` for the treatment forest, and `"both"` for both forests.
                 If this is not `"none"` and a propensity score is not provided, it will be estimated from (`X_train`, `Z_train`) using `BARTModel`. Defaults to `"mu"`.
-            * `adaptive_coding` (`bool`): Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via 
+            * `adaptive_coding` (`bool`): Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via
                 parameters `b_0` and `b_1` that attach to the outcome model `[b_0 (1-Z) + b_1 Z] tau(X)`. This is ignored when Z is not binary. Defaults to True.
             * `control_coding_init` (`float`): Initial value of the "control" group coding parameter. This is ignored when `Z` is not binary. Default: `-0.5`.
             * `treated_coding_init` (`float`): Initial value of the "treated" group coding parameter. This is ignored when `Z` is not binary. Default: `0.5`.
@@ -113,7 +128,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             * `keep_gfr` (`bool`): Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to `False`. Ignored if `num_mcmc == 0`.
             * `keep_every` (`int`): How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Defaults to `1`. Setting `keep_every = k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples.
             * `num_chains` (`int`): How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Defaults to `1`.
-        
+
         mu_forest_params : dict, optional
             Dictionary of prognostic forest model parameters, each of which has a default value processed internally, so this argument is optional.

@@ -159,7 +174,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             * `var_forest_prior_scale` (`float`): Scale parameter in the [optional] `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance forest (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / 1.5^2` if not set here.
             * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the variance forest. Defaults to `None`.
* `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the variance forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. - + Returns ------- self : BCFModel @@ -167,22 +182,22 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr """ # Update general BART parameters general_params_default = { - 'cutpoint_grid_size' : 100, - 'standardize' : True, - 'sample_sigma2_global' : True, - 'sigma2_global_init' : None, - 'sigma2_global_shape' : 0, - 'sigma2_global_scale' : 0, - 'variable_weights' : None, - 'propensity_covariate' : "mu", - 'adaptive_coding' : True, - 'control_coding_init' : -0.5, - 'treated_coding_init' : 0.5, - 'random_seed' : -1, - 'keep_burnin' : False, - 'keep_gfr' : False, - 'keep_every' : 1, - 'num_chains' : 1 + "cutpoint_grid_size": 100, + "standardize": True, + "sample_sigma2_global": True, + "sigma2_global_init": None, + "sigma2_global_shape": 0, + "sigma2_global_scale": 0, + "variable_weights": None, + "propensity_covariate": "mu", + "adaptive_coding": True, + "control_coding_init": -0.5, + "treated_coding_init": 0.5, + "random_seed": -1, + "keep_burnin": False, + "keep_gfr": False, + "keep_every": 1, + "num_chains": 1, } general_params_updated = _preprocess_params( general_params_default, general_params @@ -190,17 +205,17 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Update mu forest BART parameters mu_forest_params_default = { - 'num_trees' : 250, - 'alpha' : 0.95, - 'beta' : 2.0, - 'min_samples_leaf' : 5, - 'max_depth' : 10, - 'sample_sigma2_leaf' : True, - 'sigma2_leaf_init' : None, - 'sigma2_leaf_shape' : 3, - 'sigma2_leaf_scale' : None, - 'keep_vars' : None, - 'drop_vars' : None + "num_trees": 250, + "alpha": 0.95, + "beta": 2.0, + "min_samples_leaf": 5, + "max_depth": 10, + "sample_sigma2_leaf": True, + "sigma2_leaf_init": None, + "sigma2_leaf_shape": 3, + "sigma2_leaf_scale": None, + "keep_vars": None, + "drop_vars": None, } mu_forest_params_updated = _preprocess_params( mu_forest_params_default, mu_forest_params @@ -208,35 +223,35 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Update tau forest BART parameters tau_forest_params_default = { - 'num_trees' : 50, - 'alpha' : 0.25, - 'beta' : 3.0, - 'min_samples_leaf' : 5, - 'max_depth' : 5, - 'sample_sigma2_leaf' : False, - 'sigma2_leaf_init' : None, - 'sigma2_leaf_shape' : 3, - 'sigma2_leaf_scale' : None, - 'keep_vars' : None, - 'drop_vars' : None + "num_trees": 50, + "alpha": 0.25, + "beta": 3.0, + "min_samples_leaf": 5, + "max_depth": 5, + "sample_sigma2_leaf": False, + "sigma2_leaf_init": None, + "sigma2_leaf_shape": 3, + "sigma2_leaf_scale": None, + "keep_vars": None, + "drop_vars": None, } tau_forest_params_updated = _preprocess_params( tau_forest_params_default, tau_forest_params ) - + # Update variance forest BART parameters variance_forest_params_default = { - 'num_trees' : 0, - 'alpha' : 0.95, - 'beta' : 2.0, - 'min_samples_leaf' : 5, - 'max_depth' : 10, - 'leaf_prior_calibration_param': 1.5, - 'var_forest_leaf_init' : None, - 'var_forest_prior_shape' : None, - 'var_forest_prior_scale' : None, - 'keep_vars' : None, - 'drop_vars' : None + "num_trees": 0, + "alpha": 0.95, + "beta": 2.0, + "min_samples_leaf": 5, + "max_depth": 10, + "leaf_prior_calibration_param": 1.5, + "var_forest_leaf_init": None, + "var_forest_prior_shape": None, + "var_forest_prior_scale": None, + "keep_vars": 
None, + "drop_vars": None, } variance_forest_params_updated = _preprocess_params( variance_forest_params_default, variance_forest_params @@ -244,78 +259,82 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr ### Unpack all parameter values # 1. General parameters - cutpoint_grid_size = general_params_updated['cutpoint_grid_size'] - self.standardize = general_params_updated['standardize'] - sample_sigma_global = general_params_updated['sample_sigma2_global'] - sigma2_init = general_params_updated['sigma2_global_init'] - a_global = general_params_updated['sigma2_global_shape'] - b_global = general_params_updated['sigma2_global_scale'] - variable_weights = general_params_updated['variable_weights'] - propensity_covariate = general_params_updated['propensity_covariate'] - adaptive_coding = general_params_updated['adaptive_coding'] - b_0 = general_params_updated['control_coding_init'] - b_1 = general_params_updated['treated_coding_init'] - random_seed = general_params_updated['random_seed'] - keep_burnin = general_params_updated['keep_burnin'] - keep_gfr = general_params_updated['keep_gfr'] - keep_every = general_params_updated['keep_every'] + cutpoint_grid_size = general_params_updated["cutpoint_grid_size"] + self.standardize = general_params_updated["standardize"] + sample_sigma_global = general_params_updated["sample_sigma2_global"] + sigma2_init = general_params_updated["sigma2_global_init"] + a_global = general_params_updated["sigma2_global_shape"] + b_global = general_params_updated["sigma2_global_scale"] + variable_weights = general_params_updated["variable_weights"] + propensity_covariate = general_params_updated["propensity_covariate"] + adaptive_coding = general_params_updated["adaptive_coding"] + b_0 = general_params_updated["control_coding_init"] + b_1 = general_params_updated["treated_coding_init"] + random_seed = general_params_updated["random_seed"] + keep_burnin = general_params_updated["keep_burnin"] + keep_gfr = general_params_updated["keep_gfr"] + keep_every = general_params_updated["keep_every"] # 2. Mu forest parameters - num_trees_mu = mu_forest_params_updated['num_trees'] - alpha_mu = mu_forest_params_updated['alpha'] - beta_mu = mu_forest_params_updated['beta'] - min_samples_leaf_mu = mu_forest_params_updated['min_samples_leaf'] - max_depth_mu = mu_forest_params_updated['max_depth'] - sample_sigma_leaf_mu = mu_forest_params_updated['sample_sigma2_leaf'] - sigma_leaf_mu = mu_forest_params_updated['sigma2_leaf_init'] - a_leaf_mu = mu_forest_params_updated['sigma2_leaf_shape'] - b_leaf_mu = mu_forest_params_updated['sigma2_leaf_scale'] - keep_vars_mu = mu_forest_params_updated['keep_vars'] - drop_vars_mu = mu_forest_params_updated['drop_vars'] + num_trees_mu = mu_forest_params_updated["num_trees"] + alpha_mu = mu_forest_params_updated["alpha"] + beta_mu = mu_forest_params_updated["beta"] + min_samples_leaf_mu = mu_forest_params_updated["min_samples_leaf"] + max_depth_mu = mu_forest_params_updated["max_depth"] + sample_sigma_leaf_mu = mu_forest_params_updated["sample_sigma2_leaf"] + sigma_leaf_mu = mu_forest_params_updated["sigma2_leaf_init"] + a_leaf_mu = mu_forest_params_updated["sigma2_leaf_shape"] + b_leaf_mu = mu_forest_params_updated["sigma2_leaf_scale"] + keep_vars_mu = mu_forest_params_updated["keep_vars"] + drop_vars_mu = mu_forest_params_updated["drop_vars"] # 3. 
Tau forest parameters - num_trees_tau = tau_forest_params_updated['num_trees'] - alpha_tau = tau_forest_params_updated['alpha'] - beta_tau = tau_forest_params_updated['beta'] - min_samples_leaf_tau = tau_forest_params_updated['min_samples_leaf'] - max_depth_tau = tau_forest_params_updated['max_depth'] - sample_sigma_leaf_tau = tau_forest_params_updated['sample_sigma2_leaf'] - sigma_leaf_tau = tau_forest_params_updated['sigma2_leaf_init'] - a_leaf_tau = tau_forest_params_updated['sigma2_leaf_shape'] - b_leaf_tau = tau_forest_params_updated['sigma2_leaf_scale'] - keep_vars_tau = tau_forest_params_updated['keep_vars'] - drop_vars_tau = tau_forest_params_updated['drop_vars'] + num_trees_tau = tau_forest_params_updated["num_trees"] + alpha_tau = tau_forest_params_updated["alpha"] + beta_tau = tau_forest_params_updated["beta"] + min_samples_leaf_tau = tau_forest_params_updated["min_samples_leaf"] + max_depth_tau = tau_forest_params_updated["max_depth"] + sample_sigma_leaf_tau = tau_forest_params_updated["sample_sigma2_leaf"] + sigma_leaf_tau = tau_forest_params_updated["sigma2_leaf_init"] + a_leaf_tau = tau_forest_params_updated["sigma2_leaf_shape"] + b_leaf_tau = tau_forest_params_updated["sigma2_leaf_scale"] + keep_vars_tau = tau_forest_params_updated["keep_vars"] + drop_vars_tau = tau_forest_params_updated["drop_vars"] # 4. Variance forest parameters - num_trees_variance = variance_forest_params_updated['num_trees'] - alpha_variance = variance_forest_params_updated['alpha'] - beta_variance = variance_forest_params_updated['beta'] - min_samples_leaf_variance = variance_forest_params_updated['min_samples_leaf'] - max_depth_variance = variance_forest_params_updated['max_depth'] - a_0 = variance_forest_params_updated['leaf_prior_calibration_param'] - variance_forest_leaf_init = variance_forest_params_updated['var_forest_leaf_init'] - a_forest = variance_forest_params_updated['var_forest_prior_shape'] - b_forest = variance_forest_params_updated['var_forest_prior_scale'] - keep_vars_variance = variance_forest_params_updated['keep_vars'] - drop_vars_variance = variance_forest_params_updated['drop_vars'] - + num_trees_variance = variance_forest_params_updated["num_trees"] + alpha_variance = variance_forest_params_updated["alpha"] + beta_variance = variance_forest_params_updated["beta"] + min_samples_leaf_variance = variance_forest_params_updated["min_samples_leaf"] + max_depth_variance = variance_forest_params_updated["max_depth"] + a_0 = variance_forest_params_updated["leaf_prior_calibration_param"] + variance_forest_leaf_init = variance_forest_params_updated[ + "var_forest_leaf_init" + ] + a_forest = variance_forest_params_updated["var_forest_prior_shape"] + b_forest = variance_forest_params_updated["var_forest_prior_scale"] + keep_vars_variance = variance_forest_params_updated["keep_vars"] + drop_vars_variance = variance_forest_params_updated["drop_vars"] + # Variable weight preprocessing (and initialization if necessary) if variable_weights is None: if X_train.ndim > 1: - variable_weights = np.repeat(1/X_train.shape[1], X_train.shape[1]) + variable_weights = np.repeat(1 / X_train.shape[1], X_train.shape[1]) else: - variable_weights = np.repeat(1., 1) + variable_weights = np.repeat(1.0, 1) if np.any(variable_weights < 0): raise ValueError("variable_weights cannot have any negative weights") variable_weights_mu = variable_weights variable_weights_tau = variable_weights variable_weights_variance = variable_weights - + # Determine whether conditional variance model will be fit 
self.include_variance_forest = True if num_trees_variance > 0 else False - + # Check data inputs - if not isinstance(X_train, pd.DataFrame) and not isinstance(X_train, np.ndarray): + if not isinstance(X_train, pd.DataFrame) and not isinstance( + X_train, np.ndarray + ): raise ValueError("X_train must be a pandas dataframe or numpy array") if not isinstance(Z_train, np.ndarray): raise ValueError("Z_train must be a numpy array") @@ -325,7 +344,9 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if not isinstance(y_train, np.ndarray): raise ValueError("y_train must be a numpy array") if X_test is not None: - if not isinstance(X_test, pd.DataFrame) and not isinstance(X_test, np.ndarray): + if not isinstance(X_test, pd.DataFrame) and not isinstance( + X_test, np.ndarray + ): raise ValueError("X_test must be a pandas dataframe or numpy array") if Z_test is not None: if not isinstance(Z_test, np.ndarray): @@ -333,7 +354,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if pi_test is not None: if not isinstance(pi_test, np.ndarray): raise ValueError("pi_test must be a numpy array") - + # Convert everything to standard shape (2-dimensional) if isinstance(X_train, np.ndarray): if X_train.ndim == 1: @@ -359,111 +380,258 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Original number of covariates num_cov_orig = X_train.shape[1] - + # Data checks if X_test is not None: if X_test.shape[1] != X_train.shape[1]: - raise ValueError("X_train and X_test must have the same number of columns") + raise ValueError( + "X_train and X_test must have the same number of columns" + ) if Z_test is not None: if Z_test.shape[1] != Z_train.shape[1]: - raise ValueError("Z_train and Z_test must have the same number of columns") + raise ValueError( + "Z_train and Z_test must have the same number of columns" + ) if Z_train.shape[0] != X_train.shape[0]: raise ValueError("X_train and Z_train must have the same number of rows") if y_train.shape[0] != X_train.shape[0]: raise ValueError("X_train and y_train must have the same number of rows") if pi_train is not None: if pi_train.shape[0] != X_train.shape[0]: - raise ValueError("X_train and pi_train must have the same number of rows") + raise ValueError( + "X_train and pi_train must have the same number of rows" + ) if X_test is not None and Z_test is not None: if X_test.shape[0] != Z_test.shape[0]: raise ValueError("X_test and Z_test must have the same number of rows") if X_test is not None and pi_test is not None: if X_test.shape[0] != pi_test.shape[0]: raise ValueError("X_test and pi_test must have the same number of rows") - + # Treatment details self.treatment_dim = Z_train.shape[1] self.multivariate_treatment = True if self.treatment_dim > 1 else False treatment_leaf_model = 2 if self.multivariate_treatment else 1 - + # Set variance leaf model type (currently only one option) leaf_model_variance_forest = 3 self.variance_scale = 1 - + # Check parameters if sigma_leaf_tau is not None: - if not isinstance(sigma_leaf_tau, float) and not isinstance(sigma_leaf_tau, np.ndarray): + if not isinstance(sigma_leaf_tau, float) and not isinstance( + sigma_leaf_tau, np.ndarray + ): raise ValueError("sigma_leaf_tau must be a float or numpy array") if self.multivariate_treatment: if sigma_leaf_tau is not None: if isinstance(sigma_leaf_tau, np.ndarray): if sigma_leaf_tau.ndim != 2: - raise ValueError("sigma_leaf_tau must be 2-dimensional if passed as a np.array") - if self.treatment_dim 
!= sigma_leaf_tau.shape[0] or self.treatment_dim != sigma_leaf_tau.shape[1]: - raise ValueError("sigma_leaf_tau must have the same number of rows and columns, which must match Z_train.shape[1]") + raise ValueError( + "sigma_leaf_tau must be 2-dimensional if passed as a np.array" + ) + if ( + self.treatment_dim != sigma_leaf_tau.shape[0] + or self.treatment_dim != sigma_leaf_tau.shape[1] + ): + raise ValueError( + "sigma_leaf_tau must have the same number of rows and columns, which must match Z_train.shape[1]" + ) if sigma_leaf_mu is not None: - sigma_leaf_mu = check_scalar(x=sigma_leaf_mu, name="sigma_leaf_mu", target_type=float, - min_val=0., max_val=None, include_boundaries="neither") + sigma_leaf_mu = check_scalar( + x=sigma_leaf_mu, + name="sigma_leaf_mu", + target_type=float, + min_val=0.0, + max_val=None, + include_boundaries="neither", + ) if cutpoint_grid_size is not None: - cutpoint_grid_size = check_scalar(x=cutpoint_grid_size, name="cutpoint_grid_size", target_type=int, - min_val=1, max_val=None, include_boundaries="left") + cutpoint_grid_size = check_scalar( + x=cutpoint_grid_size, + name="cutpoint_grid_size", + target_type=int, + min_val=1, + max_val=None, + include_boundaries="left", + ) if min_samples_leaf_mu is not None: - min_samples_leaf_mu = check_scalar(x=min_samples_leaf_mu, name="min_samples_leaf_mu", target_type=int, - min_val=1, max_val=None, include_boundaries="left") + min_samples_leaf_mu = check_scalar( + x=min_samples_leaf_mu, + name="min_samples_leaf_mu", + target_type=int, + min_val=1, + max_val=None, + include_boundaries="left", + ) if min_samples_leaf_tau is not None: - min_samples_leaf_tau = check_scalar(x=min_samples_leaf_tau, name="min_samples_leaf_tau", target_type=int, - min_val=1, max_val=None, include_boundaries="left") + min_samples_leaf_tau = check_scalar( + x=min_samples_leaf_tau, + name="min_samples_leaf_tau", + target_type=int, + min_val=1, + max_val=None, + include_boundaries="left", + ) if num_trees_mu is not None: - num_trees_mu = check_scalar(x=num_trees_mu, name="num_trees_mu", target_type=int, - min_val=1, max_val=None, include_boundaries="left") + num_trees_mu = check_scalar( + x=num_trees_mu, + name="num_trees_mu", + target_type=int, + min_val=1, + max_val=None, + include_boundaries="left", + ) if num_trees_tau is not None: - num_trees_tau = check_scalar(x=num_trees_tau, name="num_trees_tau", target_type=int, - min_val=1, max_val=None, include_boundaries="left") - num_gfr = check_scalar(x=num_gfr, name="num_gfr", target_type=int, - min_val=0, max_val=None, include_boundaries="left") - num_burnin = check_scalar(x=num_burnin, name="num_burnin", target_type=int, - min_val=0, max_val=None, include_boundaries="left") - num_mcmc = check_scalar(x=num_mcmc, name="num_mcmc", target_type=int, - min_val=0, max_val=None, include_boundaries="left") + num_trees_tau = check_scalar( + x=num_trees_tau, + name="num_trees_tau", + target_type=int, + min_val=1, + max_val=None, + include_boundaries="left", + ) + num_gfr = check_scalar( + x=num_gfr, + name="num_gfr", + target_type=int, + min_val=0, + max_val=None, + include_boundaries="left", + ) + num_burnin = check_scalar( + x=num_burnin, + name="num_burnin", + target_type=int, + min_val=0, + max_val=None, + include_boundaries="left", + ) + num_mcmc = check_scalar( + x=num_mcmc, + name="num_mcmc", + target_type=int, + min_val=0, + max_val=None, + include_boundaries="left", + ) num_samples = num_gfr + num_burnin + num_mcmc - num_samples = check_scalar(x=num_samples, name="num_samples", target_type=int, - 
min_val=1, max_val=None, include_boundaries="left") + num_samples = check_scalar( + x=num_samples, + name="num_samples", + target_type=int, + min_val=1, + max_val=None, + include_boundaries="left", + ) if random_seed is not None: - random_seed = check_scalar(x=random_seed, name="random_seed", target_type=int, - min_val=-1, max_val=None, include_boundaries="left") + random_seed = check_scalar( + x=random_seed, + name="random_seed", + target_type=int, + min_val=-1, + max_val=None, + include_boundaries="left", + ) if alpha_mu is not None: - alpha_mu = check_scalar(x=alpha_mu, name="alpha_mu", target_type=(float,int), - min_val=0, max_val=1, include_boundaries="neither") + alpha_mu = check_scalar( + x=alpha_mu, + name="alpha_mu", + target_type=(float, int), + min_val=0, + max_val=1, + include_boundaries="neither", + ) if alpha_tau is not None: - alpha_tau = check_scalar(x=alpha_tau, name="alpha_tau", target_type=(float,int), - min_val=0, max_val=1, include_boundaries="neither") + alpha_tau = check_scalar( + x=alpha_tau, + name="alpha_tau", + target_type=(float, int), + min_val=0, + max_val=1, + include_boundaries="neither", + ) if beta_mu is not None: - beta_mu = check_scalar(x=beta_mu, name="beta_mu", target_type=(float,int), - min_val=1, max_val=None, include_boundaries="left") + beta_mu = check_scalar( + x=beta_mu, + name="beta_mu", + target_type=(float, int), + min_val=1, + max_val=None, + include_boundaries="left", + ) if beta_tau is not None: - beta_tau = check_scalar(x=beta_tau, name="beta_tau", target_type=(float,int), - min_val=1, max_val=None, include_boundaries="left") + beta_tau = check_scalar( + x=beta_tau, + name="beta_tau", + target_type=(float, int), + min_val=1, + max_val=None, + include_boundaries="left", + ) if a_global is not None: - a_global = check_scalar(x=a_global, name="a_global", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + a_global = check_scalar( + x=a_global, + name="a_global", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="left", + ) if b_global is not None: - b_global = check_scalar(x=b_global, name="b_global", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + b_global = check_scalar( + x=b_global, + name="b_global", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="left", + ) if a_leaf_mu is not None: - a_leaf_mu = check_scalar(x=a_leaf_mu, name="a_leaf_mu", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + a_leaf_mu = check_scalar( + x=a_leaf_mu, + name="a_leaf_mu", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="left", + ) if a_leaf_tau is not None: - a_leaf_tau = check_scalar(x=a_leaf_tau, name="a_leaf_tau", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + a_leaf_tau = check_scalar( + x=a_leaf_tau, + name="a_leaf_tau", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="left", + ) if b_leaf_mu is not None: - b_leaf_mu = check_scalar(x=b_leaf_mu, name="b_leaf_mu", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + b_leaf_mu = check_scalar( + x=b_leaf_mu, + name="b_leaf_mu", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="left", + ) if b_leaf_tau is not None: - b_leaf_tau = check_scalar(x=b_leaf_tau, name="b_leaf_tau", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + b_leaf_tau = check_scalar( + x=b_leaf_tau, + 
name="b_leaf_tau", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="left", + ) if sigma2_init is not None: - sigma2_init = check_scalar(x=sigma2_init, name="sigma2_init", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="neither") + sigma2_init = check_scalar( + x=sigma2_init, + name="sigma2_init", + target_type=(float, int), + min_val=0, + max_val=None, + include_boundaries="neither", + ) if sample_sigma_leaf_mu is not None: if not isinstance(sample_sigma_leaf_mu, bool): raise ValueError("sample_sigma_leaf_mu must be a bool") @@ -472,45 +640,81 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr raise ValueError("sample_sigma_leaf_tau must be a bool") if propensity_covariate is not None: if propensity_covariate not in ["mu", "tau", "both", "none"]: - raise ValueError("propensity_covariate must be one of 'mu', 'tau', 'both', or 'none'") + raise ValueError( + "propensity_covariate must be one of 'mu', 'tau', 'both', or 'none'" + ) if b_0 is not None: - b_0 = check_scalar(x=b_0, name="b_0", target_type=(float,int), - min_val=None, max_val=None, include_boundaries="neither") + b_0 = check_scalar( + x=b_0, + name="b_0", + target_type=(float, int), + min_val=None, + max_val=None, + include_boundaries="neither", + ) if b_1 is not None: - b_1 = check_scalar(x=b_1, name="b_1", target_type=(float,int), - min_val=None, max_val=None, include_boundaries="neither") + b_1 = check_scalar( + x=b_1, + name="b_1", + target_type=(float, int), + min_val=None, + max_val=None, + include_boundaries="neither", + ) if keep_burnin is not None: if not isinstance(keep_burnin, bool): raise ValueError("keep_burnin must be a bool") if keep_gfr is not None: if not isinstance(keep_gfr, bool): raise ValueError("keep_gfr must be a bool") - + # Standardize the keep variable lists to numeric indices if keep_vars_mu is not None: if isinstance(keep_vars_mu, list): if all(isinstance(i, str) for i in keep_vars_mu): if not np.all(np.isin(keep_vars_mu, X_train.columns)): - raise ValueError("keep_vars_mu includes some variable names that are not in X_train") - variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_mu includes some variable names that are not in X_train" + ) + variable_subset_mu = [ + i + for i in X_train.shape[1] + if keep_vars_mu.count(X_train.columns.array[i]) > 0 + ] elif all(isinstance(i, int) for i in keep_vars_mu): if any(i >= X_train.shape[1] for i in keep_vars_mu): - raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "keep_vars_mu includes some variable indices that exceed the number of columns in X_train" + ) if any(i < 0 for i in keep_vars_mu): - raise ValueError("keep_vars_mu includes some negative variable indices") + raise ValueError( + "keep_vars_mu includes some negative variable indices" + ) variable_subset_mu = keep_vars_mu else: - raise ValueError("keep_vars_mu must be a list of variable names (str) or column indices (int)") + raise ValueError( + "keep_vars_mu must be a list of variable names (str) or column indices (int)" + ) elif isinstance(keep_vars_mu, np.ndarray): if keep_vars_mu.dtype == np.str_: if not np.all(np.isin(keep_vars_mu, X_train.columns)): - raise ValueError("keep_vars_mu includes some variable names that are not in X_train") - variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0] + raise 
         # Standardize the keep variable lists to numeric indices
         if keep_vars_mu is not None:
             if isinstance(keep_vars_mu, list):
                 if all(isinstance(i, str) for i in keep_vars_mu):
                     if not np.all(np.isin(keep_vars_mu, X_train.columns)):
-                        raise ValueError("keep_vars_mu includes some variable names that are not in X_train")
-                    variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0]
+                        raise ValueError(
+                            "keep_vars_mu includes some variable names that are not in X_train"
+                        )
+                    variable_subset_mu = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if keep_vars_mu.count(X_train.columns.array[i]) > 0
+                    ]
                 elif all(isinstance(i, int) for i in keep_vars_mu):
                     if any(i >= X_train.shape[1] for i in keep_vars_mu):
-                        raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "keep_vars_mu includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if any(i < 0 for i in keep_vars_mu):
-                        raise ValueError("keep_vars_mu includes some negative variable indices")
+                        raise ValueError(
+                            "keep_vars_mu includes some negative variable indices"
+                        )
                     variable_subset_mu = keep_vars_mu
                 else:
-                    raise ValueError("keep_vars_mu must be a list of variable names (str) or column indices (int)")
+                    raise ValueError(
+                        "keep_vars_mu must be a list of variable names (str) or column indices (int)"
+                    )
             elif isinstance(keep_vars_mu, np.ndarray):
                 if keep_vars_mu.dtype == np.str_:
                     if not np.all(np.isin(keep_vars_mu, X_train.columns)):
-                        raise ValueError("keep_vars_mu includes some variable names that are not in X_train")
-                    variable_subset_mu = [i for i in X_train.shape[1] if keep_vars_mu.count(X_train.columns.array[i]) > 0]
+                        raise ValueError(
+                            "keep_vars_mu includes some variable names that are not in X_train"
+                        )
+                    variable_subset_mu = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if X_train.columns.array[i] in keep_vars_mu
+                    ]
                 else:
                     if np.any(keep_vars_mu >= X_train.shape[1]):
-                        raise ValueError("keep_vars_mu includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "keep_vars_mu includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if np.any(keep_vars_mu < 0):
-                        raise ValueError("keep_vars_mu includes some negative variable indices")
+                        raise ValueError(
+                            "keep_vars_mu includes some negative variable indices"
+                        )
                     variable_subset_mu = [i for i in keep_vars_mu]
             else:
                 raise ValueError("keep_vars_mu must be a list or np.array")
@@ -518,27 +722,47 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             if isinstance(drop_vars_mu, list):
                 if all(isinstance(i, str) for i in drop_vars_mu):
                     if not np.all(np.isin(drop_vars_mu, X_train.columns)):
-                        raise ValueError("drop_vars_mu includes some variable names that are not in X_train")
-                    variable_subset_mu = [i for i in range(X_train.shape[1]) if drop_vars_mu.count(X_train.columns.array[i]) == 0]
+                        raise ValueError(
+                            "drop_vars_mu includes some variable names that are not in X_train"
+                        )
+                    variable_subset_mu = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if drop_vars_mu.count(X_train.columns.array[i]) == 0
+                    ]
                 elif all(isinstance(i, int) for i in drop_vars_mu):
                     if any(i >= X_train.shape[1] for i in drop_vars_mu):
-                        raise ValueError("drop_vars_mu includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "drop_vars_mu includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if any(i < 0 for i in drop_vars_mu):
-                        raise ValueError("drop_vars_mu includes some negative variable indices")
-                    variable_subset_mu = [i for i in range(X_train.shape[1]) if drop_vars_mu.count(i) == 0]
+                        raise ValueError(
+                            "drop_vars_mu includes some negative variable indices"
+                        )
+                    variable_subset_mu = [
+                        i for i in range(X_train.shape[1]) if drop_vars_mu.count(i) == 0
+                    ]
                 else:
-                    raise ValueError("drop_vars_mu must be a list of variable names (str) or column indices (int)")
+                    raise ValueError(
+                        "drop_vars_mu must be a list of variable names (str) or column indices (int)"
+                    )
             elif isinstance(drop_vars_mu, np.ndarray):
                 if drop_vars_mu.dtype == np.str_:
                     if not np.all(np.isin(drop_vars_mu, X_train.columns)):
-                        raise ValueError("drop_vars_mu includes some variable names that are not in X_train")
+                        raise ValueError(
+                            "drop_vars_mu includes some variable names that are not in X_train"
+                        )
                     keep_inds = ~np.isin(X_train.columns.array, drop_vars_mu)
                     variable_subset_mu = [i for i in keep_inds]
                 else:
                     if np.any(drop_vars_mu >= X_train.shape[1]):
-                        raise ValueError("drop_vars_mu includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "drop_vars_mu includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if np.any(drop_vars_mu < 0):
-                        raise ValueError("drop_vars_mu includes some negative variable indices")
+                        raise ValueError(
+                            "drop_vars_mu includes some negative variable indices"
+                        )
                     keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_mu)
                     variable_subset_mu = [i for i in keep_inds]
             else:
                 raise ValueError("drop_vars_mu must be a list or np.array")
@@ -549,26 +773,48 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             if isinstance(keep_vars_tau, list):
                 if all(isinstance(i, str) for i in keep_vars_tau):
                     if not 
np.all(np.isin(keep_vars_tau, X_train.columns)):
-                        raise ValueError("keep_vars_tau includes some variable names that are not in X_train")
-                    variable_subset_tau = [i for i in range(X_train.shape[1]) if keep_vars_tau.count(X_train.columns.array[i]) > 0]
+                        raise ValueError(
+                            "keep_vars_tau includes some variable names that are not in X_train"
+                        )
+                    variable_subset_tau = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if keep_vars_tau.count(X_train.columns.array[i]) > 0
+                    ]
                 elif all(isinstance(i, int) for i in keep_vars_tau):
                     if any(i >= X_train.shape[1] for i in keep_vars_tau):
-                        raise ValueError("keep_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "keep_vars_tau includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if any(i < 0 for i in keep_vars_tau):
-                        raise ValueError("keep_vars_tau includes some negative variable indices")
+                        raise ValueError(
+                            "keep_vars_tau includes some negative variable indices"
+                        )
                     variable_subset_tau = keep_vars_tau
                 else:
-                    raise ValueError("keep_vars_tau must be a list of variable names (str) or column indices (int)")
+                    raise ValueError(
+                        "keep_vars_tau must be a list of variable names (str) or column indices (int)"
+                    )
             elif isinstance(keep_vars_tau, np.ndarray):
                 if keep_vars_tau.dtype == np.str_:
                     if not np.all(np.isin(keep_vars_tau, X_train.columns)):
-                        raise ValueError("keep_vars_tau includes some variable names that are not in X_train")
-                    variable_subset_tau = [i for i in range(X_train.shape[1]) if keep_vars_tau.count(X_train.columns.array[i]) > 0]
+                        raise ValueError(
+                            "keep_vars_tau includes some variable names that are not in X_train"
+                        )
+                    variable_subset_tau = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if X_train.columns.array[i] in keep_vars_tau
+                    ]
                 else:
                     if np.any(keep_vars_tau >= X_train.shape[1]):
-                        raise ValueError("keep_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "keep_vars_tau includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if np.any(keep_vars_tau < 0):
-                        raise ValueError("keep_vars_tau includes some negative variable indices")
+                        raise ValueError(
+                            "keep_vars_tau includes some negative variable indices"
+                        )
                     variable_subset_tau = [i for i in keep_vars_tau]
             else:
                 raise ValueError("keep_vars_tau must be a list or np.array")
@@ -576,27 +822,49 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             if isinstance(drop_vars_tau, list):
                 if all(isinstance(i, str) for i in drop_vars_tau):
                     if not np.all(np.isin(drop_vars_tau, X_train.columns)):
-                        raise ValueError("drop_vars_tau includes some variable names that are not in X_train")
-                    variable_subset_tau = [i for i in range(X_train.shape[1]) if drop_vars_tau.count(X_train.columns.array[i]) == 0]
+                        raise ValueError(
+                            "drop_vars_tau includes some variable names that are not in X_train"
+                        )
+                    variable_subset_tau = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if drop_vars_tau.count(X_train.columns.array[i]) == 0
+                    ]
                 elif all(isinstance(i, int) for i in drop_vars_tau):
                     if any(i >= X_train.shape[1] for i in drop_vars_tau):
-                        raise ValueError("drop_vars_tau includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "drop_vars_tau includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if any(i < 0 for i in drop_vars_tau):
-                        raise ValueError("drop_vars_tau includes some negative variable indices")
-                    variable_subset_tau = [i for i in 
range(X_train.shape[1]) if drop_vars_tau.count(i) == 0] + raise ValueError( + "drop_vars_tau includes some negative variable indices" + ) + variable_subset_tau = [ + i + for i in range(X_train.shape[1]) + if drop_vars_tau.count(i) == 0 + ] else: - raise ValueError("drop_vars_tau must be a list of variable names (str) or column indices (int)") + raise ValueError( + "drop_vars_tau must be a list of variable names (str) or column indices (int)" + ) elif isinstance(drop_vars_tau, np.ndarray): if drop_vars_tau.dtype == np.str_: if not np.all(np.isin(drop_vars_tau, X_train.columns)): - raise ValueError("drop_vars_tau includes some variable names that are not in X_train") + raise ValueError( + "drop_vars_tau includes some variable names that are not in X_train" + ) keep_inds = ~np.isin(X_train.columns.array, drop_vars_tau) variable_subset_tau = [i for i in keep_inds] else: if np.any(drop_vars_tau >= X_train.shape[1]): - raise ValueError("drop_vars_tau includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "drop_vars_tau includes some variable indices that exceed the number of columns in X_train" + ) if np.any(drop_vars_tau < 0): - raise ValueError("drop_vars_tau includes some negative variable indices") + raise ValueError( + "drop_vars_tau includes some negative variable indices" + ) keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_tau) variable_subset_tau = [i for i in keep_inds] else: @@ -607,26 +875,48 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if isinstance(keep_vars_variance, list): if all(isinstance(i, str) for i in keep_vars_variance): if not np.all(np.isin(keep_vars_variance, X_train.columns)): - raise ValueError("keep_vars_variance includes some variable names that are not in X_train") - variable_subset_variance = [i for i in range(X_train.shape[1]) if keep_vars_variance.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_variance includes some variable names that are not in X_train" + ) + variable_subset_variance = [ + i + for i in range(X_train.shape[1]) + if keep_vars_variance.count(X_train.columns.array[i]) > 0 + ] elif all(isinstance(i, int) for i in keep_vars_variance): if any(i >= X_train.shape[1] for i in keep_vars_variance): - raise ValueError("keep_vars_variance includes some variable indices that exceed the number of columns in X_train") + raise ValueError( + "keep_vars_variance includes some variable indices that exceed the number of columns in X_train" + ) if any(i < 0 for i in keep_vars_variance): - raise ValueError("keep_vars_variance includes some negative variable indices") + raise ValueError( + "keep_vars_variance includes some negative variable indices" + ) variable_subset_variance = keep_vars_variance else: - raise ValueError("keep_vars_variance must be a list of variable names (str) or column indices (int)") + raise ValueError( + "keep_vars_variance must be a list of variable names (str) or column indices (int)" + ) elif isinstance(keep_vars_variance, np.ndarray): if keep_vars_variance.dtype == np.str_: if not np.all(np.isin(keep_vars_variance, X_train.columns)): - raise ValueError("keep_vars_variance includes some variable names that are not in X_train") - variable_subset_variance = [i for i in range(X_train.shape[1]) if keep_vars_variance.count(X_train.columns.array[i]) > 0] + raise ValueError( + "keep_vars_variance includes some variable names that are not in X_train" + ) + variable_subset_variance = [ + i + for i in range(X_train.shape[1]) + if 
X_train.columns.array[i] in keep_vars_variance
+                    ]
                 else:
                     if np.any(keep_vars_variance >= X_train.shape[1]):
-                        raise ValueError("keep_vars_variance includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "keep_vars_variance includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if np.any(keep_vars_variance < 0):
-                        raise ValueError("keep_vars_variance includes some negative variable indices")
+                        raise ValueError(
+                            "keep_vars_variance includes some negative variable indices"
+                        )
                     variable_subset_variance = [i for i in keep_vars_variance]
             else:
                 raise ValueError("keep_vars_variance must be a list or np.array")
@@ -634,46 +924,74 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             if isinstance(drop_vars_variance, list):
                 if all(isinstance(i, str) for i in drop_vars_variance):
                     if not np.all(np.isin(drop_vars_variance, X_train.columns)):
-                        raise ValueError("drop_vars_variance includes some variable names that are not in X_train")
-                    variable_subset_variance = [i for i in range(X_train.shape[1]) if drop_vars_variance.count(X_train.columns.array[i]) == 0]
+                        raise ValueError(
+                            "drop_vars_variance includes some variable names that are not in X_train"
+                        )
+                    variable_subset_variance = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if drop_vars_variance.count(X_train.columns.array[i]) == 0
+                    ]
                 elif all(isinstance(i, int) for i in drop_vars_variance):
                     if any(i >= X_train.shape[1] for i in drop_vars_variance):
-                        raise ValueError("drop_vars_variance includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "drop_vars_variance includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if any(i < 0 for i in drop_vars_variance):
-                        raise ValueError("drop_vars_variance includes some negative variable indices")
-                    variable_subset_variance = [i for i in range(X_train.shape[1]) if drop_vars_variance.count(i) == 0]
+                        raise ValueError(
+                            "drop_vars_variance includes some negative variable indices"
+                        )
+                    variable_subset_variance = [
+                        i
+                        for i in range(X_train.shape[1])
+                        if drop_vars_variance.count(i) == 0
+                    ]
                 else:
-                    raise ValueError("drop_vars_variance must be a list of variable names (str) or column indices (int)")
+                    raise ValueError(
+                        "drop_vars_variance must be a list of variable names (str) or column indices (int)"
+                    )
             elif isinstance(drop_vars_variance, np.ndarray):
                 if drop_vars_variance.dtype == np.str_:
                     if not np.all(np.isin(drop_vars_variance, X_train.columns)):
-                        raise ValueError("drop_vars_variance includes some variable names that are not in X_train")
+                        raise ValueError(
+                            "drop_vars_variance includes some variable names that are not in X_train"
+                        )
                     keep_inds = ~np.isin(X_train.columns.array, drop_vars_variance)
                     variable_subset_variance = [i for i in keep_inds]
                 else:
                     if np.any(drop_vars_variance >= X_train.shape[1]):
-                        raise ValueError("drop_vars_variance includes some variable indices that exceed the number of columns in X_train")
+                        raise ValueError(
+                            "drop_vars_variance includes some variable indices that exceed the number of columns in X_train"
+                        )
                     if np.any(drop_vars_variance < 0):
-                        raise ValueError("drop_vars_variance includes some negative variable indices")
-                    keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_variance)
+                        raise ValueError(
+                            "drop_vars_variance includes some negative variable indices"
+                        )
+                    keep_inds = ~np.isin(
+                        np.arange(X_train.shape[1]), drop_vars_variance
+                    )
                     variable_subset_variance = [i for i in 
keep_inds] else: raise ValueError("drop_vars_variance must be a list or np.array") else: variable_subset_variance = [i for i in range(X_train.shape[1])] - + # Covariate preprocessing self._covariate_preprocessor = CovariatePreprocessor() self._covariate_preprocessor.fit(X_train) X_train_processed = self._covariate_preprocessor.transform(X_train) if X_test is not None: X_test_processed = self._covariate_preprocessor.transform(X_test) - feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types) - original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices() + feature_types = np.asarray( + self._covariate_preprocessor._processed_feature_types + ) + original_var_indices = ( + self._covariate_preprocessor.fetch_original_feature_indices() + ) # Determine whether a test set is provided self.has_test = X_test is not None - + # Unpack data dimensions self.n_train = y_train.shape[0] self.n_test = X_test.shape[0] if self.has_test else 0 @@ -696,16 +1014,35 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Check if user has provided propensities that are needed in the model if pi_train is None and propensity_covariate != "none": if self.multivariate_treatment: - raise ValueError("Propensities must be provided (via pi_train and / or pi_test parameters) or omitted by setting propensity_covariate = 'none' for multivariate treatments") + raise ValueError( + "Propensities must be provided (via pi_train and / or pi_test parameters) or omitted by setting propensity_covariate = 'none' for multivariate treatments" + ) else: self.bart_propensity_model = BARTModel() if self.has_test: - self.bart_propensity_model.sample(X_train=X_train_processed, y_train=Z_train, X_test=X_test_processed, num_gfr=10, num_mcmc=10) - pi_train = np.mean(self.bart_propensity_model.y_hat_train, axis = 1, keepdims = True) - pi_test = np.mean(self.bart_propensity_model.y_hat_test, axis = 1, keepdims = True) + self.bart_propensity_model.sample( + X_train=X_train_processed, + y_train=Z_train, + X_test=X_test_processed, + num_gfr=10, + num_mcmc=10, + ) + pi_train = np.mean( + self.bart_propensity_model.y_hat_train, axis=1, keepdims=True + ) + pi_test = np.mean( + self.bart_propensity_model.y_hat_test, axis=1, keepdims=True + ) else: - self.bart_propensity_model.sample(X_train=X_train_processed, y_train=Z_train, num_gfr=10, num_mcmc=10) - pi_train = np.mean(self.bart_propensity_model.y_hat_train, axis = 1, keepdims = True) + self.bart_propensity_model.sample( + X_train=X_train_processed, + y_train=Z_train, + num_gfr=10, + num_mcmc=10, + ) + pi_train = np.mean( + self.bart_propensity_model.y_hat_train, axis=1, keepdims=True + ) self.internal_propensity_model = True else: self.internal_propensity_model = False @@ -717,20 +1054,38 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr else: self.y_bar = 0 self.y_std = 1 - resid_train = (y_train-self.y_bar)/self.y_std + resid_train = (y_train - self.y_bar) / self.y_std # Calibrate priors for global sigma^2 and sigma_leaf_mu / sigma_leaf_tau (don't use regression initializer for warm-start or XBART) if not sigma2_init: - sigma2_init = 1.0*np.var(resid_train) + sigma2_init = 1.0 * np.var(resid_train) if not variance_forest_leaf_init: - variance_forest_leaf_init = 0.6*np.var(resid_train) - b_leaf_mu = np.squeeze(np.var(resid_train)) / num_trees_mu if b_leaf_mu is None else b_leaf_mu - b_leaf_tau = np.squeeze(np.var(resid_train)) / (2*num_trees_tau) if b_leaf_tau is None else b_leaf_tau - 
sigma_leaf_mu = np.squeeze(np.var(resid_train)) / num_trees_mu if sigma_leaf_mu is None else sigma_leaf_mu - sigma_leaf_tau = np.squeeze(np.var(resid_train)) / (2*num_trees_tau) if sigma_leaf_tau is None else sigma_leaf_tau + variance_forest_leaf_init = 0.6 * np.var(resid_train) + b_leaf_mu = ( + np.squeeze(np.var(resid_train)) / num_trees_mu + if b_leaf_mu is None + else b_leaf_mu + ) + b_leaf_tau = ( + np.squeeze(np.var(resid_train)) / (2 * num_trees_tau) + if b_leaf_tau is None + else b_leaf_tau + ) + sigma_leaf_mu = ( + np.squeeze(np.var(resid_train)) / num_trees_mu + if sigma_leaf_mu is None + else sigma_leaf_mu + ) + sigma_leaf_tau = ( + np.squeeze(np.var(resid_train)) / (2 * num_trees_tau) + if sigma_leaf_tau is None + else sigma_leaf_tau + ) if self.multivariate_treatment: if not isinstance(sigma_leaf_tau, np.ndarray): - sigma_leaf_tau = np.diagflat(np.repeat(sigma_leaf_tau, self.treatment_dim)) + sigma_leaf_tau = np.diagflat( + np.repeat(sigma_leaf_tau, self.treatment_dim) + ) current_sigma2 = sigma2_init self.sigma2_init = sigma2_init current_leaf_scale_mu = np.array([[sigma_leaf_mu]]) @@ -745,48 +1100,79 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr b_forest = num_trees_variance / a_0**2 else: if not a_forest: - a_forest = 1. + a_forest = 1.0 if not b_forest: - b_forest = 1. + b_forest = 1.0 # Update variable weights variable_counts = [original_var_indices.count(i) for i in original_var_indices] - variable_weights_mu_adj = [1/i for i in variable_counts] - variable_weights_tau_adj = [1/i for i in variable_counts] - variable_weights_variance_adj = [1/i for i in variable_counts] - variable_weights_mu = variable_weights_mu[original_var_indices]*variable_weights_mu_adj - variable_weights_tau = variable_weights_tau[original_var_indices]*variable_weights_tau_adj - variable_weights_variance = variable_weights_variance[original_var_indices]*variable_weights_variance_adj + variable_weights_mu_adj = [1 / i for i in variable_counts] + variable_weights_tau_adj = [1 / i for i in variable_counts] + variable_weights_variance_adj = [1 / i for i in variable_counts] + variable_weights_mu = ( + variable_weights_mu[original_var_indices] * variable_weights_mu_adj + ) + variable_weights_tau = ( + variable_weights_tau[original_var_indices] * variable_weights_tau_adj + ) + variable_weights_variance = ( + variable_weights_variance[original_var_indices] + * variable_weights_variance_adj + ) # Zero out weights for excluded variables - variable_weights_mu[[variable_subset_mu.count(i) == 0 for i in original_var_indices]] = 0 - variable_weights_tau[[variable_subset_tau.count(i) == 0 for i in original_var_indices]] = 0 - variable_weights_variance[[variable_subset_variance.count(i) == 0 for i in original_var_indices]] = 0 - + variable_weights_mu[ + [variable_subset_mu.count(i) == 0 for i in original_var_indices] + ] = 0 + variable_weights_tau[ + [variable_subset_tau.count(i) == 0 for i in original_var_indices] + ] = 0 + variable_weights_variance[ + [variable_subset_variance.count(i) == 0 for i in original_var_indices] + ] = 0 + # Update covariates to include propensities if requested if propensity_covariate not in ["none", "mu", "tau", "both"]: - raise ValueError("propensity_covariate must equal one of 'none', 'mu', 'tau', or 'both'") + raise ValueError( + "propensity_covariate must equal one of 'none', 'mu', 'tau', or 'both'" + ) if propensity_covariate != "none": - feature_types = np.append(feature_types, 0).astype('int') + feature_types = np.append(feature_types, 
0).astype("int") X_train_processed = np.c_[X_train_processed, pi_train] if self.has_test: X_test_processed = np.c_[X_test_processed, pi_test] if propensity_covariate == "mu": - variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1])) - variable_weights_tau = np.append(variable_weights_tau, np.repeat(0., pi_train.shape[1])) + variable_weights_mu = np.append( + variable_weights_mu, np.repeat(1 / num_cov_orig, pi_train.shape[1]) + ) + variable_weights_tau = np.append( + variable_weights_tau, np.repeat(0.0, pi_train.shape[1]) + ) elif propensity_covariate == "tau": - variable_weights_mu = np.append(variable_weights_mu, np.repeat(0., pi_train.shape[1])) - variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1])) + variable_weights_mu = np.append( + variable_weights_mu, np.repeat(0.0, pi_train.shape[1]) + ) + variable_weights_tau = np.append( + variable_weights_tau, np.repeat(1 / num_cov_orig, pi_train.shape[1]) + ) elif propensity_covariate == "both": - variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1])) - variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1])) - variable_weights_variance = np.append(variable_weights_variance, np.repeat(0., pi_train.shape[1])) - + variable_weights_mu = np.append( + variable_weights_mu, np.repeat(1 / num_cov_orig, pi_train.shape[1]) + ) + variable_weights_tau = np.append( + variable_weights_tau, np.repeat(1 / num_cov_orig, pi_train.shape[1]) + ) + variable_weights_variance = np.append( + variable_weights_variance, np.repeat(0.0, pi_train.shape[1]) + ) + # Renormalize variable weights variable_weights_mu = variable_weights_mu / np.sum(variable_weights_mu) variable_weights_tau = variable_weights_tau / np.sum(variable_weights_tau) - variable_weights_variance = variable_weights_variance / np.sum(variable_weights_variance) - + variable_weights_variance = variable_weights_variance / np.sum( + variable_weights_variance + ) + # Store propensity score requirements of the BCF forests self.propensity_covariate = propensity_covariate @@ -814,7 +1200,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if sample_sigma_leaf_tau: self.leaf_scale_tau_samples = np.empty(self.num_samples, dtype=np.float64) sample_counter = -1 - + # Prepare adaptive coding structure if self.adaptive_coding: if np.size(b_0) > 1 or np.size(b_1) > 1: @@ -825,9 +1211,9 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr self.b1_samples = np.empty(self.num_samples, dtype=np.float64) current_b_0 = b_0 current_b_1 = b_1 - tau_basis_train = (1-Z_train)*current_b_0 + Z_train*current_b_1 + tau_basis_train = (1 - Z_train) * current_b_0 + Z_train * current_b_1 if self.has_test: - tau_basis_test = (1-Z_test)*current_b_0 + Z_test*current_b_1 + tau_basis_test = (1 - Z_test) * current_b_0 + Z_test * current_b_1 else: tau_basis_train = Z_train if self.has_test: @@ -846,26 +1232,57 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr residual_train = Residual(resid_train) # C++ random number generator - if random_seed is None: + if random_seed is None: cpp_rng = RNG(-1) else: cpp_rng = RNG(random_seed) - + # Sampling data structures - forest_sampler_mu = ForestSampler(forest_dataset_train, feature_types, num_trees_mu, self.n_train, alpha_mu, beta_mu, min_samples_leaf_mu, max_depth_mu) - forest_sampler_tau = ForestSampler(forest_dataset_train, 
feature_types, num_trees_tau, self.n_train, alpha_tau, beta_tau, min_samples_leaf_tau, max_depth_tau) + forest_sampler_mu = ForestSampler( + forest_dataset_train, + feature_types, + num_trees_mu, + self.n_train, + alpha_mu, + beta_mu, + min_samples_leaf_mu, + max_depth_mu, + ) + forest_sampler_tau = ForestSampler( + forest_dataset_train, + feature_types, + num_trees_tau, + self.n_train, + alpha_tau, + beta_tau, + min_samples_leaf_tau, + max_depth_tau, + ) if self.include_variance_forest: - forest_sampler_variance = ForestSampler(forest_dataset_train, feature_types, num_trees_variance, self.n_train, alpha_variance, beta_variance, min_samples_leaf_variance, max_depth_variance) + forest_sampler_variance = ForestSampler( + forest_dataset_train, + feature_types, + num_trees_variance, + self.n_train, + alpha_variance, + beta_variance, + min_samples_leaf_variance, + max_depth_variance, + ) # Container of forest samples self.forest_container_mu = ForestContainer(num_trees_mu, 1, True, False) - self.forest_container_tau = ForestContainer(num_trees_tau, Z_train.shape[1], False, False) + self.forest_container_tau = ForestContainer( + num_trees_tau, Z_train.shape[1], False, False + ) active_forest_mu = Forest(num_trees_mu, 1, True, False) active_forest_tau = Forest(num_trees_tau, Z_train.shape[1], False, False) if self.include_variance_forest: - self.forest_container_variance = ForestContainer(num_trees_variance, 1, True, True) + self.forest_container_variance = ForestContainer( + num_trees_variance, 1, True, True + ) active_forest_variance = Forest(num_trees_variance, 1, True, True) - + # Variance samplers if self.sample_sigma_global: global_var_model = GlobalVarianceModel() @@ -876,19 +1293,33 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Initialize the leaves of each tree in the prognostic forest init_mu = np.array([np.squeeze(np.mean(resid_train))]) - forest_sampler_mu.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_mu, 0, init_mu) + forest_sampler_mu.prepare_for_sampler( + forest_dataset_train, residual_train, active_forest_mu, 0, init_mu + ) # Initialize the leaves of each tree in the treatment forest if self.multivariate_treatment: init_tau = np.zeros(Z_train.shape[1]) else: - init_tau = np.array([0.]) - forest_sampler_tau.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_tau, treatment_leaf_model, init_tau) + init_tau = np.array([0.0]) + forest_sampler_tau.prepare_for_sampler( + forest_dataset_train, + residual_train, + active_forest_tau, + treatment_leaf_model, + init_tau, + ) # Initialize the leaves of each tree in the variance forest if self.include_variance_forest: init_val_variance = np.array([variance_forest_leaf_init]) - forest_sampler_variance.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_variance, leaf_model_variance_forest, init_val_variance) + forest_sampler_variance.prepare_for_sampler( + forest_dataset_train, + residual_train, + active_forest_variance, + leaf_model_variance_forest, + init_val_variance, + ) # Run GFR (warm start) if specified if num_gfr > 0: @@ -900,76 +1331,149 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr sample_counter += 1 # Sample the prognostic forest forest_sampler_mu.sample_one_iteration( - self.forest_container_mu, active_forest_mu, forest_dataset_train, residual_train, cpp_rng, feature_types, - cutpoint_grid_size, current_leaf_scale_mu, variable_weights_mu, a_forest, b_forest, - current_sigma2, 0, 
keep_sample, True, True + self.forest_container_mu, + active_forest_mu, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale_mu, + variable_weights_mu, + a_forest, + b_forest, + current_sigma2, + 0, + keep_sample, + True, + True, ) # Sample variance parameters (if requested) if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + current_sigma2 = global_var_model.sample_one_iteration( + residual_train, cpp_rng, a_global, b_global + ) if self.sample_sigma_leaf_mu: - current_leaf_scale_mu[0,0] = leaf_var_model_mu.sample_one_iteration(active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu) + current_leaf_scale_mu[0, 0] = ( + leaf_var_model_mu.sample_one_iteration( + active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu + ) + ) if keep_sample: - self.leaf_scale_mu_samples[sample_counter] = current_leaf_scale_mu[0,0] - + self.leaf_scale_mu_samples[sample_counter] = ( + current_leaf_scale_mu[0, 0] + ) + # Sample the treatment forest forest_sampler_tau.sample_one_iteration( - self.forest_container_tau, active_forest_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, - cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, a_forest, b_forest, - current_sigma2, treatment_leaf_model, keep_sample, True, True + self.forest_container_tau, + active_forest_tau, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale_tau, + variable_weights_tau, + a_forest, + b_forest, + current_sigma2, + treatment_leaf_model, + keep_sample, + True, + True, ) - + # Sample coding parameters (if requested) if self.adaptive_coding: mu_x = active_forest_mu.predict_raw(forest_dataset_train) - tau_x = np.squeeze(active_forest_tau.predict_raw(forest_dataset_train)) - s_tt0 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==0)) - s_tt1 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==1)) + tau_x = np.squeeze( + active_forest_tau.predict_raw(forest_dataset_train) + ) + s_tt0 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 0)) + s_tt1 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 1)) partial_resid_mu = np.squeeze(resid_train - mu_x) - s_ty0 = np.sum(tau_x*partial_resid_mu*(np.squeeze(Z_train)==0)) - s_ty1 = np.sum(tau_x*partial_resid_mu*(np.squeeze(Z_train)==1)) - current_b_0 = self.rng.normal(loc = (s_ty0/(s_tt0 + 2*current_sigma2)), - scale = np.sqrt(current_sigma2/(s_tt0 + 2*current_sigma2)), size = 1) - current_b_1 = self.rng.normal(loc = (s_ty1/(s_tt1 + 2*current_sigma2)), - scale = np.sqrt(current_sigma2/(s_tt1 + 2*current_sigma2)), size = 1) - tau_basis_train = (1-np.squeeze(Z_train))*current_b_0 + np.squeeze(Z_train)*current_b_1 + s_ty0 = np.sum( + tau_x * partial_resid_mu * (np.squeeze(Z_train) == 0) + ) + s_ty1 = np.sum( + tau_x * partial_resid_mu * (np.squeeze(Z_train) == 1) + ) + current_b_0 = self.rng.normal( + loc=(s_ty0 / (s_tt0 + 2 * current_sigma2)), + scale=np.sqrt(current_sigma2 / (s_tt0 + 2 * current_sigma2)), + size=1, + ) + current_b_1 = self.rng.normal( + loc=(s_ty1 / (s_tt1 + 2 * current_sigma2)), + scale=np.sqrt(current_sigma2 / (s_tt1 + 2 * current_sigma2)), + size=1, + ) + tau_basis_train = ( + 1 - np.squeeze(Z_train) + ) * current_b_0 + np.squeeze(Z_train) * current_b_1 forest_dataset_train.update_basis(tau_basis_train) if self.has_test: - tau_basis_test = (1-np.squeeze(Z_test))*current_b_0 + np.squeeze(Z_test)*current_b_1 + tau_basis_test = ( + 1 - np.squeeze(Z_test) + ) * current_b_0 + np.squeeze(Z_test) * 
current_b_1 forest_dataset_test.update_basis(tau_basis_test) if keep_sample: self.b0_samples[sample_counter] = current_b_0 self.b1_samples[sample_counter] = current_b_1 # Update residual to reflect adjusted basis - forest_sampler_tau.propagate_basis_update(forest_dataset_train, residual_train, active_forest_tau) - + forest_sampler_tau.propagate_basis_update( + forest_dataset_train, residual_train, active_forest_tau + ) + # Sample the variance forest if self.include_variance_forest: forest_sampler_variance.sample_one_iteration( - self.forest_container_variance, active_forest_variance, forest_dataset_train, residual_train, - cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_variance, a_forest, b_forest, - current_sigma2, leaf_model_variance_forest, keep_sample, True, True + self.forest_container_variance, + active_forest_variance, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale_mu, + variable_weights_variance, + a_forest, + b_forest, + current_sigma2, + leaf_model_variance_forest, + keep_sample, + True, + True, ) - + # Sample variance parameters (if requested) if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + current_sigma2 = global_var_model.sample_one_iteration( + residual_train, cpp_rng, a_global, b_global + ) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf_tau: - current_leaf_scale_tau[0,0] = leaf_var_model_tau.sample_one_iteration(active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau) + current_leaf_scale_tau[0, 0] = ( + leaf_var_model_tau.sample_one_iteration( + active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau + ) + ) if keep_sample: - self.leaf_scale_tau_samples[sample_counter] = current_leaf_scale_tau[0,0] - + self.leaf_scale_tau_samples[sample_counter] = ( + current_leaf_scale_tau[0, 0] + ) + # Run MCMC if num_burnin + num_mcmc > 0: for i in range(num_gfr, num_temp_samples): is_mcmc = i + 1 > num_gfr + num_burnin if is_mcmc: mcmc_counter = i - num_gfr - num_burnin + 1 - if (mcmc_counter % keep_every == 0): + if mcmc_counter % keep_every == 0: keep_sample = True else: keep_sample = False @@ -982,69 +1486,142 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr sample_counter += 1 # Sample the prognostic forest forest_sampler_mu.sample_one_iteration( - self.forest_container_mu, active_forest_mu, forest_dataset_train, residual_train, cpp_rng, feature_types, - cutpoint_grid_size, current_leaf_scale_mu, variable_weights_mu, a_forest, b_forest, - current_sigma2, 0, keep_sample, False, True + self.forest_container_mu, + active_forest_mu, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale_mu, + variable_weights_mu, + a_forest, + b_forest, + current_sigma2, + 0, + keep_sample, + False, + True, ) # Sample variance parameters (if requested) if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + current_sigma2 = global_var_model.sample_one_iteration( + residual_train, cpp_rng, a_global, b_global + ) if self.sample_sigma_leaf_mu: - current_leaf_scale_mu[0,0] = leaf_var_model_mu.sample_one_iteration(active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu) + current_leaf_scale_mu[0, 0] = ( + leaf_var_model_mu.sample_one_iteration( + active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu + ) + ) if keep_sample: - 
self.leaf_scale_mu_samples[sample_counter] = current_leaf_scale_mu[0,0] - + self.leaf_scale_mu_samples[sample_counter] = ( + current_leaf_scale_mu[0, 0] + ) + # Sample the treatment forest forest_sampler_tau.sample_one_iteration( - self.forest_container_tau, active_forest_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, - cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, a_forest, b_forest, - current_sigma2, treatment_leaf_model, keep_sample, False, True - ) - + self.forest_container_tau, + active_forest_tau, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale_tau, + variable_weights_tau, + a_forest, + b_forest, + current_sigma2, + treatment_leaf_model, + keep_sample, + False, + True, + ) + # Sample coding parameters (if requested) if self.adaptive_coding: mu_x = active_forest_mu.predict_raw(forest_dataset_train) - tau_x = np.squeeze(active_forest_tau.predict_raw(forest_dataset_train)) - s_tt0 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==0)) - s_tt1 = np.sum(tau_x*tau_x*(np.squeeze(Z_train)==1)) + tau_x = np.squeeze( + active_forest_tau.predict_raw(forest_dataset_train) + ) + s_tt0 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 0)) + s_tt1 = np.sum(tau_x * tau_x * (np.squeeze(Z_train) == 1)) partial_resid_mu = np.squeeze(resid_train - mu_x) - s_ty0 = np.sum(tau_x*partial_resid_mu*(np.squeeze(Z_train)==0)) - s_ty1 = np.sum(tau_x*partial_resid_mu*(np.squeeze(Z_train)==1)) - current_b_0 = self.rng.normal(loc = (s_ty0/(s_tt0 + 2*current_sigma2)), - scale = np.sqrt(current_sigma2/(s_tt0 + 2*current_sigma2)), size = 1) - current_b_1 = self.rng.normal(loc = (s_ty1/(s_tt1 + 2*current_sigma2)), - scale = np.sqrt(current_sigma2/(s_tt1 + 2*current_sigma2)), size = 1) - tau_basis_train = (1-np.squeeze(Z_train))*current_b_0 + np.squeeze(Z_train)*current_b_1 + s_ty0 = np.sum( + tau_x * partial_resid_mu * (np.squeeze(Z_train) == 0) + ) + s_ty1 = np.sum( + tau_x * partial_resid_mu * (np.squeeze(Z_train) == 1) + ) + current_b_0 = self.rng.normal( + loc=(s_ty0 / (s_tt0 + 2 * current_sigma2)), + scale=np.sqrt(current_sigma2 / (s_tt0 + 2 * current_sigma2)), + size=1, + ) + current_b_1 = self.rng.normal( + loc=(s_ty1 / (s_tt1 + 2 * current_sigma2)), + scale=np.sqrt(current_sigma2 / (s_tt1 + 2 * current_sigma2)), + size=1, + ) + tau_basis_train = ( + 1 - np.squeeze(Z_train) + ) * current_b_0 + np.squeeze(Z_train) * current_b_1 forest_dataset_train.update_basis(tau_basis_train) if self.has_test: - tau_basis_test = (1-np.squeeze(Z_test))*current_b_0 + np.squeeze(Z_test)*current_b_1 + tau_basis_test = ( + 1 - np.squeeze(Z_test) + ) * current_b_0 + np.squeeze(Z_test) * current_b_1 forest_dataset_test.update_basis(tau_basis_test) if keep_sample: self.b0_samples[sample_counter] = current_b_0 self.b1_samples[sample_counter] = current_b_1 # Update residual to reflect adjusted basis - forest_sampler_tau.propagate_basis_update(forest_dataset_train, residual_train, active_forest_tau) - + forest_sampler_tau.propagate_basis_update( + forest_dataset_train, residual_train, active_forest_tau + ) + # Sample the variance forest if self.include_variance_forest: forest_sampler_variance.sample_one_iteration( - self.forest_container_variance, active_forest_variance, forest_dataset_train, residual_train, - cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_variance, a_forest, b_forest, - current_sigma2, leaf_model_variance_forest, keep_sample, False, True + self.forest_container_variance, + 
active_forest_variance, + forest_dataset_train, + residual_train, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale_mu, + variable_weights_variance, + a_forest, + b_forest, + current_sigma2, + leaf_model_variance_forest, + keep_sample, + False, + True, ) - + # Sample variance parameters (if requested) if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + current_sigma2 = global_var_model.sample_one_iteration( + residual_train, cpp_rng, a_global, b_global + ) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf_tau: - current_leaf_scale_tau[0,0] = leaf_var_model_tau.sample_one_iteration(active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau) + current_leaf_scale_tau[0, 0] = ( + leaf_var_model_tau.sample_one_iteration( + active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau + ) + ) if keep_sample: - self.leaf_scale_tau_samples[sample_counter] = current_leaf_scale_tau[0,0] - + self.leaf_scale_tau_samples[sample_counter] = ( + current_leaf_scale_tau[0, 0] + ) + # Mark the model as sampled self.sampled = True @@ -1067,65 +1644,107 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr self.num_samples -= num_gfr # Store predictions - mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) - self.mu_hat_train = mu_raw*self.y_std + self.y_bar - tau_raw_train = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_train.dataset_cpp) + mu_raw = self.forest_container_mu.forest_container_cpp.Predict( + forest_dataset_train.dataset_cpp + ) + self.mu_hat_train = mu_raw * self.y_std + self.y_bar + tau_raw_train = self.forest_container_tau.forest_container_cpp.PredictRaw( + forest_dataset_train.dataset_cpp + ) self.tau_hat_train = tau_raw_train if self.adaptive_coding: - adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) - self.tau_hat_train = self.tau_hat_train*adaptive_coding_weights - self.tau_hat_train = np.squeeze(self.tau_hat_train*self.y_std) + adaptive_coding_weights = np.expand_dims( + self.b1_samples - self.b0_samples, axis=(0, 2) + ) + self.tau_hat_train = self.tau_hat_train * adaptive_coding_weights + self.tau_hat_train = np.squeeze(self.tau_hat_train * self.y_std) if self.multivariate_treatment: - treatment_term_train = np.multiply(np.atleast_3d(Z_train).swapaxes(1,2),self.tau_hat_train).sum(axis=2) + treatment_term_train = np.multiply( + np.atleast_3d(Z_train).swapaxes(1, 2), self.tau_hat_train + ).sum(axis=2) else: - treatment_term_train = Z_train*np.squeeze(self.tau_hat_train) + treatment_term_train = Z_train * np.squeeze(self.tau_hat_train) self.y_hat_train = self.mu_hat_train + treatment_term_train if self.has_test: - mu_raw_test = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) - self.mu_hat_test = mu_raw_test*self.y_std + self.y_bar - tau_raw_test = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) + mu_raw_test = self.forest_container_mu.forest_container_cpp.Predict( + forest_dataset_test.dataset_cpp + ) + self.mu_hat_test = mu_raw_test * self.y_std + self.y_bar + tau_raw_test = self.forest_container_tau.forest_container_cpp.PredictRaw( + forest_dataset_test.dataset_cpp + ) self.tau_hat_test = tau_raw_test if self.adaptive_coding: - adaptive_coding_weights_test = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) - self.tau_hat_test = 
self.tau_hat_test*adaptive_coding_weights_test - self.tau_hat_test = np.squeeze(self.tau_hat_test*self.y_std) + adaptive_coding_weights_test = np.expand_dims( + self.b1_samples - self.b0_samples, axis=(0, 2) + ) + self.tau_hat_test = self.tau_hat_test * adaptive_coding_weights_test + self.tau_hat_test = np.squeeze(self.tau_hat_test * self.y_std) if self.multivariate_treatment: - treatment_term_test = np.multiply(np.atleast_3d(Z_test).swapaxes(1,2),self.tau_hat_test).sum(axis=2) + treatment_term_test = np.multiply( + np.atleast_3d(Z_test).swapaxes(1, 2), self.tau_hat_test + ).sum(axis=2) else: - treatment_term_test = Z_test*np.squeeze(self.tau_hat_test) + treatment_term_test = Z_test * np.squeeze(self.tau_hat_test) self.y_hat_test = self.mu_hat_test + treatment_term_test - + if self.include_variance_forest: - sigma2_x_train_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) + sigma2_x_train_raw = ( + self.forest_container_variance.forest_container_cpp.Predict( + forest_dataset_train.dataset_cpp + ) + ) if self.sample_sigma_global: self.sigma2_x_train = sigma2_x_train_raw for i in range(self.num_samples): - self.sigma2_x_train[:,i] = sigma2_x_train_raw[:,i]*self.global_var_samples[i] + self.sigma2_x_train[:, i] = ( + sigma2_x_train_raw[:, i] * self.global_var_samples[i] + ) else: - self.sigma2_x_train = sigma2_x_train_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale + self.sigma2_x_train = ( + sigma2_x_train_raw + * self.sigma2_init + * self.y_std + * self.y_std + / self.variance_scale + ) if self.has_test: - sigma2_x_test_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) + sigma2_x_test_raw = ( + self.forest_container_variance.forest_container_cpp.Predict( + forest_dataset_test.dataset_cpp + ) + ) if self.sample_sigma_global: self.sigma2_x_test = sigma2_x_test_raw for i in range(self.num_samples): - self.sigma2_x_test[:,i] = sigma2_x_test_raw[:,i]*self.global_var_samples[i] + self.sigma2_x_test[:, i] = ( + sigma2_x_test_raw[:, i] * self.global_var_samples[i] + ) else: - self.sigma2_x_test = sigma2_x_test_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale - + self.sigma2_x_test = ( + sigma2_x_test_raw + * self.sigma2_init + * self.y_std + * self.y_std + / self.variance_scale + ) + if self.sample_sigma_global: - self.global_var_samples = self.global_var_samples*self.y_std*self.y_std - + self.global_var_samples = self.global_var_samples * self.y_std * self.y_std + if self.sample_sigma_leaf_mu: self.leaf_scale_mu_samples = self.leaf_scale_mu_samples if self.sample_sigma_leaf_tau: self.leaf_scale_tau_samples = self.leaf_scale_tau_samples - + if self.adaptive_coding: self.b0_samples = self.b0_samples self.b1_samples = self.b1_samples - - def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.array: + + def predict_tau( + self, X: np.array, Z: np.array, propensity: np.array = None + ) -> np.array: """Predict CATE function for every provided observation. Parameters @@ -1136,7 +1755,7 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> Test set treatment indicators. propensity : np.array, optional Optional test set propensities. Must be provided if propensities were provided when the model was sampled. - + Returns ------- np.array @@ -1148,7 +1767,7 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> "appropriate arguments before using this model." 
) raise NotSampledError(msg) - + # Convert everything to standard shape (2-dimensional) if X.ndim == 1: X = np.expand_dims(X, 1) @@ -1160,7 +1779,7 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> if propensity is not None: if propensity.ndim == 1: propensity = np.expand_dims(propensity, 1) - + # Data checks if Z.shape[0] != X.shape[0]: raise ValueError("X and Z must have the same number of rows") @@ -1170,37 +1789,47 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> else: if self.propensity_covariate == "tau": if not self.internal_propensity_model: - raise ValueError("Propensity scores not provided, but no propensity model was trained during sampling") + raise ValueError( + "Propensity scores not provided, but no propensity model was trained during sampling" + ) else: - propensity = np.mean(self.bart_propensity_model.predict(X), axis=1, keepdims=True) + propensity = np.mean( + self.bart_propensity_model.predict(X), axis=1, keepdims=True + ) else: # Dummy propensities if not provided but also not needed propensity = np.ones(X.shape[0]) propensity = np.expand_dims(propensity, 1) - + # Update covariates to include propensities if requested if self.propensity_covariate == "none": X_combined = X else: X_combined = np.c_[X, propensity] - + # Forest dataset forest_dataset_test = Dataset() forest_dataset_test.add_covariates(X_combined) forest_dataset_test.add_basis(Z) - + # Estimate treatment effect - tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) + tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw( + forest_dataset_test.dataset_cpp + ) tau_raw = tau_raw if self.adaptive_coding: - adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) - tau_raw = tau_raw*adaptive_coding_weights - tau_x = np.squeeze(tau_raw*self.y_std) + adaptive_coding_weights = np.expand_dims( + self.b1_samples - self.b0_samples, axis=(0, 2) + ) + tau_raw = tau_raw * adaptive_coding_weights + tau_x = np.squeeze(tau_raw * self.y_std) # Return result matrix return tau_x - def predict_variance(self, covariates: np.array, propensity: np.array = None) -> np.array: + def predict_variance( + self, covariates: np.array, propensity: np.array = None + ) -> np.array: """Predict expected conditional variance from a BART model. Parameters @@ -1209,7 +1838,7 @@ def predict_variance(self, covariates: np.array, propensity: np.array = None) -> Test set covariates. propensity : np.array, optional Test set propensity scores. Optional (not currently used in variance forests). - + Returns ------- np.array @@ -1228,14 +1857,14 @@ def predict_variance(self, covariates: np.array, propensity: np.array = None) -> "Call 'fit' with appropriate arguments before using this model." 
) raise NotSampledError(msg) - + # Convert everything to standard shape (2-dimensional) if covariates.ndim == 1: covariates = np.expand_dims(covariates, 1) if propensity is not None: if propensity.ndim == 1: propensity = np.expand_dims(propensity, 1) - + # Update covariates to include propensities if requested if self.propensity_covariate == "none": X_combined = covariates @@ -1247,23 +1876,33 @@ def predict_variance(self, covariates: np.array, propensity: np.array = None) -> propensity = np.ones(covariates.shape[0]) propensity = np.expand_dims(propensity, 1) X_combined = np.c_[covariates, propensity] - + # Forest dataset pred_dataset = Dataset() pred_dataset.add_covariates(X_combined) - - variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp) + + variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict( + pred_dataset.dataset_cpp + ) if self.sample_sigma_global: variance_pred = variance_pred_raw for i in range(self.num_samples): - variance_pred[:,i] = variance_pred_raw[:,i]*self.global_var_samples[i] + variance_pred[:, i] = ( + variance_pred_raw[:, i] * self.global_var_samples[i] + ) else: - variance_pred = variance_pred_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale + variance_pred = ( + variance_pred_raw + * self.sigma2_init + * self.y_std + * self.y_std + / self.variance_scale + ) return variance_pred - + def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tuple: - """Predict outcome model components (CATE function and prognostic function) as well as overall outcome for every provided observation. + """Predict outcome model components (CATE function and prognostic function) as well as overall outcome for every provided observation. Predicted outcomes are computed as `yhat = mu_x + Z*tau_x` where mu_x is a sample of the prognostic function and tau_x is a sample of the treatment effect (CATE) function. Parameters @@ -1274,7 +1913,7 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl Test set treatment indicators. propensity : `np.array`, optional Optional test set propensities. Must be provided if propensities were provided when the model was sampled. - + Returns ------- tau_x : np.array @@ -1284,7 +1923,7 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl yhat_x : np.array Outcome prediction samples for every observation provided. sigma2_x : np.array, optional - Variance forest samples for every observation provided. Only returned if the + Variance forest samples for every observation provided. Only returned if the model includes a heteroskedasticity forest. """ if not self.is_sampled(): @@ -1293,7 +1932,7 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl "appropriate arguments before using this model." 
) raise NotSampledError(msg) - + # Convert everything to standard shape (2-dimensional) if X.ndim == 1: X = np.expand_dims(X, 1) @@ -1305,7 +1944,7 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl if propensity is not None: if propensity.ndim == 1: propensity = np.expand_dims(propensity, 1) - + # Data checks if Z.shape[0] != X.shape[0]: raise ValueError("X and Z must have the same number of rows") @@ -1315,45 +1954,65 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl else: if self.propensity_covariate != "none": if not self.internal_propensity_model: - raise ValueError("Propensity scores not provided, but no propensity model was trained during sampling") + raise ValueError( + "Propensity scores not provided, but no propensity model was trained during sampling" + ) else: - propensity = np.mean(self.bart_propensity_model.predict(X), axis=1, keepdims=True) - + propensity = np.mean( + self.bart_propensity_model.predict(X), axis=1, keepdims=True + ) + # Update covariates to include propensities if requested if self.propensity_covariate == "none": X_combined = X else: X_combined = np.c_[X, propensity] - + # Forest dataset forest_dataset_test = Dataset() forest_dataset_test.add_covariates(X_combined) forest_dataset_test.add_basis(Z) - + # Compute predicted outcome and decomposed outcome model terms - mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) - mu_x = mu_raw*self.y_std/np.sqrt(self.variance_scale) + self.y_bar - tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) + mu_raw = self.forest_container_mu.forest_container_cpp.Predict( + forest_dataset_test.dataset_cpp + ) + mu_x = mu_raw * self.y_std / np.sqrt(self.variance_scale) + self.y_bar + tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw( + forest_dataset_test.dataset_cpp + ) if self.adaptive_coding: - adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) - tau_raw = tau_raw*adaptive_coding_weights - tau_x = np.squeeze(tau_raw*self.y_std/np.sqrt(self.variance_scale)) + adaptive_coding_weights = np.expand_dims( + self.b1_samples - self.b0_samples, axis=(0, 2) + ) + tau_raw = tau_raw * adaptive_coding_weights + tau_x = np.squeeze(tau_raw * self.y_std / np.sqrt(self.variance_scale)) if Z.shape[1] > 1: - treatment_term = np.multiply(np.atleast_3d(Z).swapaxes(1,2),tau_x).sum(axis=2) + treatment_term = np.multiply(np.atleast_3d(Z).swapaxes(1, 2), tau_x).sum( + axis=2 + ) else: - treatment_term = Z*np.squeeze(tau_x) + treatment_term = Z * np.squeeze(tau_x) yhat_x = mu_x + treatment_term # Compute predictions from the variance forest (if included) if self.include_variance_forest: - sigma2_x_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) + sigma2_x_raw = self.forest_container_variance.forest_container_cpp.Predict( + forest_dataset_test.dataset_cpp + ) if self.sample_sigma_global: sigma2_x = sigma2_x_raw for i in range(self.num_samples): - sigma2_x[:,i] = sigma2_x_raw[:,i]*self.global_var_samples[i] + sigma2_x[:, i] = sigma2_x_raw[:, i] * self.global_var_samples[i] else: - sigma2_x = sigma2_x_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale - + sigma2_x = ( + sigma2_x_raw + * self.sigma2_init + * self.y_std + * self.y_std + / self.variance_scale + ) + # Return result matrices as a tuple if self.include_variance_forest: return (tau_x, mu_x, yhat_x, sigma2_x) @@ -1362,7 
+2021,7 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl def to_json(self) -> str: """ - Converts a sampled BART model to JSON string representation (which can then be saved to a file or + Converts a sampled BART model to JSON string representation (which can then be saved to a file or processed using the `json` library) Returns @@ -1376,16 +2035,16 @@ def to_json(self) -> str: "Call 'fit' with appropriate arguments before using this model." ) raise NotSampledError(msg) - + # Initialize JSONSerializer object bcf_json = JSONSerializer() - + # Add the forests bcf_json.add_forest(self.forest_container_mu) bcf_json.add_forest(self.forest_container_tau) if self.include_variance_forest: bcf_json.add_forest(self.forest_container_variance) - + # Add global parameters bcf_json.add_scalar("variance_scale", self.variance_scale) bcf_json.add_scalar("outcome_scale", self.y_std) @@ -1402,28 +2061,36 @@ def to_json(self) -> str: bcf_json.add_scalar("num_samples", self.num_samples) bcf_json.add_boolean("adaptive_coding", self.adaptive_coding) bcf_json.add_string("propensity_covariate", self.propensity_covariate) - bcf_json.add_boolean("internal_propensity_model", self.internal_propensity_model) - + bcf_json.add_boolean( + "internal_propensity_model", self.internal_propensity_model + ) + # Add parameter samples if self.sample_sigma_global: - bcf_json.add_numeric_vector("sigma2_global_samples", self.global_var_samples, "parameters") + bcf_json.add_numeric_vector( + "sigma2_global_samples", self.global_var_samples, "parameters" + ) if self.sample_sigma_leaf_mu: - bcf_json.add_numeric_vector("sigma2_leaf_mu_samples", self.leaf_scale_mu_samples, "parameters") + bcf_json.add_numeric_vector( + "sigma2_leaf_mu_samples", self.leaf_scale_mu_samples, "parameters" + ) if self.sample_sigma_leaf_tau: - bcf_json.add_numeric_vector("sigma2_leaf_tau_samples", self.leaf_scale_tau_samples, "parameters") + bcf_json.add_numeric_vector( + "sigma2_leaf_tau_samples", self.leaf_scale_tau_samples, "parameters" + ) if self.adaptive_coding: bcf_json.add_numeric_vector("b0_samples", self.b0_samples, "parameters") bcf_json.add_numeric_vector("b1_samples", self.b1_samples, "parameters") - + # Add propensity model (if it exists) if self.internal_propensity_model: bart_propensity_string = self.bart_propensity_model.to_json() bcf_json.add_string("bart_propensity_model", bart_propensity_string) - + # Add covariate preprocessor covariate_preprocessor_string = self._covariate_preprocessor.to_json() bcf_json.add_string("covariate_preprocessor", covariate_preprocessor_string) - + return bcf_json.return_json_string() def from_json(self, json_string: str) -> None: @@ -1438,20 +2105,26 @@ def from_json(self, json_string: str) -> None: # Parse string to a JSON object in C++ bcf_json = JSONSerializer() bcf_json.load_from_json_string(json_string) - + # Unpack forests self.include_variance_forest = bcf_json.get_boolean("include_variance_forest") # TODO: don't just make this a placeholder that we overwrite self.forest_container_mu = ForestContainer(0, 0, False, False) - self.forest_container_mu.forest_container_cpp.LoadFromJson(bcf_json.json_cpp, "forest_0") + self.forest_container_mu.forest_container_cpp.LoadFromJson( + bcf_json.json_cpp, "forest_0" + ) # TODO: don't just make this a placeholder that we overwrite self.forest_container_tau = ForestContainer(0, 0, False, False) - self.forest_container_tau.forest_container_cpp.LoadFromJson(bcf_json.json_cpp, "forest_1") + 
self.forest_container_tau.forest_container_cpp.LoadFromJson( + bcf_json.json_cpp, "forest_1" + ) if self.include_variance_forest: # TODO: don't just make this a placeholder that we overwrite self.forest_container_variance = ForestContainer(0, 0, False, False) - self.forest_container_variance.forest_container_cpp.LoadFromJson(bcf_json.json_cpp, "forest_2") - + self.forest_container_variance.forest_container_cpp.LoadFromJson( + bcf_json.json_cpp, "forest_2" + ) + # Unpack global parameters self.variance_scale = bcf_json.get_scalar("variance_scale") self.y_std = bcf_json.get_scalar("outcome_scale") @@ -1467,33 +2140,41 @@ def from_json(self, json_string: str) -> None: self.num_samples = int(bcf_json.get_scalar("num_samples")) self.adaptive_coding = bcf_json.get_boolean("adaptive_coding") self.propensity_covariate = bcf_json.get_string("propensity_covariate") - self.internal_propensity_model = bcf_json.get_boolean("internal_propensity_model") + self.internal_propensity_model = bcf_json.get_boolean( + "internal_propensity_model" + ) # Unpack parameter samples if self.sample_sigma_global: - self.global_var_samples = bcf_json.get_numeric_vector("sigma2_global_samples", "parameters") + self.global_var_samples = bcf_json.get_numeric_vector( + "sigma2_global_samples", "parameters" + ) if self.sample_sigma_leaf_mu: - self.leaf_scale_mu_samples = bcf_json.get_numeric_vector("sigma2_leaf_mu_samples", "parameters") + self.leaf_scale_mu_samples = bcf_json.get_numeric_vector( + "sigma2_leaf_mu_samples", "parameters" + ) if self.sample_sigma_leaf_tau: - self.leaf_scale_tau_samples = bcf_json.get_numeric_vector("sigma2_leaf_tau_samples", "parameters") + self.leaf_scale_tau_samples = bcf_json.get_numeric_vector( + "sigma2_leaf_tau_samples", "parameters" + ) if self.adaptive_coding: self.b1_samples = bcf_json.get_numeric_vector("b1_samples", "parameters") self.b0_samples = bcf_json.get_numeric_vector("b0_samples", "parameters") - + # Unpack internal propensity model if self.internal_propensity_model: bart_propensity_string = bcf_json.get_string("bart_propensity_model") self.bart_propensity_model = BARTModel() self.bart_propensity_model.from_json(bart_propensity_string) - + # Unpack covariate preprocessor covariate_preprocessor_string = bcf_json.get_string("covariate_preprocessor") self._covariate_preprocessor = CovariatePreprocessor() self._covariate_preprocessor.from_json(covariate_preprocessor_string) - + # Mark the deserialized model as "sampled" self.sampled = True - + def is_sampled(self) -> bool: """Whether or not a BCF model has been sampled. @@ -1503,4 +2184,3 @@ def is_sampled(self) -> bool: `True` if a BCF model has been sampled, `False` otherwise """ return self.sampled - diff --git a/stochtree/calibration.py b/stochtree/calibration.py index 7ff6a0f3..b19a7e3f 100644 --- a/stochtree/calibration.py +++ b/stochtree/calibration.py @@ -6,8 +6,10 @@ from scipy.stats import gamma -def calibrate_global_error_variance(X: np.array, y: np.array, nu: float = 3, q: float = 0.9, standardize: bool = True) -> float: - """Calibrates scale parameter of global error variance model as in Chipman et al (2010) by setting a value of lambda, +def calibrate_global_error_variance( + X: np.array, y: np.array, nu: float = 3, q: float = 0.9, standardize: bool = True +) -> float: + """Calibrates scale parameter of global error variance model as in Chipman et al (2010) by setting a value of lambda, part of the scale parameter in the `sigma2 ~ IG(nu/2, (nu*lambda)/2)` prior. 
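[Editor's aside: the snippet below is an illustrative, standalone sketch of the quantile inversion this function performs and is not part of the formatting patch; the numeric inputs are made up.]

# Illustrative sketch only: mirrors the gamma.ppf inversion used in
# calibrate_global_error_variance below to anchor the scale of the
# sigma2 ~ IG(nu/2, (nu*lamb)/2) prior at a rough variance estimate.
from scipy.stats import gamma

sigma2hat = 0.25   # hypothetical: e.g. MSE of a linear fit to the standardized outcome
nu, q = 3.0, 0.9   # prior degrees of freedom and calibration quantile

# Same formula as in the function body below
lamb = (sigma2hat * gamma.ppf(1 - q, nu)) / nu
print(lamb)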
    Parameters
@@ -22,7 +24,7 @@ def calibrate_global_error_variance(X: np.array, y: np.array, nu: float = 3, q:
        Quantile used to calibrate `lamb` as in Sparapani et al (2021). Defaults to `0.9`.
    standardize : bool, optional
        Whether or not `y` should be standardized before calibration. Defaults to `True`.
-    
+
    Returns
    -------
    float
@@ -33,7 +35,7 @@ def calibrate_global_error_variance(X: np.array, y: np.array, nu: float = 3, q:
        X_processed = X
    elif X.ndim == 1:
        X_processed = np.expand_dims(X, 1)
-    else: 
+    else:
        raise ValueError("X must be a 1 or 2 dimensional numpy array")
    n, p = X_processed.shape
@@ -41,7 +43,7 @@ def calibrate_global_error_variance(X: np.array, y: np.array, nu: float = 3, q:
        y_processed = np.squeeze(y)
    elif y.ndim == 1:
        y_processed = y
-    else: 
+    else:
        raise ValueError("y must be a 1 or 2 dimensional numpy array")
    # Standardize outcome if necessary
@@ -50,34 +52,42 @@ def calibrate_global_error_variance(X: np.array, y: np.array, nu: float = 3, q:
    mean_y = np.mean(y)
    if standardize:
        y_processed = (y_processed - mean_y) / sd_y
-    
+
    # Fit a linear model of y ~ X
    lm_calibrator = linear_model.LinearRegression()
    lm_calibrator.fit(X_processed, y_processed)
-    
+
    # Compute MSE
    y_hat_processed = lm_calibrator.predict(X_processed)
    mse = mean_squared_error(y_processed, y_hat_processed)
-    
+
    # Check for overdetermination, revert to variance of y if model is overdetermined
    eps = np.finfo("double").eps
    if _is_model_overdetermined(lm_calibrator, n, mse, eps):
        sigma2hat = var_y
-        warnings.warn("Default calibration method for global error variance failed; covariate dimension exceeds number of samples. "
-                      "Initializing global error variance scale parameter based on the variance of the standardized outcome.", UserWarning)
+        warnings.warn(
+            "Default calibration method for global error variance failed; covariate dimension exceeds number of samples. "
+            "Initializing global error variance scale parameter based on the variance of the standardized outcome.",
+            UserWarning,
+        )
    else:
        sigma2hat = mse
        if _is_model_rank_deficient(lm_calibrator, p):
-            warnings.warn("Default calibration method for global error variance detected rank deficiency in covariate matrix. 
" + "This should not impact the calibrated values, but may indicate the presence of duplicated covariates.", + UserWarning, + ) + # Calibrate lamb if no initial value is provided - lamb = (sigma2hat*gamma.ppf(1-q,nu))/nu - + lamb = (sigma2hat * gamma.ppf(1 - q, nu)) / nu + return lamb -def _is_model_overdetermined(reg_model: linear_model.LinearRegression, n: int, mse: float, eps: float) -> bool: - + +def _is_model_overdetermined( + reg_model: linear_model.LinearRegression, n: int, mse: float, eps: float +) -> bool: if reg_model.rank_ == n: return True elif np.abs(mse) < eps: @@ -85,6 +95,7 @@ def _is_model_overdetermined(reg_model: linear_model.LinearRegression, n: int, m else: return False + def _is_model_rank_deficient(reg_model: linear_model.LinearRegression, p: int) -> bool: if reg_model.rank_ < p: return True diff --git a/stochtree/data.py b/stochtree/data.py index 83a2a662..aecd6ac4 100644 --- a/stochtree/data.py +++ b/stochtree/data.py @@ -1,6 +1,7 @@ import numpy as np from stochtree_cpp import ForestDatasetCpp, ResidualCpp + class Dataset: """ Wrapper around a C++ class that stores all of the non-outcome data used in `stochtree`. This includes: @@ -9,12 +10,13 @@ class Dataset: 2. Basis vectors used to define non-constant leaf models. This is optional but may be included via the `add_basis` method. 3. Variance weights used to define heteroskedastic or otherwise weighted models. This is optional but may be included via the `add_variance_weights` method. """ + def __init__(self) -> None: """ Initialize a `Dataset` object """ self.dataset_cpp = ForestDatasetCpp() - + def add_covariates(self, covariates: np.array): """ Add covariates to a dataset @@ -22,14 +24,16 @@ def add_covariates(self, covariates: np.array): Parameters ---------- covariates : np.array - Numpy array of covariates. If data contain categorical, string, time series, or other columns in a + Numpy array of covariates. If data contain categorical, string, time series, or other columns in a dataframe, please first preprocess using the `CovariateTransformer`. """ - covariates_ = np.expand_dims(covariates, 1) if np.ndim(covariates) == 1 else covariates + covariates_ = ( + np.expand_dims(covariates, 1) if np.ndim(covariates) == 1 else covariates + ) n, p = covariates_.shape covariates_rowmajor = np.ascontiguousarray(covariates) self.dataset_cpp.AddCovariates(covariates_rowmajor, n, p, True) - + def add_basis(self, basis: np.array): """ Add basis matrix to a dataset @@ -43,10 +47,10 @@ def add_basis(self, basis: np.array): n, p = basis_.shape basis_rowmajor = np.ascontiguousarray(basis_) self.dataset_cpp.AddBasis(basis_rowmajor, n, p, True) - + def update_basis(self, basis: np.array): """ - Update basis matrix in a dataset. Allows users to build an ensemble whose leaves + Update basis matrix in a dataset. Allows users to build an ensemble whose leaves regress on bases that are updated throughout the sampler. Parameters @@ -58,7 +62,7 @@ def update_basis(self, basis: np.array): n, p = basis_.shape basis_rowmajor = np.ascontiguousarray(basis_) self.dataset_cpp.UpdateBasis(basis_rowmajor, n, p, True) - + def add_variance_weights(self, variance_weights: np.array): """ Add variance weights to a dataset @@ -71,19 +75,21 @@ def add_variance_weights(self, variance_weights: np.array): n = variance_weights.size self.dataset_cpp.AddVarianceWeights(variance_weights, n) + class Residual: """ - Wrapper around a C++ class that stores residual data used in `stochtree`. 
- This object becomes part of the real-time model "state" in that its contents + Wrapper around a C++ class that stores residual data used in `stochtree`. + This object becomes part of the real-time model "state" in that its contents always contain a full or partial residual, depending on the state of the sampler. - Typically this object is initialized with the original outcome and then "residualized" - by subtracting out the initial prediction value of every tree in every forest term + Typically this object is initialized with the original outcome and then "residualized" + by subtracting out the initial prediction value of every tree in every forest term (as well as the predictions of any other model term). """ + def __init__(self, residual: np.array) -> None: """ - Initialize a `Residual` object + Initialize a `Residual` object Parameters ---------- @@ -92,7 +98,7 @@ def __init__(self, residual: np.array) -> None: """ n = residual.size self.residual_cpp = ResidualCpp(residual, n) - + def get_residual(self) -> np.array: """ Extract the current values of the residual as a numpy array @@ -103,7 +109,7 @@ def get_residual(self) -> np.array: Current values of the residual (which may be net of any forest / other model terms) """ return self.residual_cpp.GetResidualArray() - + def update_data(self, new_vector: np.array) -> None: """ Update the current state of the outcome (i.e. partial residual) data by replacing each element with the elements of `new_vector` diff --git a/stochtree/forest.py b/stochtree/forest.py index 576dc60f..b12cc924 100644 --- a/stochtree/forest.py +++ b/stochtree/forest.py @@ -1,15 +1,17 @@ """ Python classes wrapping C++ forest container object """ + import numpy as np from .data import Dataset from stochtree_cpp import ForestContainerCpp, ForestCpp from typing import Union + class ForestContainer: """ Container that stores sampled (and retained) tree ensembles from BART, BCF or a custom sampler. - + Parameters ---------- num_trees : int @@ -17,48 +19,56 @@ class ForestContainer: output_dimension : int, optional Dimension of the leaf node parameters in each tree leaf_constant : bool, optional - Whether the leaf node model is "constant" (i.e. prediction is simply a - sum of leaf node parameters for every observation in a dataset) or not (i.e. - each leaf node parameter is multiplied by a "basis vector" before being returned + Whether the leaf node model is "constant" (i.e. prediction is simply a + sum of leaf node parameters for every observation in a dataset) or not (i.e. + each leaf node parameter is multiplied by a "basis vector" before being returned as a prediction). is_exponentiated : bool, optional - Whether or not the leaf node parameters are stored in log scale (in which case, they + Whether or not the leaf node parameters are stored in log scale (in which case, they must be exponentiated before being returned as predictions). 
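[Editor's aside: a minimal sketch of the `Residual` lifecycle described above, not part of this patch; `y` is synthetic and the update shown is arbitrary.]

import numpy as np
from stochtree import Residual

y = np.random.default_rng(1).normal(size=50)
resid = Residual(y)                 # starts as the (standardized) outcome

current = resid.get_residual()      # full or partial residual, per sampler state
resid.update_data(current - 0.1)    # overwrite, e.g. after subtracting a model term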
""" - def __init__(self, num_trees: int, output_dimension: int = 1, - leaf_constant: bool = True, is_exponentiated: bool = False) -> None: - self.forest_container_cpp = ForestContainerCpp(num_trees, output_dimension, leaf_constant, is_exponentiated) + + def __init__( + self, + num_trees: int, + output_dimension: int = 1, + leaf_constant: bool = True, + is_exponentiated: bool = False, + ) -> None: + self.forest_container_cpp = ForestContainerCpp( + num_trees, output_dimension, leaf_constant, is_exponentiated + ) self.num_trees = num_trees self.output_dimension = output_dimension self.leaf_constant = leaf_constant self.is_exponentiated = is_exponentiated - + def predict(self, dataset: Dataset) -> np.array: """ Predict from each forest in the container, using the provided `Dataset` object. - + Parameters ---------- dataset : Dataset Python object wrapping the "dataset" class used by C++ sampling and prediction data structures. - + Returns ------- np.array - Numpy array with (`n`, `m`) dimensions, where `n` is the number of observations in `dataset` and `m` + Numpy array with (`n`, `m`) dimensions, where `n` is the number of observations in `dataset` and `m` is the number of samples in the forest container. """ return self.forest_container_cpp.Predict(dataset.dataset_cpp) - + def predict_raw(self, dataset: Dataset) -> np.array: """ Predict raw leaf values for a every forest in the container, using the provided `Dataset` object - + Parameters ---------- dataset : Dataset Python object wrapping the "dataset" class used by C++ sampling and prediction data structures. - + Returns ------- np.array @@ -71,31 +81,35 @@ def predict_raw(self, dataset: Dataset) -> np.array: if result.shape[1] == 1: result = result.reshape(result.shape[0], result.shape[2]) return result - + def predict_raw_single_forest(self, dataset: Dataset, forest_num: int) -> np.array: """ Predict raw leaf values for a specific forest (indexed by `forest_num`), using the provided `Dataset` object - + Parameters ---------- dataset : Dataset Python object wrapping the "dataset" class used by C++ sampling and prediction data structures. forest_num : int Index of the forest from which to predict. Forest indices are 0-based. - + Returns ------- np.array Numpy array with (`n`, `k`) dimensions, where `n` is the number of observations in `dataset` and `k` is the dimension of the leaf parameter. """ - return self.forest_container_cpp.PredictRawSingleForest(dataset.dataset_cpp, forest_num) - - def predict_raw_single_tree(self, dataset: Dataset, forest_num: int, tree_num: int) -> np.array: + return self.forest_container_cpp.PredictRawSingleForest( + dataset.dataset_cpp, forest_num + ) + + def predict_raw_single_tree( + self, dataset: Dataset, forest_num: int, tree_num: int + ) -> np.array: """ - Predict raw leaf values for a specific tree of a specific forest (indexed by `tree_num` and `forest_num` + Predict raw leaf values for a specific tree of a specific forest (indexed by `tree_num` and `forest_num` respectively), using the provided `Dataset` object. - + Parameters ---------- dataset : Dataset @@ -104,27 +118,31 @@ def predict_raw_single_tree(self, dataset: Dataset, forest_num: int, tree_num: i Index of the forest from which to predict. Forest indices are 0-based. tree_num : int Index of the tree which to predict (within forest indexed by `forest_num`). Tree indices are 0-based. 
- + Returns ------- np.array Numpy array with (`n`, `k`) dimensions, where `n` is the number of observations in `dataset` and `k` is the dimension of the leaf parameter. """ - return self.forest_container_cpp.PredictRawSingleTree(dataset.dataset_cpp, forest_num, tree_num) - - def set_root_leaves(self, forest_num: int, leaf_value: Union[float, np.array]) -> None: + return self.forest_container_cpp.PredictRawSingleTree( + dataset.dataset_cpp, forest_num, tree_num + ) + + def set_root_leaves( + self, forest_num: int, leaf_value: Union[float, np.array] + ) -> None: """ - Set constant (root) leaf node values for every tree in the forest indexed by `forest_num`. + Set constant (root) leaf node values for every tree in the forest indexed by `forest_num`. Assumes the forest consists of all root (single-node) trees. - + Parameters ---------- forest_num : int Index of the forest for which we will set root node parameters. leaf_value : float or np.array - Constant values to which root nodes are to be set. If the trees in forest `forest_num` - are univariate, then `leaf_value` must be a `float`, while if the trees in forest `forest_num` + Constant values to which root nodes are to be set. If the trees in forest `forest_num` + are univariate, then `leaf_value` must be a `float`, while if the trees in forest `forest_num` are multivariate, then `leaf_value` must be a `np.array`. """ if not isinstance(leaf_value, np.ndarray) and not isinstance(leaf_value, float): @@ -133,18 +151,20 @@ def set_root_leaves(self, forest_num: int, leaf_value: Union[float, np.array]) - leaf_value = np.squeeze(leaf_value) if len(leaf_value.shape) != 1: raise ValueError("leaf_value must be either a one-dimensional array") - self.forest_container_cpp.SetRootVector(forest_num, leaf_value, leaf_value.shape[0]) + self.forest_container_cpp.SetRootVector( + forest_num, leaf_value, leaf_value.shape[0] + ) else: self.forest_container_cpp.SetRootValue(forest_num, leaf_value) def save_to_json_file(self, json_filename: str) -> None: """ Save the forests in the container to a JSON file. - + Parameters ---------- json_filename : str - Name of JSON file to which forest container state will be saved. + Name of JSON file to which forest container state will be saved. May contain absolute or relative paths. """ self.forest_container_cpp.SaveToJsonFile(json_filename) @@ -152,20 +172,20 @@ def save_to_json_file(self, json_filename: str) -> None: def load_from_json_file(self, json_filename: str) -> None: """ Load a forest container from output stored in a JSON file. - + Parameters ---------- json_filename : str - Name of JSON file from which forest container state will be restored. + Name of JSON file from which forest container state will be restored. May contain absolute or relative paths. """ self.forest_container_cpp.LoadFromJsonFile(json_filename) def dump_json_string(self) -> str: """ - Dump a forest container into an in-memory JSON string (which can be directly serialized or + Dump a forest container into an in-memory JSON string (which can be directly serialized or combined with other JSON strings before serialization). - + Returns ------- str @@ -176,14 +196,14 @@ def dump_json_string(self) -> str: def load_from_json_string(self, json_string: str) -> None: """ Reload a forest container from an in-memory JSON string. - + Parameters ---------- json_string : str In-memory string containing state of a forest container. 
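[Editor's aside: a serialization round-trip sketch using the helpers above, not part of this patch; the file path is purely illustrative.]

from stochtree import ForestContainer

fc = ForestContainer(num_trees=10)

state = fc.dump_json_string()           # in-memory round trip
fc2 = ForestContainer(num_trees=10)
fc2.load_from_json_string(state)

fc.save_to_json_file("forests.json")    # file-based round trip (hypothetical path)
fc3 = ForestContainer(num_trees=10)
fc3.load_from_json_file("forests.json")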
""" self.forest_container_cpp.LoadFromJsonString(json_string) - + def add_sample(self, leaf_value: Union[float, np.array]) -> None: """ Add a new all-root ensemble to the container, with all of the leaves set to the value / vector provided @@ -198,9 +218,17 @@ def add_sample(self, leaf_value: Union[float, np.array]) -> None: self.forest_container_cpp.AddSampleVector(leaf_value) else: self.forest_container_cpp.AddSampleValue(leaf_value) - - def add_numeric_split(self, forest_num: int, tree_num: int, leaf_num: int, feature_num: int, split_threshold: float, - left_leaf_value: Union[float, np.array], right_leaf_value: Union[float, np.array]) -> None: + + def add_numeric_split( + self, + forest_num: int, + tree_num: int, + leaf_num: int, + feature_num: int, + split_threshold: float, + left_leaf_value: Union[float, np.array], + right_leaf_value: Union[float, np.array], + ) -> None: """ Add a numeric (i.e. X[,i] <= c) split to a given tree in the ensemble @@ -224,10 +252,26 @@ def add_numeric_split(self, forest_num: int, tree_num: int, leaf_num: int, featu if isinstance(left_leaf_value, np.ndarray): left_leaf_value = np.squeeze(left_leaf_value) right_leaf_value = np.squeeze(right_leaf_value) - self.forest_container_cpp.AddNumericSplitVector(forest_num, tree_num, leaf_num, feature_num, split_threshold, left_leaf_value, right_leaf_value) + self.forest_container_cpp.AddNumericSplitVector( + forest_num, + tree_num, + leaf_num, + feature_num, + split_threshold, + left_leaf_value, + right_leaf_value, + ) else: - self.forest_container_cpp.AddNumericSplitValue(forest_num, tree_num, leaf_num, feature_num, split_threshold, left_leaf_value, right_leaf_value) - + self.forest_container_cpp.AddNumericSplitValue( + forest_num, + tree_num, + leaf_num, + feature_num, + split_threshold, + left_leaf_value, + right_leaf_value, + ) + def get_tree_leaves(self, forest_num: int, tree_num: int) -> np.array: """ Retrieve a vector of indices of leaf nodes for a given tree in a given forest @@ -246,7 +290,9 @@ def get_tree_leaves(self, forest_num: int, tree_num: int) -> np.array: """ return self.forest_container_cpp.GetTreeLeaves(forest_num, tree_num) - def get_tree_split_counts(self, forest_num: int, tree_num: int, num_features: int) -> np.array: + def get_tree_split_counts( + self, forest_num: int, tree_num: int, num_features: int + ) -> np.array: """ Retrieve a vector of split counts for every training set feature in a given tree in a given forest @@ -258,14 +304,16 @@ def get_tree_split_counts(self, forest_num: int, tree_num: int, num_features: in Index of the tree for which split counts will be retrieved num_features : int Total number of features in the training set - + Returns ------- np.array - One-dimensional numpy array with as many elements as in the forest model's training set, + One-dimensional numpy array with as many elements as in the forest model's training set, containing the split count for each feature for a given forest and tree. 
""" - return self.forest_container_cpp.GetTreeSplitCounts(forest_num, tree_num, num_features) + return self.forest_container_cpp.GetTreeSplitCounts( + forest_num, tree_num, num_features + ) def get_forest_split_counts(self, forest_num: int, num_features: int) -> np.array: """ @@ -281,7 +329,7 @@ def get_forest_split_counts(self, forest_num: int, num_features: int) -> np.arra Returns ------- np.array - One-dimensional numpy array with as many elements as in the forest model's training set, + One-dimensional numpy array with as many elements as in the forest model's training set, containing the split count for each feature for a given forest (summed across every tree in the forest). """ return self.forest_container_cpp.GetForestSplitCounts(forest_num, num_features) @@ -298,7 +346,7 @@ def get_overall_split_counts(self, num_features: int) -> np.array: Returns ------- np.array - One-dimensional numpy array with as many elements as in the forest model's training set, + One-dimensional numpy array with as many elements as in the forest model's training set, containing the split count for each feature summed across every forest of every tree in the container. """ return self.forest_container_cpp.GetOverallSplitCounts(num_features) @@ -316,7 +364,7 @@ def get_granular_split_counts(self, num_features: int) -> np.array: ------- np.array Three-dimensional numpy array, containing the number of splits a variable receives in each tree of each forest in a ``ForestContainer``. - Array will have dimensions (`m`,`b`,`p`) where `m` is the number of forests in the container, `b` is the number of trees in each + Array will have dimensions (`m`,`b`,`p`) where `m` is the number of forests in the container, `b` is the number of trees in each forest, and `p` is the number of features in the forest model's training dataset. 
""" return self.forest_container_cpp.GetGranularSplitCounts(num_features) @@ -365,15 +413,17 @@ def is_leaf_node(self, forest_num: int, tree_num: int, node_id: int) -> bool: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - bool + bool `True` if node `node_id` in tree `tree_num` of forest `forest_num` is a leaf, `False` otherwise """ return self.forest_container_cpp.IsLeafNode(forest_num, tree_num, node_id) - - def is_numeric_split_node(self, forest_num: int, tree_num: int, node_id: int) -> bool: + + def is_numeric_split_node( + self, forest_num: int, tree_num: int, node_id: int + ) -> bool: """ Whether or not a given node of a given tree in a given forest in the ``ForestContainer`` is a numeric split node @@ -385,15 +435,19 @@ def is_numeric_split_node(self, forest_num: int, tree_num: int, node_id: int) -> Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - bool + bool `True` if node `node_id` in tree `tree_num` of forest `forest_num` is a numeric split node, `False` otherwise """ - return self.forest_container_cpp.IsNumericSplitNode(forest_num, tree_num, node_id) - - def is_categorical_split_node(self, forest_num: int, tree_num: int, node_id: int) -> bool: + return self.forest_container_cpp.IsNumericSplitNode( + forest_num, tree_num, node_id + ) + + def is_categorical_split_node( + self, forest_num: int, tree_num: int, node_id: int + ) -> bool: """ Whether or not a given node of a given tree in a given forest in the ``ForestContainer`` is a categorical split node @@ -405,14 +459,16 @@ def is_categorical_split_node(self, forest_num: int, tree_num: int, node_id: int Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - bool + bool `True` if node `node_id` in tree `tree_num` of forest `forest_num` is a categorical split node, `False` otherwise """ - return self.forest_container_cpp.IsCategoricalSplitNode(forest_num, tree_num, node_id) - + return self.forest_container_cpp.IsCategoricalSplitNode( + forest_num, tree_num, node_id + ) + def parent_node(self, forest_num: int, tree_num: int, node_id: int) -> int: """ Parent node of given node of a given tree in a given forest in the ``ForestContainer`` @@ -425,15 +481,15 @@ def parent_node(self, forest_num: int, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Index of the parent of node `node_id` in tree `tree_num` of forest `forest_num`. + int + Index of the parent of node `node_id` in tree `tree_num` of forest `forest_num`. If `node_id` is a root node, returns `-1`. """ return self.forest_container_cpp.ParentNode(forest_num, tree_num, node_id) - + def left_child_node(self, forest_num: int, tree_num: int, node_id: int) -> int: """ Left child node of given node of a given tree in a given forest in the ``ForestContainer`` @@ -446,15 +502,15 @@ def left_child_node(self, forest_num: int, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Index of the left child of node `node_id` in tree `tree_num` of forest `forest_num`. + int + Index of the left child of node `node_id` in tree `tree_num` of forest `forest_num`. If `node_id` is a leaf, returns `-1`. 
""" return self.forest_container_cpp.LeftChildNode(forest_num, tree_num, node_id) - + def right_child_node(self, forest_num: int, tree_num: int, node_id: int) -> int: """ Right child node of given node of a given tree in a given forest in the ``ForestContainer`` @@ -467,15 +523,15 @@ def right_child_node(self, forest_num: int, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Index of the right child of node `node_id` in tree `tree_num` of forest `forest_num`. + int + Index of the right child of node `node_id` in tree `tree_num` of forest `forest_num`. If `node_id` is a leaf, returns `-1`. """ return self.forest_container_cpp.RightChildNode(forest_num, tree_num, node_id) - + def node_depth(self, forest_num: int, tree_num: int, node_id: int) -> int: """ Depth of given node of a given tree in a given forest in the ``ForestContainer``. @@ -488,15 +544,15 @@ def node_depth(self, forest_num: int, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Depth of node `node_id` in tree `tree_num` of forest `forest_num`. The root node is defined + int + Depth of node `node_id` in tree `tree_num` of forest `forest_num`. The root node is defined as "depth zero." """ return self.forest_container_cpp.NodeDepth(forest_num, tree_num, node_id) - + def node_split_index(self, forest_num: int, tree_num: int, node_id: int) -> int: """ Split index of given node of a given tree in a given forest in the ``ForestContainer``. @@ -510,18 +566,20 @@ def node_split_index(self, forest_num: int, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int + int Split index of `node_id` in tree `tree_num` of forest `forest_num`. """ if self.is_leaf_node(forest_num, tree_num, node_id): return -1 else: return self.forest_container_cpp.SplitIndex(forest_num, tree_num, node_id) - - def node_split_threshold(self, forest_num: int, tree_num: int, node_id: int) -> float: + + def node_split_threshold( + self, forest_num: int, tree_num: int, node_id: int + ) -> float: """ Threshold that defines a numeric split for a given node of a given tree in a given forest in the ``ForestContainer``. Returns ``np.Inf`` if the node is a leaf or a categorical split node. @@ -534,18 +592,24 @@ def node_split_threshold(self, forest_num: int, tree_num: int, node_id: int) -> Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - float + float Threshold that defines a numeric split for node `node_id` in tree `tree_num` of forest `forest_num`. """ - if self.is_leaf_node(forest_num, tree_num, node_id) or self.is_categorical_split_node(forest_num, tree_num, node_id): + if self.is_leaf_node( + forest_num, tree_num, node_id + ) or self.is_categorical_split_node(forest_num, tree_num, node_id): return np.Inf else: - return self.forest_container_cpp.SplitThreshold(forest_num, tree_num, node_id) - - def node_split_categories(self, forest_num: int, tree_num: int, node_id: int) -> np.array: + return self.forest_container_cpp.SplitThreshold( + forest_num, tree_num, node_id + ) + + def node_split_categories( + self, forest_num: int, tree_num: int, node_id: int + ) -> np.array: """ Array of category indices that define a categorical split for a given node of a given tree in a given forest in the ``ForestContainer``. 
Returns ``np.array([np.Inf])`` if the node is a leaf or a numeric split node. @@ -558,18 +622,24 @@ def node_split_categories(self, forest_num: int, tree_num: int, node_id: int) -> Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - np.array + np.array Array of category indices that define a categorical split for node `node_id` in tree `tree_num` of forest `forest_num`. """ - if self.is_leaf_node(forest_num, tree_num, node_id) or self.is_numeric_split_node(forest_num, tree_num, node_id): + if self.is_leaf_node( + forest_num, tree_num, node_id + ) or self.is_numeric_split_node(forest_num, tree_num, node_id): return np.array([np.Inf]) else: - return self.forest_container_cpp.SplitCategories(forest_num, tree_num, node_id) - - def node_leaf_values(self, forest_num: int, tree_num: int, node_id: int) -> np.array: + return self.forest_container_cpp.SplitCategories( + forest_num, tree_num, node_id + ) + + def node_leaf_values( + self, forest_num: int, tree_num: int, node_id: int + ) -> np.array: """ Node parameter value(s) for a given node of a given tree in a given forest in the ``ForestContainer``. Values are stale if the node is a split node. @@ -582,14 +652,14 @@ def node_leaf_values(self, forest_num: int, tree_num: int, node_id: int) -> np.a Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - np.array + np.array Array of parameter values for node `node_id` in tree `tree_num` of forest `forest_num`. """ return self.forest_container_cpp.NodeLeafValues(forest_num, tree_num, node_id) - + def num_nodes(self, forest_num: int, tree_num: int) -> int: """ Number of nodes in a given tree in a given forest in the ``ForestContainer``. @@ -600,14 +670,14 @@ def num_nodes(self, forest_num: int, tree_num: int) -> int: Index of the forest to be queried tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of nodes in tree `tree_num` of forest `forest_num`. """ return self.forest_container_cpp.NumNodes(forest_num, tree_num) - + def num_leaves(self, forest_num: int, tree_num: int) -> int: """ Number of leaves in a given tree in a given forest in the ``ForestContainer``. @@ -618,14 +688,14 @@ def num_leaves(self, forest_num: int, tree_num: int) -> int: Index of the forest to be queried tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of leaves in tree `tree_num` of forest `forest_num`. """ return self.forest_container_cpp.NumLeaves(forest_num, tree_num) - + def num_leaf_parents(self, forest_num: int, tree_num: int) -> int: """ Number of leaf parents (split nodes with two leaves as children) in a given tree in a given forest in the ``ForestContainer``. @@ -636,14 +706,14 @@ def num_leaf_parents(self, forest_num: int, tree_num: int) -> int: Index of the forest to be queried tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of leaf parents in tree `tree_num` of forest `forest_num`. """ return self.forest_container_cpp.NumLeafParents(forest_num, tree_num) - + def num_split_nodes(self, forest_num: int, tree_num: int) -> int: """ Number of split_nodes in a given tree in a given forest in the ``ForestContainer``. @@ -654,14 +724,14 @@ def num_split_nodes(self, forest_num: int, tree_num: int) -> int: Index of the forest to be queried tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of split nodes in tree `tree_num` of forest `forest_num`. 
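# Illustrative sketch (not part of the patch), continuing the sketch above:
# the per-tree summaries documented here should be internally consistent. After
# the single numeric split added earlier, tree 0 of forest 0 has one split node
# and two leaves, and (for a tree with no pruned node slots) we expect
# num_nodes == num_leaves + num_split_nodes.
print(fc.num_nodes(0, 0))          # expected: 3
print(fc.num_leaves(0, 0))         # expected: 2
print(fc.num_split_nodes(0, 0))    # expected: 1
print(fc.num_leaf_parents(0, 0))   # expected: 1 (the root has two leaf children)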
""" return self.forest_container_cpp.NumSplitNodes(forest_num, tree_num) - + def nodes(self, forest_num: int, tree_num: int) -> np.array: """ Array of node indices in a given tree in a given forest in the ``ForestContainer``. @@ -672,14 +742,14 @@ def nodes(self, forest_num: int, tree_num: int) -> np.array: Index of the forest to be queried tree_num : int Index of the tree to be queried - + Returns ------- - np.array + np.array Array of indices of nodes in tree `tree_num` of forest `forest_num`. """ return self.forest_container_cpp.Nodes(forest_num, tree_num) - + def leaves(self, forest_num: int, tree_num: int) -> np.array: """ Array of leaf indices in a given tree in a given forest in the ``ForestContainer``. @@ -690,14 +760,14 @@ def leaves(self, forest_num: int, tree_num: int) -> np.array: Index of the forest to be queried tree_num : int Index of the tree to be queried - + Returns ------- - np.array + np.array Array of indices of leaf nodes in tree `tree_num` of forest `forest_num`. """ return self.forest_container_cpp.Leaves(forest_num, tree_num) - + def delete_sample(self, forest_num: int) -> None: """ Modify the ``ForestContainer`` by removing the forest sample indexed by ``forest_num``. @@ -708,7 +778,8 @@ def delete_sample(self, forest_num: int) -> None: Index of the forest to be removed from the ``ForestContainer`` """ return self.forest_container_cpp.DeleteSample(forest_num) - + + class Forest: """ In-memory python wrapper around a C++ tree ensemble object @@ -720,28 +791,36 @@ class Forest: output_dimension : int, optional Dimension of the leaf node parameters in each tree leaf_constant : bool, optional - Whether the leaf node model is "constant" (i.e. prediction is simply a - sum of leaf node parameters for every observation in a dataset) or not (i.e. - each leaf node parameter is multiplied by a "basis vector" before being returned + Whether the leaf node model is "constant" (i.e. prediction is simply a + sum of leaf node parameters for every observation in a dataset) or not (i.e. + each leaf node parameter is multiplied by a "basis vector" before being returned as a prediction). is_exponentiated : bool, optional - Whether or not the leaf node parameters are stored in log scale (in which case, they + Whether or not the leaf node parameters are stored in log scale (in which case, they must be exponentiated before being returned as predictions). """ - def __init__(self, num_trees: int, output_dimension: int = 1, - leaf_constant: bool = True, is_exponentiated: bool = False) -> None: - self.forest_cpp = ForestCpp(num_trees, output_dimension, leaf_constant, is_exponentiated) + + def __init__( + self, + num_trees: int, + output_dimension: int = 1, + leaf_constant: bool = True, + is_exponentiated: bool = False, + ) -> None: + self.forest_cpp = ForestCpp( + num_trees, output_dimension, leaf_constant, is_exponentiated + ) self.num_trees = num_trees self.output_dimension = output_dimension self.leaf_constant = leaf_constant self.is_exponentiated = is_exponentiated - + def reset_root(self) -> None: """ Reset forest to a forest with all single node (i.e. 
"root") trees """ self.forest_cpp.ResetRoot() - + def reset(self, forest_container: ForestContainer, forest_num: int) -> None: """ Reset forest to the forest indexed by ``forest_num`` in ``forest_container`` @@ -754,37 +833,37 @@ def reset(self, forest_container: ForestContainer, forest_num: int) -> None: Index of the ensemble used to reset the ``Forest`` """ self.forest_cpp.Reset(forest_container.forest_container_cpp, forest_num) - + def predict(self, dataset: Dataset) -> np.array: """ Predict from each forest in the container, using the provided `Dataset` object. - + Parameters ---------- dataset : Dataset Python object wrapping the "dataset" class used by C++ sampling and prediction data structures. - + Returns ------- np.array One-dimensional numpy array with length equal to the number of observations in `dataset`. """ return self.forest_cpp.Predict(dataset.dataset_cpp) - + def predict_raw(self, dataset: Dataset) -> np.array: """ Predict raw leaf values for a every forest in the container, using the provided `Dataset` object - + Parameters ---------- dataset : Dataset Python object wrapping the "dataset" class used by C++ sampling and prediction data structures. - + Returns ------- np.array Numpy array with (`n`, `k`) dimensions, where `n` is the number of observations in `dataset` and - `k` is the dimension of the leaf parameter. If `k = 1`, then the returned array is simply one-dimensional + `k` is the dimension of the leaf parameter. If `k = 1`, then the returned array is simply one-dimensional with `n` observations. """ result = self.forest_cpp.PredictRaw(dataset.dataset_cpp) @@ -792,17 +871,17 @@ def predict_raw(self, dataset: Dataset) -> np.array: if result.shape[1] == 1: result = result.reshape(result.shape[0], result.shape[2]) return result - + def set_root_leaves(self, leaf_value: Union[float, np.array]) -> None: """ - Set constant (root) leaf node values for every tree in the forest. + Set constant (root) leaf node values for every tree in the forest. Assumes the forest consists of all root (single-node) trees. - + Parameters ---------- leaf_value : float or np.array - Constant values to which root nodes are to be set. If the trees in forest `forest_num` - are univariate, then `leaf_value` must be a `float`, while if the trees in forest `forest_num` + Constant values to which root nodes are to be set. If the trees in forest `forest_num` + are univariate, then `leaf_value` must be a `float`, while if the trees in forest `forest_num` are multivariate, then `leaf_value` must be a `np.array`. """ if not isinstance(leaf_value, np.ndarray) and not isinstance(leaf_value, float): @@ -815,8 +894,15 @@ def set_root_leaves(self, leaf_value: Union[float, np.array]) -> None: else: self.forest_cpp.SetRootValue(leaf_value) - def add_numeric_split(self, tree_num: int, leaf_num: int, feature_num: int, split_threshold: float, - left_leaf_value: Union[float, np.array], right_leaf_value: Union[float, np.array]) -> None: + def add_numeric_split( + self, + tree_num: int, + leaf_num: int, + feature_num: int, + split_threshold: float, + left_leaf_value: Union[float, np.array], + right_leaf_value: Union[float, np.array], + ) -> None: """ Add a numeric (i.e. 
X[,i] <= c) split to a given tree in the forest @@ -838,10 +924,24 @@ def add_numeric_split(self, tree_num: int, leaf_num: int, feature_num: int, spli if isinstance(left_leaf_value, np.ndarray): left_leaf_value = np.squeeze(left_leaf_value) right_leaf_value = np.squeeze(right_leaf_value) - self.forest_cpp.AddNumericSplitVector(tree_num, leaf_num, feature_num, split_threshold, left_leaf_value, right_leaf_value) + self.forest_cpp.AddNumericSplitVector( + tree_num, + leaf_num, + feature_num, + split_threshold, + left_leaf_value, + right_leaf_value, + ) else: - self.forest_cpp.AddNumericSplitValue(tree_num, leaf_num, feature_num, split_threshold, left_leaf_value, right_leaf_value) - + self.forest_cpp.AddNumericSplitValue( + tree_num, + leaf_num, + feature_num, + split_threshold, + left_leaf_value, + right_leaf_value, + ) + def get_tree_leaves(self, tree_num: int) -> np.array: """ Retrieve a vector of indices of leaf nodes for a given tree in the forest @@ -850,7 +950,7 @@ def get_tree_leaves(self, tree_num: int) -> np.array: ---------- tree_num : float or np.array Index of the tree for which leaf indices will be retrieved - + Returns ------- np.array @@ -868,11 +968,11 @@ def get_tree_split_counts(self, tree_num: int, num_features: int) -> np.array: Index of the tree for which split counts will be retrieved num_features : int Total number of features in the training set - + Returns ------- np.array - One-dimensional numpy array with as many elements as in the forest model's training set, + One-dimensional numpy array with as many elements as in the forest model's training set, containing the split count for each feature for a given tree of the forest. """ return self.forest_cpp.GetTreeSplitCounts(tree_num, num_features) @@ -885,11 +985,11 @@ def get_overall_split_counts(self, num_features: int) -> np.array: ---------- num_features : int Total number of features in the training set - + Returns ------- np.array - One-dimensional numpy array with as many elements as in the forest model's training set, + One-dimensional numpy array with as many elements as in the forest model's training set, containing the overall split count in the forest for each feature. """ return self.forest_cpp.GetOverallSplitCounts(num_features) @@ -902,11 +1002,11 @@ def get_granular_split_counts(self, num_features: int) -> np.array: ---------- num_features : int Total number of features in the training set - + Returns ------- np.array - One-dimensional numpy array with as many elements as in the forest model's training set, + One-dimensional numpy array with as many elements as in the forest model's training set, containing the split count for each feature for a every tree in the forest. 
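# Illustrative sketch (not part of the patch) of the in-memory Forest workflow
# documented above. The Forest constructor signature is taken from this file;
# the Dataset construction is an assumption (its API lives in stochtree/data.py,
# which this patch does not show), so the predict() call is left commented out.
from stochtree import Forest

forest = Forest(num_trees=10, output_dimension=1, leaf_constant=True, is_exponentiated=False)
forest.set_root_leaves(0.0)                      # start from all-root trees
forest.add_numeric_split(
    tree_num=0, leaf_num=0, feature_num=1, split_threshold=0.0,
    left_leaf_value=-0.5, right_leaf_value=0.5,
)
print(forest.get_overall_split_counts(num_features=5))
# preds = forest.predict(dataset)                # requires a populated Dataset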
""" return self.forest_cpp.GetGranularSplitCounts(num_features) @@ -941,14 +1041,14 @@ def is_leaf_node(self, tree_num: int, node_id: int) -> bool: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - bool + bool `True` if node `node_id` in tree `tree_num` is a leaf, `False` otherwise """ return self.forest_cpp.IsLeafNode(tree_num, node_id) - + def is_numeric_split_node(self, tree_num: int, node_id: int) -> bool: """ Whether or not a given node of a given tree of a forest is a numeric split node @@ -959,14 +1059,14 @@ def is_numeric_split_node(self, tree_num: int, node_id: int) -> bool: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - bool + bool `True` if node `node_id` in tree `tree_num` is a numeric split node, `False` otherwise """ return self.forest_cpp.IsNumericSplitNode(tree_num, node_id) - + def is_categorical_split_node(self, tree_num: int, node_id: int) -> bool: """ Whether or not a given node of a given tree of a forest is a categorical split node @@ -977,14 +1077,14 @@ def is_categorical_split_node(self, tree_num: int, node_id: int) -> bool: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - bool + bool `True` if node `node_id` in tree `tree_num` is a categorical split node, `False` otherwise """ return self.forest_cpp.IsCategoricalSplitNode(tree_num, node_id) - + def parent_node(self, tree_num: int, node_id: int) -> int: """ Parent node of given node of a given tree of a forest @@ -995,15 +1095,15 @@ def parent_node(self, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Index of the parent of node `node_id` in tree `tree_num`. + int + Index of the parent of node `node_id` in tree `tree_num`. If `node_id` is a root node, returns `-1`. """ return self.forest_cpp.ParentNode(tree_num, node_id) - + def left_child_node(self, tree_num: int, node_id: int) -> int: """ Left child node of given node of a given tree of a forest @@ -1014,15 +1114,15 @@ def left_child_node(self, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Index of the left child of node `node_id` in tree `tree_num`. + int + Index of the left child of node `node_id` in tree `tree_num`. If `node_id` is a leaf, returns `-1`. """ return self.forest_cpp.LeftChildNode(tree_num, node_id) - + def right_child_node(self, tree_num: int, node_id: int) -> int: """ Right child node of given node of a given tree of a forest @@ -1033,15 +1133,15 @@ def right_child_node(self, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int - Index of the right child of node `node_id` in tree `tree_num`. + int + Index of the right child of node `node_id` in tree `tree_num`. If `node_id` is a leaf, returns `-1`. """ return self.forest_cpp.RightChildNode(tree_num, node_id) - + def node_depth(self, tree_num: int, node_id: int) -> int: """ Depth of given node of a given tree of a forest @@ -1053,17 +1153,17 @@ def node_depth(self, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int + int Depth of node `node_id` in tree `tree_num`. The root node is defined as "depth zero." 
""" return self.forest_cpp.NodeDepth(tree_num, node_id) - + def node_split_index(self, tree_num: int, node_id: int) -> int: """ - Split index of given node of a given tree of a forest. + Split index of given node of a given tree of a forest. Returns ``-1`` if the node is a leaf. Parameters @@ -1072,20 +1172,20 @@ def node_split_index(self, tree_num: int, node_id: int) -> int: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - int + int Split index of `node_id` in tree `tree_num`. """ if self.is_leaf_node(tree_num, node_id): return -1 else: return self.forest_cpp.SplitIndex(tree_num, node_id) - + def node_split_threshold(self, tree_num: int, node_id: int) -> float: """ - Threshold that defines a numeric split for a given node of a given tree of a forest. + Threshold that defines a numeric split for a given node of a given tree of a forest. Returns ``np.Inf`` if the node is a leaf or a categorical split node. Parameters @@ -1094,20 +1194,22 @@ def node_split_threshold(self, tree_num: int, node_id: int) -> float: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - float + float Threshold that defines a numeric split for node `node_id` in tree `tree_num`. """ - if self.is_leaf_node(tree_num, node_id) or self.is_categorical_split_node(tree_num, node_id): + if self.is_leaf_node(tree_num, node_id) or self.is_categorical_split_node( + tree_num, node_id + ): return np.Inf else: return self.forest_cpp.SplitThreshold(tree_num, node_id) - + def node_split_categories(self, tree_num: int, node_id: int) -> np.array: """ - Array of category indices that define a categorical split for a given node of a given tree of a forest. + Array of category indices that define a categorical split for a given node of a given tree of a forest. Returns ``np.array([np.Inf])`` if the node is a leaf or a numeric split node. Parameters @@ -1116,20 +1218,22 @@ def node_split_categories(self, tree_num: int, node_id: int) -> np.array: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - np.array + np.array Array of category indices that define a categorical split for node `node_id` in tree `tree_num`. """ - if self.is_leaf_node(tree_num, node_id) or self.is_numeric_split_node(tree_num, node_id): + if self.is_leaf_node(tree_num, node_id) or self.is_numeric_split_node( + tree_num, node_id + ): return np.array([np.Inf]) else: return self.forest_cpp.SplitCategories(tree_num, node_id) - + def node_leaf_values(self, tree_num: int, node_id: int) -> np.array: """ - Leaf node value(s) for a given node of a given tree of a forest. + Leaf node value(s) for a given node of a given tree of a forest. Values are stale if the node is a split node. Parameters @@ -1138,14 +1242,14 @@ def node_leaf_values(self, tree_num: int, node_id: int) -> np.array: Index of the tree to be queried node_id : int Index of the node to be queried - + Returns ------- - np.array + np.array Array of parameter values for node `node_id` in tree `tree_num`. """ return self.forest_cpp.NodeLeafValues(tree_num, node_id) - + def num_nodes(self, tree_num: int) -> int: """ Number of nodes in a given tree of a forest @@ -1154,14 +1258,14 @@ def num_nodes(self, tree_num: int) -> int: ---------- tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of nodes in tree `tree_num`. 
""" return self.forest_cpp.NumNodes(tree_num) - + def num_leaves(self, tree_num: int) -> int: """ Number of leaves in a given tree of a forest @@ -1170,14 +1274,14 @@ def num_leaves(self, tree_num: int) -> int: ---------- tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of leaves in tree `tree_num`. """ return self.forest_cpp.NumLeaves(tree_num) - + def num_leaf_parents(self, tree_num: int) -> int: """ Number of leaf parents in a given tree of a forest @@ -1186,14 +1290,14 @@ def num_leaf_parents(self, tree_num: int) -> int: ---------- tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of leaf parents in tree `tree_num`. """ return self.forest_cpp.NumLeafParents(tree_num) - + def num_split_nodes(self, tree_num: int) -> int: """ Number of split_nodes in a given tree of a forest @@ -1202,14 +1306,14 @@ def num_split_nodes(self, tree_num: int) -> int: ---------- tree_num : int Index of the tree to be queried - + Returns ------- - int + int Total number of split nodes in tree `tree_num`. """ return self.forest_cpp.NumSplitNodes(tree_num) - + def nodes(self, tree_num: int) -> np.array: """ Array of node indices in a given tree of a forest @@ -1218,14 +1322,14 @@ def nodes(self, tree_num: int) -> np.array: ---------- tree_num : int Index of the tree to be queried - + Returns ------- - np.array + np.array Array of indices of nodes in tree `tree_num`. """ return self.forest_cpp.Nodes(tree_num) - + def leaves(self, tree_num: int) -> np.array: """ Array of leaf indices in a given tree of a forest @@ -1234,10 +1338,10 @@ def leaves(self, tree_num: int) -> np.array: ---------- tree_num : int Index of the tree to be queried - + Returns ------- - np.array + np.array Array of indices of leaf nodes in tree `tree_num`. """ return self.forest_cpp.Leaves(tree_num) diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 35633264..73d76655 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -3,6 +3,7 @@ Copyright (c) 2007-2024 The scikit-learn developers. 
""" + from typing import Union, Optional, Any, Dict from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder import numpy as np @@ -11,127 +12,132 @@ import warnings from .serialization import JSONSerializer -def _preprocess_params(default_params: Dict[str, Any], user_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + +def _preprocess_params( + default_params: Dict[str, Any], user_params: Optional[Dict[str, Any]] = None +) -> Dict[str, Any]: if user_params: for key, value in user_params.items(): if key in default_params: default_params[key] = value - + return default_params def _preprocess_bart_params(params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: processed_params = { - 'cutpoint_grid_size' : 100, - 'sigma_leaf' : None, - 'alpha_mean' : 0.95, - 'beta_mean' : 2.0, - 'min_samples_leaf_mean' : 5, - 'max_depth_mean' : 10, - 'alpha_variance' : 0.95, - 'beta_variance' : 2.0, - 'min_samples_leaf_variance' : 5, - 'max_depth_variance' : 10, - 'a_global' : 0, - 'b_global' : 0, - 'a_leaf' : 3, - 'b_leaf' : None, - 'a_forest' : None, - 'b_forest' : None, - 'sigma2_init' : None, - 'variance_forest_leaf_init' : None, - 'pct_var_sigma2_init' : 1, - 'pct_var_variance_forest_init' : 1, - 'variance_scale' : 1, - 'variable_weights_mean' : None, - 'variable_weights_variance' : None, - 'num_trees_mean' : 200, - 'num_trees_variance' : 0, - 'sample_sigma_global' : True, - 'sample_sigma_leaf' : True, - 'random_seed' : -1, - 'keep_burnin' : False, - 'keep_gfr' : False, - 'standardize': True, - 'num_chains' : 1, - 'keep_every' : 1 + "cutpoint_grid_size": 100, + "sigma_leaf": None, + "alpha_mean": 0.95, + "beta_mean": 2.0, + "min_samples_leaf_mean": 5, + "max_depth_mean": 10, + "alpha_variance": 0.95, + "beta_variance": 2.0, + "min_samples_leaf_variance": 5, + "max_depth_variance": 10, + "a_global": 0, + "b_global": 0, + "a_leaf": 3, + "b_leaf": None, + "a_forest": None, + "b_forest": None, + "sigma2_init": None, + "variance_forest_leaf_init": None, + "pct_var_sigma2_init": 1, + "pct_var_variance_forest_init": 1, + "variance_scale": 1, + "variable_weights_mean": None, + "variable_weights_variance": None, + "num_trees_mean": 200, + "num_trees_variance": 0, + "sample_sigma_global": True, + "sample_sigma_leaf": True, + "random_seed": -1, + "keep_burnin": False, + "keep_gfr": False, + "standardize": True, + "num_chains": 1, + "keep_every": 1, } - + if params: for key, value in params.items(): if key not in processed_params: - raise ValueError(f'Parameter {key} not a valid BART parameter') + raise ValueError(f"Parameter {key} not a valid BART parameter") processed_params[key] = value - + return processed_params def _preprocess_bcf_params(params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: processed_params = { - 'cutpoint_grid_size': 100, - 'sigma_leaf_mu': None, - 'sigma_leaf_tau': None, - 'alpha_mu': 0.95, - 'alpha_tau': 0.25, - 'alpha_variance': 0.95, - 'beta_mu': 2.0, - 'beta_tau': 3.0, - 'beta_variance': 2.0, - 'min_samples_leaf_mu': 5, - 'min_samples_leaf_tau': 5, - 'min_samples_leaf_variance': 5, - 'max_depth_mu': 10, - 'max_depth_tau': 5, - 'max_depth_variance': 10, - 'a_global': 0, - 'b_global': 0, - 'a_leaf_mu': 3, - 'a_leaf_tau': 3, - 'b_leaf_mu': None, - 'b_leaf_tau': None, - 'a_forest' : None, - 'b_forest' : None, - 'sigma2_init': None, - 'variance_forest_leaf_init' : None, - 'pct_var_sigma2_init': 1, - 'pct_var_variance_forest_init' : 1, - 'variable_weights_mu': None, - 'variable_weights_tau': None, - 'variable_weights_variance': None, - 'keep_vars_mu': None, - 
'drop_vars_mu': None, - 'keep_vars_tau': None, - 'drop_vars_tau': None, - 'keep_vars_variance': None, - 'drop_vars_variance': None, - 'num_trees_mu': 200, - 'num_trees_tau': 50, - 'num_trees_variance': 0, - 'sample_sigma_global': True, - 'sample_sigma_leaf_mu': True, - 'sample_sigma_leaf_tau': False, - 'propensity_covariate': "mu", - 'adaptive_coding': True, - 'b_0': -0.5, - 'b_1': 0.5, - 'random_seed': -1, - 'keep_burnin': False, - 'keep_gfr': False, - 'standardize': True, - 'num_chains' : 1, - 'keep_every' : 1 + "cutpoint_grid_size": 100, + "sigma_leaf_mu": None, + "sigma_leaf_tau": None, + "alpha_mu": 0.95, + "alpha_tau": 0.25, + "alpha_variance": 0.95, + "beta_mu": 2.0, + "beta_tau": 3.0, + "beta_variance": 2.0, + "min_samples_leaf_mu": 5, + "min_samples_leaf_tau": 5, + "min_samples_leaf_variance": 5, + "max_depth_mu": 10, + "max_depth_tau": 5, + "max_depth_variance": 10, + "a_global": 0, + "b_global": 0, + "a_leaf_mu": 3, + "a_leaf_tau": 3, + "b_leaf_mu": None, + "b_leaf_tau": None, + "a_forest": None, + "b_forest": None, + "sigma2_init": None, + "variance_forest_leaf_init": None, + "pct_var_sigma2_init": 1, + "pct_var_variance_forest_init": 1, + "variable_weights_mu": None, + "variable_weights_tau": None, + "variable_weights_variance": None, + "keep_vars_mu": None, + "drop_vars_mu": None, + "keep_vars_tau": None, + "drop_vars_tau": None, + "keep_vars_variance": None, + "drop_vars_variance": None, + "num_trees_mu": 200, + "num_trees_tau": 50, + "num_trees_variance": 0, + "sample_sigma_global": True, + "sample_sigma_leaf_mu": True, + "sample_sigma_leaf_tau": False, + "propensity_covariate": "mu", + "adaptive_coding": True, + "b_0": -0.5, + "b_1": 0.5, + "random_seed": -1, + "keep_burnin": False, + "keep_gfr": False, + "standardize": True, + "num_chains": 1, + "keep_every": 1, } - + if params: for key, value in params.items(): if key not in processed_params: - raise ValueError(f'Parameter {key} not a valid BCF parameter') + raise ValueError(f"Parameter {key} not a valid BCF parameter") processed_params[key] = value - + return processed_params -def _map_to_integer(values: Union[np.array, list], uniques: Union[np.array, list]) -> np.array: +def _map_to_integer( + values: Union[np.array, list], uniques: Union[np.array, list] +) -> np.array: r""" Slightly modified version of a [scikit-learn function](https://github.com/scikit-learn/scikit-learn/blob/43d440f1f874ac2117ed848b10a6f07d9083488d/sklearn/utils/_encode.py#L170) by the same name. Converts dataframe column values (which might be string, categorical, etc...) to numpy integer indices. @@ -151,12 +157,13 @@ class CovariatePreprocessor: r""" Preprocessing engine for covariates provided as either `np.array` or `pd.DataFrame`, which standardizes inputs as a `np.array`. - `CovariatePreprocessor` uses [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) in provided - dataframes to convert string / categorical variables to numeric variables, either by mapping ordinal variables to integers + `CovariatePreprocessor` uses [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) in provided + dataframes to convert string / categorical variables to numeric variables, either by mapping ordinal variables to integers or by one-hot encoding unordered categorical variables. - + This class is modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html). 
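# Illustrative sketch (not part of the patch) of how these helpers merge
# user-supplied parameters into the defaults: _preprocess_params() silently
# ignores keys absent from the defaults, while _preprocess_bart_params() and
# _preprocess_bcf_params() raise a ValueError for unrecognized keys. These are
# private helpers, so the import path below is an assumption and may change.
from stochtree.preprocessing import _preprocess_bart_params

params = _preprocess_bart_params({"num_trees_mean": 100, "keep_gfr": True})
assert params["num_trees_mean"] == 100    # user override applied
assert params["alpha_mean"] == 0.95       # untouched default retained

try:
    _preprocess_bart_params({"num_trees": 100})   # not a valid BART key
except ValueError as err:
    print(err)                                     # "Parameter num_trees not a valid BART parameter"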
""" + def __init__(self) -> None: self._is_fitted = False self._num_ordinal_features = 0 @@ -169,24 +176,31 @@ def __init__(self) -> None: self._processed_feature_types = None self._original_feature_types = None self._original_feature_indices = None - + def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool: - if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f": + if ( + dtype.kind == "b" + or dtype.kind == "i" + or dtype.kind == "u" + or dtype.kind == "f" + ): return True else: return False - + def _extract_categories_unordered_categorical(self, covariate: pd.Series) -> int: covariate_categories = covariate.array.categories.to_numpy() self._onehot_categories_list.append(covariate_categories) return self._num_onehot_features - + def _extract_categories_ordered_categorical(self, covariate: pd.Series) -> int: covariate_categories = covariate.array.categories.to_numpy() self._ordinal_categories_list.append(covariate_categories) return self._num_ordinal_features - - def _transform_unordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array: + + def _transform_unordered_categorical( + self, covariate: pd.Series, covariate_categories: np.array + ) -> np.array: """ Adapted from https://github.com/scikit-learn/scikit-learn/blob/8f2c1cab50262bcf4a1ade070446c40028ee27f4/sklearn/preprocessing/_encoders.py#L1000 """ @@ -201,25 +215,39 @@ def _transform_unordered_categorical(self, covariate: pd.Series, covariate_categ dtype=np.float64, ) return out.toarray() - - def _transform_ordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array: + + def _transform_ordered_categorical( + self, covariate: pd.Series, covariate_categories: np.array + ) -> np.array: covariate_data = covariate.array.to_numpy() return _map_to_integer(covariate_data, covariate_categories) def _fit_pandas(self, covariates: pd.DataFrame) -> None: self._num_original_features = covariates.shape[1] - self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) - self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) + self._ordinal_feature_index = np.array( + [-1 for i in range(self._num_original_features)], dtype=int + ) + self._onehot_feature_index = np.array( + [-1 for i in range(self._num_original_features)], dtype=int + ) original_feature_types = [-1 for i in range(self._num_original_features)] - datetime_types = covariates.apply(lambda x: pd.api.types.is_datetime64_any_dtype(x)) + datetime_types = covariates.apply( + lambda x: pd.api.types.is_datetime64_any_dtype(x) + ) object_types = covariates.apply(lambda x: pd.api.types.is_object_dtype(x)) - interval_types = covariates.apply(lambda x: isinstance(x.dtype, pd.IntervalDtype)) + interval_types = covariates.apply( + lambda x: isinstance(x.dtype, pd.IntervalDtype) + ) period_types = covariates.apply(lambda x: isinstance(x.dtype, pd.PeriodDtype)) - timedelta_types = np.logical_or(covariates.apply(lambda x: pd.api.types.is_timedelta64_dtype(x)), - covariates.apply(lambda x: pd.api.types.is_timedelta64_ns_dtype(x))) + timedelta_types = np.logical_or( + covariates.apply(lambda x: pd.api.types.is_timedelta64_dtype(x)), + covariates.apply(lambda x: pd.api.types.is_timedelta64_ns_dtype(x)), + ) sparse_types = covariates.apply(lambda x: isinstance(x.dtype, pd.SparseDtype)) bool_types = covariates.apply(lambda x: pd.api.types.is_bool_dtype(x)) - categorical_types = covariates.apply(lambda x: isinstance(x.dtype, 
pd.CategoricalDtype)) + categorical_types = covariates.apply( + lambda x: isinstance(x.dtype, pd.CategoricalDtype) + ) float_types = covariates.apply(lambda x: pd.api.types.is_float_dtype(x)) integer_types = covariates.apply(lambda x: pd.api.types.is_integer_dtype(x)) string_types = covariates.apply(lambda x: pd.api.types.is_string_dtype(x)) @@ -253,10 +281,10 @@ def _fit_pandas(self, covariates: pd.DataFrame) -> None: object_cols = covariates.columns[object_types].to_list() warn_msg = "The following columns are a type unsupported by stochtree (object) and will be ignored: {}" warnings.warn(warn_msg.format(object_cols)) - + processed_feature_types = [] for i in range(covariates.shape[1]): - covariate = covariates.iloc[:,i] + covariate = covariates.iloc[:, i] if categorical_types.iloc[i]: original_feature_types[i] = "category" if covariate.array.ordered: @@ -265,16 +293,22 @@ def _fit_pandas(self, covariates: pd.DataFrame) -> None: processed_feature_types.append(1) self._num_ordinal_features += 1 else: - onehot_index = self._extract_categories_unordered_categorical(covariate) + onehot_index = self._extract_categories_unordered_categorical( + covariate + ) self._onehot_feature_index[i] = onehot_index - feature_ones = np.repeat(1, len(covariate.array.categories)).tolist() + feature_ones = np.repeat( + 1, len(covariate.array.categories) + ).tolist() processed_feature_types.extend(feature_ones) self._num_onehot_features += 1 elif string_types.iloc[i]: original_feature_types[i] = "string" onehot_index = self._extract_categories_unordered_categorical(covariate) self._onehot_feature_index[i] = onehot_index - feature_ones = np.repeat(1, len(self._onehot_encoders[onehot_index].categories_[0])).tolist() + feature_ones = np.repeat( + 1, len(self._onehot_encoders[onehot_index].categories_[0]) + ).tolist() processed_feature_types.extend(feature_ones) elif bool_types.iloc[i]: original_feature_types[i] = "boolean" @@ -287,20 +321,26 @@ def _fit_pandas(self, covariates: pd.DataFrame) -> None: processed_feature_types.append(0) else: original_feature_types[i] = "unsupported" - + self._processed_feature_types = np.array(processed_feature_types, dtype=int) self._original_feature_types = np.array(original_feature_types) - + def _fit_numpy(self, covariates: np.array) -> None: if covariates.ndim == 1: covariates = np.expand_dims(covariates, 1) elif covariates.ndim > 2: raise ValueError("Covariates passed as a numpy array must be 1d or 2d") - + self._num_original_features = covariates.shape[1] - self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) - self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) - self._original_feature_types = np.array(["float" for i in range(self._num_original_features)]) + self._ordinal_feature_index = np.array( + [-1 for i in range(self._num_original_features)], dtype=int + ) + self._onehot_feature_index = np.array( + [-1 for i in range(self._num_original_features)], dtype=int + ) + self._original_feature_types = np.array( + ["float" for i in range(self._num_original_features)] + ) # Check whether the array is numeric cov_dtype = covariates.dtype @@ -312,18 +352,20 @@ def _fit_numpy(self, covariates: np.array) -> None: if not self._check_is_numeric_dtype(cov_dtype[i]): array_numeric = False if not array_numeric: - raise ValueError("Covariates passed as np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)") - + raise ValueError( + "Covariates passed as 
np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)" + ) + # Scan for binary columns processed_feature_types = [] for i in range(self._num_original_features): - num_unique = np.unique(covariates[:,i]).size + num_unique = np.unique(covariates[:, i]).size if num_unique == 2: processed_feature_types.append(1) else: processed_feature_types.append(0) # TODO: Convert to integer if not passed as integer - + self._processed_feature_types = np.array(processed_feature_types, dtype=int) def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: @@ -337,43 +379,59 @@ def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: if self._num_original_features != covariates.shape[1]: - raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") - - output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64) + raise ValueError( + "Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality" + ) + + output_array = np.empty( + (covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64 + ) output_iter = 0 original_feature_indices = [] print(self._original_feature_types) for i in range(covariates.shape[1]): - covariate = covariates.iloc[:,i] - if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string": + covariate = covariates.iloc[:, i] + if ( + self._original_feature_types[i] == "category" + or self._original_feature_types[i] == "string" + ): if self._ordinal_feature_index[i] != -1: ord_ind = self._ordinal_feature_index[i] covariate_categories = self._ordinal_categories_list[ord_ind] - covariate_transformed = self._transform_ordered_categorical(covariate, covariate_categories) - output_array[:,output_iter] = np.squeeze(covariate_transformed) + covariate_transformed = self._transform_ordered_categorical( + covariate, covariate_categories + ) + output_array[:, output_iter] = np.squeeze(covariate_transformed) output_iter += 1 original_feature_indices.append(i) else: onehot_ind = self._onehot_feature_index[i] covariate_categories = self._onehot_categories_list[onehot_ind] - covariate_transformed = self._transform_unordered_categorical(covariate, covariate_categories) + covariate_transformed = self._transform_unordered_categorical( + covariate, covariate_categories + ) output_dim = covariate_transformed.shape[1] - output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed) + output_array[ + :, np.arange(output_iter, output_iter + output_dim) + ] = np.squeeze(covariate_transformed) output_iter += output_dim original_feature_indices.extend([i for _ in range(output_dim)]) - + elif self._original_feature_types[i] == "boolean": - output_array[:,output_iter] = (covariate*1.0).to_numpy() + output_array[:, output_iter] = (covariate * 1.0).to_numpy() output_iter += 1 original_feature_indices.append(i) - - elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": - output_array[:,output_iter] = (covariate).to_numpy() + + elif ( + self._original_feature_types[i] == "integer" + or self._original_feature_types[i] == "float" + ): + output_array[:, output_iter] = (covariate).to_numpy() output_iter += 1 original_feature_indices.append(i) - + self._original_feature_indices = np.array(original_feature_indices, 
dtype=int) - + return output_array def _transform_numpy(self, covariates: np.array) -> np.array: @@ -382,8 +440,12 @@ def _transform_numpy(self, covariates: np.array) -> np.array: elif covariates.ndim > 2: raise ValueError("Covariates passed as a numpy array must be 1d or 2d") if self._num_original_features != covariates.shape[1]: - raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") - self._original_feature_indices = np.array([i for i in range(covariates.shape[1])]) + raise ValueError( + "Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality" + ) + self._original_feature_indices = np.array( + [i for i in range(covariates.shape[1])] + ) return covariates def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: @@ -395,7 +457,9 @@ def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: else: raise ValueError("covariates must be a pd.DataFrame or a np.array") else: - raise ValueError("Attempting to call transform() from an CovariateTransformer that has not yet been fit") + raise ValueError( + "Attempting to call transform() from an CovariateTransformer that has not yet been fit" + ) def _check_is_fitted(self) -> bool: return self._is_fitted @@ -404,9 +468,9 @@ def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: r"""Fits a `CovariatePreprocessor` by unpacking (and storing) data type information on the input (raw) covariates and then converting to a numpy array which can be passed to a tree ensemble sampler. - If `covariates` is a `pd.DataFrame`, [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) + If `covariates` is a `pd.DataFrame`, [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) will be handled as follows: - + * `category`: one-hot encoded if unordered, ordinal encoded if ordered * `string`: one-hot encoded * `boolean`: passed through as binary integer, treated as ordered categorical by tree samplers @@ -419,11 +483,11 @@ def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: * Sparse (i.e. `Sparse`, `Sparse[float]`): currently unsupported, convert sparse columns to dense before passing Columns with unsupported types will be ignored, with a warning. - - If `covariates` is a `np.array`, columns must be numeric and the only preprocessing done by `CovariateTransformer.fit()` is to - auto-detect binary columns. All other integer-valued columns will be passed through to the tree sampler as (continuous) numeric data. - If you would like to treat integer-valued data as categorical, you can either convert your numpy array to a pandas dataframe and - explicitly tag such columns as ordered / unordered categorical, or preprocess manually using `sklearn.preprocessing.OneHotEncoder` + + If `covariates` is a `np.array`, columns must be numeric and the only preprocessing done by `CovariateTransformer.fit()` is to + auto-detect binary columns. All other integer-valued columns will be passed through to the tree sampler as (continuous) numeric data. + If you would like to treat integer-valued data as categorical, you can either convert your numpy array to a pandas dataframe and + explicitly tag such columns as ordered / unordered categorical, or preprocess manually using `sklearn.preprocessing.OneHotEncoder` and `sklearn.preprocessing.OrdinalEncoder`. 
Parameters @@ -435,20 +499,20 @@ def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: return self def transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: - r"""Run a fitted a `CovariateTransformer` on a new covariate set, - returning a numpy array of covariates preprocessed into a format needed + r"""Run a fitted a `CovariateTransformer` on a new covariate set, + returning a numpy array of covariates preprocessed into a format needed to sample or predict from a `stochtree` ensemble. Parameters ---------- covariates : np.array or pd.DataFrame Covariates to be preprocessed. - + Returns ------- np.array - Numpy array of preprocessed covariates, with as many rows as in `covariates` - and as many columns as were created during pre-processing (including one-hot encoding + Numpy array of preprocessed covariates, with as many rows as in `covariates` + and as many columns as were created during pre-processing (including one-hot encoding categorical features). """ return self._transform(covariates) @@ -460,36 +524,36 @@ def fit_transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: ---------- covariates : np.array or pd.DataFrame Covariates to be preprocessed. - + Returns ------- np.array - Numpy array of preprocessed covariates, with as many rows as in `covariates` - and as many columns as were created during pre-processing (including one-hot encoding + Numpy array of preprocessed covariates, with as many rows as in `covariates` + and as many columns as were created during pre-processing (including one-hot encoding categorical features). """ self._fit(covariates) return self._transform(covariates) - + def fetch_original_feature_indices(self) -> list: - r"""Map features in a preprocessed covariate set back to the + r"""Map features in a preprocessed covariate set back to the original set of features provided to a `CovariateTransformer`. Returns ------- list - List with as many entries as features in the preprocessed results - returned by a fitted `CovariateTransformer`. Each element is a feature + List with as many entries as features in the preprocessed results + returned by a fitted `CovariateTransformer`. Each element is a feature index indicating the feature from which a given preprocessed feature was generated. - If a single categorical feature were one-hot encoded into 5 binary features, + If a single categorical feature were one-hot encoded into 5 binary features, this method would return a list `[0,0,0,0,0]`. If the transformer merely passes through `k` numeric features, this method would return a list `[0,...,k-1]`. 
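# Illustrative sketch (not part of the patch) of the documented
# CovariatePreprocessor workflow on a small mixed-type dataframe: the numeric
# column passes through, the unordered categorical column is one-hot encoded,
# and fetch_original_feature_indices() repeats its column index once per
# encoded level. The to_json()/from_json() round trip uses the serialization
# methods defined just below.
import numpy as np
import pandas as pd
from stochtree import CovariatePreprocessor

df = pd.DataFrame({
    "x1": np.random.uniform(size=100),
    "x2": pd.Categorical(np.random.choice(["a", "b", "c"], size=100), ordered=False),
})
preprocessor = CovariatePreprocessor()
X = preprocessor.fit_transform(df)                     # shape (100, 4): 1 numeric + 3 one-hot
print(preprocessor.fetch_original_feature_indices())   # [0, 1, 1, 1]

json_state = preprocessor.to_json()                    # serialize ...
restored = CovariatePreprocessor()
restored.from_json(json_state)                         # ... restore, then transform new data
X_new = restored.transform(df)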
""" return self._original_feature_indices.tolist() - + def to_json(self) -> str: """ - Converts a covariate preprocessor to JSON string representation (which can then be saved to a file or + Converts a covariate preprocessor to JSON string representation (which can then be saved to a file or processed using the `json` library) Returns @@ -499,12 +563,16 @@ def to_json(self) -> str: """ # Initialize JSONSerializer object preprocessor_json = JSONSerializer() - + # Add internal scalars preprocessor_json.add_boolean("is_fitted", self._is_fitted) - preprocessor_json.add_integer("num_ordinal_features", self._num_ordinal_features) + preprocessor_json.add_integer( + "num_ordinal_features", self._num_ordinal_features + ) preprocessor_json.add_integer("num_onehot_features", self._num_onehot_features) - preprocessor_json.add_integer("num_original_features", self._num_original_features) + preprocessor_json.add_integer( + "num_original_features", self._num_original_features + ) # Add internal lists for i in range(self._num_ordinal_features): @@ -512,33 +580,61 @@ def to_json(self) -> str: list_name = "cats_{:d}".format(i) if np.issubdtype(self._ordinal_categories_list[i].dtype, np.integer): array_type = "int" - preprocessor_json.add_integer_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list") + preprocessor_json.add_integer_vector( + list_name, + self._ordinal_categories_list[i], + "ordinal_categories_list", + ) elif np.issubdtype(self._ordinal_categories_list[i].dtype, np.floating): array_type = "float" - preprocessor_json.add_numeric_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list") + preprocessor_json.add_numeric_vector( + list_name, + self._ordinal_categories_list[i], + "ordinal_categories_list", + ) else: array_type = "str" - preprocessor_json.add_string_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list") + preprocessor_json.add_string_vector( + list_name, + self._ordinal_categories_list[i], + "ordinal_categories_list", + ) preprocessor_json.add_string(dtype_name, array_type, "ordinal_dtype_list") for i in range(self._num_onehot_features): dtype_name = "dtype_{:d}".format(i) list_name = "cats_{:d}".format(i) if np.issubdtype(self._onehot_categories_list[i].dtype, np.integer): array_type = "int" - preprocessor_json.add_integer_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list") + preprocessor_json.add_integer_vector( + list_name, self._onehot_categories_list[i], "onehot_categories_list" + ) elif np.issubdtype(self._onehot_categories_list[i].dtype, np.floating): array_type = "float" - preprocessor_json.add_numeric_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list") + preprocessor_json.add_numeric_vector( + list_name, self._onehot_categories_list[i], "onehot_categories_list" + ) else: array_type = "str" - preprocessor_json.add_string_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list") + preprocessor_json.add_string_vector( + list_name, self._onehot_categories_list[i], "onehot_categories_list" + ) preprocessor_json.add_string(dtype_name, array_type, "onehot_dtype_list") - preprocessor_json.add_integer_vector("ordinal_feature_index", self._ordinal_feature_index) - preprocessor_json.add_integer_vector("onehot_feature_index", self._onehot_feature_index) - preprocessor_json.add_integer_vector("processed_feature_types", self._processed_feature_types) - preprocessor_json.add_string_vector("original_feature_types", self._original_feature_types) - 
preprocessor_json.add_integer_vector("original_feature_indices", self._original_feature_indices) - + preprocessor_json.add_integer_vector( + "ordinal_feature_index", self._ordinal_feature_index + ) + preprocessor_json.add_integer_vector( + "onehot_feature_index", self._onehot_feature_index + ) + preprocessor_json.add_integer_vector( + "processed_feature_types", self._processed_feature_types + ) + preprocessor_json.add_string_vector( + "original_feature_types", self._original_feature_types + ) + preprocessor_json.add_integer_vector( + "original_feature_indices", self._original_feature_indices + ) + return preprocessor_json.return_json_string() def from_json(self, json_string: str) -> None: @@ -553,12 +649,16 @@ def from_json(self, json_string: str) -> None: # Parse string to a JSON object in C++ preprocessor_json = JSONSerializer() preprocessor_json.load_from_json_string(json_string) - + # Unpack internal scalars self._is_fitted = preprocessor_json.get_boolean("is_fitted") - self._num_ordinal_features = preprocessor_json.get_integer("num_ordinal_features") + self._num_ordinal_features = preprocessor_json.get_integer( + "num_ordinal_features" + ) self._num_onehot_features = preprocessor_json.get_integer("num_onehot_features") - self._num_original_features = preprocessor_json.get_integer("num_original_features") + self._num_original_features = preprocessor_json.get_integer( + "num_original_features" + ) # Unpack internal lists self._ordinal_categories_list = [] @@ -567,24 +667,60 @@ def from_json(self, json_string: str) -> None: list_name = "cats_{:d}".format(i) array_type = preprocessor_json.get_string(dtype_name, "ordinal_dtype_list") if array_type == "int": - self._ordinal_categories_list.append(preprocessor_json.get_integer_vector(list_name, "ordinal_categories_list")) + self._ordinal_categories_list.append( + preprocessor_json.get_integer_vector( + list_name, "ordinal_categories_list" + ) + ) elif array_type == "float": - self._ordinal_categories_list.append(preprocessor_json.get_numeric_vector(list_name, "ordinal_categories_list")) + self._ordinal_categories_list.append( + preprocessor_json.get_numeric_vector( + list_name, "ordinal_categories_list" + ) + ) else: - self._ordinal_categories_list.append(preprocessor_json.get_string_vector(list_name, "ordinal_categories_list")) + self._ordinal_categories_list.append( + preprocessor_json.get_string_vector( + list_name, "ordinal_categories_list" + ) + ) self._onehot_categories_list = [] for i in range(self._num_onehot_features): dtype_name = "dtype_{:d}".format(i) list_name = "cats_{:d}".format(i) array_type = preprocessor_json.get_string(dtype_name, "onehot_dtype_list") if array_type == "int": - self._onehot_categories_list.append(preprocessor_json.get_integer_vector(list_name, "onehot_categories_list")) + self._onehot_categories_list.append( + preprocessor_json.get_integer_vector( + list_name, "onehot_categories_list" + ) + ) elif array_type == "float": - self._onehot_categories_list.append(preprocessor_json.get_numeric_vector(list_name, "onehot_categories_list")) + self._onehot_categories_list.append( + preprocessor_json.get_numeric_vector( + list_name, "onehot_categories_list" + ) + ) else: - self._onehot_categories_list.append(np.array(preprocessor_json.get_string_vector(list_name, "onehot_categories_list"))) - self._ordinal_feature_index = preprocessor_json.get_integer_vector("ordinal_feature_index") - self._onehot_feature_index = preprocessor_json.get_integer_vector("onehot_feature_index") - self._processed_feature_types = 
preprocessor_json.get_integer_vector("processed_feature_types")
-        self._original_feature_types = preprocessor_json.get_string_vector("original_feature_types")
-        self._original_feature_indices = preprocessor_json.get_integer_vector("original_feature_indices")
+            self._onehot_categories_list.append(
+                np.array(
+                    preprocessor_json.get_string_vector(
+                        list_name, "onehot_categories_list"
+                    )
+                )
+            )
+        self._ordinal_feature_index = preprocessor_json.get_integer_vector(
+            "ordinal_feature_index"
+        )
+        self._onehot_feature_index = preprocessor_json.get_integer_vector(
+            "onehot_feature_index"
+        )
+        self._processed_feature_types = preprocessor_json.get_integer_vector(
+            "processed_feature_types"
+        )
+        self._original_feature_types = preprocessor_json.get_string_vector(
+            "original_feature_types"
+        )
+        self._original_feature_indices = preprocessor_json.get_integer_vector(
+            "original_feature_indices"
+        )
diff --git a/stochtree/sampler.py b/stochtree/sampler.py
index a5876099..95f2e353 100644
--- a/stochtree/sampler.py
+++ b/stochtree/sampler.py
@@ -1,24 +1,32 @@
 """
 Python classes wrapping C++ sampler objects
 """
+
 import numpy as np
 from .data import Dataset, Residual
 from .forest import ForestContainer, Forest
-from stochtree_cpp import RngCpp, ForestSamplerCpp, GlobalVarianceModelCpp, LeafVarianceModelCpp
+from stochtree_cpp import (
+    RngCpp,
+    ForestSamplerCpp,
+    GlobalVarianceModelCpp,
+    LeafVarianceModelCpp,
+)
 from typing import Union
 
+
 class RNG:
     """
-    Wrapper around the C++ standard library random number generator. 
+    Wrapper around the C++ standard library random number generator.
     Accepts an optional random seed at initialization for replicability.
 
     Parameters
     ----------
     random_seed : int, optional
-        Random seed for replicability. If not specified, the default value of `-1` 
-        triggers an initialization of the RNG based on 
+        Random seed for replicability. If not specified, the default value of `-1`
+        triggers an initialization of the RNG based on
        [std::random_device](https://en.cppreference.com/w/cpp/numeric/random/random_device).
     """
+
     def __init__(self, random_seed: int = -1) -> None:
         self.rng_cpp = RngCpp(random_seed)
 
@@ -32,8 +40,8 @@ class ForestSampler:
     dataset : Dataset
         `stochtree` dataset object storing covariates / bases / weights
     feature_types : np.array
-        Array of integer-coded values indicating the column type of each feature in `dataset`. 
-        Integer codes map `0` to "numeric" (continuous), `1` to "ordered categorical, and `2` to 
+        Array of integer-coded values indicating the column type of each feature in `dataset`.
+        Integer codes map `0` to "numeric" (continuous), `1` to "ordered categorical", and `2` to
         "unordered categorical".
     num_trees : int
         Number of trees in the forest model that this sampler class will fit.
@@ -48,10 +56,32 @@ class ForestSampler:
     max_depth : int, optional
         Maximum depth of any tree in the ensemble in a forest model. 
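+
+    A minimal construction sketch, assuming `dataset` is an already-built
+    `Dataset` with `n` observations and `p` numeric covariates (the `Dataset`
+    construction, and the names `n` and `p`, are hypothetical placeholders):
+
+    >>> import numpy as np
+    >>> sampler = ForestSampler(
+    ...     dataset,
+    ...     feature_types=np.zeros(p, dtype=int),  # code 0 = numeric
+    ...     num_trees=100,
+    ...     num_obs=n,
+    ...     alpha=0.95,
+    ...     beta=2.0,
+    ...     min_samples_leaf=5,
+    ... )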
""" - def __init__(self, dataset: Dataset, feature_types: np.array, num_trees: int, num_obs: int, alpha: float, beta: float, min_samples_leaf: int, max_depth: int = -1) -> None: - self.forest_sampler_cpp = ForestSamplerCpp(dataset.dataset_cpp, feature_types, num_trees, num_obs, alpha, beta, min_samples_leaf, max_depth) - - def reconstitute_from_forest(self, forest: Forest, dataset: Dataset, residual: Residual, is_mean_model: bool) -> None: + + def __init__( + self, + dataset: Dataset, + feature_types: np.array, + num_trees: int, + num_obs: int, + alpha: float, + beta: float, + min_samples_leaf: int, + max_depth: int = -1, + ) -> None: + self.forest_sampler_cpp = ForestSamplerCpp( + dataset.dataset_cpp, + feature_types, + num_trees, + num_obs, + alpha, + beta, + min_samples_leaf, + max_depth, + ) + + def reconstitute_from_forest( + self, forest: Forest, dataset: Dataset, residual: Residual, is_mean_model: bool + ) -> None: """ Re-initialize a forest sampler tracking data structures from a specific forest in a `ForestContainer` @@ -66,12 +96,29 @@ def reconstitute_from_forest(self, forest: Forest, dataset: Dataset, residual: R is_mean_model : bool Indicator of whether the model being updated a conditional mean model (`True`) or a conditional variance model (`False`) """ - self.forest_sampler_cpp.ReconstituteTrackerFromForest(forest.forest_cpp, dataset.dataset_cpp, residual.residual_cpp, is_mean_model) - - def sample_one_iteration(self, forest_container: ForestContainer, forest: Forest, dataset: Dataset, - residual: Residual, rng: RNG, feature_types: np.array, cutpoint_grid_size: int, - leaf_model_scale_input: np.array, variable_weights: np.array, a_forest: float, b_forest: float, - global_variance: float, leaf_model_int: int, keep_forest: bool, gfr: bool, pre_initialized: bool) -> None: + self.forest_sampler_cpp.ReconstituteTrackerFromForest( + forest.forest_cpp, dataset.dataset_cpp, residual.residual_cpp, is_mean_model + ) + + def sample_one_iteration( + self, + forest_container: ForestContainer, + forest: Forest, + dataset: Dataset, + residual: Residual, + rng: RNG, + feature_types: np.array, + cutpoint_grid_size: int, + leaf_model_scale_input: np.array, + variable_weights: np.array, + a_forest: float, + b_forest: float, + global_variance: float, + leaf_model_int: int, + keep_forest: bool, + gfr: bool, + pre_initialized: bool, + ) -> None: """ Sample one iteration of a forest using the specified model and tree sampling algorithm @@ -110,11 +157,33 @@ def sample_one_iteration(self, forest_container: ForestContainer, forest: Forest pre_initialized : bool Whether or not the forest being sampled has already been initialized """ - self.forest_sampler_cpp.SampleOneIteration(forest_container.forest_container_cpp, forest.forest_cpp, dataset.dataset_cpp, residual.residual_cpp, rng.rng_cpp, - feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, - a_forest, b_forest, global_variance, leaf_model_int, keep_forest, gfr, pre_initialized) - - def prepare_for_sampler(self, dataset: Dataset, residual: Residual, forest: Forest, leaf_model: int, initial_values: np.array) -> None: + self.forest_sampler_cpp.SampleOneIteration( + forest_container.forest_container_cpp, + forest.forest_cpp, + dataset.dataset_cpp, + residual.residual_cpp, + rng.rng_cpp, + feature_types, + cutpoint_grid_size, + leaf_model_scale_input, + variable_weights, + a_forest, + b_forest, + global_variance, + leaf_model_int, + keep_forest, + gfr, + pre_initialized, + ) + + def prepare_for_sampler( + self, + dataset: 
+        residual: Residual,
+        forest: Forest,
+        leaf_model: int,
+        initial_values: np.array,
+    ) -> None:
         """
         Initialize forest and tracking data structures with constant root values before running a sampler
 
@@ -131,13 +200,26 @@ def prepare_for_sampler(self, dataset: Dataset, residual: Residual, forest: Fore
         initial_values : np.array
             Constant root node value(s) at which to initialize forest prediction (internally, it is divided by the number of trees and typically it is 0 for mean models and 1 for variance models).
         """
-        self.forest_sampler_cpp.InitializeForestModel(dataset.dataset_cpp, residual.residual_cpp, forest.forest_cpp, leaf_model, initial_values)
-    
-    def adjust_residual(self, dataset: Dataset, residual: Residual, forest: Forest, requires_basis: bool, add: bool) -> None:
+        self.forest_sampler_cpp.InitializeForestModel(
+            dataset.dataset_cpp,
+            residual.residual_cpp,
+            forest.forest_cpp,
+            leaf_model,
+            initial_values,
+        )
+
+    def adjust_residual(
+        self,
+        dataset: Dataset,
+        residual: Residual,
+        forest: Forest,
+        requires_basis: bool,
+        add: bool,
+    ) -> None:
         """
-        Method that "adjusts" the residual used for training tree ensembles by either adding or subtracting the prediction of each tree to the existing residual. 
-        
-        This is typically run just once at the beginning of a forest sampling algorithm --- after trees are initialized with constant root node predictions, their 
+        Method that "adjusts" the residual used for training tree ensembles by either adding the prediction of each tree to, or subtracting it from, the existing residual.
+
+        This is typically run just once at the beginning of a forest sampling algorithm --- after trees are initialized with constant root node predictions, their
         root predictions are subtracted out of the residual.
 
         Parameters
@@ -153,15 +235,23 @@ def adjust_residual(self, dataset: Dataset, residual: Residual, forest: Forest,
         add : bool
             Whether the predictions of each tree are added (if `add=True`) or subtracted (`add=False`) from the outcome to form the new residual
         """
-        forest.forest_cpp.AdjustResidual(dataset.dataset_cpp, residual.residual_cpp, self.forest_sampler_cpp, requires_basis, add)
-    
-    def propagate_basis_update(self, dataset: Dataset, residual: Residual, forest: Forest) -> None:
+        forest.forest_cpp.AdjustResidual(
+            dataset.dataset_cpp,
+            residual.residual_cpp,
+            self.forest_sampler_cpp,
+            requires_basis,
+            add,
+        )
+
+    def propagate_basis_update(
+        self, dataset: Dataset, residual: Residual, forest: Forest
+    ) -> None:
         """
-        Propagates basis update through to the (full/partial) residual by iteratively (a) adding back in the previous prediction of each tree, (b) recomputing predictions 
+        Propagates basis update through to the (full/partial) residual by iteratively (a) adding back in the previous prediction of each tree, (b) recomputing predictions
         for each tree (caching on the C++ side), (c) subtracting the new predictions from the residual.
 
-        This is useful in cases where a basis (for e.g. leaf regression) is updated outside of a tree sampler (as with e.g. adaptive coding for binary treatment BCF). 
-        Once a basis has been updated, the overall "function" represented by a tree model has changed and this should be reflected through to the residual before the 
+        This is useful in cases where a basis (e.g. for leaf regression) is updated outside of a tree sampler (as with e.g. adaptive coding for binary treatment BCF).
+ Once a basis has been updated, the overall "function" represented by a tree model has changed and this should be reflected through to the residual before the next sampling loop is run. Parameters @@ -173,8 +263,10 @@ def propagate_basis_update(self, dataset: Dataset, residual: Residual, forest: F forest : Forest Stochtree object storing the "active" forest being sampled """ - self.forest_sampler_cpp.PropagateBasisUpdate(dataset.dataset_cpp, residual.residual_cpp, forest.forest_cpp) - + self.forest_sampler_cpp.PropagateBasisUpdate( + dataset.dataset_cpp, residual.residual_cpp, forest.forest_cpp + ) + def update_alpha(self, alpha: float) -> None: """ Update `alpha` in the tree prior @@ -185,7 +277,7 @@ def update_alpha(self, alpha: float) -> None: New value of `alpha` to be used """ self.forest_sampler_cpp.UpdateAlpha(alpha) - + def update_beta(self, beta: float) -> None: """ Update `beta` in the tree prior @@ -196,7 +288,7 @@ def update_beta(self, beta: float) -> None: New value of `beta` to be used """ self.forest_sampler_cpp.UpdateBeta(beta) - + def update_min_samples_leaf(self, min_samples_leaf: int) -> None: """ Update `min_samples_leaf` in the tree prior @@ -207,7 +299,7 @@ def update_min_samples_leaf(self, min_samples_leaf: int) -> None: New value of `min_samples_leaf` to be used """ self.forest_sampler_cpp.UpdateMinSamplesLeaf(min_samples_leaf) - + def update_max_depth(self, max_depth: int) -> None: """ Update `max_depth` in the tree prior @@ -222,13 +314,16 @@ def update_max_depth(self, max_depth: int) -> None: class GlobalVarianceModel: """ - Wrapper around methods / functions for sampling a "global" error variance model + Wrapper around methods / functions for sampling a "global" error variance model with [inverse gamma](https://en.wikipedia.org/wiki/Inverse-gamma_distribution) prior. """ + def __init__(self) -> None: self.variance_model_cpp = GlobalVarianceModelCpp() - - def sample_one_iteration(self, residual: Residual, rng: RNG, a: float, b: float) -> float: + + def sample_one_iteration( + self, residual: Residual, rng: RNG, a: float, b: float + ) -> float: """ Sample one iteration of a global error variance parameter @@ -242,27 +337,32 @@ def sample_one_iteration(self, residual: Residual, rng: RNG, a: float, b: float) Shape parameter for the inverse gamma error variance model b : float Scale parameter for the inverse gamma error variance model - + Returns ------- float - One draw from a Gibbs sampler for the error variance model, which depends - on the rest of the model only through the "full" residual stored in - a `Residual` object (net of predictions of any mean term such as a forest or + One draw from a Gibbs sampler for the error variance model, which depends + on the rest of the model only through the "full" residual stored in + a `Residual` object (net of predictions of any mean term such as a forest or an additive parametric fixed / random effect term). """ - return self.variance_model_cpp.SampleOneIteration(residual.residual_cpp, rng.rng_cpp, a, b) + return self.variance_model_cpp.SampleOneIteration( + residual.residual_cpp, rng.rng_cpp, a, b + ) class LeafVarianceModel: """ - Wrapper around methods / functions for sampling a "leaf scale" model for the variance term of a Gaussian + Wrapper around methods / functions for sampling a "leaf scale" model for the variance term of a Gaussian leaf model with [inverse gamma](https://en.wikipedia.org/wiki/Inverse-gamma_distribution) prior. 
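+
+    A minimal sampling sketch, assuming `forest` is an active `Forest` and
+    `rng` is a `stochtree` `RNG` (both constructed elsewhere; the shape `a`
+    and scale `b` values below are placeholders):
+
+    >>> leaf_var_model = LeafVarianceModel()
+    >>> tau = leaf_var_model.sample_one_iteration(forest, rng, a=3.0, b=0.5)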
""" + def __init__(self) -> None: self.variance_model_cpp = LeafVarianceModelCpp() - - def sample_one_iteration(self, forest: Forest, rng: RNG, a: float, b: float) -> float: + + def sample_one_iteration( + self, forest: Forest, rng: RNG, a: float, b: float + ) -> float: """ Sample one iteration of a forest leaf model's variance parameter (assuming a location-scale leaf model, most commonly `N(0, tau)`) @@ -276,11 +376,13 @@ def sample_one_iteration(self, forest: Forest, rng: RNG, a: float, b: float) -> Shape parameter for the inverse gamma leaf scale model b : float Scale parameter for the inverse gamma leaf scale model - + Returns ------- float - One draw from a Gibbs sampler for the leaf scale model, which depends + One draw from a Gibbs sampler for the leaf scale model, which depends on the rest of the model only through its respective forest. """ - return self.variance_model_cpp.SampleOneIteration(forest.forest_cpp, rng.rng_cpp, a, b) + return self.variance_model_cpp.SampleOneIteration( + forest.forest_cpp, rng.rng_cpp, a, b + ) diff --git a/stochtree/serialization.py b/stochtree/serialization.py index b6d3a93b..5f91902e 100644 --- a/stochtree/serialization.py +++ b/stochtree/serialization.py @@ -7,15 +7,17 @@ from .forest import ForestContainer from stochtree_cpp import JsonCpp + class JSONSerializer: """ Class that handles serialization and deserialization of stochastic forest models """ + def __init__(self) -> None: self.json_cpp = JsonCpp() self.num_forests = 0 self.forest_labels = [] - + def return_json_string(self) -> str: """ Convert JSON object to in-memory string @@ -26,7 +28,7 @@ def return_json_string(self) -> str: JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests """ return self.json_cpp.DumpJson() - + def load_from_json_string(self, json_string: str) -> None: """ Parse in-memory JSON string to `JsonCpp` object @@ -37,7 +39,7 @@ def load_from_json_string(self, json_string: str) -> None: JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests """ self.json_cpp.LoadFromString(json_string) - + def add_forest(self, forest_samples: ForestContainer) -> None: """Adds a container of forest samples to a json object @@ -49,8 +51,10 @@ def add_forest(self, forest_samples: ForestContainer) -> None: forest_label = self.json_cpp.AddForest(forest_samples.forest_container_cpp) self.num_forests += 1 self.forest_labels.append(forest_label) - - def add_scalar(self, field_name: str, field_value: float, subfolder_name: str = None) -> None: + + def add_scalar( + self, field_name: str, field_value: float, subfolder_name: str = None + ) -> None: """Adds a scalar (numeric) value to a json object Parameters @@ -66,8 +70,10 @@ def add_scalar(self, field_name: str, field_value: float, subfolder_name: str = self.json_cpp.AddDouble(field_name, field_value) else: self.json_cpp.AddDoubleSubfolder(subfolder_name, field_name, field_value) - - def add_integer(self, field_name: str, field_value: int, subfolder_name: str = None) -> None: + + def add_integer( + self, field_name: str, field_value: int, subfolder_name: str = None + ) -> None: """Adds an integer value to a json object Parameters @@ -83,8 +89,10 @@ def add_integer(self, field_name: str, field_value: int, subfolder_name: str = N self.json_cpp.AddInteger(field_name, field_value) else: self.json_cpp.AddIntegerSubfolder(subfolder_name, field_name, field_value) - - def add_boolean(self, field_name: str, field_value: bool, subfolder_name: str = None) -> None: + + 
def add_boolean( + self, field_name: str, field_value: bool, subfolder_name: str = None + ) -> None: """Adds a scalar (boolean) value to a json object Parameters @@ -100,8 +108,10 @@ def add_boolean(self, field_name: str, field_value: bool, subfolder_name: str = self.json_cpp.AddBool(field_name, field_value) else: self.json_cpp.AddBoolSubfolder(subfolder_name, field_name, field_value) - - def add_string(self, field_name: str, field_value: str, subfolder_name: str = None) -> None: + + def add_string( + self, field_name: str, field_value: str, subfolder_name: str = None + ) -> None: """Adds a string to a json object Parameters @@ -117,8 +127,10 @@ def add_string(self, field_name: str, field_value: str, subfolder_name: str = No self.json_cpp.AddString(field_name, field_value) else: self.json_cpp.AddStringSubfolder(subfolder_name, field_name, field_value) - - def add_numeric_vector(self, field_name: str, field_vector: np.array, subfolder_name: str = None) -> None: + + def add_numeric_vector( + self, field_name: str, field_vector: np.array, subfolder_name: str = None + ) -> None: """Adds a numeric vector (stored as a numpy array) to a json object Parameters @@ -135,15 +147,21 @@ def add_numeric_vector(self, field_name: str, field_vector: np.array, subfolder_ raise ValueError("field_vector must be a numpy array") field_vector = np.squeeze(field_vector) if field_vector.ndim > 1: - warnings.warn("field_vector has more than 1 dimension. It will be flattened in row-major order using np.ravel()") - field_vector = np.ravel(field_vector, order = "C") - + warnings.warn( + "field_vector has more than 1 dimension. It will be flattened in row-major order using np.ravel()" + ) + field_vector = np.ravel(field_vector, order="C") + if subfolder_name is None: self.json_cpp.AddDoubleVector(field_name, field_vector) else: - self.json_cpp.AddDoubleVectorSubfolder(subfolder_name, field_name, field_vector) - - def add_integer_vector(self, field_name: str, field_vector: np.array, subfolder_name: str = None) -> None: + self.json_cpp.AddDoubleVectorSubfolder( + subfolder_name, field_name, field_vector + ) + + def add_integer_vector( + self, field_name: str, field_vector: np.array, subfolder_name: str = None + ) -> None: """Adds a integer vector (stored as a numpy array) to a json object Parameters @@ -159,18 +177,26 @@ def add_integer_vector(self, field_name: str, field_vector: np.array, subfolder_ if not isinstance(field_vector, np.ndarray): raise ValueError("field_vector must be a numpy array") if not np.issubdtype(field_vector.dtype, np.integer): - raise ValueError("field_vector must be a numpy array with integer data types") + raise ValueError( + "field_vector must be a numpy array with integer data types" + ) field_vector = np.squeeze(field_vector) if field_vector.ndim > 1: - warnings.warn("field_vector has more than 1 dimension. It will be flattened in row-major order using np.ravel()") - field_vector = np.ravel(field_vector, order = "C") - + warnings.warn( + "field_vector has more than 1 dimension. 
It will be flattened in row-major order using np.ravel()" + ) + field_vector = np.ravel(field_vector, order="C") + if subfolder_name is None: self.json_cpp.AddIntegerVector(field_name, field_vector) else: - self.json_cpp.AddIntegerVectorSubfolder(subfolder_name, field_name, field_vector) - - def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: str = None) -> None: + self.json_cpp.AddIntegerVectorSubfolder( + subfolder_name, field_name, field_vector + ) + + def add_string_vector( + self, field_name: str, field_vector: list, subfolder_name: str = None + ) -> None: """Adds a list of strings to a json object as an array Parameters @@ -183,16 +209,20 @@ def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: Name of "subfolder" under which `field_name` to be stored in the json hierarchy """ # Runtime checks - if not isinstance(field_vector, list) and not isinstance(field_vector, np.ndarray): + if not isinstance(field_vector, list) and not isinstance( + field_vector, np.ndarray + ): raise ValueError("field_vector must be a list or numpy object array") - + if isinstance(field_vector, np.ndarray): field_vector = field_vector.tolist() if subfolder_name is None: self.json_cpp.AddStringVector(field_name, field_vector) else: - self.json_cpp.AddStringVectorSubfolder(subfolder_name, field_name, field_vector) - + self.json_cpp.AddStringVectorSubfolder( + subfolder_name, field_name, field_vector + ) + def get_scalar(self, field_name: str, subfolder_name: str = None) -> float: """Retrieves a scalar (numeric) value from a json object @@ -207,7 +237,7 @@ def get_scalar(self, field_name: str, subfolder_name: str = None) -> float: return self.json_cpp.ExtractDouble(field_name) else: return self.json_cpp.ExtractDoubleSubfolder(subfolder_name, field_name) - + def get_integer(self, field_name: str, subfolder_name: str = None) -> int: """Retrieves an integer value from a json object @@ -222,7 +252,7 @@ def get_integer(self, field_name: str, subfolder_name: str = None) -> int: return self.json_cpp.ExtractInteger(field_name) else: return self.json_cpp.ExtractIntegerSubfolder(subfolder_name, field_name) - + def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool: """Retrieves a scalar (boolean) value from a json object @@ -237,7 +267,7 @@ def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool: return self.json_cpp.ExtractBool(field_name) else: return self.json_cpp.ExtractBoolSubfolder(subfolder_name, field_name) - + def get_string(self, field_name: str, subfolder_name: str = None) -> str: """Retrieve a string from a json object @@ -252,8 +282,10 @@ def get_string(self, field_name: str, subfolder_name: str = None) -> str: return self.json_cpp.ExtractString(field_name) else: return self.json_cpp.ExtractStringSubfolder(subfolder_name, field_name) - - def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.array: + + def get_numeric_vector( + self, field_name: str, subfolder_name: str = None + ) -> np.array: """Retrieve numeric vector from a json object Parameters @@ -266,9 +298,13 @@ def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np. 
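+        # Fields may be stored at the top level of the JSON object or nested
+        # one level down under a named "subfolder"; dispatch accordingly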
if subfolder_name is None:
             return self.json_cpp.ExtractDoubleVector(field_name)
         else:
-            return self.json_cpp.ExtractDoubleVectorSubfolder(subfolder_name, field_name)
-    
-    def get_integer_vector(self, field_name: str, subfolder_name: str = None) -> np.array:
+            return self.json_cpp.ExtractDoubleVectorSubfolder(
+                subfolder_name, field_name
+            )
+
+    def get_integer_vector(
+        self, field_name: str, subfolder_name: str = None
+    ) -> np.array:
         """Retrieve integer vector from a json object
 
         Parameters
@@ -281,8 +317,10 @@ def get_integer_vector(self, field_name: str, subfolder_name: str = None) -> np.
         if subfolder_name is None:
             return self.json_cpp.ExtractIntegerVector(field_name)
         else:
-            return self.json_cpp.ExtractIntegerVectorSubfolder(subfolder_name, field_name)
-    
+            return self.json_cpp.ExtractIntegerVectorSubfolder(
+                subfolder_name, field_name
+            )
+
     def get_string_vector(self, field_name: str, subfolder_name: str = None) -> list:
         """Retrieve a string vector from a json object
 
         Parameters
@@ -296,8 +334,10 @@ def get_string_vector(self, field_name: str, subfolder_name: str = None) -> list
         if subfolder_name is None:
             return self.json_cpp.ExtractStringVector(field_name)
         else:
-            return self.json_cpp.ExtractStringVectorSubfolder(subfolder_name, field_name)
-    
+            return self.json_cpp.ExtractStringVectorSubfolder(
+                subfolder_name, field_name
+            )
+
     def get_forest_container(self, forest_str: str) -> ForestContainer:
         """Converts a JSON string for a container of forests to a `ForestContainer` object.
 
         Parameters
         ----------
         forest_str : str
             String containing the JSON representation of a `ForestContainer`
-        
+
         Returns
         -------
         ForestContainer
diff --git a/stochtree/utils.py b/stochtree/utils.py
index be099353..27062655 100644
--- a/stochtree/utils.py
+++ b/stochtree/utils.py
@@ -4,6 +4,6 @@ class NotSampledError(ValueError, AttributeError):
     This class inherits from both ValueError and AttributeError
     to help with exception handling and backward compatibility. 
- Renamed from scikit-learn's "NotFittedError" + Renamed from scikit-learn's "NotFittedError" https://github.com/scikit-learn/scikit-learn/blob/8721245511de2f225ff5f9aa5f5fadce663cd4a3/sklearn/exceptions.py#L45C7-L45C21 - """ \ No newline at end of file + """ From ea7407eefc94817ad7636864e7ca91b533d16f39 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 17:29:20 -0600 Subject: [PATCH 02/35] Fixing python code issues --- stochtree/bart.py | 12 +----------- stochtree/preprocessing.py | 1 - 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/stochtree/bart.py b/stochtree/bart.py index 3d1a00c1..c4d4a3fc 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -3,7 +3,7 @@ """ import warnings -from numbers import Number, Integral +from numbers import Integral from math import log import numpy as np import pandas as pd @@ -1241,21 +1241,11 @@ def predict_variance(self, covariates: np.array) -> np.array: covariates, np.ndarray ): raise ValueError("covariates must be a pandas dataframe or numpy array") - if basis is not None: - if not isinstance(basis, np.ndarray): - raise ValueError("basis must be a numpy array") - if basis.shape[0] != covariates.shape[0]: - raise ValueError( - "covariates and basis must have the same number of rows" - ) # Convert everything to standard shape (2-dimensional) if isinstance(covariates, np.ndarray): if covariates.ndim == 1: covariates = np.expand_dims(covariates, 1) - if basis is not None: - if basis.ndim == 1: - basis = np.expand_dims(basis, 1) # Covariate preprocessing if not self._covariate_preprocessor._check_is_fitted(): diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 73d76655..6b8e49d1 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -5,7 +5,6 @@ """ from typing import Union, Optional, Any, Dict -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder import numpy as np import pandas as pd from scipy import sparse From 486b2fe979f270d3deef0a9f44e300270db14a74 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 17:59:17 -0600 Subject: [PATCH 03/35] Formatting core stochtree python imports with ruff --- stochtree/__init__.py | 2 +- stochtree/bart.py | 10 ++++++---- stochtree/bcf.py | 8 +++++--- stochtree/calibration.py | 3 ++- stochtree/forest.py | 6 ++++-- stochtree/preprocessing.py | 6 ++++-- stochtree/sampler.py | 10 ++++++---- stochtree/serialization.py | 6 ++++-- 8 files changed, 32 insertions(+), 19 deletions(-) diff --git a/stochtree/__init__.py b/stochtree/__init__.py index 5c68ccdb..6e2de29a 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -2,7 +2,7 @@ from .bcf import BCFModel from .calibration import calibrate_global_error_variance from .data import Dataset, Residual -from .forest import ForestContainer, Forest +from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer diff --git a/stochtree/bart.py b/stochtree/bart.py index c4d4a3fc..9ab3e592 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -3,15 +3,17 @@ """ import warnings -from numbers import Integral from math import log +from numbers import Integral +from typing import Any, Dict, Optional, Union + import numpy as np import pandas as pd -from typing import Optional, Dict, Any, Union + from .data import Dataset, Residual -from .forest import ForestContainer, Forest +from .forest import Forest, ForestContainer from 
.preprocessing import CovariatePreprocessor, _preprocess_params -from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel +from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 3697a6ce..39975569 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -2,15 +2,17 @@ Bayesian Causal Forests (BCF) module """ +from typing import Any, Dict, Optional, Union + import numpy as np import pandas as pd from sklearn.utils import check_scalar -from typing import Optional, Union, Dict, Any + from .bart import BARTModel from .data import Dataset, Residual -from .forest import ForestContainer, Forest +from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor, _preprocess_params -from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel +from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError diff --git a/stochtree/calibration.py b/stochtree/calibration.py index b19a7e3f..d09124fa 100644 --- a/stochtree/calibration.py +++ b/stochtree/calibration.py @@ -1,9 +1,10 @@ import warnings + import numpy as np import pandas as pd +from scipy.stats import gamma from sklearn import linear_model from sklearn.metrics import mean_squared_error -from scipy.stats import gamma def calibrate_global_error_variance( diff --git a/stochtree/forest.py b/stochtree/forest.py index b12cc924..c1192183 100644 --- a/stochtree/forest.py +++ b/stochtree/forest.py @@ -2,10 +2,12 @@ Python classes wrapping C++ forest container object """ +from typing import Union + import numpy as np -from .data import Dataset from stochtree_cpp import ForestContainerCpp, ForestCpp -from typing import Union + +from .data import Dataset class ForestContainer: diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 6b8e49d1..53f12e37 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -4,11 +4,13 @@ Copyright (c) 2007-2024 The scikit-learn developers. 
""" -from typing import Union, Optional, Any, Dict +import warnings +from typing import Any, Dict, Optional, Union + import numpy as np import pandas as pd from scipy import sparse -import warnings + from .serialization import JSONSerializer diff --git a/stochtree/sampler.py b/stochtree/sampler.py index 95f2e353..d4270872 100644 --- a/stochtree/sampler.py +++ b/stochtree/sampler.py @@ -2,16 +2,18 @@ Python classes wrapping C++ sampler objects """ +from typing import Union + import numpy as np -from .data import Dataset, Residual -from .forest import ForestContainer, Forest from stochtree_cpp import ( - RngCpp, ForestSamplerCpp, GlobalVarianceModelCpp, LeafVarianceModelCpp, + RngCpp, ) -from typing import Union + +from .data import Dataset, Residual +from .forest import Forest, ForestContainer class RNG: diff --git a/stochtree/serialization.py b/stochtree/serialization.py index 5f91902e..f4d0ff80 100644 --- a/stochtree/serialization.py +++ b/stochtree/serialization.py @@ -1,12 +1,14 @@ import warnings +from typing import Union + import numpy as np import pandas as pd -from typing import Union from scipy.linalg import lstsq from scipy.stats import gamma -from .forest import ForestContainer from stochtree_cpp import JsonCpp +from .forest import ForestContainer + class JSONSerializer: """ From 6595736b69cc50b210e75eecf5a29aa7f174b778 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 18:36:10 -0600 Subject: [PATCH 04/35] Used ruff to format python demo notebooks --- demo/notebooks/causal_inference.ipynb | 75 ++- .../causal_inference_feature_subsets.ipynb | 145 +++-- .../heteroskedastic_supervised_learning.ipynb | 101 ++-- ...tivariate_treatment_causal_inference.ipynb | 110 +++- demo/notebooks/prototype_interface.ipynb | 538 +++++++++++++----- demo/notebooks/serialization.ipynb | 105 ++-- demo/notebooks/supervised_learning.ipynb | 110 +++- demo/notebooks/tree_inspection.ipynb | 122 ++-- 8 files changed, 931 insertions(+), 375 deletions(-) diff --git a/demo/notebooks/causal_inference.ipynb b/demo/notebooks/causal_inference.ipynb index 92d58528..80565aed 100644 --- a/demo/notebooks/causal_inference.ipynb +++ b/demo/notebooks/causal_inference.ipynb @@ -48,16 +48,16 @@ "n = 1000\n", "p_X = 5\n", "X = rng.uniform(0, 1, (n, p_X))\n", - "pi_X = 0.25 + 0.5*X[:,0]\n", + "pi_X = 0.25 + 0.5 * X[:, 0]\n", "Z = rng.binomial(1, pi_X, n).astype(float)\n", "\n", "# Define the outcome mean functions (prognostic and treatment effects)\n", - "mu_X = pi_X*5 + 2*X[:,2]\n", - "tau_X = (X[:,1]*2 - 1)\n", + "mu_X = pi_X * 5 + 2 * X[:, 2]\n", + "tau_X = X[:, 1] * 2 - 1\n", "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", - "y = mu_X + tau_X*Z + epsilon" + "y = mu_X + tau_X * Z + epsilon" ] }, { @@ -75,8 +75,8 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", "Z_train = Z[train_inds]\n", "Z_test = Z[test_inds]\n", "y_train = y[train_inds]\n", @@ -104,7 +104,18 @@ "source": [ "bcf_model = BCFModel()\n", "general_params = {\"keep_every\": 5}\n", - "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, general_params=general_params)" + "bcf_model.sample(\n", + " X_train,\n", + " Z_train,\n", + " y_train,\n", + " pi_train,\n", + " X_test,\n", + " Z_test,\n", + " pi_test,\n", + " num_gfr=10,\n", + " num_mcmc=100,\n", + " 
general_params=general_params,\n", + ")" ] }, { @@ -121,10 +132,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bcf_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -135,10 +149,13 @@ "outputs": [], "source": [ "forest_preds_tau_mcmc = bcf_model.tau_hat_test\n", - "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", - "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)\n", + "tau_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(tau_test, 1), tau_avg_mcmc), axis=1),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -149,10 +166,13 @@ "outputs": [], "source": [ "forest_preds_mu_mcmc = bcf_model.mu_hat_test\n", - "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", - "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)\n", + "mu_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1),\n", + " columns=[\"True mu\", \"Average estimated mu\"],\n", + ")\n", "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -162,7 +182,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples),axis=1), np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bcf_model.num_samples), axis=1),\n", + " np.expand_dims(bcf_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -173,7 +202,17 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", 
\"Beta_1\"])\n", + "b_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bcf_model.num_samples), axis=1),\n", + " np.expand_dims(bcf_model.b0_samples, axis=1),\n", + " np.expand_dims(bcf_model.b1_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"],\n", + ")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb index b391a33f..8b9169a9 100644 --- a/demo/notebooks/causal_inference_feature_subsets.ipynb +++ b/demo/notebooks/causal_inference_feature_subsets.ipynb @@ -53,16 +53,16 @@ "n = 1000\n", "p_X = 10\n", "X = rng.uniform(0, 1, (n, p_X))\n", - "pi_X = 0.25 + 0.5*X[:,0]\n", + "pi_X = 0.25 + 0.5 * X[:, 0]\n", "Z = rng.binomial(1, pi_X, n).astype(float)\n", "\n", "# Define the outcome mean functions (prognostic and treatment effects)\n", - "mu_X = pi_X*5 + 2*X[:,2]\n", - "tau_X = 1 - 2*X[:,0] + 2*X[:,1] + 1*X[:,0]*X[:,1]\n", + "mu_X = pi_X * 5 + 2 * X[:, 2]\n", + "tau_X = 1 - 2 * X[:, 0] + 2 * X[:, 1] + 1 * X[:, 0] * X[:, 1]\n", "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", - "y = mu_X + tau_X*Z + epsilon" + "y = mu_X + tau_X * Z + epsilon" ] }, { @@ -80,8 +80,8 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", "Z_train = Z[train_inds]\n", "Z_test = Z[test_inds]\n", "y_train = y[train_inds]\n", @@ -108,7 +108,18 @@ "outputs": [], "source": [ "bcf_model = BCFModel()\n", - "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, params={\"keep_every\": 5})" + "bcf_model.sample(\n", + " X_train,\n", + " Z_train,\n", + " y_train,\n", + " pi_train,\n", + " X_test,\n", + " Z_test,\n", + " pi_test,\n", + " num_gfr=10,\n", + " num_mcmc=100,\n", + " params={\"keep_every\": 5},\n", + ")" ] }, { @@ -125,10 +136,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bcf_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -139,10 +153,13 @@ "outputs": [], "source": [ "forest_preds_tau_mcmc = bcf_model.tau_hat_test\n", - "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", - "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)\n", + "tau_df_mcmc = pd.DataFrame(\n", + " 
np.concatenate((np.expand_dims(tau_test, 1), tau_avg_mcmc), axis=1),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -153,10 +170,13 @@ "outputs": [], "source": [ "forest_preds_mu_mcmc = bcf_model.mu_hat_test\n", - "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", - "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)\n", + "mu_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1),\n", + " columns=[\"True mu\", \"Average estimated mu\"],\n", + ")\n", "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -166,7 +186,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples),axis=1), np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bcf_model.num_samples), axis=1),\n", + " np.expand_dims(bcf_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -177,7 +206,17 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "b_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bcf_model.num_samples), axis=1),\n", + " np.expand_dims(bcf_model.b0_samples, axis=1),\n", + " np.expand_dims(bcf_model.b1_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"],\n", + ")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" @@ -197,8 +236,19 @@ "outputs": [], "source": [ "bcf_model_subset = BCFModel()\n", - "bcf_params = {'keep_vars_tau': [0,1]}\n", - "bcf_model_subset.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, params=bcf_params)" + "bcf_params = {\"keep_vars_tau\": [0, 1]}\n", + "bcf_model_subset.sample(\n", + " X_train,\n", + " Z_train,\n", + " y_train,\n", + " pi_train,\n", + " X_test,\n", + " Z_test,\n", + " pi_test,\n", + " num_gfr=10,\n", + " num_mcmc=100,\n", + " params=bcf_params,\n", + ")" ] }, { @@ -215,10 +265,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bcf_model_subset.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated 
outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -229,10 +282,13 @@ "outputs": [], "source": [ "forest_preds_tau_mcmc = bcf_model_subset.tau_hat_test\n", - "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", - "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)\n", + "tau_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(tau_test, 1), tau_avg_mcmc), axis=1),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -243,10 +299,13 @@ "outputs": [], "source": [ "forest_preds_mu_mcmc = bcf_model_subset.mu_hat_test\n", - "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", - "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)\n", + "mu_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1),\n", + " columns=[\"True mu\", \"Average estimated mu\"],\n", + ")\n", "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -256,9 +315,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples),axis=1), \n", - " np.expand_dims(bcf_model_subset.global_var_samples,axis=1)), axis = 1), \n", - " columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bcf_model_subset.num_samples), axis=1),\n", + " np.expand_dims(bcf_model_subset.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -269,10 +335,17 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples),axis=1), \n", - " np.expand_dims(bcf_model_subset.b0_samples,axis=1), \n", - " np.expand_dims(bcf_model_subset.b1_samples,axis=1)), axis = 1), \n", - " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "b_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bcf_model_subset.num_samples), axis=1),\n", + " np.expand_dims(bcf_model_subset.b0_samples, axis=1),\n", + " np.expand_dims(bcf_model_subset.b1_samples, 
axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"],\n", + ")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" diff --git a/demo/notebooks/heteroskedastic_supervised_learning.ipynb b/demo/notebooks/heteroskedastic_supervised_learning.ipynb index 427d9984..898833c6 100644 --- a/demo/notebooks/heteroskedastic_supervised_learning.ipynb +++ b/demo/notebooks/heteroskedastic_supervised_learning.ipynb @@ -53,42 +53,43 @@ "X = rng.uniform(0, 1, (n, p_X))\n", "W = rng.uniform(0, 1, (n, p_W))\n", "\n", + "\n", "# Define the outcome mean function\n", "def outcome_mean(X, W):\n", " return np.where(\n", - " (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], \n", + " (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),\n", + " -7.5 * W[:, 0],\n", " np.where(\n", - " (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], \n", - " np.where(\n", - " (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], \n", - " 7.5 * W[:,0]\n", - " )\n", - " )\n", + " (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),\n", + " -2.5 * W[:, 0],\n", + " np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5 * W[:, 0], 7.5 * W[:, 0]),\n", + " ),\n", " )\n", "\n", + "\n", "# Define the outcome standard deviation function\n", "def outcome_stddev(X):\n", " return np.where(\n", - " (X[:,1] >= 0.0) & (X[:,1] < 0.25), sqrt(0.5), \n", + " (X[:, 1] >= 0.0) & (X[:, 1] < 0.25),\n", + " sqrt(0.5),\n", " np.where(\n", - " (X[:,1] >= 0.25) & (X[:,1] < 0.5), 1., \n", - " np.where(\n", - " (X[:,1] >= 0.5) & (X[:,1] < 0.75), 2., \n", - " 3.\n", - " )\n", - " )\n", + " (X[:, 1] >= 0.25) & (X[:, 1] < 0.5),\n", + " 1.0,\n", + " np.where((X[:, 1] >= 0.5) & (X[:, 1] < 0.75), 2.0, 3.0),\n", + " ),\n", " )\n", "\n", + "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", "f_x = outcome_mean(X, W)\n", "s_x = outcome_stddev(X)\n", - "y = f_x + epsilon*s_x\n", + "y = f_x + epsilon * s_x\n", "\n", "# Standardize outcome\n", "y_bar = np.mean(y)\n", "y_std = np.std(y)\n", - "resid = (y-y_bar)/y_std" + "resid = (y - y_bar) / y_std" ] }, { @@ -106,16 +107,16 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", - "basis_train = W[train_inds,:]\n", - "basis_test = W[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", + "basis_train = W[train_inds, :]\n", + "basis_test = W[test_inds, :]\n", "y_train = y[train_inds]\n", "y_test = y[test_inds]\n", "f_x_train = f_x[train_inds]\n", "f_x_test = f_x[test_inds]\n", "s_x_train = s_x[train_inds]\n", - "s_x_test = s_x[test_inds]\n" + "s_x_test = s_x[test_inds]" ] }, { @@ -132,12 +133,21 @@ "outputs": [], "source": [ "bart_model = BARTModel()\n", - "global_params = {'sample_sigma2_global': True}\n", - "mean_params = {'num_trees': 100, 'sample_sigma2_leaf': False}\n", - "variance_params = {'num_trees': 50}\n", - "bart_model.sample(X_train=X_train, y_train=y_train, X_test=X_test, basis_train=basis_train, basis_test=basis_test,\n", - " num_gfr=10, num_mcmc=100, general_params=global_params, mean_forest_params=mean_params, \n", - " variance_forest_params=variance_params)" + "global_params = {\"sample_sigma2_global\": True}\n", + "mean_params = {\"num_trees\": 100, \"sample_sigma2_leaf\": False}\n", + "variance_params = {\"num_trees\": 50}\n", + "bart_model.sample(\n", + " X_train=X_train,\n", + " y_train=y_train,\n", + " X_test=X_test,\n", + " 
basis_train=basis_train,\n", + " basis_test=basis_test,\n", + " num_gfr=10,\n", + " num_mcmc=100,\n", + " general_params=global_params,\n", + " mean_forest_params=mean_params,\n", + " variance_forest_params=variance_params,\n", + ")" ] }, { @@ -154,10 +164,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bart_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -168,10 +181,17 @@ "outputs": [], "source": [ "forest_preds_s_x_mcmc = np.sqrt(bart_model.sigma2_x_test)\n", - "s_x_avg_mcmc = np.squeeze(forest_preds_s_x_mcmc).mean(axis = 1, keepdims = True)\n", - "s_x_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(s_x_test,1), s_x_avg_mcmc), axis = 1), columns=[\"True standard deviation\", \"Average estimated standard deviation\"])\n", - "sns.scatterplot(data=s_x_df_mcmc, x=\"Average estimated standard deviation\", y=\"True standard deviation\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "s_x_avg_mcmc = np.squeeze(forest_preds_s_x_mcmc).mean(axis=1, keepdims=True)\n", + "s_x_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(s_x_test, 1), s_x_avg_mcmc), axis=1),\n", + " columns=[\"True standard deviation\", \"Average estimated standard deviation\"],\n", + ")\n", + "sns.scatterplot(\n", + " data=s_x_df_mcmc,\n", + " x=\"Average estimated standard deviation\",\n", + " y=\"True standard deviation\",\n", + ")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -181,7 +201,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.global_var_samples.shape[0]),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bart_model.global_var_samples.shape[0]), axis=1),\n", + " np.expand_dims(bart_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -199,7 +228,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc),2)))" + "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc), 2)))" ] } ], diff --git a/demo/notebooks/multivariate_treatment_causal_inference.ipynb b/demo/notebooks/multivariate_treatment_causal_inference.ipynb index 60741e33..6e7e17a8 100644 --- a/demo/notebooks/multivariate_treatment_causal_inference.ipynb +++ b/demo/notebooks/multivariate_treatment_causal_inference.ipynb @@ -48,13 +48,13 @@ "n = 500\n", "p_X = 5\n", "X = rng.uniform(0, 1, (n, p_X))\n", - "pi_X = np.c_[0.25 + 0.5*X[:,0], 0.75 - 0.5*X[:,1]]\n", + "pi_X = 
np.c_[0.25 + 0.5 * X[:, 0], 0.75 - 0.5 * X[:, 1]]\n", "# Z = rng.uniform(0, 1, (n, 2))\n", "Z = rng.binomial(1, pi_X, (n, 2))\n", "\n", "# Define the outcome mean functions (prognostic and treatment effects)\n", - "mu_X = pi_X[:,0]*5 + pi_X[:,1]*2 + 2*X[:,2]\n", - "tau_X = np.stack((X[:,1], X[:,2]), axis=-1)\n", + "mu_X = pi_X[:, 0] * 5 + pi_X[:, 1] * 2 + 2 * X[:, 2]\n", + "tau_X = np.stack((X[:, 1], X[:, 2]), axis=-1)\n", "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", @@ -77,18 +77,18 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", - "Z_train = Z[train_inds,:]\n", - "Z_test = Z[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", + "Z_train = Z[train_inds, :]\n", + "Z_test = Z[test_inds, :]\n", "y_train = y[train_inds]\n", "y_test = y[test_inds]\n", "pi_train = pi_X[train_inds]\n", "pi_test = pi_X[test_inds]\n", "mu_train = mu_X[train_inds]\n", "mu_test = mu_X[test_inds]\n", - "tau_train = tau_X[train_inds,:]\n", - "tau_test = tau_X[test_inds,:]" + "tau_train = tau_X[train_inds, :]\n", + "tau_test = tau_X[test_inds, :]" ] }, { @@ -105,7 +105,17 @@ "outputs": [], "source": [ "bcf_model = BCFModel()\n", - "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)" + "bcf_model.sample(\n", + " X_train,\n", + " Z_train,\n", + " y_train,\n", + " pi_train,\n", + " X_test,\n", + " Z_test,\n", + " pi_test,\n", + " num_gfr=10,\n", + " num_mcmc=100,\n", + ")" ] }, { @@ -122,10 +132,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bcf_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -145,11 +158,16 @@ "outputs": [], "source": [ "treatment_idx = 0\n", - "forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:,:,treatment_idx])\n", - "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", - "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test[:,treatment_idx],1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx])\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)\n", + "tau_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1\n", + " ),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -160,11 +178,16 @@ 
"outputs": [], "source": [ "treatment_idx = 1\n", - "forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:,:,treatment_idx])\n", - "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", - "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test[:,treatment_idx],1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx])\n", + "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)\n", + "tau_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1\n", + " ),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -174,13 +197,24 @@ "metadata": {}, "outputs": [], "source": [ - "treatment_term_mcmc_test = np.multiply(np.atleast_3d(Z_test).swapaxes(1,2),bcf_model.tau_hat_test).sum(axis=2)\n", + "treatment_term_mcmc_test = np.multiply(\n", + " np.atleast_3d(Z_test).swapaxes(1, 2), bcf_model.tau_hat_test\n", + ").sum(axis=2)\n", "treatment_term_test = np.multiply(tau_test, Z_test).sum(axis=1)\n", - "treatment_term_mcmc_avg = np.squeeze(treatment_term_mcmc_test).mean(axis = 1, keepdims = True)\n", - "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(treatment_term_test,1), treatment_term_mcmc_avg), axis = 1), columns=[\"True treatment term\", \"Average estimated treatment term\"])\n", - "sns.scatterplot(data=mu_df_mcmc, x=\"True treatment term\", y=\"Average estimated treatment term\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", - "plt.show()\n" + "treatment_term_mcmc_avg = np.squeeze(treatment_term_mcmc_test).mean(\n", + " axis=1, keepdims=True\n", + ")\n", + "mu_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (np.expand_dims(treatment_term_test, 1), treatment_term_mcmc_avg), axis=1\n", + " ),\n", + " columns=[\"True treatment term\", \"Average estimated treatment term\"],\n", + ")\n", + "sns.scatterplot(\n", + " data=mu_df_mcmc, x=\"True treatment term\", y=\"Average estimated treatment term\"\n", + ")\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", + "plt.show()" ] }, { @@ -190,10 +224,13 @@ "outputs": [], "source": [ "forest_preds_mu_mcmc = bcf_model.mu_hat_test\n", - "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", - "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)\n", + "mu_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1),\n", + " columns=[\"True mu\", \"Average estimated mu\"],\n", + ")\n", "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -203,7 +240,18 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), 
np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(\n", + " np.arange(bcf_model.num_samples - bcf_model.num_gfr), axis=1\n", + " ),\n", + " np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr :], axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] diff --git a/demo/notebooks/prototype_interface.ipynb b/demo/notebooks/prototype_interface.ipynb index 04a7af40..972b291a 100644 --- a/demo/notebooks/prototype_interface.ipynb +++ b/demo/notebooks/prototype_interface.ipynb @@ -61,7 +61,16 @@ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", - "from stochtree import Dataset, Residual, RNG, ForestSampler, ForestContainer, Forest, GlobalVarianceModel, LeafVarianceModel" + "from stochtree import (\n", + " Dataset,\n", + " Residual,\n", + " RNG,\n", + " ForestSampler,\n", + " ForestContainer,\n", + " Forest,\n", + " GlobalVarianceModel,\n", + " LeafVarianceModel,\n", + ")" ] }, { @@ -88,19 +97,20 @@ "X = rng.uniform(0, 1, (n, p_X))\n", "W = rng.uniform(0, 1, (n, p_W))\n", "\n", + "\n", "# Define the outcome mean function\n", "def outcome_mean(X, W):\n", " return np.where(\n", - " (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], \n", + " (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),\n", + " -7.5 * W[:, 0],\n", " np.where(\n", - " (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], \n", - " np.where(\n", - " (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], \n", - " 7.5 * W[:,0]\n", - " )\n", - " )\n", + " (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),\n", + " -2.5 * W[:, 0],\n", + " np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5 * W[:, 0], 7.5 * W[:, 0]),\n", + " ),\n", " )\n", "\n", + "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", "y = outcome_mean(X, W) + epsilon\n", @@ -108,7 +118,7 @@ "# Standardize outcome\n", "y_bar = np.mean(y)\n", "y_std = np.std(y)\n", - "resid = (y-y_bar)/y_std" + "resid = (y - y_bar) / y_std" ] }, { @@ -129,16 +139,16 @@ "min_samples_leaf = 1\n", "num_trees = 100\n", "cutpoint_grid_size = 100\n", - "global_variance_init = 1.\n", + "global_variance_init = 1.0\n", "tau_init = 0.5\n", - "leaf_prior_scale = np.array([[tau_init]], order='C')\n", - "a_global = 4.\n", - "b_global = 2.\n", - "a_leaf = 2.\n", + "leaf_prior_scale = np.array([[tau_init]], order=\"C\")\n", + "a_global = 4.0\n", + "b_global = 2.0\n", + "a_leaf = 2.0\n", "b_leaf = 0.5\n", "leaf_regression = True\n", - "feature_types = np.repeat(0, p_X).astype(int) # 0 = numeric\n", - "var_weights = np.repeat(1/p_X, p_X)" + "feature_types = np.repeat(0, p_X).astype(int) # 0 = numeric\n", + "var_weights = np.repeat(1 / p_X, p_X)" ] }, { @@ -178,7 +188,9 @@ "source": [ "forest_container = ForestContainer(num_trees, W.shape[1], False, False)\n", "active_forest = Forest(num_trees, W.shape[1], False, False)\n", - "forest_sampler = ForestSampler(dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf)\n", + "forest_sampler = ForestSampler(\n", + " dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf\n", + ")\n", "cpp_rng = RNG(random_seed)\n", "global_var_model = GlobalVarianceModel()\n", "leaf_var_model = LeafVarianceModel()" @@ -200,7 +212,9 @@ "num_warmstart = 10\n", "num_mcmc = 100\n", "num_samples = num_warmstart + num_mcmc\n", - "global_var_samples = 
np.concatenate((np.array([global_variance_init]), np.repeat(0, num_samples)))\n", + "global_var_samples = np.concatenate(\n", + " (np.array([global_variance_init]), np.repeat(0, num_samples))\n", + ")\n", "leaf_scale_samples = np.concatenate((np.array([tau_init]), np.repeat(0, num_samples)))" ] }, @@ -218,12 +232,31 @@ "outputs": [], "source": [ "for i in range(num_warmstart):\n", - " forest_sampler.sample_one_iteration(forest_container, active_forest, dataset, residual, cpp_rng, \n", - " feature_types, cutpoint_grid_size, leaf_prior_scale, var_weights, \n", - " 0.0, 0.0, global_var_samples[i], 1, True, True, False)\n", - " global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global)\n", - " leaf_scale_samples[i+1] = leaf_var_model.sample_one_iteration(active_forest, cpp_rng, a_leaf, b_leaf)\n", - " leaf_prior_scale[0,0] = leaf_scale_samples[i+1]" + " forest_sampler.sample_one_iteration(\n", + " forest_container,\n", + " active_forest,\n", + " dataset,\n", + " residual,\n", + " cpp_rng,\n", + " feature_types,\n", + " cutpoint_grid_size,\n", + " leaf_prior_scale,\n", + " var_weights,\n", + " 0.0,\n", + " 0.0,\n", + " global_var_samples[i],\n", + " 1,\n", + " True,\n", + " True,\n", + " False,\n", + " )\n", + " global_var_samples[i + 1] = global_var_model.sample_one_iteration(\n", + " residual, cpp_rng, a_global, b_global\n", + " )\n", + " leaf_scale_samples[i + 1] = leaf_var_model.sample_one_iteration(\n", + " active_forest, cpp_rng, a_leaf, b_leaf\n", + " )\n", + " leaf_prior_scale[0, 0] = leaf_scale_samples[i + 1]" ] }, { @@ -240,12 +273,31 @@ "outputs": [], "source": [ "for i in range(num_warmstart, num_samples):\n", - " forest_sampler.sample_one_iteration(forest_container, active_forest, dataset, residual, cpp_rng, \n", - " feature_types, cutpoint_grid_size, leaf_prior_scale, var_weights, \n", - " 0.0, 0.0, global_var_samples[i], 1, True, False, False)\n", - " global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global)\n", - " leaf_scale_samples[i+1] = leaf_var_model.sample_one_iteration(active_forest, cpp_rng, a_leaf, b_leaf)\n", - " leaf_prior_scale[0,0] = leaf_scale_samples[i+1]" + " forest_sampler.sample_one_iteration(\n", + " forest_container,\n", + " active_forest,\n", + " dataset,\n", + " residual,\n", + " cpp_rng,\n", + " feature_types,\n", + " cutpoint_grid_size,\n", + " leaf_prior_scale,\n", + " var_weights,\n", + " 0.0,\n", + " 0.0,\n", + " global_var_samples[i],\n", + " 1,\n", + " True,\n", + " False,\n", + " False,\n", + " )\n", + " global_var_samples[i + 1] = global_var_model.sample_one_iteration(\n", + " residual, cpp_rng, a_global, b_global\n", + " )\n", + " leaf_scale_samples[i + 1] = leaf_var_model.sample_one_iteration(\n", + " active_forest, cpp_rng, a_leaf, b_leaf\n", + " )\n", + " leaf_prior_scale[0, 0] = leaf_scale_samples[i + 1]" ] }, { @@ -262,12 +314,12 @@ "outputs": [], "source": [ "# Forest predictions\n", - "forest_preds = forest_container.predict(dataset)*y_std + y_bar\n", - "forest_preds_gfr = forest_preds[:,:num_warmstart]\n", - "forest_preds_mcmc = forest_preds[:,num_warmstart:num_samples]\n", + "forest_preds = forest_container.predict(dataset) * y_std + y_bar\n", + "forest_preds_gfr = forest_preds[:, :num_warmstart]\n", + "forest_preds_mcmc = forest_preds[:, num_warmstart:num_samples]\n", "\n", "# Global error variance\n", - "sigma_samples = np.sqrt(global_var_samples)*y_std\n", + "sigma_samples = np.sqrt(global_var_samples) * y_std\n", "sigma_samples_gfr = 
sigma_samples[:num_warmstart]\n", "sigma_samples_mcmc = sigma_samples[num_warmstart:num_samples]" ] @@ -285,10 +337,13 @@ "metadata": {}, "outputs": [], "source": [ - "forest_pred_avg_gfr = forest_preds_gfr.mean(axis = 1, keepdims = True)\n", - "forest_pred_df_gfr = pd.DataFrame(np.concatenate((np.expand_dims(y, axis=1), forest_pred_avg_gfr), axis = 1), columns=[\"True y\", \"Average predicted y\"])\n", + "forest_pred_avg_gfr = forest_preds_gfr.mean(axis=1, keepdims=True)\n", + "forest_pred_df_gfr = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y, axis=1), forest_pred_avg_gfr), axis=1),\n", + " columns=[\"True y\", \"Average predicted y\"],\n", + ")\n", "sns.scatterplot(data=forest_pred_df_gfr, x=\"True y\", y=\"Average predicted y\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -298,7 +353,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_gfr = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(num_warmstart),axis=1), np.expand_dims(sigma_samples_gfr,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_gfr = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(num_warmstart), axis=1),\n", + " np.expand_dims(sigma_samples_gfr, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_gfr, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -316,10 +380,13 @@ "metadata": {}, "outputs": [], "source": [ - "forest_pred_avg_mcmc = forest_preds_mcmc.mean(axis = 1, keepdims = True)\n", - "forest_pred_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y, axis=1), forest_pred_avg_mcmc), axis = 1), columns=[\"True y\", \"Average predicted y\"])\n", + "forest_pred_avg_mcmc = forest_preds_mcmc.mean(axis=1, keepdims=True)\n", + "forest_pred_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y, axis=1), forest_pred_avg_mcmc), axis=1),\n", + " columns=[\"True y\", \"Average predicted y\"],\n", + ")\n", "sns.scatterplot(data=forest_pred_df_mcmc, x=\"True y\", y=\"Average predicted y\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -329,7 +396,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(num_samples - num_warmstart),axis=1), np.expand_dims(sigma_samples_mcmc,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(num_samples - num_warmstart), axis=1),\n", + " np.expand_dims(sigma_samples_mcmc, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -362,22 +438,22 @@ "n = 500\n", "p_X = 5\n", "X = rng.uniform(0, 1, (n, p_X))\n", - "pi_X = 0.35 + 0.3*X[:,0]\n", + "pi_X = 0.35 + 0.3 * X[:, 0]\n", "Z = rng.binomial(1, pi_X, n).astype(float)\n", "\n", "# Define the outcome mean functions (prognostic and treatment effects)\n", - "mu_X = (pi_X - 0.5)*30\n", + "mu_X = (pi_X - 0.5) * 30\n", "# tau_X = np.sin(X[:,1]*2*np.pi)\n", - "tau_X = X[:,1]*2\n", + "tau_X = X[:, 1] * 2\n", "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", - "y = mu_X + tau_X*Z + epsilon\n", + "y = mu_X + tau_X * Z + 
epsilon\n", "\n", "# Standardize outcome\n", "y_bar = np.mean(y)\n", "y_std = np.std(y)\n", - "resid = (y-y_bar)/y_std" + "resid = (y - y_bar) / y_std" ] }, { @@ -399,32 +475,32 @@ "min_samples_leaf_mu = 1\n", "num_trees_mu = 200\n", "cutpoint_grid_size_mu = 100\n", - "tau_init_mu = 1/num_trees_mu\n", - "leaf_prior_scale_mu = np.array([[tau_init_mu]], order='C')\n", - "a_leaf_mu = 3.\n", - "b_leaf_mu = 1/num_trees_mu\n", + "tau_init_mu = 1 / num_trees_mu\n", + "leaf_prior_scale_mu = np.array([[tau_init_mu]], order=\"C\")\n", + "a_leaf_mu = 3.0\n", + "b_leaf_mu = 1 / num_trees_mu\n", "leaf_regression_mu = False\n", - "feature_types_mu = np.repeat(0, p_X).astype(int) # 0 = numeric\n", - "var_weights_mu = np.repeat(1/(p_X + 1), p_X + 1)\n", + "feature_types_mu = np.repeat(0, p_X).astype(int) # 0 = numeric\n", + "var_weights_mu = np.repeat(1 / (p_X + 1), p_X + 1)\n", "\n", "# Treatment forest parameters\n", "alpha_tau = 0.75\n", - "beta_tau = 3.\n", + "beta_tau = 3.0\n", "min_samples_leaf_tau = 1\n", "num_trees_tau = 50\n", "cutpoint_grid_size_tau = 100\n", - "tau_init_tau = 1/num_trees_tau\n", - "leaf_prior_scale_tau = np.array([[tau_init_tau]], order='C')\n", - "a_leaf_tau = 3.\n", - "b_leaf_tau = 1/num_trees_tau\n", + "tau_init_tau = 1 / num_trees_tau\n", + "leaf_prior_scale_tau = np.array([[tau_init_tau]], order=\"C\")\n", + "a_leaf_tau = 3.0\n", + "b_leaf_tau = 1 / num_trees_tau\n", "leaf_regression_tau = True\n", - "feature_types_tau = np.repeat(0, p_X).astype(int) # 0 = numeric\n", - "var_weights_tau = np.repeat(1/p_X, p_X)\n", + "feature_types_tau = np.repeat(0, p_X).astype(int) # 0 = numeric\n", + "var_weights_tau = np.repeat(1 / p_X, p_X)\n", "\n", "# Global parameters\n", - "a_global = 2.\n", - "b_global = 1.\n", - "global_variance_init = 1." 
+ "a_global = 2.0\n", + "b_global = 1.0\n", + "global_variance_init = 1.0" ] }, { @@ -469,13 +545,33 @@ "# Prognostic forest sampling classes\n", "forest_container_mu = ForestContainer(num_trees_mu, 1, True, False)\n", "active_forest_mu = Forest(num_trees_mu, 1, True, False)\n", - "forest_sampler_mu = ForestSampler(dataset_mu, feature_types_mu, num_trees_mu, n, alpha_mu, beta_mu, min_samples_leaf_mu)\n", + "forest_sampler_mu = ForestSampler(\n", + " dataset_mu,\n", + " feature_types_mu,\n", + " num_trees_mu,\n", + " n,\n", + " alpha_mu,\n", + " beta_mu,\n", + " min_samples_leaf_mu,\n", + ")\n", "leaf_var_model_mu = LeafVarianceModel()\n", "\n", "# Treatment forest sampling classes\n", - "forest_container_tau = ForestContainer(num_trees_tau, 1 if np.ndim(Z) == 1 else Z.shape[1], False, False)\n", - "active_forest_tau = Forest(num_trees_tau, 1 if np.ndim(Z) == 1 else Z.shape[1], False, False)\n", - "forest_sampler_tau = ForestSampler(dataset_tau, feature_types_tau, num_trees_tau, n, alpha_tau, beta_tau, min_samples_leaf_tau)\n", + "forest_container_tau = ForestContainer(\n", + " num_trees_tau, 1 if np.ndim(Z) == 1 else Z.shape[1], False, False\n", + ")\n", + "active_forest_tau = Forest(\n", + " num_trees_tau, 1 if np.ndim(Z) == 1 else Z.shape[1], False, False\n", + ")\n", + "forest_sampler_tau = ForestSampler(\n", + " dataset_tau,\n", + " feature_types_tau,\n", + " num_trees_tau,\n", + " n,\n", + " alpha_tau,\n", + " beta_tau,\n", + " min_samples_leaf_tau,\n", + ")\n", "leaf_var_model_tau = LeafVarianceModel()\n", "\n", "# Global classes\n", @@ -499,16 +595,22 @@ "num_warmstart = 10\n", "num_mcmc = 100\n", "num_samples = num_warmstart + num_mcmc\n", - "global_var_samples = np.concatenate((np.array([global_variance_init]), np.repeat(0, num_samples)))\n", - "leaf_scale_samples_mu = np.concatenate((np.array([tau_init_mu]), np.repeat(0, num_samples)))\n", - "leaf_scale_samples_tau = np.concatenate((np.array([tau_init_tau]), np.repeat(0, num_samples)))\n", + "global_var_samples = np.concatenate(\n", + " (np.array([global_variance_init]), np.repeat(0, num_samples))\n", + ")\n", + "leaf_scale_samples_mu = np.concatenate(\n", + " (np.array([tau_init_mu]), np.repeat(0, num_samples))\n", + ")\n", + "leaf_scale_samples_tau = np.concatenate(\n", + " (np.array([tau_init_tau]), np.repeat(0, num_samples))\n", + ")\n", "leaf_prior_scale_mu = np.array([[tau_init_mu]])\n", "leaf_prior_scale_tau = np.array([[tau_init_tau]])\n", "b_0_init = -0.5\n", "b_1_init = 0.5\n", "b_0_samples = np.concatenate((np.array([b_0_init]), np.repeat(0, num_samples)))\n", "b_1_samples = np.concatenate((np.array([b_1_init]), np.repeat(0, num_samples)))\n", - "tau_basis = (1-Z)*b_0_init + Z*b_1_init\n", + "tau_basis = (1 - Z) * b_0_init + Z * b_1_init\n", "dataset_tau.update_basis(tau_basis)" ] }, @@ -526,32 +628,74 @@ "outputs": [], "source": [ "for i in range(num_warmstart):\n", - " # Sample the prognostic forest\n", - " forest_sampler_mu.sample_one_iteration(forest_container_mu, active_forest_mu, dataset_mu, residual, cpp_rng, \n", - " feature_types_mu, cutpoint_grid_size_mu, leaf_prior_scale_mu, var_weights_mu, \n", - " 0.0, 0.0, global_var_samples[i], 0, True, True, False)\n", - " leaf_scale_samples_mu[i+1] = leaf_var_model_mu.sample_one_iteration(active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu)\n", - " leaf_prior_scale_mu[0,0] = leaf_scale_samples_mu[i+1]\n", - " mu_x = active_forest_mu.predict_raw(dataset_mu)\n", + " # Sample the prognostic forest\n", + " forest_sampler_mu.sample_one_iteration(\n", + " 
forest_container_mu,\n", + " active_forest_mu,\n", + " dataset_mu,\n", + " residual,\n", + " cpp_rng,\n", + " feature_types_mu,\n", + " cutpoint_grid_size_mu,\n", + " leaf_prior_scale_mu,\n", + " var_weights_mu,\n", + " 0.0,\n", + " 0.0,\n", + " global_var_samples[i],\n", + " 0,\n", + " True,\n", + " True,\n", + " False,\n", + " )\n", + " leaf_scale_samples_mu[i + 1] = leaf_var_model_mu.sample_one_iteration(\n", + " active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu\n", + " )\n", + " leaf_prior_scale_mu[0, 0] = leaf_scale_samples_mu[i + 1]\n", + " mu_x = active_forest_mu.predict_raw(dataset_mu)\n", "\n", - " # Sample the treatment effect forest\n", - " forest_sampler_tau.sample_one_iteration(forest_container_tau, active_forest_tau, dataset_tau, residual, cpp_rng, \n", - " feature_types_tau, cutpoint_grid_size_tau, leaf_prior_scale_tau, var_weights_tau, \n", - " 0.0, 0.0, global_var_samples[i], 1, True, True, False)\n", - " tau_x = np.squeeze(active_forest_tau.predict_raw(dataset_tau))\n", - " s_tt0 = np.sum(tau_x*tau_x*(Z==0))\n", - " s_tt1 = np.sum(tau_x*tau_x*(Z==1))\n", - " partial_resid_mu = resid - np.squeeze(mu_x)\n", - " s_ty0 = np.sum(tau_x*partial_resid_mu*(Z==0))\n", - " s_ty1 = np.sum(tau_x*partial_resid_mu*(Z==1))\n", - " b_0_samples[i+1] = rng.normal(loc = (s_ty0/(s_tt0 + 2*global_var_samples[i])), scale = np.sqrt(global_var_samples[i]/(s_tt0 + 2*global_var_samples[i])), size = 1)\n", - " b_1_samples[i+1] = rng.normal(loc = (s_ty1/(s_tt1 + 2*global_var_samples[i])), scale = np.sqrt(global_var_samples[i]/(s_tt1 + 2*global_var_samples[i])), size = 1)\n", - " tau_basis = (1-Z)*b_0_samples[i+1] + Z*b_1_samples[i+1]\n", - " dataset_tau.update_basis(tau_basis)\n", - " forest_sampler_tau.propagate_basis_update(dataset_tau, residual, active_forest_tau)\n", - " \n", - " # Sample global variance\n", - " global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global)" + " # Sample the treatment effect forest\n", + " forest_sampler_tau.sample_one_iteration(\n", + " forest_container_tau,\n", + " active_forest_tau,\n", + " dataset_tau,\n", + " residual,\n", + " cpp_rng,\n", + " feature_types_tau,\n", + " cutpoint_grid_size_tau,\n", + " leaf_prior_scale_tau,\n", + " var_weights_tau,\n", + " 0.0,\n", + " 0.0,\n", + " global_var_samples[i],\n", + " 1,\n", + " True,\n", + " True,\n", + " False,\n", + " )\n", + " tau_x = np.squeeze(active_forest_tau.predict_raw(dataset_tau))\n", + " s_tt0 = np.sum(tau_x * tau_x * (Z == 0))\n", + " s_tt1 = np.sum(tau_x * tau_x * (Z == 1))\n", + " partial_resid_mu = resid - np.squeeze(mu_x)\n", + " s_ty0 = np.sum(tau_x * partial_resid_mu * (Z == 0))\n", + " s_ty1 = np.sum(tau_x * partial_resid_mu * (Z == 1))\n", + " b_0_samples[i + 1] = rng.normal(\n", + " loc=(s_ty0 / (s_tt0 + 2 * global_var_samples[i])),\n", + " scale=np.sqrt(global_var_samples[i] / (s_tt0 + 2 * global_var_samples[i])),\n", + " size=1,\n", + " )\n", + " b_1_samples[i + 1] = rng.normal(\n", + " loc=(s_ty1 / (s_tt1 + 2 * global_var_samples[i])),\n", + " scale=np.sqrt(global_var_samples[i] / (s_tt1 + 2 * global_var_samples[i])),\n", + " size=1,\n", + " )\n", + " tau_basis = (1 - Z) * b_0_samples[i + 1] + Z * b_1_samples[i + 1]\n", + " dataset_tau.update_basis(tau_basis)\n", + " forest_sampler_tau.propagate_basis_update(dataset_tau, residual, active_forest_tau)\n", + "\n", + " # Sample global variance\n", + " global_var_samples[i + 1] = global_var_model.sample_one_iteration(\n", + " residual, cpp_rng, a_global, b_global\n", + " )" ] }, { @@ -568,32 +712,74 @@ 
"outputs": [], "source": [ "for i in range(num_warmstart, num_samples):\n", - " # Sample the prognostic forest\n", - " forest_sampler_mu.sample_one_iteration(forest_container_mu, active_forest_mu, dataset_mu, residual, cpp_rng, \n", - " feature_types_mu, cutpoint_grid_size_mu, leaf_prior_scale_mu, var_weights_mu, \n", - " 0.0, 0.0, global_var_samples[i], 0, True, False, False)\n", - " leaf_scale_samples_mu[i+1] = leaf_var_model_mu.sample_one_iteration(active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu)\n", - " leaf_prior_scale_mu[0,0] = leaf_scale_samples_mu[i+1]\n", - " mu_x = active_forest_mu.predict_raw(dataset_mu)\n", + " # Sample the prognostic forest\n", + " forest_sampler_mu.sample_one_iteration(\n", + " forest_container_mu,\n", + " active_forest_mu,\n", + " dataset_mu,\n", + " residual,\n", + " cpp_rng,\n", + " feature_types_mu,\n", + " cutpoint_grid_size_mu,\n", + " leaf_prior_scale_mu,\n", + " var_weights_mu,\n", + " 0.0,\n", + " 0.0,\n", + " global_var_samples[i],\n", + " 0,\n", + " True,\n", + " False,\n", + " False,\n", + " )\n", + " leaf_scale_samples_mu[i + 1] = leaf_var_model_mu.sample_one_iteration(\n", + " active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu\n", + " )\n", + " leaf_prior_scale_mu[0, 0] = leaf_scale_samples_mu[i + 1]\n", + " mu_x = active_forest_mu.predict_raw(dataset_mu)\n", + "\n", + " # Sample the treatment effect forest\n", + " forest_sampler_tau.sample_one_iteration(\n", + " forest_container_tau,\n", + " active_forest_tau,\n", + " dataset_tau,\n", + " residual,\n", + " cpp_rng,\n", + " feature_types_tau,\n", + " cutpoint_grid_size_tau,\n", + " leaf_prior_scale_tau,\n", + " var_weights_tau,\n", + " 0.0,\n", + " 0.0,\n", + " global_var_samples[i],\n", + " 1,\n", + " True,\n", + " False,\n", + " False,\n", + " )\n", + " tau_x = np.squeeze(active_forest_tau.predict_raw(dataset_tau))\n", + " s_tt0 = np.sum(tau_x * tau_x * (Z == 0))\n", + " s_tt1 = np.sum(tau_x * tau_x * (Z == 1))\n", + " partial_resid_mu = resid - np.squeeze(mu_x)\n", + " s_ty0 = np.sum(tau_x * partial_resid_mu * (Z == 0))\n", + " s_ty1 = np.sum(tau_x * partial_resid_mu * (Z == 1))\n", + " b_0_samples[i + 1] = rng.normal(\n", + " loc=(s_ty0 / (s_tt0 + 2 * global_var_samples[i])),\n", + " scale=np.sqrt(global_var_samples[i] / (s_tt0 + 2 * global_var_samples[i])),\n", + " size=1,\n", + " )\n", + " b_1_samples[i + 1] = rng.normal(\n", + " loc=(s_ty1 / (s_tt1 + 2 * global_var_samples[i])),\n", + " scale=np.sqrt(global_var_samples[i] / (s_tt1 + 2 * global_var_samples[i])),\n", + " size=1,\n", + " )\n", + " tau_basis = (1 - Z) * b_0_samples[i + 1] + Z * b_1_samples[i + 1]\n", + " dataset_tau.update_basis(tau_basis)\n", + " forest_sampler_tau.propagate_basis_update(dataset_tau, residual, active_forest_tau)\n", "\n", - " # Sample the treatment effect forest\n", - " forest_sampler_tau.sample_one_iteration(forest_container_tau, active_forest_tau, dataset_tau, residual, cpp_rng, \n", - " feature_types_tau, cutpoint_grid_size_tau, leaf_prior_scale_tau, var_weights_tau, \n", - " 0.0, 0.0, global_var_samples[i], 1, True, False, False)\n", - " tau_x = np.squeeze(active_forest_tau.predict_raw(dataset_tau))\n", - " s_tt0 = np.sum(tau_x*tau_x*(Z==0))\n", - " s_tt1 = np.sum(tau_x*tau_x*(Z==1))\n", - " partial_resid_mu = resid - np.squeeze(mu_x)\n", - " s_ty0 = np.sum(tau_x*partial_resid_mu*(Z==0))\n", - " s_ty1 = np.sum(tau_x*partial_resid_mu*(Z==1))\n", - " b_0_samples[i+1] = rng.normal(loc = (s_ty0/(s_tt0 + 2*global_var_samples[i])), scale = np.sqrt(global_var_samples[i]/(s_tt0 + 2*global_var_samples[i])), size = 
1)\n", - " b_1_samples[i+1] = rng.normal(loc = (s_ty1/(s_tt1 + 2*global_var_samples[i])), scale = np.sqrt(global_var_samples[i]/(s_tt1 + 2*global_var_samples[i])), size = 1)\n", - " tau_basis = (1-Z)*b_0_samples[i+1] + Z*b_1_samples[i+1]\n", - " dataset_tau.update_basis(tau_basis)\n", - " forest_sampler_tau.propagate_basis_update(dataset_tau, residual, active_forest_tau)\n", - " \n", - " # Sample global variance\n", - " global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global)" + " # Sample global variance\n", + " global_var_samples[i + 1] = global_var_model.sample_one_iteration(\n", + " residual, cpp_rng, a_global, b_global\n", + " )" ] }, { @@ -610,24 +796,28 @@ "outputs": [], "source": [ "# Forest predictions\n", - "forest_preds_mu = forest_container_mu.predict(dataset_mu)*y_std + y_bar\n", - "forest_preds_mu_gfr = forest_preds_mu[:,:num_warmstart]\n", - "forest_preds_mu_mcmc = forest_preds_mu[:,num_warmstart:num_samples]\n", - "treatment_coding_samples = (b_1_samples[1:] - b_0_samples[1:])\n", - "forest_preds_tau = (forest_container_tau.predict_raw(dataset_tau)*y_std*np.expand_dims(treatment_coding_samples, axis=(0,2)))\n", - "forest_preds_tau_gfr = forest_preds_tau[:,:num_warmstart]\n", - "forest_preds_tau_mcmc = forest_preds_tau[:,num_warmstart:num_samples]\n", + "forest_preds_mu = forest_container_mu.predict(dataset_mu) * y_std + y_bar\n", + "forest_preds_mu_gfr = forest_preds_mu[:, :num_warmstart]\n", + "forest_preds_mu_mcmc = forest_preds_mu[:, num_warmstart:num_samples]\n", + "treatment_coding_samples = b_1_samples[1:] - b_0_samples[1:]\n", + "forest_preds_tau = (\n", + " forest_container_tau.predict_raw(dataset_tau)\n", + " * y_std\n", + " * np.expand_dims(treatment_coding_samples, axis=(0, 2))\n", + ")\n", + "forest_preds_tau_gfr = forest_preds_tau[:, :num_warmstart]\n", + "forest_preds_tau_mcmc = forest_preds_tau[:, num_warmstart:num_samples]\n", "\n", "# Global error variance\n", - "sigma_samples = np.sqrt(global_var_samples)*y_std\n", + "sigma_samples = np.sqrt(global_var_samples) * y_std\n", "sigma_samples_gfr = sigma_samples[:num_warmstart]\n", "sigma_samples_mcmc = sigma_samples[num_warmstart:num_samples]\n", "\n", "# Adaptive coding parameters\n", - "b_1_samples_gfr = b_1_samples[1:(num_warmstart+1)]*y_std\n", - "b_0_samples_gfr = b_0_samples[1:(num_warmstart+1)]*y_std\n", - "b_1_samples_mcmc = b_1_samples[(num_warmstart+1):]*y_std\n", - "b_0_samples_mcmc = b_0_samples[(num_warmstart+1):]*y_std" + "b_1_samples_gfr = b_1_samples[1 : (num_warmstart + 1)] * y_std\n", + "b_0_samples_gfr = b_0_samples[1 : (num_warmstart + 1)] * y_std\n", + "b_1_samples_mcmc = b_1_samples[(num_warmstart + 1) :] * y_std\n", + "b_0_samples_mcmc = b_0_samples[(num_warmstart + 1) :] * y_std" ] }, { @@ -643,10 +833,13 @@ "metadata": {}, "outputs": [], "source": [ - "forest_preds_tau_avg_gfr = np.squeeze(forest_preds_tau_gfr).mean(axis = 1, keepdims = True)\n", - "forest_pred_tau_df_gfr = pd.DataFrame(np.concatenate((np.expand_dims(tau_X,1), forest_preds_tau_avg_gfr), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "forest_preds_tau_avg_gfr = np.squeeze(forest_preds_tau_gfr).mean(axis=1, keepdims=True)\n", + "forest_pred_tau_df_gfr = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(tau_X, 1), forest_preds_tau_avg_gfr), axis=1),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=forest_pred_tau_df_gfr, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, 
color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -656,10 +849,13 @@ "metadata": {}, "outputs": [], "source": [ - "forest_pred_avg_gfr = np.squeeze(forest_preds_mu_gfr).mean(axis = 1, keepdims = True)\n", - "forest_pred_df_gfr = pd.DataFrame(np.concatenate((np.expand_dims(mu_X,1), forest_pred_avg_gfr), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + "forest_pred_avg_gfr = np.squeeze(forest_preds_mu_gfr).mean(axis=1, keepdims=True)\n", + "forest_pred_df_gfr = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(mu_X, 1), forest_pred_avg_gfr), axis=1),\n", + " columns=[\"True mu\", \"Average estimated mu\"],\n", + ")\n", "sns.scatterplot(data=forest_pred_df_gfr, x=\"True mu\", y=\"Average estimated mu\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -669,7 +865,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_gfr = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(num_warmstart),axis=1), np.expand_dims(sigma_samples_gfr,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_gfr = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(num_warmstart), axis=1),\n", + " np.expand_dims(sigma_samples_gfr, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_gfr, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -680,7 +885,17 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_gfr = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(num_warmstart),axis=1), np.expand_dims(b_0_samples_gfr,axis=1), np.expand_dims(b_1_samples_gfr,axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "b_df_gfr = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(num_warmstart), axis=1),\n", + " np.expand_dims(b_0_samples_gfr, axis=1),\n", + " np.expand_dims(b_1_samples_gfr, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"],\n", + ")\n", "sns.scatterplot(data=b_df_gfr, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_gfr, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" @@ -699,10 +914,13 @@ "metadata": {}, "outputs": [], "source": [ - "forest_pred_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", - "forest_pred_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_X,1), forest_pred_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", + "forest_pred_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)\n", + "forest_pred_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(tau_X, 1), forest_pred_avg_mcmc), axis=1),\n", + " columns=[\"True tau\", \"Average estimated tau\"],\n", + ")\n", "sns.scatterplot(data=forest_pred_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -712,10 +930,13 @@ "metadata": {}, "outputs": [], "source": [ - "forest_pred_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", - "forest_pred_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_X,1), forest_pred_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", + 
"forest_pred_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)\n", + "forest_pred_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(mu_X, 1), forest_pred_avg_mcmc), axis=1),\n", + " columns=[\"True mu\", \"Average estimated mu\"],\n", + ")\n", "sns.scatterplot(data=forest_pred_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -725,7 +946,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(num_samples - num_warmstart),axis=1), np.expand_dims(sigma_samples_mcmc,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(num_samples - num_warmstart), axis=1),\n", + " np.expand_dims(sigma_samples_mcmc, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -736,7 +966,17 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(num_samples - num_warmstart),axis=1), np.expand_dims(b_0_samples_mcmc,axis=1), np.expand_dims(b_1_samples_mcmc,axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "b_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(num_samples - num_warmstart), axis=1),\n", + " np.expand_dims(b_0_samples_mcmc, axis=1),\n", + " np.expand_dims(b_1_samples_mcmc, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"],\n", + ")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" diff --git a/demo/notebooks/serialization.ipynb b/demo/notebooks/serialization.ipynb index 646b0be4..7f1d0ecd 100644 --- a/demo/notebooks/serialization.ipynb +++ b/demo/notebooks/serialization.ipynb @@ -61,19 +61,20 @@ "X = rng.uniform(0, 1, (n, p_X))\n", "W = rng.uniform(0, 1, (n, p_W))\n", "\n", + "\n", "# Define the outcome mean function\n", "def outcome_mean(X, W):\n", " return np.where(\n", - " (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], \n", + " (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),\n", + " -7.5 * W[:, 0],\n", " np.where(\n", - " (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], \n", - " np.where(\n", - " (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], \n", - " 7.5 * W[:,0]\n", - " )\n", - " )\n", + " (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),\n", + " -2.5 * W[:, 0],\n", + " np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5 * W[:, 0], 7.5 * W[:, 0]),\n", + " ),\n", " )\n", "\n", + "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", "y = outcome_mean(X, W) + epsilon\n", @@ -81,7 +82,7 @@ "# Standardize outcome\n", "y_bar = np.mean(y)\n", "y_std = np.std(y)\n", - "resid = (y-y_bar)/y_std" + "resid = (y - y_bar) / y_std" ] }, { @@ -99,10 +100,10 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", - "basis_train = W[train_inds,:]\n", - "basis_test = W[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", + "basis_train = W[train_inds, :]\n", + "basis_test = W[test_inds, :]\n", "y_train = 
y[train_inds]\n", "y_test = y[test_inds]" ] @@ -121,7 +122,15 @@ "outputs": [], "source": [ "bart_model = BARTModel()\n", - "bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train, X_test=X_test, basis_test=basis_test, num_gfr=10, num_mcmc=10)" + "bart_model.sample(\n", + " X_train=X_train,\n", + " y_train=y_train,\n", + " basis_train=basis_train,\n", + " X_test=X_test,\n", + " basis_test=basis_test,\n", + " num_gfr=10,\n", + " num_mcmc=10,\n", + ")" ] }, { @@ -138,10 +147,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bart_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -151,7 +163,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bart_model.num_samples), axis=1),\n", + " np.expand_dims(bart_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -169,7 +190,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc),2)))" + "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc), 2)))" ] }, { @@ -219,10 +240,13 @@ "outputs": [], "source": [ "y_hat_deserialized = bart_model_deserialized.predict(X_test, basis_test)\n", - "y_avg_mcmc_deserialized = np.squeeze(y_hat_deserialized).mean(axis = 1, keepdims = True)\n", - "y_df = pd.DataFrame(np.concatenate((y_avg_mcmc, y_avg_mcmc_deserialized), axis = 1), columns=[\"Original model\", \"Deserialized model\"])\n", + "y_avg_mcmc_deserialized = np.squeeze(y_hat_deserialized).mean(axis=1, keepdims=True)\n", + "y_df = pd.DataFrame(\n", + " np.concatenate((y_avg_mcmc, y_avg_mcmc_deserialized), axis=1),\n", + " columns=[\"Original model\", \"Deserialized model\"],\n", + ")\n", "sns.scatterplot(data=y_df, x=\"Original model\", y=\"Deserialized model\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -239,9 +263,12 @@ "metadata": {}, "outputs": [], "source": [ - "sigma2_df = pd.DataFrame(np.c_[bart_model.global_var_samples, bart_model_deserialized.global_var_samples], columns=[\"Original model\", \"Deserialized model\"])\n", + "sigma2_df = pd.DataFrame(\n", + " np.c_[bart_model.global_var_samples, bart_model_deserialized.global_var_samples],\n", + " columns=[\"Original model\", \"Deserialized model\"],\n", + ")\n", "sns.scatterplot(data=sigma2_df, x=\"Original model\", 
y=\"Deserialized model\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -258,9 +285,9 @@ "metadata": {}, "outputs": [], "source": [ - "with open('bart.json', 'w') as f:\n", - " bart_json_python = json.loads(bart_json_string)\n", - " json.dump(bart_json_python, f)" + "with open(\"bart.json\", \"w\") as f:\n", + " bart_json_python = json.loads(bart_json_string)\n", + " json.dump(bart_json_python, f)" ] }, { @@ -276,8 +303,8 @@ "metadata": {}, "outputs": [], "source": [ - "with open('bart.json', 'r') as f:\n", - " bart_json_python_reload = json.load(f)\n", + "with open(\"bart.json\", \"r\") as f:\n", + " bart_json_python_reload = json.load(f)\n", "bart_json_string_reload = json.dumps(bart_json_python_reload)\n", "bart_model_file_deserialized = BARTModel()\n", "bart_model_file_deserialized.from_json(bart_json_string_reload)" @@ -297,10 +324,15 @@ "outputs": [], "source": [ "y_hat_file_deserialized = bart_model_file_deserialized.predict(X_test, basis_test)\n", - "y_avg_mcmc_file_deserialized = np.squeeze(y_hat_file_deserialized).mean(axis = 1, keepdims = True)\n", - "y_df = pd.DataFrame(np.concatenate((y_avg_mcmc, y_avg_mcmc_file_deserialized), axis = 1), columns=[\"Original model\", \"Deserialized model\"])\n", + "y_avg_mcmc_file_deserialized = np.squeeze(y_hat_file_deserialized).mean(\n", + " axis=1, keepdims=True\n", + ")\n", + "y_df = pd.DataFrame(\n", + " np.concatenate((y_avg_mcmc, y_avg_mcmc_file_deserialized), axis=1),\n", + " columns=[\"Original model\", \"Deserialized model\"],\n", + ")\n", "sns.scatterplot(data=y_df, x=\"Original model\", y=\"Deserialized model\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -317,9 +349,14 @@ "metadata": {}, "outputs": [], "source": [ - "sigma2_df = pd.DataFrame(np.c_[bart_model.global_var_samples, bart_model_file_deserialized.global_var_samples], columns=[\"Original model\", \"Deserialized model\"])\n", + "sigma2_df = pd.DataFrame(\n", + " np.c_[\n", + " bart_model.global_var_samples, bart_model_file_deserialized.global_var_samples\n", + " ],\n", + " columns=[\"Original model\", \"Deserialized model\"],\n", + ")\n", "sns.scatterplot(data=sigma2_df, x=\"Original model\", y=\"Deserialized model\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -336,7 +373,7 @@ "metadata": {}, "outputs": [], "source": [ - "os.remove('bart.json')" + "os.remove(\"bart.json\")" ] } ], diff --git a/demo/notebooks/supervised_learning.ipynb b/demo/notebooks/supervised_learning.ipynb index ff96872f..522c64d8 100644 --- a/demo/notebooks/supervised_learning.ipynb +++ b/demo/notebooks/supervised_learning.ipynb @@ -52,19 +52,20 @@ "X = rng.uniform(0, 1, (n, p_X))\n", "W = rng.uniform(0, 1, (n, p_W))\n", "\n", + "\n", "# Define the outcome mean function\n", "def outcome_mean(X, W):\n", " return np.where(\n", - " (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], \n", + " (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),\n", + " -7.5 * W[:, 0],\n", " np.where(\n", - " (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], \n", - " np.where(\n", - " (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], \n", - " 7.5 * W[:,0]\n", - " )\n", - " )\n", + " (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),\n", + " -2.5 * W[:, 0],\n", + " 
np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5 * W[:, 0], 7.5 * W[:, 0]),\n", + " ),\n", " )\n", "\n", + "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", "y = outcome_mean(X, W) + epsilon\n", @@ -72,7 +73,7 @@ "# Standardize outcome\n", "y_bar = np.mean(y)\n", "y_std = np.std(y)\n", - "resid = (y-y_bar)/y_std" + "resid = (y - y_bar) / y_std" ] }, { @@ -90,10 +91,10 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", - "basis_train = W[train_inds,:]\n", - "basis_test = W[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", + "basis_train = W[train_inds, :]\n", + "basis_test = W[test_inds, :]\n", "y_train = y[train_inds]\n", "y_test = y[test_inds]" ] @@ -120,7 +121,16 @@ "source": [ "bart_model = BARTModel()\n", "general_params = {\"num_chains\": 3}\n", - "bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train, X_test=X_test, basis_test=basis_test, num_gfr=10, num_mcmc=100, general_params=general_params)" + "bart_model.sample(\n", + " X_train=X_train,\n", + " y_train=y_train,\n", + " basis_train=basis_train,\n", + " X_test=X_test,\n", + " basis_test=basis_test,\n", + " num_gfr=10,\n", + " num_mcmc=100,\n", + " general_params=general_params,\n", + ")" ] }, { @@ -137,10 +147,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bart_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -150,7 +163,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bart_model.num_samples), axis=1),\n", + " np.expand_dims(bart_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -168,7 +190,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc),2)))" + "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc), 2)))" ] }, { @@ -194,7 +216,9 @@ "bart_model = BARTModel()\n", "X_train_aug = np.c_[X_train, basis_train]\n", "X_test_aug = np.c_[X_test, basis_test]\n", - "bart_model.sample(X_train=X_train_aug, y_train=y_train, X_test=X_test_aug, num_gfr=10, num_mcmc=100)" + "bart_model.sample(\n", + " X_train=X_train_aug, y_train=y_train, X_test=X_test_aug, num_gfr=10, num_mcmc=100\n", + ")" ] }, { @@ -211,10 +235,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = 
bart_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -224,7 +251,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bart_model.num_samples), axis=1),\n", + " np.expand_dims(bart_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -242,7 +278,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc),2)))" + "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc), 2)))" ] }, { @@ -266,7 +302,9 @@ "outputs": [], "source": [ "bart_model = BARTModel()\n", - "bart_model.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=10, num_mcmc=100)" + "bart_model.sample(\n", + " X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=10, num_mcmc=100\n", + ")" ] }, { @@ -283,10 +321,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bart_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -296,7 +337,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bart_model.num_samples), axis=1),\n", + " np.expand_dims(bart_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -314,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.sqrt(np.mean(np.power(y_test - 
np.squeeze(y_avg_mcmc),2)))" + "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc), 2)))" ] } ], diff --git a/demo/notebooks/tree_inspection.ipynb b/demo/notebooks/tree_inspection.ipynb index 089baf49..093b2414 100644 --- a/demo/notebooks/tree_inspection.ipynb +++ b/demo/notebooks/tree_inspection.ipynb @@ -59,19 +59,20 @@ "p_X = 10\n", "X = rng.uniform(0, 1, (n, p_X))\n", "\n", + "\n", "# Define the outcome mean function\n", "def outcome_mean(X):\n", " return np.where(\n", - " (X[:,9] >= 0.0) & (X[:,9] < 0.25), -7.5, \n", + " (X[:, 9] >= 0.0) & (X[:, 9] < 0.25),\n", + " -7.5,\n", " np.where(\n", - " (X[:,9] >= 0.25) & (X[:,9] < 0.5), -2.5, \n", - " np.where(\n", - " (X[:,9] >= 0.5) & (X[:,9] < 0.75), 2.5, \n", - " 7.5\n", - " )\n", - " )\n", + " (X[:, 9] >= 0.25) & (X[:, 9] < 0.5),\n", + " -2.5,\n", + " np.where((X[:, 9] >= 0.5) & (X[:, 9] < 0.75), 2.5, 7.5),\n", + " ),\n", " )\n", "\n", + "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", "y = outcome_mean(X) + epsilon\n", @@ -79,7 +80,7 @@ "# Standardize outcome\n", "y_bar = np.mean(y)\n", "y_std = np.std(y)\n", - "resid = (y-y_bar)/y_std" + "resid = (y - y_bar) / y_std" ] }, { @@ -97,8 +98,8 @@ "source": [ "sample_inds = np.arange(n)\n", "train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n", - "X_train = X[train_inds,:]\n", - "X_test = X[test_inds,:]\n", + "X_train = X[train_inds, :]\n", + "X_test = X[test_inds, :]\n", "y_train = y[train_inds]\n", "y_test = y[test_inds]" ] @@ -118,7 +119,14 @@ "source": [ "bart_model = BARTModel()\n", "param_dict = {\"keep_gfr\": True}\n", - "bart_model.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=10, num_mcmc=10, mean_forest_params=param_dict)" + "bart_model.sample(\n", + " X_train=X_train,\n", + " y_train=y_train,\n", + " X_test=X_test,\n", + " num_gfr=10,\n", + " num_mcmc=10,\n", + " mean_forest_params=param_dict,\n", + ")" ] }, { @@ -135,10 +143,13 @@ "outputs": [], "source": [ "forest_preds_y_mcmc = bart_model.y_hat_test\n", - "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", - "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", + "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)\n", + "y_df_mcmc = pd.DataFrame(\n", + " np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),\n", + " columns=[\"True outcome\", \"Average estimated outcome\"],\n", + ")\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", - "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n", + "plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3, 3)))\n", "plt.show()" ] }, @@ -148,7 +159,16 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(\n", + " np.concatenate(\n", + " (\n", + " np.expand_dims(np.arange(bart_model.num_samples), axis=1),\n", + " np.expand_dims(bart_model.global_var_samples, axis=1),\n", + " ),\n", + " axis=1,\n", + " ),\n", + " columns=[\"Sample\", \"Sigma\"],\n", + ")\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -166,7 +186,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc),2)))" + 
"np.sqrt(np.mean(np.power(y_test - np.squeeze(y_avg_mcmc), 2)))" ] }, { @@ -216,7 +236,7 @@ "metadata": {}, "outputs": [], "source": [ - "splits[9,0,:]" + "splits[9, 0, :]" ] }, { @@ -232,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "splits[9,1,:]" + "splits[9, 1, :]" ] }, { @@ -248,7 +268,7 @@ "metadata": {}, "outputs": [], "source": [ - "splits[9,20,:]" + "splits[9, 20, :]" ] }, { @@ -257,7 +277,7 @@ "metadata": {}, "outputs": [], "source": [ - "splits[9,30,:]" + "splits[9, 30, :]" ] }, { @@ -290,28 +310,48 @@ "metadata": {}, "outputs": [], "source": [ - "nodes = np.sort(bart_model.forest_container_mean.nodes(forest_num,tree_num))\n", + "nodes = np.sort(bart_model.forest_container_mean.nodes(forest_num, tree_num))\n", "for nid in nodes:\n", - " if bart_model.forest_container_mean.is_leaf_node(forest_num,tree_num,nid):\n", - " print(\n", - " \"{space}node={node} is a leaf node with value={value}.\".format(\n", - " space=bart_model.forest_container_mean.node_depth(forest_num,tree_num,nid) * \"\\t\", \n", - " node=nid, value=np.around(bart_model.forest_container_mean.node_leaf_values(forest_num,tree_num,nid), 3)\n", - " )\n", - " )\n", - " else:\n", - " print(\n", - " \"{space}node={node} is a split node, which tells us to \"\n", - " \"go to node {left} if X[:, {feature}] <= {threshold} \"\n", - " \"else to node {right}.\".format(\n", - " space=bart_model.forest_container_mean.node_depth(forest_num,tree_num,nid) * \"\\t\",\n", - " node=nid,\n", - " left=bart_model.forest_container_mean.left_child_node(forest_num,tree_num,nid),\n", - " feature=bart_model.forest_container_mean.node_split_index(forest_num,tree_num,nid),\n", - " threshold=bart_model.forest_container_mean.node_split_threshold(forest_num,tree_num,nid),\n", - " right=bart_model.forest_container_mean.right_child_node(forest_num,tree_num,nid),\n", - " )\n", - " )" + " if bart_model.forest_container_mean.is_leaf_node(forest_num, tree_num, nid):\n", + " print(\n", + " \"{space}node={node} is a leaf node with value={value}.\".format(\n", + " space=bart_model.forest_container_mean.node_depth(\n", + " forest_num, tree_num, nid\n", + " )\n", + " * \"\\t\",\n", + " node=nid,\n", + " value=np.around(\n", + " bart_model.forest_container_mean.node_leaf_values(\n", + " forest_num, tree_num, nid\n", + " ),\n", + " 3,\n", + " ),\n", + " )\n", + " )\n", + " else:\n", + " print(\n", + " \"{space}node={node} is a split node, which tells us to \"\n", + " \"go to node {left} if X[:, {feature}] <= {threshold} \"\n", + " \"else to node {right}.\".format(\n", + " space=bart_model.forest_container_mean.node_depth(\n", + " forest_num, tree_num, nid\n", + " )\n", + " * \"\\t\",\n", + " node=nid,\n", + " left=bart_model.forest_container_mean.left_child_node(\n", + " forest_num, tree_num, nid\n", + " ),\n", + " feature=bart_model.forest_container_mean.node_split_index(\n", + " forest_num, tree_num, nid\n", + " ),\n", + " threshold=bart_model.forest_container_mean.node_split_threshold(\n", + " forest_num, tree_num, nid\n", + " ),\n", + " right=bart_model.forest_container_mean.right_child_node(\n", + " forest_num, tree_num, nid\n", + " ),\n", + " )\n", + " )" ] } ], From d6afe5917c7d72ad1a32a50ea14f29b714923534 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 18:36:53 -0600 Subject: [PATCH 05/35] Used ruff to format imports in python demo notebooks --- demo/notebooks/causal_inference.ipynb | 7 ++++--- demo/notebooks/causal_inference_feature_subsets.ipynb | 7 ++++--- .../heteroskedastic_supervised_learning.ipynb | 8 +++++--- 
.../multivariate_treatment_causal_inference.ipynb | 7 ++++--- demo/notebooks/prototype_interface.ipynb | 11 ++++++----- demo/notebooks/serialization.ipynb | 10 ++++++---- demo/notebooks/supervised_learning.ipynb | 7 ++++--- demo/notebooks/tree_inspection.ipynb | 7 ++++--- 8 files changed, 37 insertions(+), 27 deletions(-) diff --git a/demo/notebooks/causal_inference.ipynb b/demo/notebooks/causal_inference.ipynb index 80565aed..40ac4725 100644 --- a/demo/notebooks/causal_inference.ipynb +++ b/demo/notebooks/causal_inference.ipynb @@ -20,12 +20,13 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BCFModel\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "\n", + "from stochtree import BCFModel" ] }, { diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb index 8b9169a9..e64a5164 100644 --- a/demo/notebooks/causal_inference_feature_subsets.ipynb +++ b/demo/notebooks/causal_inference_feature_subsets.ipynb @@ -25,12 +25,13 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BCFModel\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "\n", + "from stochtree import BCFModel" ] }, { diff --git a/demo/notebooks/heteroskedastic_supervised_learning.ipynb b/demo/notebooks/heteroskedastic_supervised_learning.ipynb index 898833c6..ff90ec43 100644 --- a/demo/notebooks/heteroskedastic_supervised_learning.ipynb +++ b/demo/notebooks/heteroskedastic_supervised_learning.ipynb @@ -20,13 +20,15 @@ "metadata": {}, "outputs": [], "source": [ + "from math import sqrt\n", + "\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BARTModel\n", "from sklearn.model_selection import train_test_split\n", - "from math import sqrt" + "\n", + "from stochtree import BARTModel" ] }, { diff --git a/demo/notebooks/multivariate_treatment_causal_inference.ipynb b/demo/notebooks/multivariate_treatment_causal_inference.ipynb index 6e7e17a8..5c3d10b9 100644 --- a/demo/notebooks/multivariate_treatment_causal_inference.ipynb +++ b/demo/notebooks/multivariate_treatment_causal_inference.ipynb @@ -20,12 +20,13 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BCFModel\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "\n", + "from stochtree import BCFModel" ] }, { diff --git a/demo/notebooks/prototype_interface.ipynb b/demo/notebooks/prototype_interface.ipynb index 972b291a..90a0c564 100644 --- a/demo/notebooks/prototype_interface.ipynb +++ b/demo/notebooks/prototype_interface.ipynb @@ -57,19 +57,20 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", + "\n", "from 
stochtree import (\n", - " Dataset,\n", - " Residual,\n", " RNG,\n", - " ForestSampler,\n", - " ForestContainer,\n", + " Dataset,\n", " Forest,\n", + " ForestContainer,\n", + " ForestSampler,\n", " GlobalVarianceModel,\n", " LeafVarianceModel,\n", + " Residual,\n", ")" ] }, diff --git a/demo/notebooks/serialization.ipynb b/demo/notebooks/serialization.ipynb index 7f1d0ecd..4093ccd3 100644 --- a/demo/notebooks/serialization.ipynb +++ b/demo/notebooks/serialization.ipynb @@ -28,13 +28,15 @@ "outputs": [], "source": [ "import json\n", - "import numpy as np\n", "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BARTModel\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "\n", + "from stochtree import BARTModel" ] }, { diff --git a/demo/notebooks/supervised_learning.ipynb b/demo/notebooks/supervised_learning.ipynb index 522c64d8..57a8c9b2 100644 --- a/demo/notebooks/supervised_learning.ipynb +++ b/demo/notebooks/supervised_learning.ipynb @@ -20,12 +20,13 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BARTModel\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "\n", + "from stochtree import BARTModel" ] }, { diff --git a/demo/notebooks/tree_inspection.ipynb b/demo/notebooks/tree_inspection.ipynb index 093b2414..38a9f4ac 100644 --- a/demo/notebooks/tree_inspection.ipynb +++ b/demo/notebooks/tree_inspection.ipynb @@ -22,12 +22,13 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from stochtree import BARTModel\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "\n", + "from stochtree import BARTModel" ] }, { From 6759c46ee15a2eba425ae0d01d17f5321c212d05 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 19:34:52 -0600 Subject: [PATCH 06/35] Updated demo notebooks --- demo/notebooks/causal_inference_feature_subsets.ipynb | 6 +++--- demo/notebooks/serialization.ipynb | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb index e64a5164..d2ada0d6 100644 --- a/demo/notebooks/causal_inference_feature_subsets.ipynb +++ b/demo/notebooks/causal_inference_feature_subsets.ipynb @@ -119,7 +119,7 @@ " pi_test,\n", " num_gfr=10,\n", " num_mcmc=100,\n", - " params={\"keep_every\": 5},\n", + " general_params={\"keep_every\": 5},\n", ")" ] }, @@ -237,7 +237,7 @@ "outputs": [], "source": [ "bcf_model_subset = BCFModel()\n", - "bcf_params = {\"keep_vars_tau\": [0, 1]}\n", + "tau_params = {\"keep_vars\": [0, 1]}\n", "bcf_model_subset.sample(\n", " X_train,\n", " Z_train,\n", @@ -248,7 +248,7 @@ " pi_test,\n", " num_gfr=10,\n", " num_mcmc=100,\n", - " params=bcf_params,\n", + " tau_forest_params=tau_params,\n", ")" ] }, diff --git a/demo/notebooks/serialization.ipynb b/demo/notebooks/serialization.ipynb index 4093ccd3..762affe6 100644 --- a/demo/notebooks/serialization.ipynb +++ 
b/demo/notebooks/serialization.ipynb @@ -57,7 +57,7 @@ "rng = np.random.default_rng(random_seed)\n", "\n", "# Generate covariates and basis\n", - "n = 1000\n", + "n = 100\n", "p_X = 10\n", "p_W = 1\n", "X = rng.uniform(0, 1, (n, p_X))\n", From 859e04fb70873e3e4c6631c14ca0d0065df91bfa Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Fri, 14 Feb 2025 22:12:15 -0600 Subject: [PATCH 07/35] Removed pandas import from calibration python file --- stochtree/calibration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stochtree/calibration.py b/stochtree/calibration.py index d09124fa..fb6be1db 100644 --- a/stochtree/calibration.py +++ b/stochtree/calibration.py @@ -1,7 +1,6 @@ import warnings import numpy as np -import pandas as pd from scipy.stats import gamma from sklearn import linear_model from sklearn.metrics import mean_squared_error From 207a3d0b5e979f7ff945ba3cd813d97f2dd10b00 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 18 Feb 2025 00:48:15 -0600 Subject: [PATCH 08/35] Updated R config code and added python ForestModelConfig object code --- R/config.R | 12 +- stochtree/config.py | 395 ++++++++++++++++++++++++++++++++++++++++++++ stochtree/utils.py | 166 +++++++++++++++++++ 3 files changed, 570 insertions(+), 3 deletions(-) create mode 100644 stochtree/config.py diff --git a/R/config.R b/R/config.R index 08693674..c9b04e3f 100644 --- a/R/config.R +++ b/R/config.R @@ -61,7 +61,7 @@ ForestModelConfig <- R6::R6Class( #' Create a new ForestModelConfig object. #' - #' @param feature_types Vector of integer-coded feature types (integers where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical) + #' @param feature_types Vector of integer-coded feature types (where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical) #' @param num_trees Number of trees in the forest being sampled #' @param num_features Number of features in training dataset #' @param num_observations Number of observations in training dataset @@ -98,6 +98,12 @@ ForestModelConfig <- R6::R6Class( warning("`variable_weights` not provided, will be assumed to be equal-weighted") variable_weights <- rep(1/num_features, num_features) } + if (is.null(num_trees)) { + stop("num_trees must be provided") + } + if (is.null(num_observations)) { + stop("num_observations must be provided") + } if (num_features != length(feature_types)) { stop("`feature_types` must have `num_features` total elements") } @@ -175,14 +181,14 @@ ForestModelConfig <- R6::R6Class( }, #' @description - #' Update root node split probability in tree prior + #' Update minimum number of samples per leaf node in the tree prior #' @param min_samples_leaf Minimum number of samples in a tree leaf update_min_samples_leaf = function(min_samples_leaf) { self$min_samples_leaf <- min_samples_leaf }, #' @description - #' Update root node split probability in tree prior + #' Update max depth in the tree prior #' @param max_depth Maximum depth of any tree in the ensemble in the model update_max_depth = function(max_depth) { self$max_depth <- max_depth diff --git a/stochtree/config.py b/stochtree/config.py new file mode 100644 index 00000000..61a25c96 --- /dev/null +++ b/stochtree/config.py @@ -0,0 +1,395 @@ +from typing import Union +import warnings + +import numpy as np + +from .utils import _standardize_array_to_np, _check_is_int, _check_matrix_square + +class ForestModelConfig: + """ + Object used to get / set parameters and other model configuration options for a forest model in the "low-level" stochtree interface. 
+
+    The "low-level" stochtree interface enables a high degree of sampler customization, in which users employ Python wrappers around
+    C++ objects like `ForestDataset`, `Outcome`, `CppRng`, and `ForestModel` to run the Gibbs sampler of a BART model with custom modifications.
+    `ForestModelConfig` allows users to specify / query the parameters of a forest model they wish to run.
+
+    Parameters
+    ----------
+    num_trees : int
+        Number of trees in the forest being sampled
+    num_features : int
+        Number of features in training dataset
+    num_observations : int
+        Number of observations in training dataset
+    feature_types : np.array or list, optional
+        Vector of integer-coded feature types (where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical)
+    variable_weights : np.array or list, optional
+        Vector specifying sampling probability for all p covariates in ForestDataset
+    leaf_dimension : int, optional
+        Dimension of the leaf model (default: `1`)
+    alpha : float, optional
+        Root node split probability in tree prior (default: `0.95`)
+    beta : float, optional
+        Depth prior penalty in tree prior (default: `2.0`)
+    min_samples_leaf : int, optional
+        Minimum number of samples in a tree leaf (default: `5`)
+    max_depth : int, optional
+        Maximum depth of any tree in the ensemble in the model. Setting to `-1` does not enforce any depth limits on trees. Default: `-1`.
+    leaf_model_type : int, optional
+        Integer specifying the leaf model type (0 = constant leaf, 1 = univariate leaf regression, 2 = multivariate leaf regression, 3 = inverse gamma leaf variance). Default: `1`.
+    leaf_model_scale : float or np.ndarray, optional
+        Scale parameter used in Gaussian leaf models (can either be a scalar or a q x q matrix, where q is the dimensionality of the basis and is only >1 when `leaf_model_type = 2`). Calibrated internally as `1/num_trees`, propagated along diagonal if needed for multivariate leaf models.
+    variance_forest_shape : float, optional
+        Shape parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1.0`.
+    variance_forest_scale : float, optional
+        Scale parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1.0`.
+    cutpoint_grid_size : int, optional
+        Number of unique cutpoints to consider (default: `100`)
+    """
+
+    def __init__(self, num_trees = None, num_features = None, num_observations = None,
+                 feature_types = None, variable_weights = None, leaf_dimension = 1,
+                 alpha = 0.95, beta = 2.0, min_samples_leaf = 5, max_depth = -1,
+                 leaf_model_type = 1, leaf_model_scale = None, variance_forest_shape = 1.0,
+                 variance_forest_scale = 1.0, cutpoint_grid_size = 100) -> None:
+        # Preprocess inputs and run some error checks
+        if feature_types is None:
+            if num_features is None:
+                raise ValueError("Neither of `num_features` nor `feature_types` (a vector from which `num_features` can be inferred) was provided.",
+                                 "Please provide at least one of these inputs when creating a `ForestModelConfig` object.")
+            warnings.warn("`feature_types` not provided, will be assumed to be numeric")
+            self.feature_types = np.repeat(0, num_features)
+        else:
+            self.feature_types = _standardize_array_to_np(feature_types)
+            if num_features is None:
+                num_features = len(self.feature_types)
+        if variable_weights is None:
+            warnings.warn("`variable_weights` not provided, will be assumed to be equal-weighted")
+            self.variable_weights = np.repeat(1.0 / num_features, num_features)
+        else:
+            self.variable_weights = _standardize_array_to_np(variable_weights)
+        if num_trees is None:
+            raise ValueError("`num_trees` must be provided")
+        if num_observations is None:
+            raise ValueError("`num_observations` must be provided")
+        if num_features != len(self.feature_types):
+            raise ValueError("`feature_types` must have `num_features` total elements")
+        if num_features != len(self.variable_weights):
+            raise ValueError("`variable_weights` must have `num_features` total elements")
+        if not _check_is_int(leaf_model_type):
+            raise ValueError("`leaf_model_type` must be an integer between 0 and 3")
+        elif leaf_model_type < 0 or leaf_model_type > 3:
+            raise ValueError("`leaf_model_type` must be an integer between 0 and 3")
+        if not _check_is_int(leaf_dimension):
+            raise ValueError("`leaf_dimension` must be an integer greater than 0")
+        elif leaf_dimension <= 0:
+            raise ValueError("`leaf_dimension` must be an integer greater than 0")
+        if leaf_model_scale is None:
+            diag_value = 1.0 / num_trees
+            leaf_model_scale_array = np.zeros((leaf_dimension, leaf_dimension), float)
+            np.fill_diagonal(leaf_model_scale_array, diag_value)
+        else:
+            if isinstance(leaf_model_scale, np.ndarray):
+                if not _check_matrix_square(leaf_model_scale):
+                    raise ValueError("`leaf_model_scale` must be a square matrix if provided as a numpy array")
+                leaf_model_scale_array = leaf_model_scale
+            elif isinstance(leaf_model_scale, (int, float)):
+                if leaf_model_scale <= 0:
+                    raise ValueError("`leaf_model_scale` must be positive, if provided as scalar")
+                leaf_model_scale_array = np.zeros((leaf_dimension, leaf_dimension), float)
+                np.fill_diagonal(leaf_model_scale_array, leaf_model_scale)
+            else:
+                raise ValueError("`leaf_model_scale` must be a scalar value or a 2d numpy array with matching dimensions")
+
+        # Set internal config values
+        self.num_trees = num_trees
+        self.num_features = num_features
+        self.num_observations = num_observations
+        self.leaf_dimension = leaf_dimension
+        self.alpha = alpha
+        self.beta = beta
+        self.min_samples_leaf = min_samples_leaf
+        self.max_depth = max_depth
+        self.variance_forest_shape = variance_forest_shape
+        self.variance_forest_scale = variance_forest_scale
+        self.cutpoint_grid_size = cutpoint_grid_size
+        self.leaf_model_type = leaf_model_type
+        self.leaf_model_scale = leaf_model_scale_array
+
+    def update_feature_types(self, feature_types) -> None:
+        """
+        Update feature types
+
+        Parameters
+        ----------
+        feature_types : list or np.ndarray
+            Vector of integer-coded feature types (where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical)
+
+        Returns
+        -------
+        self
+        """
+        feature_types = _standardize_array_to_np(feature_types)
+        if self.num_features != len(feature_types):
+            raise ValueError("`feature_types` must have `num_features` total elements")
+        self.feature_types = feature_types
+
+    def update_variable_weights(self, variable_weights: Union[list, np.ndarray]) -> None:
+        """
+        Update variable weights
+
+        Parameters
+        ----------
+        variable_weights : list or np.ndarray
+            List or array specifying sampling probability for all p covariates in ForestDataset
+
+        Returns
+        -------
+        self
+        """
+        variable_weights = _standardize_array_to_np(variable_weights)
+        if self.num_features != len(variable_weights):
+            raise ValueError("`variable_weights` must have `num_features` total elements")
+        self.variable_weights = variable_weights
+
+    def update_alpha(self, alpha: float) -> None:
+        """
+        Update root node split probability in tree prior
+
+        Parameters
+        ----------
+        alpha : float
+            Root node split probability in tree prior
+
+        Returns
+        -------
+        self
+        """
+        self.alpha = alpha
+
+    def update_beta(self, beta: float) -> None:
+        """
+        Update depth prior penalty in tree prior
+
+        Parameters
+        ----------
+        beta : float
+            Depth prior penalty in tree prior
+
+        Returns
+        -------
+        self
+        """
+        self.beta = beta
+
+    def update_min_samples_leaf(self, min_samples_leaf: int) -> None:
+        """
+        Update minimum number of samples per leaf node in the tree prior
+
+        Parameters
+        ----------
+        min_samples_leaf : int
+            Minimum number of samples per leaf node in the tree prior
+
+        Returns
+        -------
+        self
+        """
+        self.min_samples_leaf = min_samples_leaf
+
+    def update_max_depth(self, max_depth: int) -> None:
+        """
+        Update max depth in the tree prior
+
+        Parameters
+        ----------
+        max_depth : int
+            Max depth in the tree prior
+
+        Returns
+        -------
+        self
+        """
+        self.max_depth = max_depth
+
+    def update_leaf_model_scale(self, leaf_model_scale: Union[float, np.ndarray]) -> None:
+        """
+        Update scale parameter used in Gaussian leaf models
+
+        Parameters
+        ----------
+        leaf_model_scale : float or np.ndarray
+            Scale parameter used in Gaussian leaf models (can either be a scalar or a q x q matrix, where q is the dimensionality of the basis and is only >1 when `leaf_model_type = 2`).
+ + Returns + ------- + self + """ + if isinstance(leaf_model_scale, np.ndarray): + if not _check_matrix_square(leaf_model_scale): + raise ValueError("`leaf_model_scale` must be a square matrix if provided as a numpy array") + leaf_model_scale_array = leaf_model_scale + elif isinstance(leaf_model_scale, (int, float)): + if leaf_model_scale <= 0: + raise ValueError("`leaf_model_scale` must be positive, if provided as scalar") + leaf_model_scale_array = np.zeros((self.leaf_dimension, self.leaf_dimension), float) + np.fill_diagonal(leaf_model_scale_array, leaf_model_scale) + else: + raise ValueError("`leaf_model_scale` must be a scalar value or a 2d numpy array with matching dimensions") + + self.leaf_model_scale = leaf_model_scale_array + + def update_variance_forest_shape(self, variance_forest_shape: float) -> None: + """ + Update shape parameter for IG leaf models + + Parameters + ---------- + variance_forest_shape : float + Shape parameter for IG leaf models + + Returns + ------- + self + """ + self.variance_forest_shape = variance_forest_shape + + def update_variance_forest_scale(self, variance_forest_scale: float) -> None: + """ + Update scale parameter for IG leaf models + + Parameters + ---------- + variance_forest_scale : float + Scale parameter for IG leaf models + + Returns + ------- + self + """ + self.variance_forest_scale = variance_forest_scale + + def update_cutpoint_grid_size(self, cutpoint_grid_size: int) -> None: + """ + Update maximum number of unique cutpoints to consider in a grow-from-root split + + Parameters + ---------- + cutpoint_grid_size : int + Maximum number of unique cutpoints to consider in a grow-from-root split + + Returns + ------- + self + """ + self.cutpoint_grid_size = cutpoint_grid_size + + def get_feature_types(self): + """ + Query feature types (integer-coded so that 0 = numeric, 1 = ordered categorical, 2 = unordered categorical) + + Returns + ------- + feature_types : np.ndarray + Array of integer-coded feature types + """ + return self.feature_types + + def get_variable_weights(self) -> np.ndarray: + """ + Query variable weights + + Returns + ------- + variable_weights : np.ndarray + Array of variable split probability weights + """ + return self.variable_weights + + def get_alpha(self) -> float: + """ + Query root node split probability in tree prior + + Returns + ------- + alpha : float + Root node split probability in tree prior + """ + return self.alpha + + def get_beta(self) -> float: + """ + Query depth prior penalty in tree prior + + Returns + ------- + beta : float + Depth prior penalty in tree prior + """ + return self.beta + + def get_min_samples_leaf(self) -> int: + """ + Query min samples in a leaf node in the tree prior + + Returns + ------- + min_samples_leaf : int + Min samples in a leaf node + """ + return self.min_samples_leaf + + def get_max_depth(self) -> int: + """ + Query max depth in the tree prior + + Returns + ------- + max_depth : int + Max depth in the tree prior + """ + return self.max_depth + + def get_leaf_model_scale(self) -> np.ndarray: + """ + Query scale parameter used in Gaussian leaf models + + Returns + ------- + leaf_model_scale : np.ndarray + Scale parameter (in array form) used in Gaussian leaf models. If the Gaussian leaf model is univariate, the array returned is a 1x1 matrix. 
+        """
+        return self.leaf_model_scale
+
+    def get_variance_forest_shape(self) -> float:
+        """
+        Query shape parameter for IG leaf models
+
+        Returns
+        -------
+        variance_forest_shape : float
+            Shape parameter for IG leaf models
+        """
+        return self.variance_forest_shape
+
+    def get_variance_forest_scale(self) -> float:
+        """
+        Query scale parameter for IG leaf models
+
+        Returns
+        -------
+        variance_forest_scale : float
+            Scale parameter for IG leaf models
+        """
+        return self.variance_forest_scale
+
+    def get_cutpoint_grid_size(self) -> int:
+        """
+        Query maximum number of unique cutpoints considered in a grow-from-root split
+
+        Returns
+        -------
+        cutpoint_grid_size : int
+            Maximum number of unique cutpoints considered in a grow-from-root split
+        """
+        return self.cutpoint_grid_size
diff --git a/stochtree/utils.py b/stochtree/utils.py
index 27062655..5c83e950 100644
--- a/stochtree/utils.py
+++ b/stochtree/utils.py
@@ -1,3 +1,6 @@
+from typing import Union
+import numpy as np
+
 class NotSampledError(ValueError, AttributeError):
     """Exception class to raise if attempting to predict from a model before it has been sampled.
 
@@ -7,3 +10,166 @@ class NotSampledError(ValueError, AttributeError):
     Renamed from scikit-learn's "NotFittedError"
     https://github.com/scikit-learn/scikit-learn/blob/8721245511de2f225ff5f9aa5f5fadce663cd4a3/sklearn/exceptions.py#L45C7-L45C21
     """
+
+def _standardize_array_to_list(input: Union[list, np.ndarray]) -> list:
+    """
+    Standardize an array (either a python list or numpy array) to a python list
+
+    Parameters
+    ----------
+    input : list or np.array
+        Array to be standardized
+
+    Returns
+    -------
+    list
+        Input array, standardized into a simple python list
+    """
+    if isinstance(input, list):
+        return input
+    elif isinstance(input, np.ndarray):
+        if input.ndim > 1:
+            if np.squeeze(input).ndim > 1:
+                raise ValueError("`input` is not a one-dimensional numpy array, cannot be flattened into a python list")
+            return np.squeeze(input).tolist()
+        else:
+            return input.tolist()
+    else:
+        return ValueError("`input` must be either a list or numpy array")
+
+def _standardize_array_to_np(input: Union[list, np.ndarray]) -> np.ndarray:
+    """
+    Standardize an array (either a python list or numpy array) to a 1d numpy array
+
+    Parameters
+    ----------
+    input : list or np.array
+        Array to be standardized
+
+    Returns
+    -------
+    np.array
+        Input array, standardized into a 1d numpy array
+    """
+    if isinstance(input, list):
+        return np.array(input)
+    elif isinstance(input, np.ndarray):
+        if input.ndim > 1:
+            if np.squeeze(input).ndim > 1:
+                raise ValueError("`input` is not a one-dimensional numpy array, cannot be flattened into a 1d numpy array")
+            return np.squeeze(input)
+        else:
+            return input
+    else:
+        return ValueError("`input` must be either a list or numpy array")
+
+def _check_is_int(input: Union[int, float]) -> bool:
+    """
+    Checks whether a scalar input is or is convertible to an integer
+
+    Parameters
+    ----------
+    input : int or float
+        Input to be checked for integer status
+
+    Returns
+    -------
+    bool
+        True if integer, False otherwise
+    """
+    if not isinstance(input, (int, float)):
+        return False
+    elif isinstance(input, float):
+        return int(input) == input
+    else:
+        return True
+
+def _check_is_numeric(input: Union[int, float]) -> bool:
+    """
+    Checks whether a scalar input is numeric
+
+    Parameters
+    ----------
+    input : int or float
+        Input to be checked for numeric status
+
+    Returns
+    -------
+    bool
+        True if numeric, False otherwise
+    """
+    if not isinstance(input, (int, float)):
+        return False
+    else:
+        return True
+
+def _check_array_numeric(input: Union[list, np.ndarray]) -> bool:
+    """
+    Checks whether an array is populated with numeric values
+
+    Parameters
+    ----------
+    input : list or np.ndarray
+        Input array to be checked for numeric values
+
+    Returns
+    -------
+    bool
+        True if the array is all numeric values, False otherwise
+    """
+    if isinstance(input, list):
+        return all([isinstance(item, (int, float)) for item in input])
+    elif isinstance(input, np.ndarray):
+        return np.issubdtype(input.dtype, np.number)
+    else:
+        raise ValueError("input must be a list or numpy array")
+
+def _check_array_integer(input: Union[list, np.ndarray]) -> bool:
+    """
+    Checks whether an array is populated with integer values
+
+    Parameters
+    ----------
+    input : list or np.ndarray
+        Input array to be checked for integer values
+
+    Returns
+    -------
+    bool
+        True if the array is all integer values, False otherwise
+    """
+    if isinstance(input, list):
+        return all([isinstance(item, (int)) for item in input])
+    elif isinstance(input, np.ndarray):
+        return np.issubdtype(input.dtype, np.integer)
+    else:
+        raise ValueError("input must be a list or numpy array")
+
+def _check_matrix_square(input: np.ndarray) -> bool:
+    """
+    Checks whether a numpy array is a 2d square matrix
+
+    Parameters
+    ----------
+    input : np.ndarray
+        Input array to be checked
+
+    Returns
+    -------
+    bool
+        True if the array is a square matrix, False otherwise
+    """
+    if isinstance(input, np.ndarray):
+        if input.ndim == 2:
+            nrow, ncol = input.shape
+            return nrow == ncol
+        elif input.ndim > 2:
+            if np.squeeze(input).ndim == 2:
+                nrow, ncol = np.squeeze(input).shape
+                return nrow == ncol
+            else:
+                return False
+        else:
+            return False
+    else:
+        return False

From 4f4e5bf640b817b2c70791afa1b58403a6f320c9 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 18 Feb 2025 00:50:10 -0600
Subject: [PATCH 09/35] Formatted new python code with ruff

---
 stochtree/config.py | 158 ++++++++++++++++++++++++++++----------------
 stochtree/utils.py  |  17 ++++-
 2 files changed, 115 insertions(+), 60 deletions(-)

diff --git a/stochtree/config.py b/stochtree/config.py
index 61a25c96..eaecfee5 100644
--- a/stochtree/config.py
+++ b/stochtree/config.py
@@ -1,16 +1,17 @@
-from typing import Union
-import warnings
+import warnings
+from typing import Union
 
 import numpy as np
 
-from .utils import _standardize_array_to_np, _check_is_int, _check_matrix_square
+from .utils import _check_is_int, _check_matrix_square, _standardize_array_to_np
+
 
 class ForestModelConfig:
     """
     Object used to get / set parameters and other model configuration options for a forest model in the "low-level" stochtree interface.
-    
-    The "low-level" stochtree interface enables a high degree of sampler customization, in which users employ Python wrappers around 
-    C++ objects like `ForestDataset`, `Outcome`, `CppRng`, and `ForestModel` to run the Gibbs sampler of a BART model with custom modifications. 
+
+    The "low-level" stochtree interface enables a high degree of sampler customization, in which users employ Python wrappers around
+    C++ objects like `ForestDataset`, `Outcome`, `CppRng`, and `ForestModel` to run the Gibbs sampler of a BART model with custom modifications.
     `ForestModelConfig` allows users to specify / query the parameters of a forest model they wish to run.
     Parameters
@@ -47,16 +48,31 @@ class ForestModelConfig:
         Number of unique cutpoints to consider (default: `100`)
     """
 
-    def __init__(self, num_trees = None, num_features = None, num_observations = None,
-                 feature_types = None, variable_weights = None, leaf_dimension = 1,
-                 alpha = 0.95, beta = 2.0, min_samples_leaf = 5, max_depth = -1,
-                 leaf_model_type = 1, leaf_model_scale = None, variance_forest_shape = 1.0,
-                 variance_forest_scale = 1.0, cutpoint_grid_size = 100) -> None:
+    def __init__(
+        self,
+        num_trees=None,
+        num_features=None,
+        num_observations=None,
+        feature_types=None,
+        variable_weights=None,
+        leaf_dimension=1,
+        alpha=0.95,
+        beta=2.0,
+        min_samples_leaf=5,
+        max_depth=-1,
+        leaf_model_type=1,
+        leaf_model_scale=None,
+        variance_forest_shape=1.0,
+        variance_forest_scale=1.0,
+        cutpoint_grid_size=100,
+    ) -> None:
         # Preprocess inputs and run some error checks
         if feature_types is None:
             if num_features is None:
-                raise ValueError("Neither of `num_features` nor `feature_types` (a vector from which `num_features` can be inferred) was provided.",
-                                 "Please provide at least one of these inputs when creating a `ForestModelConfig` object.")
+                raise ValueError(
+                    "Neither of `num_features` nor `feature_types` (a vector from which `num_features` can be inferred) was provided.",
+                    "Please provide at least one of these inputs when creating a `ForestModelConfig` object.",
+                )
             warnings.warn("`feature_types` not provided, will be assumed to be numeric")
             self.feature_types = np.repeat(0, num_features)
         else:
@@ -64,7 +80,9 @@ def __init__(self, num_trees = None, num_features = None, num_observations = Non
         if num_features is None:
             num_features = len(self.feature_types)
         if variable_weights is None:
-            warnings.warn("`variable_weights` not provided, will be assumed to be equal-weighted")
+            warnings.warn(
+                "`variable_weights` not provided, will be assumed to be equal-weighted"
+            )
             self.variable_weights = np.repeat(1.0 / num_features, num_features)
         else:
             self.variable_weights = _standardize_array_to_np(variable_weights)
@@ -75,7 +93,9 @@ def __init__(self, num_trees = None, num_features = None, num_observations = Non
         if num_features != len(self.feature_types):
             raise ValueError("`feature_types` must have `num_features` total elements")
         if num_features != len(self.variable_weights):
-            raise ValueError("`variable_weights` must have `num_features` total elements")
+            raise ValueError(
+                "`variable_weights` must have `num_features` total elements"
+            )
         if not _check_is_int(leaf_model_type):
             raise ValueError("`leaf_model_type` must be an integer between 0 and 3")
         elif leaf_model_type < 0 or leaf_model_type > 3:
@@ -91,16 +111,24 @@ def __init__(self, num_trees = None, num_features = None, num_observations = Non
             if isinstance(leaf_model_scale, np.ndarray):
                 if not _check_matrix_square(leaf_model_scale):
-                    raise ValueError("`leaf_model_scale` must be a square matrix if provided as a numpy array")
+                    raise ValueError(
+                        "`leaf_model_scale` must be a square matrix if provided as a numpy array"
+                    )
                 leaf_model_scale_array = leaf_model_scale
             elif isinstance(leaf_model_scale, (int, float)):
                 if leaf_model_scale <= 0:
-                    raise ValueError("`leaf_model_scale` must be positive, if provided as scalar")
-                leaf_model_scale_array = np.zeros((leaf_dimension, leaf_dimension), float)
+                    raise ValueError(
+                        "`leaf_model_scale` must be positive, if provided as scalar"
+                    )
+                leaf_model_scale_array = np.zeros(
+                    (leaf_dimension, leaf_dimension), float
+                )
                 np.fill_diagonal(leaf_model_scale_array, leaf_model_scale)
             else:
-                raise ValueError("`leaf_model_scale` must be a scalar value or a 2d numpy array with matching dimensions")
-
+                raise ValueError(
+                    "`leaf_model_scale` must be a scalar value or a 2d numpy array with matching dimensions"
+                )
+
         # Set internal config values
         self.num_trees = num_trees
@@ -115,7 +143,7 @@ def __init__(self, num_trees = None, num_features = None, num_observations = Non
         self.cutpoint_grid_size = cutpoint_grid_size
         self.leaf_model_type = leaf_model_type
         self.leaf_model_scale = leaf_model_scale_array
-    
+
     def update_feature_types(self, feature_types) -> None:
         """
         Update feature types
@@ -124,7 +152,7 @@ def update_feature_types(self, feature_types) -> None:
         ----------
         feature_types : list or np.ndarray
             Vector of integer-coded feature types (where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical)
-    
+
         Returns
         -------
         self
@@ -133,8 +161,10 @@ def update_feature_types(self, feature_types) -> None:
         if self.num_features != len(feature_types):
             raise ValueError("`feature_types` must have `num_features` total elements")
         self.feature_types = feature_types
-    
-    def update_variable_weights(self, variable_weights: Union[list, np.ndarray]) -> None:
+
+    def update_variable_weights(
+        self, variable_weights: Union[list, np.ndarray]
+    ) -> None:
         """
         Update variable weights
 
@@ -142,16 +172,18 @@ def update_variable_weights(self, variable_weights: Union[list, np.ndarray]) ->
         ----------
         variable_weights : list or np.ndarray
             List or array specifying sampling probability for all p covariates in ForestDataset
-    
+
         Returns
         -------
         self
         """
         variable_weights = _standardize_array_to_np(variable_weights)
         if self.num_features != len(variable_weights):
-            raise ValueError("`variable_weights` must have `num_features` total elements")
+            raise ValueError(
+                "`variable_weights` must have `num_features` total elements"
+            )
         self.variable_weights = variable_weights
-    
+
     def update_alpha(self, alpha: float) -> None:
         """
         Update root node split probability in tree prior
@@ -160,13 +192,13 @@ def update_alpha(self, alpha: float) -> None:
         alpha : float
             Root node split probability in tree prior
-    
+
         Returns
         -------
         self
         """
         self.alpha = alpha
-    
+
     def update_beta(self, beta: float) -> None:
         """
         Update depth prior penalty in tree prior
@@ -175,13 +207,13 @@ def update_beta(self, beta: float) -> None:
         beta : float
             Depth prior penalty in tree prior
-    
+
         Returns
         -------
         self
         """
         self.beta = beta
-    
+
     def update_min_samples_leaf(self, min_samples_leaf: int) -> None:
         """
         Update minimum number of samples per leaf node in the tree prior
@@ -190,13 +222,13 @@ def update_min_samples_leaf(self, min_samples_leaf: int) -> None:
         min_samples_leaf : int
             Minimum number of samples per leaf node in the tree prior
-    
+
         Returns
         -------
         self
         """
         self.min_samples_leaf = min_samples_leaf
-    
+
     def update_max_depth(self, max_depth: int) -> None:
         """
         Update max depth in the tree prior
@@ -205,14 +237,16 @@ def update_max_depth(self, max_depth: int) -> None:
         max_depth : int
             Max depth in the tree prior
-    
+
         Returns
         -------
         self
         """
         self.max_depth = max_depth
-    
-    def update_leaf_model_scale(self, leaf_model_scale: Union[float, np.ndarray]) -> None:
+
+    def update_leaf_model_scale(
+        self, leaf_model_scale: Union[float, np.ndarray]
+    ) -> None:
         """
         Update scale parameter used in Gaussian leaf models
 
@@ -220,25 +254,33 @@ def update_leaf_model_scale(self, leaf_model_scale: Union[float, np.ndarray]) ->
         ----------
         leaf_model_scale : float or np.ndarray
             Scale parameter used in Gaussian leaf models (can either be a scalar or a q x q matrix, where q is the dimensionality of the basis and is only >1 when `leaf_model_type = 2`).
-    
+
         Returns
         -------
         self
         """
         if isinstance(leaf_model_scale, np.ndarray):
             if not _check_matrix_square(leaf_model_scale):
-                raise ValueError("`leaf_model_scale` must be a square matrix if provided as a numpy array")
+                raise ValueError(
+                    "`leaf_model_scale` must be a square matrix if provided as a numpy array"
+                )
             leaf_model_scale_array = leaf_model_scale
         elif isinstance(leaf_model_scale, (int, float)):
             if leaf_model_scale <= 0:
-                raise ValueError("`leaf_model_scale` must be positive, if provided as scalar")
-            leaf_model_scale_array = np.zeros((self.leaf_dimension, self.leaf_dimension), float)
+                raise ValueError(
+                    "`leaf_model_scale` must be positive, if provided as scalar"
+                )
+            leaf_model_scale_array = np.zeros(
+                (self.leaf_dimension, self.leaf_dimension), float
+            )
             np.fill_diagonal(leaf_model_scale_array, leaf_model_scale)
         else:
-            raise ValueError("`leaf_model_scale` must be a scalar value or a 2d numpy array with matching dimensions")
-    
+            raise ValueError(
+                "`leaf_model_scale` must be a scalar value or a 2d numpy array with matching dimensions"
+            )
+
         self.leaf_model_scale = leaf_model_scale_array
-    
+
     def update_variance_forest_shape(self, variance_forest_shape: float) -> None:
         """
         Update shape parameter for IG leaf models
@@ -247,13 +289,13 @@ def update_variance_forest_shape(self, variance_forest_shape: float) -> None:
         variance_forest_shape : float
             Shape parameter for IG leaf models
-    
+
         Returns
         -------
         self
         """
         self.variance_forest_shape = variance_forest_shape
-    
+
     def update_variance_forest_scale(self, variance_forest_scale: float) -> None:
         """
         Update scale parameter for IG leaf models
@@ -262,13 +304,13 @@ def update_variance_forest_scale(self, variance_forest_scale: float) -> None:
         variance_forest_scale : float
             Scale parameter for IG leaf models
-    
+
         Returns
         -------
         self
         """
         self.variance_forest_scale = variance_forest_scale
-    
+
     def update_cutpoint_grid_size(self, cutpoint_grid_size: int) -> None:
         """
         Update maximum number of unique cutpoints to consider in a grow-from-root split
@@ -277,13 +319,13 @@ def update_cutpoint_grid_size(self, cutpoint_grid_size: int) -> None:
         cutpoint_grid_size : int
             Maximum number of unique cutpoints to consider in a grow-from-root split
-    
+
         Returns
         -------
         self
         """
         self.cutpoint_grid_size = cutpoint_grid_size
-    
+
     def get_feature_types(self):
         """
         Query feature types (integer-coded so that 0 = numeric, 1 = ordered categorical, 2 = unordered categorical)
@@ -294,7 +336,7 @@ def get_feature_types(self):
             Array of integer-coded feature types
         """
         return self.feature_types
-    
+
     def get_variable_weights(self) -> np.ndarray:
         """
         Query variable weights
@@ -305,7 +347,7 @@ def get_variable_weights(self) -> np.ndarray:
             Array of variable split probability weights
         """
         return self.variable_weights
-    
+
     def get_alpha(self) -> float:
         """
         Query root node split probability in tree prior
@@ -316,7 +358,7 @@ def get_alpha(self) -> float:
             Root node split probability in tree prior
         """
         return self.alpha
-    
+
     def get_beta(self) -> float:
         """
         Query depth prior penalty in tree prior
@@ -327,7 +369,7 @@ def get_beta(self) -> float:
             Depth prior penalty in tree prior
         """
         return self.beta
-    
+
     def get_min_samples_leaf(self) -> int:
         """
         Query min samples in a leaf node in the tree prior
@@ -338,7 +380,7 @@ def get_min_samples_leaf(self) -> int:
             Min samples in a leaf node
         """
         return self.min_samples_leaf
-    
+
     def get_max_depth(self) -> int:
         """
         Query max depth in the tree prior
@@ -349,7 +391,7 @@ def get_max_depth(self) -> int:
             Max depth in the tree prior
         """
         return self.max_depth
-    
+
     def get_leaf_model_scale(self) -> np.ndarray:
         """
         Query scale parameter used in Gaussian leaf models
@@ -360,7 +402,7 @@ def get_leaf_model_scale(self) -> np.ndarray:
             Scale parameter (in array form) used in Gaussian leaf models. If the Gaussian leaf model is univariate, the array returned is a 1x1 matrix.
         """
         return self.leaf_model_scale
-    
+
     def get_variance_forest_shape(self) -> float:
         """
         Query shape parameter for IG leaf models
@@ -371,18 +413,18 @@ def get_variance_forest_shape(self) -> float:
             Shape parameter for IG leaf models
         """
         return self.variance_forest_shape
-    
+
     def get_variance_forest_scale(self) -> float:
         """
         Query scale parameter for IG leaf models
-    
+
         Returns
         -------
         variance_forest_scale : float
             Scale parameter for IG leaf models
         """
         return self.variance_forest_scale
-    
+
     def get_cutpoint_grid_size(self) -> int:
         """
         Query maximum number of unique cutpoints considered in a grow-from-root split
diff --git a/stochtree/utils.py b/stochtree/utils.py
index 5c83e950..44d41372 100644
--- a/stochtree/utils.py
+++ b/stochtree/utils.py
@@ -1,6 +1,8 @@
 from typing import Union
+
 import numpy as np
 
+
 class NotSampledError(ValueError, AttributeError):
     """Exception class to raise if attempting to predict from a model before it has been sampled.
 
@@ -11,6 +13,7 @@ class NotSampledError(ValueError, AttributeError):
     https://github.com/scikit-learn/scikit-learn/blob/8721245511de2f225ff5f9aa5f5fadce663cd4a3/sklearn/exceptions.py#L45C7-L45C21
     """
 
+
 def _standardize_array_to_list(input: Union[list, np.ndarray]) -> list:
     """
     Standardize an array (either a python list or numpy array) to a python list
@@ -30,13 +33,16 @@ def _standardize_array_to_list(input: Union[list, np.ndarray]) -> list:
     elif isinstance(input, np.ndarray):
         if input.ndim > 1:
             if np.squeeze(input).ndim > 1:
-                raise ValueError("`input` is not a one-dimensional numpy array, cannot be flattened into a python list")
+                raise ValueError(
+                    "`input` is not a one-dimensional numpy array, cannot be flattened into a python list"
+                )
             return np.squeeze(input).tolist()
         else:
             return input.tolist()
     else:
         return ValueError("`input` must be either a list or numpy array")
 
+
 def _standardize_array_to_np(input: Union[list, np.ndarray]) -> np.ndarray:
     """
     Standardize an array (either a python list or numpy array) to a 1d numpy array
@@ -56,13 +62,16 @@ def _standardize_array_to_np(input: Union[list, np.ndarray]) -> np.ndarray:
     elif isinstance(input, np.ndarray):
         if input.ndim > 1:
             if np.squeeze(input).ndim > 1:
-                raise ValueError("`input` is not a one-dimensional numpy array, cannot be flattened into a 1d numpy array")
+                raise ValueError(
+                    "`input` is not a one-dimensional numpy array, cannot be flattened into a 1d numpy array"
+                )
             return np.squeeze(input)
         else:
             return input
     else:
         return ValueError("`input` must be either a list or numpy array")
 
+
 def _check_is_int(input: Union[int, float]) -> bool:
@@ -124,6 +135,7 @@ def _check_array_numeric(input: Union[list, np.ndarray]) -> bool:
     else:
         raise ValueError("input must be a list or numpy array")
 
+
 def _check_array_integer(input: Union[list, np.ndarray]) -> bool:
@@ -145,6 +157,7 @@ def _check_array_integer(input: Union[list, np.ndarray]) -> bool:
     else:
         raise ValueError("input must be a list or numpy array")
 
+
 def _check_matrix_square(input: np.ndarray) -> bool:

From c6f51f63f0c6f2ae52c7df0abcf94e09bb89194c Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 18 Feb 2025 00:56:31 -0600
Subject: [PATCH 10/35] Added global variance parameter config

---
 stochtree/config.py | 65 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/stochtree/config.py b/stochtree/config.py
index eaecfee5..635aa1d3 100644
--- a/stochtree/config.py
+++ b/stochtree/config.py
@@ -3,7 +3,12 @@
 
 import numpy as np
 
-from .utils import _check_is_int, _check_matrix_square, _standardize_array_to_np
+from .utils import (
+    _check_is_int,
+    _check_is_numeric,
+    _check_matrix_square,
+    _standardize_array_to_np,
+)
 
 
 class ForestModelConfig:
@@ -435,3 +440,61 @@ def get_cutpoint_grid_size(self) -> int:
         Maximum number of unique cutpoints considered in a grow-from-root split
     """
     return self.cutpoint_grid_size
+
+
+class GlobalModelConfig:
+    """
+    Object used to get / set global parameters and other global model configuration options in the "low-level" stochtree interface
+
+    The "low-level" stochtree interface enables a high degree of sampler customization, in which users employ Python wrappers around C++ objects
+    like ForestDataset, Outcome, CppRng, and ForestModel to run the Gibbs sampler of a BART model with custom modifications.
+    GlobalModelConfig allows users to specify / query the global parameters of a model they wish to run.
+ + Parameters + ---------- + global_error_variance : float, optional + Global error variance parameter (default: `1.0`) + """ + + def __init__( + self, + global_error_variance=1.0, + ) -> None: + # Preprocess inputs and run some error checks + if not _check_is_numeric(global_error_variance): + raise ValueError("`global_error_variance` must be a positive scalar") + elif global_error_variance <= 0: + raise ValueError("`global_error_variance` must be a positive scalar") + + # Set internal config values + self.global_error_variance = global_error_variance + + def update_global_error_variance(self, global_error_variance) -> None: + """ + Update global error variance parameter + + Parameters + ---------- + global_error_variance : float + Global error variance parameter + + Returns + ------- + self + """ + if not _check_is_numeric(global_error_variance): + raise ValueError("`global_error_variance` must be a positive scalar") + elif global_error_variance <= 0: + raise ValueError("`global_error_variance` must be a positive scalar") + self.global_error_variance = global_error_variance + + def get_global_error_variance(self) -> float: + """ + Query the global error variance parameter + + Returns + ------- + global_error_variance : float + Global error variance parameter + """ + return self.global_error_variance From c73a10f5229084a9decff58ac065500140d1be40 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 19 Feb 2025 18:39:37 -0600 Subject: [PATCH 11/35] Add utility unit tests --- stochtree/__init__.py | 18 +++++- stochtree/utils.py | 8 +-- test/python/test_utils.py | 116 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 5 deletions(-) create mode 100644 test/python/test_utils.py diff --git a/stochtree/__init__.py b/stochtree/__init__.py index 6e2de29a..c93585bb 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -6,7 +6,16 @@ from .preprocessing import CovariatePreprocessor from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer -from .utils import NotSampledError +from .utils import ( + NotSampledError, + _check_array_integer, + _check_array_numeric, + _check_is_int, + _check_is_numeric, + _check_matrix_square, + _standardize_array_to_list, + _standardize_array_to_np, +) __all__ = [ "BARTModel", @@ -22,5 +31,12 @@ "LeafVarianceModel", "JSONSerializer", "NotSampledError", + "_check_array_integer", + "_check_array_numeric", + "_check_is_int", + "_check_is_numeric", + "_check_matrix_square", + "_standardize_array_to_list", + "_standardize_array_to_np", "calibrate_global_error_variance", ] diff --git a/stochtree/utils.py b/stochtree/utils.py index 44d41372..356415bf 100644 --- a/stochtree/utils.py +++ b/stochtree/utils.py @@ -40,7 +40,7 @@ def _standardize_array_to_list(input: Union[list, np.ndarray]) -> list: else: return input.tolist() else: - return ValueError("`input` must be either a list or numpy array") + raise ValueError("`input` must be either a list or numpy array") def _standardize_array_to_np(input: Union[list, np.ndarray]) -> np.ndarray: @@ -69,7 +69,7 @@ def _standardize_array_to_np(input: Union[list, np.ndarray]) -> np.ndarray: else: return input else: - return ValueError("`input` must be either a list or numpy array") + raise ValueError("`input` must be either a list or numpy array") def _check_is_int(input: Union[int, float]) -> bool: @@ -133,7 +133,7 @@ def _check_array_numeric(input: Union[list, np.ndarray]) -> bool: elif isinstance(input, np.ndarray): return 
np.issubdtype(input.dtype, np.number) else: - raise ValueError("input must be a list or numpy array") + return False def _check_array_integer(input: Union[list, np.ndarray]) -> bool: @@ -155,7 +155,7 @@ def _check_array_integer(input: Union[list, np.ndarray]) -> bool: elif isinstance(input, np.ndarray): return np.issubdtype(input.dtype, np.integer) else: - raise ValueError("input must be a list or numpy array") + return False def _check_matrix_square(input: np.ndarray) -> bool: diff --git a/test/python/test_utils.py b/test/python/test_utils.py new file mode 100644 index 00000000..4744a21e --- /dev/null +++ b/test/python/test_utils.py @@ -0,0 +1,116 @@ +import numpy as np +import pytest + +from stochtree.utils import ( + _check_array_integer, + _check_array_numeric, + _check_is_int, + _check_is_numeric, + _check_matrix_square, + _standardize_array_to_list, + _standardize_array_to_np, +) + + +class TestUtils: + def test_check_array(self): + # Test data + array_list1 = [1, 2, 3, 4, 5] + array_list2 = [1.5, 2.3, 3.5, 4.1, 5.3] + array_np1 = np.array([1, 2, 3, 4, 5]) + array_np2 = np.array([1.5, 2.3, 3.5, 4.1, 5.3]) + not_array = dict({"a": 1, "b": 2}) + + # Integer checks + assert _check_array_integer(array_list1) + assert not _check_array_integer(array_list2) + assert _check_array_integer(array_np1) + assert not _check_array_integer(array_np2) + assert not _check_array_integer(not_array) + + # Numeric checks + assert _check_array_numeric(array_list1) + assert _check_array_numeric(array_list2) + assert _check_array_numeric(array_np1) + assert _check_array_numeric(array_np2) + assert not _check_array_numeric(not_array) + + def test_check_scalar(self): + # Test data + int_py1 = 1 + int_py2 = 100000000 + float_py1 = 1.5 + float_py2 = 1000000000001.5 + not_scalar = "a" + + # Integer checks + assert _check_is_int(int_py1) + assert _check_is_int(int_py2) + assert not _check_is_int(float_py1) + assert not _check_is_int(float_py2) + assert not _check_is_int(not_scalar) + + # Numeric checks + assert _check_is_numeric(int_py1) + assert _check_is_numeric(int_py2) + assert _check_is_numeric(float_py1) + assert _check_is_numeric(float_py2) + assert not _check_is_numeric(not_scalar) + + def test_check_matrix(self): + # Test data + array_11 = np.array([[1.6]]) + array_22 = np.array([[1.6, 5.6], [2.3, 4.5]]) + array_33 = np.array([[1.6, 5.6, 3.4], [2.3, 4.5, 7.2], [2.7, 6.1, 3.0]]) + array_23 = np.array([[1.6, 5.6, 3.4], [2.3, 4.5, 7.2]]) + array_32 = np.array([[1.6, 5.6], [2.3, 4.5], [2.7, 6.1]]) + non_array_1 = 100000000 + non_array_2 = "a" + non_array_3 = [[1, 2], [3, 4]] + + # Array checks + assert _check_matrix_square(array_11) + assert _check_matrix_square(array_22) + assert _check_matrix_square(array_33) + assert not _check_matrix_square(array_23) + assert not _check_matrix_square(array_32) + assert not _check_matrix_square(non_array_1) + assert not _check_matrix_square(non_array_2) + assert not _check_matrix_square(non_array_3) + + def test_standardize(self): + # Test data + array_py1 = [1.6, 3.4, 7.6, 8.7] + array_py2 = [8.2, 4.5, 3.8] + array_np1 = np.array([1.6, 3.4, 7.6, 8.7]) + array_np2 = np.array([[1.6, 3.4, 7.6, 8.7]]) + array_np3 = np.array([8.2, 4.5, 3.8]) + array_np4 = np.array([[8.2, 4.5, 3.8]]) + nonconforming_array_np1 = np.array([[8.2, 4.5, 3.8], [1.6, 3.4, 7.6]]) + nonconforming_array_np2 = np.array( + [[8.2, 4.5, 3.8], [1.6, 3.4, 7.6], [1.6, 3.4, 7.6]] + ) + non_array_1 = 100000000 + non_array_2 = "a" + + # List standardization checks + np.testing.assert_array_equal(array_py1, 
_standardize_array_to_list(array_np1))
+        np.testing.assert_array_equal(array_py1, _standardize_array_to_list(array_np2))
+        np.testing.assert_array_equal(array_py2, _standardize_array_to_list(array_np3))
+        np.testing.assert_array_equal(array_py2, _standardize_array_to_list(array_np4))
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_list(non_array_1)
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_list(non_array_2)
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_list(nonconforming_array_np1)
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_list(nonconforming_array_np2)
+
+        # Numpy standardization checks
+        np.testing.assert_array_equal(array_np1, _standardize_array_to_np(array_py1))
+        np.testing.assert_array_equal(array_np1, _standardize_array_to_np(array_np2))
+        np.testing.assert_array_equal(array_np3, _standardize_array_to_np(array_py2))
+        np.testing.assert_array_equal(array_np3, _standardize_array_to_np(array_np4))
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_np(non_array_1)
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_np(non_array_2)
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_np(nonconforming_array_np1)
+        with pytest.raises(ValueError):
+            _ = _standardize_array_to_np(nonconforming_array_np2)

From 3111b4c91fcaaff9e1d0c2af1d9c7c6cfc0d1381 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Wed, 19 Feb 2025 18:45:54 -0600
Subject: [PATCH 12/35] Formatted test code

---
 test/python/test_bart.py         | 316 ++++++++++++---------
 test/python/test_bcf.py          | 459 ++++++++++++++++++-------------
 test/python/test_calibration.py  |  44 +--
 test/python/test_json.py         | 232 +++++++++++-----
 test/python/test_predict.py      |  13 +-
 test/python/test_preprocessor.py | 139 ++++++----
 test/python/test_residual.py     |  70 +++--
 7 files changed, 772 insertions(+), 501 deletions(-)

diff --git a/test/python/test_bart.py b/test/python/test_bart.py
index e49931ab..bd08fd05 100644
--- a/test/python/test_bart.py
+++ b/test/python/test_bart.py
@@ -1,8 +1,9 @@
-import pytest
 import numpy as np
 from sklearn.model_selection import train_test_split
+
 from stochtree import BARTModel
 
+
 class TestBART:
     def test_bart_constant_leaf_homoskedastic(self):
         # RNG
         random_seed = 101
         rng = np.random.default_rng(random_seed)
 
         # Generate covariates and basis
         n = 1000
         p_X = 10
         X = rng.uniform(0, 1, (n, p_X))
 
         # Define the outcome mean function
         def outcome_mean(X):
             return np.where(
-                (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5, 
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                -7.5,
                 np.where(
-                    (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5, 
-                    np.where(
-                        (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5, 
-                        7.5
-                    )
-                )
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    -2.5,
+                    np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5, 7.5),
+                ),
             )
 
         # Generate outcome
         epsilon = rng.normal(0, 1, n)
         y = outcome_mean(X) + epsilon
 
         # Test-train split
         sample_inds = np.arange(n)
         train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)
-        X_train = X[train_inds,:]
-        X_test = X[test_inds,:]
+        X_train = X[train_inds, :]
+        X_test = X[test_inds, :]
         y_train = y[train_inds]
-        y_test = y[test_inds]
+        # y_test = y[test_inds]
         n_train = X_train.shape[0]
         n_test = X_test.shape[0]
 
         num_gfr = 10
         num_burnin = 0
         num_mcmc = 10
-        
+
         # Run BART with test set
         bart_model = BARTModel()
-        bart_model.sample(X_train=X_train, y_train=y_train, X_test=X_test, 
-                          num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc)
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            num_gfr=num_gfr,
+            num_burnin=num_burnin,
+            num_mcmc=num_mcmc,
+        )
 
         # Assertions
-        assert (bart_model.y_hat_train.shape == (n_train, num_mcmc))
-        assert (bart_model.y_hat_test.shape == (n_test, num_mcmc))
-    
+        assert bart_model.y_hat_train.shape == (n_train, num_mcmc)
+        assert bart_model.y_hat_test.shape == (n_test, num_mcmc)
+
     def test_bart_univariate_leaf_regression_homoskedastic(self):
         # RNG
         random_seed = 101
         rng = np.random.default_rng(random_seed)
 
         # Generate covariates and basis
         n = 1000
         p_X = 10
         p_W = 1
         X = rng.uniform(0, 1, (n, p_X))
         W = rng.uniform(0, 1, (n, p_W))
 
         # Define the outcome mean function
         def outcome_mean(X, W):
             return np.where(
-                (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0],
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                -7.5 * W[:, 0],
                 np.where(
-                    (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0],
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    -2.5 * W[:, 0],
                     np.where(
-                        (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0],
-                        7.5 * W[:,0]
-                    )
-                )
+                        (X[:, 0] >= 0.5) & (X[:, 0] < 0.75),
+                        2.5 * W[:, 0],
+                        7.5 * W[:, 0],
+                    ),
+                ),
             )
 
         # Generate outcome
         epsilon = rng.normal(0, 1, n)
         y = outcome_mean(X, W) + epsilon
 
         # Test-train split
         sample_inds = np.arange(n)
         train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)
-        X_train = X[train_inds,:]
-        X_test = X[test_inds,:]
-        basis_train = W[train_inds,:]
-        basis_test = W[test_inds,:]
+        X_train = X[train_inds, :]
+        X_test = X[test_inds, :]
+        basis_train = W[train_inds, :]
+        basis_test = W[test_inds, :]
         y_train = y[train_inds]
-        y_test = y[test_inds]
+        # y_test = y[test_inds]
         n_train = X_train.shape[0]
         n_test = X_test.shape[0]
 
         num_gfr = 10
         num_burnin = 0
         num_mcmc = 10
-        
+
         # Run BART with test set
         bart_model = BARTModel()
-        bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train,
-                          X_test=X_test, basis_test=basis_test, num_gfr=num_gfr,
-                          num_burnin=num_burnin, num_mcmc=num_mcmc)
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            basis_train=basis_train,
+            X_test=X_test,
+            basis_test=basis_test,
+            num_gfr=num_gfr,
+            num_burnin=num_burnin,
+            num_mcmc=num_mcmc,
+        )
 
         # Assertions
-        assert (bart_model.y_hat_train.shape == (n_train, num_mcmc))
-        assert (bart_model.y_hat_test.shape == (n_test, num_mcmc))
-    
+        assert bart_model.y_hat_train.shape == (n_train, num_mcmc)
+        assert bart_model.y_hat_test.shape == (n_test, num_mcmc)
+
     def test_bart_multivariate_leaf_regression_homoskedastic(self):
         # RNG
         random_seed = 101
         rng = np.random.default_rng(random_seed)
 
         # Generate covariates and basis
         n = 1000
         p_X = 10
         p_W = 5
         X = rng.uniform(0, 1, (n, p_X))
         W = rng.uniform(0, 1, (n, p_W))
 
         # Define the outcome mean function
         def outcome_mean(X, W):
             return np.where(
-                (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0],
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                -7.5 * W[:, 0],
                 np.where(
-                    (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0],
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    -2.5 * W[:, 0],
                     np.where(
-                        (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0],
-                        7.5 * W[:,0]
-                    )
-                )
+                        (X[:, 0] >= 0.5) & (X[:, 0] < 0.75),
+                        2.5 * W[:, 0],
+                        7.5 * W[:, 0],
+                    ),
+                ),
             )
 
         # Generate outcome
         epsilon = rng.normal(0, 1, n)
         y = outcome_mean(X, W) + epsilon
 
         # Test-train split
         sample_inds = np.arange(n)
         train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)
-        X_train = X[train_inds,:]
-        X_test = X[test_inds,:]
-        basis_train = W[train_inds,:]
-        basis_test = W[test_inds,:]
+        X_train = X[train_inds, :]
+        X_test = X[test_inds, :]
+        basis_train = W[train_inds, :]
+        basis_test = W[test_inds, :]
         y_train = y[train_inds]
-        y_test = y[test_inds]
+        # y_test = y[test_inds]
         n_train = X_train.shape[0]
         n_test = X_test.shape[0]
 
         num_gfr = 10
         num_burnin = 0
         num_mcmc = 10
-        
+
         # Run BART with test set
         bart_model = BARTModel()
-        bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train,
-                          X_test=X_test, basis_test=basis_test, num_gfr=num_gfr,
-                          num_burnin=num_burnin, num_mcmc=num_mcmc)
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            basis_train=basis_train,
+            X_test=X_test,
+            basis_test=basis_test,
+            num_gfr=num_gfr,
+            num_burnin=num_burnin,
+            num_mcmc=num_mcmc,
+        )
 
         # Assertions
-        assert (bart_model.y_hat_train.shape == (n_train, num_mcmc))
-        assert (bart_model.y_hat_test.shape == (n_test, num_mcmc))
-    
+        assert bart_model.y_hat_train.shape == (n_train, num_mcmc)
+        assert bart_model.y_hat_test.shape == (n_test, num_mcmc)
+
     def test_bart_constant_leaf_heteroskedastic(self):
         # RNG
         random_seed = 101
         rng = np.random.default_rng(random_seed)
 
         # Generate covariates and basis
         n = 1000
         p_X = 10
         X = rng.uniform(0, 1, (n, p_X))
 
         # Define the outcome mean function
         def outcome_mean(X):
             return np.where(
-                (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5, 
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                -7.5,
                 np.where(
-                    (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5, 
-                    np.where(
-                        (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5, 
-                        7.5
-                    )
-                )
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    -2.5,
+                    np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5, 7.5),
+                ),
             )
 
         # Define the conditional standard deviation function
         def conditional_stddev(X):
             return np.where(
-                (X[:,0] >= 0.0) & (X[:,0] < 0.25), 0.25, 
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                0.25,
                 np.where(
-                    (X[:,0] >= 0.25) & (X[:,0] < 0.5), 0.5, 
-                    np.where(
-                        (X[:,0] >= 0.5) & (X[:,0] < 0.75), 1, 
-                        2
-                    )
-                )
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    0.5,
+                    np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 1, 2),
+                ),
             )
 
         # Generate outcome
         epsilon = rng.normal(0, 1, n)
         y = outcome_mean(X) + epsilon * conditional_stddev(X)
 
         # Test-train split
         sample_inds = np.arange(n)
         train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)
-        X_train = X[train_inds,:]
-        X_test = X[test_inds,:]
+        X_train = X[train_inds, :]
+        X_test = X[test_inds, :]
         y_train = y[train_inds]
-        y_test = y[test_inds]
         n_train = X_train.shape[0]
         n_test = X_test.shape[0]
 
         num_gfr = 10
         num_burnin = 0
         num_mcmc = 10
-        
+
         # Run BART with test set
         bart_model = BARTModel()
-        general_params = {'sample_sigma2_global': True}
-        variance_forest_params = {'num_trees': 50}
-        bart_model.sample(X_train=X_train, y_train=y_train, X_test=X_test, general_params=general_params,
-                          variance_forest_params=variance_forest_params, num_gfr=num_gfr,
-                          num_burnin=num_burnin, num_mcmc=num_mcmc)
+        general_params = {"sample_sigma2_global": True}
+        variance_forest_params = {"num_trees": 50}
+        bart_model.sample(
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            general_params=general_params,
+            variance_forest_params=variance_forest_params,
+            num_gfr=num_gfr,
+            num_burnin=num_burnin,
+            num_mcmc=num_mcmc,
+        )
 
         # Assertions
-        assert (bart_model.y_hat_train.shape == (n_train, num_mcmc))
-        assert (bart_model.y_hat_test.shape == (n_test, num_mcmc))
-    
+        assert bart_model.y_hat_train.shape == (n_train, num_mcmc)
+        assert bart_model.y_hat_test.shape == (n_test, num_mcmc)
+
     def test_bart_univariate_leaf_regression_heteroskedastic(self):
         # RNG
         random_seed = 101
         rng = np.random.default_rng(random_seed)
 
         # Generate covariates and basis
         n = 1000
         p_X = 10
         p_W = 1
         X = rng.uniform(0, 1, (n, p_X))
         W = rng.uniform(0, 1, (n, p_W))
 
         # Define the outcome mean function
         def outcome_mean(X, W):
             return np.where(
-                (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0],
+                (X[:, 0] >= 0.0) & (X[:, 0] < 0.25),
+                -7.5 * W[:, 0],
                 np.where(
-                    (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0],
+                    (X[:, 0] >= 0.25) & (X[:, 0] < 0.5),
+                    -2.5 * W[:, 0],
                     np.where(
-                        (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0],
-                        7.5 * W[:,0]
-                    )
-                )
+                        (X[:, 0] >= 0.5) & (X[:, 0] < 0.75),
+                        2.5 * W[:, 0],
+                        7.5 * W[:, 0],
+                    ),
+                ),
             )
 
         # Define the conditional 
standard deviation function def conditional_stddev(X): return np.where( - (X[:,0] >= 0.0) & (X[:,0] < 0.25), 0.25, + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + 0.25, np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), 0.5, - np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 1, - 2 - ) - ) + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + 0.5, + np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 1, 2), + ), ) # Generate outcome @@ -279,12 +311,11 @@ def conditional_stddev(X): # Test-train split sample_inds = np.arange(n) train_inds, test_inds = train_test_split(sample_inds, test_size=0.5) - X_train = X[train_inds,:] - X_test = X[test_inds,:] - basis_train = W[train_inds,:] - basis_test = W[test_inds,:] + X_train = X[train_inds, :] + X_test = X[test_inds, :] + basis_train = W[train_inds, :] + basis_test = W[test_inds, :] y_train = y[train_inds] - y_test = y[test_inds] n_train = X_train.shape[0] n_test = X_test.shape[0] @@ -292,20 +323,28 @@ def conditional_stddev(X): num_gfr = 10 num_burnin = 0 num_mcmc = 10 - + # Run BCF with test set and propensity score bart_model = BARTModel() - general_params = {'sample_sigma2_global': True} - variance_forest_params = {'num_trees': 50} - bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train, - X_test=X_test, basis_test=basis_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, general_params=general_params, - variance_forest_params=variance_forest_params) + general_params = {"sample_sigma2_global": True} + variance_forest_params = {"num_trees": 50} + bart_model.sample( + X_train=X_train, + y_train=y_train, + basis_train=basis_train, + X_test=X_test, + basis_test=basis_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + general_params=general_params, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bart_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bart_model.y_hat_test.shape == (n_test, num_mcmc)) - + assert bart_model.y_hat_train.shape == (n_train, num_mcmc) + assert bart_model.y_hat_test.shape == (n_test, num_mcmc) + def test_bart_multivariate_leaf_regression_heteroskedastic(self): # RNG random_seed = 101 @@ -321,27 +360,29 @@ def test_bart_multivariate_leaf_regression_heteroskedastic(self): # Define the outcome mean function def outcome_mean(X, W): return np.where( - (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + -7.5 * W[:, 0], np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + -2.5 * W[:, 0], np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], - 7.5 * W[:,0] - ) - ) + (X[:, 0] >= 0.5) & (X[:, 0] < 0.75), + 2.5 * W[:, 0], + 7.5 * W[:, 0], + ), + ), ) # Define the conditional standard deviation function def conditional_stddev(X): return np.where( - (X[:,0] >= 0.0) & (X[:,0] < 0.25), 0.25, + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + 0.25, np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), 0.5, - np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 1, - 2 - ) - ) + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + 0.5, + np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 1, 2), + ), ) # Generate outcome @@ -351,12 +392,11 @@ def conditional_stddev(X): # Test-train split sample_inds = np.arange(n) train_inds, test_inds = train_test_split(sample_inds, test_size=0.5) - X_train = X[train_inds,:] - X_test = X[test_inds,:] - basis_train = W[train_inds,:] - basis_test = W[test_inds,:] + X_train = X[train_inds, :] + X_test = X[test_inds, :] + basis_train = W[train_inds, :] + basis_test = W[test_inds, :] y_train = 
y[train_inds] - y_test = y[test_inds] n_train = X_train.shape[0] n_test = X_test.shape[0] @@ -364,16 +404,24 @@ def conditional_stddev(X): num_gfr = 10 num_burnin = 0 num_mcmc = 10 - + # Run BCF with test set and propensity score bart_model = BARTModel() - general_params = {'sample_sigma2_global': True} - variance_forest_params = {'num_trees': 50} - bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train, - X_test=X_test, basis_test=basis_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, general_params=general_params, - variance_forest_params=variance_forest_params) + general_params = {"sample_sigma2_global": True} + variance_forest_params = {"num_trees": 50} + bart_model.sample( + X_train=X_train, + y_train=y_train, + basis_train=basis_train, + X_test=X_test, + basis_test=basis_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + general_params=general_params, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bart_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bart_model.y_hat_test.shape == (n_test, num_mcmc)) + assert bart_model.y_hat_train.shape == (n_train, num_mcmc) + assert bart_model.y_hat_test.shape == (n_test, num_mcmc) diff --git a/test/python/test_bcf.py b/test/python/test_bcf.py index 6b558753..dc6fe162 100644 --- a/test/python/test_bcf.py +++ b/test/python/test_bcf.py @@ -1,8 +1,10 @@ -import pytest import numpy as np +import pytest from sklearn.model_selection import train_test_split + from stochtree import BCFModel + class TestBCF: def test_binary_bcf(self): # RNG @@ -13,32 +15,27 @@ def test_binary_bcf(self): n = 100 p_X = 5 X = rng.uniform(0, 1, (n, p_X)) - pi_X = 0.25 + 0.5*X[:,0] + pi_X = 0.25 + 0.5 * X[:, 0] Z = rng.binomial(1, pi_X, n).astype(float) # Define the outcome mean functions (prognostic and treatment effects) - mu_X = pi_X*5 - tau_X = X[:,1]*2 + mu_X = pi_X * 5 + tau_X = X[:, 1] * 2 # Generate outcome epsilon = rng.normal(0, 1, n) - y = mu_X + tau_X*Z + epsilon + y = mu_X + tau_X * Z + epsilon # Test-train split sample_inds = np.arange(n) train_inds, test_inds = train_test_split(sample_inds, test_size=0.5) - X_train = X[train_inds,:] - X_test = X[test_inds,:] + X_train = X[train_inds, :] + X_test = X[test_inds, :] Z_train = Z[train_inds] Z_test = Z[test_inds] y_train = y[train_inds] - y_test = y[test_inds] pi_train = pi_X[train_inds] pi_test = pi_X[test_inds] - mu_train = mu_X[train_inds] - mu_test = mu_X[test_inds] - tau_train = tau_X[train_inds] - tau_test = tau_X[test_inds] n_train = X_train.shape[0] n_test = X_test.shape[0] @@ -46,104 +43,134 @@ def test_binary_bcf(self): num_gfr = 10 num_burnin = 0 num_mcmc = 10 - + # Run BCF with test set and propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - X_test=X_test, Z_test=Z_test, pi_test=pi_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + pi_train=pi_train, + X_test=X_test, + Z_test=Z_test, + pi_test=pi_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, 
num_mcmc)) - assert (bcf_model.y_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.mu_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.tau_hat_test.shape == (n_test, num_mcmc)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.y_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.mu_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.tau_hat_test.shape == (n_test, num_mcmc) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) + assert tau_hat.shape == (n_test, num_mcmc) # Run BCF without test set and with propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + pi_train=pi_train, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) + assert tau_hat.shape == (n_test, num_mcmc) # Run BCF with test set and without propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + X_test=X_test, + Z_test=Z_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10)) - assert 
(bcf_model.y_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.mu_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.tau_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.bart_propensity_model.y_hat_test.shape == (n_test, 10)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10) + assert bcf_model.y_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.mu_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.tau_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.bart_propensity_model.y_hat_test.shape == (n_test, 10) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test) - assert (tau_hat.shape == (n_test, num_mcmc)) + assert tau_hat.shape == (n_test, num_mcmc) # Run BCF without test set and without propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test) - + def test_continuous_univariate_bcf(self): # RNG random_seed = 101 @@ -153,32 +180,27 @@ def test_continuous_univariate_bcf(self): n = 100 p_X = 5 X = rng.uniform(0, 1, (n, p_X)) - pi_X = 0.25 + 0.5*X[:,0] + pi_X = 0.25 + 0.5 * X[:, 0] Z = pi_X + rng.normal(0, 1, n) # Define the outcome mean functions (prognostic and treatment effects) - mu_X = pi_X*5 - tau_X = X[:,1]*2 + mu_X = pi_X * 5 + tau_X = X[:, 1] * 2 # Generate outcome epsilon = rng.normal(0, 1, n) - y = mu_X + tau_X*Z + epsilon + y = mu_X + tau_X * Z + epsilon # Test-train split sample_inds = np.arange(n) train_inds, test_inds = train_test_split(sample_inds, test_size=0.5) - X_train = X[train_inds,:] - X_test = X[test_inds,:] + 
X_train = X[train_inds, :] + X_test = X[test_inds, :] Z_train = Z[train_inds] Z_test = Z[test_inds] y_train = y[train_inds] - y_test = y[test_inds] pi_train = pi_X[train_inds] pi_test = pi_X[test_inds] - mu_train = mu_X[train_inds] - mu_test = mu_X[test_inds] - tau_train = tau_X[train_inds] - tau_test = tau_X[test_inds] n_train = X_train.shape[0] n_test = X_test.shape[0] @@ -186,105 +208,134 @@ def test_continuous_univariate_bcf(self): num_gfr = 10 num_burnin = 0 num_mcmc = 10 - + # Run BCF with test set and propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - X_test=X_test, Z_test=Z_test, pi_test=pi_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + pi_train=pi_train, + X_test=X_test, + Z_test=Z_test, + pi_test=pi_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.y_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.mu_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.tau_hat_test.shape == (n_test, num_mcmc)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.y_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.mu_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.tau_hat_test.shape == (n_test, num_mcmc) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) + assert tau_hat.shape == (n_test, num_mcmc) # Run BCF without test set and with propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + pi_train=pi_train, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test, pi_test) - assert 
(tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc)) + assert tau_hat.shape == (n_test, num_mcmc) # Run BCF with test set and without propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + X_test=X_test, + Z_test=Z_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10)) - assert (bcf_model.y_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.mu_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.tau_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.bart_propensity_model.y_hat_test.shape == (n_test, 10)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10) + assert bcf_model.y_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.mu_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.tau_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.bart_propensity_model.y_hat_test.shape == (n_test, 10) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test) - assert (tau_hat.shape == (n_test, num_mcmc)) + assert tau_hat.shape == (n_test, num_mcmc) # Run BCF without test set and without propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert 
bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.bart_propensity_model.y_hat_train.shape == (n_train, 10) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test) - assert (tau_hat.shape == (n_test, num_mcmc)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test) - + def test_multivariate_bcf(self): # RNG random_seed = 101 @@ -294,34 +345,29 @@ def test_multivariate_bcf(self): n = 100 p_X = 5 X = rng.uniform(0, 1, (n, p_X)) - pi_X = np.c_[0.25 + 0.5*X[:,0], 0.5 - 0.25*X[:,1]] - Z = pi_X + rng.normal(0, 1, (n,2)) + pi_X = np.c_[0.25 + 0.5 * X[:, 0], 0.5 - 0.25 * X[:, 1]] + Z = pi_X + rng.normal(0, 1, (n, 2)) treatment_dim = Z.shape[1] # Define the outcome mean functions (prognostic and treatment effects) - mu_X = pi_X[:,0]*5 - tau_X = np.c_[X[:,1]*2,-0.5*X[:,2]] + mu_X = pi_X[:, 0] * 5 + tau_X = np.c_[X[:, 1] * 2, -0.5 * X[:, 2]] # Generate outcome epsilon = rng.normal(0, 1, n) - treatment_term = (tau_X*Z).sum(axis=1) + treatment_term = (tau_X * Z).sum(axis=1) y = mu_X + treatment_term + epsilon # Test-train split sample_inds = np.arange(n) train_inds, test_inds = train_test_split(sample_inds, test_size=0.5) - X_train = X[train_inds,:] - X_test = X[test_inds,:] + X_train = X[train_inds, :] + X_test = X[test_inds, :] Z_train = Z[train_inds] Z_test = Z[test_inds] y_train = y[train_inds] - y_test = y[test_inds] pi_train = pi_X[train_inds] pi_test = pi_X[test_inds] - mu_train = mu_X[train_inds] - mu_test = mu_X[test_inds] - tau_train = tau_X[train_inds] - tau_test = tau_X[test_inds] n_train = X_train.shape[0] n_test = X_test.shape[0] @@ -329,68 +375,97 @@ def test_multivariate_bcf(self): num_gfr = 10 num_burnin = 0 num_mcmc = 10 - + # Run BCF with test set and propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - X_test=X_test, Z_test=Z_test, pi_test=pi_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + pi_train=pi_train, + X_test=X_test, + Z_test=Z_test, + pi_test=pi_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc, treatment_dim)) - assert (bcf_model.y_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.mu_hat_test.shape == (n_test, num_mcmc)) - assert (bcf_model.tau_hat_test.shape == (n_test, num_mcmc, treatment_dim)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc, treatment_dim) + assert bcf_model.y_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.mu_hat_test.shape == (n_test, num_mcmc) + assert bcf_model.tau_hat_test.shape == (n_test, num_mcmc, treatment_dim) # Check overall prediction method 
tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc, treatment_dim)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc, treatment_dim) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc, treatment_dim)) + assert tau_hat.shape == (n_test, num_mcmc, treatment_dim) # Run BCF without test set and with propensity score bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + pi_train=pi_train, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Assertions - assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.mu_hat_train.shape == (n_train, num_mcmc)) - assert (bcf_model.tau_hat_train.shape == (n_train, num_mcmc, treatment_dim)) + assert bcf_model.y_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.mu_hat_train.shape == (n_train, num_mcmc) + assert bcf_model.tau_hat_train.shape == (n_train, num_mcmc, treatment_dim) # Check overall prediction method tau_hat, mu_hat, y_hat = bcf_model.predict(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc, treatment_dim)) - assert (mu_hat.shape == (n_test, num_mcmc)) - assert (y_hat.shape == (n_test, num_mcmc)) - + assert tau_hat.shape == (n_test, num_mcmc, treatment_dim) + assert mu_hat.shape == (n_test, num_mcmc) + assert y_hat.shape == (n_test, num_mcmc) + # Check treatment effect prediction method tau_hat = bcf_model.predict_tau(X_test, Z_test, pi_test) - assert (tau_hat.shape == (n_test, num_mcmc, treatment_dim)) + assert tau_hat.shape == (n_test, num_mcmc, treatment_dim) # Run BCF with test set and without propensity score with pytest.raises(ValueError): bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + X_test=X_test, + Z_test=Z_test, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) # Run BCF without test set and without propensity score with pytest.raises(ValueError): bcf_model = BCFModel() - variance_forest_params = {'num_trees': 0} - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, - variance_forest_params=variance_forest_params) + variance_forest_params = {"num_trees": 0} + bcf_model.sample( + X_train=X_train, + Z_train=Z_train, + y_train=y_train, + num_gfr=num_gfr, + num_burnin=num_burnin, + num_mcmc=num_mcmc, + variance_forest_params=variance_forest_params, + ) diff --git a/test/python/test_calibration.py b/test/python/test_calibration.py index 0cc437a8..4f994cd5 100644 --- 
a/test/python/test_calibration.py +++ b/test/python/test_calibration.py @@ -1,10 +1,11 @@ import numpy as np -import pandas as pd +import pytest +from scipy.stats import gamma from sklearn import linear_model from sklearn.metrics import mean_squared_error -from scipy.stats import gamma + from stochtree import calibrate_global_error_variance -import pytest + class TestCalibration: def test_full_rank(self): @@ -12,45 +13,50 @@ def test_full_rank(self): p = 5 nu = 3 q = 0.9 - X = np.random.uniform(size=(n,p)) - y = 1 + X[:,0]*0.1 - X[:,1]*0.2 + np.random.normal(size=n) + X = np.random.uniform(size=(n, p)) + y = 1 + X[:, 0] * 0.1 - X[:, 1] * 0.2 + np.random.normal(size=n) y_std = (y - np.mean(y)) / np.std(y) reg_model = linear_model.LinearRegression() reg_model.fit(X, y_std) mse = mean_squared_error(y_std, reg_model.predict(X)) - lamb = calibrate_global_error_variance(X = X, y = y, nu = nu, q = q, standardize = True) - assert lamb == pytest.approx((mse*gamma.ppf(1-q,nu))/nu) + lamb = calibrate_global_error_variance(X=X, y=y, nu=nu, q=q, standardize=True) + assert lamb == pytest.approx((mse * gamma.ppf(1 - q, nu)) / nu) def test_rank_deficient(self): n = 100 p = 5 nu = 3 q = 0.9 - X = np.random.uniform(size=(n,p)) - X[:,4] = X[:,2] - y = 1 + X[:,0]*0.1 - X[:,1]*0.2 + np.random.normal(size=n) + X = np.random.uniform(size=(n, p)) + X[:, 4] = X[:, 2] + y = 1 + X[:, 0] * 0.1 - X[:, 1] * 0.2 + np.random.normal(size=n) y_std = (y - np.mean(y)) / np.std(y) reg_model = linear_model.LinearRegression() reg_model.fit(X, y_std) mse = mean_squared_error(y_std, reg_model.predict(X)) if reg_model.rank_ < p: - with pytest.warns(UserWarning): - lamb = calibrate_global_error_variance(X = X, y = y, nu = nu, q = q, standardize = True) + with pytest.warns(UserWarning): + lamb = calibrate_global_error_variance( + X=X, y=y, nu=nu, q=q, standardize=True + ) else: - lamb = calibrate_global_error_variance(X = X, y = y, nu = nu, q = q, standardize = True) - assert lamb == pytest.approx((mse*gamma.ppf(1-q,nu))/nu) + lamb = calibrate_global_error_variance( + X=X, y=y, nu=nu, q=q, standardize=True + ) + assert lamb == pytest.approx((mse * gamma.ppf(1 - q, nu)) / nu) def test_overdetermined(self): n = 100 p = 101 nu = 3 q = 0.9 - X = np.random.uniform(size=(n,p)) - y = 1 + X[:,0]*0.1 - X[:,1]*0.2 + np.random.normal(size=n) + X = np.random.uniform(size=(n, p)) + y = 1 + X[:, 0] * 0.1 - X[:, 1] * 0.2 + np.random.normal(size=n) y_std = (y - np.mean(y)) / np.std(y) reg_model = linear_model.LinearRegression() reg_model.fit(X, y_std) - mse = mean_squared_error(y_std, reg_model.predict(X)) with pytest.warns(UserWarning): - lamb = calibrate_global_error_variance(X = X, y = y, nu = nu, q = q, standardize = True) - assert lamb == pytest.approx(np.var(y)*(gamma.ppf(1-q,nu))/nu) + lamb = calibrate_global_error_variance( + X=X, y=y, nu=nu, q=q, standardize=True + ) + assert lamb == pytest.approx(np.var(y) * (gamma.ppf(1 - q, nu)) / nu) diff --git a/test/python/test_json.py b/test/python/test_json.py index 4d8d903c..4eba7ee6 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -1,10 +1,21 @@ import numpy as np import pandas as pd + from stochtree import ( - BARTModel, BCFModel, JSONSerializer, ForestContainer, Forest, Dataset, Residual, - RNG, ForestSampler, ForestContainer, GlobalVarianceModel, CovariatePreprocessor + RNG, + BARTModel, + BCFModel, + CovariatePreprocessor, + Dataset, + Forest, + ForestContainer, + ForestSampler, + GlobalVarianceModel, + JSONSerializer, + Residual, ) + class TestJson: def 
test_value(self): json_test = JSONSerializer() @@ -20,8 +31,8 @@ def test_value(self): def test_array(self): json_test = JSONSerializer() - a = np.array([1.5,2.4,3.3]) - b = ["a","b","c"] + a = np.array([1.5, 2.4, 3.3]) + b = ["a", "b", "c"] json_test.add_numeric_vector("a", a) json_test.add_string_vector("b", b) np.testing.assert_array_equal(a, json_test.get_numeric_vector("a")) @@ -29,9 +40,15 @@ def test_array(self): def test_preprocessor(self): df = pd.DataFrame( - {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], - "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']), - "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + { + "x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical( + ["a", "b", "c", "a", "b", "c"], + ordered=False, + categories=["c", "b", "a"], + ), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4], + } ) cov_transformer = CovariatePreprocessor() df_transformed_orig = cov_transformer.fit_transform(df) @@ -42,11 +59,25 @@ def test_preprocessor(self): np.testing.assert_array_equal(df_transformed_orig, df_transformed_reloaded) df_2 = pd.DataFrame( - {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], - "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']), - "x3": pd.Categorical(['a', 'c', 'd', 'b', 'd', 'b'], ordered=False, categories=['c', 'b', 'a', 'd']), - "x4": pd.Categorical(['a', 'b', 'f', 'f', 'c', 'a'], ordered=True, categories=['c', 'b', 'a', 'f']), - "x5": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + { + "x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical( + ["a", "b", "c", "a", "b", "c"], + ordered=False, + categories=["c", "b", "a"], + ), + "x3": pd.Categorical( + ["a", "c", "d", "b", "d", "b"], + ordered=False, + categories=["c", "b", "a", "d"], + ), + "x4": pd.Categorical( + ["a", "b", "f", "f", "c", "a"], + ordered=True, + categories=["c", "b", "a", "f"], + ), + "x5": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4], + } ) cov_transformer_2 = CovariatePreprocessor() df_transformed_orig_2 = cov_transformer_2.fit_transform(df_2) @@ -73,19 +104,19 @@ def test_forest(self): rng = np.random.default_rng(random_seed) n = 1000 p_X = 10 - p_W = 1 X = rng.uniform(0, 1, (n, p_X)) + def outcome_mean(X): return np.where( - (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5, + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + -7.5, np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5, - np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5, - 7.5 - ) - ) + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + -2.5, + np.where((X[:, 0] >= 0.5) & (X[:, 0] < 0.75), 2.5, 7.5), + ), ) + epsilon = rng.normal(0, 1, n) y = outcome_mean(X) + epsilon @@ -108,11 +139,17 @@ def outcome_mean(X): forest_dataset = Dataset() forest_dataset.add_covariates(X) forest_preds_json_reload = forest_container.predict(forest_dataset) - forest_preds_json_reload = forest_preds_json_reload*bart_model.y_std + bart_model.y_bar + forest_preds_json_reload = ( + forest_preds_json_reload * bart_model.y_std + bart_model.y_bar + ) # Check the predictions - np.testing.assert_almost_equal(forest_preds_y_mcmc_cached, forest_preds_json_reload) - np.testing.assert_almost_equal(forest_preds_y_mcmc_retrieved, forest_preds_json_reload) - + np.testing.assert_almost_equal( + forest_preds_y_mcmc_cached, forest_preds_json_reload + ) + np.testing.assert_almost_equal( + forest_preds_y_mcmc_retrieved, forest_preds_json_reload + ) + def test_forest_string(self): # RNG random_seed = 1234 @@ -128,14 +165,17 @@ def test_forest_string(self): # Define the outcome mean function def outcome_mean(X, W): return np.where( - (X[:,0] >= 0.0) & 
(X[:,0] < 0.25), -7.5 * W[:,0], + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + -7.5 * W[:, 0], np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + -2.5 * W[:, 0], np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], - 7.5 * W[:,0] - ) - ) + (X[:, 0] >= 0.5) & (X[:, 0] < 0.75), + 2.5 * W[:, 0], + 7.5 * W[:, 0], + ), + ), ) # Generate outcome @@ -145,7 +185,7 @@ def outcome_mean(X, W): # Standardize outcome y_bar = np.mean(y) y_std = np.std(y) - resid = (y-y_bar)/y_std + resid = (y - y_bar) / y_std # Sampler parameters alpha = 0.9 @@ -153,17 +193,14 @@ def outcome_mean(X, W): min_samples_leaf = 1 num_trees = 100 cutpoint_grid_size = 100 - global_variance_init = 1. + global_variance_init = 1.0 tau_init = 0.5 - leaf_prior_scale = np.array([[tau_init]], order='C') - a_global = 4. - b_global = 2. - a_leaf = 2. - b_leaf = 0.5 - leaf_regression = True - feature_types = np.repeat(0, p_X).astype(int) # 0 = numeric - var_weights = np.repeat(1/p_X, p_X) - + leaf_prior_scale = np.array([[tau_init]], order="C") + a_global = 4.0 + b_global = 2.0 + feature_types = np.repeat(0, p_X).astype(int) # 0 = numeric + var_weights = np.repeat(1 / p_X, p_X) + # Dataset (covariates and basis) dataset = Dataset() dataset.add_covariates(X) @@ -175,7 +212,9 @@ def outcome_mean(X, W): # Forest samplers and temporary tracking data structures forest_container = ForestContainer(num_trees, W.shape[1], False, False) active_forest = Forest(num_trees, W.shape[1], False, False) - forest_sampler = ForestSampler(dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf) + forest_sampler = ForestSampler( + dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf + ) cpp_rng = RNG(random_seed) global_var_model = GlobalVarianceModel() @@ -183,18 +222,58 @@ def outcome_mean(X, W): num_warmstart = 10 num_mcmc = 100 num_samples = num_warmstart + num_mcmc - global_var_samples = np.concatenate((np.array([global_variance_init]), np.repeat(0, num_samples))) + global_var_samples = np.concatenate( + (np.array([global_variance_init]), np.repeat(0, num_samples)) + ) # Run "grow-from-root" sampler for i in range(num_warmstart): - forest_sampler.sample_one_iteration(forest_container, active_forest, dataset, residual, cpp_rng, feature_types, cutpoint_grid_size, leaf_prior_scale, var_weights, 1., 1., global_var_samples[i], 1, True, True, False) - global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global) - + forest_sampler.sample_one_iteration( + forest_container, + active_forest, + dataset, + residual, + cpp_rng, + feature_types, + cutpoint_grid_size, + leaf_prior_scale, + var_weights, + 1.0, + 1.0, + global_var_samples[i], + 1, + True, + True, + False, + ) + global_var_samples[i + 1] = global_var_model.sample_one_iteration( + residual, cpp_rng, a_global, b_global + ) + # Run MCMC sampler for i in range(num_warmstart, num_samples): - forest_sampler.sample_one_iteration(forest_container, active_forest, dataset, residual, cpp_rng, feature_types, cutpoint_grid_size, leaf_prior_scale, var_weights, 1., 1., global_var_samples[i], 1, True, False, False) - global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global) - + forest_sampler.sample_one_iteration( + forest_container, + active_forest, + dataset, + residual, + cpp_rng, + feature_types, + cutpoint_grid_size, + leaf_prior_scale, + var_weights, + 1.0, + 1.0, + global_var_samples[i], + 1, + True, + False, + False, + ) + global_var_samples[i + 1] 
= global_var_model.sample_one_iteration( + residual, cpp_rng, a_global, b_global + ) + # Extract predictions from the sampler y_hat_orig = forest_container.predict(dataset) @@ -204,7 +283,7 @@ def outcome_mean(X, W): forest_container_reloaded.load_from_json_string(forest_json_string) y_hat_reloaded = forest_container_reloaded.predict(dataset) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) - + def test_bart_string(self): # RNG random_seed = 1234 @@ -220,14 +299,17 @@ def test_bart_string(self): # Define the outcome mean function def outcome_mean(X, W): return np.where( - (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + -7.5 * W[:, 0], np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + -2.5 * W[:, 0], np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], - 7.5 * W[:,0] - ) - ) + (X[:, 0] >= 0.5) & (X[:, 0] < 0.75), + 2.5 * W[:, 0], + 7.5 * W[:, 0], + ), + ), ) # Generate outcome @@ -237,7 +319,7 @@ def outcome_mean(X, W): # Run BART bart_orig = BARTModel() bart_orig.sample(X_train=X, y_train=y, basis_train=W, num_gfr=10, num_mcmc=10) - + # Extract predictions from the sampler y_hat_orig = bart_orig.predict(X, W) @@ -247,7 +329,7 @@ def outcome_mean(X, W): bart_reloaded.from_json(bart_json_string) y_hat_reloaded = bart_reloaded.predict(X, W) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) - + def test_bcf_string(self): # RNG random_seed = 1234 @@ -257,21 +339,23 @@ def test_bcf_string(self): n = 100 p_X = 5 X = rng.uniform(0, 1, (n, p_X)) - pi_X = 0.25 + 0.5*X[:,0] + pi_X = 0.25 + 0.5 * X[:, 0] Z = rng.binomial(1, pi_X, n).astype(float) # Define the outcome mean functions (prognostic and treatment effects) - mu_X = pi_X*5 - tau_X = X[:,1]*2 + mu_X = pi_X * 5 + tau_X = X[:, 1] * 2 # Generate outcome epsilon = rng.normal(0, 1, n) - y = mu_X + tau_X*Z + epsilon + y = mu_X + tau_X * Z + epsilon # Run BCF bcf_orig = BCFModel() - bcf_orig.sample(X_train=X, Z_train=Z, y_train=y, pi_train=pi_X, num_gfr=10, num_mcmc=10) - + bcf_orig.sample( + X_train=X, Z_train=Z, y_train=y, pi_train=pi_X, num_gfr=10, num_mcmc=10 + ) + # Extract predictions from the sampler mu_hat_orig, tau_hat_orig, y_hat_orig = bcf_orig.predict(X, Z, pi_X) @@ -279,11 +363,13 @@ def test_bcf_string(self): bcf_json_string = bcf_orig.to_json() bcf_reloaded = BCFModel() bcf_reloaded.from_json(bcf_json_string) - mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict(X, Z, pi_X) + mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict( + X, Z, pi_X + ) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded) - + def test_bcf_propensity_string(self): # RNG random_seed = 1234 @@ -293,21 +379,21 @@ def test_bcf_propensity_string(self): n = 100 p_X = 5 X = rng.uniform(0, 1, (n, p_X)) - pi_X = 0.25 + 0.5*X[:,0] + pi_X = 0.25 + 0.5 * X[:, 0] Z = rng.binomial(1, pi_X, n).astype(float) # Define the outcome mean functions (prognostic and treatment effects) - mu_X = pi_X*5 - tau_X = X[:,1]*2 + mu_X = pi_X * 5 + tau_X = X[:, 1] * 2 # Generate outcome epsilon = rng.normal(0, 1, n) - y = mu_X + tau_X*Z + epsilon + y = mu_X + tau_X * Z + epsilon # Run BCF without passing propensity scores (so an internal propensity model must be constructed) bcf_orig = BCFModel() bcf_orig.sample(X_train=X, Z_train=Z, y_train=y, num_gfr=10, num_mcmc=10) - + # Extract predictions from the 
sampler mu_hat_orig, tau_hat_orig, y_hat_orig = bcf_orig.predict(X, Z, pi_X) @@ -315,7 +401,9 @@ def test_bcf_propensity_string(self): bcf_json_string = bcf_orig.to_json() bcf_reloaded = BCFModel() bcf_reloaded.from_json(bcf_json_string) - mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict(X, Z, pi_X) + mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict( + X, Z, pi_X + ) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded) diff --git a/test/python/test_predict.py b/test/python/test_predict.py index 77505915..d180fd67 100644 --- a/test/python/test_predict.py +++ b/test/python/test_predict.py @@ -1,5 +1,7 @@ import numpy as np -from stochtree import ForestContainer, Dataset, Residual, ForestSampler, RNG + +from stochtree import Dataset, ForestContainer + class TestPredict: def test_constant_leaf_prediction(self): @@ -12,8 +14,7 @@ def test_constant_leaf_prediction(self): [5.3, 9.3, 3.6], [6.1, 10.4, 4.4]] ) - n = X.shape[0] - p = X.shape[1] + n, p = X.shape num_trees = 10 output_dim = 1 forest_dataset = Dataset() @@ -75,8 +76,7 @@ def test_univariate_regression_leaf_prediction(self): [1], [1]] ) - n = X.shape[0] - p = X.shape[1] + n, p = X.shape num_trees = 10 output_dim = 1 forest_dataset = Dataset() @@ -141,8 +141,7 @@ def test_multivariate_regression_leaf_prediction(self): [1, 1], [1, 1]] ) - n = X.shape[0] - p = X.shape[1] + n, p = X.shape num_trees = 10 output_dim = 2 num_samples = 0 diff --git a/test/python/test_preprocessor.py b/test/python/test_preprocessor.py index 87e338e7..f40ef204 100644 --- a/test/python/test_preprocessor.py +++ b/test/python/test_preprocessor.py @@ -1,93 +1,134 @@ import numpy as np import pandas as pd + from stochtree import CovariatePreprocessor + class TestPreprocessor: def test_numpy(self): cov_transformer = CovariatePreprocessor() np_1 = np.array( - [[1.5, 8.7, 1.2], - [2.7, 3.4, 5.4], - [3.6, 1.2, 9.3], - [4.4, 5.4, 10.4], - [5.3, 9.3, 3.6], - [6.1, 10.4, 4.4]] + [ + [1.5, 8.7, 1.2], + [2.7, 3.4, 5.4], + [3.6, 1.2, 9.3], + [4.4, 5.4, 10.4], + [5.3, 9.3, 3.6], + [6.1, 10.4, 4.4], + ] ) np_1_transformed = cov_transformer.fit_transform(np_1) np.testing.assert_array_equal(np_1, np_1_transformed) - np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0])) + np.testing.assert_array_equal( + cov_transformer._processed_feature_types, np.array([0, 0, 0]) + ) def test_pandas(self): df_1 = pd.DataFrame( - {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], - "x2": [8.7, 3.4, 1.2, 5.4, 9.3, 10.4], - "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + { + "x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": [8.7, 3.4, 1.2, 5.4, 9.3, 10.4], + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4], + } ) np_1 = np.array( - [[1.5, 8.7, 1.2], - [2.7, 3.4, 5.4], - [3.6, 1.2, 9.3], - [4.4, 5.4, 10.4], - [5.3, 9.3, 3.6], - [6.1, 10.4, 4.4]] + [ + [1.5, 8.7, 1.2], + [2.7, 3.4, 5.4], + [3.6, 1.2, 9.3], + [4.4, 5.4, 10.4], + [5.3, 9.3, 3.6], + [6.1, 10.4, 4.4], + ] ) cov_transformer = CovariatePreprocessor() df_1_transformed = cov_transformer.fit_transform(df_1) np.testing.assert_array_equal(np_1, df_1_transformed) - np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0])) + np.testing.assert_array_equal( + cov_transformer._processed_feature_types, np.array([0, 0, 0]) + ) df_2 = pd.DataFrame( - {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], - "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], 
ordered=True, categories=['c', 'b', 'a']), - "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + { + "x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical( + ["a", "b", "c", "a", "b", "c"], + ordered=True, + categories=["c", "b", "a"], + ), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4], + } ) np_2 = np.array( - [[1.5, 2, 1.2], - [2.7, 1, 5.4], - [3.6, 0, 9.3], - [4.4, 2, 10.4], - [5.3, 1, 3.6], - [6.1, 0, 4.4]] + [ + [1.5, 2, 1.2], + [2.7, 1, 5.4], + [3.6, 0, 9.3], + [4.4, 2, 10.4], + [5.3, 1, 3.6], + [6.1, 0, 4.4], + ] ) cov_transformer = CovariatePreprocessor() df_2_transformed = cov_transformer.fit_transform(df_2) np.testing.assert_array_equal(np_2, df_2_transformed) - np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,0])) + np.testing.assert_array_equal( + cov_transformer._processed_feature_types, np.array([0, 1, 0]) + ) df_3 = pd.DataFrame( - {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], - "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']), - "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + { + "x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical( + ["a", "b", "c", "a", "b", "c"], + ordered=False, + categories=["c", "b", "a"], + ), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4], + } ) np_3 = np.array( - [[1.5, 0, 0, 1, 1.2], - [2.7, 0, 1, 0, 5.4], - [3.6, 1, 0, 0, 9.3], - [4.4, 0, 0, 1, 10.4], - [5.3, 0, 1, 0, 3.6], - [6.1, 1, 0, 0, 4.4]] + [ + [1.5, 0, 0, 1, 1.2], + [2.7, 0, 1, 0, 5.4], + [3.6, 1, 0, 0, 9.3], + [4.4, 0, 0, 1, 10.4], + [5.3, 0, 1, 0, 3.6], + [6.1, 1, 0, 0, 4.4], + ] ) cov_transformer = CovariatePreprocessor() df_3_transformed = cov_transformer.fit_transform(df_3) np.testing.assert_array_equal(np_3, df_3_transformed) - np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,0])) + np.testing.assert_array_equal( + cov_transformer._processed_feature_types, np.array([0, 1, 1, 1, 0]) + ) df_4 = pd.DataFrame( - {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6], - "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'c'], ordered=False, categories=['c', 'b', 'a', 'd']), - "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4]} + { + "x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6], + "x2": pd.Categorical( + ["a", "b", "c", "a", "b", "c", "c"], + ordered=False, + categories=["c", "b", "a", "d"], + ), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4], + } ) np_4 = np.array( - [[1.5, 0, 0, 1, 0, 1.2], - [2.7, 0, 1, 0, 0, 5.4], - [3.6, 1, 0, 0, 0, 9.3], - [4.4, 0, 0, 1, 0, 10.4], - [5.3, 0, 1, 0, 0, 3.6], - [6.1, 1, 0, 0, 0, 4.4], - [7.6, 1, 0, 0, 0, 3.4]] + [ + [1.5, 0, 0, 1, 0, 1.2], + [2.7, 0, 1, 0, 0, 5.4], + [3.6, 1, 0, 0, 0, 9.3], + [4.4, 0, 0, 1, 0, 10.4], + [5.3, 0, 1, 0, 0, 3.6], + [6.1, 1, 0, 0, 0, 4.4], + [7.6, 1, 0, 0, 0, 3.4], + ] ) cov_transformer = CovariatePreprocessor() df_4_transformed = cov_transformer.fit_transform(df_4) np.testing.assert_array_equal(np_4, df_4_transformed) - np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,1,0])) - \ No newline at end of file + np.testing.assert_array_equal( + cov_transformer._processed_feature_types, np.array([0, 1, 1, 1, 1, 0]) + ) diff --git a/test/python/test_residual.py b/test/python/test_residual.py index c039a9cb..87c8a17b 100644 --- a/test/python/test_residual.py +++ b/test/python/test_residual.py @@ -1,36 +1,33 @@ import numpy as np -from stochtree import ForestContainer, Forest, Dataset, Residual, ForestSampler, RNG + +from stochtree import RNG, Dataset, Forest, ForestContainer, ForestSampler, Residual + class 
TestResidual: def test_basis_update(self): # Create dataset X = np.array( - [[1.5, 8.7, 1.2], - [2.7, 3.4, 5.4], - [3.6, 1.2, 9.3], - [4.4, 5.4, 10.4], - [5.3, 9.3, 3.6], - [6.1, 10.4, 4.4]] - ) - W = np.array( - [[1], - [1], - [1], - [1], - [1], - [1]] + [ + [1.5, 8.7, 1.2], + [2.7, 3.4, 5.4], + [3.6, 1.2, 9.3], + [4.4, 5.4, 10.4], + [5.3, 9.3, 3.6], + [6.1, 10.4, 4.4], + ] ) + W = np.array([[1], [1], [1], [1], [1], [1]]) n = X.shape[0] p = X.shape[1] - y = np.expand_dims(np.where(X[:,0]>4,-5,5) + np.random.normal(0,1,n), 1) + y = np.expand_dims(np.where(X[:, 0] > 4, -5, 5) + np.random.normal(0, 1, n), 1) y_bar = np.squeeze(np.mean(y)) y_std = np.squeeze(np.std(y)) - resid = (y-y_bar)/y_std + resid = (y - y_bar) / y_std forest_dataset = Dataset() forest_dataset.add_covariates(X) forest_dataset.add_basis(W) residual = Residual(resid) - variable_weights = np.repeat(1.0/p, p) + variable_weights = np.repeat(1.0 / p, p) feature_types = np.repeat(0, p).astype(int) # Forest parameters @@ -38,8 +35,8 @@ def test_basis_update(self): alpha = 0.95 beta = 2.0 min_samples_leaf = 1 - current_sigma2 = 1. - current_leaf_scale = np.array([[1./num_trees]]) + current_sigma2 = 1.0 + current_leaf_scale = np.array([[1.0 / num_trees]]) cutpoint_grid_size = 100 a_forest = 1 b_forest = 1 @@ -48,20 +45,37 @@ def test_basis_update(self): cpp_rng = RNG(-1) # Create forest sampler and forest container - forest_sampler = ForestSampler(forest_dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf) + forest_sampler = ForestSampler( + forest_dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf + ) forest_container = ForestContainer(num_trees, 1, False, False) active_forest = Forest(num_trees, 1, False, False) - + # Initialize the leaves of each tree in the prognostic forest init_root = np.squeeze(np.mean(resid)) / num_trees active_forest.set_root_leaves(init_root) - forest_sampler.adjust_residual(forest_dataset, residual, active_forest, False, True) - + forest_sampler.adjust_residual( + forest_dataset, residual, active_forest, False, True + ) + # Run the forest sampling algorithm for a single iteration forest_sampler.sample_one_iteration( - forest_container, active_forest, forest_dataset, residual, cpp_rng, feature_types, - cutpoint_grid_size, current_leaf_scale, variable_weights, a_forest, b_forest, - current_sigma2, 1, True, True, True + forest_container, + active_forest, + forest_dataset, + residual, + cpp_rng, + feature_types, + cutpoint_grid_size, + current_leaf_scale, + variable_weights, + a_forest, + b_forest, + current_sigma2, + 1, + True, + True, + True, ) # Get the current residual after running the sampler @@ -72,7 +86,7 @@ def test_basis_update(self): # Update the basis vector scalar = 2.0 - W_update = W*scalar + W_update = W * scalar forest_dataset.update_basis(W_update) # Update residual to reflect adjusted basis From 3073a9cca1a32f5c9b258de7e27afa7e7c538892 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 19 Feb 2025 19:03:59 -0600 Subject: [PATCH 13/35] Added config tests --- test/python/test_config.py | 74 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 test/python/test_config.py diff --git a/test/python/test_config.py b/test/python/test_config.py new file mode 100644 index 00000000..09f7f4ae --- /dev/null +++ b/test/python/test_config.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +from stochtree.config import ForestModelConfig, GlobalModelConfig + + +class TestConfig: + def test_forest_config(self): + with 
pytest.warns(): + _ = ForestModelConfig(num_trees=10, num_features=5, num_observations=100) + _ = ForestModelConfig(num_trees=1, num_features=1, num_observations=1) + _ = ForestModelConfig( + num_trees=10, + num_features=5, + num_observations=100, + feature_types=[0, 0, 0, 0, 1], + ) + _ = ForestModelConfig( + num_trees=1, num_features=1, num_observations=1, feature_types=[2] + ) + _ = ForestModelConfig( + num_trees=10, + num_features=5, + num_observations=100, + variable_weights=[0.2, 0.2, 0.2, 0.2, 0.2], + ) + _ = ForestModelConfig( + num_trees=1, num_features=1, num_observations=1, variable_weights=[1.0] + ) + + with pytest.raises(ValueError): + _ = ForestModelConfig() + _ = ForestModelConfig( + num_trees=10, + num_features=6, + num_observations=100, + feature_types=[0, 0, 0, 0, 1], + ) + _ = ForestModelConfig( + num_trees=10, + num_features=1, + num_observations=100, + feature_types=[0, 0, 0, 0, 1], + ) + _ = ForestModelConfig( + num_trees=10, + num_features=6, + num_observations=100, + variable_weights=[0.2, 0.2, 0.2, 0.2, 0.2], + ) + _ = ForestModelConfig( + num_trees=10, + num_features=1, + num_observations=100, + variable_weight=[0.2, 0.2, 0.2, 0.2, 0.2], + ) + _ = ForestModelConfig( + num_trees=10, + num_features=1, + num_observations=100, + leaf_dimension=2, + leaf_model_scale=np.array([2, 3], [3, 4], [5, 6]), + ) + _ = ForestModelConfig( + num_trees=10, num_features=1, num_observations=100, leaf_model_type=4 + ) + _ = ForestModelConfig( + num_trees=10, num_features=1, num_observations=100, leaf_model_type=-1 + ) + + def test_global_config(self): + with pytest.raises(ValueError): + _ = GlobalModelConfig(global_error_variance=0.0) + _ = GlobalModelConfig(global_error_variance=-1.0) From d86ae36fb96b743b63632b52a742f745407e7b3f Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 19 Feb 2025 19:13:51 -0600 Subject: [PATCH 14/35] Partial update of sampler interface to use config objects --- stochtree/config.py | 46 ++++++++++++++++++++++++++++- stochtree/sampler.py | 70 +++++++++++++++----------------------------- 2 files changed, 69 insertions(+), 47 deletions(-) diff --git a/stochtree/config.py b/stochtree/config.py index 635aa1d3..59169550 100644 --- a/stochtree/config.py +++ b/stochtree/config.py @@ -331,7 +331,7 @@ def update_cutpoint_grid_size(self, cutpoint_grid_size: int) -> None: """ self.cutpoint_grid_size = cutpoint_grid_size - def get_feature_types(self): + def get_feature_types(self) -> np.ndarray: """ Query feature types (integer-coded so that 0 = numeric, 1 = ordered categorical, 2 = unordered categorical) @@ -353,6 +353,39 @@ def get_variable_weights(self) -> np.ndarray: """ return self.variable_weights + def get_num_trees(self) -> int: + """ + Query number of trees + + Returns + ------- + num_trees : int + Number of trees in a forest + """ + return self.num_trees + + def get_num_features(self) -> int: + """ + Query number of features + + Returns + ------- + num_features : int + Number of features in a forest + """ + return self.num_features + + def get_num_observations(self) -> int: + """ + Query number of observations + + Returns + ------- + num_observations : int + Number of observations in a forest + """ + return self.num_observations + def get_alpha(self) -> float: """ Query root node split probability in tree prior @@ -397,6 +430,17 @@ def get_max_depth(self) -> int: """ return self.max_depth + def get_leaf_model_type(self) -> int: + """ + Query type of leaf model + + Returns + ------- + leaf_model_type : int + Integer coded leaf model + """ + 
self.leaf_model_type + def get_leaf_model_scale(self) -> np.ndarray: """ Query scale parameter used in Gaussian leaf models diff --git a/stochtree/sampler.py b/stochtree/sampler.py index d4270872..e3b79b32 100644 --- a/stochtree/sampler.py +++ b/stochtree/sampler.py @@ -12,6 +12,7 @@ RngCpp, ) +from .config import ForestModelConfig, GlobalModelConfig from .data import Dataset, Residual from .forest import Forest, ForestContainer @@ -62,23 +63,18 @@ class ForestSampler: def __init__( self, dataset: Dataset, - feature_types: np.array, - num_trees: int, - num_obs: int, - alpha: float, - beta: float, - min_samples_leaf: int, - max_depth: int = -1, + global_config: GlobalModelConfig, + forest_config: ForestModelConfig, ) -> None: self.forest_sampler_cpp = ForestSamplerCpp( dataset.dataset_cpp, - feature_types, - num_trees, - num_obs, - alpha, - beta, - min_samples_leaf, - max_depth, + forest_config.get_feature_types(), + forest_config.get_num_trees(), + forest_config.get_num_observations(), + forest_config.get_alpha(), + forest_config.get_beta(), + forest_config.get_min_samples_leaf(), + forest_config.get_max_depth(), ) def reconstitute_from_forest( @@ -109,14 +105,8 @@ def sample_one_iteration( dataset: Dataset, residual: Residual, rng: RNG, - feature_types: np.array, - cutpoint_grid_size: int, - leaf_model_scale_input: np.array, - variable_weights: np.array, - a_forest: float, - b_forest: float, - global_variance: float, - leaf_model_int: int, + forest_config: ForestModelConfig, + global_config: GlobalModelConfig, keep_forest: bool, gfr: bool, pre_initialized: bool, @@ -136,22 +126,10 @@ def sample_one_iteration( `stochtree` object storing continuously updated partial / full residual rng : RNG `stochtree` object storing C++ random number generator to be used sampling algorithm - feature_types : np.array - Array of integer-coded feature types (0 = numeric, 1 = ordered categorical, 2 = unordered categorical) - cutpoint_grid_size : int - Maximum size of a grid of available cutpoints (which thins the number of possible splits, particularly useful in the grow-from-root algorithm) - leaf_model_scale_input : np.array - Numpy array containing leaf model scale parameter (if the leaf model is univariate, this is essentially a scalar which is used as such in the C++ source, but stored as a numpy array) - variable_weights : np.array - Numpy array containing sampling probabilities for each feature - a_forest : float - Shape parameter for the inverse gamma outcome model for a heteroskedasticity forest - b_forest : float - Scale parameter for the inverse gamma outcome model for a heteroskedasticity forest - global_variance : float - Current value of the global error variance parameter - leaf_model_int : int - Integer encoding the leaf model type (0 = constant Gaussian leaf mean model, 1 = univariate Gaussian leaf regression mean model, 2 = multivariate Gaussian leaf regression mean model, 3 = univariate Inverse Gamma constant leaf variance model) + forest_config : ForestModelConfig + `ForestModelConfig` object containing forest model parameters and settings + global_config : GlobalModelConfig + `GlobalModelConfig` object containing global model parameters and settings keep_forest : bool Whether or not the resulting forest should be retained in `forest_container` or discarded (due to burnin or thinning for example) gfr : bool @@ -165,14 +143,14 @@ def sample_one_iteration( dataset.dataset_cpp, residual.residual_cpp, rng.rng_cpp, - feature_types, - cutpoint_grid_size, - leaf_model_scale_input, - 
variable_weights, - a_forest, - b_forest, - global_variance, - leaf_model_int, + forest_config.get_feature_types(), + forest_config.get_cutpoint_grid_size(), + forest_config.get_leaf_model_scale(), + forest_config.get_variable_weights(), + forest_config.get_variance_forest_shape(), + forest_config.get_variance_forest_scale(), + global_config.get_global_error_variance(), + forest_config.get_leaf_model_type(), keep_forest, gfr, pre_initialized, From eca9c325b041694b079b8443c4284390a9d8a260 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 23 Feb 2025 13:59:44 -0600 Subject: [PATCH 15/35] Add flexibility in use of config objects in R and python interfaces --- R/config.R | 28 +++++++++++++++++++ R/cpp11.R | 16 +++++++++++ R/model.R | 43 +++++++++++++++++++++++++++++ man/ForestModel.Rd | 56 ++++++++++++++++++++++++++++++++++++++ man/ForestModelConfig.Rd | 58 +++++++++++++++++++++++++++++++++++++--- src/cpp11.cpp | 32 ++++++++++++++++++++++ src/py_stochtree.cpp | 22 ++++++++++++++- src/sampler.cpp | 21 +++++++++++++++ stochtree/config.py | 6 ++--- stochtree/sampler.py | 13 +++++++-- 10 files changed, 286 insertions(+), 9 deletions(-) diff --git a/R/config.R b/R/config.R index c9b04e3f..dc156f5e 100644 --- a/R/config.R +++ b/R/config.R @@ -249,6 +249,27 @@ ForestModelConfig <- R6::R6Class( return(self$variable_weights) }, + #' @description + #' Query number of trees + #' @returns Number of trees in a forest + get_num_trees = function() { + return(self$num_trees) + }, + + #' @description + #' Query number of features + #' @returns Number of features in a forest model training set + get_num_features = function() { + return(self$num_features) + }, + + #' @description + #' Query number of observations + #' @returns Number of observations in a forest model training set + get_num_observations = function() { + return(self$num_observations) + }, + #' @description #' Query root node split probability in tree prior for this ForestModelConfig object #' @returns Root node split probability in tree prior @@ -277,6 +298,13 @@ ForestModelConfig <- R6::R6Class( return(self$max_depth) }, + #' @description + #' Query (integer-coded) type of leaf model + #' @returns Integer coded leaf model type + get_leaf_model_type = function() { + return(self$leaf_model_type) + }, + #' @description #' Query scale parameter used in Gaussian leaf models for this ForestModelConfig object #' @returns Scale parameter used in Gaussian leaf models diff --git a/R/cpp11.R b/R/cpp11.R index 7188e9f7..c3218f8f 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -596,6 +596,22 @@ update_max_depth_tree_prior_cpp <- function(tree_prior_ptr, max_depth) { invisible(.Call(`_stochtree_update_max_depth_tree_prior_cpp`, tree_prior_ptr, max_depth)) } +get_alpha_tree_prior_cpp <- function(tree_prior_ptr) { + .Call(`_stochtree_get_alpha_tree_prior_cpp`, tree_prior_ptr) +} + +get_beta_tree_prior_cpp <- function(tree_prior_ptr) { + .Call(`_stochtree_get_beta_tree_prior_cpp`, tree_prior_ptr) +} + +get_min_samples_leaf_tree_prior_cpp <- function(tree_prior_ptr) { + .Call(`_stochtree_get_min_samples_leaf_tree_prior_cpp`, tree_prior_ptr) +} + +get_max_depth_tree_prior_cpp <- function(tree_prior_ptr) { + .Call(`_stochtree_get_max_depth_tree_prior_cpp`, tree_prior_ptr) +} + forest_tracker_cpp <- function(data, feature_types, num_trees, n) { .Call(`_stochtree_forest_tracker_cpp`, data, feature_types, num_trees, n) } diff --git a/R/model.R b/R/model.R index 8f1da6f5..aa33cd7d 100644 --- a/R/model.R +++ b/R/model.R @@ -85,6 +85,21 @@ ForestModel <- R6::R6Class( 
global_scale <- global_model_config$global_error_variance cutpoint_grid_size <- forest_model_config$cutpoint_grid_size + # Detect changes to tree prior + if (forest_model_config$alpha != get_alpha_tree_prior_cpp(self$tree_prior_ptr)) { + update_alpha_tree_prior_cpp(self$tree_prior_ptr, forest_model_config$alpha) + } + if (forest_model_config$beta != get_beta_tree_prior_cpp(self$tree_prior_ptr)) { + update_beta_tree_prior_cpp(self$tree_prior_ptr, forest_model_config$beta) + } + if (forest_model_config$min_samples_leaf != get_min_samples_leaf_tree_prior_cpp(self$tree_prior_ptr)) { + update_min_samples_leaf_tree_prior_cpp(self$tree_prior_ptr, forest_model_config$min_samples_leaf) + } + if (forest_model_config$max_depth != get_max_depth_tree_prior_cpp(self$tree_prior_ptr)) { + update_max_depth_tree_prior_cpp(self$tree_prior_ptr, forest_model_config$max_depth) + } + + # Run the sampler if (gfr) { sample_gfr_one_iteration_cpp( forest_dataset$data_ptr, residual$data_ptr, @@ -165,6 +180,34 @@ ForestModel <- R6::R6Class( #' @return None update_max_depth = function(max_depth) { update_max_depth_tree_prior_cpp(self$tree_prior_ptr, max_depth) + }, + + #' @description + #' Update alpha in the tree prior + #' @return Value of alpha in the tree prior + get_alpha = function() { + get_alpha_tree_prior_cpp(self$tree_prior_ptr) + }, + + #' @description + #' Update beta in the tree prior + #' @return Value of beta in the tree prior + get_beta = function() { + get_beta_tree_prior_cpp(self$tree_prior_ptr) + }, + + #' @description + #' Query min_samples_leaf in the tree prior + #' @return Value of min_samples_leaf in the tree prior + get_min_samples_leaf = function() { + get_min_samples_leaf_tree_prior_cpp(self$tree_prior_ptr) + }, + + #' @description + #' Query max_depth in the tree prior + #' @return Value of max_depth in the tree prior + get_max_depth = function() { + get_max_depth_tree_prior_cpp(self$tree_prior_ptr) } ) ) diff --git a/man/ForestModel.Rd b/man/ForestModel.Rd index 778764b3..ad1181d5 100644 --- a/man/ForestModel.Rd +++ b/man/ForestModel.Rd @@ -28,6 +28,10 @@ trees, and exposes functionality to run a forest sampler \item \href{#method-ForestModel-update_beta}{\code{ForestModel$update_beta()}} \item \href{#method-ForestModel-update_min_samples_leaf}{\code{ForestModel$update_min_samples_leaf()}} \item \href{#method-ForestModel-update_max_depth}{\code{ForestModel$update_max_depth()}} +\item \href{#method-ForestModel-get_alpha}{\code{ForestModel$get_alpha()}} +\item \href{#method-ForestModel-get_beta}{\code{ForestModel$get_beta()}} +\item \href{#method-ForestModel-get_min_samples_leaf}{\code{ForestModel$get_min_samples_leaf()}} +\item \href{#method-ForestModel-get_max_depth}{\code{ForestModel$get_max_depth()}} } } \if{html}{\out{
}} @@ -245,4 +249,56 @@ Update max_depth in the tree prior None } } +\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-ForestModel-get_alpha}{}}}
+\subsection{Method \code{get_alpha()}}{
+Query alpha in the tree prior
+\subsection{Usage}{
+\if{html}{\out{
}}\preformatted{ForestModel$get_alpha()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Value of alpha in the tree prior +} +} +\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-ForestModel-get_beta}{}}}
+\subsection{Method \code{get_beta()}}{
+Query beta in the tree prior
+\subsection{Usage}{
+\if{html}{\out{
}}\preformatted{ForestModel$get_beta()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Value of beta in the tree prior +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModel-get_min_samples_leaf}{}}} +\subsection{Method \code{get_min_samples_leaf()}}{ +Query min_samples_leaf in the tree prior +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestModel$get_min_samples_leaf()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Value of min_samples_leaf in the tree prior +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModel-get_max_depth}{}}} +\subsection{Method \code{get_max_depth()}}{ +Query max_depth in the tree prior +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestModel$get_max_depth()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Value of max_depth in the tree prior +} +} } diff --git a/man/ForestModelConfig.Rd b/man/ForestModelConfig.Rd index e899c8b1..e75a6cd8 100644 --- a/man/ForestModelConfig.Rd +++ b/man/ForestModelConfig.Rd @@ -9,6 +9,12 @@ Vector of integer-coded feature types (integers where 0 = numeric, 1 = ordered c Vector specifying sampling probability for all p covariates in ForestDataset +Number of trees in a forest + +Number of features in a forest model training set + +Number of observations in a forest model training set + Root node split probability in tree prior Depth prior penalty in tree prior @@ -17,6 +23,8 @@ Minimum number of samples in a tree leaf Maximum depth of any tree in the ensemble in the model +Integer coded leaf model type + Scale parameter used in Gaussian leaf models Shape parameter for IG leaf models @@ -85,10 +93,14 @@ Create a new ForestModelConfig object.} \item \href{#method-ForestModelConfig-update_cutpoint_grid_size}{\code{ForestModelConfig$update_cutpoint_grid_size()}} \item \href{#method-ForestModelConfig-get_feature_types}{\code{ForestModelConfig$get_feature_types()}} \item \href{#method-ForestModelConfig-get_variable_weights}{\code{ForestModelConfig$get_variable_weights()}} +\item \href{#method-ForestModelConfig-get_num_trees}{\code{ForestModelConfig$get_num_trees()}} +\item \href{#method-ForestModelConfig-get_num_features}{\code{ForestModelConfig$get_num_features()}} +\item \href{#method-ForestModelConfig-get_num_observations}{\code{ForestModelConfig$get_num_observations()}} \item \href{#method-ForestModelConfig-get_alpha}{\code{ForestModelConfig$get_alpha()}} \item \href{#method-ForestModelConfig-get_beta}{\code{ForestModelConfig$get_beta()}} \item \href{#method-ForestModelConfig-get_min_samples_leaf}{\code{ForestModelConfig$get_min_samples_leaf()}} \item \href{#method-ForestModelConfig-get_max_depth}{\code{ForestModelConfig$get_max_depth()}} +\item \href{#method-ForestModelConfig-get_leaf_model_type}{\code{ForestModelConfig$get_leaf_model_type()}} \item \href{#method-ForestModelConfig-get_leaf_model_scale}{\code{ForestModelConfig$get_leaf_model_scale()}} \item \href{#method-ForestModelConfig-get_variance_forest_shape}{\code{ForestModelConfig$get_variance_forest_shape()}} \item \href{#method-ForestModelConfig-get_variance_forest_scale}{\code{ForestModelConfig$get_variance_forest_scale()}} @@ -122,7 +134,7 @@ Create a new ForestModelConfig object.} \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{feature_types}}{Vector of integer-coded feature types (integers where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical)} +\item{\code{feature_types}}{Vector of integer-coded feature types (where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical)} \item{\code{num_trees}}{Number of trees in the forest being sampled} @@ -230,7 +242,7 @@ Update depth prior penalty in tree prior \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-ForestModelConfig-update_min_samples_leaf}{}}} \subsection{Method \code{update_min_samples_leaf()}}{ -Update root node split probability in tree prior +Update minimum number of samples per leaf node in the tree prior \subsection{Usage}{ \if{html}{\out{
}}\preformatted{ForestModelConfig$update_min_samples_leaf(min_samples_leaf)}\if{html}{\out{
}} } @@ -247,7 +259,7 @@ Update root node split probability in tree prior \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-ForestModelConfig-update_max_depth}{}}} \subsection{Method \code{update_max_depth()}}{ -Update root node split probability in tree prior +Update max depth in the tree prior \subsection{Usage}{ \if{html}{\out{
}}\preformatted{ForestModelConfig$update_max_depth(max_depth)}\if{html}{\out{
}} } @@ -347,6 +359,36 @@ Query variable weights for this ForestModelConfig object \if{html}{\out{
}}\preformatted{ForestModelConfig$get_variable_weights()}\if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModelConfig-get_num_trees}{}}} +\subsection{Method \code{get_num_trees()}}{ +Query number of trees +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestModelConfig$get_num_trees()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModelConfig-get_num_features}{}}} +\subsection{Method \code{get_num_features()}}{ +Query number of features +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestModelConfig$get_num_features()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModelConfig-get_num_observations}{}}} +\subsection{Method \code{get_num_observations()}}{ +Query number of observations +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestModelConfig$get_num_observations()}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} @@ -387,6 +429,16 @@ Query root node split probability in tree prior for this ForestModelConfig objec \if{html}{\out{
}}\preformatted{ForestModelConfig$get_max_depth()}\if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModelConfig-get_leaf_model_type}{}}} +\subsection{Method \code{get_leaf_model_type()}}{ +Query (integer-coded) type of leaf model +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ForestModelConfig$get_leaf_model_type()}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 00a3fcbc..28ccb7ec 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -1104,6 +1104,34 @@ extern "C" SEXP _stochtree_update_max_depth_tree_prior_cpp(SEXP tree_prior_ptr, END_CPP11 } // sampler.cpp +double get_alpha_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr); +extern "C" SEXP _stochtree_get_alpha_tree_prior_cpp(SEXP tree_prior_ptr) { + BEGIN_CPP11 + return cpp11::as_sexp(get_alpha_tree_prior_cpp(cpp11::as_cpp>>(tree_prior_ptr))); + END_CPP11 +} +// sampler.cpp +double get_beta_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr); +extern "C" SEXP _stochtree_get_beta_tree_prior_cpp(SEXP tree_prior_ptr) { + BEGIN_CPP11 + return cpp11::as_sexp(get_beta_tree_prior_cpp(cpp11::as_cpp>>(tree_prior_ptr))); + END_CPP11 +} +// sampler.cpp +int get_min_samples_leaf_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr); +extern "C" SEXP _stochtree_get_min_samples_leaf_tree_prior_cpp(SEXP tree_prior_ptr) { + BEGIN_CPP11 + return cpp11::as_sexp(get_min_samples_leaf_tree_prior_cpp(cpp11::as_cpp>>(tree_prior_ptr))); + END_CPP11 +} +// sampler.cpp +int get_max_depth_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr); +extern "C" SEXP _stochtree_get_max_depth_tree_prior_cpp(SEXP tree_prior_ptr) { + BEGIN_CPP11 + return cpp11::as_sexp(get_max_depth_tree_prior_cpp(cpp11::as_cpp>>(tree_prior_ptr))); + END_CPP11 +} +// sampler.cpp cpp11::external_pointer forest_tracker_cpp(cpp11::external_pointer data, cpp11::integers feature_types, int num_trees, StochTree::data_size_t n); extern "C" SEXP _stochtree_forest_tracker_cpp(SEXP data, SEXP feature_types, SEXP num_trees, SEXP n) { BEGIN_CPP11 @@ -1449,10 +1477,14 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2}, {"_stochtree_forest_dataset_update_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_update_basis_cpp, 2}, {"_stochtree_forest_tracker_cpp", (DL_FUNC) &_stochtree_forest_tracker_cpp, 4}, + {"_stochtree_get_alpha_tree_prior_cpp", (DL_FUNC) &_stochtree_get_alpha_tree_prior_cpp, 1}, + {"_stochtree_get_beta_tree_prior_cpp", (DL_FUNC) &_stochtree_get_beta_tree_prior_cpp, 1}, {"_stochtree_get_forest_split_counts_forest_container_cpp", (DL_FUNC) &_stochtree_get_forest_split_counts_forest_container_cpp, 3}, {"_stochtree_get_granular_split_count_array_active_forest_cpp", (DL_FUNC) &_stochtree_get_granular_split_count_array_active_forest_cpp, 2}, {"_stochtree_get_granular_split_count_array_forest_container_cpp", (DL_FUNC) &_stochtree_get_granular_split_count_array_forest_container_cpp, 2}, {"_stochtree_get_json_string_cpp", (DL_FUNC) &_stochtree_get_json_string_cpp, 1}, + {"_stochtree_get_max_depth_tree_prior_cpp", (DL_FUNC) &_stochtree_get_max_depth_tree_prior_cpp, 1}, + {"_stochtree_get_min_samples_leaf_tree_prior_cpp", (DL_FUNC) &_stochtree_get_min_samples_leaf_tree_prior_cpp, 1}, {"_stochtree_get_overall_split_counts_active_forest_cpp", (DL_FUNC) &_stochtree_get_overall_split_counts_active_forest_cpp, 2}, {"_stochtree_get_overall_split_counts_forest_container_cpp", (DL_FUNC) &_stochtree_get_overall_split_counts_forest_container_cpp, 2}, {"_stochtree_get_residual_cpp", (DL_FUNC) &_stochtree_get_residual_cpp, 1}, diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp index ee25e586..63e4c667 100644 --- a/src/py_stochtree.cpp +++ b/src/py_stochtree.cpp @@ -1125,6 +1125,22 @@ class ForestSamplerCpp { split_prior_->SetMaxDepth(max_depth); } + double GetAlpha() { + return 
split_prior_->GetAlpha(); + } + + double GetBeta() { + return split_prior_->GetBeta(); + } + + int GetMinSamplesLeaf() { + return split_prior_->GetMinSamplesLeaf(); + } + + int GetMaxDepth() { + return split_prior_->GetMaxDepth(); + } + private: std::unique_ptr tracker_; std::unique_ptr split_prior_; @@ -1704,7 +1720,11 @@ PYBIND11_MODULE(stochtree_cpp, m) { .def("UpdateAlpha", &ForestSamplerCpp::UpdateAlpha) .def("UpdateBeta", &ForestSamplerCpp::UpdateBeta) .def("UpdateMinSamplesLeaf", &ForestSamplerCpp::UpdateMinSamplesLeaf) - .def("UpdateMaxDepth", &ForestSamplerCpp::UpdateMaxDepth); + .def("UpdateMaxDepth", &ForestSamplerCpp::UpdateMaxDepth) + .def("GetAlpha", &ForestSamplerCpp::GetAlpha) + .def("GetBeta", &ForestSamplerCpp::GetBeta) + .def("GetMinSamplesLeaf", &ForestSamplerCpp::GetMinSamplesLeaf) + .def("GetMaxDepth", &ForestSamplerCpp::GetMaxDepth); py::class_(m, "GlobalVarianceModelCpp") .def(py::init<>()) diff --git a/src/sampler.cpp b/src/sampler.cpp index 4dbe5e13..8890237a 100644 --- a/src/sampler.cpp +++ b/src/sampler.cpp @@ -227,6 +227,27 @@ void update_max_depth_tree_prior_cpp(cpp11::external_pointerSetMaxDepth(max_depth); } +[[cpp11::register]] +double get_alpha_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr) { + return tree_prior_ptr->GetAlpha(); +} + +[[cpp11::register]] +double get_beta_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr) { + // Update beta + return tree_prior_ptr->GetBeta(); +} + +[[cpp11::register]] +int get_min_samples_leaf_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr) { + return tree_prior_ptr->GetMinSamplesLeaf(); +} + +[[cpp11::register]] +int get_max_depth_tree_prior_cpp(cpp11::external_pointer tree_prior_ptr) { + return tree_prior_ptr->GetMaxDepth(); +} + [[cpp11::register]] cpp11::external_pointer forest_tracker_cpp(cpp11::external_pointer data, cpp11::integers feature_types, int num_trees, StochTree::data_size_t n) { // Convert vector of integers to std::vector of enum FeatureType diff --git a/stochtree/config.py b/stochtree/config.py index 59169550..9ff7cbed 100644 --- a/stochtree/config.py +++ b/stochtree/config.py @@ -371,7 +371,7 @@ def get_num_features(self) -> int: Returns ------- num_features : int - Number of features in a forest + Number of features in a forest model training set """ return self.num_features @@ -432,12 +432,12 @@ def get_max_depth(self) -> int: def get_leaf_model_type(self) -> int: """ - Query type of leaf model + Query (integer-coded) type of leaf model Returns ------- leaf_model_type : int - Integer coded leaf model + Integer coded leaf model type """ self.leaf_model_type diff --git a/stochtree/sampler.py b/stochtree/sampler.py index e3b79b32..ea94d444 100644 --- a/stochtree/sampler.py +++ b/stochtree/sampler.py @@ -2,8 +2,6 @@ Python classes wrapping C++ sampler objects """ -from typing import Union - import numpy as np from stochtree_cpp import ( ForestSamplerCpp, @@ -137,6 +135,17 @@ def sample_one_iteration( pre_initialized : bool Whether or not the forest being sampled has already been initialized """ + # Detect changes to the tree prior + if self.forest_sampler_cpp.GetAlpha() != forest_config.get_alpha(): + self.forest_sampler_cpp.SetAlpha(forest_config.get_alpha()) + if self.forest_sampler_cpp.GetBeta() != forest_config.get_beta(): + self.forest_sampler_cpp.SetBeta(forest_config.get_beta()) + if self.forest_sampler_cpp.GetMinSamplesLeaf() != forest_config.get_min_samples_leaf(): + self.forest_sampler_cpp.SetMinSamplesLeaf(forest_config.get_min_samples_leaf()) + if 
self.forest_sampler_cpp.GetMaxDepth() != forest_config.get_max_depth(): + self.forest_sampler_cpp.SetMaxDepth(forest_config.get_max_depth()) + + # Run the sampler self.forest_sampler_cpp.SampleOneIteration( forest_container.forest_container_cpp, forest.forest_cpp, From 74b8d6ad9ae54ecb4b55afcc211674e2ac2d2380 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 23 Feb 2025 14:19:43 -0600 Subject: [PATCH 16/35] Refactored pre_initialized parameter out of python interface --- src/py_stochtree.cpp | 6 +++++- stochtree/forest.py | 22 ++++++++++++++++++++++ stochtree/sampler.py | 22 +++++++++++++++------- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp index 63e4c667..26608b22 100644 --- a/src/py_stochtree.cpp +++ b/src/py_stochtree.cpp @@ -972,7 +972,11 @@ class ForestSamplerCpp { void SampleOneIteration(ForestContainerCpp& forest_samples, ForestCpp& forest, ForestDatasetCpp& dataset, ResidualCpp& residual, RngCpp& rng, py::array_t feature_types, int cutpoint_grid_size, py::array_t leaf_model_scale_input, py::array_t variable_weights, double a_forest, double b_forest, double global_variance, - int leaf_model_int, bool keep_forest = true, bool gfr = true, bool pre_initialized = false) { + int leaf_model_int, bool keep_forest = true, bool gfr = true) { + // Refactoring completely out of the Python interface. + // Intention to refactor out of the C++ interface in the future. + bool pre_initialized = true; + // Unpack feature types std::vector feature_types_(feature_types.size()); for (int i = 0; i < feature_types.size(); i++) { diff --git a/stochtree/forest.py b/stochtree/forest.py index c1192183..2638c577 100644 --- a/stochtree/forest.py +++ b/stochtree/forest.py @@ -816,6 +816,7 @@ def __init__( self.output_dimension = output_dimension self.leaf_constant = leaf_constant self.is_exponentiated = is_exponentiated + self.internal_forest_is_empty = True def reset_root(self) -> None: """ @@ -895,6 +896,7 @@ def set_root_leaves(self, leaf_value: Union[float, np.array]) -> None: self.forest_cpp.SetRootVector(leaf_value, leaf_value.shape[0]) else: self.forest_cpp.SetRootValue(leaf_value) + self.internal_forest_is_empty = False def add_numeric_split( self, @@ -1347,3 +1349,23 @@ def leaves(self, tree_num: int) -> np.array: Array of indices of leaf nodes in tree `tree_num`. """ return self.forest_cpp.Leaves(tree_num) + + def is_empty(self) -> bool: + """ + When a Forest object is created, it is "empty" in the sense that none + of its component trees have leaves with values. There are two ways to + "initialize" a Forest object. First, the `set_root_leaves()` method of the + Forest class simply initializes every tree in the forest to a single node + carrying the same (user-specified) leaf value. Second, the `prepare_for_sampler()` + method of the ForestSampler class initializes every tree in the forest to a + single node with the same value and also propagates this information through + to the temporary tracking data structrues in a ForestSampler object, which + must be synchronized with a Forest during a forest sampler loop. + + Returns + ------- + bool + `True` if a Forest has not yet been initialized with a constant root value, + `False` otherwise if the forest has already been initialized / grown. 
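+
+        Examples
+        --------
+        A minimal sketch (illustrative only; the positional constructor
+        arguments follow the patterns used in this package's tests):
+
+        >>> forest = Forest(10, 1, True, False)
+        >>> forest.is_empty()
+        True
+        >>> forest.set_root_leaves(0.0)
+        >>> forest.is_empty()
+        False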
+ """ + return self.internal_forest_is_empty diff --git a/stochtree/sampler.py b/stochtree/sampler.py index ea94d444..b8eb6e61 100644 --- a/stochtree/sampler.py +++ b/stochtree/sampler.py @@ -107,7 +107,6 @@ def sample_one_iteration( global_config: GlobalModelConfig, keep_forest: bool, gfr: bool, - pre_initialized: bool, ) -> None: """ Sample one iteration of a forest using the specified model and tree sampling algorithm @@ -132,19 +131,28 @@ def sample_one_iteration( Whether or not the resulting forest should be retained in `forest_container` or discarded (due to burnin or thinning for example) gfr : bool Whether or not the "grow-from-root" (GFR) sampler is run (if this is `True` and `leaf_model_int=0` this is equivalent to XBART, if this is `FALSE` and `leaf_model_int=0` this is equivalent to the original BART) - pre_initialized : bool - Whether or not the forest being sampled has already been initialized """ + # Ensure forest has been initialized + if forest.is_empty(): + raise ValueError( + "`forest` has not yet been initialized, which is necessary to run the sampler. Please set constant values for `forest`'s leaves using the `set_root_leaves` method." + ) + # Detect changes to the tree prior if self.forest_sampler_cpp.GetAlpha() != forest_config.get_alpha(): self.forest_sampler_cpp.SetAlpha(forest_config.get_alpha()) if self.forest_sampler_cpp.GetBeta() != forest_config.get_beta(): self.forest_sampler_cpp.SetBeta(forest_config.get_beta()) - if self.forest_sampler_cpp.GetMinSamplesLeaf() != forest_config.get_min_samples_leaf(): - self.forest_sampler_cpp.SetMinSamplesLeaf(forest_config.get_min_samples_leaf()) + if ( + self.forest_sampler_cpp.GetMinSamplesLeaf() + != forest_config.get_min_samples_leaf() + ): + self.forest_sampler_cpp.SetMinSamplesLeaf( + forest_config.get_min_samples_leaf() + ) if self.forest_sampler_cpp.GetMaxDepth() != forest_config.get_max_depth(): self.forest_sampler_cpp.SetMaxDepth(forest_config.get_max_depth()) - + # Run the sampler self.forest_sampler_cpp.SampleOneIteration( forest_container.forest_container_cpp, @@ -162,7 +170,6 @@ def sample_one_iteration( forest_config.get_leaf_model_type(), keep_forest, gfr, - pre_initialized, ) def prepare_for_sampler( @@ -196,6 +203,7 @@ def prepare_for_sampler( leaf_model, initial_values, ) + forest.internal_forest_is_empty = False def adjust_residual( self, From d383760ea8207b6d59c4c1df24d1488770e55e47 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 26 Feb 2025 00:54:40 -0600 Subject: [PATCH 17/35] Updated python package to use config objects --- src/py_stochtree.cpp | 22 +++- stochtree/__init__.py | 3 + stochtree/bart.py | 122 ++++++++++++----------- stochtree/bcf.py | 188 +++++++++++++++++++---------------- stochtree/config.py | 22 ++-- stochtree/data.py | 55 ++++++++++ stochtree/sampler.py | 6 +- test/python/test_json.py | 55 ++++++---- test/python/test_residual.py | 39 +++++--- 9 files changed, 322 insertions(+), 190 deletions(-) diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp index 26608b22..0581bf85 100644 --- a/src/py_stochtree.cpp +++ b/src/py_stochtree.cpp @@ -71,6 +71,22 @@ class ForestDatasetCpp { return dataset_->NumObservations(); } + int NumCovariates() { + return dataset_->NumCovariates(); + } + + int NumBasis() { + return dataset_->NumBasis(); + } + + bool HasBasis() { + return dataset_->HasBasis(); + } + + bool HasVarianceWeights() { + return dataset_->HasVarWeights(); + } + StochTree::ForestDataset* GetDataset() { return dataset_.get(); } @@ -1620,7 +1636,11 @@ 
PYBIND11_MODULE(stochtree_cpp, m) { .def("AddBasis", &ForestDatasetCpp::AddBasis) .def("UpdateBasis", &ForestDatasetCpp::UpdateBasis) .def("AddVarianceWeights", &ForestDatasetCpp::AddVarianceWeights) - .def("NumRows", &ForestDatasetCpp::NumRows); + .def("NumRows", &ForestDatasetCpp::NumRows) + .def("NumCovariates", &ForestDatasetCpp::NumCovariates) + .def("NumBasis", &ForestDatasetCpp::NumBasis) + .def("HasBasis", &ForestDatasetCpp::HasBasis) + .def("HasVarianceWeights", &ForestDatasetCpp::HasVarianceWeights); py::class_(m, "ResidualCpp") .def(py::init,data_size_t>()) diff --git a/stochtree/__init__.py b/stochtree/__init__.py index c93585bb..62c6e019 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -1,6 +1,7 @@ from .bart import BARTModel from .bcf import BCFModel from .calibration import calibrate_global_error_variance +from .config import ForestModelConfig, GlobalModelConfig from .data import Dataset, Residual from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor @@ -29,6 +30,8 @@ "ForestSampler", "GlobalVarianceModel", "LeafVarianceModel", + "ForestModelConfig", + "GlobalModelConfig", "JSONSerializer", "NotSampledError", "_check_array_integer", diff --git a/stochtree/bart.py b/stochtree/bart.py index 9ab3e592..1007681a 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd +from .config import ForestModelConfig, GlobalModelConfig from .data import Dataset, Residual from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor, _preprocess_params @@ -345,6 +346,7 @@ def sample( original_var_indices = ( self._covariate_preprocessor.fetch_original_feature_indices() ) + num_features = len(feature_types) # Determine whether a test set is provided self.has_test = X_test is not None @@ -672,40 +674,66 @@ def sample( else: cpp_rng = RNG(random_seed) - # Sampling data structures - if self.include_mean_forest: - forest_sampler_mean = ForestSampler( - forest_dataset_train, - feature_types, - num_trees_mean, - self.n_train, - alpha_mean, - beta_mean, - min_samples_leaf_mean, - max_depth_mean, - ) - if self.include_variance_forest: - forest_sampler_variance = ForestSampler( - forest_dataset_train, - feature_types, - num_trees_variance, - self.n_train, - alpha_variance, - beta_variance, - min_samples_leaf_variance, - max_depth_variance, - ) - # Set variance leaf model type (currently only one option) leaf_model_variance_forest = 3 + leaf_dimension_variance = 1 # Determine the mean forest leaf model type if not self.has_basis: leaf_model_mean_forest = 0 + leaf_dimension_mean = 1 elif self.num_basis == 1: leaf_model_mean_forest = 1 + leaf_dimension_mean = 1 else: leaf_model_mean_forest = 2 + leaf_dimension_mean = self.num_basis + + # Sampling data structures + global_model_config = GlobalModelConfig(global_error_variance=current_sigma2) + if self.include_mean_forest: + forest_model_config_mean = ForestModelConfig( + num_trees=num_trees_mean, + num_features=num_features, + num_observations=self.n_train, + feature_types=feature_types, + variable_weights=variable_weights_mean, + leaf_dimension=leaf_dimension_mean, + alpha=alpha_mean, + beta=beta_mean, + min_samples_leaf=min_samples_leaf_mean, + max_depth=max_depth_mean, + leaf_model_type=leaf_model_mean_forest, + leaf_model_scale=current_leaf_scale, + cutpoint_grid_size=cutpoint_grid_size, + ) + forest_sampler_mean = ForestSampler( + forest_dataset_train, + global_model_config, + forest_model_config_mean, + ) + if 
self.include_variance_forest: + forest_model_config_variance = ForestModelConfig( + num_trees=num_trees_variance, + num_features=num_features, + num_observations=self.n_train, + feature_types=feature_types, + variable_weights=variable_weights_variance, + leaf_dimension=leaf_dimension_variance, + alpha=alpha_variance, + beta=beta_variance, + min_samples_leaf=min_samples_leaf_variance, + max_depth=max_depth_variance, + leaf_model_type=leaf_model_variance_forest, + cutpoint_grid_size=cutpoint_grid_size, + variance_forest_shape=a_forest, + variance_forest_scale=b_forest, + ) + forest_sampler_variance = ForestSampler( + forest_dataset_train, + global_model_config, + forest_model_config_variance, + ) # Container of forest samples if self.include_mean_forest: @@ -772,17 +800,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale, - variable_weights_mean, - a_forest, - b_forest, - current_sigma2, - leaf_model_mean_forest, + global_model_config, + forest_model_config_mean, keep_sample, True, - True, ) # Sample the variance forest @@ -793,17 +814,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale, - variable_weights_variance, - a_forest, - b_forest, - current_sigma2, - leaf_model_variance_forest, + global_model_config, + forest_model_config_variance, keep_sample, True, - True, ) # Sample variance parameters (if requested) @@ -811,12 +825,14 @@ def sample( current_sigma2 = global_var_model.sample_one_iteration( residual_train, cpp_rng, a_global, b_global ) + global_model_config.update_global_error_variance(current_sigma2) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf: current_leaf_scale[0, 0] = leaf_var_model.sample_one_iteration( active_forest_mean, cpp_rng, a_leaf, b_leaf ) + forest_model_config_mean.update_leaf_model_scale(current_leaf_scale) if keep_sample: self.leaf_scale_samples[sample_counter] = current_leaf_scale[ 0, 0 @@ -899,17 +915,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale, - variable_weights_mean, - a_forest, - b_forest, - current_sigma2, - leaf_model_mean_forest, + global_model_config, + forest_model_config_mean, keep_sample, False, - True, ) # Sample the variance forest @@ -920,17 +929,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale, - variable_weights_variance, - a_forest, - b_forest, - current_sigma2, - leaf_model_variance_forest, + global_model_config, + forest_model_config_variance, keep_sample, False, - True, ) # Sample variance parameters (if requested) diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 39975569..325bb43b 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -9,6 +9,7 @@ from sklearn.utils import check_scalar from .bart import BARTModel +from .config import ForestModelConfig, GlobalModelConfig from .data import Dataset, Residual from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor, _preprocess_params @@ -410,13 +411,20 @@ def sample( if X_test.shape[0] != pi_test.shape[0]: raise ValueError("X_test and pi_test must have the same number of rows") + # Prognostic model details + leaf_dimension_mu = 1 + leaf_model_mu = 0 + # Treatment details self.treatment_dim = Z_train.shape[1] self.multivariate_treatment = True if self.treatment_dim > 1 else False - treatment_leaf_model 
= 2 if self.multivariate_treatment else 1 + leaf_dimension_tau = self.treatment_dim + leaf_model_tau = 2 if self.multivariate_treatment else 1 + # treatment_leaf_model = 2 if self.multivariate_treatment else 1 # Set variance leaf model type (currently only one option) - leaf_model_variance_forest = 3 + leaf_dimension_variance = 1 + leaf_model_variance = 3 self.variance_scale = 1 # Check parameters @@ -1139,7 +1147,9 @@ def sample( "propensity_covariate must equal one of 'none', 'mu', 'tau', or 'both'" ) if propensity_covariate != "none": - feature_types = np.append(feature_types, 0).astype("int") + feature_types = np.append( + feature_types, np.repeat(0, pi_train.shape[1]) + ).astype("int") X_train_processed = np.c_[X_train_processed, pi_train] if self.has_test: X_test_processed = np.c_[X_test_processed, pi_test] @@ -1240,42 +1250,74 @@ def sample( cpp_rng = RNG(random_seed) # Sampling data structures + global_model_config = GlobalModelConfig(global_error_variance=current_sigma2) + forest_model_config_mu = ForestModelConfig( + num_trees=num_trees_mu, + num_features=forest_dataset_train.num_covariates(), + num_observations=self.n_train, + feature_types=feature_types, + variable_weights=variable_weights_mu, + leaf_dimension=leaf_dimension_mu, + alpha=alpha_mu, + beta=beta_mu, + min_samples_leaf=min_samples_leaf_mu, + max_depth=max_depth_mu, + leaf_model_type=leaf_model_mu, + leaf_model_scale=current_leaf_scale_mu, + cutpoint_grid_size=cutpoint_grid_size, + ) forest_sampler_mu = ForestSampler( forest_dataset_train, - feature_types, - num_trees_mu, - self.n_train, - alpha_mu, - beta_mu, - min_samples_leaf_mu, - max_depth_mu, + global_model_config, + forest_model_config_mu, + ) + forest_model_config_tau = ForestModelConfig( + num_trees=num_trees_tau, + num_features=forest_dataset_train.num_covariates(), + num_observations=self.n_train, + feature_types=feature_types, + variable_weights=variable_weights_tau, + leaf_dimension=leaf_dimension_tau, + alpha=alpha_tau, + beta=beta_tau, + min_samples_leaf=min_samples_leaf_tau, + max_depth=max_depth_tau, + leaf_model_type=leaf_model_tau, + leaf_model_scale=current_leaf_scale_tau, + cutpoint_grid_size=cutpoint_grid_size, ) forest_sampler_tau = ForestSampler( forest_dataset_train, - feature_types, - num_trees_tau, - self.n_train, - alpha_tau, - beta_tau, - min_samples_leaf_tau, - max_depth_tau, + global_model_config, + forest_model_config_tau, ) if self.include_variance_forest: + forest_model_config_variance = ForestModelConfig( + num_trees=num_trees_variance, + num_features=forest_dataset_train.num_covariates(), + num_observations=self.n_train, + feature_types=feature_types, + variable_weights=variable_weights_variance, + leaf_dimension=leaf_dimension_variance, + alpha=alpha_variance, + beta=beta_variance, + min_samples_leaf=min_samples_leaf_variance, + max_depth=max_depth_variance, + leaf_model_type=leaf_model_variance, + cutpoint_grid_size=cutpoint_grid_size, + variance_forest_shape=a_forest, + variance_forest_scale=b_forest, + ) forest_sampler_variance = ForestSampler( - forest_dataset_train, - feature_types, - num_trees_variance, - self.n_train, - alpha_variance, - beta_variance, - min_samples_leaf_variance, - max_depth_variance, + forest_dataset_train, global_model_config, forest_model_config_variance ) # Container of forest samples - self.forest_container_mu = ForestContainer(num_trees_mu, 1, True, False) + self.forest_container_mu = ForestContainer( + num_trees_mu, leaf_dimension_mu, True, False + ) self.forest_container_tau = ForestContainer( - 
num_trees_tau, Z_train.shape[1], False, False + num_trees_tau, leaf_dimension_tau, False, False ) active_forest_mu = Forest(num_trees_mu, 1, True, False) active_forest_tau = Forest(num_trees_tau, Z_train.shape[1], False, False) @@ -1296,7 +1338,11 @@ def sample( # Initialize the leaves of each tree in the prognostic forest init_mu = np.array([np.squeeze(np.mean(resid_train))]) forest_sampler_mu.prepare_for_sampler( - forest_dataset_train, residual_train, active_forest_mu, 0, init_mu + forest_dataset_train, + residual_train, + active_forest_mu, + leaf_model_mu, + init_mu, ) # Initialize the leaves of each tree in the treatment forest @@ -1308,7 +1354,7 @@ def sample( forest_dataset_train, residual_train, active_forest_tau, - treatment_leaf_model, + leaf_model_tau, init_tau, ) @@ -1319,7 +1365,7 @@ def sample( forest_dataset_train, residual_train, active_forest_variance, - leaf_model_variance_forest, + leaf_model_variance, init_val_variance, ) @@ -1338,17 +1384,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale_mu, - variable_weights_mu, - a_forest, - b_forest, - current_sigma2, - 0, + global_model_config, + forest_model_config_mu, keep_sample, True, - True, ) # Sample variance parameters (if requested) @@ -1356,12 +1395,16 @@ def sample( current_sigma2 = global_var_model.sample_one_iteration( residual_train, cpp_rng, a_global, b_global ) + global_model_config.update_global_error_variance(current_sigma2) if self.sample_sigma_leaf_mu: current_leaf_scale_mu[0, 0] = ( leaf_var_model_mu.sample_one_iteration( active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu ) ) + forest_model_config_mu.update_leaf_model_scale( + current_leaf_scale_mu + ) if keep_sample: self.leaf_scale_mu_samples[sample_counter] = ( current_leaf_scale_mu[0, 0] @@ -1374,17 +1417,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale_tau, - variable_weights_tau, - a_forest, - b_forest, - current_sigma2, - treatment_leaf_model, + global_model_config, + forest_model_config_tau, keep_sample, True, - True, ) # Sample coding parameters (if requested) @@ -1438,17 +1474,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale_mu, - variable_weights_variance, - a_forest, - b_forest, - current_sigma2, - leaf_model_variance_forest, + global_model_config, + forest_model_config_variance, keep_sample, True, - True, ) # Sample variance parameters (if requested) @@ -1456,6 +1485,7 @@ def sample( current_sigma2 = global_var_model.sample_one_iteration( residual_train, cpp_rng, a_global, b_global ) + global_model_config.update_global_error_variance(current_sigma2) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf_tau: @@ -1464,6 +1494,9 @@ def sample( active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau ) ) + forest_model_config_tau.update_leaf_model_scale( + current_leaf_scale_tau + ) if keep_sample: self.leaf_scale_tau_samples[sample_counter] = ( current_leaf_scale_tau[0, 0] @@ -1493,17 +1526,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale_mu, - variable_weights_mu, - a_forest, - b_forest, - current_sigma2, - 0, + global_model_config, + forest_model_config_mu, keep_sample, False, - True, ) # Sample variance parameters (if requested) @@ -1511,12 +1537,16 @@ def sample( current_sigma2 = 
global_var_model.sample_one_iteration( residual_train, cpp_rng, a_global, b_global ) + global_model_config.update_global_error_variance(current_sigma2) if self.sample_sigma_leaf_mu: current_leaf_scale_mu[0, 0] = ( leaf_var_model_mu.sample_one_iteration( active_forest_mu, cpp_rng, a_leaf_mu, b_leaf_mu ) ) + forest_model_config_mu.update_leaf_model_scale( + current_leaf_scale_mu + ) if keep_sample: self.leaf_scale_mu_samples[sample_counter] = ( current_leaf_scale_mu[0, 0] @@ -1529,17 +1559,10 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale_tau, - variable_weights_tau, - a_forest, - b_forest, - current_sigma2, - treatment_leaf_model, + global_model_config, + forest_model_config_tau, keep_sample, False, - True, ) # Sample coding parameters (if requested) @@ -1593,16 +1616,9 @@ def sample( forest_dataset_train, residual_train, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale_mu, - variable_weights_variance, - a_forest, - b_forest, - current_sigma2, - leaf_model_variance_forest, + global_model_config, + forest_model_config_variance, keep_sample, - False, True, ) @@ -1611,6 +1627,7 @@ def sample( current_sigma2 = global_var_model.sample_one_iteration( residual_train, cpp_rng, a_global, b_global ) + global_model_config.update_global_error_variance(current_sigma2) if keep_sample: self.global_var_samples[sample_counter] = current_sigma2 if self.sample_sigma_leaf_tau: @@ -1619,6 +1636,9 @@ def sample( active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau ) ) + forest_model_config_tau.update_leaf_model_scale( + current_leaf_scale_tau + ) if keep_sample: self.leaf_scale_tau_samples[sample_counter] = ( current_leaf_scale_tau[0, 0] diff --git a/stochtree/config.py b/stochtree/config.py index 9ff7cbed..2a2701a3 100644 --- a/stochtree/config.py +++ b/stochtree/config.py @@ -65,15 +65,15 @@ def __init__( beta=2.0, min_samples_leaf=5, max_depth=-1, - leaf_model_type=1, + leaf_model_type=0, leaf_model_scale=None, variance_forest_shape=1.0, variance_forest_scale=1.0, cutpoint_grid_size=100, ) -> None: # Preprocess inputs and run some error checks - if not feature_types: - if not num_features: + if feature_types is None: + if num_features is None: raise ValueError( "Neither of `num_features` nor `feature_types` (a vector from which `num_features` can be inferred) was provided.", "Please provide at least one of these inputs when creating a `ForestModelConfig` object.", @@ -82,18 +82,18 @@ def __init__( self.feature_types = np.repeat(0, num_features) else: self.feature_types = _standardize_array_to_np(feature_types) - if not num_features: + if num_features is None: num_features = len(self.feature_types) - if not variable_weights: + if variable_weights is None: warnings.warn( "`variable_weights` not provided, will be assumed to be equal-weighted" ) self.variable_weights = np.repeat(1.0 / num_features, num_features) else: self.variable_weights = _standardize_array_to_np(variable_weights) - if not num_trees: + if num_trees is None: raise ValueError("`num_trees` must be provided") - if not num_observations: + if num_observations is None: raise ValueError("`num_observations` must be provided") if num_features != len(self.feature_types): raise ValueError("`feature_types` must have `num_features` total elements") @@ -101,6 +101,8 @@ def __init__( raise ValueError( "`variable_weights` must have `num_features` total elements" ) + if leaf_model_type is None: + leaf_model_type = 0 if not _check_is_int(leaf_model_type): raise 
ValueError("`leaf_model_type` must be an integer between 0 and 3") elif leaf_model_type < 0 or leaf_model_type > 3: @@ -109,7 +111,7 @@ def __init__( raise ValueError("`leaf_dimension` must be an integer greater than 0") elif leaf_dimension <= 0: raise ValueError("`leaf_dimension` must be an integer greater than 0") - if not leaf_model_scale: + if leaf_model_scale is None: diag_value = 1.0 / num_trees leaf_model_scale_array = np.zeros((leaf_dimension, leaf_dimension), float) np.fill_diagonal(leaf_model_scale_array, diag_value) @@ -439,7 +441,7 @@ def get_leaf_model_type(self) -> int: leaf_model_type : int Integer coded leaf model type """ - self.leaf_model_type + return self.leaf_model_type def get_leaf_model_scale(self) -> np.ndarray: """ @@ -450,7 +452,7 @@ def get_leaf_model_scale(self) -> np.ndarray: leaf_model_scale : np.ndarray Scale parameter (in array form) used in Gaussian leaf models. If the Gaussian leaf model is univariate, the array returned is a 1x1 matrix. """ - self.leaf_model_scale + return self.leaf_model_scale def get_variance_forest_shape(self) -> float: """ diff --git a/stochtree/data.py b/stochtree/data.py index aecd6ac4..a29e80f5 100644 --- a/stochtree/data.py +++ b/stochtree/data.py @@ -75,6 +75,61 @@ def add_variance_weights(self, variance_weights: np.array): n = variance_weights.size self.dataset_cpp.AddVarianceWeights(variance_weights, n) + def num_observations(self) -> int: + """ + Query the number of observations in a dataset + + Returns + ------- + int + Number of observations in the dataset + """ + return self.dataset_cpp.NumRows() + + def num_covariates(self) -> int: + """ + Query the number of covariates (features) in a dataset + + Returns + ------- + int + Number of covariates in the dataset + """ + return self.dataset_cpp.NumCovariates() + + def num_basis(self) -> int: + """ + Query the dimension of the basis vector in a dataset + + Returns + ------- + int + Dimension of the basis vector in the dataset, returning 0 if the dataset does not have a basis + """ + return self.dataset_cpp.NumBasis() + + def has_basis(self) -> bool: + """ + Whether or not a dataset has a basis vector (for leaf regression) + + Returns + ------- + bool + `True` if the dataset has a basis, `False` otherwise + """ + return self.dataset_cpp.HasBasis() + + def has_variance_weights(self) -> bool: + """ + Whether or not a dataset has variance weights + + Returns + ------- + bool + `True` if the dataset has variance weights, `False` otherwise + """ + return self.dataset_cpp.HasVarianceWeights() + class Residual: """ diff --git a/stochtree/sampler.py b/stochtree/sampler.py index b8eb6e61..3351c085 100644 --- a/stochtree/sampler.py +++ b/stochtree/sampler.py @@ -103,8 +103,8 @@ def sample_one_iteration( dataset: Dataset, residual: Residual, rng: RNG, - forest_config: ForestModelConfig, global_config: GlobalModelConfig, + forest_config: ForestModelConfig, keep_forest: bool, gfr: bool, ) -> None: @@ -123,10 +123,10 @@ def sample_one_iteration( `stochtree` object storing continuously updated partial / full residual rng : RNG `stochtree` object storing C++ random number generator to be used sampling algorithm - forest_config : ForestModelConfig - `ForestModelConfig` object containing forest model parameters and settings global_config : GlobalModelConfig `GlobalModelConfig` object containing global model parameters and settings + forest_config : ForestModelConfig + `ForestModelConfig` object containing forest model parameters and settings keep_forest : bool Whether or not the resulting forest 
should be retained in `forest_container` or discarded (due to burnin or thinning for example) gfr : bool diff --git a/test/python/test_json.py b/test/python/test_json.py index 4eba7ee6..8016cbbd 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -13,6 +13,8 @@ GlobalVarianceModel, JSONSerializer, Residual, + ForestModelConfig, + GlobalModelConfig ) @@ -210,10 +212,26 @@ def outcome_mean(X, W): residual = Residual(resid) # Forest samplers and temporary tracking data structures + leaf_model_type = 0 if p_W == 0 else 1 + 1*(p_W > 1) + forest_config = ForestModelConfig( + num_trees=num_trees, + num_features=p_X, + num_observations=n, + feature_types=feature_types, + variable_weights=var_weights, + leaf_dimension=p_W, + alpha=alpha, + beta=beta, + min_samples_leaf=min_samples_leaf, + leaf_model_type=leaf_model_type, + cutpoint_grid_size=cutpoint_grid_size, + leaf_model_scale=leaf_prior_scale, + ) + global_config = GlobalModelConfig(global_error_variance=global_variance_init) forest_container = ForestContainer(num_trees, W.shape[1], False, False) active_forest = Forest(num_trees, W.shape[1], False, False) forest_sampler = ForestSampler( - dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf + dataset, global_config, forest_config ) cpp_rng = RNG(random_seed) global_var_model = GlobalVarianceModel() @@ -225,6 +243,17 @@ def outcome_mean(X, W): global_var_samples = np.concatenate( (np.array([global_variance_init]), np.repeat(0, num_samples)) ) + if p_W > 0: + init_val = np.repeat(0.0, W.shape[1]) + else: + init_val = np.array([0.0]) + forest_sampler.prepare_for_sampler( + dataset, + residual, + active_forest, + leaf_model_type, + init_val, + ) # Run "grow-from-root" sampler for i in range(num_warmstart): @@ -234,17 +263,10 @@ def outcome_mean(X, W): dataset, residual, cpp_rng, - feature_types, - cutpoint_grid_size, - leaf_prior_scale, - var_weights, - 1.0, - 1.0, - global_var_samples[i], - 1, + global_config, + forest_config, True, True, - False, ) global_var_samples[i + 1] = global_var_model.sample_one_iteration( residual, cpp_rng, a_global, b_global @@ -258,17 +280,10 @@ def outcome_mean(X, W): dataset, residual, cpp_rng, - feature_types, - cutpoint_grid_size, - leaf_prior_scale, - var_weights, - 1.0, - 1.0, - global_var_samples[i], - 1, + global_config, + forest_config, + True, True, - False, - False, ) global_var_samples[i + 1] = global_var_model.sample_one_iteration( residual, cpp_rng, a_global, b_global diff --git a/test/python/test_residual.py b/test/python/test_residual.py index 87c8a17b..8b879d30 100644 --- a/test/python/test_residual.py +++ b/test/python/test_residual.py @@ -1,6 +1,15 @@ import numpy as np -from stochtree import RNG, Dataset, Forest, ForestContainer, ForestSampler, Residual +from stochtree import ( + RNG, + Dataset, + Forest, + ForestContainer, + ForestSampler, + Residual, + ForestModelConfig, + GlobalModelConfig, +) class TestResidual: @@ -45,9 +54,22 @@ def test_basis_update(self): cpp_rng = RNG(-1) # Create forest sampler and forest container - forest_sampler = ForestSampler( - forest_dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf + forest_config = ForestModelConfig( + num_trees=num_trees, + num_features=p, + num_observations=n, + feature_types=feature_types, + variable_weights=variable_weights, + leaf_dimension=1, + alpha=alpha, + beta=beta, + min_samples_leaf=min_samples_leaf, + leaf_model_type=1, + cutpoint_grid_size=cutpoint_grid_size, + leaf_model_scale=current_leaf_scale, ) + global_config = 
GlobalModelConfig(global_error_variance=current_sigma2) + forest_sampler = ForestSampler(forest_dataset, global_config, forest_config) forest_container = ForestContainer(num_trees, 1, False, False) active_forest = Forest(num_trees, 1, False, False) @@ -65,15 +87,8 @@ def test_basis_update(self): forest_dataset, residual, cpp_rng, - feature_types, - cutpoint_grid_size, - current_leaf_scale, - variable_weights, - a_forest, - b_forest, - current_sigma2, - 1, - True, + global_config, + forest_config, True, True, ) From 082b19a845cc17ed18df27dc33df43934f748a3e Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 26 Feb 2025 00:55:22 -0600 Subject: [PATCH 18/35] Removed unnecessary code from test --- test/python/test_residual.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/python/test_residual.py b/test/python/test_residual.py index 8b879d30..a0dd1c09 100644 --- a/test/python/test_residual.py +++ b/test/python/test_residual.py @@ -47,8 +47,6 @@ def test_basis_update(self): current_sigma2 = 1.0 current_leaf_scale = np.array([[1.0 / num_trees]]) cutpoint_grid_size = 100 - a_forest = 1 - b_forest = 1 # RNG cpp_rng = RNG(-1) From 0cd663a4e76dcaf024321a76afb5f24a7fa6addd Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Thu, 27 Feb 2025 18:20:55 -0600 Subject: [PATCH 19/35] Update how GoogleTest is used --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c9c0796..3c8f1796 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,12 +132,17 @@ endif() # Build C++ test program if(BUILD_TEST) - # Download the GoogleTest dependency if necessary + # Check if user specified a local clone of the GoogleTest repo, use Github repo if not + if (NOT DEFINED GOOGLETEST_GIT_REPO) + set(GOOGLETEST_GIT_REPO https://github.com/google/googletest.git) + endif() + + # Fetch and install GoogleTest dependency include(FetchContent) FetchContent_Declare( googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG e2239ee6043f73722e7aa812a459f54a28552929 # release-1.14.0 + GIT_REPOSITORY ${GOOGLETEST_GIT_REPO} + GIT_TAG 6910c9d9165801d8827d628cb72eb7ea9dd538c5 # release-1.16.0 ) # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) From 1843ab3fec97aa869c04df8925d83510409b7eaa Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Thu, 27 Feb 2025 21:56:57 -0600 Subject: [PATCH 20/35] Added ability to propagate scalar-valued leaf variance parameters --- stochtree/bart.py | 38 ++++++++++++++++++++++++++++++++------ stochtree/bcf.py | 21 ++++++++++++++++++--- test/python/test_bart.py | 12 ++++++------ 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/stochtree/bart.py b/stochtree/bart.py index 1007681a..908039b0 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -614,12 +614,38 @@ def sample( if b_leaf is None else b_leaf ) - sigma_leaf = ( - np.squeeze(np.var(resid_train)) / num_trees_mean - if sigma_leaf is None - else sigma_leaf - ) - current_leaf_scale = np.array([[sigma_leaf]]) + if self.has_basis: + if sigma_leaf is None: + current_leaf_scale = np.zeros((self.num_basis, self.num_basis)) + np.fill_diagonal(current_leaf_scale, np.squeeze(np.var(resid_train)) / num_trees_mean) + elif isinstance(sigma_leaf, float): + current_leaf_scale = np.zeros((self.num_basis, self.num_basis)) + np.fill_diagonal(current_leaf_scale, sigma_leaf) + elif isinstance(sigma_leaf, np.ndarray): + if sigma_leaf.ndim != 2: + raise 
ValueError("sigma_leaf must be a 2d symmetric numpy array if provided in matrix form") + if sigma_leaf.shape[0] != sigma_leaf.shape[1]: + raise ValueError("sigma_leaf must be a 2d symmetric numpy array if provided in matrix form") + if sigma_leaf.shape[0] != self.num_basis: + raise ValueError("sigma_leaf must be a 2d symmetric numpy array with its dimensionality matching the basis dimension") + current_leaf_scale = sigma_leaf + else: + raise ValueError("sigma_leaf must be either a scalar or a 2d symmetric numpy array") + else: + if sigma_leaf is None: + current_leaf_scale = np.array([[np.squeeze(np.var(resid_train)) / num_trees_mean]]) + elif isinstance(sigma_leaf, float): + current_leaf_scale = np.array([[sigma_leaf]]) + elif isinstance(sigma_leaf, np.ndarray): + if sigma_leaf.ndim != 2: + raise ValueError("sigma_leaf must be a 2d symmetric numpy array if provided in matrix form") + if sigma_leaf.shape[0] != sigma_leaf.shape[1]: + raise ValueError("sigma_leaf must be a 2d symmetric numpy array if provided in matrix form") + if sigma_leaf.shape[0] != 1: + raise ValueError("sigma_leaf must be a 1x1 numpy array for this leaf model") + current_leaf_scale = sigma_leaf + else: + raise ValueError("sigma_leaf must be either a scalar or a 2d numpy array") else: current_leaf_scale = np.array([[1.0]]) if self.include_variance_forest: diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 325bb43b..4b220f51 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -1098,11 +1098,26 @@ def sample( ) current_sigma2 = sigma2_init self.sigma2_init = sigma2_init - current_leaf_scale_mu = np.array([[sigma_leaf_mu]]) - if not isinstance(sigma_leaf_tau, np.ndarray): - current_leaf_scale_tau = np.array([[sigma_leaf_tau]]) + if isinstance(sigma_leaf_mu, float): + current_leaf_scale_mu = np.array([[sigma_leaf_mu]]) else: + raise ValueError("sigma_leaf_mu must be a scalar") + if isinstance(sigma_leaf_tau, float): + if Z_train.shape[1] > 1: + current_leaf_scale_tau = np.zeros((Z_train.shape[1], Z_train.shape[1])) + np.fill_diagonal(current_leaf_scale_tau, sigma_leaf_tau) + else: + current_leaf_scale_tau = np.array([[sigma_leaf_tau]]) + elif isinstance(sigma_leaf_tau, np.ndarray): + if sigma_leaf_tau.ndim != 2: + raise ValueError("sigma_leaf_tau must be a 2d symmetric numpy array if provided in matrix form") + if sigma_leaf_tau.shape[0] != sigma_leaf_tau.shape[1]: + raise ValueError("sigma_leaf_tau must be a 2d symmetric numpy array if provided in matrix form") + if sigma_leaf_tau.shape[0] != Z_train.shape[1]: + raise ValueError("sigma_leaf_tau must be a 2d numpy array with dimension matching that of the treatment vector") current_leaf_scale_tau = sigma_leaf_tau + else: + raise ValueError("sigma_leaf_tau must be a scalar or a 2d numpy array") if self.include_variance_forest: if not a_forest: a_forest = num_trees_variance / a_0**2 + 0.5 diff --git a/test/python/test_bart.py b/test/python/test_bart.py index bd08fd05..878b0b20 100644 --- a/test/python/test_bart.py +++ b/test/python/test_bart.py @@ -11,7 +11,7 @@ def test_bart_constant_leaf_homoskedastic(self): rng = np.random.default_rng(random_seed) # Generate covariates and basis - n = 1000 + n = 100 p_X = 10 X = rng.uniform(0, 1, (n, p_X)) @@ -67,7 +67,7 @@ def test_bart_univariate_leaf_regression_homoskedastic(self): rng = np.random.default_rng(random_seed) # Generate covariates and basis - n = 1000 + n = 100 p_X = 10 p_W = 1 X = rng.uniform(0, 1, (n, p_X)) @@ -133,7 +133,7 @@ def test_bart_multivariate_leaf_regression_homoskedastic(self): rng = 
np.random.default_rng(random_seed) # Generate covariates and basis - n = 1000 + n = 100 p_X = 10 p_W = 5 X = rng.uniform(0, 1, (n, p_X)) @@ -199,7 +199,7 @@ def test_bart_constant_leaf_heteroskedastic(self): rng = np.random.default_rng(random_seed) # Generate covariates and basis - n = 1000 + n = 100 p_X = 10 X = rng.uniform(0, 1, (n, p_X)) @@ -270,7 +270,7 @@ def test_bart_univariate_leaf_regression_heteroskedastic(self): rng = np.random.default_rng(random_seed) # Generate covariates and basis - n = 1000 + n = 100 p_X = 10 p_W = 1 X = rng.uniform(0, 1, (n, p_X)) @@ -351,7 +351,7 @@ def test_bart_multivariate_leaf_regression_heteroskedastic(self): rng = np.random.default_rng(random_seed) # Generate covariates and basis - n = 1000 + n = 100 p_X = 10 p_W = 5 X = rng.uniform(0, 1, (n, p_X)) From d9efdc4039fe9d881377646ca0c9a6b59e7e3aa0 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Thu, 27 Feb 2025 22:01:31 -0600 Subject: [PATCH 21/35] Refactored "mu" and "tau" notation out of python BCF function signature --- .../causal_inference_feature_subsets.ipynb | 2 +- stochtree/bcf.py | 64 +++++++++---------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb index d2ada0d6..2a0283b2 100644 --- a/demo/notebooks/causal_inference_feature_subsets.ipynb +++ b/demo/notebooks/causal_inference_feature_subsets.ipynb @@ -248,7 +248,7 @@ " pi_test,\n", " num_gfr=10,\n", " num_mcmc=100,\n", - " tau_forest_params=tau_params,\n", + " treatment_effect_forest_params=tau_params,\n", ")" ] }, diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 4b220f51..c8dfd1b2 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -80,8 +80,8 @@ def sample( num_burnin: int = 0, num_mcmc: int = 100, general_params: Optional[Dict[str, Any]] = None, - mu_forest_params: Optional[Dict[str, Any]] = None, - tau_forest_params: Optional[Dict[str, Any]] = None, + prognostic_forest_params: Optional[Dict[str, Any]] = None, + treatment_effect_forest_params: Optional[Dict[str, Any]] = None, variance_forest_params: Optional[Dict[str, Any]] = None, ) -> None: """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions @@ -132,7 +132,7 @@ def sample( * `keep_every` (`int`): How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Defaults to `1`. Setting `keep_every = k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples. * `num_chains` (`int`): How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Defaults to `1`. - mu_forest_params : dict, optional + prognostic_forest_params : dict, optional Dictionary of prognostic forest model parameters, each of which has a default value processed internally, so this argument is optional. * `num_trees` (`int`): Number of trees in the prognostic forest. Defaults to `250`. Must be a positive integer. 
@@ -148,7 +148,7 @@ def sample( * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the prognostic (`mu(X)`) forest. Defaults to `None`. * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the prognostic (`mu(X)`) forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. - tau_forest_params : dict, optional + treatment_effect_forest_params : dict, optional Dictionary of treatment effect forest model parameters, each of which has a default value processed internally, so this argument is optional. * `num_trees` (`int`): Number of trees in the treatment effect forest. Defaults to `50`. Must be a positive integer. @@ -207,7 +207,7 @@ def sample( ) # Update mu forest BART parameters - mu_forest_params_default = { + prognostic_forest_params_default = { "num_trees": 250, "alpha": 0.95, "beta": 2.0, @@ -220,12 +220,12 @@ def sample( "keep_vars": None, "drop_vars": None, } - mu_forest_params_updated = _preprocess_params( - mu_forest_params_default, mu_forest_params + prognostic_forest_params_updated = _preprocess_params( + prognostic_forest_params_default, prognostic_forest_params ) # Update tau forest BART parameters - tau_forest_params_default = { + treatment_effect_forest_params_default = { "num_trees": 50, "alpha": 0.25, "beta": 3.0, @@ -238,8 +238,8 @@ def sample( "keep_vars": None, "drop_vars": None, } - tau_forest_params_updated = _preprocess_params( - tau_forest_params_default, tau_forest_params + treatment_effect_forest_params_updated = _preprocess_params( + treatment_effect_forest_params_default, treatment_effect_forest_params ) # Update variance forest BART parameters @@ -279,30 +279,30 @@ def sample( keep_every = general_params_updated["keep_every"] # 2. Mu forest parameters - num_trees_mu = mu_forest_params_updated["num_trees"] - alpha_mu = mu_forest_params_updated["alpha"] - beta_mu = mu_forest_params_updated["beta"] - min_samples_leaf_mu = mu_forest_params_updated["min_samples_leaf"] - max_depth_mu = mu_forest_params_updated["max_depth"] - sample_sigma_leaf_mu = mu_forest_params_updated["sample_sigma2_leaf"] - sigma_leaf_mu = mu_forest_params_updated["sigma2_leaf_init"] - a_leaf_mu = mu_forest_params_updated["sigma2_leaf_shape"] - b_leaf_mu = mu_forest_params_updated["sigma2_leaf_scale"] - keep_vars_mu = mu_forest_params_updated["keep_vars"] - drop_vars_mu = mu_forest_params_updated["drop_vars"] + num_trees_mu = prognostic_forest_params_updated["num_trees"] + alpha_mu = prognostic_forest_params_updated["alpha"] + beta_mu = prognostic_forest_params_updated["beta"] + min_samples_leaf_mu = prognostic_forest_params_updated["min_samples_leaf"] + max_depth_mu = prognostic_forest_params_updated["max_depth"] + sample_sigma_leaf_mu = prognostic_forest_params_updated["sample_sigma2_leaf"] + sigma_leaf_mu = prognostic_forest_params_updated["sigma2_leaf_init"] + a_leaf_mu = prognostic_forest_params_updated["sigma2_leaf_shape"] + b_leaf_mu = prognostic_forest_params_updated["sigma2_leaf_scale"] + keep_vars_mu = prognostic_forest_params_updated["keep_vars"] + drop_vars_mu = prognostic_forest_params_updated["drop_vars"] # 3. 
Tau forest parameters - num_trees_tau = tau_forest_params_updated["num_trees"] - alpha_tau = tau_forest_params_updated["alpha"] - beta_tau = tau_forest_params_updated["beta"] - min_samples_leaf_tau = tau_forest_params_updated["min_samples_leaf"] - max_depth_tau = tau_forest_params_updated["max_depth"] - sample_sigma_leaf_tau = tau_forest_params_updated["sample_sigma2_leaf"] - sigma_leaf_tau = tau_forest_params_updated["sigma2_leaf_init"] - a_leaf_tau = tau_forest_params_updated["sigma2_leaf_shape"] - b_leaf_tau = tau_forest_params_updated["sigma2_leaf_scale"] - keep_vars_tau = tau_forest_params_updated["keep_vars"] - drop_vars_tau = tau_forest_params_updated["drop_vars"] + num_trees_tau = treatment_effect_forest_params_updated["num_trees"] + alpha_tau = treatment_effect_forest_params_updated["alpha"] + beta_tau = treatment_effect_forest_params_updated["beta"] + min_samples_leaf_tau = treatment_effect_forest_params_updated["min_samples_leaf"] + max_depth_tau = treatment_effect_forest_params_updated["max_depth"] + sample_sigma_leaf_tau = treatment_effect_forest_params_updated["sample_sigma2_leaf"] + sigma_leaf_tau = treatment_effect_forest_params_updated["sigma2_leaf_init"] + a_leaf_tau = treatment_effect_forest_params_updated["sigma2_leaf_shape"] + b_leaf_tau = treatment_effect_forest_params_updated["sigma2_leaf_scale"] + keep_vars_tau = treatment_effect_forest_params_updated["keep_vars"] + drop_vars_tau = treatment_effect_forest_params_updated["drop_vars"] # 4. Variance forest parameters num_trees_variance = variance_forest_params_updated["num_trees"] From cad3dabf47581a79e35dd939b85ce58e7c6443f9 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 4 Mar 2025 13:38:31 -0600 Subject: [PATCH 22/35] Added python wrappers for C++ random effects sampler --- R/random_effects.R | 2 +- include/stochtree/data.h | 2 + include/stochtree/random_effects.h | 41 +++ src/py_stochtree.cpp | 335 +++++++++++++++++++++++-- stochtree/__init__.py | 17 +- stochtree/random_effects.py | 388 +++++++++++++++++++++++++++++ stochtree/sampler.py | 4 - stochtree/serialization.py | 23 +- 8 files changed, 785 insertions(+), 27 deletions(-) create mode 100644 stochtree/random_effects.py diff --git a/R/random_effects.R b/R/random_effects.R index e4199155..d737ef8e 100644 --- a/R/random_effects.R +++ b/R/random_effects.R @@ -93,7 +93,7 @@ RandomEffectSamples <- R6::R6Class( #' Predict random effects for each observation implied by `rfx_group_ids` and `rfx_basis`. #' If a random effects model is "intercept-only" the `rfx_basis` will be a vector of ones of size `length(rfx_group_ids)`. #' @param rfx_group_ids Indices of random effects groups in a prediction set - #' @param rfx_basis (Optional ) Basis used for random effects prediction + #' @param rfx_basis (Optional) Basis used for random effects prediction #' @return Matrix with as many rows as observations provided and as many columns as samples drawn of the model. predict = function(rfx_group_ids, rfx_basis = NULL) { num_obs = length(rfx_group_ids) diff --git a/include/stochtree/data.h b/include/stochtree/data.h index c3bdb077..47b4fb9b 100644 --- a/include/stochtree/data.h +++ b/include/stochtree/data.h @@ -522,6 +522,8 @@ class RandomEffectsDataset { } /*! \brief Number of observations (rows) in the dataset */ inline data_size_t NumObservations() {return basis_.NumRows();} + /*! \brief Number of columns of the basis vector in the dataset */ + inline int NumBases() {return basis_.NumCols();} /*! 
\brief Whether or not a `RandomEffectsDataset` has (yet) loaded basis data */
   inline bool HasBasis() {return has_basis_;}
   /*! \brief Whether or not a `RandomEffectsDataset` has (yet) loaded variance weights */
diff --git a/include/stochtree/random_effects.h b/include/stochtree/random_effects.h
index 1f324970..451bc4e4 100644
--- a/include/stochtree/random_effects.h
+++ b/include/stochtree/random_effects.h
@@ -18,6 +18,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -91,6 +92,26 @@ class LabelMapper {
   int32_t CategoryNumber(int32_t category_id) {
     return label_map_[category_id];
   }
+  void SaveToJsonFile(std::string filename) {
+    nlohmann::json model_json = this->to_json();
+    std::ofstream output_file(filename);
+    output_file << model_json << std::endl;
+  }
+  void LoadFromJsonFile(std::string filename) {
+    std::ifstream f(filename);
+    nlohmann::json rfx_label_mapper_json = nlohmann::json::parse(f);
+    this->Reset();
+    this->from_json(rfx_label_mapper_json);
+  }
+  std::string DumpJsonString() {
+    nlohmann::json model_json = this->to_json();
+    return model_json.dump();
+  }
+  void LoadFromJsonString(std::string& json_string) {
+    nlohmann::json rfx_label_mapper_json = nlohmann::json::parse(json_string);
+    this->Reset();
+    this->from_json(rfx_label_mapper_json);
+  }
   std::vector<int32_t>& Keys() {return keys_;}
   std::map<int32_t, int32_t>& Map() {return label_map_;}
   void Reset() {label_map_.clear(); keys_.clear();}
@@ -275,6 +296,26 @@ class RandomEffectsContainer {
     num_samples_ = 0;
   }
   ~RandomEffectsContainer() {}
+  void SaveToJsonFile(std::string filename) {
+    nlohmann::json model_json = this->to_json();
+    std::ofstream output_file(filename);
+    output_file << model_json << std::endl;
+  }
+  void LoadFromJsonFile(std::string filename) {
+    std::ifstream f(filename);
+    nlohmann::json rfx_container_json = nlohmann::json::parse(f);
+    this->Reset();
+    this->from_json(rfx_container_json);
+  }
+  std::string DumpJsonString() {
+    nlohmann::json model_json = this->to_json();
+    return model_json.dump();
+  }
+  void LoadFromJsonString(std::string& json_string) {
+    nlohmann::json rfx_container_json = nlohmann::json::parse(json_string);
+    this->Reset();
+    this->from_json(rfx_container_json);
+  }
   void AddSample(MultivariateRegressionRandomEffectsModel& model);
   void DeleteSample(int sample_num);
   void Predict(RandomEffectsDataset& dataset, LabelMapper& label_mapper, std::vector<double>& output);
diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp
index 0581bf85..6cdae80c 100644
--- a/src/py_stochtree.cpp
+++ b/src/py_stochtree.cpp
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1200,25 +1201,206 @@ class LeafVarianceModelCpp {
   StochTree::LeafNodeHomoskedasticVarianceModel var_model_;
 };
 
-void ForestContainerCpp::AdjustResidual(ForestDatasetCpp& dataset, ResidualCpp& residual, ForestSamplerCpp& sampler, bool requires_basis, int forest_num, bool add) {
-  // Determine whether or not we are adding forest_num to the residuals
-  std::function<double(double, double)> op;
-  if (add) op = std::plus<double>();
-  else op = std::minus<double>();
-  
-  // Perform the update (addition / subtraction) operation
-  StochTree::UpdateResidualEntireForest(*(sampler.GetTracker()), *(dataset.GetDataset()), *(residual.GetData()), forest_samples_->GetEnsemble(forest_num), requires_basis, op);
-}
+class RandomEffectsDatasetCpp {
+ public:
+  RandomEffectsDatasetCpp() {
+    rfx_dataset_ = std::make_unique<StochTree::RandomEffectsDataset>();
+  }
+  ~RandomEffectsDatasetCpp() {}
+  StochTree::RandomEffectsDataset* GetDataset() {
+    return rfx_dataset_.get();
+  }
+  py::ssize_t NumObservations() {
+    return rfx_dataset_->NumObservations();
+  }
+  int NumBases() {
+    return rfx_dataset_->NumBases();
+  }
+  void AddGroupLabels(py::array_t<int> group_labels, data_size_t num_row) {
+    std::vector<int32_t> group_labels_vec(num_row);
+    auto accessor = group_labels.mutable_unchecked<1>();
+    for (py::ssize_t i = 0; i < num_row; i++) {
+      group_labels_vec[i] = accessor(i);
+    }
+    rfx_dataset_->AddGroupLabels(group_labels_vec);
+  }
+  void AddBasis(py::array_t<double> basis, data_size_t num_row, int num_col, bool row_major) {
+    double* basis_data_ptr = static_cast<double*>(basis.mutable_data());
+    rfx_dataset_->AddBasis(basis_data_ptr, num_row, num_col, row_major);
+  }
+  void AddVarianceWeights(py::array_t<double> weights, data_size_t num_row) {
+    double* weight_data_ptr = static_cast<double*>(weights.mutable_data());
+    rfx_dataset_->AddVarianceWeights(weight_data_ptr, num_row);
+  }
+  bool HasGroupLabels() {return rfx_dataset_->HasGroupLabels();}
+  bool HasBasis() {return rfx_dataset_->HasBasis();}
+  bool HasVarianceWeights() {return rfx_dataset_->HasVarWeights();}
 
-void ForestCpp::AdjustResidual(ForestDatasetCpp& dataset, ResidualCpp& residual, ForestSamplerCpp& sampler, bool requires_basis, bool add) {
-  // Determine whether or not we are adding forest_num to the residuals
-  std::function<double(double, double)> op;
-  if (add) op = std::plus<double>();
-  else op = std::minus<double>();
-  
-  // Perform the update (addition / subtraction) operation
-  StochTree::UpdateResidualEntireForest(*(sampler.GetTracker()), *(dataset.GetDataset()), *(residual.GetData()), forest_.get(), requires_basis, op);
-}
+ private:
+  std::unique_ptr<StochTree::RandomEffectsDataset> rfx_dataset_;
+};
+
+class RandomEffectsModelCpp;
+
+class RandomEffectsLabelMapperCpp;
+
+class RandomEffectsContainerCpp {
+ public:
+  RandomEffectsContainerCpp(int num_components, int num_groups) {
+    rfx_container_ = std::make_unique<StochTree::RandomEffectsContainer>(num_components, num_groups);
+  }
+  ~RandomEffectsContainerCpp() {}
+  void AddSample(RandomEffectsModelCpp& rfx_model);
+  int NumSamples() {
+    return rfx_container_->NumSamples();
+  }
+  int NumComponents() {
+    return rfx_container_->NumComponents();
+  }
+  int NumGroups() {
+    return rfx_container_->NumGroups();
+  }
+  void DeleteSample(int sample_num) {
+    rfx_container_->DeleteSample(sample_num);
+  }
+  py::array_t<double> Predict(RandomEffectsDatasetCpp& rfx_dataset, RandomEffectsLabelMapperCpp& label_mapper);
+  void SaveToJsonFile(std::string json_filename) {
+    rfx_container_->SaveToJsonFile(json_filename);
+  }
+  void LoadFromJsonFile(std::string json_filename) {
+    rfx_container_->LoadFromJsonFile(json_filename);
+  }
+  std::string DumpJsonString() {
+    return rfx_container_->DumpJsonString();
+  }
+  void LoadFromJsonString(std::string& json_string) {
+    rfx_container_->LoadFromJsonString(json_string);
+  }
+  void LoadFromJson(JsonCpp& json, std::string rfx_label);
+
+ private:
+  std::unique_ptr<StochTree::RandomEffectsContainer> rfx_container_;
+};
+
+class RandomEffectsTrackerCpp {
+ public:
+  RandomEffectsTrackerCpp(py::array_t<int> group_labels) {
+    int vec_size = group_labels.size();
+    std::vector<int32_t> group_labels_vec(vec_size);
+    for (int i = 0; i < vec_size; i++) {
+      group_labels_vec[i] = group_labels.at(i);
+    }
+    rfx_tracker_ = std::make_unique<StochTree::RandomEffectsTracker>(group_labels_vec);
+  }
+  ~RandomEffectsTrackerCpp() {}
+  StochTree::RandomEffectsTracker* GetTracker() {
+    return rfx_tracker_.get();
+  }
+
+ private:
+  std::unique_ptr<StochTree::RandomEffectsTracker> rfx_tracker_;
+};
+
+class RandomEffectsLabelMapperCpp {
+ public:
+  RandomEffectsLabelMapperCpp(RandomEffectsTrackerCpp& rfx_tracker) {
+    StochTree::RandomEffectsTracker* internal_tracker = rfx_tracker.GetTracker();
+    rfx_label_mapper_ = std::make_unique<StochTree::LabelMapper>(internal_tracker->GetLabelMap());
+  }
+  ~RandomEffectsLabelMapperCpp() {}
+  void SaveToJsonFile(std::string json_filename) {
+    rfx_label_mapper_->SaveToJsonFile(json_filename);
+  }
+  void LoadFromJsonFile(std::string json_filename) {
+    rfx_label_mapper_->LoadFromJsonFile(json_filename);
+  }
+  std::string DumpJsonString() {
+    return rfx_label_mapper_->DumpJsonString();
+  }
+  void LoadFromJsonString(std::string& json_string) {
+    rfx_label_mapper_->LoadFromJsonString(json_string);
+  }
+  void LoadFromJson(JsonCpp& json, std::string rfx_label);
+  StochTree::LabelMapper* GetLabelMapper() {
+    return rfx_label_mapper_.get();
+  }
+
+ private:
+  std::unique_ptr<StochTree::LabelMapper> rfx_label_mapper_;
+};
+
+class RandomEffectsModelCpp {
+ public:
+  RandomEffectsModelCpp(int num_components, int num_groups) {
+    rfx_model_ = std::make_unique<StochTree::MultivariateRegressionRandomEffectsModel>(num_components, num_groups);
+  }
+  ~RandomEffectsModelCpp() {}
+  StochTree::MultivariateRegressionRandomEffectsModel* GetModel() {
+    return rfx_model_.get();
+  }
+  void SampleRandomEffects(RandomEffectsDatasetCpp& rfx_dataset, ResidualCpp& residual, 
+                           RandomEffectsTrackerCpp& rfx_tracker, RandomEffectsContainerCpp& rfx_container, 
+                           bool keep_sample, double global_variance, RngCpp& rng);
+  py::array_t<double> Predict(RandomEffectsDatasetCpp& rfx_dataset, RandomEffectsTrackerCpp& rfx_tracker) {
+    std::vector<double> output = rfx_model_->Predict(*rfx_dataset.GetDataset(), *rfx_tracker.GetTracker());
+    py::ssize_t output_length = output.size();
+    auto result = py::array_t<double>(py::detail::any_container<py::ssize_t>({output_length}));
+    auto accessor = result.mutable_unchecked<1>();
+    for (size_t i = 0; i < output_length; i++) {
+      accessor(i) = output.at(i);
+    }
+    return result;
+  }
+  void SetWorkingParameter(py::array_t<double>& working_param) {
+    Eigen::VectorXd working_param_eigen(working_param.size());
+    for (int i = 0; i < working_param.size(); i++) {
+      working_param_eigen(i) = working_param.at(i);
+    }
+    rfx_model_->SetWorkingParameter(working_param_eigen);
+  }
+  void SetGroupParameters(py::array_t<double>& group_params) {
+    py::ssize_t nrow = group_params.shape(0);
+    py::ssize_t ncol = group_params.shape(1);
+    Eigen::MatrixXd group_params_eigen(nrow, ncol);
+    for (py::ssize_t i = 0; i < nrow; i++) {
+      for (int j = 0; j < ncol; j++) {
+        group_params_eigen(i,j) = group_params.at(i,j);
+      }
+    }
+    rfx_model_->SetGroupParameters(group_params_eigen);
+  }
+  void SetWorkingParameterCovariance(py::array_t<double>& working_param_cov) {
+    int nrow = working_param_cov.shape(0);
+    int ncol = working_param_cov.shape(1);
+    Eigen::MatrixXd working_param_cov_eigen(nrow, ncol);
+    for (int i = 0; i < nrow; i++) {
+      for (int j = 0; j < ncol; j++) {
+        working_param_cov_eigen(i,j) = working_param_cov.at(i,j);
+      }
+    }
+    rfx_model_->SetWorkingParameterCovariance(working_param_cov_eigen);
+  }
+  void SetGroupParameterCovariance(py::array_t<double>& group_param_cov) {
+    int nrow = group_param_cov.shape(0);
+    int ncol = group_param_cov.shape(1);
+    Eigen::MatrixXd group_param_cov_eigen(nrow, ncol);
+    for (int i = 0; i < nrow; i++) {
+      for (int j = 0; j < ncol; j++) {
+        group_param_cov_eigen(i,j) = group_param_cov.at(i,j);
+      }
+    }
+    rfx_model_->SetGroupParameterCovariance(group_param_cov_eigen);
+  }
+  void SetVariancePriorShape(double shape) {
+    rfx_model_->SetVariancePriorShape(shape);
+  }
+  void SetVariancePriorScale(double scale) {
+    rfx_model_->SetVariancePriorScale(scale);
+  }
+
+ private:
+  std::unique_ptr<StochTree::MultivariateRegressionRandomEffectsModel> rfx_model_;
+};
 
 class JsonCpp {
  public:
@@ -1580,6 +1762,10 @@ class JsonCpp {
     return json_->at("forests").at(forest_label);
   }
 
+  nlohmann::json SubsetJsonRFX(std::string rfx_label) {
+    return json_->at("random_effects").at(rfx_label);
+  }
+
  private:
  std::unique_ptr<nlohmann::json> json_;
 };
@@ -1590,6 +1776,65 @@ void ForestContainerCpp::LoadFromJson(JsonCpp& json, std::string forest_label) {
   forest_samples_->from_json(forest_json);
 }
 
+void ForestContainerCpp::AdjustResidual(ForestDatasetCpp& dataset, ResidualCpp& residual, ForestSamplerCpp& sampler, bool requires_basis, int forest_num, bool add) {
+  // Determine whether or not we are adding forest_num to the residuals
+  std::function<double(double, double)> op;
+  if (add) op = std::plus<double>();
+  else op = std::minus<double>();
+  
+  // Perform the update (addition / subtraction) operation
+  StochTree::UpdateResidualEntireForest(*(sampler.GetTracker()), *(dataset.GetDataset()), *(residual.GetData()), forest_samples_->GetEnsemble(forest_num), requires_basis, op);
+}
+
+void ForestCpp::AdjustResidual(ForestDatasetCpp& dataset, ResidualCpp& residual, ForestSamplerCpp& sampler, bool requires_basis, bool add) {
+  // Determine whether or not we are adding forest_num to the residuals
+  std::function<double(double, double)> op;
+  if (add) op = std::plus<double>();
+  else op = std::minus<double>();
+  
+  // Perform the update (addition / subtraction) operation
+  StochTree::UpdateResidualEntireForest(*(sampler.GetTracker()), *(dataset.GetDataset()), *(residual.GetData()), forest_.get(), requires_basis, op);
+}
+
+void RandomEffectsContainerCpp::LoadFromJson(JsonCpp& json, std::string rfx_label) {
+  nlohmann::json rfx_json = json.SubsetJsonRFX(rfx_label);
+  rfx_container_->Reset();
+  rfx_container_->from_json(rfx_json);
+}
+
+void RandomEffectsContainerCpp::AddSample(RandomEffectsModelCpp& rfx_model) {
+  rfx_container_->AddSample(*rfx_model.GetModel());
+}
+
+py::array_t<double> RandomEffectsContainerCpp::Predict(RandomEffectsDatasetCpp& rfx_dataset, RandomEffectsLabelMapperCpp& label_mapper) {
+  py::ssize_t num_observations = rfx_dataset.NumObservations();
+  int num_samples = rfx_container_->NumSamples();
+  std::vector<double> output(num_observations*num_samples);
+  rfx_container_->Predict(*rfx_dataset.GetDataset(), *label_mapper.GetLabelMapper(), output);
+  auto result = py::array_t<double>(py::detail::any_container<py::ssize_t>({num_observations, num_samples}));
+  auto accessor = result.mutable_unchecked<2>();
+  for (size_t i = 0; i < num_observations; i++) {
+    for (int j = 0; j < num_samples; j++) {
+      accessor(i, j) = output.at(j*num_observations + i);
+    }
+  }
+  return result;
+}
+
+void RandomEffectsLabelMapperCpp::LoadFromJson(JsonCpp& json, std::string rfx_label) {
+  nlohmann::json rfx_json = json.SubsetJsonRFX(rfx_label);
+  rfx_label_mapper_->Reset();
+  rfx_label_mapper_->from_json(rfx_json);
+}
+
+void RandomEffectsModelCpp::SampleRandomEffects(RandomEffectsDatasetCpp& rfx_dataset, ResidualCpp& residual, 
+                                                RandomEffectsTrackerCpp& rfx_tracker, RandomEffectsContainerCpp& rfx_container, 
+                                                bool keep_sample, double global_variance, RngCpp& rng) {
+  rfx_model_->SampleRandomEffects(*rfx_dataset.GetDataset(), *residual.GetData(), 
+                                  *rfx_tracker.GetTracker(), global_variance, *rng.GetRng());
+  if (keep_sample) rfx_container.AddSample(*this);
+}
+
 PYBIND11_MODULE(stochtree_cpp, m) {
   py::class_<JsonCpp>(m, "JsonCpp")
     .def(py::init<>())
@@ -1628,7 +1873,8 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("ExtractIntegerVectorSubfolder", &JsonCpp::ExtractIntegerVectorSubfolder)
     .def("ExtractStringVector", &JsonCpp::ExtractStringVector)
     .def("ExtractStringVectorSubfolder", &JsonCpp::ExtractStringVectorSubfolder)
-    .def("SubsetJsonForest", &JsonCpp::SubsetJsonForest);
+    .def("SubsetJsonForest", &JsonCpp::SubsetJsonForest)
+    .def("SubsetJsonRFX", &JsonCpp::SubsetJsonRFX);
 
   py::class_<ForestDatasetCpp>(m, "ForestDatasetCpp")
     .def(py::init<>())
@@ -1750,6 +1996,57 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("GetMinSamplesLeaf", &ForestSamplerCpp::GetMinSamplesLeaf)
     .def("GetMaxDepth", &ForestSamplerCpp::GetMaxDepth);
 
+  py::class_<RandomEffectsDatasetCpp>(m, "RandomEffectsDatasetCpp")
+    .def(py::init<>())
+    .def("GetDataset", &RandomEffectsDatasetCpp::GetDataset)
+    .def("NumObservations", &RandomEffectsDatasetCpp::NumObservations)
+    .def("NumBases", &RandomEffectsDatasetCpp::NumBases)
+    .def("AddGroupLabels", &RandomEffectsDatasetCpp::AddGroupLabels)
+    .def("AddBasis", &RandomEffectsDatasetCpp::AddBasis)
+    .def("AddVarianceWeights", &RandomEffectsDatasetCpp::AddVarianceWeights)
+    .def("HasGroupLabels", &RandomEffectsDatasetCpp::HasGroupLabels)
+    .def("HasBasis", &RandomEffectsDatasetCpp::HasBasis)
+    .def("HasVarianceWeights", &RandomEffectsDatasetCpp::HasVarianceWeights);
+
+  py::class_<RandomEffectsContainerCpp>(m, "RandomEffectsContainerCpp")
+    .def(py::init<int, int>())
+    .def("AddSample", &RandomEffectsContainerCpp::AddSample)
+    .def("NumSamples", &RandomEffectsContainerCpp::NumSamples)
+    .def("NumComponents", &RandomEffectsContainerCpp::NumComponents)
+    .def("NumGroups", &RandomEffectsContainerCpp::NumGroups)
+    .def("DeleteSample", &RandomEffectsContainerCpp::DeleteSample)
+    .def("Predict", &RandomEffectsContainerCpp::Predict)
+    .def("SaveToJsonFile", &RandomEffectsContainerCpp::SaveToJsonFile)
+    .def("LoadFromJsonFile", &RandomEffectsContainerCpp::LoadFromJsonFile)
+    .def("DumpJsonString", &RandomEffectsContainerCpp::DumpJsonString)
+    .def("LoadFromJsonString", &RandomEffectsContainerCpp::LoadFromJsonString)
+    .def("LoadFromJson", &RandomEffectsContainerCpp::LoadFromJson);
+
+  py::class_<RandomEffectsTrackerCpp>(m, "RandomEffectsTrackerCpp")
+    .def(py::init<py::array_t<int>>())
+    .def("GetTracker", &RandomEffectsTrackerCpp::GetTracker);
+
+  py::class_<RandomEffectsLabelMapperCpp>(m, "RandomEffectsLabelMapperCpp")
+    .def(py::init<RandomEffectsTrackerCpp&>())
+    .def("SaveToJsonFile", &RandomEffectsLabelMapperCpp::SaveToJsonFile)
+    .def("LoadFromJsonFile", &RandomEffectsLabelMapperCpp::LoadFromJsonFile)
+    .def("DumpJsonString", &RandomEffectsLabelMapperCpp::DumpJsonString)
+    .def("LoadFromJsonString", &RandomEffectsLabelMapperCpp::LoadFromJsonString)
+    .def("LoadFromJson", &RandomEffectsLabelMapperCpp::LoadFromJson)
+    .def("GetLabelMapper", &RandomEffectsLabelMapperCpp::GetLabelMapper);
+
+  py::class_<RandomEffectsModelCpp>(m, "RandomEffectsModelCpp")
+    .def(py::init<int, int>())
+    .def("GetModel", &RandomEffectsModelCpp::GetModel)
+    .def("SampleRandomEffects", &RandomEffectsModelCpp::SampleRandomEffects)
+    .def("Predict", &RandomEffectsModelCpp::Predict)
+    .def("SetWorkingParameter", &RandomEffectsModelCpp::SetWorkingParameter)
+    .def("SetGroupParameters", &RandomEffectsModelCpp::SetGroupParameters)
+    .def("SetWorkingParameterCovariance", &RandomEffectsModelCpp::SetWorkingParameterCovariance)
+    .def("SetGroupParameterCovariance", &RandomEffectsModelCpp::SetGroupParameterCovariance)
+    .def("SetVariancePriorShape", &RandomEffectsModelCpp::SetVariancePriorShape)
+    .def("SetVariancePriorScale", &RandomEffectsModelCpp::SetVariancePriorScale);
+
   py::class_<GlobalVarianceModelCpp>(m, "GlobalVarianceModelCpp")
     .def(py::init<>())
    .def("SampleOneIteration", &GlobalVarianceModelCpp::SampleOneIteration);
diff --git a/stochtree/__init__.py b/stochtree/__init__.py
index 62c6e019..feb28402 100644
--- a/stochtree/__init__.py
+++ b/stochtree/__init__.py
@@ -5,7 +5,18 @@
 from .data import Dataset, Residual
 from .forest import Forest, ForestContainer
 from .preprocessing import CovariatePreprocessor
-from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel
+from .random_effects import (
+    RandomEffectsContainer,
+    RandomEffectsDataset,
+    RandomEffectsModel,
+    RandomEffectsTracker,
+)
+from .sampler import (
+    RNG,
+    ForestSampler,
+    GlobalVarianceModel,
+    LeafVarianceModel
+)
 from .serialization import JSONSerializer
 from .utils import (
     NotSampledError,
@@ -28,6 +39,10 @@
     "CovariatePreprocessor",
     "RNG",
     "ForestSampler",
+    "RandomEffectsContainer",
+    "RandomEffectsDataset",
+    "RandomEffectsModel",
+    "RandomEffectsTracker",
     "GlobalVarianceModel",
     "LeafVarianceModel",
     "ForestModelConfig",
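
The wrappers in the new `stochtree/random_effects.py` module are now exported at the package level. A minimal sketch of the intended construction pattern for an intercept-only model, using only the classes defined below (the data values here are illustrative):

import numpy as np
from stochtree import RandomEffectsDataset, RandomEffectsTracker

group_labels = np.array([0, 0, 1, 1, 2, 2])
basis = np.ones((group_labels.shape[0], 1))  # intercept-only: a single column of ones

rfx_dataset = RandomEffectsDataset()
rfx_dataset.add_group_labels(group_labels)
rfx_dataset.add_basis(basis)
rfx_tracker = RandomEffectsTracker(group_labels)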
diff --git a/stochtree/random_effects.py b/stochtree/random_effects.py
new file mode 100644
index 00000000..27caf98e
--- /dev/null
+++ b/stochtree/random_effects.py
@@ -0,0 +1,388 @@
+import numpy as np
+from stochtree_cpp import (
+    RandomEffectsDatasetCpp,
+    RandomEffectsModelCpp,
+    RandomEffectsTrackerCpp,
+    RandomEffectsContainerCpp,
+    RandomEffectsLabelMapperCpp,
+    ResidualCpp,
+    RngCpp,
+)
+from .data import Residual
+from .sampler import RNG
+
+
+class RandomEffectsDataset:
+    """
+    Wrapper around a C++ class that stores all of the data needed to fit a group random effects model in `stochtree`. This includes:
+
+    1. Labels that define random effects groups.
+    2. Basis vectors that define the random effects regression (an intercept-only model uses a basis of all ones). Included via the `add_basis` method.
+    3. Variance weights used to define heteroskedastic or otherwise weighted models. This is optional but may be included via the `add_variance_weights` method.
+    """
+
+    def __init__(self) -> None:
+        self.rfx_dataset_cpp = RandomEffectsDatasetCpp()
+    
+    def add_group_labels(self, group_labels: np.array):
+        """
+        Add group labels to a dataset
+
+        Parameters
+        ----------
+        group_labels : np.array
+            One-dimensional numpy array of group labels.
+        """
+        group_labels_ = np.squeeze(group_labels)
+        if group_labels_.ndim > 1:
+            raise ValueError("group_labels must be a one-dimensional numpy array of group indices")
+        n = group_labels_.shape[0]
+        self.rfx_dataset_cpp.AddGroupLabels(group_labels_, n)
+
+    def add_basis(self, basis: np.array):
+        """
+        Add basis matrix to a dataset
+
+        Parameters
+        ----------
+        basis : np.array
+            Two-dimensional numpy array of basis vectors.
+        """
+        basis_ = np.expand_dims(basis, 1) if np.ndim(basis) == 1 else basis
+        if basis_.ndim != 2:
+            raise ValueError("basis must be a one-or-two-dimensional numpy array of random effect bases")
+        n, p = basis_.shape
+        basis_rowmajor = np.ascontiguousarray(basis_)
+        self.rfx_dataset_cpp.AddBasis(basis_rowmajor, n, p, True)
+
+    def update_basis(self, basis: np.array):
+        """
+        Update the basis matrix in a dataset. Allows the random effects bases to be
+        refreshed with new values as a sampler proceeds.
+
+        Parameters
+        ----------
+        basis : np.array
+            Numpy array of basis vectors.
+        """
+        basis_ = np.expand_dims(basis, 1) if np.ndim(basis) == 1 else basis
+        if basis_.ndim != 2:
+            raise ValueError("basis must be a one-or-two-dimensional numpy array of random effect bases")
+        n, p = basis_.shape
+        basis_rowmajor = np.ascontiguousarray(basis_)
+        self.rfx_dataset_cpp.UpdateBasis(basis_rowmajor, n, p, True)
+
+    def add_variance_weights(self, variance_weights: np.array):
+        """
+        Add variance weights to a dataset
+
+        Parameters
+        ----------
+        variance_weights : np.array
+            Univariate numpy array of variance weights.
+        """
+        variance_weights_ = np.squeeze(variance_weights)
+        if variance_weights_.ndim > 1:
+            raise ValueError("variance_weights must be a one-dimensional numpy array of group indices")
+        n = variance_weights_.shape[0]
+        self.rfx_dataset_cpp.AddVarianceWeights(variance_weights_, n)
+
+    def num_observations(self) -> int:
+        """
+        Query the number of observations in a dataset
+
+        Returns
+        -------
+        int
+            Number of observations in the dataset
+        """
+        return self.rfx_dataset_cpp.NumObservations()
+
+    def num_basis(self) -> int:
+        """
+        Query the number of bases in a dataset
+
+        Returns
+        -------
+        int
+            Number of bases in the dataset
+        """
+        return self.rfx_dataset_cpp.NumBases()
+
+    def has_group_labels(self) -> bool:
+        """
+        Whether or not a dataset has group labels
+
+        Returns
+        -------
+        bool
+            `True` if the dataset has group labels, `False` otherwise
+        """
+        return self.rfx_dataset_cpp.HasGroupLabels()
+
+    def has_basis(self) -> bool:
+        """
+        Whether or not a dataset has a basis matrix for the random effects regression
+
+        Returns
+        -------
+        bool
+            `True` if the dataset has a basis, `False` otherwise
+        """
+        return self.rfx_dataset_cpp.HasBasis()
+
+    def has_variance_weights(self) -> bool:
+        """
+        Whether or not a dataset has variance weights
+
+        Returns
+        -------
+        bool
+            `True` if the dataset has variance weights, `False` otherwise
+        """
+        return self.rfx_dataset_cpp.HasVarianceWeights()
+
+
+class RandomEffectsTracker:
+    """
+    Class that defines a "tracker" for random effects models, most notably 
+    storing the data indices available in each group for quicker posterior 
+    computation and sampling of random effects terms.
+
+    Parameters
+    ----------
+    group_indices : np.ndarray
+        Integer indices indicating groups used to define random effects
+    """
+    
+    def __init__(self, group_indices: np.ndarray) -> None:
+        self.rfx_tracker_cpp = RandomEffectsTrackerCpp(group_indices)
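+
+
+# Note on label mapping: group labels passed to these wrappers need not be
+# contiguous or zero-indexed. Internally, a label mapper assigns each distinct
+# label its own column of the sample matrix (for example, the distinct labels
+# in np.array([7, 10, 42]) would each be assigned one of columns 0, 1, and 2),
+# and RandomEffectsContainer applies the same map at prediction time.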
+
+
+class RandomEffectsContainer():
+    """
+    Wrapper around the "persistent" aspects of a C++ random effects model. This includes
+    draws of the parameters and a map from the original label indices to the 
+    0-indexed label numbers used to place group samples in memory (i.e. the 
+    first label is stored in column 0 of the sample matrix, the second label 
+    is stored in column 1 of the sample matrix, etc...).
+
+    Parameters
+    ----------
+    num_components : int
+        Number of components (bases) in a random effects model. For the simplest random effects model, 
+        in which each group has a different random intercept, this is 1, and the basis is a trivial 
+        "dummy" intercept vector.
+    num_groups : int
+        Number of groups in a random effects model.
+    """
+
+    def __init__(self, num_components: int, num_groups: int, rfx_tracker: RandomEffectsTracker = None) -> None:
+        self.rfx_container_cpp = RandomEffectsContainerCpp(num_components, num_groups)
+        # rfx_tracker may be omitted when the container will be rebuilt from JSON
+        if rfx_tracker is not None:
+            self.rfx_label_mapper_cpp = RandomEffectsLabelMapperCpp(rfx_tracker.rfx_tracker_cpp)
+        else:
+            self.rfx_label_mapper_cpp = None
+    
+    def num_samples(self) -> int:
+        return self.rfx_container_cpp.NumSamples()
+    
+    def num_components(self) -> int:
+        return self.rfx_container_cpp.NumComponents()
+    
+    def num_groups(self) -> int:
+        return self.rfx_container_cpp.NumGroups()
+    
+    def delete_sample(self, sample_num: int) -> None:
+        self.rfx_container_cpp.DeleteSample(sample_num)
+
+    def load_from_json_string(self, json_string: str) -> None:
+        """
+        Reload a random effects container from an in-memory JSON string.
+
+        Parameters
+        ----------
+        json_string : str
+            In-memory string containing state of a random effects container.
+        """
+        self.rfx_container_cpp.LoadFromJsonString(json_string)
+    
+    def predict(self, group_labels: np.array, basis: np.array) -> np.ndarray:
+        """
+        Predict random effects for each observation implied by `group_labels` and `basis`. 
+        If a random effects model is "intercept-only", `basis` will be an array of ones of size `group_labels.shape[0]`.
+
+        Parameters
+        ----------
+        group_labels : np.ndarray
+            Indices of random effects groups in a prediction set
+        basis : np.ndarray
+            Basis used for random effects prediction
+
+        Returns
+        -------
+        result : np.ndarray
+            Numpy array with as many rows as observations in `group_labels` and as many columns as samples in the container
+        """
+        # TODO: add more runtime checks to handle group labels
+        if self.rfx_label_mapper_cpp is None:
+            raise ValueError("This container was constructed without a RandomEffectsTracker, so no label mapper is available for prediction")
+        rfx_dataset = RandomEffectsDataset()
+        rfx_dataset.add_group_labels(group_labels)
+        rfx_dataset.add_basis(basis)
+        return self.rfx_container_cpp.Predict(rfx_dataset.rfx_dataset_cpp, self.rfx_label_mapper_cpp)
+
+
+class RandomEffectsModel:
+    """
+    Class that stores current model state, prior parameters, and procedures for sampling from the conditional posterior of each parameter.
+
+    Parameters
+    ----------
+    num_components : int
+        Number of "components," or bases, defining the random effects regression
+    num_groups : int
+        Number of random effects groups
+    """
+
+    def __init__(self, num_components: int, num_groups: int) -> None:
+        self.rfx_model_cpp = RandomEffectsModelCpp(num_components, num_groups)
+        self.num_components = num_components
+        self.num_groups = num_groups
+
+    def sample(self, rfx_dataset: RandomEffectsDataset, residual: Residual, 
+               rfx_tracker: RandomEffectsTracker, rfx_container: RandomEffectsContainer, 
+               keep_sample: bool, global_variance: float, rng: RNG) -> None:
+        """
+        Sample from random effects model
+
+        Parameters
+        ----------
+        rfx_dataset: RandomEffectsDataset
+            Object of type `RandomEffectsDataset`
+        residual: Residual
+            Object of type `Residual`
+        rfx_tracker: RandomEffectsTracker
+            Object of type `RandomEffectsTracker`
+        rfx_container: RandomEffectsContainer
+            Object of type `RandomEffectsContainer`
+        keep_sample: bool
+            Whether the sample should be retained in `rfx_container`. If `False`, the state of `rfx_tracker` will be updated, but the parameter values will not be added to the sample container. Samples are commonly discarded due to burn-in or thinning.
+        global_variance: float
+            Scalar global variance parameter
+        rng: RNG
+            Object of type `RNG`
+        """
+        self.rfx_model_cpp.SampleRandomEffects(rfx_dataset.rfx_dataset_cpp, residual.residual_cpp, 
+                                               rfx_tracker.rfx_tracker_cpp, rfx_container.rfx_container_cpp, 
+                                               keep_sample, global_variance, rng.rng_cpp)
+
+    def set_working_parameter(self, working_parameter: np.ndarray) -> None:
+        """
+        Set values for the "working parameter." This is typically used for initialization, 
+        but could also be used to interrupt or override the sampler.
+
+        Parameters
+        ----------
+        working_parameter: np.ndarray
+            Working parameter initial values. Must have the same dimension as the basis in the random effects model.
+        """
+        if not isinstance(working_parameter, np.ndarray):
+            raise ValueError("working_parameter must be a numpy array")
+        working_parameter_ = np.squeeze(working_parameter) if working_parameter.ndim > 1 else working_parameter
+        if working_parameter_.ndim != 1:
+            raise ValueError("working_parameter must be a 1d numpy array with as many elements as bases in the random effects model")
+        if working_parameter_.shape[0] != self.num_components:
+            raise ValueError("working_parameter must be a 1d numpy array with as many elements as bases in the random effects model")
+        self.rfx_model_cpp.SetWorkingParameter(working_parameter_)
+
+    def set_group_parameters(self, group_parameters: np.ndarray) -> None:
+        """
+        Set values for the "group parameters." This is typically used for initialization, 
+        but could also be used to interrupt or override the sampler.
+
+        Parameters
+        ----------
+        group_parameters: np.ndarray
+            Group parameter initial values. Must have as many rows as bases in the random effects model and as 
+            many columns as groups in the random effects model.
+        """
+        if not isinstance(group_parameters, np.ndarray):
+            raise ValueError("group_parameters must be a numpy array")
+        group_parameters_ = np.squeeze(group_parameters) if group_parameters.ndim > 2 else group_parameters
+        if group_parameters_.ndim != 2:
+            raise ValueError("group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model")
+        if group_parameters_.shape[0] != self.num_components:
+            raise ValueError("group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model")
+        if group_parameters_.shape[1] != self.num_groups:
+            raise ValueError("group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model")
+        self.rfx_model_cpp.SetGroupParameters(group_parameters_)
+
+    def set_working_parameter_covariance(self, covariance: np.ndarray) -> None:
+        """
+        Set values for the working parameter covariance. This is typically used for initialization, 
+        but could also be used to interrupt or override the sampler.
+
+        Parameters
+        ----------
+        covariance: np.ndarray
+            Working parameter covariance initial values. Must have as many rows and columns as bases in the random effects model.
+        """
+        if not isinstance(covariance, np.ndarray):
+            raise ValueError("covariance must be a numpy array")
+        covariance_ = np.squeeze(covariance) if covariance.ndim > 2 else covariance
+        if covariance_.ndim != 2:
+            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+        if covariance_.shape[0] != self.num_components:
+            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+        if covariance_.shape[1] != self.num_components:
+            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+        self.rfx_model_cpp.SetWorkingParameterCovariance(covariance_)
+
+    def set_group_parameter_covariance(self, covariance: np.ndarray) -> None:
+        """
+        Set values for the group parameter covariance. This is typically used for initialization, 
+        but could also be used to interrupt or override the sampler.
+
+        Parameters
+        ----------
+        covariance: np.ndarray
+            Group parameter covariance initial values. Must have as many rows and columns as bases in the random effects model.
+        """
+        if not isinstance(covariance, np.ndarray):
+            raise ValueError("covariance must be a numpy array")
+        covariance_ = np.squeeze(covariance) if covariance.ndim > 2 else covariance
+        if covariance_.ndim != 2:
+            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+        if covariance_.shape[0] != self.num_components:
+            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+        if covariance_.shape[1] != self.num_components:
+            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+        self.rfx_model_cpp.SetGroupParameterCovariance(covariance_)
+
+    def set_variance_prior_shape(self, shape: float) -> None:
+        """
+        Set shape parameter for the group parameter variance prior. This is typically used for initialization, 
+        but could also be used to interrupt or override the sampler.
+
+        Parameters
+        ----------
+        shape: float
+            Shape parameter for the group parameter variance prior. Must be positive.
+        """
+        if not isinstance(shape, (int, float)):
+            raise ValueError("shape must be a positive scalar")
+        if shape <= 0:
+            raise ValueError("shape must be a positive scalar")
+        self.rfx_model_cpp.SetVariancePriorShape(shape)
+
+    def set_variance_prior_scale(self, scale: float) -> None:
+        """
+        Set scale parameter for the group parameter variance prior. This is typically used for initialization, 
+        but could also be used to interrupt or override the sampler.
+
+        Parameters
+        ----------
+        scale: float
+            Scale parameter for the group parameter variance prior. Must be positive.
+        """
+        if not isinstance(scale, (int, float)):
+            raise ValueError("scale must be a positive scalar")
+        if scale <= 0:
+            raise ValueError("scale must be a positive scalar")
+        self.rfx_model_cpp.SetVariancePriorScale(scale)
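
The setters above enforce the model's dimension conventions: for a model with `q` bases and `g` groups, the working parameter is a length-`q` vector, the group parameters form a `q x g` matrix with one column per group, and both covariance matrices are `q x q`. A hypothetical initialization sketch using only the methods defined above:

import numpy as np
from stochtree import RandomEffectsModel

q, g = 2, 4  # two bases (e.g. intercept and slope), four groups
rfx_model = RandomEffectsModel(q, g)
rfx_model.set_working_parameter(np.ones(q))
rfx_model.set_group_parameters(np.zeros((q, g)))
rfx_model.set_working_parameter_covariance(np.identity(q))
rfx_model.set_group_parameter_covariance(np.identity(q))
rfx_model.set_variance_prior_shape(1.0)
rfx_model.set_variance_prior_scale(1.0)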
diff --git a/stochtree/sampler.py b/stochtree/sampler.py
index 3351c085..ff6a371c 100644
--- a/stochtree/sampler.py
+++ b/stochtree/sampler.py
@@ -1,7 +1,3 @@
-"""
-Python classes wrapping C++ sampler objects
-"""
-
 import numpy as np
 from stochtree_cpp import (
     ForestSamplerCpp,
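
The string-based JSON methods on `RandomEffectsContainer` make it possible to round-trip a sampled container without touching the filesystem, which is what the serialization hooks below build on. A sketch of hypothetical usage, assuming `rfx_container` already holds draws as in the demo script added in the next commit (the optional `rfx_tracker` argument introduced above allows an empty container to be constructed as a deserialization target):

json_str = rfx_container.rfx_container_cpp.DumpJsonString()
restored = RandomEffectsContainer(0, 0)
restored.load_from_json_string(json_str)
assert restored.num_samples() == rfx_container.num_samples()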
diff --git a/stochtree/serialization.py b/stochtree/serialization.py
index f4d0ff80..5bc63962 100644
--- a/stochtree/serialization.py
+++ b/stochtree/serialization.py
@@ -8,6 +8,7 @@
 from stochtree_cpp import JsonCpp
 
 from .forest import ForestContainer
+from .random_effects import RandomEffectsContainer
 
 
 class JSONSerializer:
@@ -346,14 +347,32 @@ def get_forest_container(self, forest_str: str) -> ForestContainer:
         Parameters
         ----------
         forest_str : str
-            String containing the JSON representation of a `ForestContainer`
+            String containing the label for a given forest in a JSON object
 
         Returns
         -------
         ForestContainer
-            In-memory `ForestContainer` python object, created from JSON string
+            In-memory `ForestContainer` python object, created from JSON
         """
         # TODO: read this from JSON
         result = ForestContainer(0, 1, True, False)
         result.forest_container_cpp.LoadFromJson(self.json_cpp, forest_str)
         return result
+
+    def get_random_effects_container(self, random_effects_str: str) -> RandomEffectsContainer:
+        """Converts a JSON string for a random effects container to a `RandomEffectsContainer` object.
+
+        Parameters
+        ----------
+        random_effects_str : str
+            String containing the label for a given random effects term in a JSON object
+
+        Returns
+        -------
+        RandomEffectsContainer
+            In-memory `RandomEffectsContainer` python object, created from JSON
+        """
+        # TODO: read this from JSON
+        result = RandomEffectsContainer(0, 0)
+        result.rfx_container_cpp.LoadFromJson(self.json_cpp, random_effects_str)
+        return result

From 96a61265b037b18b8fea2b5eac0b47de3e2468d3 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 4 Mar 2025 13:45:35 -0600
Subject: [PATCH 23/35] Added unit tests and demo script for random effects

---
 demo/debug/random_effects.py       |  85 ++++++++++++++++++
 test/python/test_random_effects.py | 136 +++++++++++++++++++++++++++++
 2 files changed, 221 insertions(+)
 create mode 100644 demo/debug/random_effects.py
 create mode 100644 test/python/test_random_effects.py

diff --git a/demo/debug/random_effects.py b/demo/debug/random_effects.py
new file mode 100644
index 00000000..9a3c4350
--- /dev/null
+++ b/demo/debug/random_effects.py
@@ -0,0 +1,85 @@
+# Random Effects Demo Script

+# Load necessary libraries
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from stochtree import (
+    RandomEffectsContainer,
+    RandomEffectsDataset,
+    RandomEffectsModel,
+    RandomEffectsTracker,
+    Residual,
+    RNG,
+)
+# from sklearn.model_selection import train_test_split
+
+# Generate sample data
+# RNG
+random_seed = 1234
+rng = np.random.default_rng(random_seed)
+
+# Generate group labels and random effects basis
+num_observations = 1000
+num_basis = 2
+num_groups = 4
+group_labels = rng.choice(num_groups, size=num_observations)
+basis = np.empty((num_observations, num_basis))
+basis[:, 0] = 1.0
+if num_basis > 1:
+    basis[:, 1:] = rng.uniform(-1, 1, (num_observations, num_basis - 1))
+
+
+# Define the group rfx function
+def outcome_mean(group_labels, basis):
+    return np.where(
+        group_labels == 0,
+        0 - 1 * basis[:, 1],
+        np.where(
+            group_labels == 1,
+            4 + 1 * basis[:, 1],
+            np.where(group_labels == 2, 8 + 3 * basis[:, 1], 12 + 5 * basis[:, 1]),
+        ),
+    )
+
+
+# Generate outcome
+epsilon = rng.normal(0, 1, num_observations)
+rfx_term = outcome_mean(group_labels, basis)
+y = rfx_term + epsilon
+
+# Standardize outcome
+y_bar = np.mean(y)
+y_std = np.std(y)
+resid = (y - y_bar) / y_std
+
+# Construct python objects used for rfx sampling
+outcome = Residual(resid)
+rfx_dataset = RandomEffectsDataset()
+rfx_dataset.add_group_labels(group_labels)
+rfx_dataset.add_basis(basis)
+rfx_tracker = RandomEffectsTracker(group_labels)
+rfx_model = RandomEffectsModel(num_basis, num_groups)
+rfx_model.set_working_parameter(np.ones(num_basis))
+rfx_model.set_group_parameters(np.ones((num_basis, num_groups)))
+rfx_model.set_working_parameter_covariance(np.identity(num_basis))
+rfx_model.set_group_parameter_covariance(np.identity(num_basis))
+rfx_model.set_variance_prior_shape(1.0)
+rfx_model.set_variance_prior_scale(1.0)
+rfx_container = RandomEffectsContainer(num_basis, num_groups, rfx_tracker)
+# cpp_rng = RNG(random_seed)
+cpp_rng = RNG()
+
+# Sample the model
+rfx_model.sample(rfx_dataset, outcome, rfx_tracker, rfx_container, True, 1.0, cpp_rng)
+
+# Inspect the samples
+rfx_preds = rfx_container.predict(group_labels, basis) * y_std + y_bar
+rfx_comparison_df = pd.DataFrame(
+    np.concatenate((rfx_preds, np.expand_dims(rfx_term, axis=1)), axis=1),
+    columns=["Predicted", "Actual"],
+)
+sns.scatterplot(data=rfx_comparison_df, x="Predicted", y="Actual")
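+# Reference line y = x: predictions that track the true rfx term cluster around it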
+plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3))) +plt.show() diff --git a/test/python/test_random_effects.py b/test/python/test_random_effects.py new file mode 100644 index 00000000..aaeaa0c3 --- /dev/null +++ b/test/python/test_random_effects.py @@ -0,0 +1,136 @@ +import numpy as np + +from stochtree import ( + RandomEffectsContainer, + RandomEffectsDataset, + RandomEffectsModel, + RandomEffectsTracker, + Residual, + RNG, +) + + +class TestRandomEffects: + def test_random_intercept(self): + # RNG + rng = np.random.default_rng() + + # Generate group labels and random effects basis + num_observations = 1000 + num_basis = 1 + num_groups = 4 + group_labels = rng.choice(num_groups, size=num_observations) + basis = np.empty((num_observations, num_basis)) + basis[:, 0] = 1.0 + if num_basis > 1: + basis[:, 1:] = rng.uniform(-1, 1, (num_observations, num_basis - 1)) + + # Define the group rfx function + def outcome_mean(group_labels, basis): + return np.where( + group_labels == 0, + 0, + np.where(group_labels == 1, 4, np.where(group_labels == 2, 8, 12)), + ) + + # Generate outcome + epsilon = rng.normal(0, 1, num_observations) + rfx_term = outcome_mean(group_labels, basis) + y = rfx_term + epsilon + + # Standardize outcome + y_bar = np.mean(y) + y_std = np.std(y) + resid = (y - y_bar) / y_std + + # Construct python objects used for rfx sampling + outcome = Residual(resid) + rfx_dataset = RandomEffectsDataset() + rfx_dataset.add_group_labels(group_labels) + rfx_dataset.add_basis(basis) + rfx_tracker = RandomEffectsTracker(group_labels) + rfx_model = RandomEffectsModel(num_basis, num_groups) + rfx_model.set_working_parameter(np.ones(num_basis)) + rfx_model.set_group_parameters(np.ones((num_basis, num_groups))) + rfx_model.set_working_parameter_covariance(np.identity(num_basis)) + rfx_model.set_group_parameter_covariance(np.identity(num_basis)) + rfx_model.set_variance_prior_shape(1.0) + rfx_model.set_variance_prior_scale(1.0) + rfx_container = RandomEffectsContainer(num_basis, num_groups, rfx_tracker) + cpp_rng = RNG() + + # Sample the model + num_mcmc = 10 + for _ in range(num_mcmc): + rfx_model.sample( + rfx_dataset, outcome, rfx_tracker, rfx_container, True, 1.0, cpp_rng + ) + + # Inspect the samples + rfx_preds = rfx_container.predict(group_labels, basis) * y_std + y_bar + assert rfx_preds.shape == (num_observations, num_mcmc) + + def test_random_slope(self): + # RNG + rng = np.random.default_rng() + + # Generate group labels and random effects basis + num_observations = 1000 + num_basis = 2 + num_groups = 4 + group_labels = rng.choice(num_groups, size=num_observations) + basis = np.empty((num_observations, num_basis)) + basis[:, 0] = 1.0 + if num_basis > 1: + basis[:, 1:] = rng.uniform(-1, 1, (num_observations, num_basis - 1)) + + # Define the group rfx function + def outcome_mean(group_labels, basis): + return np.where( + group_labels == 0, + 0 - 1 * basis[:, 1], + np.where( + group_labels == 1, + 4 + 1 * basis[:, 1], + np.where( + group_labels == 2, 8 + 3 * basis[:, 1], 12 + 5 * basis[:, 1] + ), + ), + ) + + # Generate outcome + epsilon = rng.normal(0, 1, num_observations) + rfx_term = outcome_mean(group_labels, basis) + y = rfx_term + epsilon + + # Standardize outcome + y_bar = np.mean(y) + y_std = np.std(y) + resid = (y - y_bar) / y_std + + # Construct python objects used for rfx sampling + outcome = Residual(resid) + rfx_dataset = RandomEffectsDataset() + rfx_dataset.add_group_labels(group_labels) + rfx_dataset.add_basis(basis) + rfx_tracker = 
RandomEffectsTracker(group_labels) + rfx_model = RandomEffectsModel(num_basis, num_groups) + rfx_model.set_working_parameter(np.ones(num_basis)) + rfx_model.set_group_parameters(np.ones((num_basis, num_groups))) + rfx_model.set_working_parameter_covariance(np.identity(num_basis)) + rfx_model.set_group_parameter_covariance(np.identity(num_basis)) + rfx_model.set_variance_prior_shape(1.0) + rfx_model.set_variance_prior_scale(1.0) + rfx_container = RandomEffectsContainer(num_basis, num_groups, rfx_tracker) + cpp_rng = RNG() + + # Sample the model + num_mcmc = 10 + for _ in range(num_mcmc): + rfx_model.sample( + rfx_dataset, outcome, rfx_tracker, rfx_container, True, 1.0, cpp_rng + ) + + # Inspect the samples + rfx_preds = rfx_container.predict(group_labels, basis) * y_std + y_bar + assert rfx_preds.shape == (num_observations, num_mcmc) From 2aa3e90d85153e3f0abd86320f041fdd8256d753 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 4 Mar 2025 13:47:23 -0600 Subject: [PATCH 24/35] Reformatted python code --- stochtree/random_effects.py | 178 ++++++++++++++++++++++++------------ stochtree/serialization.py | 12 +-- 2 files changed, 123 insertions(+), 67 deletions(-) diff --git a/stochtree/random_effects.py b/stochtree/random_effects.py index 27caf98e..709b83b7 100644 --- a/stochtree/random_effects.py +++ b/stochtree/random_effects.py @@ -1,13 +1,12 @@ import numpy as np from stochtree_cpp import ( + RandomEffectsContainerCpp, RandomEffectsDatasetCpp, + RandomEffectsLabelMapperCpp, RandomEffectsModelCpp, RandomEffectsTrackerCpp, - RandomEffectsContainerCpp, - RandomEffectsLabelMapperCpp, - ResidualCpp, - RngCpp, ) + from .data import Residual from .sampler import RNG @@ -23,7 +22,7 @@ class RandomEffectsDataset: def __init__(self) -> None: self.rfx_dataset_cpp = RandomEffectsDatasetCpp() - + def add_group_labels(self, group_labels: np.array): """ Add group labels to a dataset @@ -35,7 +34,9 @@ def add_group_labels(self, group_labels: np.array): """ group_labels_ = np.squeeze(group_labels) if group_labels_.ndim > 1: - raise ValueError("group_labels must be a one-dimensional numpy array of group indices") + raise ValueError( + "group_labels must be a one-dimensional numpy array of group indices" + ) n = group_labels_.shape[0] self.rfx_dataset_cpp.AddGroupLabels(group_labels_, n) @@ -50,7 +51,9 @@ def add_basis(self, basis: np.array): """ basis_ = np.expand_dims(basis, 1) if np.ndim(basis) == 1 else basis if basis_.ndim != 2: - raise ValueError("basis must be a one-or-two-dimensional numpy array of random effect bases") + raise ValueError( + "basis must be a one-or-two-dimensional numpy array of random effect bases" + ) n, p = basis_.shape basis_rowmajor = np.ascontiguousarray(basis_) self.rfx_dataset_cpp.AddBasis(basis_rowmajor, n, p, True) @@ -67,7 +70,9 @@ def update_basis(self, basis: np.array): """ basis_ = np.expand_dims(basis, 1) if np.ndim(basis) == 1 else basis if basis_.ndim != 2: - raise ValueError("basis must be a one-or-two-dimensional numpy array of random effect bases") + raise ValueError( + "basis must be a one-or-two-dimensional numpy array of random effect bases" + ) n, p = basis_.shape basis_rowmajor = np.ascontiguousarray(basis_) self.rfx_dataset_cpp.UpdateBasis(basis_rowmajor, n, p, True) @@ -83,7 +88,9 @@ def add_variance_weights(self, variance_weights: np.array): """ variance_weights_ = np.squeeze(variance_weights) if variance_weights_.ndim > 1: - raise ValueError("variance_weights must be a one-dimensional numpy array of group indices") + raise ValueError( + 
"variance_weights must be a one-dimensional numpy array of group indices" + ) n = variance_weights_.shape[0] self.rfx_dataset_cpp.AddVarianceWeights(variance_weights_, n) @@ -145,8 +152,8 @@ def has_variance_weights(self) -> bool: class RandomEffectsTracker: """ - Class that defines a "tracker" for random effects models, most notably - storing the data indices available in each group for quicker posterior + Class that defines a "tracker" for random effects models, most notably + storing the data indices available in each group for quicker posterior computation and sampling of random effects terms. Parameters @@ -154,42 +161,46 @@ class RandomEffectsTracker: group_indices : np.ndarray Integer indices indicating groups used to define random effects """ - + def __init__(self, group_indices: np.ndarray) -> None: self.rfx_tracker_cpp = RandomEffectsTrackerCpp(group_indices) -class RandomEffectsContainer(): +class RandomEffectsContainer: """ Wrapper around the "persistent" aspects of a C++ random effects model. This includes - draws of the parameters and a map from the original label indices to the - 0-indexed label numbers used to place group samples in memory (i.e. the - first label is stored in column 0 of the sample matrix, the second label + draws of the parameters and a map from the original label indices to the + 0-indexed label numbers used to place group samples in memory (i.e. the + first label is stored in column 0 of the sample matrix, the second label is store in column 1 of the sample matrix, etc...). Parameters ---------- num_components : int - Number of components (bases) in a random effects model. For the simplest random effects model, - in which each group has a different random intercept, this is 1, and the basis is a trivial + Number of components (bases) in a random effects model. For the simplest random effects model, + in which each group has a different random intercept, this is 1, and the basis is a trivial "dummy" intercept vector. num_groups : int Number of groups in a random effects model. """ - def __init__(self, num_components: int, num_groups: int, rfx_tracker: RandomEffectsTracker) -> None: + def __init__( + self, num_components: int, num_groups: int, rfx_tracker: RandomEffectsTracker + ) -> None: self.rfx_container_cpp = RandomEffectsContainerCpp(num_components, num_groups) - self.rfx_label_mapper_cpp = RandomEffectsLabelMapperCpp(rfx_tracker.rfx_tracker_cpp) - + self.rfx_label_mapper_cpp = RandomEffectsLabelMapperCpp( + rfx_tracker.rfx_tracker_cpp + ) + def num_samples(self) -> int: return self.rfx_container_cpp.NumSamples() - + def num_components(self) -> int: return self.rfx_container_cpp.NumComponents() - + def num_groups(self) -> int: return self.rfx_container_cpp.NumGroups() - + def delete_sample(self, sample_num: int) -> None: self.rfx_container_cpp.DeleteSample(sample_num) @@ -203,10 +214,10 @@ def load_from_json_string(self, json_string: str) -> None: In-memory string containing state of a random effects container. """ self.rfx_container_cpp.LoadFromJsonString(json_string) - + def predict(self, group_labels: np.array, basis: np.array) -> np.ndarray: """ - Predict random effects for each observation implied by `group_labels` and `basis`. + Predict random effects for each observation implied by `group_labels` and `basis`. If a random effects model is "intercept-only", `basis` will be an array of ones of size `group_labels.shape[0]`. 
Parameters @@ -225,7 +236,9 @@ def predict(self, group_labels: np.array, basis: np.array) -> np.ndarray: rfx_dataset = RandomEffectsDataset() rfx_dataset.add_group_labels(group_labels) rfx_dataset.add_basis(basis) - return self.rfx_container_cpp.Predict(rfx_dataset.rfx_dataset_cpp, self.rfx_label_mapper_cpp) + return self.rfx_container_cpp.Predict( + rfx_dataset.rfx_dataset_cpp, self.rfx_label_mapper_cpp + ) class RandomEffectsModel: @@ -244,13 +257,20 @@ def __init__(self, num_components: int, num_groups: int) -> None: self.rfx_model_cpp = RandomEffectsModelCpp(num_components, num_groups) self.num_components = num_components self.num_groups = num_groups - - def sample(self, rfx_dataset: RandomEffectsDataset, residual: Residual, - rfx_tracker: RandomEffectsTracker, rfx_container: RandomEffectsContainer, - keep_sample: bool, global_variance: float, rng: RNG) -> None: + + def sample( + self, + rfx_dataset: RandomEffectsDataset, + residual: Residual, + rfx_tracker: RandomEffectsTracker, + rfx_container: RandomEffectsContainer, + keep_sample: bool, + global_variance: float, + rng: RNG, + ) -> None: """ Sample from random effects model - + Parameters ---------- rfx_dataset: RandomEffectsDataset @@ -268,13 +288,19 @@ def sample(self, rfx_dataset: RandomEffectsDataset, residual: Residual, rng: RNG Object of type `RNG` """ - self.rfx_model_cpp.SampleRandomEffects(rfx_dataset.rfx_dataset_cpp, residual.residual_cpp, - rfx_tracker.rfx_tracker_cpp, rfx_container.rfx_container_cpp, - keep_sample, global_variance, rng.rng_cpp) - + self.rfx_model_cpp.SampleRandomEffects( + rfx_dataset.rfx_dataset_cpp, + residual.residual_cpp, + rfx_tracker.rfx_tracker_cpp, + rfx_container.rfx_container_cpp, + keep_sample, + global_variance, + rng.rng_cpp, + ) + def set_working_parameter(self, working_parameter: np.ndarray) -> None: """ - Set values for the "working parameter." This is typically used for initialization, + Set values for the "working parameter." This is typically used for initialization, but could also be used to interrupt or override the sampler. Parameters @@ -284,38 +310,56 @@ def set_working_parameter(self, working_parameter: np.ndarray) -> None: """ if not isinstance(working_parameter, np.ndarray): raise ValueError("working_parameter must be a numpy array") - working_parameter_ = np.squeeze(working_parameter) if working_parameter.ndim > 1 else working_parameter + working_parameter_ = ( + np.squeeze(working_parameter) + if working_parameter.ndim > 1 + else working_parameter + ) if working_parameter_.ndim != 1: - raise ValueError("working_parameter must be a 1d numpy array with as many elements as bases in the random effects model") + raise ValueError( + "working_parameter must be a 1d numpy array with as many elements as bases in the random effects model" + ) if working_parameter_.shape[0] != self.num_components: - raise ValueError("working_parameter must be a 1d numpy array with as many elements as bases in the random effects model") + raise ValueError( + "working_parameter must be a 1d numpy array with as many elements as bases in the random effects model" + ) self.rfx_model_cpp.SetWorkingParameter(working_parameter) - + def set_group_parameters(self, group_parameters: np.ndarray) -> None: """ - Set values for the "group parameters." This is typically used for initialization, + Set values for the "group parameters." This is typically used for initialization, but could also be used to interrupt or override the sampler. 
Parameters ---------- group_parameters: np.ndarray - Group parameter initial values. Must have as many rows as bases in the random effects model and as + Group parameter initial values. Must have as many rows as bases in the random effects model and as many columns as groups in the random effects model. """ if not isinstance(group_parameters, np.ndarray): raise ValueError("group_parameters must be a numpy array") - group_parameters_ = np.squeeze(group_parameters) if group_parameters.ndim > 2 else group_parameters + group_parameters_ = ( + np.squeeze(group_parameters) + if group_parameters.ndim > 2 + else group_parameters + ) if group_parameters_.ndim != 2: - raise ValueError("group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model") + raise ValueError( + "group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model" + ) if group_parameters_.shape[0] != self.num_components: - raise ValueError("group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model") + raise ValueError( + "group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model" + ) if group_parameters_.shape[1] != self.num_groups: - raise ValueError("group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model") + raise ValueError( + "group_parameters must be a 2d numpy array with as many rows as bases and as many columns as groups in the random effects model" + ) self.rfx_model_cpp.SetGroupParameters(group_parameters) - + def set_working_parameter_covariance(self, covariance: np.ndarray) -> None: """ - Set values for the working parameter covariance. This is typically used for initialization, + Set values for the working parameter covariance. This is typically used for initialization, but could also be used to interrupt or override the sampler. Parameters @@ -327,16 +371,22 @@ def set_working_parameter_covariance(self, covariance: np.ndarray) -> None: raise ValueError("covariance must be a numpy array") covariance_ = np.squeeze(covariance) if covariance.ndim > 2 else covariance if covariance_.ndim != 2: - raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model") + raise ValueError( + "covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model" + ) if covariance_.shape[0] != self.num_components: - raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model") + raise ValueError( + "covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model" + ) if covariance_.shape[1] != self.num_components: - raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model") + raise ValueError( + "covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model" + ) self.rfx_model_cpp.SetWorkingParameterCovariance(covariance_) - + def set_group_parameter_covariance(self, covariance: np.ndarray) -> None: """ - Set values for the group parameter covariance. This is typically used for initialization, + Set values for the group parameter covariance. 
This is typically used for initialization,
         but could also be used to interrupt or override the sampler.

         Parameters
@@ -348,16 +398,22 @@ def set_group_parameter_covariance(self, covariance: np.ndarray) -> None:
             raise ValueError("covariance must be a numpy array")
         covariance_ = np.squeeze(covariance) if covariance.ndim > 2 else covariance
         if covariance_.ndim != 2:
-            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+            raise ValueError(
+                "covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model"
+            )
         if covariance_.shape[0] != self.num_components:
-            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+            raise ValueError(
+                "covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model"
+            )
         if covariance_.shape[1] != self.num_components:
-            raise ValueError("covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model")
+            raise ValueError(
+                "covariance must be a 2d numpy array with as many rows and columns as bases in the random effects model"
+            )
         self.rfx_model_cpp.SetGroupParameterCovariance(covariance_)
-    
+
     def set_variance_prior_shape(self, shape: float) -> None:
         """
-        Set shape parameter for the group parameter variance prior. This is typically used for initialization, 
+        Set shape parameter for the group parameter variance prior. This is typically used for initialization,
         but could also be used to interrupt or override the sampler.

         Parameters
@@ -370,10 +426,10 @@ def set_variance_prior_shape(self, shape: float) -> None:
         if shape <= 0:
             raise ValueError("shape must be a positive scalar")
         self.rfx_model_cpp.SetVariancePriorShape(shape)
-    
+
     def set_variance_prior_scale(self, scale: float) -> None:
         """
-        Set scale parameter for the group parameter variance prior. This is typically used for initialization, 
+        Set scale parameter for the group parameter variance prior. This is typically used for initialization,
         but could also be used to interrupt or override the sampler.

         Parameters
diff --git a/stochtree/serialization.py b/stochtree/serialization.py
index 5bc63962..4a61a7e1 100644
--- a/stochtree/serialization.py
+++ b/stochtree/serialization.py
@@ -1,10 +1,6 @@
 import warnings
-from typing import Union

 import numpy as np
-import pandas as pd
-from scipy.linalg import lstsq
-from scipy.stats import gamma
 from stochtree_cpp import JsonCpp

 from .forest import ForestContainer
@@ -359,7 +355,9 @@ def get_forest_container(self, forest_str: str) -> ForestContainer:
         result.forest_container_cpp.LoadFromJson(self.json_cpp, forest_str)
         return result

-    def get_random_effects_container(self, random_effects_str: str) -> RandomEffectsContainer:
+    def get_random_effects_container(
+        self, random_effects_str: str
+    ) -> RandomEffectsContainer:
         """Converts a JSON string for a random effects container to a `RandomEffectsContainer` object.
Parameters @@ -374,5 +372,7 @@ def get_random_effects_container(self, random_effects_str: str) -> RandomEffects """ # TODO: read this from JSON result = RandomEffectsContainer(0, 0) - result.random_effects_container_cpp.LoadFromJson(self.json_cpp, random_effects_str) + result.random_effects_container_cpp.LoadFromJson( + self.json_cpp, random_effects_str + ) return result From 79b7d95a69a131f1d663911491c391daf6c78bb4 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 5 Mar 2025 22:47:55 -0600 Subject: [PATCH 25/35] Refactoring "basis" parameter names in python BART interface --- demo/debug/supervised_learning.py | 2 +- .../heteroskedastic_supervised_learning.ipynb | 4 +- demo/notebooks/serialization.ipynb | 4 +- demo/notebooks/supervised_learning.ipynb | 4 +- python_docs/source/supervised.rst | 2 +- stochtree/bart.py | 68 +++++++++---------- test/python/test_bart.py | 16 ++--- test/python/test_json.py | 2 +- 8 files changed, 51 insertions(+), 51 deletions(-) diff --git a/demo/debug/supervised_learning.py b/demo/debug/supervised_learning.py index e6115957..bbbd99dd 100644 --- a/demo/debug/supervised_learning.py +++ b/demo/debug/supervised_learning.py @@ -56,7 +56,7 @@ def outcome_mean(X, W): # Run BART bart_model = BARTModel() -bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train, X_test=X_test, basis_test=basis_test, num_gfr=10, num_mcmc=100) +bart_model.sample(X_train=X_train, y_train=y_train, leaf_basis_train=basis_train, X_test=X_test, leaf_basis_test=basis_test, num_gfr=10, num_mcmc=100) # Inspect the MCMC (BART) samples forest_preds_y_mcmc = bart_model.y_hat_test diff --git a/demo/notebooks/heteroskedastic_supervised_learning.ipynb b/demo/notebooks/heteroskedastic_supervised_learning.ipynb index ff90ec43..82cb9a23 100644 --- a/demo/notebooks/heteroskedastic_supervised_learning.ipynb +++ b/demo/notebooks/heteroskedastic_supervised_learning.ipynb @@ -142,8 +142,8 @@ " X_train=X_train,\n", " y_train=y_train,\n", " X_test=X_test,\n", - " basis_train=basis_train,\n", - " basis_test=basis_test,\n", + " leaf_basis_train=basis_train,\n", + " leaf_basis_test=basis_test,\n", " num_gfr=10,\n", " num_mcmc=100,\n", " general_params=global_params,\n", diff --git a/demo/notebooks/serialization.ipynb b/demo/notebooks/serialization.ipynb index 762affe6..6cc5b019 100644 --- a/demo/notebooks/serialization.ipynb +++ b/demo/notebooks/serialization.ipynb @@ -127,9 +127,9 @@ "bart_model.sample(\n", " X_train=X_train,\n", " y_train=y_train,\n", - " basis_train=basis_train,\n", + " leaf_basis_train=basis_train,\n", " X_test=X_test,\n", - " basis_test=basis_test,\n", + " leaf_basis_test=basis_test,\n", " num_gfr=10,\n", " num_mcmc=10,\n", ")" diff --git a/demo/notebooks/supervised_learning.ipynb b/demo/notebooks/supervised_learning.ipynb index 57a8c9b2..f51f78e1 100644 --- a/demo/notebooks/supervised_learning.ipynb +++ b/demo/notebooks/supervised_learning.ipynb @@ -125,9 +125,9 @@ "bart_model.sample(\n", " X_train=X_train,\n", " y_train=y_train,\n", - " basis_train=basis_train,\n", + " leaf_basis_train=basis_train,\n", " X_test=X_test,\n", - " basis_test=basis_test,\n", + " leaf_basis_test=basis_test,\n", " num_gfr=10,\n", " num_mcmc=100,\n", " general_params=general_params,\n", diff --git a/python_docs/source/supervised.rst b/python_docs/source/supervised.rst index a200e66f..89a94227 100644 --- a/python_docs/source/supervised.rst +++ b/python_docs/source/supervised.rst @@ -68,4 +68,4 @@ Initialize and run a BART sampler for 100 iterations (after 10 "warm-start" draw .. 
code-block:: python bart_model = BARTModel() - bart_model.sample(X_train=X_train, y_train=y_train, basis_train=basis_train, X_test=X_test, basis_test=basis_test, num_gfr=10, num_mcmc=100) + bart_model.sample(X_train=X_train, y_train=y_train, leaf_basis_train=basis_train, X_test=X_test, leaf_basis_test=basis_test, num_gfr=10, num_mcmc=100) diff --git a/stochtree/bart.py b/stochtree/bart.py index 908039b0..f0f701f9 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -50,7 +50,7 @@ class BARTModel: The `BARTModel` class supports the following extensions of this model: - - Leaf Regression: Rather than letting `f(X)` define a standard decision tree ensemble, in which each tree uses `X` to partition the data and then serve up constant predictions, we allow for models `f(X,Z)` in which `X` and `Z` together define a partitioned linear model (`X` partitions the data and `Z` serves as the basis for regression models). This model can be run by specifying `basis_train` in the `sample` method. + - Leaf Regression: Rather than letting `f(X)` define a standard decision tree ensemble, in which each tree uses `X` to partition the data and then serve up constant predictions, we allow for models `f(X,Z)` in which `X` and `Z` together define a partitioned linear model (`X` partitions the data and `Z` serves as the basis for regression models). This model can be run by specifying `leaf_basis_train` in the `sample` method. - Heteroskedasticity: Rather than define $\epsilon$ parameterically, we can let a forest $\sigma^2(X)$ model a conditional error variance function. This can be done by setting `num_trees_variance > 0` in the `params` dictionary passed to the `sample` method. """ @@ -63,9 +63,9 @@ def sample( self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, - basis_train: np.array = None, + leaf_basis_train: np.array = None, X_test: Union[np.array, pd.DataFrame] = None, - basis_test: np.array = None, + leaf_basis_test: np.array = None, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, @@ -82,13 +82,13 @@ def sample( Training set covariates on which trees may be partitioned. y_train : np.array Training set outcome. - basis_train : np.array, optional + leaf_basis_train : np.array, optional Optional training set basis vector used to define a regression to be run in the leaves of each tree. X_test : np.array, optional Optional test set covariates. - basis_test : np.array, optional + leaf_basis_test : np.array, optional Optional test set basis vector used to define a regression to be run in the leaves of each tree. - Must be included / omitted consistently (i.e. if basis_train is provided, then basis_test must be provided alongside X_test). + Must be included / omitted consistently (i.e. if leaf_basis_train is provided, then leaf_basis_test must be provided alongside X_test). num_gfr : int, optional Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to `5`. num_burnin : int, optional @@ -119,7 +119,7 @@ def sample( * `beta` (`float`): Exponent that decreases split probabilities for nodes of depth > 0 in the conditional mean model. Tree split prior combines `alpha` and `beta` via `alpha*(1+node_depth)^-beta`. Defaults to `2`. * `min_samples_leaf` (`int`): Minimum allowable size of a leaf, in terms of training samples, in the conditional mean model. Defaults to `5`. * `max_depth` (`int`): Maximum depth of any tree in the ensemble in the conditional mean model. Defaults to `10`. 
Can be overridden with `-1` which does not enforce any depth limits on trees.
-            * `sample_sigma2_leaf` (`bool`): Whether or not to update the `tau` leaf scale variance parameter based on `IG(sigma2_leaf_shape, sigma2_leaf_scale)`. Cannot (currently) be set to true if `basis_train` has more than one column. Defaults to `False`.
+            * `sample_sigma2_leaf` (`bool`): Whether or not to update the `tau` leaf scale variance parameter based on `IG(sigma2_leaf_shape, sigma2_leaf_scale)`. Cannot (currently) be set to true if `leaf_basis_train` has more than one column. Defaults to `False`.
             * `sigma2_leaf_init` (`float`): Starting value of leaf node scale parameter. Calibrated internally as `1/num_trees` if not set here.
             * `sigma2_leaf_shape` (`float`): Shape parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Defaults to `3`.
             * `sigma2_leaf_scale` (`float`): Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees` if not set here.
@@ -271,29 +271,29 @@ def sample(
             raise ValueError("X_test must be a pandas dataframe or numpy array")
         if not isinstance(y_train, np.ndarray):
             raise ValueError("y_train must be a numpy array")
-        if basis_train is not None:
-            if not isinstance(basis_train, np.ndarray):
-                raise ValueError("basis_train must be a numpy array")
-        if basis_test is not None:
-            if not isinstance(basis_test, np.ndarray):
-                raise ValueError("X_test must be a numpy array")
+        if leaf_basis_train is not None:
+            if not isinstance(leaf_basis_train, np.ndarray):
+                raise ValueError("leaf_basis_train must be a numpy array")
+        if leaf_basis_test is not None:
+            if not isinstance(leaf_basis_test, np.ndarray):
+                raise ValueError("leaf_basis_test must be a numpy array")

         # Convert everything to standard shape (2-dimensional)
         if isinstance(X_train, np.ndarray):
             if X_train.ndim == 1:
                 X_train = np.expand_dims(X_train, 1)
-        if basis_train is not None:
-            if basis_train.ndim == 1:
-                basis_train = np.expand_dims(basis_train, 1)
+        if leaf_basis_train is not None:
+            if leaf_basis_train.ndim == 1:
+                leaf_basis_train = np.expand_dims(leaf_basis_train, 1)
         if y_train.ndim == 1:
             y_train = np.expand_dims(y_train, 1)
         if X_test is not None:
             if isinstance(X_test, np.ndarray):
                 if X_test.ndim == 1:
                     X_test = np.expand_dims(X_test, 1)
-            if basis_test is not None:
-                if basis_test.ndim == 1:
-                    basis_test = np.expand_dims(basis_test, 1)
+            if leaf_basis_test is not None:
+                if leaf_basis_test.ndim == 1:
+                    leaf_basis_test = np.expand_dims(leaf_basis_test, 1)

         # Data checks
         if X_test is not None:
@@ -301,25 +301,25 @@ def sample(
                 raise ValueError(
                     "X_train and X_test must have the same number of columns"
                 )
-        if basis_test is not None:
-            if basis_train is not None:
-                if basis_test.shape[1] != basis_train.shape[1]:
+        if leaf_basis_test is not None:
+            if leaf_basis_train is not None:
+                if leaf_basis_test.shape[1] != leaf_basis_train.shape[1]:
                     raise ValueError(
-                        "basis_train and basis_test must have the same number of columns"
+                        "leaf_basis_train and leaf_basis_test must have the same number of columns"
                     )
             else:
-                raise ValueError("basis_test provided but basis_train was not")
-        if basis_train is not None:
-            if basis_train.shape[0] != X_train.shape[0]:
+                raise ValueError("leaf_basis_test provided but leaf_basis_train was not")
+        if leaf_basis_train is not None:
+            if leaf_basis_train.shape[0] != X_train.shape[0]:
                 raise ValueError(
-                    "basis_train and Z_train must have the same number of rows"
+                    "leaf_basis_train and X_train must have the same number of rows"
                 )
         if y_train.shape[0] != 
X_train.shape[0]: raise ValueError("X_train and y_train must have the same number of rows") - if X_test is not None and basis_test is not None: - if X_test.shape[0] != basis_test.shape[0]: + if X_test is not None and leaf_basis_test is not None: + if X_test.shape[0] != leaf_basis_test.shape[0]: raise ValueError( - "X_test and basis_test must have the same number of rows" + "X_test and leaf_basis_test must have the same number of rows" ) # Variable weight preprocessing (and initialization if necessary) @@ -352,13 +352,13 @@ def sample( self.has_test = X_test is not None # Determine whether a basis is provided - self.has_basis = basis_train is not None + self.has_basis = leaf_basis_train is not None # Unpack data dimensions self.n_train = y_train.shape[0] self.n_test = X_test_processed.shape[0] if self.has_test else 0 self.num_covariates = X_train_processed.shape[1] - self.num_basis = basis_train.shape[1] if self.has_basis else 0 + self.num_basis = leaf_basis_train.shape[1] if self.has_basis else 0 # Standardize the keep variable lists to numeric indices if keep_vars_mean is not None: @@ -684,12 +684,12 @@ def sample( forest_dataset_train = Dataset() forest_dataset_train.add_covariates(X_train_processed) if self.has_basis: - forest_dataset_train.add_basis(basis_train) + forest_dataset_train.add_basis(leaf_basis_train) if self.has_test: forest_dataset_test = Dataset() forest_dataset_test.add_covariates(X_test_processed) if self.has_basis: - forest_dataset_test.add_basis(basis_test) + forest_dataset_test.add_basis(leaf_basis_test) # Residual residual_train = Residual(resid_train) @@ -788,7 +788,7 @@ def sample( # Initialize the leaves of each tree in the mean forest if self.include_mean_forest: if self.has_basis: - init_val_mean = np.repeat(0.0, basis_train.shape[1]) + init_val_mean = np.repeat(0.0, leaf_basis_train.shape[1]) else: init_val_mean = np.array([0.0]) forest_sampler_mean.prepare_for_sampler( diff --git a/test/python/test_bart.py b/test/python/test_bart.py index 878b0b20..5b1415bd 100644 --- a/test/python/test_bart.py +++ b/test/python/test_bart.py @@ -115,9 +115,9 @@ def outcome_mean(X, W): bart_model.sample( X_train=X_train, y_train=y_train, - basis_train=basis_train, + leaf_basis_train=basis_train, X_test=X_test, - basis_test=basis_test, + leaf_basis_test=basis_test, num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, @@ -181,9 +181,9 @@ def outcome_mean(X, W): bart_model.sample( X_train=X_train, y_train=y_train, - basis_train=basis_train, + leaf_basis_train=basis_train, X_test=X_test, - basis_test=basis_test, + leaf_basis_test=basis_test, num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, @@ -331,9 +331,9 @@ def conditional_stddev(X): bart_model.sample( X_train=X_train, y_train=y_train, - basis_train=basis_train, + leaf_basis_train=basis_train, X_test=X_test, - basis_test=basis_test, + leaf_basis_test=basis_test, num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, @@ -412,9 +412,9 @@ def conditional_stddev(X): bart_model.sample( X_train=X_train, y_train=y_train, - basis_train=basis_train, + leaf_basis_train=basis_train, X_test=X_test, - basis_test=basis_test, + leaf_basis_test=basis_test, num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, diff --git a/test/python/test_json.py b/test/python/test_json.py index 8016cbbd..5a43e855 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -333,7 +333,7 @@ def outcome_mean(X, W): # Run BART bart_orig = BARTModel() - bart_orig.sample(X_train=X, y_train=y, basis_train=W, num_gfr=10, 
num_mcmc=10) + bart_orig.sample(X_train=X, y_train=y, leaf_basis_train=W, num_gfr=10, num_mcmc=10) # Extract predictions from the sampler y_hat_orig = bart_orig.predict(X, W) From a0843064a16bfdf693aa9fb6155ddd6f4b99c047 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 5 Mar 2025 23:58:56 -0600 Subject: [PATCH 26/35] Partial addition of random effects to BART interface --- R/bart.R | 2 +- stochtree/bart.py | 136 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 131 insertions(+), 7 deletions(-) diff --git a/R/bart.R b/R/bart.R index 64422438..d4f1574a 100644 --- a/R/bart.R +++ b/R/bart.R @@ -445,7 +445,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train } if (has_rfx_test) { if (is.null(rfx_basis_test)) { - if (!is.null(rfx_basis_train)) { + if (has_basis_rfx) { stop("Random effects basis provided for training set, must also be provided for the test set") } rfx_basis_test <- matrix(rep(1,nrow(X_test)), nrow = nrow(X_test), ncol = 1) diff --git a/stochtree/bart.py b/stochtree/bart.py index f0f701f9..0a93391c 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -14,6 +14,7 @@ from .data import Dataset, Residual from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor, _preprocess_params +from .random_effects import RandomEffectsContainer, RandomEffectsDataset, RandomEffectsModel, RandomEffectsTracker from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError @@ -64,8 +65,12 @@ def sample( X_train: Union[np.array, pd.DataFrame], y_train: np.array, leaf_basis_train: np.array = None, + rfx_group_ids_train: np.array = None, + rfx_basis_train: np.array = None, X_test: Union[np.array, pd.DataFrame] = None, leaf_basis_test: np.array = None, + rfx_group_ids_test: np.array = None, + rfx_basis_test: np.array = None, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, @@ -84,11 +89,20 @@ def sample( Training set outcome. leaf_basis_train : np.array, optional Optional training set basis vector used to define a regression to be run in the leaves of each tree. + rfx_group_ids_train : np.array, optional + Optional group labels used for an additive random effects model. + rfx_basis_train : np.array, optional + Optional basis for "random-slope" regression in an additive random effects model. X_test : np.array, optional Optional test set covariates. leaf_basis_test : np.array, optional Optional test set basis vector used to define a regression to be run in the leaves of each tree. Must be included / omitted consistently (i.e. if leaf_basis_train is provided, then leaf_basis_test must be provided alongside X_test). + rfx_group_ids_test : np.array, optional + Optional test set group labels used for an additive random effects model. We do not currently support (but plan to in the near future), + test set evaluation for group labels that were not in the training set. + rfx_basis_test : np.array, optional + Optional test set basis for "random-slope" regression in additive random effects model. num_gfr : int, optional Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to `5`. 
num_burnin : int, optional @@ -277,7 +291,23 @@ def sample( if leaf_basis_test is not None: if not isinstance(leaf_basis_test, np.ndarray): raise ValueError("X_test must be a numpy array") - + if rfx_group_ids_train is not None: + if not isinstance(rfx_group_ids_train, np.ndarray): + raise ValueError("rfx_group_ids_train must be a numpy array") + if not np.issubdtype(rfx_group_ids_train, np.integer): + raise ValueError("rfx_group_ids_train must be a numpy array of integer-valued group IDs") + if rfx_basis_train is not None: + if not isinstance(rfx_basis_train, np.ndarray): + raise ValueError("rfx_basis_train must be a numpy array") + if rfx_group_ids_test is not None: + if not isinstance(rfx_group_ids_test, np.ndarray): + raise ValueError("rfx_group_ids_test must be a numpy array") + if not np.issubdtype(rfx_group_ids_test, np.integer): + raise ValueError("rfx_group_ids_test must be a numpy array of integer-valued group IDs") + if rfx_basis_test is not None: + if not isinstance(rfx_basis_test, np.ndarray): + raise ValueError("rfx_basis_test must be a numpy array") + # Convert everything to standard shape (2-dimensional) if isinstance(X_train, np.ndarray): if X_train.ndim == 1: @@ -658,6 +688,60 @@ def sample( a_forest = 1.0 if not b_forest: b_forest = 1.0 + + # Runtime checks on RFX group ids + self.has_rfx = False + has_rfx_test = False + if rfx_group_ids_train is not None: + self.has_rfx = True + if rfx_group_ids_test is not None: + has_rfx_test = True + if not np.all(np.isin(rfx_group_ids_test, rfx_group_ids_train)): + raise ValueError("All random effect group labels provided in rfx_group_ids_test must be present in rfx_group_ids_train") + + # Fill in rfx basis as a vector of 1s (random intercept) if a basis not provided + has_basis_rfx = False + num_basis_rfx = 0 + if self.has_rfx: + if rfx_basis_train is None: + rfx_basis_train = np.ones((rfx_group_ids_train.shape[0],1)) + else: + has_basis_rfx = True + num_basis_rfx = rfx_basis_train.shape[1] + num_rfx_groups = len(np.unique(rfx_group_ids_train)) + num_rfx_components = rfx_basis_train.shape[0] + # TODO warn if num_rfx_groups is 1 + if has_rfx_test: + if rfx_basis_test is None: + if has_basis_rfx: + raise ValueError("Random effects basis provided for training set, must also be provided for the test set") + rfx_basis_test = np.ones((rfx_group_ids_test.shape[0],1)) + + # Set up random effects structures + if self.has_rfx: + if num_rfx_components == 1: + alpha_init = np.array([1]) + elif num_rfx_components > 1: + alpha_init = np.c_[np.ones(1), np.zeros(num_rfx_components-1)] + else: + raise ValueError("There must be at least 1 random effect component") + xi_init = np.tile(alpha_init, (1, num_rfx_groups)) + sigma_alpha_init = np.identity(num_rfx_components) + sigma_xi_init = np.identity(num_rfx_components) + sigma_xi_shape = 1. + sigma_xi_scale = 1. 
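+            # alpha_init seeds the "working parameter" (one entry per basis
+            # component) and xi_init replicates it across group columns as
+            # initial group-level coefficients; the identity covariances and
+            # unit shape / scale values are weakly informative starting
+            # points for the random effects sampler configured below.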
+ rfx_dataset_train = RandomEffectsDataset() + rfx_dataset_train.add_group_labels(rfx_group_ids_train) + rfx_dataset_train.add_basis(rfx_basis_train) + rfx_tracker = RandomEffectsTracker(rfx_group_ids_train) + rfx_model = RandomEffectsModel(num_rfx_components, num_rfx_groups) + rfx_model.set_working_parameter(alpha_init) + rfx_model.set_group_parameters(xi_init) + rfx_model.set_working_parameter_covariance(sigma_alpha_init) + rfx_model.set_group_parameter_covariance(sigma_xi_init) + rfx_model.set_variance_prior_shape(sigma_xi_shape) + rfx_model.set_variance_prior_scale(sigma_xi_scale) + self.rfx_container = RandomEffectsContainer(num_rfx_components, num_rfx_groups, rfx_tracker) # Container of variance parameter samples self.num_gfr = num_gfr @@ -863,6 +947,12 @@ def sample( self.leaf_scale_samples[sample_counter] = current_leaf_scale[ 0, 0 ] + + # Sample random effects + if self.has_rfx: + rfx_model.sample( + rfx_dataset_train, residual_train, rfx_tracker, self.rfx_container, keep_sample, current_sigma2, cpp_rng + ) # Run MCMC if self.num_burnin + self.num_mcmc > 0: @@ -976,6 +1066,12 @@ def sample( self.leaf_scale_samples[sample_counter] = ( current_leaf_scale[0, 0] ) + + # Sample random effects + if self.has_rfx: + rfx_model.sample( + rfx_dataset_train, residual_train, rfx_tracker, self.rfx_container, keep_sample, current_sigma2, cpp_rng + ) # Mark the model as sampled self.sampled = True @@ -1010,6 +1106,19 @@ def sample( forest_dataset_test.dataset_cpp ) self.y_hat_test = yhat_test_raw * self.y_std + self.y_bar + + if self.has_rfx: + rfx_preds_train = self.rfx_container.predict(rfx_group_ids_train, rfx_basis_train) * self.y_std + if has_rfx_test: + rfx_preds_test = self.rfx_container.predict(rfx_group_ids_test, rfx_basis_test) * self.y_std + if self.include_mean_forest: + self.y_hat_train = self.y_hat_train + rfx_preds_train + if self.has_test: + self.y_hat_test = self.y_hat_test + rfx_preds_test + else: + self.y_hat_train = rfx_preds_train + if self.has_test: + self.y_hat_test = rfx_preds_test if self.include_variance_forest: sigma_x_train_raw = ( @@ -1045,7 +1154,9 @@ def sample( ) def predict( - self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None + self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None, + rfx_group_ids: np.array = None, rfx_basis: np.array = None, + ) -> Union[np.array, tuple]: """Return predictions from every forest sampled (either / both of mean and variance). Return type is either a single array of predictions, if a BART model only includes a @@ -1057,11 +1168,15 @@ def predict( Test set covariates. basis : np.array, optional Optional test set basis vector, must be provided if the model was trained with a leaf regression basis. + rfx_group_ids : np.array, optional + Optional group labels used for an additive random effects model. + rfx_basis : np.array, optional + Optional basis for "random-slope" regression in an additive random effects model. Returns ------- mu_x : np.array, optional - Mean forest predictions. + Mean forest and / or random effects predictions. sigma2_x : np.array, optional Variance forest predictions. 
""" @@ -1128,6 +1243,14 @@ def predict( pred_dataset.dataset_cpp ) mean_pred = mean_pred_raw * self.y_std + self.y_bar + + if self.has_rfx: + rfx_preds = self.rfx_container.predict(rfx_group_ids, rfx_basis) * self.y_std + if self.include_mean_forest: + mean_pred = mean_pred + rfx_preds + else: + mean_pred = rfx_preds + if self.include_variance_forest: variance_pred_raw = ( self.forest_container_variance.forest_container_cpp.Predict( @@ -1145,11 +1268,12 @@ def predict( np.sqrt(variance_pred_raw * self.sigma2_init) * self.y_std ) - if self.include_mean_forest and self.include_variance_forest: + has_mean_predictions = self.include_mean_forest or self.has_rfx + if has_mean_predictions and self.include_variance_forest: return (mean_pred, variance_pred) - elif self.include_mean_forest and not self.include_variance_forest: + elif has_mean_predictions and not self.include_variance_forest: return mean_pred - elif not self.include_mean_forest and self.include_variance_forest: + elif not has_mean_predictions and self.include_variance_forest: return variance_pred def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array: From 93cbd65752bc9e832bd4d0fa26027464d5fd7bd2 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Thu, 13 Mar 2025 00:23:14 -0500 Subject: [PATCH 27/35] Updated random effects so that serialization works as in R --- demo/debug/rfx_serialization.py | 70 ++++++++++++++ include/stochtree/random_effects.h | 13 ++- src/py_stochtree.cpp | 97 +++++++++++++++---- stochtree/bart.py | 75 +++++++++++---- stochtree/bcf.py | 147 ++++++++++++++++++++++++++++- stochtree/random_effects.py | 60 ++++++++---- stochtree/serialization.py | 18 +++- test/python/test_json.py | 137 +++++++++++++++++++++++++++ test/python/test_random_effects.py | 6 +- 9 files changed, 563 insertions(+), 60 deletions(-) create mode 100644 demo/debug/rfx_serialization.py diff --git a/demo/debug/rfx_serialization.py b/demo/debug/rfx_serialization.py new file mode 100644 index 00000000..db80edea --- /dev/null +++ b/demo/debug/rfx_serialization.py @@ -0,0 +1,70 @@ +import numpy as np +from stochtree import BARTModel + +# RNG +random_seed = 1234 +rng = np.random.default_rng(random_seed) + +# Generate covariates and basis +n = 1000 +p_X = 10 +p_W = 1 +X = rng.uniform(0, 1, (n, p_X)) +W = rng.uniform(0, 1, (n, p_W)) + +# Generate random effects terms +num_basis = 2 +num_groups = 4 +group_labels = rng.choice(num_groups, size=n) +basis = np.empty((n, num_basis)) +basis[:, 0] = 1.0 +if num_basis > 1: + basis[:, 1:] = rng.uniform(-1, 1, (n, num_basis - 1)) + +# Define the outcome mean function +def outcome_mean(X, W): + return np.where( + (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5 * W[:,0], + np.where( + (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5 * W[:,0], + np.where( + (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5 * W[:,0], + 7.5 * W[:,0] + ) + ) + ) + +# Define the group rfx function +def rfx_mean(group_labels, basis): + return np.where( + group_labels == 0, + 0 - 1 * basis[:, 1], + np.where( + group_labels == 1, + 4 + 1 * basis[:, 1], + np.where( + group_labels == 2, 8 + 3 * basis[:, 1], 12 + 5 * basis[:, 1] + ), + ), + ) + +# Generate outcome +epsilon = rng.normal(0, 1, n) +forest_term = outcome_mean(X, W) +rfx_term = rfx_mean(group_labels, basis) +y = forest_term + rfx_term + epsilon + +# Run BART +bart_orig = BARTModel() +bart_orig.sample(X_train=X, y_train=y, leaf_basis_train=W, rfx_group_ids_train=group_labels, + rfx_basis_train=basis, num_gfr=10, num_mcmc=10) + +# Extract predictions from the sampler +y_hat_orig = 
bart_orig.predict(X, W, group_labels, basis)
+
+# "Round-trip" the model to JSON string and back and check that the predictions agree
+bart_json_string = bart_orig.to_json()
+bart_reloaded = BARTModel()
+bart_reloaded.from_json(bart_json_string)
+y_hat_reloaded = bart_reloaded.predict(X, W, group_labels, basis)
+np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded)
\ No newline at end of file
diff --git a/include/stochtree/random_effects.h b/include/stochtree/random_effects.h
index 451bc4e4..701ebeaa 100644
--- a/include/stochtree/random_effects.h
+++ b/include/stochtree/random_effects.h
@@ -85,6 +85,10 @@ class LabelMapper {
     for (const auto& [key, value] : label_map) keys_.push_back(key);
   }
   ~LabelMapper() {}
+  void LoadFromLabelMap(std::map<int32_t, int32_t> label_map) {
+    label_map_ = label_map;
+    for (const auto& [key, value] : label_map) keys_.push_back(key);
+  }
   bool ContainsLabel(int32_t category_id) {
     auto pos = label_map_.find(category_id);
     return pos != label_map_.end();
@@ -319,9 +323,12 @@ class RandomEffectsContainer {
   void AddSample(MultivariateRegressionRandomEffectsModel& model);
   void DeleteSample(int sample_num);
   void Predict(RandomEffectsDataset& dataset, LabelMapper& label_mapper, std::vector<double>& output);
-  int NumSamples() {return num_samples_;}
-  int NumComponents() {return num_components_;}
-  int NumGroups() {return num_groups_;}
+  inline int NumSamples() {return num_samples_;}
+  inline int NumComponents() {return num_components_;}
+  inline int NumGroups() {return num_groups_;}
+  inline void SetNumSamples(int num_samples) {num_samples_ = num_samples;}
+  inline void SetNumComponents(int num_components) {num_components_ = num_components;}
+  inline void SetNumGroups(int num_groups) {num_groups_ = num_groups;}
   void Reset() {
     num_samples_ = 0;
     num_components_ = 0;
diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp
index 6cdae80c..6d28a237 100644
--- a/src/py_stochtree.cpp
+++ b/src/py_stochtree.cpp
@@ -1246,10 +1246,14 @@ class RandomEffectsLabelMapperCpp;
 class RandomEffectsContainerCpp {
  public:
-  RandomEffectsContainerCpp(int num_components, int num_groups) {
-    rfx_container_ = std::make_unique<StochTree::RandomEffectsContainer>(num_components, num_groups);
+  RandomEffectsContainerCpp() {
+    rfx_container_ = std::make_unique<StochTree::RandomEffectsContainer>();
   }
   ~RandomEffectsContainerCpp() {}
+  void SetComponentsAndGroups(int num_components, int num_groups) {
+    rfx_container_->SetNumComponents(num_components);
+    rfx_container_->SetNumGroups(num_groups);
+  }
   void AddSample(RandomEffectsModelCpp& rfx_model);
   int NumSamples() {
     return rfx_container_->NumSamples();
@@ -1276,7 +1280,10 @@ class RandomEffectsContainerCpp {
   void LoadFromJsonString(std::string& json_string) {
     rfx_container_->LoadFromJsonString(json_string);
   }
-  void LoadFromJson(JsonCpp& json, std::string rfx_label);
+  void LoadFromJson(JsonCpp& json, std::string rfx_container_label);
+  StochTree::RandomEffectsContainer* GetRandomEffectsContainer() {
+    return rfx_container_.get();
+  }

  private:
   std::unique_ptr<StochTree::RandomEffectsContainer> rfx_container_;
@@ -1293,6 +1300,16 @@ class RandomEffectsTrackerCpp {
     rfx_tracker_ = std::make_unique<StochTree::RandomEffectsTracker>(group_labels_vec);
   }
   ~RandomEffectsTrackerCpp() {}
+  py::array_t<int32_t> GetUniqueGroupIds() {
+    std::vector<int32_t> output = rfx_tracker_->GetUniqueGroupIds();
+    py::ssize_t output_length = output.size();
+    auto result = py::array_t<int32_t>(py::detail::any_container<py::ssize_t>({output_length}));
+    auto accessor = result.mutable_unchecked<1>();
+    for (size_t i = 0; i < output_length; i++) {
+      accessor(i) = output.at(i);
+    }
+    return result;
+  }
   StochTree::RandomEffectsTracker* GetTracker() {
     return rfx_tracker_.get();
   }

  private:
   std::unique_ptr<StochTree::RandomEffectsTracker> rfx_tracker_;
 };

 class RandomEffectsLabelMapperCpp {
  public:
-  RandomEffectsLabelMapperCpp(RandomEffectsTrackerCpp& rfx_tracker) {
-    StochTree::RandomEffectsTracker* internal_tracker = rfx_tracker.GetTracker();
-    rfx_label_mapper_ = std::make_unique<StochTree::LabelMapper>(internal_tracker->GetLabelMap());
+  RandomEffectsLabelMapperCpp() {
+    rfx_label_mapper_ = std::make_unique<StochTree::LabelMapper>();
   }
   ~RandomEffectsLabelMapperCpp() {}
+  void LoadFromTracker(RandomEffectsTrackerCpp& rfx_tracker) {
+    StochTree::RandomEffectsTracker* internal_tracker = rfx_tracker.GetTracker();
+    rfx_label_mapper_->LoadFromLabelMap(internal_tracker->GetLabelMap());
+  }
   void SaveToJsonFile(std::string json_filename) {
     rfx_label_mapper_->SaveToJsonFile(json_filename);
   }
@@ -1320,7 +1340,7 @@ class RandomEffectsLabelMapperCpp {
   void LoadFromJsonString(std::string& json_string) {
     rfx_label_mapper_->LoadFromJsonString(json_string);
   }
-  void LoadFromJson(JsonCpp& json, std::string rfx_label);
+  void LoadFromJson(JsonCpp& json, std::string rfx_label_mapper_label);
   StochTree::LabelMapper* GetLabelMapper() {
     return rfx_label_mapper_.get();
   }
@@ -1410,6 +1430,9 @@ class JsonCpp {
     nlohmann::json forests = nlohmann::json::object();
     json_->emplace("forests", forests);
     json_->emplace("num_forests", 0);
+    nlohmann::json rfx = nlohmann::json::object();
+    json_->emplace("random_effects", rfx);
+    json_->emplace("num_random_effects", 0);
   }
   ~JsonCpp() {}
@@ -1440,6 +1463,38 @@ class JsonCpp {
     return forest_label;
   }

+  std::string AddRandomEffectsContainer(RandomEffectsContainerCpp& rfx_samples) {
+    int rfx_num = json_->at("num_random_effects");
+    std::string rfx_label = "random_effect_container_" + std::to_string(rfx_num);
+    nlohmann::json rfx_json = rfx_samples.GetRandomEffectsContainer()->to_json();
+    json_->at("random_effects").emplace(rfx_label, rfx_json);
+    return rfx_label;
+  }
+
+  std::string AddRandomEffectsLabelMapper(RandomEffectsLabelMapperCpp& rfx_label_mapper) {
+    int rfx_num = json_->at("num_random_effects");
+    std::string rfx_label = "random_effect_label_mapper_" + std::to_string(rfx_num);
+    nlohmann::json rfx_json = rfx_label_mapper.GetLabelMapper()->to_json();
+    json_->at("random_effects").emplace(rfx_label, rfx_json);
+    return rfx_label;
+  }
+
+  std::string AddRandomEffectsGroupIDs(py::array_t<int32_t> rfx_group_ids) {
+    int rfx_num = json_->at("num_random_effects");
+    std::string rfx_label = "random_effect_groupids_" + std::to_string(rfx_num);
+    nlohmann::json groupids_json = nlohmann::json::array();
+    for (int i = 0; i < rfx_group_ids.size(); i++) {
+      groupids_json.emplace_back(rfx_group_ids.at(i));
+    }
+    json_->at("random_effects").emplace(rfx_label, groupids_json);
+    return rfx_label;
+  }
+
+  void IncrementRandomEffectsCount() {
+    int rfx_num = json_->at("num_random_effects");
+    json_->at("num_random_effects") = rfx_num + 1;
+  }
+
   void AddDouble(std::string field_name, double field_value) {
     if (json_->contains(field_name)) {
       json_->at(field_name) = field_value;
@@ -1762,8 +1817,8 @@ class JsonCpp {
     return json_->at("forests").at(forest_label);
   }

-  nlohmann::json SubsetJsonRFX(std::string rfx_label) {
-    return json_->at("random_effects").at(rfx_label);
+  nlohmann::json SubsetJsonRFX() {
+    return json_->at("random_effects");
   }

  private:
@@ -1796,8 +1851,8 @@ void ForestCpp::AdjustResidual(ForestDatasetCpp& dataset, ResidualCpp& residual,
   StochTree::UpdateResidualEntireForest(*(sampler.GetTracker()), *(dataset.GetDataset()), *(residual.GetData()), forest_.get(), requires_basis, op);
 }

-void RandomEffectsContainerCpp::LoadFromJson(JsonCpp& json, std::string rfx_label) {
-  nlohmann::json rfx_json = json.SubsetJsonRFX(rfx_label);
+void RandomEffectsContainerCpp::LoadFromJson(JsonCpp& json, std::string rfx_container_label) {
+  nlohmann::json rfx_json = json.SubsetJsonRFX().at(rfx_container_label);
   rfx_container_->Reset();
   rfx_container_->from_json(rfx_json);
 }
@@ -1821,8 +1876,8 @@ py::array_t<double> RandomEffectsContainerCpp::Predict(RandomEffectsDatasetCpp&
   return result;
 }

-void RandomEffectsLabelMapperCpp::LoadFromJson(JsonCpp& json, std::string rfx_label) {
-  nlohmann::json rfx_json = json.SubsetJsonRFX(rfx_label);
+void RandomEffectsLabelMapperCpp::LoadFromJson(JsonCpp& json, std::string rfx_label_mapper_label) {
+  nlohmann::json rfx_json = json.SubsetJsonRFX().at(rfx_label_mapper_label);
   rfx_label_mapper_->Reset();
   rfx_label_mapper_->from_json(rfx_json);
 }
@@ -1857,6 +1912,9 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("AddStringVector", &JsonCpp::AddStringVector)
     .def("AddStringVectorSubfolder", &JsonCpp::AddStringVectorSubfolder)
     .def("AddForest", &JsonCpp::AddForest)
+    .def("AddRandomEffectsContainer", &JsonCpp::AddRandomEffectsContainer)
+    .def("AddRandomEffectsLabelMapper", &JsonCpp::AddRandomEffectsLabelMapper)
+    .def("AddRandomEffectsGroupIDs", &JsonCpp::AddRandomEffectsGroupIDs)
     .def("ContainsField", &JsonCpp::ContainsField)
     .def("ContainsFieldSubfolder", &JsonCpp::ContainsFieldSubfolder)
     .def("ExtractDouble", &JsonCpp::ExtractDouble)
@@ -1873,9 +1931,10 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("ExtractIntegerVectorSubfolder", &JsonCpp::ExtractIntegerVectorSubfolder)
     .def("ExtractStringVector", &JsonCpp::ExtractStringVector)
     .def("ExtractStringVectorSubfolder", &JsonCpp::ExtractStringVectorSubfolder)
+    .def("IncrementRandomEffectsCount", &JsonCpp::IncrementRandomEffectsCount)
     .def("SubsetJsonForest", &JsonCpp::SubsetJsonForest)
     .def("SubsetJsonRFX", &JsonCpp::SubsetJsonRFX);
-  
+
   py::class_<ForestDatasetCpp>(m, "ForestDatasetCpp")
     .def(py::init<>())
     .def("AddCovariates", &ForestDatasetCpp::AddCovariates)
@@ -2009,7 +2068,8 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("HasVarianceWeights", &RandomEffectsDatasetCpp::HasVarianceWeights);

   py::class_<RandomEffectsContainerCpp>(m, "RandomEffectsContainerCpp")
-    .def(py::init<int, int>())
+    .def(py::init<>())
+    .def("SetComponentsAndGroups", &RandomEffectsContainerCpp::SetComponentsAndGroups)
     .def("AddSample", &RandomEffectsContainerCpp::AddSample)
     .def("NumSamples", &RandomEffectsContainerCpp::NumSamples)
     .def("NumComponents", &RandomEffectsContainerCpp::NumComponents)
     .def("NumGroups", &RandomEffectsContainerCpp::NumGroups)
     .def("DeleteSample", &RandomEffectsContainerCpp::DeleteSample)
     .def("Predict", &RandomEffectsContainerCpp::Predict)
     .def("SaveToJsonFile", &RandomEffectsContainerCpp::SaveToJsonFile)
     .def("LoadFromJsonFile", &RandomEffectsContainerCpp::LoadFromJsonFile)
     .def("DumpJsonString", &RandomEffectsContainerCpp::DumpJsonString)
     .def("LoadFromJsonString", &RandomEffectsContainerCpp::LoadFromJsonString)
-    .def("LoadFromJson", &RandomEffectsContainerCpp::LoadFromJson);
+    .def("LoadFromJson", &RandomEffectsContainerCpp::LoadFromJson)
+    .def("GetRandomEffectsContainer", &RandomEffectsContainerCpp::GetRandomEffectsContainer);

   py::class_<RandomEffectsTrackerCpp>(m, "RandomEffectsTrackerCpp")
     .def(py::init<py::array_t<int>>())
+    .def("GetUniqueGroupIds", &RandomEffectsTrackerCpp::GetUniqueGroupIds)
     .def("GetTracker", &RandomEffectsTrackerCpp::GetTracker);

   py::class_<RandomEffectsLabelMapperCpp>(m, "RandomEffectsLabelMapperCpp")
-    .def(py::init<RandomEffectsTrackerCpp&>())
+    .def(py::init<>())
+    .def("LoadFromTracker", &RandomEffectsLabelMapperCpp::LoadFromTracker)
     .def("SaveToJsonFile", &RandomEffectsLabelMapperCpp::SaveToJsonFile)
     .def("LoadFromJsonFile", &RandomEffectsLabelMapperCpp::LoadFromJsonFile)
     .def("DumpJsonString", 
&RandomEffectsLabelMapperCpp::DumpJsonString) diff --git a/stochtree/bart.py b/stochtree/bart.py index 0a93391c..2c82f190 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -294,7 +294,7 @@ def sample( if rfx_group_ids_train is not None: if not isinstance(rfx_group_ids_train, np.ndarray): raise ValueError("rfx_group_ids_train must be a numpy array") - if not np.issubdtype(rfx_group_ids_train, np.integer): + if not np.issubdtype(rfx_group_ids_train.dtype, np.integer): raise ValueError("rfx_group_ids_train must be a numpy array of integer-valued group IDs") if rfx_basis_train is not None: if not isinstance(rfx_basis_train, np.ndarray): @@ -302,7 +302,7 @@ def sample( if rfx_group_ids_test is not None: if not isinstance(rfx_group_ids_test, np.ndarray): raise ValueError("rfx_group_ids_test must be a numpy array") - if not np.issubdtype(rfx_group_ids_test, np.integer): + if not np.issubdtype(rfx_group_ids_test.dtype, np.integer): raise ValueError("rfx_group_ids_test must be a numpy array of integer-valued group IDs") if rfx_basis_test is not None: if not isinstance(rfx_basis_test, np.ndarray): @@ -324,6 +324,18 @@ def sample( if leaf_basis_test is not None: if leaf_basis_test.ndim == 1: leaf_basis_test = np.expand_dims(leaf_basis_test, 1) + if rfx_group_ids_train is not None: + if rfx_group_ids_train.ndim != 1: + rfx_group_ids_train = np.squeeze(rfx_group_ids_train) + if rfx_group_ids_test is not None: + if rfx_group_ids_test.ndim != 1: + rfx_group_ids_test = np.squeeze(rfx_group_ids_test) + if rfx_basis_train is not None: + if rfx_basis_train.ndim == 1: + rfx_basis_train = np.expand_dims(rfx_basis_train, 1) + if rfx_basis_test is not None: + if rfx_basis_test.ndim == 1: + rfx_basis_test = np.expand_dims(rfx_basis_test, 1) # Data checks if X_test is not None: @@ -701,15 +713,13 @@ def sample( # Fill in rfx basis as a vector of 1s (random intercept) if a basis not provided has_basis_rfx = False - num_basis_rfx = 0 if self.has_rfx: if rfx_basis_train is None: rfx_basis_train = np.ones((rfx_group_ids_train.shape[0],1)) else: has_basis_rfx = True - num_basis_rfx = rfx_basis_train.shape[1] - num_rfx_groups = len(np.unique(rfx_group_ids_train)) - num_rfx_components = rfx_basis_train.shape[0] + num_rfx_groups = np.unique(rfx_group_ids_train).shape[0] + num_rfx_components = rfx_basis_train.shape[1] # TODO warn if num_rfx_groups is 1 if has_rfx_test: if rfx_basis_test is None: @@ -722,10 +732,10 @@ def sample( if num_rfx_components == 1: alpha_init = np.array([1]) elif num_rfx_components > 1: - alpha_init = np.c_[np.ones(1), np.zeros(num_rfx_components-1)] + alpha_init = np.concatenate((np.ones(1), np.zeros(num_rfx_components-1))) else: raise ValueError("There must be at least 1 random effect component") - xi_init = np.tile(alpha_init, (1, num_rfx_groups)) + xi_init = np.tile(np.expand_dims(alpha_init, 1), (1, num_rfx_groups)) sigma_alpha_init = np.identity(num_rfx_components) sigma_xi_init = np.identity(num_rfx_components) sigma_xi_shape = 1. 
@@ -741,7 +751,8 @@ def sample( rfx_model.set_group_parameter_covariance(sigma_xi_init) rfx_model.set_variance_prior_shape(sigma_xi_shape) rfx_model.set_variance_prior_scale(sigma_xi_scale) - self.rfx_container = RandomEffectsContainer(num_rfx_components, num_rfx_groups, rfx_tracker) + self.rfx_container = RandomEffectsContainer() + self.rfx_container.load_new_container(num_rfx_components, num_rfx_groups, rfx_tracker) # Container of variance parameter samples self.num_gfr = num_gfr @@ -1083,6 +1094,8 @@ def sample( self.forest_container_mean.delete_sample(i) if self.include_variance_forest: self.forest_container_variance.delete_sample(i) + if self.has_rfx: + self.rfx_container.delete_sample(i) if self.sample_sigma_global: self.global_var_samples = self.global_var_samples[num_gfr:] if self.sample_sigma_leaf: @@ -1107,6 +1120,7 @@ def sample( ) self.y_hat_test = yhat_test_raw * self.y_std + self.y_bar + # TODO: make rfx_preds_train and rfx_preds_test persistent properties if self.has_rfx: rfx_preds_train = self.rfx_container.predict(rfx_group_ids_train, rfx_basis_train) * self.y_std if has_rfx_test: @@ -1155,8 +1169,7 @@ def sample( def predict( self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None, - rfx_group_ids: np.array = None, rfx_basis: np.array = None, - + rfx_group_ids: np.array = None, rfx_basis: np.array = None ) -> Union[np.array, tuple]: """Return predictions from every forest sampled (either / both of mean and variance). Return type is either a single array of predictions, if a BART model only includes a @@ -1249,7 +1262,7 @@ def predict( if self.include_mean_forest: mean_pred = mean_pred + rfx_preds else: - mean_pred = rfx_preds + mean_pred = rfx_preds + self.y_bar if self.include_variance_forest: variance_pred_raw = ( @@ -1276,7 +1289,10 @@ def predict( elif not has_mean_predictions and self.include_variance_forest: return variance_pred - def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array: + def predict_mean( + self, covariates: np.array, basis: np.array = None, + rfx_group_ids: np.array = None, rfx_basis: np.array = None + ) -> np.array: """Predict expected conditional outcome from a BART model. Parameters @@ -1298,9 +1314,10 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array ) raise NotSampledError(msg) - if not self.include_mean_forest: + has_mean_predictions = self.include_mean_forest or self.has_rfx + if not has_mean_predictions: msg = ( - "This BARTModel instance was not sampled with a mean forest. " + "This BARTModel instance was not sampled with a mean forest or random effects. " "Call 'fit' with appropriate arguments before using this model." 
) raise NotSampledError(msg) @@ -1356,10 +1373,19 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array pred_dataset.add_basis(basis) # Mean forest predictions - mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict( - pred_dataset.dataset_cpp - ) - mean_pred = mean_pred_raw * self.y_std + self.y_bar + if self.include_mean_forest: + mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict( + pred_dataset.dataset_cpp + ) + mean_pred = mean_pred_raw * self.y_std + self.y_bar + + # RFX predictions + if self.has_rfx: + rfx_preds = self.rfx_container.predict(rfx_group_ids, rfx_basis) * self.y_std + if self.include_mean_forest: + mean_pred = mean_pred + rfx_preds + else: + mean_pred = rfx_preds + self.y_bar return mean_pred @@ -1471,6 +1497,10 @@ def to_json(self) -> str: if self.include_variance_forest: bart_json.add_forest(self.forest_container_variance) + # Add the rfx + if self.has_rfx: + bart_json.add_random_effects(self.rfx_container) + # Add global parameters bart_json.add_scalar("outcome_scale", self.y_std) bart_json.add_scalar("outcome_mean", self.y_bar) @@ -1480,6 +1510,7 @@ def to_json(self) -> str: bart_json.add_boolean("sample_sigma_leaf", self.sample_sigma_leaf) bart_json.add_boolean("include_mean_forest", self.include_mean_forest) bart_json.add_boolean("include_variance_forest", self.include_variance_forest) + bart_json.add_boolean("has_rfx", self.has_rfx) bart_json.add_scalar("num_gfr", self.num_gfr) bart_json.add_scalar("num_burnin", self.num_burnin) bart_json.add_scalar("num_mcmc", self.num_mcmc) @@ -1519,6 +1550,7 @@ def from_json(self, json_string: str) -> None: # Unpack forests self.include_mean_forest = bart_json.get_boolean("include_mean_forest") self.include_variance_forest = bart_json.get_boolean("include_variance_forest") + self.has_rfx = bart_json.get_boolean("has_rfx") if self.include_mean_forest: # TODO: don't just make this a placeholder that we overwrite self.forest_container_mean = ForestContainer(0, 0, False, False) @@ -1537,6 +1569,11 @@ def from_json(self, json_string: str) -> None: self.forest_container_variance.forest_container_cpp.LoadFromJson( bart_json.json_cpp, "forest_0" ) + + # Unpack random effects + if self.has_rfx: + self.rfx_container = RandomEffectsContainer() + self.rfx_container.load_from_json(bart_json, 0) # Unpack global parameters self.y_std = bart_json.get_scalar("outcome_scale") diff --git a/stochtree/bcf.py b/stochtree/bcf.py index c8dfd1b2..fec61de3 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -13,6 +13,7 @@ from .data import Dataset, Residual from .forest import Forest, ForestContainer from .preprocessing import CovariatePreprocessor, _preprocess_params +from .random_effects import RandomEffectsContainer, RandomEffectsDataset, RandomEffectsModel, RandomEffectsTracker from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError @@ -73,9 +74,13 @@ def sample( Z_train: np.array, y_train: np.array, pi_train: np.array = None, + rfx_group_ids_train: np.array = None, + rfx_basis_train: np.array = None, X_test: Union[pd.DataFrame, np.array] = None, Z_test: np.array = None, pi_test: np.array = None, + rfx_group_ids_test: np.array = None, + rfx_basis_test: np.array = None, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, @@ -97,6 +102,10 @@ def sample( Outcome to be modeled by the ensemble. pi_train : np.array Optional vector of propensity scores. 
If not provided, this will be estimated from the data. + rfx_group_ids_train : np.array, optional + Optional group labels used for an additive random effects model. + rfx_basis_train : np.array, optional + Optional basis for "random-slope" regression in an additive random effects model. X_test : np.array, optional Optional test set of covariates used to define "out of sample" evaluation data. Z_test : np.array, optional @@ -104,6 +113,11 @@ Must be provided if `X_test` is provided. pi_test : np.array, optional Optional test set vector of propensity scores. If not provided (but `X_test` and `Z_test` are), this will be estimated from the data. + rfx_group_ids_test : np.array, optional + Optional test set group labels used for an additive random effects model. We do not currently support (but plan to in the near future) + test set evaluation for group labels that were not in the training set. + rfx_basis_test : np.array, optional + Optional test set basis for "random-slope" regression in an additive random effects model. num_gfr : int, optional Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to `5`. num_burnin : int, optional @@ -357,6 +371,22 @@ if pi_test is not None: if not isinstance(pi_test, np.ndarray): raise ValueError("pi_test must be a numpy array") + if rfx_group_ids_train is not None: + if not isinstance(rfx_group_ids_train, np.ndarray): + raise ValueError("rfx_group_ids_train must be a numpy array") + if not np.issubdtype(rfx_group_ids_train.dtype, np.integer): + raise ValueError("rfx_group_ids_train must be a numpy array of integer-valued group IDs") + if rfx_basis_train is not None: + if not isinstance(rfx_basis_train, np.ndarray): + raise ValueError("rfx_basis_train must be a numpy array") + if rfx_group_ids_test is not None: + if not isinstance(rfx_group_ids_test, np.ndarray): + raise ValueError("rfx_group_ids_test must be a numpy array") + if not np.issubdtype(rfx_group_ids_test.dtype, np.integer): + raise ValueError("rfx_group_ids_test must be a numpy array of integer-valued group IDs") + if rfx_basis_test is not None: + if not isinstance(rfx_basis_test, np.ndarray): + raise ValueError("rfx_basis_test must be a numpy array") # Convert everything to standard shape (2-dimensional) if isinstance(X_train, np.ndarray): @@ -380,6 +410,18 @@ if pi_test is not None: if pi_test.ndim == 1: pi_test = np.expand_dims(pi_test, 1) + if rfx_group_ids_train is not None: + if rfx_group_ids_train.ndim != 1: + rfx_group_ids_train = np.squeeze(rfx_group_ids_train) + if rfx_group_ids_test is not None: + if rfx_group_ids_test.ndim != 1: + rfx_group_ids_test = np.squeeze(rfx_group_ids_test) + if rfx_basis_train is not None: + if rfx_basis_train.ndim == 1: + rfx_basis_train = np.expand_dims(rfx_basis_train, 1) + if rfx_basis_test is not None: + if rfx_basis_test.ndim == 1: + rfx_basis_test = np.expand_dims(rfx_basis_test, 1) # Original number of covariates num_cov_orig = X_train.shape[1] @@ -1128,6 +1170,59 @@ a_forest = 1.0 if not b_forest: b_forest = 1.0 + + # Runtime checks on RFX group ids + self.has_rfx = False + has_rfx_test = False + if rfx_group_ids_train is not None: + self.has_rfx = True + if rfx_group_ids_test is not None: + has_rfx_test = True + if not np.all(np.isin(rfx_group_ids_test, rfx_group_ids_train)): + raise ValueError("All random effect group labels provided in rfx_group_ids_test must be present in rfx_group_ids_train") + + # Fill in rfx basis as a vector of 1s (random 
intercept) if a basis is not provided + has_basis_rfx = False + if self.has_rfx: + if rfx_basis_train is None: + rfx_basis_train = np.ones((rfx_group_ids_train.shape[0],1)) + else: + has_basis_rfx = True + num_rfx_groups = np.unique(rfx_group_ids_train).shape[0] + num_rfx_components = rfx_basis_train.shape[1] + # TODO warn if num_rfx_groups is 1 + if has_rfx_test: + if rfx_basis_test is None: + if has_basis_rfx: + raise ValueError("Random effects basis was provided for the training set, so it must also be provided for the test set") + rfx_basis_test = np.ones((rfx_group_ids_test.shape[0],1)) + + # Set up random effects structures + if self.has_rfx: + if num_rfx_components == 1: + alpha_init = np.array([1]) + elif num_rfx_components > 1: + alpha_init = np.concatenate((np.ones(1), np.zeros(num_rfx_components-1))) + else: + raise ValueError("There must be at least 1 random effect component") + xi_init = np.tile(np.expand_dims(alpha_init, 1), (1, num_rfx_groups)) + sigma_alpha_init = np.identity(num_rfx_components) + sigma_xi_init = np.identity(num_rfx_components) + sigma_xi_shape = 1. + sigma_xi_scale = 1. + rfx_dataset_train = RandomEffectsDataset() + rfx_dataset_train.add_group_labels(rfx_group_ids_train) + rfx_dataset_train.add_basis(rfx_basis_train) + rfx_tracker = RandomEffectsTracker(rfx_group_ids_train) + rfx_model = RandomEffectsModel(num_rfx_components, num_rfx_groups) + rfx_model.set_working_parameter(alpha_init) + rfx_model.set_group_parameters(xi_init) + rfx_model.set_working_parameter_covariance(sigma_alpha_init) + rfx_model.set_group_parameter_covariance(sigma_xi_init) + rfx_model.set_variance_prior_shape(sigma_xi_shape) + rfx_model.set_variance_prior_scale(sigma_xi_scale) + self.rfx_container = RandomEffectsContainer() + self.rfx_container.load_new_container(num_rfx_components, num_rfx_groups, rfx_tracker) # Update variable weights variable_counts = [original_var_indices.count(i) for i in original_var_indices] @@ -1516,6 +1611,12 @@ def sample( self.leaf_scale_tau_samples[sample_counter] = ( current_leaf_scale_tau[0, 0] ) + + # Sample random effects + if self.has_rfx: + rfx_model.sample( + rfx_dataset_train, residual_train, rfx_tracker, self.rfx_container, keep_sample, current_sigma2, cpp_rng + ) # Run MCMC if num_burnin + num_mcmc > 0: @@ -1658,6 +1759,12 @@ def sample( self.leaf_scale_tau_samples[sample_counter] = ( current_leaf_scale_tau[0, 0] ) + + # Sample random effects + if self.has_rfx: + rfx_model.sample( + rfx_dataset_train, residual_train, rfx_tracker, self.rfx_container, keep_sample, current_sigma2, cpp_rng + ) # Mark the model as sampled self.sampled = True @@ -1669,6 +1776,8 @@ def sample( self.forest_container_tau.delete_sample(i) if self.include_variance_forest: self.forest_container_variance.delete_sample(i) + if self.has_rfx: + self.rfx_container.delete_sample(i) if self.adaptive_coding: self.b1_samples = self.b1_samples[num_gfr:] self.b0_samples = self.b0_samples[num_gfr:] @@ -1725,6 +1834,15 @@ def sample( treatment_term_test = Z_test * np.squeeze(self.tau_hat_test) self.y_hat_test = self.mu_hat_test + treatment_term_test + # TODO: make rfx_preds_train and rfx_preds_test persistent properties + if self.has_rfx: + rfx_preds_train = self.rfx_container.predict(rfx_group_ids_train, rfx_basis_train) * self.y_std + if has_rfx_test: + rfx_preds_test = self.rfx_container.predict(rfx_group_ids_test, rfx_basis_test) * self.y_std + self.y_hat_train = self.y_hat_train + rfx_preds_train + if self.has_test: + self.y_hat_test = self.y_hat_test + rfx_preds_test + if 
self.include_variance_forest: sigma2_x_train_raw = ( self.forest_container_variance.forest_container_cpp.Predict( @@ -1938,7 +2056,7 @@ def predict_variance( return variance_pred - def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tuple: + def predict(self, X: np.array, Z: np.array, propensity: np.array = None, rfx_group_ids: np.array = None, rfx_basis: np.array = None) -> tuple: """Predict outcome model components (CATE function and prognostic function) as well as overall outcome for every provided observation. Predicted outcomes are computed as `yhat = mu_x + Z*tau_x` where mu_x is a sample of the prognostic function and tau_x is a sample of the treatment effect (CATE) function. @@ -1950,6 +2068,10 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl Test set treatment indicators. propensity : `np.array`, optional Optional test set propensities. Must be provided if propensities were provided when the model was sampled. + rfx_group_ids : np.array, optional + Optional group labels used for an additive random effects model. + rfx_basis : np.array, optional + Optional basis for "random-slope" regression in an additive random effects model. Returns ------- @@ -1957,6 +2079,8 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl Conditional average treatment effect (CATE) samples for every observation provided. mu_x : np.array Prognostic effect samples for every observation provided. + rfx : np.array, optional + Random effect samples for every observation provided, if the model includes a random effects term. yhat_x : np.array Outcome prediction samples for every observation provided. sigma2_x : np.array, optional @@ -2031,6 +2155,10 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl else: treatment_term = Z * np.squeeze(tau_x) yhat_x = mu_x + treatment_term + + if self.has_rfx: + rfx_preds = self.rfx_container.predict(rfx_group_ids, rfx_basis) * self.y_std + yhat_x = yhat_x + rfx_preds # Compute predictions from the variance forest (if included) if self.include_variance_forest: @@ -2051,8 +2179,12 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> tupl ) # Return result matrices as a tuple - if self.include_variance_forest: + if self.has_rfx and self.include_variance_forest: + return (tau_x, mu_x, rfx_preds, yhat_x, sigma2_x) + elif not self.has_rfx and self.include_variance_forest: return (tau_x, mu_x, yhat_x, sigma2_x) + elif self.has_rfx and not self.include_variance_forest: + return (tau_x, mu_x, rfx_preds, yhat_x) else: return (tau_x, mu_x, yhat_x) @@ -2082,6 +2214,10 @@ def to_json(self) -> str: if self.include_variance_forest: bcf_json.add_forest(self.forest_container_variance) + # Add the rfx + if self.has_rfx: + bcf_json.add_random_effects(self.rfx_container) + # Add global parameters bcf_json.add_scalar("variance_scale", self.variance_scale) bcf_json.add_scalar("outcome_scale", self.y_std) @@ -2092,6 +2228,7 @@ def to_json(self) -> str: bcf_json.add_boolean("sample_sigma_leaf_mu", self.sample_sigma_leaf_mu) bcf_json.add_boolean("sample_sigma_leaf_tau", self.sample_sigma_leaf_tau) bcf_json.add_boolean("include_variance_forest", self.include_variance_forest) + bcf_json.add_boolean("has_rfx", self.has_rfx) bcf_json.add_scalar("num_gfr", self.num_gfr) bcf_json.add_scalar("num_burnin", self.num_burnin) bcf_json.add_scalar("num_mcmc", self.num_mcmc) @@ -2145,6 +2282,7 @@ def from_json(self, json_string: str) -> None: # Unpack forests 
self.include_variance_forest = bcf_json.get_boolean("include_variance_forest") + self.has_rfx = bcf_json.get_boolean("has_rfx") # TODO: don't just make this a placeholder that we overwrite self.forest_container_mu = ForestContainer(0, 0, False, False) self.forest_container_mu.forest_container_cpp.LoadFromJson( bcf_json.json_cpp, "forest_0" ) @@ -2161,6 +2299,11 @@ self.forest_container_variance.forest_container_cpp.LoadFromJson( bcf_json.json_cpp, "forest_2" ) + + # Unpack random effects + if self.has_rfx: + self.rfx_container = RandomEffectsContainer() + self.rfx_container.load_from_json(bcf_json, 0) # Unpack global parameters self.variance_scale = bcf_json.get_scalar("variance_scale") diff --git a/stochtree/random_effects.py b/stochtree/random_effects.py index 709b83b7..0badcbb3 100644 --- a/stochtree/random_effects.py +++ b/stochtree/random_effects.py @@ -173,24 +173,51 @@ class RandomEffectsContainer: 0-indexed label numbers used to place group samples in memory (i.e. the first label is stored in column 0 of the sample matrix, the second label is stored in column 1 of the sample matrix, etc...). - - Parameters - ---------- - num_components : int - Number of components (bases) in a random effects model. For the simplest random effects model, - in which each group has a different random intercept, this is 1, and the basis is a trivial - "dummy" intercept vector. - num_groups : int - Number of groups in a random effects model. """ - def __init__( - self, num_components: int, num_groups: int, rfx_tracker: RandomEffectsTracker - ) -> None: - self.rfx_container_cpp = RandomEffectsContainerCpp(num_components, num_groups) - self.rfx_label_mapper_cpp = RandomEffectsLabelMapperCpp( - rfx_tracker.rfx_tracker_cpp - ) + def __init__(self) -> None: + pass + + def load_new_container(self, num_components: int, num_groups: int, rfx_tracker: RandomEffectsTracker) -> None: + """ + Initializes internal data structures for an "empty" random effects container to be sampled and populated. + + Parameters + ---------- + num_components : int + Number of components (bases) in a random effects model. For the simplest random effects model, + in which each group has a different random intercept, this is 1, and the basis is a trivial + "dummy" intercept vector. + num_groups : int + Number of groups in a random effects model. + rfx_tracker : RandomEffectsTracker + Tracking data structures for random effects models. + """ + self.rfx_container_cpp = RandomEffectsContainerCpp() + self.rfx_container_cpp.SetComponentsAndGroups(num_components, num_groups) + self.rfx_label_mapper_cpp = RandomEffectsLabelMapperCpp() + self.rfx_label_mapper_cpp.LoadFromTracker(rfx_tracker.rfx_tracker_cpp) + self.rfx_group_ids = rfx_tracker.rfx_tracker_cpp.GetUniqueGroupIds() + + def load_from_json(self, json, rfx_num: int) -> None: + """ + Initializes internal data structures for a random effects container from the JSON representation of a previously sampled model. + + Parameters + ---------- + json : JSONSerializer + Python object wrapping a C++ `json` object. + rfx_num : int + Integer index of the RFX term in a JSON model. In practice, this is typically 0 (most models don't contain two RFX terms). 
+ """ + rfx_container_key = f'random_effect_container_{rfx_num:d}' + rfx_label_mapper_key = f'random_effect_label_mapper_{rfx_num:d}' + rfx_group_ids_key = f'random_effect_groupids_{rfx_num:d}' + self.rfx_container_cpp = RandomEffectsContainerCpp() + self.rfx_container_cpp.LoadFromJson(json.json_cpp, rfx_container_key) + self.rfx_label_mapper_cpp = RandomEffectsLabelMapperCpp() + self.rfx_label_mapper_cpp.LoadFromJson(json.json_cpp, rfx_label_mapper_key) + self.rfx_group_ids = json.get_integer_vector(rfx_group_ids_key, "random_effects") def num_samples(self) -> int: return self.rfx_container_cpp.NumSamples() @@ -214,6 +241,7 @@ def load_from_json_string(self, json_string: str) -> None: In-memory string containing state of a random effects container. """ self.rfx_container_cpp.LoadFromJsonString(json_string) + # TODO: re-initialize label mapper def predict(self, group_labels: np.array, basis: np.array) -> np.ndarray: """ diff --git a/stochtree/serialization.py b/stochtree/serialization.py index 4a61a7e1..844ee54d 100644 --- a/stochtree/serialization.py +++ b/stochtree/serialization.py @@ -15,7 +15,9 @@ class JSONSerializer: def __init__(self) -> None: self.json_cpp = JsonCpp() self.num_forests = 0 + self.num_rfx = 0 self.forest_labels = [] + self.rfx_labels = [] def return_json_string(self) -> str: """ @@ -51,6 +53,20 @@ def add_forest(self, forest_samples: ForestContainer) -> None: self.num_forests += 1 self.forest_labels.append(forest_label) + def add_random_effects(self, rfx_container: RandomEffectsContainer) -> None: + """Adds a container of random effect samples to a json object + + Parameters + ---------- + rfx_container : RandomEffectsContainer + Samples of a random effects model + """ + _ = self.json_cpp.AddRandomEffectsContainer(rfx_container.rfx_container_cpp) + _ = self.json_cpp.AddRandomEffectsLabelMapper(rfx_container.rfx_label_mapper_cpp) + _ = self.json_cpp.AddRandomEffectsGroupIDs(rfx_container.rfx_group_ids) + self.json_cpp.IncrementRandomEffectsCount() + self.num_rfx += 1 + def add_scalar( self, field_name: str, field_value: float, subfolder_name: str = None ) -> None: @@ -371,7 +387,7 @@ def get_random_effects_container( In-memory `RandomEffectsContainer` python object, created from JSON """ # TODO: read this from JSON - result = RandomEffectsContainer(0, 0) + result = RandomEffectsContainer() result.random_effects_container_cpp.LoadFromJson( self.json_cpp, random_effects_str ) diff --git a/test/python/test_json.py b/test/python/test_json.py index 5a43e855..1e3d9c5a 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -345,6 +345,78 @@ def outcome_mean(X, W): y_hat_reloaded = bart_reloaded.predict(X, W) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) + def test_bart_rfx_string(self): + # RNG + random_seed = 1234 + rng = np.random.default_rng(random_seed) + + # Generate covariates and basis + n = 1000 + p_X = 10 + p_W = 1 + X = rng.uniform(0, 1, (n, p_X)) + W = rng.uniform(0, 1, (n, p_W)) + + # Generate random effects terms + num_basis = 2 + num_groups = 4 + group_labels = rng.choice(num_groups, size=n) + basis = np.empty((n, num_basis)) + basis[:, 0] = 1.0 + if num_basis > 1: + basis[:, 1:] = rng.uniform(-1, 1, (n, num_basis - 1)) + + # Define the outcome mean function + def outcome_mean(X, W): + return np.where( + (X[:, 0] >= 0.0) & (X[:, 0] < 0.25), + -7.5 * W[:, 0], + np.where( + (X[:, 0] >= 0.25) & (X[:, 0] < 0.5), + -2.5 * W[:, 0], + np.where( + (X[:, 0] >= 0.5) & (X[:, 0] < 0.75), + 2.5 * W[:, 0], + 7.5 * W[:, 0], + ), + ), + ) + + # 
Define the group rfx function + def rfx_mean(group_labels, basis): + return np.where( + group_labels == 0, + 0 - 1 * basis[:, 1], + np.where( + group_labels == 1, + 4 + 1 * basis[:, 1], + np.where( + group_labels == 2, 8 + 3 * basis[:, 1], 12 + 5 * basis[:, 1] + ), + ), + ) + + # Generate outcome + epsilon = rng.normal(0, 1, n) + forest_term = outcome_mean(X, W) + rfx_term = rfx_mean(group_labels, basis) + y = forest_term + rfx_term + epsilon + + # Run BART + bart_orig = BARTModel() + bart_orig.sample(X_train=X, y_train=y, leaf_basis_train=W, rfx_group_ids_train=group_labels, + rfx_basis_train=basis, num_gfr=10, num_mcmc=10) + + # Extract predictions from the sampler + y_hat_orig = bart_orig.predict(X, W, group_labels, basis) + + # "Round-trip" the model to JSON string and back and check that the predictions agree + bart_json_string = bart_orig.to_json() + bart_reloaded = BARTModel() + bart_reloaded.from_json(bart_json_string) + y_hat_reloaded = bart_reloaded.predict(X, W, group_labels, basis) + np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) + def test_bcf_string(self): # RNG random_seed = 1234 @@ -385,6 +457,71 @@ def test_bcf_string(self): np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded) + def test_bcf_rfx_string(self): + # RNG + random_seed = 1234 + rng = np.random.default_rng(random_seed) + + # Generate covariates and basis + n = 100 + p_X = 5 + X = rng.uniform(0, 1, (n, p_X)) + pi_X = 0.25 + 0.5 * X[:, 0] + Z = rng.binomial(1, pi_X, n).astype(float) + + # Define the outcome mean functions (prognostic and treatment effects) + mu_X = pi_X * 5 + tau_X = X[:, 1] * 2 + + # Generate random effects terms + num_basis = 2 + num_groups = 4 + group_labels = rng.choice(num_groups, size=n) + basis = np.empty((n, num_basis)) + basis[:, 0] = 1.0 + if num_basis > 1: + basis[:, 1:] = rng.uniform(-1, 1, (n, num_basis - 1)) + + # Define the group rfx function + def rfx_mean(group_labels, basis): + return np.where( + group_labels == 0, + 0 - 1 * basis[:, 1], + np.where( + group_labels == 1, + 4 + 1 * basis[:, 1], + np.where( + group_labels == 2, 8 + 3 * basis[:, 1], 12 + 5 * basis[:, 1] + ), + ), + ) + + # Generate outcome + epsilon = rng.normal(0, 1, n) + rfx_term = rfx_mean(group_labels, basis) + y = mu_X + tau_X * Z + rfx_term + epsilon + + # Run BCF + bcf_orig = BCFModel() + bcf_orig.sample( + X_train=X, Z_train=Z, y_train=y, pi_train=pi_X, rfx_group_ids_train=group_labels, rfx_basis_train=basis, num_gfr=10, num_mcmc=10 + ) + + # Extract predictions from the sampler (BCF predict returns tau, mu, rfx, yhat in that order) + tau_hat_orig, mu_hat_orig, rfx_hat_orig, y_hat_orig = bcf_orig.predict(X, Z, pi_X, group_labels, basis) + + # "Round-trip" the model to JSON string and back and check that the predictions agree + bcf_json_string = bcf_orig.to_json() + bcf_reloaded = BCFModel() + bcf_reloaded.from_json(bcf_json_string) + tau_hat_reloaded, mu_hat_reloaded, rfx_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict( + X, Z, pi_X, group_labels, basis + ) + np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) + np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) + np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded) + np.testing.assert_almost_equal(rfx_hat_orig, rfx_hat_reloaded) + def test_bcf_propensity_string(self): # RNG random_seed = 1234 diff --git a/test/python/test_random_effects.py b/test/python/test_random_effects.py index aaeaa0c3..f6616240 100644 --- a/test/python/test_random_effects.py +++ b/test/python/test_random_effects.py @@ -56,7 
+56,8 @@ def outcome_mean(group_labels, basis): rfx_model.set_group_parameter_covariance(np.identity(num_basis)) rfx_model.set_variance_prior_shape(1.0) rfx_model.set_variance_prior_scale(1.0) - rfx_container = RandomEffectsContainer(num_basis, num_groups, rfx_tracker) + rfx_container = RandomEffectsContainer() + rfx_container.load_new_container(num_basis, num_groups, rfx_tracker) cpp_rng = RNG() # Sample the model @@ -121,7 +122,8 @@ def outcome_mean(group_labels, basis): rfx_model.set_group_parameter_covariance(np.identity(num_basis)) rfx_model.set_variance_prior_shape(1.0) rfx_model.set_variance_prior_scale(1.0) - rfx_container = RandomEffectsContainer(num_basis, num_groups, rfx_tracker) + rfx_container = RandomEffectsContainer() + rfx_container.load_new_container(num_basis, num_groups, rfx_tracker) cpp_rng = RNG() # Sample the model From f454c1fe7660637472d36df9012d4c7720d197de Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 16 Mar 2025 18:42:47 -0500 Subject: [PATCH 28/35] Placeholder kernel computations and R / Python comparison scripts --- R/kernel.R | 2 +- demo/debug/r_comparison_debug.py | 22 +++++++++++++ stochtree/kernel.py | 47 +++++++++++++++++++++++++++ tools/debug/python_comparison_debug.R | 18 ++++++++++ 4 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 demo/debug/r_comparison_debug.py create mode 100644 stochtree/kernel.py create mode 100644 tools/debug/python_comparison_debug.R diff --git a/R/kernel.R b/R/kernel.R index ac10b887..3a8856fb 100644 --- a/R/kernel.R +++ b/R/kernel.R @@ -35,7 +35,7 @@ #' @param forest_inds (Optional) Indices of the forest sample(s) for which to compute leaf indices. If not provided, #' this function will return leaf indices for every sample of a forest. #' This function uses 0-indexing, so the first forest sample corresponds to `forest_num = 0`, and so on. -#' @return List of vectors. Each vector is of size `num_obs * num_trees`, where `num_obs = nrow(covariates)` +#' @return Vector of size `num_obs * num_trees`, where `num_obs = nrow(covariates)` #' and `num_trees` is the number of trees in the relevant forest of `model_object`. 
#' @export #' diff --git a/demo/debug/r_comparison_debug.py b/demo/debug/r_comparison_debug.py new file mode 100644 index 00000000..4df9d12c --- /dev/null +++ b/demo/debug/r_comparison_debug.py @@ -0,0 +1,22 @@ +# R Comparison Demo Script + +# Load necessary libraries +import numpy as np +import pandas as pd +from stochtree import BARTModel + +# Load data +df = pd.read_csv("debug/data/heterosked_train.csv") +y = df.loc[:,'y'].to_numpy() +X = df.loc[:,['X1','X2','X3','X4','X5','X6','X7','X8','X9','X10']].to_numpy() +y = y.astype(np.float64) +X = X.astype(np.float64) + +# Run BART +bart_model = BARTModel() +bart_model.sample(X_train=X, y_train=y, num_gfr=0, num_mcmc=10, general_params={'random_seed': 1234, 'standardize': False, 'sample_sigma2_global': True}) + +# Inspect the MCMC (BART) samples +y_avg_mcmc = np.squeeze(bart_model.y_hat_train).mean(axis = 1, keepdims = True) +print(y_avg_mcmc[:20]) +print(bart_model.global_var_samples) diff --git a/stochtree/kernel.py b/stochtree/kernel.py new file mode 100644 index 00000000..ccf83ddd --- /dev/null +++ b/stochtree/kernel.py @@ -0,0 +1,47 @@ +from typing import Union + +import pandas as pd +import numpy as np +from stochtree import BARTModel, BCFModel, ForestContainer + +from .data import Residual +from .sampler import RNG + + +def compute_forest_leaf_indices(model_object: Union[BARTModel, BCFModel, ForestContainer], covariates: Union[np.array, pd.DataFrame], forest_type: str = None, forest_inds: Union[int, np.ndarray] = None): + """ + Compute and return a vector representation of a forest's leaf predictions for every observation in a dataset. + + The vector has a "row-major" format that can be easily re-represented as a CSR sparse matrix: elements are organized so that the first `n` elements + correspond to leaf predictions for all `n` observations in a dataset for the first tree in an ensemble, the next `n` elements correspond to predictions for + the second tree and so on. The "data" for each element corresponds to a uniquely mapped column index that corresponds to a single leaf of a single tree (i.e. + if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's leaf indices begin at 3, etc...). + + Parameters + ---------- + model_object : BARTModel, BCFModel, or ForestContainer + Object corresponding to a BART / BCF model with at least one forest sample, or a low-level `ForestContainer` object. + covariates : np.array or pd.DataFrame + Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest. + forest_type : str + Which forest to use from `model_object`. Valid inputs depend on the model type, and whether or not a given forest was sampled in that model. + + * **BART** + * `'mean'`: Extracts leaf indices for the mean forest + * `'variance'`: Extracts leaf indices for the variance forest + * **BCF** + * `'prognostic'`: Extracts leaf indices for the prognostic forest + * `'treatment'`: Extracts leaf indices for the treatment effect forest + * `'variance'`: Extracts leaf indices for the variance forest + * **ForestContainer** + * `None`: It is not necessary to disambiguate when this function is called directly on a `ForestContainer` object. This is the default value of this parameter. + + forest_inds : int or np.ndarray + Indices of the forest sample(s) for which to compute leaf indices. If not provided, this function will return leaf indices for every sample of a forest. 
+ This function uses 0-indexing, so the first forest sample corresponds to `forest_num = 0`, and so on. + + Returns + ------- + Numpy array with `num_obs * num_trees` rows, where `num_obs` is the number of rows in `covariates` and `num_trees` is the number of trees in the relevant forest of `model_object`. + """ + pass diff --git a/tools/debug/python_comparison_debug.R b/tools/debug/python_comparison_debug.R new file mode 100644 index 00000000..d7f62b82 --- /dev/null +++ b/tools/debug/python_comparison_debug.R @@ -0,0 +1,18 @@ +library(stochtree) + +df <- read.csv("debug/data/heterosked_train.csv") +y <- df[,"y"] +X <- df[,c('X1','X2','X3','X4','X5','X6','X7','X8','X9','X10')] + +num_gfr <- 0 +num_burnin <- 0 +num_mcmc <- 10 +general_params <- list(random_seed = 1234, standardize = F, sample_sigma2_global = T) +bart_model <- stochtree::bart( + X_train = X, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params +) + +rowMeans(bart_model$y_hat_train)[1:20] +bart_model$sigma2_global_samples \ No newline at end of file From a6caffb69514026abd2798ae2e9c68a58de9abb0 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 17 Mar 2025 00:10:14 -0500 Subject: [PATCH 29/35] Updated demo scripts --- demo/debug/causal_inference.py | 22 +++++++++---------- ...multivariate_treatment_causal_inference.py | 2 -- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/demo/debug/causal_inference.py b/demo/debug/causal_inference.py index 0aa1bb0b..fb77367e 100644 --- a/demo/debug/causal_inference.py +++ b/demo/debug/causal_inference.py @@ -63,38 +63,38 @@ # Run BCF bcf_model = BCFModel() -bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100) +bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=1000) # Inspect the MCMC (BART) samples -forest_preds_y_mcmc = bcf_model.y_hat_test[:,bcf_model.num_gfr:] +forest_preds_y_mcmc = bcf_model.y_hat_test y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True) y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=["True outcome", "Average estimated outcome"]) sns.scatterplot(data=y_df_mcmc, x="Average estimated outcome", y="True outcome") plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -forest_preds_tau_mcmc = bcf_model.tau_hat_test[:,bcf_model.num_gfr:] +forest_preds_tau_mcmc = bcf_model.tau_hat_test tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True) tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=["True tau", "Average estimated tau"]) sns.scatterplot(data=tau_df_mcmc, x="Average estimated tau", y="True tau") plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -forest_preds_mu_mcmc = bcf_model.mu_hat_test[:,bcf_model.num_gfr:] +forest_preds_mu_mcmc = bcf_model.mu_hat_test mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True) mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=["True mu", "Average estimated mu"]) sns.scatterplot(data=mu_df_mcmc, x="Average estimated mu", y="True mu") plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -# sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), 
np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=["Sample", "Sigma"]) -# sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") -# plt.show() +sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples),axis=1), np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=["Sample", "Sigma"]) +sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") +plt.show() -# b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=["Sample", "Beta_0", "Beta_1"]) -# sns.scatterplot(data=b_df_mcmc, x="Sample", y="Beta_0") -# sns.scatterplot(data=b_df_mcmc, x="Sample", y="Beta_1") -# plt.show() +b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=["Sample", "Beta_0", "Beta_1"]) +sns.scatterplot(data=b_df_mcmc, x="Sample", y="Beta_0") +sns.scatterplot(data=b_df_mcmc, x="Sample", y="Beta_1") +plt.show() # Compute RMSEs y_rmse = np.sqrt(np.mean(np.power(np.expand_dims(y_test,1) - y_avg_mcmc, 2))) diff --git a/demo/debug/multivariate_treatment_causal_inference.py b/demo/debug/multivariate_treatment_causal_inference.py index 317480a3..8cc73051 100644 --- a/demo/debug/multivariate_treatment_causal_inference.py +++ b/demo/debug/multivariate_treatment_causal_inference.py @@ -44,5 +44,3 @@ # Run BCF bcf_model = BCFModel() bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100) - - From ae13cb01fedffd21ad37e0dbcbfbb13ac88230b3 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 17 Mar 2025 11:20:26 -0500 Subject: [PATCH 30/35] Updated demos and forest initialization python code --- demo/debug/serialization.py | 21 +++++++++++++++++---- stochtree/forest.py | 12 ++++++++---- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/demo/debug/serialization.py b/demo/debug/serialization.py index 0ba48903..4ee14cee 100644 --- a/demo/debug/serialization.py +++ b/demo/debug/serialization.py @@ -1,7 +1,8 @@ import numpy as np from stochtree import ( BARTModel, JSONSerializer, ForestContainer, Dataset, Residual, - RNG, ForestSampler, ForestContainer, GlobalVarianceModel + RNG, ForestSampler, ForestContainer, GlobalVarianceModel, + GlobalModelConfig, ForestModelConfig, Forest ) # RNG @@ -53,6 +54,7 @@ def outcome_mean(X, W): leaf_regression = True feature_types = np.repeat(0, p_X).astype(int) # 0 = numeric var_weights = np.repeat(1/p_X, p_X) +leaf_model_type = 1 if p_W == 1 else 2 # Dataset (covariates and basis) dataset = Dataset() @@ -64,7 +66,14 @@ def outcome_mean(X, W): # Forest samplers and temporary tracking data structures forest_container = ForestContainer(num_trees, W.shape[1], False, False) -forest_sampler = ForestSampler(dataset, feature_types, num_trees, n, alpha, beta, min_samples_leaf) +active_forest = Forest(num_trees, W.shape[1], False, False) +global_config = GlobalModelConfig(global_error_variance=global_variance_init) +forest_config = ForestModelConfig(num_trees=num_trees, num_features=p_X, num_observations=n, + feature_types=feature_types, variable_weights=var_weights, + leaf_dimension=W.shape[1], alpha=alpha, beta=beta, + min_samples_leaf=min_samples_leaf, leaf_model_type=leaf_model_type, + leaf_model_scale=leaf_prior_scale, 
cutpoint_grid_size=cutpoint_grid_size) +forest_sampler = ForestSampler(dataset, global_config=global_config, forest_config=forest_config) cpp_rng = RNG(random_seed) global_var_model = GlobalVarianceModel() @@ -74,14 +83,18 @@ def outcome_mean(X, W): num_samples = num_warmstart + num_mcmc global_var_samples = np.concatenate((np.array([global_variance_init]), np.repeat(0, num_samples))) +# Initialize the forest +constant_leaf_value = np.repeat(0.0, p_W) +active_forest.set_root_leaves(constant_leaf_value) + # Run "grow-from-root" sampler for i in range(num_warmstart): - forest_sampler.sample_one_iteration(forest_container, dataset, residual, cpp_rng, feature_types, cutpoint_grid_size, leaf_prior_scale, var_weights, 1., 1., global_var_samples[i], 1, True, False) + forest_sampler.sample_one_iteration(forest_container, active_forest, dataset, residual, cpp_rng, global_config, forest_config, True, False) global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global) # Run MCMC sampler for i in range(num_warmstart, num_samples): - forest_sampler.sample_one_iteration(forest_container, dataset, residual, cpp_rng, feature_types, cutpoint_grid_size, leaf_prior_scale, var_weights, 1., 1., global_var_samples[i], 1, False, False) + forest_sampler.sample_one_iteration(forest_container, active_forest, dataset, residual, cpp_rng, global_config, forest_config, False, False) global_var_samples[i+1] = global_var_model.sample_one_iteration(residual, cpp_rng, a_global, b_global) # Extract predictions from the sampler diff --git a/stochtree/forest.py b/stochtree/forest.py index 2638c577..8fbf377c 100644 --- a/stochtree/forest.py +++ b/stochtree/forest.py @@ -890,10 +890,14 @@ def set_root_leaves(self, leaf_value: Union[float, np.array]) -> None: if not isinstance(leaf_value, np.ndarray) and not isinstance(leaf_value, float): raise ValueError("leaf_value must be either a float or np.array") if isinstance(leaf_value, np.ndarray): - leaf_value = np.squeeze(leaf_value) - if len(leaf_value.shape) != 1: - raise ValueError("leaf_value must be either a one-dimensional array") - self.forest_cpp.SetRootVector(leaf_value, leaf_value.shape[0]) + if len(leaf_value.shape) > 1: + leaf_value = np.squeeze(leaf_value) + if len(leaf_value.shape) != 1 or leaf_value.shape[0] != self.output_dimension: + raise ValueError("leaf_value must be a one-dimensional array with dimension equal to the output_dimension field of the forest") + if leaf_value.shape[0] > 1: + self.forest_cpp.SetRootVector(leaf_value, leaf_value.shape[0]) + else: + self.forest_cpp.SetRootValue(np.squeeze(leaf_value)) else: self.forest_cpp.SetRootValue(leaf_value) self.internal_forest_is_empty = False From eb32b1137ddf17190007e21afc96ce9d0d281b79 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 18 Mar 2025 00:33:17 -0500 Subject: [PATCH 31/35] Added kernel indices module --- src/py_stochtree.cpp | 44 +++++++++++++++++++ stochtree/__init__.py | 2 + stochtree/bart.py | 6 --- stochtree/bcf.py | 80 +++++++++++++++++++++++++++++++---- stochtree/forest.py | 11 +++++ stochtree/kernel.py | 61 +++++++++++++++++++++++++-- test/python/test_kernel.py | 86 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 272 insertions(+), 18 deletions(-) create mode 100644 test/python/test_kernel.py diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp index 6d28a237..ff165663 100644 --- a/src/py_stochtree.cpp +++ b/src/py_stochtree.cpp @@ -177,6 +177,10 @@ class ForestContainerCpp { return forest_samples_->OutputDimension(); } + int 
NumTrees() { + return num_trees_; + } + int NumSamples() { return forest_samples_->NumSamples(); } @@ -660,6 +664,10 @@ class ForestCpp { return forest_->OutputDimension(); } + int NumTrees() { + return num_trees_; + } + int NumLeavesForest() { return forest_->NumLeaves(); } @@ -1825,6 +1833,37 @@ class JsonCpp { std::unique_ptr json_; }; +py::array_t cppComputeForestContainerLeafIndices(ForestContainerCpp& forest_container, py::array_t& covariates, py::array_t& forest_nums) { + // Wrap an Eigen Map around the raw data of the covariate matrix + StochTree::data_size_t num_obs = covariates.shape(0); + int num_covariates = covariates.shape(1); + double* covariate_data_ptr = static_cast(covariates.mutable_data()); + Eigen::Map> covariates_eigen(covariate_data_ptr, num_obs, num_covariates); + + // Extract other output dimensions + int num_trees = forest_container.NumTrees(); + int num_samples = forest_nums.size(); + + // Convert forest_nums to std::vector + std::vector forest_indices(num_samples); + for (int i = 0; i < num_samples; i++) { + forest_indices[i] = forest_nums.at(i); + } + + // Compute leaf indices + auto result = py::array_t(py::detail::any_container({num_obs*num_trees, num_samples})); + int* output_data_ptr = static_cast(result.mutable_data()); + Eigen::Map> output_eigen(output_data_ptr, num_obs*num_trees, num_samples); + forest_container.GetContainer()->PredictLeafIndicesInplace(covariates_eigen, output_eigen, forest_indices, num_trees, num_obs); + + // Return matrix + return result; +} + +int cppComputeForestMaxLeafIndex(ForestContainerCpp& forest_container, int forest_num) { + return forest_container.GetForest(forest_num)->GetMaxLeafIndex(); +} + void ForestContainerCpp::LoadFromJson(JsonCpp& json, std::string forest_label) { nlohmann::json forest_json = json.SubsetJsonForest(forest_label); forest_samples_->Reset(); forest_samples_->from_json(forest_json); @@ -1891,6 +1930,9 @@ void RandomEffectsModelCpp::SampleRandomEffects(RandomEffectsDatasetCpp& rfx_dat } PYBIND11_MODULE(stochtree_cpp, m) { + m.def("cppComputeForestContainerLeafIndices", &cppComputeForestContainerLeafIndices, "Compute leaf indices of the forests in a forest container"); + m.def("cppComputeForestMaxLeafIndex", &cppComputeForestMaxLeafIndex, "Compute max leaf index of a forest in a forest container"); + py::class_(m, "JsonCpp") .def(py::init<>()) .def("LoadFile", &JsonCpp::LoadFile) @@ -1958,6 +2000,7 @@ PYBIND11_MODULE(stochtree_cpp, m) { py::class_(m, "ForestContainerCpp") .def(py::init()) .def("OutputDimension", &ForestContainerCpp::OutputDimension) + .def("NumTrees", &ForestContainerCpp::NumTrees) .def("NumSamples", &ForestContainerCpp::NumSamples) .def("DeleteSample", &ForestContainerCpp::DeleteSample) .def("Predict", &ForestContainerCpp::Predict) @@ -2003,6 +2046,7 @@ py::class_(m, "ForestCpp") .def(py::init()) .def("OutputDimension", &ForestCpp::OutputDimension) + .def("NumTrees", &ForestCpp::NumTrees) .def("NumLeavesForest", &ForestCpp::NumLeavesForest) .def("SumLeafSquared", &ForestCpp::SumLeafSquared) .def("ResetRoot", &ForestCpp::ResetRoot) diff --git a/stochtree/__init__.py b/stochtree/__init__.py index feb28402..24b43900 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -4,6 +4,7 @@ from .config import ForestModelConfig, GlobalModelConfig from .data import Dataset, Residual from .forest import Forest, ForestContainer +from .kernel import compute_forest_leaf_indices from .preprocessing import CovariatePreprocessor from .random_effects import ( RandomEffectsContainer, @@ -56,5 
"_check_matrix_square", "_standardize_array_to_list", "_standardize_array_to_np", + "compute_forest_leaf_indices", "calibrate_global_error_variance", ] diff --git a/stochtree/bart.py b/stochtree/bart.py index 6b1249c3..6608f576 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -1242,8 +1242,6 @@ def predict( ) covariates_processed = covariates else: - self._covariate_preprocessor = CovariatePreprocessor() - self._covariate_preprocessor.fit(covariates) covariates_processed = self._covariate_preprocessor.transform(covariates) # Dataset construction @@ -1364,8 +1362,6 @@ def predict_mean( ) covariates_processed = covariates else: - self._covariate_preprocessor = CovariatePreprocessor() - self._covariate_preprocessor.fit(covariates) covariates_processed = self._covariate_preprocessor.transform(covariates) # Dataset construction @@ -1448,8 +1444,6 @@ def predict_variance(self, covariates: np.array) -> np.array: ) covariates_processed = covariates else: - self._covariate_preprocessor = CovariatePreprocessor() - self._covariate_preprocessor.fit(covariates) covariates_processed = self._covariate_preprocessor.transform(covariates) # Dataset construction diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 452541e4..276c0f54 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -2,6 +2,7 @@ Bayesian Causal Forests (BCF) module """ +import warnings from typing import Any, Dict, Optional, Union import numpy as np @@ -1958,11 +1959,32 @@ def predict_tau( propensity = np.ones(X.shape[0]) propensity = np.expand_dims(propensity, 1) + + # Covariate preprocessing + if not self._covariate_preprocessor._check_is_fitted(): + if not isinstance(X, np.ndarray): + raise ValueError( + "Prediction cannot proceed on a pandas dataframe, since the BCF model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe." + ) + else: + warnings.warn( + "This BCF model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.", + RuntimeWarning, + ) + if not np.issubdtype( + X.dtype, np.floating + ) and not np.issubdtype(X.dtype, np.integer): + raise ValueError( + "Prediction cannot proceed on a non-numeric numpy array, since the BCF model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe." + ) + covariates_processed = X + else: + covariates_processed = self._covariate_preprocessor.transform(X) + # Update covariates to include propensities if requested if self.propensity_covariate == "none": - X_combined = X + X_combined = covariates_processed else: - X_combined = np.c_[X, propensity] + X_combined = np.c_[covariates_processed, propensity] # Forest dataset forest_dataset_test = Dataset() @@ -2022,17 +2044,38 @@ def predict_variance( if propensity.ndim == 1: propensity = np.expand_dims(propensity, 1) + + # Covariate preprocessing + if not self._covariate_preprocessor._check_is_fitted(): + if not isinstance(covariates, np.ndarray): + raise ValueError( + "Prediction cannot proceed on a pandas dataframe, since the BCF model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe." + ) + else: + warnings.warn( + "This BCF model has not run any covariate preprocessing routines. 
We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.", + RuntimeWarning, + ) + if not np.issubdtype( + covariates.dtype, np.floating + ) and not np.issubdtype(covariates.dtype, np.integer): + raise ValueError( + "Prediction cannot proceed on a non-numeric numpy array, since the BCF model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe." + ) + covariates_processed = covariates + else: + covariates_processed = self._covariate_preprocessor.transform(covariates) + # Update covariates to include propensities if requested if self.propensity_covariate == "none": - X_combined = covariates + X_combined = covariates_processed else: if propensity is not None: - X_combined = np.c_[covariates, propensity] + X_combined = np.c_[covariates_processed, propensity] else: # Dummy propensities if not provided but also not needed - propensity = np.ones(covariates.shape[0]) + propensity = np.ones(covariates_processed.shape[0]) propensity = np.expand_dims(propensity, 1) - X_combined = np.c_[covariates, propensity] + X_combined = np.c_[covariates_processed, propensity] # Forest dataset pred_dataset = Dataset() @@ -2124,12 +2167,33 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None, rfx_gro propensity = np.mean( self.bart_propensity_model.predict(X), axis=1, keepdims=True ) + + # Covariate preprocessing + if not self._covariate_preprocessor._check_is_fitted(): + if not isinstance(X, np.ndarray): + raise ValueError( + "Prediction cannot proceed on a pandas dataframe, since the BCF model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe." + ) + else: + warnings.warn( + "This BCF model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.", + RuntimeWarning, + ) + if not np.issubdtype( + X.dtype, np.floating + ) and not np.issubdtype(X.dtype, np.integer): + raise ValueError( + "Prediction cannot proceed on a non-numeric numpy array, since the BCF model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe." + ) + covariates_processed = X + else: + covariates_processed = self._covariate_preprocessor.transform(X) # Update covariates to include propensities if requested if self.propensity_covariate == "none": - X_combined = X + X_combined = covariates_processed else: - X_combined = np.c_[X, propensity] + X_combined = np.c_[covariates_processed, propensity] # Forest dataset forest_dataset_test = Dataset() diff --git a/stochtree/forest.py b/stochtree/forest.py index 8fbf377c..809a3908 100644 --- a/stochtree/forest.py +++ b/stochtree/forest.py @@ -662,6 +662,17 @@ def node_leaf_values( """ return self.forest_container_cpp.NodeLeafValues(forest_num, tree_num, node_id) + def num_samples(self) -> int: + """ + Number of forest samples in the ``ForestContainer``. + + Returns + ------- + int + Total number of forest samples. + """ + return self.forest_container_cpp.NumSamples() + def num_nodes(self, forest_num: int, tree_num: int) -> int: """ Number of nodes in a given tree in a given forest in the ``ForestContainer``. 
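The guards above route prediction-time covariates in `predict`, `predict_tau`, and `predict_variance` through the covariate preprocessor fitted during `sample`, rather than fitting a fresh preprocessor on the prediction set. A minimal sketch of the intended round trip, with hypothetical data and column names:

import numpy as np
import pandas as pd
from stochtree import BCFModel

rng = np.random.default_rng(101)
n = 200
X_df = pd.DataFrame({"x1": rng.uniform(size=n), "x2": rng.uniform(size=n)})
pi = 0.25 + 0.5 * X_df["x1"].to_numpy()
Z = rng.binomial(1, pi, n).astype(float)
y = 5 * pi + 2 * X_df["x2"].to_numpy() * Z + rng.normal(0, 1, n)

bcf_model = BCFModel()
bcf_model.sample(X_train=X_df, Z_train=Z, y_train=y, pi_train=pi, num_gfr=10, num_mcmc=10)

# Covariates are transformed with the preprocessor fitted in sample(), so a
# DataFrame with the training schema is handled consistently by every predict method
tau_hat, mu_hat, y_hat = bcf_model.predict(X_df, Z, pi)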
diff --git a/stochtree/kernel.py b/stochtree/kernel.py index ccf83ddd..c137227f 100644 --- a/stochtree/kernel.py +++ b/stochtree/kernel.py @@ -2,10 +2,11 @@ import pandas as pd import numpy as np -from stochtree import BARTModel, BCFModel, ForestContainer +from stochtree_cpp import cppComputeForestContainerLeafIndices, cppComputeForestMaxLeafIndex -from .data import Residual -from .sampler import RNG +from .bart import BARTModel +from .bcf import BCFModel +from .forest import ForestContainer def compute_forest_leaf_indices(model_object: Union[BARTModel, BCFModel, ForestContainer], covariates: Union[np.array, pd.DataFrame], forest_type: str = None, forest_inds: Union[int, np.ndarray] = None): @@ -44,4 +45,56 @@ def compute_forest_leaf_indices(model_object: Union[BARTModel, BCFModel, ForestC ------- Numpy array with dimensions `num_obs` by `num_trees`, where `num_obs` is the number of rows in `covaritates` and `num_trees` is the number of trees in the relevant forest of `model_object`. """ - pass + # Extract relevant forest container + if not isinstance(model_object, BARTModel) and not isinstance(model_object, BCFModel) and not isinstance(model_object, ForestContainer): + raise ValueError("model_object must be one of BARTModel, BCFModel, or ForestContainer") + if isinstance(model_object, BARTModel): + model_type = "bart" + if forest_type is None: + raise ValueError("forest_type must be specified for a BARTModel model_type (either set to 'mean' or 'variance')") + elif isinstance(model_object, BCFModel): + model_type = "bcf" + if forest_type is None: + raise ValueError("forest_type must be specified for a BCFModel model_type (either set to 'prognostic', 'treatment' or 'variance')") + else: + model_type = "forest" + if model_type == "bart": + if forest_type == "mean": + if not model_object.include_mean_forest: + raise ValueError("Mean forest was not sampled for model_object, but requested by forest_type") + forest_container = model_object.forest_container_mean + else: + if not model_object.include_variance_forest: + raise ValueError("Variance forest was not sampled for model_object, but requested by forest_type") + forest_container = model_object.forest_container_variance + elif model_type == "bcf": + if forest_type == "prognostic": + forest_container = model_object.forest_container_mu + elif forest_type == "treatment": + forest_container = model_object.forest_container_tau + else: + if not model_object.include_variance_forest: + raise ValueError("Variance forest was not sampled for model_object, but requested by forest_type") + forest_container = model_object.forest_container_variance + else: + forest_container = model_object + + if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + raise ValueError("covariates must be a matrix or dataframe") + + # Preprocess covariates + if model_type == "bart" or model_type == "bcf": + covariates_processed = model_object._covariate_preprocessor.transform(covariates) + else: + covariates_processed = covariates + covariates_processed = np.asfortranarray(covariates_processed) + + # Preprocess forest indices + num_forests = forest_container.num_samples() + if forest_inds is None: + forest_inds = np.arange(num_forests) + else: + if not np.all(forest_inds >= 0) or not np.all(forest_inds < num_forests): + raise ValueError("The indices in forest_inds must be >= 0 and < the total number of samples in a forest container") + + return cppComputeForestContainerLeafIndices(forest_container.forest_container_cpp, covariates_processed, 
diff --git a/test/python/test_kernel.py b/test/python/test_kernel.py
new file mode 100644
index 00000000..f5b711c5
--- /dev/null
+++ b/test/python/test_kernel.py
@@ -0,0 +1,86 @@
+import numpy as np
+import pandas as pd
+
+from stochtree import (
+    Dataset,
+    Forest,
+    ForestContainer,
+    compute_forest_leaf_indices
+)
+
+
+class TestJson:
+    def test_value(self):
+        # Create dataset
+        X = np.array(
+            [[1.5, 8.7, 1.2],
+             [2.7, 3.4, 5.4],
+             [3.6, 1.2, 9.3],
+             [4.4, 5.4, 10.4],
+             [5.3, 9.3, 3.6],
+             [6.1, 10.4, 4.4]]
+        )
+        n, p = X.shape
+        num_trees = 2
+        output_dim = 1
+        forest_dataset = Dataset()
+        forest_dataset.add_covariates(X)
+        forest_samples = ForestContainer(num_trees, output_dim, True, False)
+
+        # Initialize a forest with constant root predictions
+        forest_samples.add_sample(0.)
+
+        # Check that regular and "raw" predictions are the same (since the leaf is constant)
+        pred = forest_samples.predict(forest_dataset)
+        pred_raw = forest_samples.predict_raw(forest_dataset)
+
+        # Assertion
+        np.testing.assert_almost_equal(pred, pred_raw)
+
+        # Split the root of the first tree in the ensemble at X[,1] > 4.0
+        forest_samples.add_numeric_split(0, 0, 0, 0, 4.0, -5., 5.)
+
+        # Compute leaf indices and compare to expected values
+        computed = compute_forest_leaf_indices(forest_samples, X)
+        print(computed)
+        expected = np.array([
+            [0],
+            [0],
+            [0],
+            [1],
+            [1],
+            [1],
+            [2],
+            [2],
+            [2],
+            [2],
+            [2],
+            [2]
+        ])
+
+        # Assertion
+        np.testing.assert_almost_equal(computed, expected)
+
+        # Split the left leaf of the first tree in the ensemble at X[,2] > 4.0
+        forest_samples.add_numeric_split(0, 0, 1, 1, 4.0, -7.5, -2.5)
+
+        # Compute leaf indices and compare to expected values
+        computed = compute_forest_leaf_indices(forest_samples, X)
+        print(computed)
+        expected = np.array([
+            [2],
+            [1],
+            [1],
+            [0],
+            [0],
+            [0],
+            [3],
+            [3],
+            [3],
+            [3],
+            [3],
+            [3]
+        ])
+
+        # Assertion
+        np.testing.assert_almost_equal(computed, expected)
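
A note on the expected arrays above: the indices appear to be stacked tree-by-tree (observations 1-6 for the first tree, then the second), with leaf numbering offset across trees. A sketch of recovering a per-tree view (illustrative reshaping, not a stochtree API):

    import numpy as np

    # `computed` as returned by compute_forest_leaf_indices in the test above
    computed = np.array([[0], [0], [0], [1], [1], [1], [2], [2], [2], [2], [2], [2]])
    n, num_trees = 6, 2

    # One column per tree: rows are observations, entries are leaf indices
    per_tree = computed.reshape(num_trees, n).T
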
From e2413f012100cceef417124b4615c48c2aa0ea61 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 18 Mar 2025 12:18:28 -0500
Subject: [PATCH 32/35] Updated R kernel code

---
 NEWS.md    |  5 +++++
 R/kernel.R | 11 +++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index ce97e9a0..7fddbbf1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,8 @@
+# stochtree 0.1.2
+
+* Fixed indexing bug in cleanup of grow-from-root (GFR) samples in BART and BCF models
+* Avoid using covariate preprocessor in `computeForestLeafIndices` function when a `ForestSamples` object is provided
+
 # stochtree 0.1.1

 * Fixed initialization bug in several R package code examples for random effects models

diff --git a/R/kernel.R b/R/kernel.R
index 3a8856fb..0d7dbef5 100644
--- a/R/kernel.R
+++ b/R/kernel.R
@@ -83,8 +83,15 @@ computeForestLeafIndices <- function(model_object, covariates, forest_type=NULL,
     if ((!is.data.frame(covariates)) && (!is.matrix(covariates))) {
         stop("covariates must be a matrix or dataframe")
     }
-    train_set_metadata <- model_object$train_set_metadata
-    covariates_processed <- preprocessPredictionData(covariates, train_set_metadata)
+    if (model_type %in% c("bart", "bcf")) {
+        train_set_metadata <- model_object$train_set_metadata
+        covariates_processed <- preprocessPredictionData(covariates, train_set_metadata)
+    } else {
+        if (!is.matrix(covariates)) {
+            stop("covariates must be a matrix since no covariate preprocessor is stored in a `ForestSamples` object provided as `model_object`")
+        }
+        covariates_processed <- covariates
+    }

     # Preprocess forest indices
     num_forests <- forest_container$num_samples()
From c6d2aa0b0ca21cbbfe91692cd600199918430545 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 18 Mar 2025 16:37:51 -0500
Subject: [PATCH 33/35] Added kernel debugging script and updated kernel unit
 tests

---
 demo/debug/kernel.py       | 33 +++++++++++++++++++++++++++++++++
 test/python/test_kernel.py | 13 ++-----------
 2 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 demo/debug/kernel.py

diff --git a/demo/debug/kernel.py b/demo/debug/kernel.py
new file mode 100644
index 00000000..90b36f70
--- /dev/null
+++ b/demo/debug/kernel.py
@@ -0,0 +1,33 @@
+import numpy as np
+from stochtree import Dataset, ForestContainer, compute_forest_leaf_indices
+
+# Create dataset
+X = np.array(
+    [[1.5, 8.7, 1.2],
+     [2.7, 3.4, 5.4],
+     [3.6, 1.2, 9.3],
+     [4.4, 5.4, 10.4],
+     [5.3, 9.3, 3.6],
+     [6.1, 10.4, 4.4]]
+)
+n, p = X.shape
+num_trees = 2
+output_dim = 1
+forest_dataset = Dataset()
+forest_dataset.add_covariates(X)
+forest_samples = ForestContainer(num_trees, output_dim, True, False)
+
+# Initialize a forest with constant root predictions
+forest_samples.add_sample(0.)
+
+# Split the root of the first tree in the ensemble at X[,1] > 4.0
+forest_samples.add_numeric_split(0, 0, 0, 0, 4.0, -5., 5.)
+
+# Compute leaf indices after the first split
+computed_indices = compute_forest_leaf_indices(forest_samples, X)
+
+# Split the left leaf of the first tree in the ensemble at X[,2] > 4.0
+forest_samples.add_numeric_split(0, 0, 1, 1, 4.0, -7.5, -2.5)
+
+# Compute leaf indices after the second split
+computed_indices = compute_forest_leaf_indices(forest_samples, X)
diff --git a/test/python/test_kernel.py b/test/python/test_kernel.py
index f5b711c5..a89f3858 100644
--- a/test/python/test_kernel.py
+++ b/test/python/test_kernel.py
@@ -9,8 +9,8 @@
 )


-class TestJson:
-    def test_value(self):
+class TestKernel:
+    def test_forest(self):
         # Create dataset
         X = np.array(
             [[1.5, 8.7, 1.2],
@@ -30,19 +30,11 @@ def test_value(self):
         # Initialize a forest with constant root predictions
         forest_samples.add_sample(0.)

-        # Check that regular and "raw" predictions are the same (since the leaf is constant)
-        pred = forest_samples.predict(forest_dataset)
-        pred_raw = forest_samples.predict_raw(forest_dataset)
-
-        # Assertion
-        np.testing.assert_almost_equal(pred, pred_raw)
-
         # Split the root of the first tree in the ensemble at X[,1] > 4.0
         forest_samples.add_numeric_split(0, 0, 0, 0, 4.0, -5., 5.)
         # Compute leaf indices and compare to expected values
         computed = compute_forest_leaf_indices(forest_samples, X)
-        print(computed)
         expected = np.array([
             [0],
             [0],
             [0],
             [1],
             [1],
             [1],
             [2],
             [2],
             [2],
@@ -66,7 +58,6 @@ def test_value(self):

         # Compute leaf indices and compare to expected values
         computed = compute_forest_leaf_indices(forest_samples, X)
-        print(computed)
         expected = np.array([
             [2],
             [1],
From 7bc21884aa6131fc1601dc1893fe4de64ae043e5 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Tue, 18 Mar 2025 23:46:06 -0500
Subject: [PATCH 34/35] Updated kernel code

---
 R/kernel.R                 |   3 +-
 src/kernel.cpp             |   2 +-
 src/py_stochtree.cpp       |   2 +-
 stochtree/__init__.py      |  22 ++++----
 stochtree/kernel.py        | 103 ++++++++++++++++++++++++++++++++++++-
 test/python/test_kernel.py |   7 ++-
 6 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/R/kernel.R b/R/kernel.R
index 0d7dbef5..d21d42ae 100644
--- a/R/kernel.R
+++ b/R/kernel.R
@@ -206,7 +206,6 @@ computeForestLeafVariances <- function(model_object, forest_type, forest_inds=NU
 #' Compute and return the largest possible leaf index computable by `computeForestLeafIndices` for the forests in a designated forest sample container.
 #'
 #' @param model_object Object of type `bartmodel`, `bcfmodel`, or `ForestSamples` corresponding to a BART / BCF model with at least one forest sample, or a low-level `ForestSamples` object.
-#' @param covariates Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest.
 #' @param forest_type Which forest to use from `model_object`.
 #' Valid inputs depend on the model type, and whether or not a
 #'
@@ -238,7 +237,7 @@ computeForestLeafVariances <- function(model_object, forest_type, forest_inds=NU
 #' computeForestMaxLeafIndex(bart_model, X, "mean")
 #' computeForestMaxLeafIndex(bart_model, X, "mean", 0)
 #' computeForestMaxLeafIndex(bart_model, X, "mean", c(1,3,9))
-computeForestMaxLeafIndex <- function(model_object, covariates, forest_type=NULL, forest_inds=NULL) {
+computeForestMaxLeafIndex <- function(model_object, forest_type=NULL, forest_inds=NULL) {
     # Extract relevant forest container
     stopifnot(any(c(inherits(model_object, "bartmodel"), inherits(model_object, "bcfmodel"), inherits(model_object, "ForestSamples"))))
     model_type <- ifelse(inherits(model_object, "bartmodel"), "bart", ifelse(inherits(model_object, "bcfmodel"), "bcf", "forest_samples"))
diff --git a/src/kernel.cpp b/src/kernel.cpp
index 3a39dbb5..6b5867bb 100644
--- a/src/kernel.cpp
+++ b/src/kernel.cpp
@@ -11,7 +11,7 @@ typedef Eigen::Map
 forest_container, int forest_num) {
-    return forest_container->GetEnsemble(forest_num)->GetMaxLeafIndex();
+    return forest_container->GetEnsemble(forest_num)->GetMaxLeafIndex() - 1;
 }

 [[cpp11::register]]
diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp
index ff165663..65f8c927 100644
--- a/src/py_stochtree.cpp
+++ b/src/py_stochtree.cpp
@@ -1861,7 +1861,7 @@ py::array_t cppComputeForestContainerLeafIndices(ForestContainerCpp& forest
 }

 int cppComputeForestMaxLeafIndex(ForestContainerCpp& forest_container, int forest_num) {
-    return forest_container.GetForest(forest_num)->GetMaxLeafIndex();
+    return forest_container.GetForest(forest_num)->GetMaxLeafIndex() - 1;
 }

 void ForestContainerCpp::LoadFromJson(JsonCpp& json, std::string forest_label) {
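
The `- 1` adjustments above make the C++ accessors return a 0-indexed maximum, matching the 0-indexed leaf indices. A hedged sketch of the invariant this establishes, reusing the toy forest from the unit tests and the `compute_forest_max_leaf_index` helper introduced below (illustrative, not part of the patch):

    from stochtree import compute_forest_leaf_indices, compute_forest_max_leaf_index

    # forest_samples and X as constructed in test/python/test_kernel.py
    leaf_inds = compute_forest_leaf_indices(forest_samples, X)
    max_leaf = compute_forest_max_leaf_index(forest_samples)

    # With the off-by-one fix, no computed index can exceed the reported max
    assert leaf_inds.max() <= max_leaf[0]
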
diff --git a/stochtree/__init__.py b/stochtree/__init__.py
index 24b43900..9cabd8cd 100644
--- a/stochtree/__init__.py
+++ b/stochtree/__init__.py
@@ -4,19 +4,22 @@
 from .config import ForestModelConfig, GlobalModelConfig
 from .data import Dataset, Residual
 from .forest import Forest, ForestContainer
-from .kernel import compute_forest_leaf_indices
+from .kernel import (
+    compute_forest_leaf_indices,
+    compute_forest_max_leaf_index
+)
 from .preprocessing import CovariatePreprocessor
 from .random_effects import (
-    RandomEffectsContainer,
-    RandomEffectsDataset,
-    RandomEffectsModel,
-    RandomEffectsTracker,
+    RandomEffectsContainer,
+    RandomEffectsDataset,
+    RandomEffectsModel,
+    RandomEffectsTracker,
 )
 from .sampler import (
-    RNG,
-    ForestSampler,
-    GlobalVarianceModel,
-    LeafVarianceModel
+    RNG,
+    ForestSampler,
+    GlobalVarianceModel,
+    LeafVarianceModel
 )
 from .serialization import JSONSerializer
 from .utils import (
@@ -58,5 +61,6 @@
     "_standardize_array_to_list",
     "_standardize_array_to_np",
     "compute_forest_leaf_indices",
+    "compute_forest_max_leaf_index",
     "calibrate_global_error_variance",
 ]
diff --git a/stochtree/kernel.py b/stochtree/kernel.py
index c137227f..ec902303 100644
--- a/stochtree/kernel.py
+++ b/stochtree/kernel.py
@@ -93,8 +93,109 @@ def compute_forest_leaf_indices(model_object: Union[BARTModel, BCFModel, ForestC
     num_forests = forest_container.num_samples()
     if forest_inds is None:
         forest_inds = np.arange(num_forests)
-    else:
+    elif isinstance(forest_inds, int):
+        if not forest_inds >= 0 or not forest_inds < num_forests:
+            raise ValueError("The index in forest_inds must be >= 0 and < the total number of samples in a forest container")
+        forest_inds = np.array([forest_inds])
+    elif isinstance(forest_inds, np.ndarray):
+        if forest_inds.size > 1:
+            forest_inds = np.squeeze(forest_inds)
+            if forest_inds.ndim > 1:
+                raise ValueError("forest_inds must be a one-dimensional numpy array")
         if not np.all(forest_inds >= 0) or not np.all(forest_inds < num_forests):
             raise ValueError("The indices in forest_inds must be >= 0 and < the total number of samples in a forest container")
+    else:
+        raise ValueError("forest_inds must be a one-dimensional numpy array")

     return cppComputeForestContainerLeafIndices(forest_container.forest_container_cpp, covariates_processed, forest_inds)
+
+
+def compute_forest_max_leaf_index(model_object: Union[BARTModel, BCFModel, ForestContainer], forest_type: str = None, forest_inds: Union[int, np.ndarray] = None):
+    """
+    Compute and return the largest possible leaf index computable by `compute_forest_leaf_indices` for the forests in a designated forest sample container.
+
+    Parameters
+    ----------
+    model_object : BARTModel, BCFModel, or ForestContainer
+        Object corresponding to a BART / BCF model with at least one forest sample, or a low-level `ForestContainer` object.
+    forest_type : str
+        Which forest to use from `model_object`. Valid inputs depend on the model type, and whether or not a given forest was sampled in that model.
+
+        * **BART**
+            * `'mean'`: Extracts leaf indices for the mean forest
+            * `'variance'`: Extracts leaf indices for the variance forest
+        * **BCF**
+            * `'prognostic'`: Extracts leaf indices for the prognostic forest
+            * `'treatment'`: Extracts leaf indices for the treatment effect forest
+            * `'variance'`: Extracts leaf indices for the variance forest
+        * **ForestContainer**
+            * `None`: It is not necessary to disambiguate when this function is called directly on a `ForestContainer` object. This is the default value of this parameter.
+
+    forest_inds : int or np.ndarray
+        Indices of the forest sample(s) for which to compute max leaf indices.
+        If not provided, this function will return max leaf indices for every sample of a forest.
+        This function uses 0-indexing, so the first forest sample corresponds to `forest_num = 0`, and so on.
+
+    Returns
+    -------
+    Numpy array containing the largest possible leaf index computable by `compute_forest_leaf_indices` for the forests in a designated forest sample container.
+    """
+    # Extract relevant forest container
+    if not isinstance(model_object, BARTModel) and not isinstance(model_object, BCFModel) and not isinstance(model_object, ForestContainer):
+        raise ValueError("model_object must be one of BARTModel, BCFModel, or ForestContainer")
+    if isinstance(model_object, BARTModel):
+        model_type = "bart"
+        if forest_type is None:
+            raise ValueError("forest_type must be specified for a BARTModel model_type (either set to 'mean' or 'variance')")
+    elif isinstance(model_object, BCFModel):
+        model_type = "bcf"
+        if forest_type is None:
+            raise ValueError("forest_type must be specified for a BCFModel model_type (either set to 'prognostic', 'treatment' or 'variance')")
+    else:
+        model_type = "forest"
+    if model_type == "bart":
+        if forest_type == "mean":
+            if not model_object.include_mean_forest:
+                raise ValueError("Mean forest was not sampled for model_object, but requested by forest_type")
+            forest_container = model_object.forest_container_mean
+        else:
+            if not model_object.include_variance_forest:
+                raise ValueError("Variance forest was not sampled for model_object, but requested by forest_type")
+            forest_container = model_object.forest_container_variance
+    elif model_type == "bcf":
+        if forest_type == "prognostic":
+            forest_container = model_object.forest_container_mu
+        elif forest_type == "treatment":
+            forest_container = model_object.forest_container_tau
+        else:
+            if not model_object.include_variance_forest:
+                raise ValueError("Variance forest was not sampled for model_object, but requested by forest_type")
+            forest_container = model_object.forest_container_variance
+    else:
+        forest_container = model_object
+
+    # Preprocess forest indices
+    num_forests = forest_container.num_samples()
+    if forest_inds is None:
+        forest_inds = np.arange(num_forests)
+    elif isinstance(forest_inds, int):
+        if not forest_inds >= 0 or not forest_inds < num_forests:
+            raise ValueError("The index in forest_inds must be >= 0 and < the total number of samples in a forest container")
+        forest_inds = np.array([forest_inds])
+    elif isinstance(forest_inds, np.ndarray):
+        if forest_inds.size > 1:
+            forest_inds = np.squeeze(forest_inds)
+            if forest_inds.ndim > 1:
+                raise ValueError("forest_inds must be a one-dimensional numpy array")
+        if not np.all(forest_inds >= 0) or not np.all(forest_inds < num_forests):
+            raise ValueError("The indices in forest_inds must be >= 0 and < the total number of samples in a forest container")
+    else:
+        raise ValueError("forest_inds must be a one-dimensional numpy array")
+
+    # Compute max index
+    output_size = len(forest_inds)
+    output = np.empty(output_size)
+    for i in np.arange(output_size):
+        output[i] = cppComputeForestMaxLeafIndex(forest_container.forest_container_cpp, forest_inds[i])
+
+    # Return result
+    return output
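
For reference, a Python analogue of the updated R examples below, sketched under the assumption of a BART model fit as in those examples (data and settings are illustrative):

    import numpy as np
    from stochtree import BARTModel, compute_forest_max_leaf_index

    rng = np.random.default_rng(2025)
    X = rng.uniform(size=(100, 10))
    y = -5 + 10 * (X[:, 0] > 0.5) + rng.normal(size=100)

    bart_model = BARTModel()
    bart_model.sample(X_train=X, y_train=y, num_gfr=0, num_mcmc=10)

    # Max leaf index for every retained sample of the mean forest ...
    max_inds = compute_forest_max_leaf_index(bart_model, forest_type="mean")
    # ... or for a subset of samples, using 0-indexed sample IDs
    max_inds_subset = compute_forest_max_leaf_index(bart_model, "mean", np.array([1, 3, 9]))
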
diff --git a/test/python/test_kernel.py b/test/python/test_kernel.py
index a89f3858..6d630874 100644
--- a/test/python/test_kernel.py
+++ b/test/python/test_kernel.py
@@ -5,7 +5,8 @@
     Dataset,
     Forest,
     ForestContainer,
-    compute_forest_leaf_indices
+    compute_forest_leaf_indices,
+    compute_forest_max_leaf_index
 )


@@ -35,6 +36,7 @@ def test_forest(self):

         # Compute leaf indices and compare to expected values
         computed = compute_forest_leaf_indices(forest_samples, X)
+        max_leaf_index = compute_forest_max_leaf_index(forest_samples)
         expected = np.array([
             [0],
             [0],
@@ -52,12 +54,14 @@ def test_forest(self):

         # Assertion
         np.testing.assert_almost_equal(computed, expected)
+        assert max_leaf_index == [2]

         # Split the left leaf of the first tree in the ensemble at X[,2] > 4.0
         forest_samples.add_numeric_split(0, 0, 1, 1, 4.0, -7.5, -2.5)

         # Compute leaf indices and compare to expected values
         computed = compute_forest_leaf_indices(forest_samples, X)
+        max_leaf_index = compute_forest_max_leaf_index(forest_samples)
         expected = np.array([
             [2],
             [1],
@@ -75,3 +79,4 @@ def test_forest(self):

         # Assertion
         np.testing.assert_almost_equal(computed, expected)
+        assert max_leaf_index == [3]
From a5fdcae9c671ec21c2e5d9113fd0035836e3bfa6 Mon Sep 17 00:00:00 2001
From: Drew Herren
Date: Thu, 27 Mar 2025 09:50:30 -0500
Subject: [PATCH 35/35] Updated R package

---
 R/kernel.R                       |  6 +++---
 man/RandomEffectSamples.Rd       |  2 +-
 man/computeForestLeafIndices.Rd  |  2 +-
 man/computeForestMaxLeafIndex.Rd | 15 ++++-----------
 4 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/R/kernel.R b/R/kernel.R
index d21d42ae..3265a1b7 100644
--- a/R/kernel.R
+++ b/R/kernel.R
@@ -234,9 +234,9 @@ computeForestLeafVariances <- function(model_object, forest_type, forest_inds=NU
 #' X <- matrix(runif(10*100), ncol = 10)
 #' y <- -5 + 10*(X[,1] > 0.5) + rnorm(100)
 #' bart_model <- bart(X, y, num_gfr=0, num_mcmc=10)
-#' computeForestMaxLeafIndex(bart_model, X, "mean")
-#' computeForestMaxLeafIndex(bart_model, X, "mean", 0)
-#' computeForestMaxLeafIndex(bart_model, X, "mean", c(1,3,9))
+#' computeForestMaxLeafIndex(bart_model, "mean")
+#' computeForestMaxLeafIndex(bart_model, "mean", 0)
+#' computeForestMaxLeafIndex(bart_model, "mean", c(1,3,9))
 computeForestMaxLeafIndex <- function(model_object, forest_type=NULL, forest_inds=NULL) {
     # Extract relevant forest container
     stopifnot(any(c(inherits(model_object, "bartmodel"), inherits(model_object, "bcfmodel"), inherits(model_object, "ForestSamples"))))
     model_type <- ifelse(inherits(model_object, "bartmodel"), "bart", ifelse(inherits(model_object, "bcfmodel"), "bcf", "forest_samples"))
diff --git a/man/RandomEffectSamples.Rd b/man/RandomEffectSamples.Rd
index ecc6230f..ae5e9ac0 100644
--- a/man/RandomEffectSamples.Rd
+++ b/man/RandomEffectSamples.Rd
@@ -217,7 +217,7 @@ If a random effects model is "intercept-only" the \code{rfx_basis} will be a vec
 \describe{
 \item{\code{rfx_group_ids}}{Indices of random effects groups in a prediction set}

-\item{\code{rfx_basis}}{(Optional ) Basis used for random effects prediction}
+\item{\code{rfx_basis}}{(Optional) Basis used for random effects prediction}
 }
 \if{html}{\out{
}}
 }
diff --git a/man/computeForestLeafIndices.Rd b/man/computeForestLeafIndices.Rd
index 9733708b..169b1ea8 100644
--- a/man/computeForestLeafIndices.Rd
+++ b/man/computeForestLeafIndices.Rd
@@ -42,7 +42,7 @@ this function will return leaf indices for every sample of a forest.
 This function uses 0-indexing, so the first forest sample corresponds to \code{forest_num = 0}, and so on.}
 }
 \value{
-List of vectors. Each vector is of size \code{num_obs * num_trees}, where \code{num_obs = nrow(covariates)}
+Vector of size \code{num_obs * num_trees}, where \code{num_obs = nrow(covariates)}
 and \code{num_trees} is the number of trees in the relevant forest of \code{model_object}.
 }
 \description{
diff --git a/man/computeForestMaxLeafIndex.Rd b/man/computeForestMaxLeafIndex.Rd
index 61b5bd68..afa3effc 100644
--- a/man/computeForestMaxLeafIndex.Rd
+++ b/man/computeForestMaxLeafIndex.Rd
@@ -4,18 +4,11 @@
 \alias{computeForestMaxLeafIndex}
 \title{Compute and return the largest possible leaf index computable by \code{computeForestLeafIndices} for the forests in a designated forest sample container.}
 \usage{
-computeForestMaxLeafIndex(
-  model_object,
-  covariates,
-  forest_type = NULL,
-  forest_inds = NULL
-)
+computeForestMaxLeafIndex(model_object, forest_type = NULL, forest_inds = NULL)
 }
 \arguments{
 \item{model_object}{Object of type \code{bartmodel}, \code{bcfmodel}, or \code{ForestSamples} corresponding to a BART / BCF model with at least one forest sample, or a low-level \code{ForestSamples} object.}

-\item{covariates}{Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest.}
-
 \item{forest_type}{Which forest to use from \code{model_object}.
 Valid inputs depend on the model type, and whether or not a

@@ -51,7 +44,7 @@ Compute and return the largest possible leaf index computable by \code{computeFo
 X <- matrix(runif(10*100), ncol = 10)
 y <- -5 + 10*(X[,1] > 0.5) + rnorm(100)
 bart_model <- bart(X, y, num_gfr=0, num_mcmc=10)
-computeForestMaxLeafIndex(bart_model, X, "mean")
-computeForestMaxLeafIndex(bart_model, X, "mean", 0)
-computeForestMaxLeafIndex(bart_model, X, "mean", c(1,3,9))
+computeForestMaxLeafIndex(bart_model, "mean")
+computeForestMaxLeafIndex(bart_model, "mean", 0)
+computeForestMaxLeafIndex(bart_model, "mean", c(1,3,9))
 }
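
Finally, as an aside on the motivation for this module: leaf indices define a simple tree-ensemble kernel. A minimal sketch, assuming the toy `forest_samples` and `X` from the unit tests above and the tree-by-tree stacking seen in their expected arrays (illustrative only, not a stochtree API):

    import numpy as np
    from stochtree import compute_forest_leaf_indices

    # forest_samples and X as in test/python/test_kernel.py; 2 trees, 6 observations
    n, num_trees = 6, 2
    leaf = compute_forest_leaf_indices(forest_samples, X, forest_inds=0)
    leaf_mat = np.asarray(leaf).reshape(num_trees, n).T

    # K[i, j] counts the trees in which observations i and j land in the same leaf
    K = (leaf_mat[:, None, :] == leaf_mat[None, :, :]).sum(axis=2)
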