diff --git a/NAMESPACE b/NAMESPACE index e7a370fc..c2d9c133 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,6 +47,8 @@ export(oneHotEncode) export(oneHotInitializeAndEncode) export(orderedCatInitializeAndPreprocess) export(orderedCatPreprocess) +export(preprocessBartParams) +export(preprocessBcfParams) export(preprocessPredictionData) export(preprocessPredictionDataFrame) export(preprocessPredictionMatrix) diff --git a/R/bart.R b/R/bart.R index 8c96c3a5..6a246a91 100644 --- a/R/bart.R +++ b/R/bart.R @@ -24,43 +24,64 @@ #' We do not currently support (but plan to in the near future), test set evaluation for group labels #' that were not in the training set. #' @param rfx_basis_test (Optional) Test set basis for "random-slope" regression in additive random effects model. -#' @param cutpoint_grid_size Maximum size of the "grid" of potential cutpoints to consider. Default: 100. -#' @param sigma_leaf_init Starting value of leaf node scale parameter. Calibrated internally as `1/num_trees_mean` if not set here. -#' @param leaf_model Model to use in the leaves, coded as integer with (0 = constant leaf, 1 = univariate leaf regression, 2 = multivariate leaf regression). Default: 0. -#' @param alpha_mean Prior probability of splitting for a tree of depth 0 in the mean model. Tree split prior combines `alpha_mean` and `beta_mean` via `alpha_mean*(1+node_depth)^-beta_mean`. Default: 0.95. -#' @param beta_mean Exponent that decreases split probabilities for nodes of depth > 0 in the mean model. Tree split prior combines `alpha_mean` and `beta_mean` via `alpha_mean*(1+node_depth)^-beta_mean`. Default: 2. -#' @param min_samples_leaf_mean Minimum allowable size of a leaf, in terms of training samples, in the mean model. Default: 5. -#' @param max_depth_mean Maximum depth of any tree in the ensemble in the mean model. Default: 10. Can be overriden with ``-1`` which does not enforce any depth limits on trees. -#' @param alpha_variance Prior probability of splitting for a tree of depth 0 in the variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: 0.95. -#' @param beta_variance Exponent that decreases split probabilities for nodes of depth > 0 in the variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance` .Default: 2. -#' @param min_samples_leaf_variance Minimum allowable size of a leaf, in terms of training samples, in the variance model. Default: 5. -#' @param max_depth_variance Maximum depth of any tree in the ensemble in the variance model. Default: 10. Can be overriden with ``-1`` which does not enforce any depth limits on trees. -#' @param a_global Shape parameter in the `IG(a_global, b_global)` global error variance model. Default: 0. -#' @param b_global Scale parameter in the `IG(a_global, b_global)` global error variance model. Default: 0. -#' @param a_leaf Shape parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model. Default: 3. -#' @param b_leaf Scale parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees_mean` if not set here. -#' @param a_forest Shape parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2 + 0.5` if not set. 
-#' @param b_forest Scale parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2` if not set. -#' @param q Quantile used to calibrated `lambda` as in Sparapani et al (2021). Default: 0.9. -#' @param sigma2_init Starting value of global error variance parameter. Calibrated internally as `pct_var_sigma2_init*var((y-mean(y))/sd(y))` if not set. -#' @param variance_forest_init Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as `log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance` if not set. -#' @param pct_var_sigma2_init Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by `sigma2_init`. -#' @param pct_var_variance_forest_init Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by `variance_forest_init`. -#' @param variance_scale Variance after the data have been scaled. Default: 1. -#' @param variable_weights_mean Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to `rep(1/ncol(X_train), ncol(X_train))` if not set here. -#' @param variable_weights_variance Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to `rep(1/ncol(X_train), ncol(X_train))` if not set here. -#' @param num_trees_mean Number of trees in the ensemble for the conditional mean model. Default: 200. If `num_trees_mean = 0`, the conditional mean will not be modeled using a forest and the function will only proceed if `num_trees_variance > 0`. -#' @param num_trees_variance Number of trees in the ensemble for the conditional variance model. Default: 0. Variance is only modeled using a tree / forest if `num_trees_variance > 0`. #' @param num_gfr Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Default: 5. #' @param num_burnin Number of "burn-in" iterations of the MCMC sampler. Default: 0. #' @param num_mcmc Number of "retained" iterations of the MCMC sampler. Default: 100. -#' @param sample_sigma_global Whether or not to update the `sigma^2` global error variance parameter based on `IG(a_global, b_global)`. Default: T. -#' @param sample_sigma_leaf Whether or not to update the `tau` leaf scale variance parameter based on `IG(a_leaf, b_leaf)`. Cannot (currently) be set to true if `ncol(W_train)>1`. Default: F. -#' @param random_seed Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to `std::random_device`. -#' @param keep_burnin Whether or not "burnin" samples should be included in cached predictions. Default FALSE. Ignored if num_mcmc = 0. -#' @param keep_gfr Whether or not "grow-from-root" samples should be included in cached predictions. Default TRUE. Ignored if num_mcmc = 0. -#' @param verbose Whether or not to print progress during the sampling loops. Default: FALSE. +#' @param params The list of model parameters, each of which has a default value. #' +#' **1. Global Parameters** +#' +#' - `cutpoint_grid_size` Maximum size of the "grid" of potential cutpoints to consider. Default: `100`. 
+#' - `sigma2_init` Starting value of global error variance parameter. Calibrated internally as `pct_var_sigma2_init*var((y-mean(y))/sd(y))` if not set.
+#' - `pct_var_sigma2_init` Percentage of standardized outcome variance used to initialize global error variance parameter. Default: `1`. Superseded by `sigma2_init`.
+#' - `variance_scale` Variance after the data have been scaled. Default: `1`.
+#' - `a_global` Shape parameter in the `IG(a_global, b_global)` global error variance model. Default: `0`.
+#' - `b_global` Scale parameter in the `IG(a_global, b_global)` global error variance model. Default: `0`.
+#' - `random_seed` Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to `std::random_device`.
+#' - `sample_sigma_global` Whether or not to update the `sigma^2` global error variance parameter based on `IG(a_global, b_global)`. Default: `TRUE`.
+#' - `keep_burnin` Whether or not "burnin" samples should be included in cached predictions. Default: `FALSE`. Ignored if `num_mcmc = 0`.
+#' - `keep_gfr` Whether or not "grow-from-root" samples should be included in cached predictions. Default: `FALSE`. Ignored if `num_mcmc = 0`.
+#' - `verbose` Whether or not to print progress during the sampling loops. Default: `FALSE`.
+#'
+#' **2. Mean Forest Parameters**
+#'
+#' - `num_trees_mean` Number of trees in the ensemble for the conditional mean model. Default: `200`. If `num_trees_mean = 0`, the conditional mean will not be modeled using a forest, and the function will only proceed if `num_trees_variance > 0`.
+#' - `sample_sigma_leaf` Whether or not to update the `tau` leaf scale variance parameter based on `IG(a_leaf, b_leaf)`. Cannot (currently) be set to true if `ncol(W_train)>1`. Default: `FALSE`.
+#'
+#' **2.1. Tree Prior Parameters**
+#'
+#' - `alpha_mean` Prior probability of splitting for a tree of depth 0 in the mean model. Tree split prior combines `alpha_mean` and `beta_mean` via `alpha_mean*(1+node_depth)^-beta_mean`. Default: `0.95`.
+#' - `beta_mean` Exponent that decreases split probabilities for nodes of depth > 0 in the mean model. Tree split prior combines `alpha_mean` and `beta_mean` via `alpha_mean*(1+node_depth)^-beta_mean`. Default: `2`.
+#' - `min_samples_leaf_mean` Minimum allowable size of a leaf, in terms of training samples, in the mean model. Default: `5`.
+#' - `max_depth_mean` Maximum depth of any tree in the ensemble in the mean model. Default: `10`. Can be overridden with `-1` which does not enforce any depth limits on trees.
+#'
+#' **2.2. Leaf Model Parameters**
+#'
+#' - `variable_weights_mean` Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to `rep(1/ncol(X_train), ncol(X_train))` if not set here.
+#' - `sigma_leaf_init` Starting value of leaf node scale parameter. Calibrated internally as `1/num_trees_mean` if not set here.
+#' - `a_leaf` Shape parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model. Default: `3`.
+#' - `b_leaf` Scale parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees_mean` if not set here.
+#'
+#' **3. Conditional Variance Forest Parameters**
+#'
+#' - `num_trees_variance` Number of trees in the ensemble for the conditional variance model. Default: `0`. Variance is only modeled using a tree / forest if `num_trees_variance > 0`.
+#' - `variance_forest_init` Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as `log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance` if not set.
+#' - `pct_var_variance_forest_init` Percentage of standardized outcome variance used to initialize the conditional variance forest. Default: `1`. Superseded by `variance_forest_init`.
+#'
+#' **3.1. Tree Prior Parameters**
+#'
+#' - `alpha_variance` Prior probability of splitting for a tree of depth 0 in the variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: `0.95`.
+#' - `beta_variance` Exponent that decreases split probabilities for nodes of depth > 0 in the variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: `2`.
+#' - `min_samples_leaf_variance` Minimum allowable size of a leaf, in terms of training samples, in the variance model. Default: `5`.
+#' - `max_depth_variance` Maximum depth of any tree in the ensemble in the variance model. Default: `10`. Can be overridden with `-1` which does not enforce any depth limits on trees.
+#'
+#' **3.2. Leaf Model Parameters**
+#'
+#' - `variable_weights_variance` Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to `rep(1/ncol(X_train), ncol(X_train))` if not set here.
+#' - `a_forest` Shape parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2 + 0.5` if not set.
+#' - `b_forest` Scale parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2` if not set.
+#'
 #' @return List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk).
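For orientation, here is a minimal usage sketch of the new list-based interface introduced by this patch (hypothetical toy data, for illustration only; any entry omitted from `params` keeps the default documented above, filled in by `preprocessBartParams()`):

```r
library(stochtree)

# Hypothetical simulated data, for illustration only
n <- 500; p <- 10
X <- matrix(runif(n * p), ncol = p)
y <- sin(4 * pi * X[, 1]) + rnorm(n, 0, 0.5)

# Override a few defaults via the params list; the rest keep their defaults
bart_params <- list(num_trees_mean = 100, sample_sigma_global = TRUE)
bart_model <- bart(X_train = X, y_train = y, num_gfr = 10, num_mcmc = 100,
                   params = bart_params)
```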
#' @export #' @@ -91,19 +112,42 @@ bart <- function(X_train, y_train, W_train = NULL, group_ids_train = NULL, rfx_basis_train = NULL, X_test = NULL, W_test = NULL, group_ids_test = NULL, rfx_basis_test = NULL, - cutpoint_grid_size = 100, sigma_leaf_init = NULL, - alpha_mean = 0.95, beta_mean = 2.0, min_samples_leaf_mean = 5, - max_depth_mean = 10, alpha_variance = 0.95, beta_variance = 2.0, - min_samples_leaf_variance = 5, max_depth_variance = 10, - a_global = 0, b_global = 0, a_leaf = 3, b_leaf = NULL, - a_forest = NULL, b_forest = NULL, q = 0.9, sigma2_init = NULL, - variance_forest_init = NULL, pct_var_sigma2_init = 1, - pct_var_variance_forest_init = 1, variance_scale = 1, - variable_weights_mean = NULL, variable_weights_variance = NULL, - num_trees_mean = 200, num_trees_variance = 0, num_gfr = 5, num_burnin = 0, num_mcmc = 100, - sample_sigma_global = T, sample_sigma_leaf = F, random_seed = -1, - keep_burnin = F, keep_gfr = F, verbose = F) { + params = list()) { + # Extract BART parameters + bart_params <- preprocessBartParams(params) + cutpoint_grid_size <- bart_params$cutpoint_grid_size + sigma_leaf_init <- bart_params$sigma_leaf_init + alpha_mean <- bart_params$alpha_mean + beta_mean <- bart_params$beta_mean + min_samples_leaf_mean <- bart_params$min_samples_leaf_mean + max_depth_mean <- bart_params$max_depth_mean + alpha_variance <- bart_params$alpha_variance + beta_variance <- bart_params$beta_variance + min_samples_leaf_variance <- bart_params$min_samples_leaf_variance + max_depth_variance <- bart_params$max_depth_variance + a_global <- bart_params$a_global + b_global <- bart_params$b_global + a_leaf <- bart_params$a_leaf + b_leaf <- bart_params$b_leaf + a_forest <- bart_params$a_forest + b_forest <- bart_params$b_forest + variance_scale <- bart_params$variance_scale + sigma2_init <- bart_params$sigma2_init + variance_forest_init <- bart_params$variance_forest_init + pct_var_sigma2_init <- bart_params$pct_var_sigma2_init + pct_var_variance_forest_init <- bart_params$pct_var_variance_forest_init + variable_weights_mean <- bart_params$variable_weights_mean + variable_weights_variance <- bart_params$variable_weights_variance + num_trees_mean <- bart_params$num_trees_mean + num_trees_variance <- bart_params$num_trees_variance + sample_sigma_global <- bart_params$sample_sigma_global + sample_sigma_leaf <- bart_params$sample_sigma_leaf + random_seed <- bart_params$random_seed + keep_burnin <- bart_params$keep_burnin + keep_gfr <- bart_params$keep_gfr + verbose <- bart_params$verbose + # Determine whether conditional mean, variance, or both will be modeled if (num_trees_variance > 0) include_variance_forest = T else include_variance_forest = F @@ -121,7 +165,7 @@ bart <- function(X_train, y_train, W_train = NULL, group_ids_train = NULL, } # Override tau sampling if there is no mean forest - if (!include_mean_forest) sample_tau <- F + if (!include_mean_forest) sample_sigma_leaf <- F # Variable weight preprocessing (and initialization if necessary) if (include_mean_forest) { diff --git a/R/bcf.R b/R/bcf.R index a93272f9..99866c1b 100644 --- a/R/bcf.R +++ b/R/bcf.R @@ -21,58 +21,91 @@ #' We do not currently support (but plan to in the near future), test set evaluation for group labels #' that were not in the training set. #' @param rfx_basis_test (Optional) Test set basis for "random-slope" regression in additive random effects model. -#' @param cutpoint_grid_size Maximum size of the "grid" of potential cutpoints to consider. Default: 100. 
-#' @param sigma_leaf_mu Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as `1/num_trees_mu` if not set here. -#' @param sigma_leaf_tau Starting value of leaf node scale parameter for the treatment effect forest. Calibrated internally as `1/(2*num_trees_tau)` if not set here. -#' @param alpha_mu Prior probability of splitting for a tree of depth 0 for the prognostic forest. Tree split prior combines `alpha` and `beta` via `alpha_mu*(1+node_depth)^-beta_mu`. Default: 0.95. -#' @param alpha_tau Prior probability of splitting for a tree of depth 0 for the treatment effect forest. Tree split prior combines `alpha` and `beta` via `alpha_tau*(1+node_depth)^-beta_tau`. Default: 0.25. -#' @param alpha_variance Prior probability of splitting for a tree of depth 0 in the (optional) conditional variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: 0.95. -#' @param beta_mu Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. Tree split prior combines `alpha` and `beta` via `alpha_mu*(1+node_depth)^-beta_mu`. Default: 2.0. -#' @param beta_tau Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. Tree split prior combines `alpha` and `beta` via `alpha_tau*(1+node_depth)^-beta_tau`. Default: 3.0. -#' @param beta_variance Exponent that decreases split probabilities for nodes of depth > 0 in the (optional) conditional variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: 2.0. -#' @param min_samples_leaf_mu Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Default: 5. -#' @param min_samples_leaf_tau Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Default: 5. -#' @param min_samples_leaf_variance Minimum allowable size of a leaf, in terms of training samples, in the (optional) conditional variance model. Default: 5. -#' @param max_depth_mu Maximum depth of any tree in the mu ensemble. Default: 10. Can be overriden with ``-1`` which does not enforce any depth limits on trees. -#' @param max_depth_tau Maximum depth of any tree in the tau ensemble. Default: 5. Can be overriden with ``-1`` which does not enforce any depth limits on trees. -#' @param max_depth_variance Maximum depth of any tree in the ensemble in the (optional) conditional variance model. Default: 10. Can be overriden with ``-1`` which does not enforce any depth limits on trees. -#' @param a_global Shape parameter in the `IG(a_global, b_global)` global error variance model. Default: 0. -#' @param b_global Scale parameter in the `IG(a_global, b_global)` global error variance model. Default: 0. -#' @param a_leaf_mu Shape parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model for the prognostic forest. Default: 3. -#' @param a_leaf_tau Shape parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model for the treatment effect forest. Default: 3. -#' @param b_leaf_mu Scale parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model for the prognostic forest. Calibrated internally as 0.5/num_trees if not set here. -#' @param b_leaf_tau Scale parameter in the `IG(a_leaf, b_leaf)` leaf node parameter variance model for the treatment effect forest. Calibrated internally as 0.5/num_trees if not set here. 
-#' @param a_forest Shape parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2 + 0.5` if not set. -#' @param b_forest Scale parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2` if not set. -#' @param sigma2_init Starting value of global error variance parameter. Calibrated internally as `pct_var_sigma2_init*var((y-mean(y))/sd(y))` if not set. -#' @param variance_forest_init Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as `log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance` if not set. -#' @param pct_var_sigma2_init Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by `sigma2_init`. -#' @param pct_var_variance_forest_init Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by `variance_forest_init`. -#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to `rep(1/ncol(X_train), ncol(X_train))` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to `1/ncol(X_train)`. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in `X_train` and then set `propensity_covariate` to `'none'` adjust `keep_vars_mu` and `keep_vars_tau` accordingly. -#' @param keep_vars_mu Vector of variable names or column indices denoting variables that should be included in the prognostic (`mu(X)`) forest. Default: NULL. -#' @param drop_vars_mu Vector of variable names or column indices denoting variables that should be excluded from the prognostic (`mu(X)`) forest. Default: NULL. If both `drop_vars_mu` and `keep_vars_mu` are set, `drop_vars_mu` will be ignored. -#' @param keep_vars_tau Vector of variable names or column indices denoting variables that should be included in the treatment effect (`tau(X)`) forest. Default: NULL. -#' @param drop_vars_tau Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (`tau(X)`) forest. Default: NULL. If both `drop_vars_tau` and `keep_vars_tau` are set, `drop_vars_tau` will be ignored. -#' @param keep_vars_variance Vector of variable names or column indices denoting variables that should be included in the (optional) conditional variance forest. Default: NULL. -#' @param drop_vars_variance Vector of variable names or column indices denoting variables that should be excluded from the (optional) conditional variance forest. Default: NULL. If both `drop_vars_variance` and `keep_vars_variance` are set, `drop_vars_variance` will be ignored. -#' @param num_trees_mu Number of trees in the prognostic forest. Default: 200. -#' @param num_trees_tau Number of trees in the treatment effect forest. Default: 50. -#' @param num_trees_variance Number of trees in the (optional) conditional variance forest model. Default: 0. #' @param num_gfr Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Default: 5. #' @param num_burnin Number of "burn-in" iterations of the MCMC sampler. Default: 0. 
#' @param num_mcmc Number of "retained" iterations of the MCMC sampler. Default: 100. -#' @param sample_sigma_global Whether or not to update the `sigma^2` global error variance parameter based on `IG(a_global, b_global)`. Default: T. -#' @param sample_sigma_leaf_mu Whether or not to update the `sigma_leaf_mu` leaf scale variance parameter in the prognostic forest based on `IG(a_leaf_mu, b_leaf_mu)`. Default: T. -#' @param sample_sigma_leaf_tau Whether or not to update the `sigma_leaf_tau` leaf scale variance parameter in the treatment effect forest based on `IG(a_leaf_tau, b_leaf_tau)`. Default: T. -#' @param propensity_covariate Whether to include the propensity score as a covariate in either or both of the forests. Enter "none" for neither, "mu" for the prognostic forest, "tau" for the treatment forest, and "both" for both forests. If this is not "none" and a propensity score is not provided, it will be estimated from (`X_train`, `Z_train`) using `stochtree::bart()`. Default: "mu". -#' @param adaptive_coding Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via parameters `b_0` and `b_1` that attach to the outcome model `[b_0 (1-Z) + b_1 Z] tau(X)`. This is ignored when Z is not binary. Default: T. -#' @param b_0 Initial value of the "control" group coding parameter. This is ignored when Z is not binary. Default: -0.5. -#' @param b_1 Initial value of the "treatment" group coding parameter. This is ignored when Z is not binary. Default: 0.5. -#' @param rfx_prior_var Prior on the (diagonals of the) covariance of the additive group-level random regression coefficients. Must be a vector of length `ncol(rfx_basis_train)`. Default: `rep(1, ncol(rfx_basis_train))` -#' @param random_seed Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to `std::random_device`. -#' @param keep_burnin Whether or not "burnin" samples should be included in cached predictions. Default FALSE. Ignored if num_mcmc = 0. -#' @param keep_gfr Whether or not "grow-from-root" samples should be included in cached predictions. Default FALSE. Ignored if num_mcmc = 0. -#' @param verbose Whether or not to print progress during the sampling loops. Default: FALSE. +#' @param params The list of model parameters, each of which has a default value. +#' +#' **1. Global Parameters** +#' +#' - `cutpoint_grid_size` Maximum size of the "grid" of potential cutpoints to consider. Default: `100`. +#' - `a_global` Shape parameter in the `IG(a_global, b_global)` global error variance model. Default: `0`. +#' - `b_global` Scale parameter in the `IG(a_global, b_global)` global error variance model. Default: `0`. +#' - `sigma2_init` Starting value of global error variance parameter. Calibrated internally as `pct_var_sigma2_init*var((y-mean(y))/sd(y))` if not set. +#' - `pct_var_sigma2_init` Percentage of standardized outcome variance used to initialize global error variance parameter. Default: `1`. Superseded by `sigma2_init`. +#' - `variable_weights` Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to `rep(1/ncol(X_train), ncol(X_train))` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to `1/ncol(X_train)`. 
A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in `X_train`, then set `propensity_covariate` to `'none'` and adjust `keep_vars_mu`, `keep_vars_tau`, and `keep_vars_variance` accordingly.
+#' - `propensity_covariate` Whether to include the propensity score as a covariate in either or both of the forests. Enter `"none"` for neither, `"mu"` for the prognostic forest, `"tau"` for the treatment forest, and `"both"` for both forests. If this is not `"none"` and a propensity score is not provided, it will be estimated from (`X_train`, `Z_train`) using `stochtree::bart()`. Default: `"mu"`.
+#' - `adaptive_coding` Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via parameters `b_0` and `b_1` that attach to the outcome model `[b_0 (1-Z) + b_1 Z] tau(X)`. This is ignored when Z is not binary. Default: `TRUE`.
+#' - `b_0` Initial value of the "control" group coding parameter. This is ignored when Z is not binary. Default: `-0.5`.
+#' - `b_1` Initial value of the "treatment" group coding parameter. This is ignored when Z is not binary. Default: `0.5`.
+#' - `random_seed` Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to `std::random_device`.
+#' - `keep_burnin` Whether or not "burnin" samples should be included in cached predictions. Default: `FALSE`. Ignored if `num_mcmc = 0`.
+#' - `keep_gfr` Whether or not "grow-from-root" samples should be included in cached predictions. Default: `FALSE`. Ignored if `num_mcmc = 0`.
+#' - `verbose` Whether or not to print progress during the sampling loops. Default: `FALSE`.
+#' - `sample_sigma_global` Whether or not to update the `sigma^2` global error variance parameter based on `IG(a_global, b_global)`. Default: `TRUE`.
+#'
+#' **2. Prognostic Forest Parameters**
+#'
+#' - `num_trees_mu` Number of trees in the prognostic forest. Default: `250`.
+#' - `sample_sigma_leaf_mu` Whether or not to update the `sigma_leaf_mu` leaf scale variance parameter in the prognostic forest based on `IG(a_leaf_mu, b_leaf_mu)`. Default: `TRUE`.
+#'
+#' **2.1. Tree Prior Parameters**
+#'
+#' - `alpha_mu` Prior probability of splitting for a tree of depth 0 for the prognostic forest. Tree split prior combines `alpha_mu` and `beta_mu` via `alpha_mu*(1+node_depth)^-beta_mu`. Default: `0.95`.
+#' - `beta_mu` Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. Tree split prior combines `alpha_mu` and `beta_mu` via `alpha_mu*(1+node_depth)^-beta_mu`. Default: `2.0`.
+#' - `min_samples_leaf_mu` Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Default: `5`.
+#' - `max_depth_mu` Maximum depth of any tree in the mu ensemble. Default: `10`. Can be overridden with `-1` which does not enforce any depth limits on trees.
+#'
+#' **2.2. Leaf Model Parameters**
+#'
+#' - `keep_vars_mu` Vector of variable names or column indices denoting variables that should be included in the prognostic (`mu(X)`) forest. Default: `NULL`.
+#' - `drop_vars_mu` Vector of variable names or column indices denoting variables that should be excluded from the prognostic (`mu(X)`) forest. Default: `NULL`. If both `drop_vars_mu` and `keep_vars_mu` are set, `drop_vars_mu` will be ignored.
+#' - `sigma_leaf_mu` Starting value of leaf node scale parameter for the prognostic forest.
Calibrated internally as `1/num_trees_mu` if not set here.
+#' - `a_leaf_mu` Shape parameter in the `IG(a_leaf_mu, b_leaf_mu)` leaf node parameter variance model for the prognostic forest. Default: `3`.
+#' - `b_leaf_mu` Scale parameter in the `IG(a_leaf_mu, b_leaf_mu)` leaf node parameter variance model for the prognostic forest. Calibrated internally as `0.5/num_trees_mu` if not set here.
+#'
+#' **3. Treatment Effect Forest Parameters**
+#'
+#' - `num_trees_tau` Number of trees in the treatment effect forest. Default: `50`.
+#' - `sample_sigma_leaf_tau` Whether or not to update the `sigma_leaf_tau` leaf scale variance parameter in the treatment effect forest based on `IG(a_leaf_tau, b_leaf_tau)`. Default: `FALSE`.
+#'
+#' **3.1. Tree Prior Parameters**
+#'
+#' - `alpha_tau` Prior probability of splitting for a tree of depth 0 for the treatment effect forest. Tree split prior combines `alpha_tau` and `beta_tau` via `alpha_tau*(1+node_depth)^-beta_tau`. Default: `0.25`.
+#' - `beta_tau` Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. Tree split prior combines `alpha_tau` and `beta_tau` via `alpha_tau*(1+node_depth)^-beta_tau`. Default: `3.0`.
+#' - `min_samples_leaf_tau` Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Default: `5`.
+#' - `max_depth_tau` Maximum depth of any tree in the tau ensemble. Default: `5`. Can be overridden with `-1` which does not enforce any depth limits on trees.
+#'
+#' **3.2. Leaf Model Parameters**
+#'
+#' - `a_leaf_tau` Shape parameter in the `IG(a_leaf_tau, b_leaf_tau)` leaf node parameter variance model for the treatment effect forest. Default: `3`.
+#' - `b_leaf_tau` Scale parameter in the `IG(a_leaf_tau, b_leaf_tau)` leaf node parameter variance model for the treatment effect forest. Calibrated internally as `0.5/num_trees_tau` if not set here.
+#' - `keep_vars_tau` Vector of variable names or column indices denoting variables that should be included in the treatment effect (`tau(X)`) forest. Default: `NULL`.
+#' - `drop_vars_tau` Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (`tau(X)`) forest. Default: `NULL`. If both `drop_vars_tau` and `keep_vars_tau` are set, `drop_vars_tau` will be ignored.
+#'
+#' **4. Conditional Variance Forest Parameters**
+#'
+#' - `num_trees_variance` Number of trees in the (optional) conditional variance forest model. Default: `0`.
+#' - `variance_forest_init` Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as `log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance` if not set.
+#' - `pct_var_variance_forest_init` Percentage of standardized outcome variance used to initialize the conditional variance forest. Default: `1`. Superseded by `variance_forest_init`.
+#'
+#' **4.1. Tree Prior Parameters**
+#'
+#' - `alpha_variance` Prior probability of splitting for a tree of depth 0 in the (optional) conditional variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: `0.95`.
+#' - `beta_variance` Exponent that decreases split probabilities for nodes of depth > 0 in the (optional) conditional variance model. Tree split prior combines `alpha_variance` and `beta_variance` via `alpha_variance*(1+node_depth)^-beta_variance`. Default: `2.0`.
+#' - `min_samples_leaf_variance` Minimum allowable size of a leaf, in terms of training samples, in the (optional) conditional variance model. Default: `5`.
+#' - `max_depth_variance` Maximum depth of any tree in the ensemble in the (optional) conditional variance model. Default: `10`. Can be overridden with `-1` which does not enforce any depth limits on trees.
+#'
+#' **4.2. Leaf Model Parameters**
+#'
+#' - `a_forest` Shape parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2 + 0.5` if not set.
+#' - `b_forest` Scale parameter in the `IG(a_forest, b_forest)` conditional error variance model (which is only sampled if `num_trees_variance > 0`). Calibrated internally as `num_trees_variance / 1.5^2` if not set.
+#' - `keep_vars_variance` Vector of variable names or column indices denoting variables that should be included in the (optional) conditional variance forest. Default: `NULL`.
+#' - `drop_vars_variance` Vector of variable names or column indices denoting variables that should be excluded from the (optional) conditional variance forest. Default: `NULL`. If both `drop_vars_variance` and `keep_vars_variance` are set, `drop_vars_variance` will be ignored.
+#'
+#' **5. Random Effects Parameters**
+#'
+#' - `rfx_prior_var` Prior on the (diagonals of the) covariance of the additive group-level random regression coefficients. Must be a vector of length `ncol(rfx_basis_train)`. Default: `rep(1, ncol(rfx_basis_train))`.
 #'
 #' @return List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk).
 #' @export
@@ -126,19 +159,63 @@
 #' # abline(0,1,col="red",lty=3,lwd=3)
 bcf <- function(X_train, Z_train, y_train, pi_train = NULL, group_ids_train = NULL,
                 rfx_basis_train = NULL, X_test = NULL, Z_test = NULL, pi_test = NULL,
-                group_ids_test = NULL, rfx_basis_test = NULL, cutpoint_grid_size = 100,
-                sigma_leaf_mu = NULL, sigma_leaf_tau = NULL, alpha_mu = 0.95, alpha_tau = 0.25,
-                alpha_variance = 0.95, beta_mu = 2.0, beta_tau = 3.0, beta_variance = 2.0,
-                min_samples_leaf_mu = 5, min_samples_leaf_tau = 5, min_samples_leaf_variance = 5,
-                max_depth_mu = 10, max_depth_tau = 5, max_depth_variance = 10, a_global = 0, b_global = 0,
-                a_leaf_mu = 3, a_leaf_tau = 3, b_leaf_mu = NULL, b_leaf_tau = NULL, a_forest = NULL, b_forest = NULL,
-                sigma2_init = NULL, variance_forest_init = NULL, pct_var_sigma2_init = 1,
-                pct_var_variance_forest_init = 1, variable_weights = NULL, keep_vars_mu = NULL,
-                drop_vars_mu = NULL, keep_vars_tau = NULL, drop_vars_tau = NULL, keep_vars_variance = NULL,
-                drop_vars_variance = NULL, num_trees_mu = 250, num_trees_tau = 50, num_trees_variance = 0,
-                num_gfr = 5, num_burnin = 0, num_mcmc = 100, sample_sigma_global = T, sample_sigma_leaf_mu = T,
-                sample_sigma_leaf_tau = F, propensity_covariate = "mu", adaptive_coding = T, b_0 = -0.5, b_1 = 0.5,
-                rfx_prior_var = NULL, random_seed = -1, keep_burnin = F, keep_gfr = F, verbose = F) {
+                group_ids_test = NULL, rfx_basis_test = NULL, num_gfr = 5,
+                num_burnin = 0, num_mcmc = 100, params = list()) {
+  # Extract BCF parameters
+  bcf_params <- preprocessBcfParams(params)
+  cutpoint_grid_size <- bcf_params$cutpoint_grid_size
+  sigma_leaf_mu <- bcf_params$sigma_leaf_mu
+  sigma_leaf_tau <- bcf_params$sigma_leaf_tau
+  alpha_mu <- bcf_params$alpha_mu
+  alpha_tau <- bcf_params$alpha_tau
+  alpha_variance <- bcf_params$alpha_variance
+  beta_mu <- bcf_params$beta_mu
+  beta_tau <- bcf_params$beta_tau
+  beta_variance <- bcf_params$beta_variance
+  min_samples_leaf_mu <- bcf_params$min_samples_leaf_mu
+  min_samples_leaf_tau <- bcf_params$min_samples_leaf_tau
+  min_samples_leaf_variance <- bcf_params$min_samples_leaf_variance
+  max_depth_mu <- bcf_params$max_depth_mu
+  max_depth_tau <- bcf_params$max_depth_tau
+  max_depth_variance <- bcf_params$max_depth_variance
+  a_global <- bcf_params$a_global
+  b_global <- bcf_params$b_global
+  a_leaf_mu <- bcf_params$a_leaf_mu
+  a_leaf_tau <- bcf_params$a_leaf_tau
+  b_leaf_mu <- bcf_params$b_leaf_mu
+  b_leaf_tau <- bcf_params$b_leaf_tau
+  a_forest <- bcf_params$a_forest
+  b_forest <- bcf_params$b_forest
+  sigma2_init <- bcf_params$sigma2_init
+  variance_forest_init <- bcf_params$variance_forest_init
+  pct_var_sigma2_init <- bcf_params$pct_var_sigma2_init
+  pct_var_variance_forest_init <- bcf_params$pct_var_variance_forest_init
+  variable_weights <- bcf_params$variable_weights
+  keep_vars_mu <- bcf_params$keep_vars_mu
+  drop_vars_mu <- bcf_params$drop_vars_mu
+  keep_vars_tau <- bcf_params$keep_vars_tau
+  drop_vars_tau <- bcf_params$drop_vars_tau
+  keep_vars_variance <- bcf_params$keep_vars_variance
+  drop_vars_variance <- bcf_params$drop_vars_variance
+  num_trees_mu <- bcf_params$num_trees_mu
+  num_trees_tau <- bcf_params$num_trees_tau
+  num_trees_variance <- bcf_params$num_trees_variance
+  sample_sigma_global <- bcf_params$sample_sigma_global
+  sample_sigma_leaf_mu <- bcf_params$sample_sigma_leaf_mu
+  sample_sigma_leaf_tau <- bcf_params$sample_sigma_leaf_tau
+  propensity_covariate <- bcf_params$propensity_covariate
+  adaptive_coding <- bcf_params$adaptive_coding
+  b_0 <- bcf_params$b_0
+  b_1 <- bcf_params$b_1
+  rfx_prior_var <- bcf_params$rfx_prior_var
+  random_seed <- bcf_params$random_seed
+  keep_burnin <- bcf_params$keep_burnin
+  keep_gfr <- bcf_params$keep_gfr
+  verbose <- bcf_params$verbose
+
   # Determine whether conditional variance will be modeled
   if (num_trees_variance > 0) include_variance_forest = T
   else include_variance_forest = F
diff --git a/R/utils.R b/R/utils.R
index 7afffe27..4d42c835 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,3 +1,78 @@
+#' Preprocess BART parameter list. Override defaults with any provided parameters.
+#'
+#' @param params Parameter list
+#'
+#' @return Parameter list with defaults overridden by values supplied in `params`
+#' @export
+preprocessBartParams <- function(params) {
+  # Default parameter values
+  processed_params <- list(
+    cutpoint_grid_size = 100, sigma_leaf_init = NULL,
+    alpha_mean = 0.95, beta_mean = 2.0,
+    min_samples_leaf_mean = 5, max_depth_mean = 10,
+    alpha_variance = 0.95, beta_variance = 2.0,
+    min_samples_leaf_variance = 5, max_depth_variance = 10,
+    a_global = 0, b_global = 0, a_leaf = 3, b_leaf = NULL,
+    a_forest = NULL, b_forest = NULL, variance_scale = 1,
+    sigma2_init = NULL, variance_forest_init = NULL,
+    pct_var_sigma2_init = 1, pct_var_variance_forest_init = 1,
+    variable_weights_mean = NULL, variable_weights_variance = NULL,
+    num_trees_mean = 200, num_trees_variance = 0,
+    sample_sigma_global = T, sample_sigma_leaf = F,
+    random_seed = -1, keep_burnin = F, keep_gfr = F, verbose = F
+  )
+
+  # Override defaults with user-supplied values, rejecting unrecognized keys
+  for (key in names(params)) {
+    if (!key %in% names(processed_params)) {
+      stop("Variable ", key, " is not a valid BART model parameter")
+    }
+    val <- params[[key]]
+    # NULL entries are ignored, so NULL defaults are retained unless overridden
+    if (!is.null(val)) processed_params[[key]] <- val
+  }
+
+  # Return result
+  return(processed_params)
+}
+
+#' Preprocess BCF parameter list. Override defaults with any provided parameters.
+#'
+#' @param params Parameter list
+#'
+#' @return Parameter list with defaults overridden by values supplied in `params`
+#' @export
+preprocessBcfParams <- function(params) {
+  # Default parameter values (num_gfr, num_burnin, and num_mcmc are formal
+  # arguments of bcf() and so are deliberately excluded from this list)
+  processed_params <- list(
+    cutpoint_grid_size = 100, sigma_leaf_mu = NULL, sigma_leaf_tau = NULL,
+    alpha_mu = 0.95, alpha_tau = 0.25, alpha_variance = 0.95,
+    beta_mu = 2.0, beta_tau = 3.0, beta_variance = 2.0,
+    min_samples_leaf_mu = 5, min_samples_leaf_tau = 5, min_samples_leaf_variance = 5,
+    max_depth_mu = 10, max_depth_tau = 5, max_depth_variance = 10,
+    a_global = 0, b_global = 0, a_leaf_mu = 3, a_leaf_tau = 3, b_leaf_mu = NULL, b_leaf_tau = NULL,
+    a_forest = NULL, b_forest = NULL, sigma2_init = NULL, variance_forest_init = NULL,
+    pct_var_sigma2_init = 1, pct_var_variance_forest_init = 1,
+    variable_weights = NULL, keep_vars_mu = NULL, drop_vars_mu = NULL,
+    keep_vars_tau = NULL, drop_vars_tau = NULL, keep_vars_variance = NULL,
+    drop_vars_variance = NULL, num_trees_mu = 250, num_trees_tau = 50, num_trees_variance = 0,
+    sample_sigma_global = T, sample_sigma_leaf_mu = T,
+    sample_sigma_leaf_tau = F, propensity_covariate = "mu", adaptive_coding = T, b_0 = -0.5, b_1 = 0.5,
+    rfx_prior_var = NULL, random_seed = -1, keep_burnin = F, keep_gfr = F, verbose = F
+  )
+
+  # Override defaults with user-supplied values, rejecting unrecognized keys
+  for (key in names(params)) {
+    if (!key %in% names(processed_params)) {
+      stop("Variable ", key, " is not a valid BCF model parameter")
+    }
+    val <- params[[key]]
+    # NULL entries are ignored, so NULL defaults are retained unless overridden
+    if (!is.null(val)) processed_params[[key]] <- val
+  }
+
+  # Return result
+  return(processed_params)
+}
+
 #' Preprocess covariates. DataFrames will be preprocessed based on their column
 #' types. Matrices will be passed through assuming all columns are numeric.
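To make the merge semantics of these helpers concrete, a short sketch of how they behave (values shown follow from the default lists above; the same logic backs both `bart()` and `bcf()`):

```r
# No overrides: all documented defaults are returned
defaults <- preprocessBartParams(list())
defaults$num_trees_mean     # 200

# Supplied entries override defaults; NULL entries are ignored
overridden <- preprocessBartParams(list(num_trees_mean = 50, sigma2_init = NULL))
overridden$num_trees_mean   # 50
overridden$sigma2_init      # NULL, i.e. the default is retained

# Misspelled or unsupported keys fail fast instead of being silently dropped
# preprocessBcfParams(list(num_trees = 50))  # Error: not a valid BCF model parameter
```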
#' diff --git a/demo/debug/causal_inference.py b/demo/debug/causal_inference.py index 10e2f58c..0aa1bb0b 100644 --- a/demo/debug/causal_inference.py +++ b/demo/debug/causal_inference.py @@ -87,11 +87,11 @@ plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -# sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=["Sample", "Sigma"]) +# sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=["Sample", "Sigma"]) # sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") # plt.show() -# b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples[bcf_model.num_gfr:],axis=1), np.expand_dims(bcf_model.b1_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=["Sample", "Beta_0", "Beta_1"]) +# b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=["Sample", "Beta_0", "Beta_1"]) # sns.scatterplot(data=b_df_mcmc, x="Sample", y="Beta_0") # sns.scatterplot(data=b_df_mcmc, x="Sample", y="Beta_1") # plt.show() diff --git a/demo/debug/serialization.py b/demo/debug/serialization.py index 7aaae1b6..0ba48903 100644 --- a/demo/debug/serialization.py +++ b/demo/debug/serialization.py @@ -92,4 +92,4 @@ def outcome_mean(X, W): forest_container_reloaded = ForestContainer(num_trees, W.shape[1], False, False) forest_container_reloaded.load_from_json_string(forest_json_string) y_hat_reloaded = forest_container_reloaded.predict(dataset) -np.testing.assert_approx_equal(y_hat_orig, y_hat_reloaded) +np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) diff --git a/demo/debug/supervised_learning.py b/demo/debug/supervised_learning.py index ebff227c..f7e3b1ab 100644 --- a/demo/debug/supervised_learning.py +++ b/demo/debug/supervised_learning.py @@ -66,7 +66,7 @@ def outcome_mean(X, W): plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples[bart_model.num_gfr:],axis=1)), axis = 1), columns=["Sample", "Sigma"]) +sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=["Sample", "Sigma"]) sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") plt.show() @@ -89,7 +89,7 @@ def outcome_mean(X, W): plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples[bart_model.num_gfr:],axis=1)), axis = 1), columns=["Sample", "Sigma"]) +sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=["Sample", "Sigma"]) sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") plt.show() @@ -110,7 
+110,7 @@ def outcome_mean(X, W): plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3,3))) plt.show() -sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples[bart_model.num_gfr:],axis=1)), axis = 1), columns=["Sample", "Sigma"]) +sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=["Sample", "Sigma"]) sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") plt.show() diff --git a/demo/notebooks/causal_inference.ipynb b/demo/notebooks/causal_inference.ipynb index 0e8663a5..085243a0 100644 --- a/demo/notebooks/causal_inference.ipynb +++ b/demo/notebooks/causal_inference.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +119,7 @@ "metadata": {}, "outputs": [], "source": [ - "forest_preds_y_mcmc = bcf_model.y_hat_test[:,bcf_model.num_gfr:]\n", + "forest_preds_y_mcmc = bcf_model.y_hat_test\n", "y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n", "y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n", "sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n", @@ -133,7 +133,7 @@ "metadata": {}, "outputs": [], "source": [ - "forest_preds_tau_mcmc = bcf_model.tau_hat_test[:,bcf_model.num_gfr:]\n", + "forest_preds_tau_mcmc = bcf_model.tau_hat_test\n", "tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n", "tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n", "sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n", @@ -147,7 +147,7 @@ "metadata": {}, "outputs": [], "source": [ - "forest_preds_mu_mcmc = bcf_model.mu_hat_test[:,bcf_model.num_gfr:]\n", + "forest_preds_mu_mcmc = bcf_model.mu_hat_test\n", "mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n", "mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n", "sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n", @@ -161,7 +161,7 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] 
@@ -172,7 +172,7 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples[bcf_model.num_gfr:],axis=1), np.expand_dims(bcf_model.b1_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" @@ -181,7 +181,7 @@ ], "metadata": { "kernelspec": { - "display_name": "stochtree-dev", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -195,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/demo/notebooks/causal_inference_feature_subsets.ipynb b/demo/notebooks/causal_inference_feature_subsets.ipynb index 1d35bb5c..b84c9c7f 100644 --- a/demo/notebooks/causal_inference_feature_subsets.ipynb +++ b/demo/notebooks/causal_inference_feature_subsets.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -115,7 +115,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Inspect the MCMC (BART) samples" + "Inspect the MCMC samples" ] }, { @@ -166,7 +166,7 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -177,7 +177,7 @@ "metadata": {}, "outputs": [], "source": [ - "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples[bcf_model.num_gfr:],axis=1), np.expand_dims(bcf_model.b1_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", + "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples,axis=1), np.expand_dims(bcf_model.b1_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", "plt.show()" @@ -192,12 +192,13 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "bcf_model_subset = BCFModel()\n", - "bcf_model_subset.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, keep_vars_tau=[0,1])" + "bcf_params = {'keep_vars_tau': [0,1]}\n", + "bcf_model_subset.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, params=bcf_params)" ] }, { @@ -256,7 +257,7 @@ "outputs": [], "source": [ "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n", - " np.expand_dims(bcf_model_subset.global_var_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n", + " np.expand_dims(bcf_model_subset.global_var_samples,axis=1)), axis = 1), \n", " columns=[\"Sample\", \"Sigma\"])\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" @@ -269,8 +270,8 @@ "outputs": [], "source": [ "b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n", - " np.expand_dims(bcf_model_subset.b0_samples[bcf_model_subset.num_gfr:],axis=1), \n", - " np.expand_dims(bcf_model_subset.b1_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n", + " np.expand_dims(bcf_model_subset.b0_samples,axis=1), \n", + " np.expand_dims(bcf_model_subset.b1_samples,axis=1)), axis = 1), \n", " columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n", "sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n", @@ -280,7 +281,7 @@ ], "metadata": { "kernelspec": { - "display_name": "stochtree-dev", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -294,7 +295,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/demo/notebooks/heteroskedastic_supervised_learning.ipynb b/demo/notebooks/heteroskedastic_supervised_learning.ipynb index 77814767..9eab1b68 100644 --- a/demo/notebooks/heteroskedastic_supervised_learning.ipynb +++ b/demo/notebooks/heteroskedastic_supervised_learning.ipynb @@ -139,8 +139,9 @@ "outputs": [], "source": [ "bart_model = BARTModel()\n", + "bart_params = {'num_trees_mean': 100, 'num_trees_variance': 50, 'sample_sigma_global': True, 'sample_sigma_leaf': False}\n", "bart_model.sample(X_train=X_train, y_train=y_train, X_test=X_test, basis_train=basis_train, basis_test=basis_test,\n", - " num_gfr=10, num_mcmc=100, num_trees_mean=100, num_trees_variance=50, sample_sigma_global=True, sample_sigma_leaf=False)" + " num_gfr=10, num_mcmc=100, params=bart_params)" ] }, { diff --git a/demo/notebooks/multivariate_treatment_causal_inference.ipynb b/demo/notebooks/multivariate_treatment_causal_inference.ipynb index 6e175bd5..4d959109 100644 --- a/demo/notebooks/multivariate_treatment_causal_inference.ipynb +++ b/demo/notebooks/multivariate_treatment_causal_inference.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 4, "metadata": {}, "outputs": [], "source": [ diff --git a/demo/notebooks/supervised_learning.ipynb b/demo/notebooks/supervised_learning.ipynb index 227cedd3..ca7d6312 100644 --- a/demo/notebooks/supervised_learning.ipynb +++ b/demo/notebooks/supervised_learning.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -149,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples[bart_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -186,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -223,7 +223,7 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples[bart_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -295,7 +295,7 @@ "metadata": {}, "outputs": [], "source": [ - "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples[bart_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", + "sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bart_model.num_samples - bart_model.num_gfr),axis=1), np.expand_dims(bart_model.global_var_samples,axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n", "sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n", "plt.show()" ] @@ -319,7 +319,7 @@ ], "metadata": { "kernelspec": { - "display_name": "stochtree-dev", + "display_name": "venv", "language": "python", "name": "python3" }, @@ -333,7 +333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/man/bart.Rd b/man/bart.Rd index 45d5a0f7..44b22410 100644 --- a/man/bart.Rd +++ b/man/bart.Rd @@ -14,41 +14,10 @@ bart( W_test = NULL, group_ids_test = NULL, rfx_basis_test = NULL, - cutpoint_grid_size = 100, - 
sigma_leaf_init = NULL, - alpha_mean = 0.95, - beta_mean = 2, - min_samples_leaf_mean = 5, - max_depth_mean = 10, - alpha_variance = 0.95, - beta_variance = 2, - min_samples_leaf_variance = 5, - max_depth_variance = 10, - a_global = 0, - b_global = 0, - a_leaf = 3, - b_leaf = NULL, - a_forest = NULL, - b_forest = NULL, - q = 0.9, - sigma2_init = NULL, - variance_forest_init = NULL, - pct_var_sigma2_init = 1, - pct_var_variance_forest_init = 1, - variance_scale = 1, - variable_weights_mean = NULL, - variable_weights_variance = NULL, - num_trees_mean = 200, - num_trees_variance = 0, num_gfr = 5, num_burnin = 0, num_mcmc = 100, - sample_sigma_global = T, - sample_sigma_leaf = F, - random_seed = -1, - keep_burnin = F, - keep_gfr = F, - verbose = F + params = list() ) } \arguments{ @@ -85,77 +54,73 @@ that were not in the training set.} \item{rfx_basis_test}{(Optional) Test set basis for "random-slope" regression in additive random effects model.} -\item{cutpoint_grid_size}{Maximum size of the "grid" of potential cutpoints to consider. Default: 100.} - -\item{sigma_leaf_init}{Starting value of leaf node scale parameter. Calibrated internally as \code{1/num_trees_mean} if not set here.} - -\item{alpha_mean}{Prior probability of splitting for a tree of depth 0 in the mean model. Tree split prior combines \code{alpha_mean} and \code{beta_mean} via \code{alpha_mean*(1+node_depth)^-beta_mean}. Default: 0.95.} - -\item{beta_mean}{Exponent that decreases split probabilities for nodes of depth > 0 in the mean model. Tree split prior combines \code{alpha_mean} and \code{beta_mean} via \code{alpha_mean*(1+node_depth)^-beta_mean}. Default: 2.} - -\item{min_samples_leaf_mean}{Minimum allowable size of a leaf, in terms of training samples, in the mean model. Default: 5.} - -\item{max_depth_mean}{Maximum depth of any tree in the ensemble in the mean model. Default: 10. Can be overriden with \code{-1} which does not enforce any depth limits on trees.} - -\item{alpha_variance}{Prior probability of splitting for a tree of depth 0 in the variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: 0.95.} - -\item{beta_variance}{Exponent that decreases split probabilities for nodes of depth > 0 in the variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance} .Default: 2.} - -\item{min_samples_leaf_variance}{Minimum allowable size of a leaf, in terms of training samples, in the variance model. Default: 5.} - -\item{max_depth_variance}{Maximum depth of any tree in the ensemble in the variance model. Default: 10. Can be overriden with \code{-1} which does not enforce any depth limits on trees.} - -\item{a_global}{Shape parameter in the \code{IG(a_global, b_global)} global error variance model. Default: 0.} - -\item{b_global}{Scale parameter in the \code{IG(a_global, b_global)} global error variance model. Default: 0.} - -\item{a_leaf}{Shape parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model. Default: 3.} - -\item{b_leaf}{Scale parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model. Calibrated internally as \code{0.5/num_trees_mean} if not set here.} - -\item{a_forest}{Shape parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). 
Calibrated internally as \code{num_trees_variance / 1.5^2 + 0.5} if not set.} - -\item{b_forest}{Scale parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2} if not set.} - -\item{q}{Quantile used to calibrated \code{lambda} as in Sparapani et al (2021). Default: 0.9.} - -\item{sigma2_init}{Starting value of global error variance parameter. Calibrated internally as \code{pct_var_sigma2_init*var((y-mean(y))/sd(y))} if not set.} - -\item{variance_forest_init}{Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as \code{log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance} if not set.} - -\item{pct_var_sigma2_init}{Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by \code{sigma2_init}.} - -\item{pct_var_variance_forest_init}{Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by \code{variance_forest_init}.} - -\item{variance_scale}{Variance after the data have been scaled. Default: 1.} - -\item{variable_weights_mean}{Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to \code{rep(1/ncol(X_train), ncol(X_train))} if not set here.} - -\item{variable_weights_variance}{Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to \code{rep(1/ncol(X_train), ncol(X_train))} if not set here.} - -\item{num_trees_mean}{Number of trees in the ensemble for the conditional mean model. Default: 200. If \code{num_trees_mean = 0}, the conditional mean will not be modeled using a forest and the function will only proceed if \code{num_trees_variance > 0}.} - -\item{num_trees_variance}{Number of trees in the ensemble for the conditional variance model. Default: 0. Variance is only modeled using a tree / forest if \code{num_trees_variance > 0}.} - \item{num_gfr}{Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Default: 5.} \item{num_burnin}{Number of "burn-in" iterations of the MCMC sampler. Default: 0.} \item{num_mcmc}{Number of "retained" iterations of the MCMC sampler. Default: 100.} -\item{sample_sigma_global}{Whether or not to update the \code{sigma^2} global error variance parameter based on \code{IG(a_global, b_global)}. Default: T.} +\item{params}{The list of model parameters, each of which has a default value. + +\strong{1. Global Parameters} +\itemize{ +\item \code{cutpoint_grid_size} Maximum size of the "grid" of potential cutpoints to consider. Default: \code{100}. +\item \code{sigma2_init} Starting value of global error variance parameter. Calibrated internally as \code{pct_var_sigma2_init*var((y-mean(y))/sd(y))} if not set. +\item \code{pct_var_sigma2_init} Percentage of standardized outcome variance used to initialize global error variance parameter. Default: \code{1}. Superseded by \code{sigma2_init}. +\item \code{variance_scale} Variance after the data have been scaled. Default: \code{1}. +\item \code{a_global} Shape parameter in the \code{IG(a_global, b_global)} global error variance model. Default: \code{0}. 
+\item \code{b_global} Scale parameter in the \code{IG(a_global, b_global)} global error variance model. Default: \code{0}. +\item \code{random_seed} Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to \code{std::random_device}. +\item \code{sample_sigma_global} Whether or not to update the \code{sigma^2} global error variance parameter based on \code{IG(a_global, b_global)}. Default: \code{TRUE}. +\item \code{keep_burnin} Whether or not "burnin" samples should be included in cached predictions. Default \code{FALSE}. Ignored if \code{num_mcmc = 0}. +\item \code{keep_gfr} Whether or not "grow-from-root" samples should be included in cached predictions. Default \code{TRUE}. Ignored if \code{num_mcmc = 0}. +\item \code{verbose} Whether or not to print progress during the sampling loops. Default: \code{FALSE}. +} -\item{sample_sigma_leaf}{Whether or not to update the \code{tau} leaf scale variance parameter based on \code{IG(a_leaf, b_leaf)}. Cannot (currently) be set to true if \code{ncol(W_train)>1}. Default: F.} +\strong{2. Mean Forest Parameters} +\itemize{ +\item \code{num_trees_mean} Number of trees in the ensemble for the conditional mean model. Default: \code{200}. If \code{num_trees_mean = 0}, the conditional mean will not be modeled using a forest, and the function will only proceed if \code{num_trees_variance > 0}. +\item \code{sample_sigma_leaf} Whether or not to update the \code{tau} leaf scale variance parameter based on \code{IG(a_leaf, b_leaf)}. Cannot (currently) be set to true if \code{ncol(W_train)>1}. Default: \code{FALSE}. +} -\item{random_seed}{Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to \code{std::random_device}.} +\strong{2.1. Tree Prior Parameters} +\itemize{ +\item \code{alpha_mean} Prior probability of splitting for a tree of depth 0 in the mean model. Tree split prior combines \code{alpha_mean} and \code{beta_mean} via \code{alpha_mean*(1+node_depth)^-beta_mean}. Default: \code{0.95}. +\item \code{beta_mean} Exponent that decreases split probabilities for nodes of depth > 0 in the mean model. Tree split prior combines \code{alpha_mean} and \code{beta_mean} via \code{alpha_mean*(1+node_depth)^-beta_mean}. Default: \code{2}. +\item \code{min_samples_leaf_mean} Minimum allowable size of a leaf, in terms of training samples, in the mean model. Default: \code{5}. +\item \code{max_depth_mean} Maximum depth of any tree in the ensemble in the mean model. Default: \code{10}. Can be overridden with \code{-1} which does not enforce any depth limits on trees. +} -\item{keep_burnin}{Whether or not "burnin" samples should be included in cached predictions. Default FALSE. Ignored if num_mcmc = 0.} +\strong{2.2. Leaf Model Parameters} +\itemize{ +\item \code{variable_weights_mean} Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to \code{rep(1/ncol(X_train), ncol(X_train))} if not set here. +\item \code{sigma_leaf_init} Starting value of leaf node scale parameter. Calibrated internally as \code{1/num_trees_mean} if not set here. +\item \code{a_leaf} Shape parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model. Default: \code{3}. +\item \code{b_leaf} Scale parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model. 
Calibrated internally as \code{0.5/num_trees_mean} if not set here. +} -\item{keep_gfr}{Whether or not "grow-from-root" samples should be included in cached predictions. Default TRUE. Ignored if num_mcmc = 0.} +\strong{3. Conditional Variance Forest Parameters} +\itemize{ +\item \code{num_trees_variance} Number of trees in the ensemble for the conditional variance model. Default: \code{0}. Variance is only modeled using a tree / forest if \code{num_trees_variance > 0}. +\item \code{variance_forest_init} Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as \code{log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance} if not set. +\item \code{pct_var_variance_forest_init} Percentage of standardized outcome variance used to initialize the (heteroskedastic) variance forest. Default: \code{1}. Superseded by \code{variance_forest_init}. +} -\item{verbose}{Whether or not to print progress during the sampling loops. Default: FALSE.} +\strong{3.1. Tree Prior Parameters} +\itemize{ +\item \code{alpha_variance} Prior probability of splitting for a tree of depth 0 in the variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: \code{0.95}. +\item \code{beta_variance} Exponent that decreases split probabilities for nodes of depth > 0 in the variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: \code{2}. +\item \code{min_samples_leaf_variance} Minimum allowable size of a leaf, in terms of training samples, in the variance model. Default: \code{5}. +\item \code{max_depth_variance} Maximum depth of any tree in the ensemble in the variance model. Default: \code{10}. Can be overridden with \code{-1} which does not enforce any depth limits on trees. +} -\item{leaf_model}{Model to use in the leaves, coded as integer with (0 = constant leaf, 1 = univariate leaf regression, 2 = multivariate leaf regression). Default: 0.} +\strong{3.2. Leaf Model Parameters} +\itemize{ +\item \code{variable_weights_variance} Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to \code{rep(1/ncol(X_train), ncol(X_train))} if not set here. +\item \code{a_forest} Shape parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2 + 0.5} if not set. +\item \code{b_forest} Scale parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2} if not set. +}} } \value{ List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk).
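For quick reference, a minimal sketch of the list-based calling convention that the rewritten `bart()` documentation above describes; the simulated `X_train` and `y_train` are hypothetical placeholders, and any parameter omitted from `params` falls back to its documented default:

```r
# Illustrative only: simulate a small training set
n <- 500
X_train <- matrix(runif(n * 5), ncol = 5)
y_train <- sin(4 * pi * X_train[, 1]) + rnorm(n, sd = 0.5)

# Override a few defaults; everything else is filled in internally
# (via preprocessBartParams) with the defaults documented above
bart_params <- list(num_trees_mean = 100, num_trees_variance = 50,
                    sample_sigma_global = TRUE, sample_sigma_leaf = FALSE)
bart_fit <- stochtree::bart(X_train = X_train, y_train = y_train,
                            num_gfr = 10, num_mcmc = 100, params = bart_params)
```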
diff --git a/man/bcf.Rd b/man/bcf.Rd index 9bcfd339..0e96e2e5 100644 --- a/man/bcf.Rd +++ b/man/bcf.Rd @@ -16,56 +16,10 @@ bcf( pi_test = NULL, group_ids_test = NULL, rfx_basis_test = NULL, - cutpoint_grid_size = 100, - sigma_leaf_mu = NULL, - sigma_leaf_tau = NULL, - alpha_mu = 0.95, - alpha_tau = 0.25, - alpha_variance = 0.95, - beta_mu = 2, - beta_tau = 3, - beta_variance = 2, - min_samples_leaf_mu = 5, - min_samples_leaf_tau = 5, - min_samples_leaf_variance = 5, - max_depth_mu = 10, - max_depth_tau = 5, - max_depth_variance = 10, - a_global = 0, - b_global = 0, - a_leaf_mu = 3, - a_leaf_tau = 3, - b_leaf_mu = NULL, - b_leaf_tau = NULL, - sigma2_init = NULL, - variance_forest_init = NULL, - pct_var_sigma2_init = 1, - pct_var_variance_forest_init = 1, - variable_weights = NULL, - keep_vars_mu = NULL, - drop_vars_mu = NULL, - keep_vars_tau = NULL, - drop_vars_tau = NULL, - keep_vars_variance = NULL, - drop_vars_variance = NULL, - num_trees_mu = 250, - num_trees_tau = 50, - num_trees_variance = 0, num_gfr = 5, num_burnin = 0, num_mcmc = 100, - sample_sigma_global = T, - sample_sigma_leaf_mu = T, - sample_sigma_leaf_tau = F, - propensity_covariate = "mu", - adaptive_coding = T, - b_0 = -0.5, - b_1 = 0.5, - rfx_prior_var = NULL, - random_seed = -1, - keep_burnin = F, - keep_gfr = F, - verbose = F + params = list() ) } \arguments{ @@ -101,109 +55,105 @@ that were not in the training set.} \item{rfx_basis_test}{(Optional) Test set basis for "random-slope" regression in additive random effects model.} -\item{cutpoint_grid_size}{Maximum size of the "grid" of potential cutpoints to consider. Default: 100.} - -\item{sigma_leaf_mu}{Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as \code{1/num_trees_mu} if not set here.} - -\item{sigma_leaf_tau}{Starting value of leaf node scale parameter for the treatment effect forest. Calibrated internally as \code{1/(2*num_trees_tau)} if not set here.} - -\item{alpha_mu}{Prior probability of splitting for a tree of depth 0 for the prognostic forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_mu*(1+node_depth)^-beta_mu}. Default: 0.95.} - -\item{alpha_tau}{Prior probability of splitting for a tree of depth 0 for the treatment effect forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_tau*(1+node_depth)^-beta_tau}. Default: 0.25.} - -\item{alpha_variance}{Prior probability of splitting for a tree of depth 0 in the (optional) conditional variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: 0.95.} - -\item{beta_mu}{Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_mu*(1+node_depth)^-beta_mu}. Default: 2.0.} - -\item{beta_tau}{Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_tau*(1+node_depth)^-beta_tau}. Default: 3.0.} - -\item{beta_variance}{Exponent that decreases split probabilities for nodes of depth > 0 in the (optional) conditional variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: 2.0.} - -\item{min_samples_leaf_mu}{Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. 
Default: 5.} - -\item{min_samples_leaf_tau}{Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Default: 5.} - -\item{min_samples_leaf_variance}{Minimum allowable size of a leaf, in terms of training samples, in the (optional) conditional variance model. Default: 5.} - -\item{max_depth_mu}{Maximum depth of any tree in the mu ensemble. Default: 10. Can be overriden with \code{-1} which does not enforce any depth limits on trees.} - -\item{max_depth_tau}{Maximum depth of any tree in the tau ensemble. Default: 5. Can be overriden with \code{-1} which does not enforce any depth limits on trees.} - -\item{max_depth_variance}{Maximum depth of any tree in the ensemble in the (optional) conditional variance model. Default: 10. Can be overriden with \code{-1} which does not enforce any depth limits on trees.} - -\item{a_global}{Shape parameter in the \code{IG(a_global, b_global)} global error variance model. Default: 0.} - -\item{b_global}{Scale parameter in the \code{IG(a_global, b_global)} global error variance model. Default: 0.} - -\item{a_leaf_mu}{Shape parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model for the prognostic forest. Default: 3.} - -\item{a_leaf_tau}{Shape parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model for the treatment effect forest. Default: 3.} - -\item{b_leaf_mu}{Scale parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model for the prognostic forest. Calibrated internally as 0.5/num_trees if not set here.} - -\item{b_leaf_tau}{Scale parameter in the \code{IG(a_leaf, b_leaf)} leaf node parameter variance model for the treatment effect forest. Calibrated internally as 0.5/num_trees if not set here.} - -\item{sigma2_init}{Starting value of global error variance parameter. Calibrated internally as \code{pct_var_sigma2_init*var((y-mean(y))/sd(y))} if not set.} - -\item{variance_forest_init}{Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as \code{log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance} if not set.} - -\item{pct_var_sigma2_init}{Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by \code{sigma2_init}.} - -\item{pct_var_variance_forest_init}{Percentage of standardized outcome variance used to initialize global error variance parameter. Default: 1. Superseded by \code{variance_forest_init}.} - -\item{variable_weights}{Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to \code{rep(1/ncol(X_train), ncol(X_train))} if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to \code{1/ncol(X_train)}. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in \code{X_train} and then set \code{propensity_covariate} to \code{'none'} adjust \code{keep_vars_mu} and \code{keep_vars_tau} accordingly.} - -\item{keep_vars_mu}{Vector of variable names or column indices denoting variables that should be included in the prognostic (\code{mu(X)}) forest. Default: NULL.} - -\item{drop_vars_mu}{Vector of variable names or column indices denoting variables that should be excluded from the prognostic (\code{mu(X)}) forest. Default: NULL. 
If both \code{drop_vars_mu} and \code{keep_vars_mu} are set, \code{drop_vars_mu} will be ignored.} - -\item{keep_vars_tau}{Vector of variable names or column indices denoting variables that should be included in the treatment effect (\code{tau(X)}) forest. Default: NULL.} - -\item{drop_vars_tau}{Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (\code{tau(X)}) forest. Default: NULL. If both \code{drop_vars_tau} and \code{keep_vars_tau} are set, \code{drop_vars_tau} will be ignored.} - -\item{keep_vars_variance}{Vector of variable names or column indices denoting variables that should be included in the (optional) conditional variance forest. Default: NULL.} - -\item{drop_vars_variance}{Vector of variable names or column indices denoting variables that should be excluded from the (optional) conditional variance forest. Default: NULL. If both \code{drop_vars_variance} and \code{keep_vars_variance} are set, \code{drop_vars_variance} will be ignored.} - -\item{num_trees_mu}{Number of trees in the prognostic forest. Default: 200.} - -\item{num_trees_tau}{Number of trees in the treatment effect forest. Default: 50.} - -\item{num_trees_variance}{Number of trees in the (optional) conditional variance forest model. Default: 0.} - \item{num_gfr}{Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Default: 5.} \item{num_burnin}{Number of "burn-in" iterations of the MCMC sampler. Default: 0.} \item{num_mcmc}{Number of "retained" iterations of the MCMC sampler. Default: 100.} -\item{sample_sigma_global}{Whether or not to update the \code{sigma^2} global error variance parameter based on \code{IG(a_global, b_global)}. Default: T.} - -\item{sample_sigma_leaf_mu}{Whether or not to update the \code{sigma_leaf_mu} leaf scale variance parameter in the prognostic forest based on \code{IG(a_leaf_mu, b_leaf_mu)}. Default: T.} - -\item{sample_sigma_leaf_tau}{Whether or not to update the \code{sigma_leaf_tau} leaf scale variance parameter in the treatment effect forest based on \code{IG(a_leaf_tau, b_leaf_tau)}. Default: T.} - -\item{propensity_covariate}{Whether to include the propensity score as a covariate in either or both of the forests. Enter "none" for neither, "mu" for the prognostic forest, "tau" for the treatment forest, and "both" for both forests. If this is not "none" and a propensity score is not provided, it will be estimated from (\code{X_train}, \code{Z_train}) using \code{stochtree::bart()}. Default: "mu".} +\item{params}{The list of model parameters, each of which has a default value. + +\strong{1. Global Parameters} +\itemize{ +\item \code{cutpoint_grid_size} Maximum size of the "grid" of potential cutpoints to consider. Default: \code{100}. +\item \code{a_global} Shape parameter in the \code{IG(a_global, b_global)} global error variance model. Default: \code{0}. +\item \code{b_global} Scale parameter in the \code{IG(a_global, b_global)} global error variance model. Default: \code{0}. +\item \code{sigma2_init} Starting value of global error variance parameter. Calibrated internally as \code{pct_var_sigma2_init*var((y-mean(y))/sd(y))} if not set. +\item \code{pct_var_sigma2_init} Percentage of standardized outcome variance used to initialize global error variance parameter. Default: \code{1}. Superseded by \code{sigma2_init}. +\item \code{variable_weights} Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. 
Defaults to \code{rep(1/ncol(X_train), ncol(X_train))} if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to \code{1/ncol(X_train)}. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in \code{X_train} and then set \code{propensity_covariate} to \code{'none'} and adjust \code{keep_vars_mu}, \code{keep_vars_tau} and \code{keep_vars_variance} accordingly. +\item \code{propensity_covariate} Whether to include the propensity score as a covariate in either or both of the forests. Enter \code{"none"} for neither, \code{"mu"} for the prognostic forest, \code{"tau"} for the treatment forest, and \code{"both"} for both forests. If this is not \code{"none"} and a propensity score is not provided, it will be estimated from (\code{X_train}, \code{Z_train}) using \code{stochtree::bart()}. Default: \code{"mu"}. +\item \code{adaptive_coding} Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via parameters \code{b_0} and \code{b_1} that attach to the outcome model \verb{[b_0 (1-Z) + b_1 Z] tau(X)}. This is ignored when Z is not binary. Default: \code{TRUE}. +\item \code{b_0} Initial value of the "control" group coding parameter. This is ignored when Z is not binary. Default: \code{-0.5}. +\item \code{b_1} Initial value of the "treatment" group coding parameter. This is ignored when Z is not binary. Default: \code{0.5}. +\item \code{random_seed} Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to \code{std::random_device}. +\item \code{keep_burnin} Whether or not "burnin" samples should be included in cached predictions. Default \code{FALSE}. Ignored if \code{num_mcmc = 0}. +\item \code{keep_gfr} Whether or not "grow-from-root" samples should be included in cached predictions. Default \code{FALSE}. Ignored if \code{num_mcmc = 0}. +\item \code{verbose} Whether or not to print progress during the sampling loops. Default: \code{FALSE}. +\item \code{sample_sigma_global} Whether or not to update the \code{sigma^2} global error variance parameter based on \code{IG(a_global, b_global)}. Default: \code{TRUE}. +} -\item{adaptive_coding}{Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via parameters \code{b_0} and \code{b_1} that attach to the outcome model \verb{[b_0 (1-Z) + b_1 Z] tau(X)}. This is ignored when Z is not binary. Default: T.} +\strong{2. Prognostic Forest Parameters} +\itemize{ +\item \code{num_trees_mu} Number of trees in the prognostic forest. Default: \code{200}. +\item \code{sample_sigma_leaf_mu} Whether or not to update the \code{sigma_leaf_mu} leaf scale variance parameter in the prognostic forest based on \code{IG(a_leaf_mu, b_leaf_mu)}. Default: \code{TRUE}. +} -\item{b_0}{Initial value of the "control" group coding parameter. This is ignored when Z is not binary. Default: -0.5.} +\strong{2.1. Tree Prior Parameters} +\itemize{ +\item \code{alpha_mu} Prior probability of splitting for a tree of depth 0 for the prognostic forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_mu*(1+node_depth)^-beta_mu}. Default: \code{0.95}. +\item \code{beta_mu} Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest.
Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_mu*(1+node_depth)^-beta_mu}. Default: \code{2.0}. +\item \code{min_samples_leaf_mu} Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Default: \code{5}. +\item \code{max_depth_mu} Maximum depth of any tree in the mu ensemble. Default: \code{10}. Can be overridden with \code{-1} which does not enforce any depth limits on trees. +} -\item{b_1}{Initial value of the "treatment" group coding parameter. This is ignored when Z is not binary. Default: 0.5.} +\strong{2.2. Leaf Model Parameters} +\itemize{ +\item \code{keep_vars_mu} Vector of variable names or column indices denoting variables that should be included in the prognostic (\code{mu(X)}) forest. Default: \code{NULL}. +\item \code{drop_vars_mu} Vector of variable names or column indices denoting variables that should be excluded from the prognostic (\code{mu(X)}) forest. Default: \code{NULL}. If both \code{drop_vars_mu} and \code{keep_vars_mu} are set, \code{drop_vars_mu} will be ignored. +\item \code{sigma_leaf_mu} Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as \code{1/num_trees_mu} if not set here. +\item \code{a_leaf_mu} Shape parameter in the \code{IG(a_leaf_mu, b_leaf_mu)} leaf node parameter variance model for the prognostic forest. Default: \code{3}. +\item \code{b_leaf_mu} Scale parameter in the \code{IG(a_leaf_mu, b_leaf_mu)} leaf node parameter variance model for the prognostic forest. Calibrated internally as \code{0.5/num_trees_mu} if not set here. +} -\item{rfx_prior_var}{Prior on the (diagonals of the) covariance of the additive group-level random regression coefficients. Must be a vector of length \code{ncol(rfx_basis_train)}. Default: \code{rep(1, ncol(rfx_basis_train))}} +\strong{3. Treatment Effect Forest Parameters} +\itemize{ +\item \code{num_trees_tau} Number of trees in the treatment effect forest. Default: \code{50}. +\item \code{sample_sigma_leaf_tau} Whether or not to update the \code{sigma_leaf_tau} leaf scale variance parameter in the treatment effect forest based on \code{IG(a_leaf_tau, b_leaf_tau)}. Default: \code{FALSE}. +} -\item{random_seed}{Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to \code{std::random_device}.} +\strong{3.1. Tree Prior Parameters} +\itemize{ +\item \code{alpha_tau} Prior probability of splitting for a tree of depth 0 for the treatment effect forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_tau*(1+node_depth)^-beta_tau}. Default: \code{0.25}. +\item \code{beta_tau} Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. Tree split prior combines \code{alpha} and \code{beta} via \code{alpha_tau*(1+node_depth)^-beta_tau}. Default: \code{3.0}. +\item \code{min_samples_leaf_tau} Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Default: \code{5}. +\item \code{max_depth_tau} Maximum depth of any tree in the tau ensemble. Default: \code{5}. Can be overridden with \code{-1} which does not enforce any depth limits on trees. +} -\item{keep_burnin}{Whether or not "burnin" samples should be included in cached predictions. Default FALSE. Ignored if num_mcmc = 0.} +\strong{3.2.
Leaf Model Parameters} +\itemize{ +\item \code{a_leaf_tau} Shape parameter in the \code{IG(a_leaf_tau, b_leaf_tau)} leaf node parameter variance model for the treatment effect forest. Default: \code{3}. +\item \code{b_leaf_tau} Scale parameter in the \code{IG(a_leaf_tau, b_leaf_tau)} leaf node parameter variance model for the treatment effect forest. Calibrated internally as \code{0.5/num_trees_tau} if not set here. +\item \code{keep_vars_tau} Vector of variable names or column indices denoting variables that should be included in the treatment effect (\code{tau(X)}) forest. Default: \code{NULL}. +\item \code{drop_vars_tau} Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (\code{tau(X)}) forest. Default: \code{NULL}. If both \code{drop_vars_tau} and \code{keep_vars_tau} are set, \code{drop_vars_tau} will be ignored. +} -\item{keep_gfr}{Whether or not "grow-from-root" samples should be included in cached predictions. Default FALSE. Ignored if num_mcmc = 0.} +\strong{4. Conditional Variance Forest Parameters} +\itemize{ +\item \code{num_trees_variance} Number of trees in the (optional) conditional variance forest model. Default: \code{0}. +\item \code{variance_forest_init} Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as \code{log(pct_var_variance_forest_init*var((y-mean(y))/sd(y)))/num_trees_variance} if not set. +\item \code{pct_var_variance_forest_init} Percentage of standardized outcome variance used to initialize the (heteroskedastic) variance forest. Default: \code{1}. Superseded by \code{variance_forest_init}. +} -\item{verbose}{Whether or not to print progress during the sampling loops. Default: FALSE.} +\strong{4.1. Tree Prior Parameters} +\itemize{ +\item \code{alpha_variance} Prior probability of splitting for a tree of depth 0 in the (optional) conditional variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: \code{0.95}. +\item \code{beta_variance} Exponent that decreases split probabilities for nodes of depth > 0 in the (optional) conditional variance model. Tree split prior combines \code{alpha_variance} and \code{beta_variance} via \code{alpha_variance*(1+node_depth)^-beta_variance}. Default: \code{2.0}. +\item \code{min_samples_leaf_variance} Minimum allowable size of a leaf, in terms of training samples, in the (optional) conditional variance model. Default: \code{5}. +\item \code{max_depth_variance} Maximum depth of any tree in the ensemble in the (optional) conditional variance model. Default: \code{10}. Can be overridden with \code{-1} which does not enforce any depth limits on trees. +} -\item{a_forest}{Shape parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2 + 0.5} if not set.} +\strong{4.2. Leaf Model Parameters} +\itemize{ +\item \code{a_forest} Shape parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2 + 0.5} if not set. +\item \code{b_forest} Scale parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2} if not set.
+\item \code{keep_vars_variance} Vector of variable names or column indices denoting variables that should be included in the (optional) conditional variance forest. Default: \code{NULL}. +\item \code{drop_vars_variance} Vector of variable names or column indices denoting variables that should be excluded from the (optional) conditional variance forest. Default: \code{NULL}. If both \code{drop_vars_variance} and \code{keep_vars_variance} are set, \code{drop_vars_variance} will be ignored. +} -\item{b_forest}{Scale parameter in the \code{IG(a_forest, b_forest)} conditional error variance model (which is only sampled if \code{num_trees_variance > 0}). Calibrated internally as \code{num_trees_variance / 1.5^2} if not set.} +\strong{5. Random Effects Parameters} +\itemize{ +\item \code{rfx_prior_var} Prior on the (diagonals of the) covariance of the additive group-level random regression coefficients. Must be a vector of length \code{ncol(rfx_basis_train)}. Default: \code{rep(1, ncol(rfx_basis_train))}. +}} } \value{ List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk). diff --git a/man/preprocessBartParams.Rd b/man/preprocessBartParams.Rd new file mode 100644 index 00000000..f06ef30f --- /dev/null +++ b/man/preprocessBartParams.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{preprocessBartParams} +\alias{preprocessBartParams} +\title{Preprocess BART parameter list. Override defaults with any provided parameters.} +\usage{ +preprocessBartParams(params) +} +\arguments{ +\item{params}{Parameter list} +} +\value{ +Parameter list with defaults overridden by values supplied in \code{params} +} +\description{ +Preprocess BART parameter list. Override defaults with any provided parameters. +} diff --git a/man/preprocessBcfParams.Rd b/man/preprocessBcfParams.Rd new file mode 100644 index 00000000..fdc1d498 --- /dev/null +++ b/man/preprocessBcfParams.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{preprocessBcfParams} +\alias{preprocessBcfParams} +\title{Preprocess BCF parameter list. Override defaults with any provided parameters.} +\usage{ +preprocessBcfParams(params) +} +\arguments{ +\item{params}{Parameter list} +} +\value{ +Parameter list with defaults overridden by values supplied in \code{params} +} +\description{ +Preprocess BCF parameter list. Override defaults with any provided parameters. +}
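Likewise, a brief sketch of the new `bcf()` interface together with the newly exported `preprocessBcfParams()` helper; the training objects (`X_train`, `Z_train`, `y_train`, `pi_train`) are assumed to already exist in the session, and the `keep_vars_tau` indices are illustrative:

```r
# Inspect how defaults are filled in: preprocessBcfParams() returns the
# supplied list with every unspecified parameter set to its documented default
bcf_params <- preprocessBcfParams(list(keep_vars_tau = c(1, 2)))

# Assumes X_train, Z_train, y_train, pi_train are defined in the session
bcf_fit <- stochtree::bcf(X_train = X_train, Z_train = Z_train, y_train = y_train,
                          pi_train = pi_train, num_gfr = 10, num_mcmc = 100,
                          params = list(keep_vars_tau = c(1, 2)))
```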
diff --git a/stochtree/bart.py b/stochtree/bart.py index 594fa9c1..5b79b9cd 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -3,13 +3,10 @@ """ import numpy as np import pandas as pd -from sklearn import linear_model -from sklearn.metrics import mean_squared_error -from scipy.stats import gamma -from .calibration import calibrate_global_error_variance +from typing import Optional, Dict, Any from .data import Dataset, Residual from .forest import ForestContainer -from .preprocessing import CovariateTransformer +from .preprocessing import CovariateTransformer, _preprocess_bart_params from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel from .utils import NotSampledError @@ -26,14 +23,7 @@ def is_sampled(self) -> bool: return self.sampled def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = None, X_test: np.array = None, basis_test: np.array = None, - cutpoint_grid_size = 100, sigma_leaf: float = None, alpha_mean: float = 0.95, beta_mean: float = 2.0, min_samples_leaf_mean: int = 5, - max_depth_mean: int = 10, alpha_variance: float = 0.95, beta_variance: float = 2.0, min_samples_leaf_variance: int = 5, - max_depth_variance: int = 10, a_global: float = 0, b_global: float = 0, a_leaf: float = 3, b_leaf: float = None, - a_forest: float = None, b_forest: float = None, sigma2_init: float = None, variance_forest_leaf_init: float = None, - pct_var_sigma2_init: float = 1, pct_var_variance_forest_init: float = 1, variance_scale: float = 1, - variable_weights_mean: np.array = None, variable_weights_variance: np.array = None, num_trees_mean: int = 200, - num_trees_variance: int = 0, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, - sample_sigma_global: bool = True, sample_sigma_leaf: bool = True, random_seed: int = -1, keep_burnin: bool = False, keep_gfr: bool = False) -> None: + num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, params: Optional[Dict[str, Any]] = None) -> None: """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set. Does not require a leaf regression basis. @@ -50,84 +40,84 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N basis_test : :obj:`np.array`, optional Optional test set basis vector used to define a regression to be run in the leaves of each tree. Must be included / omitted consistently (i.e. if basis_train is provided, then basis_test must be provided alongside X_test). - cutpoint_grid_size : :obj:`int`, optional - Maximum number of cutpoints to consider for each feature. Defaults to ``100``. - sigma_leaf : :obj:`float`, optional - Scale parameter on the (conditional mean) leaf node regression model. - alpha_mean : :obj:`float`, optional - Prior probability of splitting for a tree of depth 0 in the conditional mean model. - Tree split prior combines ``alpha_mean`` and ``beta_mean`` via ``alpha_mean*(1+node_depth)^-beta_mean``. - beta_mean : :obj:`float`, optional - Exponent that decreases split probabilities for nodes of depth > 0 in the conditional mean model. - Tree split prior combines ``alpha_mean`` and ``beta_mean`` via ``alpha_mean*(1+node_depth)^-beta_mean``. - min_samples_leaf_mean : :obj:`int`, optional - Minimum allowable size of a leaf, in terms of training samples, in the conditional mean model. Defaults to ``5``. - max_depth_mean : :obj:`int`, optional - Maximum depth of any tree in the ensemble in the conditional mean model. Defaults to ``10``.
Can be overriden with ``-1`` which does not enforce any depth limits on trees. - alpha_variance : :obj:`float`, optional - Prior probability of splitting for a tree of depth 0 in the conditional variance model. - Tree split prior combines ``alpha_variance`` and ``beta_variance`` via ``alpha_variance*(1+node_depth)^-beta_variance``. - beta_variance : :obj:`float`, optional - Exponent that decreases split probabilities for nodes of depth > 0 in the conditional variance model. - Tree split prior combines ``alpha_variance`` and ``beta_variance`` via ``alpha_variance*(1+node_depth)^-beta_variance``. - min_samples_leaf_variance : :obj:`int`, optional - Minimum allowable size of a leaf, in terms of training samples in the conditional variance model. Defaults to ``5``. - max_depth_variance : :obj:`int`, optional - Maximum depth of any tree in the ensemble in the conditional variance model. Defaults to ``10``. Can be overriden with ``-1`` which does not enforce any depth limits on trees. - a_global : :obj:`float`, optional - Shape parameter in the ``IG(a_global, b_global)`` global error variance model. Defaults to ``0``. - b_global : :obj:`float`, optional - Component of the scale parameter in the ``IG(a_global, b_global)`` global error variance prior. Defaults to ``0``. - b_leaf : :obj:`float`, optional - Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Calibrated internally as ``0.5/num_trees_mean`` if not set here. - a_leaf : :obj:`float`, optional - Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Defaults to ``3``. - b_leaf : :obj:`float`, optional - Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Calibrated internally as ``0.5/num_trees_mean`` if not set here. - a_forest : :obj:`float`, optional - Shape parameter in the [optional] ``IG(a_forest, b_forest)`` conditional error variance forest (which is only sampled if ``num_trees_variance > 0``). Calibrated internally as ``num_trees_variance / 1.5^2 + 0.5`` if not set here. - b_forest : :obj:`float`, optional - Scale parameter in the [optional] ``IG(a_forest, b_forest)`` conditional error variance forest (which is only sampled if ``num_trees_variance > 0``). Calibrated internally as ``num_trees_variance / 1.5^2`` if not set here. - sigma2_init : :obj:`float`, optional - Starting value of global variance parameter. Set internally as a percentage of the standardized outcome variance if not set here. - variance_forest_leaf_init : :obj:`float`, optional - Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as ``np.log(pct_var_variance_forest_init*np.var((y-np.mean(y))/np.std(y)))/num_trees_variance`` if not set. - pct_var_sigma2_init : :obj:`float`, optional - Percentage of standardized outcome variance used to initialize global error variance parameter. Superseded by ``sigma2``. Defaults to ``1``. - pct_var_variance_forest_init : :obj:`float`, optional - Percentage of standardized outcome variance used to initialize global error variance parameter. Default: ``1``. Superseded by ``variance_forest_init``. - variance_scale : :obj:`float`, optional - Variance after the data have been scaled. Default: ``1``. - variable_weights_mean : :obj:`np.array`, optional - Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. 
- variable_weights_forest : :obj:`np.array`, optional - Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. - num_trees_mean : :obj:`int`, optional - Number of trees in the ensemble for the conditional mean model. Defaults to ``200``. If ``num_trees_mean = 0``, the conditional mean will not be modeled using a forest and the function will only proceed if ``num_trees_variance > 0``. - num_trees_variance : :obj:`int`, optional - Number of trees in the ensemble for the conditional variance model. Defaults to ``0``. Variance is only modeled using a tree / forest if `num_trees_variance > 0`. num_gfr : :obj:`int`, optional Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to ``5``. num_burnin : :obj:`int`, optional Number of "burn-in" iterations of the MCMC sampler. Defaults to ``0``. Ignored if ``num_gfr > 0``. num_mcmc : :obj:`int`, optional Number of "retained" iterations of the MCMC sampler. Defaults to ``100``. If this is set to 0, GFR (XBART) samples will be retained. - sample_sigma_global : :obj:`bool`, optional - Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(a_global, b_global)``. Defaults to ``True``. - sample_sigma_leaf : :obj:`bool`, optional - Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)``. Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to ``False``. - random_seed : :obj:`int`, optional - Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. - keep_burnin : :obj:`bool`, optional - Whether or not "burnin" samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. - keep_gfr : :obj:`bool`, optional - Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. - + params : :obj:`dict`, optional + Dictionary of model parameters, each of which has a default value. + + * ``cutpoint_grid_size`` (``int``): Maximum number of cutpoints to consider for each feature. Defaults to ``100``. + * ``sigma_leaf`` (``float``): Scale parameter on the (conditional mean) leaf node regression model. + * ``alpha_mean`` (``float``): Prior probability of splitting for a tree of depth 0 in the conditional mean model. Tree split prior combines ``alpha_mean`` and ``beta_mean`` via ``alpha_mean*(1+node_depth)^-beta_mean``. + * ``beta_mean`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 in the conditional mean model. Tree split prior combines ``alpha_mean`` and ``beta_mean`` via ``alpha_mean*(1+node_depth)^-beta_mean``. + * ``min_samples_leaf_mean`` (``int``): Minimum allowable size of a leaf, in terms of training samples, in the conditional mean model. Defaults to ``5``. + * ``max_depth_mean`` (``int``): Maximum depth of any tree in the ensemble in the conditional mean model. Defaults to ``10``. Can be overridden with ``-1`` which does not enforce any depth limits on trees. + * ``alpha_variance`` (``float``): Prior probability of splitting for a tree of depth 0 in the conditional variance model. Tree split prior combines ``alpha_variance`` and ``beta_variance`` via ``alpha_variance*(1+node_depth)^-beta_variance``.
+ * ``beta_variance`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 in the conditional variance model. Tree split prior combines ``alpha_variance`` and ``beta_variance`` via ``alpha_variance*(1+node_depth)^-beta_variance``. + * ``min_samples_leaf_variance`` (``int``): Minimum allowable size of a leaf, in terms of training samples, in the conditional variance model. Defaults to ``5``. + * ``max_depth_variance`` (``int``): Maximum depth of any tree in the ensemble in the conditional variance model. Defaults to ``10``. Can be overridden with ``-1`` which does not enforce any depth limits on trees. + * ``a_global`` (``float``): Shape parameter in the ``IG(a_global, b_global)`` global error variance model. Defaults to ``0``. + * ``b_global`` (``float``): Scale parameter in the ``IG(a_global, b_global)`` global error variance prior. Defaults to ``0``. + * ``a_leaf`` (``float``): Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Defaults to ``3``. + * ``b_leaf`` (``float``): Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Calibrated internally as ``0.5/num_trees_mean`` if not set here. + * ``a_forest`` (``float``): Shape parameter in the [optional] ``IG(a_forest, b_forest)`` conditional error variance forest (which is only sampled if ``num_trees_variance > 0``). Calibrated internally as ``num_trees_variance / 1.5^2 + 0.5`` if not set here. + * ``b_forest`` (``float``): Scale parameter in the [optional] ``IG(a_forest, b_forest)`` conditional error variance forest (which is only sampled if ``num_trees_variance > 0``). Calibrated internally as ``num_trees_variance / 1.5^2`` if not set here. + * ``sigma2_init`` (``float``): Starting value of global variance parameter. Set internally as a percentage of the standardized outcome variance if not set here. + * ``variance_forest_leaf_init`` (``float``): Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as ``np.log(pct_var_variance_forest_init*np.var((y-np.mean(y))/np.std(y)))/num_trees_variance`` if not set. + * ``pct_var_sigma2_init`` (``float``): Percentage of standardized outcome variance used to initialize global error variance parameter. Superseded by ``sigma2_init``. Defaults to ``1``. + * ``pct_var_variance_forest_init`` (``float``): Percentage of standardized outcome variance used to initialize the (heteroskedastic) variance forest. Default: ``1``. Superseded by ``variance_forest_leaf_init``. + * ``variance_scale`` (``float``): Variance after the data have been scaled. Default: ``1``. + * ``variable_weights_mean`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. + * ``variable_weights_variance`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. + * ``num_trees_mean`` (``int``): Number of trees in the ensemble for the conditional mean model. Defaults to ``200``. If ``num_trees_mean = 0``, the conditional mean will not be modeled using a forest and the function will only proceed if ``num_trees_variance > 0``. + * ``num_trees_variance`` (``int``): Number of trees in the ensemble for the conditional variance model. Defaults to ``0``.
Variance is only modeled using a tree / forest if ``num_trees_variance > 0``. + * ``sample_sigma_global`` (``bool``): Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(a_global, b_global)``. Defaults to ``True``. + * ``sample_sigma_leaf`` (``bool``): Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)``. Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to ``False``. + * ``random_seed`` (``int``): Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. + * ``keep_burnin`` (``bool``): Whether or not "burnin" samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + * ``keep_gfr`` (``bool``): Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + Returns ------- self : BARTModel Sampled BART Model. """ + # Unpack parameters + bart_params = _preprocess_bart_params(params) + cutpoint_grid_size = bart_params['cutpoint_grid_size'] + sigma_leaf = bart_params['sigma_leaf'] + alpha_mean = bart_params['alpha_mean'] + beta_mean = bart_params['beta_mean'] + min_samples_leaf_mean = bart_params['min_samples_leaf_mean'] + max_depth_mean = bart_params['max_depth_mean'] + alpha_variance = bart_params['alpha_variance'] + beta_variance = bart_params['beta_variance'] + min_samples_leaf_variance = bart_params['min_samples_leaf_variance'] + max_depth_variance = bart_params['max_depth_variance'] + a_global = bart_params['a_global'] + b_global = bart_params['b_global'] + a_leaf = bart_params['a_leaf'] + b_leaf = bart_params['b_leaf'] + a_forest = bart_params['a_forest'] + b_forest = bart_params['b_forest'] + sigma2_init = bart_params['sigma2_init'] + variance_forest_leaf_init = bart_params['variance_forest_leaf_init'] + pct_var_sigma2_init = bart_params['pct_var_sigma2_init'] + pct_var_variance_forest_init = bart_params['pct_var_variance_forest_init'] + variance_scale = bart_params['variance_scale'] + variable_weights_mean = bart_params['variable_weights_mean'] + variable_weights_variance = bart_params['variable_weights_variance'] + num_trees_mean = bart_params['num_trees_mean'] + num_trees_variance = bart_params['num_trees_variance'] + sample_sigma_global = bart_params['sample_sigma_global'] + sample_sigma_leaf = bart_params['sample_sigma_leaf'] + random_seed = bart_params['random_seed'] + keep_burnin = bart_params['keep_burnin'] + keep_gfr = bart_params['keep_gfr'] + # Determine which models (conditional mean, conditional variance, or both) we will fit self.include_mean_forest = True if num_trees_mean > 0 else False self.include_variance_forest = True if num_trees_variance > 0 else False diff --git a/stochtree/bcf.py b/stochtree/bcf.py index f87a0db7..1a26dbc7 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -3,17 +3,12 @@ """ import numpy as np import pandas as pd -from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor -from sklearn.model_selection import GridSearchCV, KFold from sklearn.utils import check_scalar -from typing import Optional, Union -from scipy.linalg import lstsq -from scipy.stats import gamma +from typing import Optional, Union, Dict, Any from .bart import BARTModel -from .calibration import calibrate_global_error_variance from .data import Dataset, Residual from .forest import ForestContainer -from 
.preprocessing import CovariateTransformer +from .preprocessing import CovariateTransformer, _preprocess_bcf_params from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel from .utils import NotSampledError @@ -31,18 +26,7 @@ def is_sampled(self) -> bool: def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_train: np.array, pi_train: np.array = None, X_test: Union[pd.DataFrame, np.array] = None, Z_test: np.array = None, pi_test: np.array = None, - cutpoint_grid_size = 100, sigma_leaf_mu: float = None, sigma_leaf_tau: float = None, - alpha_mu: float = 0.95, alpha_tau: float = 0.25, beta_mu: float = 2.0, beta_tau: float = 3.0, - min_samples_leaf_mu: int = 5, min_samples_leaf_tau: int = 5, max_depth_mu: int = 10, max_depth_tau: int = 5, - a_global: float = 0, b_global: float = 0, a_leaf_mu: float = 3, a_leaf_tau: float = 3, - b_leaf_mu: float = None, b_leaf_tau: float = None, q: float = 0.9, sigma2: float = None, - pct_var_sigma2_init: float = 0.25, variable_weights: np.array = None, - keep_vars_mu: Union[list, np.array] = None, drop_vars_mu: Union[list, np.array] = None, - keep_vars_tau: Union[list, np.array] = None, drop_vars_tau: Union[list, np.array] = None, - num_trees_mu: int = 200, num_trees_tau: int = 50, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, - sample_sigma_global: bool = True, sample_sigma_leaf_mu: bool = True, sample_sigma_leaf_tau: bool = False, - propensity_covariate: str = "mu", adaptive_coding: bool = True, b_0: float = -0.5, b_1: float = 0.5, - random_seed: int = -1, keep_burnin: bool = False, keep_gfr: bool = False) -> None: + num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, params: Optional[Dict[str, Any]] = None) -> None: """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions will be cached for the training set and (if provided) the test set. @@ -63,105 +47,109 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr Must be provided if ``X_test`` is provided. pi_test : :obj:`np.array`, optional Optional test set vector of propensity scores. If not provided (but ``X_test`` and ``Z_test`` are), this will be estimated from the data. - cutpoint_grid_size : :obj:`int`, optional - Maximum number of cutpoints to consider for each feature. Defaults to ``100``. - sigma_leaf_mu : :obj:`float`, optional - Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as ``2/num_trees_mu`` if not set here. - sigma_leaf_tau : :obj:`float` or :obj:`np.array`, optional - Starting value of leaf node scale parameter for the treatment effect forest. - When treatment (``Z_train``) is multivariate, this can be either a ``float`` or a square 2-dimensional ``np.array`` - with ``sigma_leaf_tau.shape[0] == Z_train.shape[1]`` and ``sigma_leaf_tau.shape[1] == Z_train.shape[1]``. - If ``sigma_leaf_tau`` is provided as a float for multivariate treatment, the leaf scale term will be set as a - diagonal matrix with ``sigma_leaf_tau`` on every diagonal. If not passed as an argument, this parameter is - calibrated internally as ``1/num_trees_tau`` (and propagated to a diagonal matrix if necessary). - alpha_mu : :obj:`float`, optional - Prior probability of splitting for a tree of depth 0 for the prognostic forest. - Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. 
- alpha_tau : :obj:`float`, optional - Prior probability of splitting for a tree of depth 0 for the treatment effect forest. - Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. - beta_mu : :obj:`float`, optional - Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. - Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. - beta_tau : :obj:`float`, optional - Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. - Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. - min_samples_leaf_mu : :obj:`int`, optional - Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Defaults to ``5``. - min_samples_leaf_tau : :obj:`int`, optional - Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Defaults to ``5``. - max_depth_mu : :obj:`int`, optional - Maximum depth of any tree in the mu ensemble. Defaults to ``10``. Can be overriden with ``-1`` which does not enforce any depth limits on trees. - max_depth_tau : :obj:`int`, optional - Maximum depth of any tree in the tau ensemble. Defaults to ``5``. Can be overriden with ``-1`` which does not enforce any depth limits on trees. - a_global : :obj:`float`, optional - Shape parameter in the ``IG(a_global, b_global)`` global error variance model. Defaults to ``0``. - b_global : :obj:`float`, optional - Component of the scale parameter in the ``IG(a_global, b_global)`` global error variance prior. Defaults to ``0``. - a_leaf_mu : :obj:`float`, optional - Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the prognostic forest. Defaults to ``3``. - a_leaf_tau : :obj:`float`, optional - Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the treatment effect forest. Defaults to ``3``. - b_leaf_mu : :obj:`float`, optional - Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the prognostic forest. Calibrated internally as ``0.5/num_trees`` if not set here. - b_leaf_tau : :obj:`float`, optional - Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the treatment effect forest. Calibrated internally as ``0.5/num_trees`` if not set here. - q : :obj:`float`, optional - Quantile used to calibrated ``lamb`` as in Sparapani et al (2021). Defaults to ``0.9``. - sigma2 : :obj:`float`, optional - Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. - pct_var_sigma2_init : :obj:`float`, optional - Percentage of standardized outcome variance used to initialize global error variance parameter. Superseded by ``sigma2``. Defaults to ``0.25``. - variable_weights : :obj:`np.array`, optional - Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to ``np.repeat(1/X_train.shape[1], X_train.shape[1])`` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to ``1/X_train.shape[1]``. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in ``X_train`` and then set ``propensity_covariate`` to ``'none'`` and adjust ``keep_vars_mu`` and ``keep_vars_tau`` accordingly. 
- keep_vars_mu : obj:`list` or :obj:`np.array`, optional - Vector of variable names or column indices denoting variables that should be included in the prognostic (``mu(X)``) forest. Defaults to ``None``. - drop_vars_mu : obj:`list` or :obj:`np.array`, optional - Vector of variable names or column indices denoting variables that should be excluded from the prognostic (``mu(X)``) forest. Defaults to ``None``. If both ``drop_vars_mu`` and ``keep_vars_mu`` are set, ``drop_vars_mu`` will be ignored. - keep_vars_tau : obj:`list` or :obj:`np.array`, optional - Vector of variable names or column indices denoting variables that should be included in the treatment effect (``tau(X)``) forest. Defaults to ``None``. - drop_vars_tau : obj:`list` or :obj:`np.array`, optional - Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (``tau(X)``) forest. Defaults to ``None``. If both ``drop_vars_tau`` and ``keep_vars_tau`` are set, ``drop_vars_tau`` will be ignored. - num_trees_mu : :obj:`int`, optional - Number of trees in the prognostic forest. Defaults to ``200``. - num_trees_tau : :obj:`int`, optional - Number of trees in the treatment effect forest. Defaults to ``50``. num_gfr : :obj:`int`, optional Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to ``5``. num_burnin : :obj:`int`, optional Number of "burn-in" iterations of the MCMC sampler. Defaults to ``0``. Ignored if ``num_gfr > 0``. num_mcmc : :obj:`int`, optional Number of "retained" iterations of the MCMC sampler. Defaults to ``100``. If this is set to 0, GFR (XBART) samples will be retained. - sample_sigma_global : :obj:`bool`, optional - Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(a_global, b_global)``. Defaults to ``True``. - sample_sigma_leaf_mu : :obj:`bool`, optional - Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)`` for the prognostic forest. - Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to ``True``. - sample_sigma_leaf_tau : :obj:`bool`, optional - Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)`` for the treatment effect forest. - Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to ``True``. - propensity_covariate : :obj:`str`, optional - Whether to include the propensity score as a covariate in either or both of the forests. Enter ``"none"`` for neither, ``"mu"`` for the prognostic forest, ``"tau"`` for the treatment forest, and ``"both"`` for both forests. - If this is not ``"none"`` and a propensity score is not provided, it will be estimated from (``X_train``, ``Z_train``) using ``BARTModel``. Defaults to ``"mu"``. - adaptive_coding : :obj:`bool`, optional - Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via - parameters ``b_0`` and ``b_1`` that attach to the outcome model ``[b_0 (1-Z) + b_1 Z] tau(X)``. This is ignored when Z is not binary. Defaults to True. - b_0 : :obj:`float`, optional - Initial value of the "control" group coding parameter. This is ignored when ``Z`` is not binary. Default: ``-0.5``. - b_1 : :obj:`float`, optional - Initial value of the "treated" group coding parameter. This is ignored when ``Z`` is not binary. Default: ``0.5``. 
- random_seed : :obj:`int`, optional - Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. - keep_burnin : :obj:`bool`, optional - Whether or not "burnin" samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. - keep_gfr : :obj:`bool`, optional - Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + params : :obj:`dict`, optional + Dictionary of model parameters, each of which has a default value. + + * ``cutpoint_grid_size`` (``int``): Maximum number of cutpoints to consider for each feature. Defaults to ``100``. + * ``sigma_leaf_mu`` (``float``): Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as ``2/num_trees_mu`` if not set here. + * ``sigma_leaf_tau`` (``float`` or ``np.array``): Starting value of leaf node scale parameter for the treatment effect forest. + When treatment (``Z_train``) is multivariate, this can be either a ``float`` or a square 2-dimensional ``np.array`` + with ``sigma_leaf_tau.shape[0] == Z_train.shape[1]`` and ``sigma_leaf_tau.shape[1] == Z_train.shape[1]``. + If ``sigma_leaf_tau`` is provided as a float for multivariate treatment, the leaf scale term will be set as a + diagonal matrix with ``sigma_leaf_tau`` along the diagonal. If not passed as an argument, this parameter is + calibrated internally as ``1/num_trees_tau`` (and propagated to a diagonal matrix if necessary). + * ``alpha_mu`` (``float``): Prior probability of splitting for a tree of depth 0 for the prognostic forest. + Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + * ``alpha_tau`` (``float``): Prior probability of splitting for a tree of depth 0 for the treatment effect forest. + Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + * ``beta_mu`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. + Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + * ``beta_tau`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. + Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + * ``min_samples_leaf_mu`` (``int``): Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Defaults to ``5``. + * ``min_samples_leaf_tau`` (``int``): Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Defaults to ``5``. + * ``max_depth_mu`` (``int``): Maximum depth of any tree in the mu ensemble. Defaults to ``10``. Can be overridden with ``-1`` which does not enforce any depth limits on trees. + * ``max_depth_tau`` (``int``): Maximum depth of any tree in the tau ensemble. Defaults to ``5``. Can be overridden with ``-1`` which does not enforce any depth limits on trees. + * ``a_global`` (``float``): Shape parameter in the ``IG(a_global, b_global)`` global error variance model. Defaults to ``0``. + * ``b_global`` (``float``): Scale parameter in the ``IG(a_global, b_global)`` global error variance model. Defaults to ``0``. + * ``a_leaf_mu`` (``float``): Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the prognostic forest. Defaults to ``3``. 
+ * ``a_leaf_tau`` (``float``): Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the treatment effect forest. Defaults to ``3``. + * ``b_leaf_mu`` (``float``): Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the prognostic forest. Calibrated internally as ``0.5/num_trees_mu`` if not set here. + * ``b_leaf_tau`` (``float``): Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the treatment effect forest. Calibrated internally as ``0.5/num_trees_tau`` if not set here. + * ``sigma2`` (``float``): Starting value of global variance parameter. Calibrated internally as ``pct_var_sigma2_init`` times the variance of the standardized outcome if not set here. + * ``pct_var_sigma2_init`` (``float``): Percentage of standardized outcome variance used to initialize global error variance parameter. Superseded by ``sigma2``. Defaults to ``0.25``. + * ``variable_weights`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to ``np.repeat(1/X_train.shape[1], X_train.shape[1])`` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to ``1/X_train.shape[1]``. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in ``X_train`` and then set ``propensity_covariate`` to ``'none'`` and adjust ``keep_vars_mu`` and ``keep_vars_tau`` accordingly. + * ``keep_vars_mu`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be included in the prognostic (``mu(X)``) forest. Defaults to ``None``. + * ``drop_vars_mu`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be excluded from the prognostic (``mu(X)``) forest. Defaults to ``None``. If both ``drop_vars_mu`` and ``keep_vars_mu`` are set, ``drop_vars_mu`` will be ignored. + * ``keep_vars_tau`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be included in the treatment effect (``tau(X)``) forest. Defaults to ``None``. + * ``drop_vars_tau`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (``tau(X)``) forest. Defaults to ``None``. If both ``drop_vars_tau`` and ``keep_vars_tau`` are set, ``drop_vars_tau`` will be ignored. + * ``num_trees_mu`` (``int``): Number of trees in the prognostic forest. Defaults to ``200``. + * ``num_trees_tau`` (``int``): Number of trees in the treatment effect forest. Defaults to ``50``. + * ``sample_sigma_global`` (``bool``): Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(a_global, b_global)``. Defaults to ``True``. + * ``sample_sigma_leaf_mu`` (``bool``): Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)`` for the prognostic forest. + Cannot (currently) be set to true if ``Z_train`` has more than one column. Defaults to ``True``. + * ``sample_sigma_leaf_tau`` (``bool``): Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)`` for the treatment effect forest. + Cannot (currently) be set to true if ``Z_train`` has more than one column. Defaults to ``False``. + * ``propensity_covariate`` (``str``): Whether to include the propensity score as a covariate in either or both of the forests. 
Enter ``"none"`` for neither, ``"mu"`` for the prognostic forest, ``"tau"`` for the treatment forest, and ``"both"`` for both forests. + If this is not ``"none"`` and a propensity score is not provided, it will be estimated from (``X_train``, ``Z_train``) using ``BARTModel``. Defaults to ``"mu"``. + * ``adaptive_coding`` (``bool``): Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via + parameters ``b_0`` and ``b_1`` that attach to the outcome model ``[b_0 (1-Z) + b_1 Z] tau(X)``. This is ignored when Z is not binary. Defaults to True. + * ``b_0`` (``float``): Initial value of the "control" group coding parameter. This is ignored when ``Z`` is not binary. Default: ``-0.5``. + * ``b_1`` (``float``): Initial value of the "treated" group coding parameter. This is ignored when ``Z`` is not binary. Default: ``0.5``. + * ``random_seed`` (``int``): Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. + * ``keep_burnin`` (``bool``): Whether or not "burnin" samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + * ``keep_gfr`` (``bool``): Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. Returns ------- self : BCFModel Sampled BCF Model. """ + # Unpack parameters + bcf_params = _preprocess_bcf_params(params) + cutpoint_grid_size = bcf_params['cutpoint_grid_size'] + sigma_leaf_mu = bcf_params['sigma_leaf_mu'] + sigma_leaf_tau = bcf_params['sigma_leaf_tau'] + alpha_mu = bcf_params['alpha_mu'] + alpha_tau = bcf_params['alpha_tau'] + beta_mu = bcf_params['beta_mu'] + beta_tau = bcf_params['beta_tau'] + min_samples_leaf_mu = bcf_params['min_samples_leaf_mu'] + min_samples_leaf_tau = bcf_params['min_samples_leaf_tau'] + max_depth_mu = bcf_params['max_depth_mu'] + max_depth_tau = bcf_params['max_depth_tau'] + a_global = bcf_params['a_global'] + b_global = bcf_params['b_global'] + a_leaf_mu = bcf_params['a_leaf_mu'] + a_leaf_tau = bcf_params['a_leaf_tau'] + b_leaf_mu = bcf_params['b_leaf_mu'] + b_leaf_tau = bcf_params['b_leaf_tau'] + sigma2 = bcf_params['sigma2'] + pct_var_sigma2_init = bcf_params['pct_var_sigma2_init'] + variable_weights = bcf_params['variable_weights'] + keep_vars_mu = bcf_params['keep_vars_mu'] + drop_vars_mu = bcf_params['drop_vars_mu'] + keep_vars_tau = bcf_params['keep_vars_tau'] + drop_vars_tau = bcf_params['drop_vars_tau'] + num_trees_mu = bcf_params['num_trees_mu'] + num_trees_tau = bcf_params['num_trees_tau'] + sample_sigma_global = bcf_params['sample_sigma_global'] + sample_sigma_leaf_mu = bcf_params['sample_sigma_leaf_mu'] + sample_sigma_leaf_tau = bcf_params['sample_sigma_leaf_tau'] + propensity_covariate = bcf_params['propensity_covariate'] + adaptive_coding = bcf_params['adaptive_coding'] + b_0 = bcf_params['b_0'] + b_1 = bcf_params['b_1'] + random_seed = bcf_params['random_seed'] + keep_burnin = bcf_params['keep_burnin'] + keep_gfr = bcf_params['keep_gfr'] + # Variable weight preprocessing (and initialization if necessary) if variable_weights is None: if X_train.ndim > 1: @@ -315,9 +303,6 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if b_leaf_tau is not None: b_leaf_tau = check_scalar(x=b_leaf_tau, name="b_leaf_tau", target_type=(float,int), min_val=0, max_val=None, include_boundaries="left") - if q is not None: - q = 
check_scalar(x=q, name="q", target_type=float, - min_val=0, max_val=1, include_boundaries="neither") if sigma2 is not None: sigma2 = check_scalar(x=sigma2, name="sigma2", target_type=(float,int), min_val=0, max_val=None, include_boundaries="neither") @@ -821,6 +806,19 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr treatment_term_test = Z_test*np.squeeze(self.tau_hat_test) self.y_hat_test = self.mu_hat_test + treatment_term_test + if self.sample_sigma_global: + self.global_var_samples = self.global_var_samples[self.keep_indices] + + if self.sample_sigma_leaf_mu: + self.leaf_scale_mu_samples = self.leaf_scale_mu_samples[self.keep_indices] + + if self.sample_sigma_leaf_tau: + self.leaf_scale_tau_samples = self.leaf_scale_tau_samples[self.keep_indices] + + if self.adaptive_coding: + self.b0_samples = self.b0_samples[self.keep_indices] + self.b1_samples = self.b1_samples[self.keep_indices] + def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.array: """Predict CATE function for every provided observation. @@ -882,16 +880,10 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> forest_dataset_tau.add_basis(Z) # Estimate treatment effect - tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau.dataset_cpp) - tau_raw = tau_raw*self.y_std - if self.adaptive_coding: - tau_raw = tau_raw*np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) - tau_x = tau_raw[:,self.keep_indices] - tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau.dataset_cpp) tau_raw = tau_raw[:,self.keep_indices,:] if self.adaptive_coding: - adaptive_coding_weights = np.expand_dims(self.b1_samples[self.keep_indices] - self.b0_samples[self.keep_indices], axis=(0,2)) + adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) tau_raw = tau_raw*adaptive_coding_weights tau_x = np.squeeze(tau_raw*self.y_std) @@ -979,7 +971,7 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.a tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau.dataset_cpp) tau_raw = tau_raw[:,self.keep_indices,:] if self.adaptive_coding: - adaptive_coding_weights = np.expand_dims(self.b1_samples[self.keep_indices] - self.b0_samples[self.keep_indices], axis=(0,2)) + adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) tau_raw = tau_raw*adaptive_coding_weights tau_x = np.squeeze(tau_raw*self.y_std) if Z.shape[1] > 1: diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 807befb3..a9a1b8d9 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -3,13 +3,105 @@ Copyright (c) 2007-2024 The scikit-learn developers. 
""" -from typing import Union, Optional, Any +from typing import Union, Optional, Any, Dict from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.utils.validation import check_array, column_or_1d import numpy as np import pandas as pd import warnings +def _preprocess_bart_params(params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + processed_params = { + 'cutpoint_grid_size' : 100, + 'sigma_leaf' : None, + 'alpha_mean' : 0.95, + 'beta_mean' : 2.0, + 'min_samples_leaf_mean' : 5, + 'max_depth_mean' : 10, + 'alpha_variance' : 0.95, + 'beta_variance' : 2.0, + 'min_samples_leaf_variance' : 5, + 'max_depth_variance' : 10, + 'a_global' : 0, + 'b_global' : 0, + 'a_leaf' : 3, + 'b_leaf' : None, + 'a_forest' : None, + 'b_forest' : None, + 'sigma2_init' : None, + 'variance_forest_leaf_init' : None, + 'pct_var_sigma2_init' : 1, + 'pct_var_variance_forest_init' : 1, + 'variance_scale' : 1, + 'variable_weights_mean' : None, + 'variable_weights_variance' : None, + 'num_trees_mean' : 200, + 'num_trees_variance' : 0, + 'sample_sigma_global' : True, + 'sample_sigma_leaf' : True, + 'random_seed' : -1, + 'keep_burnin' : False, + 'keep_gfr' : False + } + + if params: + for key, value in params.items(): + if key not in processed_params: + raise ValueError(f'Parameter {key} not a valid BART parameter') + processed_params[key] = value + + return processed_params + + +def _preprocess_bcf_params(params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + processed_params = { + 'cutpoint_grid_size': 100, + 'sigma_leaf_mu': None, + 'sigma_leaf_tau': None, + 'alpha_mu': 0.95, + 'alpha_tau': 0.25, + 'beta_mu': 2.0, + 'beta_tau': 3.0, + 'min_samples_leaf_mu': 5, + 'min_samples_leaf_tau': 5, + 'max_depth_mu': 10, + 'max_depth_tau': 5, + 'a_global': 0, + 'b_global': 0, + 'a_leaf_mu': 3, + 'a_leaf_tau': 3, + 'b_leaf_mu': None, + 'b_leaf_tau': None, + 'sigma2': None, + 'pct_var_sigma2_init': 0.25, + 'variable_weights': None, + 'keep_vars_mu': None, + 'drop_vars_mu': None, + 'keep_vars_tau': None, + 'drop_vars_tau': None, + 'num_trees_mu': 200, + 'num_trees_tau': 50, + 'sample_sigma_global': True, + 'sample_sigma_leaf_mu': True, + 'sample_sigma_leaf_tau': False, + 'propensity_covariate': "mu", + 'adaptive_coding': True, + 'b_0': -0.5, + 'b_1': 0.5, + 'random_seed': -1, + 'keep_burnin': False, + 'keep_gfr': False + } + + if params: + for key, value in params.items(): + if key not in processed_params: + raise ValueError(f'Parameter {key} not a valid BCF parameter') + processed_params[key] = value + + return processed_params + + class CovariateTransformer: """Class that transforms covariates to a format that can be used to define tree splits """ diff --git a/vignettes/BayesianSupervisedLearning.Rmd b/vignettes/BayesianSupervisedLearning.Rmd index 18c03b7a..e1af506d 100644 --- a/vignettes/BayesianSupervisedLearning.Rmd +++ b/vignettes/BayesianSupervisedLearning.Rmd @@ -73,10 +73,11 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = 100) bart_model_warmstart <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, - num_trees_mean = 100, num_gfr = num_gfr, num_burnin = num_burnin, - num_mcmc = num_mcmc, sample_sigma_global = T, sample_sigma_leaf = T + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) ``` @@ -99,10 +100,11 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + 
num_mcmc +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = 100) bart_model_root <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, - num_trees_mean = 100, num_gfr = num_gfr, num_burnin = num_burnin, - num_mcmc = num_mcmc, sample_sigma_global = T, sample_sigma_leaf = T + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) ``` @@ -166,11 +168,11 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = 100) bart_model_warmstart <- stochtree::bart( - X_train = X_train, W_train = W_train, y_train = y_train, - X_test = X_test, W_test = W_test, num_trees_mean = 100, - num_gfr = num_gfr, num_burnin = num_burnin, - num_mcmc = num_mcmc, sample_sigma_global = T, sample_sigma_leaf = T + X_train = X_train, W_train = W_train, y_train = y_train, X_test = X_test, W_test = W_test, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) ``` @@ -193,11 +195,11 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = 100) bart_model_root <- stochtree::bart( - X_train = X_train, W_train = W_train, y_train = y_train, - X_test = X_test, W_test = W_test, num_trees_mean = 100, - num_gfr = num_gfr, num_burnin = num_burnin, - num_mcmc = num_mcmc, sample_sigma_global = T, sample_sigma_leaf = T + X_train = X_train, W_train = W_train, y_train = y_train, X_test = X_test, W_test = W_test, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) ``` @@ -270,13 +272,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = 100) bart_model_warmstart <- stochtree::bart( - X_train = X_train, W_train = W_train, y_train = y_train, - group_ids_train = group_ids_train, rfx_basis_train = rfx_basis_train, - X_test = X_test, W_test = W_test, group_ids_test = group_ids_test, - rfx_basis_test = rfx_basis_test, num_trees_mean = 100, num_gfr = num_gfr, - num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_global = T, sample_sigma_leaf = T + X_train = X_train, W_train = W_train, y_train = y_train, group_ids_train = group_ids_train, + rfx_basis_train = rfx_basis_train, X_test = X_test, W_test = W_test, group_ids_test = group_ids_test, + rfx_basis_test = rfx_basis_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) ``` @@ -299,13 +300,12 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = 100) bart_model_root <- stochtree::bart( - X_train = X_train, W_train = W_train, y_train = y_train, - group_ids_train = group_ids_train, rfx_basis_train = rfx_basis_train, - X_test = X_test, W_test = W_test, group_ids_test = group_ids_test, - rfx_basis_test = rfx_basis_test, num_trees_mean = 100, num_gfr = num_gfr, - num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_global = T, sample_sigma_leaf = T + X_train = X_train, W_train = W_train, y_train = y_train, group_ids_train = group_ids_train, + rfx_basis_train = rfx_basis_train, X_test = X_test, W_test = W_test, group_ids_test = group_ids_test, + rfx_basis_test = rfx_basis_test, 
num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) ``` diff --git a/vignettes/CausalInference.Rmd b/vignettes/CausalInference.Rmd index 1fd270c1..74291196 100644 --- a/vignettes/CausalInference.Rmd +++ b/vignettes/CausalInference.Rmd @@ -112,12 +112,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, - num_trees_variance = 50, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bcf_params ) ``` @@ -159,11 +159,12 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_root <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -274,11 +275,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -320,11 +322,12 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_root <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -435,11 +438,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -481,11 +485,12 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_root <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -594,11 +599,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, 
X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -640,11 +646,12 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_root <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -746,13 +753,13 @@ num_gfr <- 100 num_burnin <- 0 num_mcmc <- 500 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = T, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, group_ids_train = group_ids_train, rfx_basis_train = rfx_basis_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, group_ids_test = group_ids_test, - rfx_basis_test = rfx_basis_test, - num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = T, sample_sigma_leaf_tau = F + rfx_basis_test = rfx_basis_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bcf_params ) ``` @@ -860,11 +867,12 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_mcmc <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -918,12 +926,12 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F, keep_vars_tau = c("x1","x2")) bcf_model_mcmc <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F, - keep_vars_tau = c("x1","x2") + params = bcf_params ) ``` @@ -977,11 +985,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -1035,12 +1044,12 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F, keep_vars_tau = c("x1","x2")) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F, - keep_vars_tau = c("x1", "x2") + params = bcf_params ) ``` @@ -1160,11 +1169,12 @@ 
num_gfr <- 10 num_burnin <- 0 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_warmstart <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` @@ -1206,11 +1216,12 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 1000 num_samples <- num_gfr + num_burnin + num_mcmc +bcf_params <- list(sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F) bcf_model_root <- bcf( X_train = X_train, Z_train = Z_train, y_train = y_train, pi_train = pi_train, X_test = X_test, Z_test = Z_test, pi_test = pi_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - sample_sigma_leaf_mu = F, sample_sigma_leaf_tau = F + params = bcf_params ) ``` diff --git a/vignettes/EnsembleKernel.Rmd b/vignettes/EnsembleKernel.Rmd index cce00faf..04fab08c 100644 --- a/vignettes/EnsembleKernel.Rmd +++ b/vignettes/EnsembleKernel.Rmd @@ -96,7 +96,8 @@ sigma_leaf <- 1/num_trees X_train <- as.data.frame(X_train) X_test <- as.data.frame(X_test) colnames(X_train) <- colnames(X_test) <- "x1" -bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, num_trees_mean=num_trees) +bart_params <- list(num_trees_mean=num_trees) +bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, params = bart_params) # Extract kernels needed for kriging result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test) @@ -167,7 +168,8 @@ num_trees <- 200 sigma_leaf <- 1/num_trees X_train <- as.data.frame(X_train) X_test <- as.data.frame(X_test) -bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, num_trees_mean=num_trees) +bart_params <- list(num_trees_mean=num_trees) +bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, params = bart_params) # Extract kernels needed for kriging result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test) diff --git a/vignettes/Heteroskedasticity.Rmd b/vignettes/Heteroskedasticity.Rmd index ed81c36a..8195edca 100644 --- a/vignettes/Heteroskedasticity.Rmd +++ b/vignettes/Heteroskedasticity.Rmd @@ -93,11 +93,12 @@ num_mcmc <- 100 num_trees <- 20 a_0 <- 1.5 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 0, num_trees_variance = num_trees, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_warmstart <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 0, num_trees_variance = num_trees, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -119,11 +120,12 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 0, num_trees_variance = 50, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_mcmc <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 0, num_trees_variance = 50, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -200,14 +202,14 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 0, 
num_trees_variance = 50, + alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, + alpha_variance = 0.95, beta_variance = 1.25, min_samples_leaf_variance = 1, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_warmstart <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 0, num_trees_variance = 50, - alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, - alpha_variance = 0.95, beta_variance = 1.25, - min_samples_leaf_variance = 1, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -229,14 +231,14 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 0, num_trees_variance = 50, + alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, + alpha_variance = 0.95, beta_variance = 1.25, min_samples_leaf_variance = 5, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_mcmc <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 0, num_trees_variance = 50, - alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, - alpha_variance = 0.95, beta_variance = 1.25, - min_samples_leaf_variance = 5, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -324,14 +326,14 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 50, num_trees_variance = 50, + alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, + alpha_variance = 0.95, beta_variance = 1.25, min_samples_leaf_variance = 5, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_warmstart <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 50, num_trees_variance = 50, - alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, - alpha_variance = 0.95, beta_variance = 1.25, - min_samples_leaf_variance = 5, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -356,14 +358,14 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 50, num_trees_variance = 50, + alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, + alpha_variance = 0.95, beta_variance = 1.25, min_samples_leaf_variance = 5, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_mcmc <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 50, num_trees_variance = 50, - alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, - alpha_variance = 0.95, beta_variance = 1.25, - min_samples_leaf_variance = 5, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -455,14 +457,14 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 50, num_trees_variance = 50, + alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, + alpha_variance = 0.95, beta_variance = 1.25, min_samples_leaf_variance = 5, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_warmstart <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = 
num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 50, num_trees_variance = 50, - alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, - alpha_variance = 0.95, beta_variance = 1.25, - min_samples_leaf_variance = 5, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` @@ -487,14 +489,14 @@ num_gfr <- 0 num_burnin <- 1000 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc +bart_params <- list(num_trees_mean = 50, num_trees_variance = 50, + alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, + alpha_variance = 0.95, beta_variance = 1.25, min_samples_leaf_variance = 5, + sample_sigma_global = F, sample_sigma_leaf = F) bart_model_mcmc <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, - num_trees_mean = 50, num_trees_variance = 50, - alpha_mean = 0.95, beta_mean = 2, min_samples_leaf_mean = 5, - alpha_variance = 0.95, beta_variance = 1.25, - min_samples_leaf_variance = 5, - sample_sigma_global = F, sample_sigma_leaf = F + params = bart_params ) ``` diff --git a/vignettes/MultiChain.Rmd b/vignettes/MultiChain.Rmd index 2e679b6b..bc5adc2e 100644 --- a/vignettes/MultiChain.Rmd +++ b/vignettes/MultiChain.Rmd @@ -97,13 +97,12 @@ Run the sampler, storing the resulting BART objects in a list ```{r} bart_models <- list() +bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, num_trees_mean = num_trees) for (i in 1:num_chains) { bart_models[[i]] <- stochtree::bart( - X_train = X_train, W_train = W_train, y_train = y_train, - X_test = X_test, W_test = W_test, num_trees_mean = num_trees, - num_gfr = num_gfr, num_burnin = num_burnin, - num_mcmc = num_mcmc, sample_sigma_global = T, - sample_sigma_leaf = T + X_train = X_train, W_train = W_train, y_train = y_train, X_test = X_test, + W_test = W_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + params = bart_params ) } ``` @@ -178,12 +177,11 @@ storing the resulting BART JSON strings in a list. 
```{r} bart_model_outputs <- foreach (i = 1:num_chains) %dopar% { random_seed <- i + bart_params <- list(sample_sigma_global = T, sample_sigma_leaf = T, + num_trees_mean = num_trees, random_seed = random_seed) bart_model <- stochtree::bart( - X_train = X_train, W_train = W_train, y_train = y_train, - X_test = X_test, W_test = W_test, num_trees_mean = num_trees, - num_gfr = num_gfr, num_burnin = num_burnin, - num_mcmc = num_mcmc, sample_sigma_global = T, - sample_sigma_leaf = T, random_seed = random_seed + X_train = X_train, W_train = W_train, y_train = y_train, X_test = X_test, W_test = W_test, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, params = bart_params ) bart_model_string <- stochtree::saveBARTModelToJsonString(bart_model) y_hat_test <- bart_model$y_hat_test diff --git a/vignettes/PriorCalibration.Rmd b/vignettes/PriorCalibration.Rmd index 9cf56ba5..604ddab6 100644 --- a/vignettes/PriorCalibration.Rmd +++ b/vignettes/PriorCalibration.Rmd @@ -56,7 +56,7 @@ This is done in `stochtree` via the `calibrate_inverse_gamma_error_variance` fun library(stochtree) # Generate data -n <- 1000 +n <- 500 p <- 5 X <- matrix(runif(n*p), ncol = p) f_XW <- ( @@ -87,9 +87,10 @@ lambda <- calibrate_inverse_gamma_error_variance(y_train, X_train, nu = nu) Now we run a BART model with this variance parameterization ```{r} +bart_params <- list(a_global = nu/2, b_global = (nu*lambda)/2) bart_model <- bart(X_train = X_train, y_train = y_train, X_test = X_test, - a_global = nu, b_global = nu*lambda, num_gfr = 0, - num_burnin = 500, num_mcmc = 100) + num_gfr = 0, num_burnin = 1000, num_mcmc = 100, + params = bart_params) ``` Inspect the out-of-sample predictions of the model @@ -106,5 +107,4 @@ plot(bart_model$sigma2_global_samples, ylab = "sigma^2", xlab = "iteration") abline(h = noise_sd^2, col = "red", lty = 3, lwd = 3) ``` - # References
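
As a quick way to exercise the new interface end to end, here is a minimal usage sketch of the Python side of this change. It is a sketch, not part of the diff: the simulated data and seed are invented, and the `from stochtree.bcf import BCFModel` import path is an assumption (the class and its new `sample` signature appear in `stochtree/bcf.py` above, but the package-level exports are not shown here).

```python
# Minimal sketch of the params-dict interface introduced in this diff.
# Assumptions: BCFModel is importable from stochtree.bcf; the data below
# are purely illustrative.
import numpy as np

from stochtree.bcf import BCFModel

rng = np.random.default_rng(101)
n, p = 500, 5
X_train = rng.uniform(size=(n, p))
pi_train = 0.25 + 0.5 * X_train[:, 0]              # true propensity score
Z_train = rng.binomial(1, pi_train).astype(float)  # binary treatment
y_train = X_train[:, 0] + 0.5 * Z_train + rng.normal(size=n)

# Any subset of keys may be supplied; _preprocess_bcf_params fills in the
# remaining defaults and raises ValueError for unrecognized keys.
bcf_params = {
    "num_trees_mu": 100,
    "sample_sigma_leaf_mu": False,
    "sample_sigma_leaf_tau": False,
}

model = BCFModel()
model.sample(
    X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train,
    num_gfr=10, num_burnin=0, num_mcmc=100, params=bcf_params,
)
```

Because the params dict is validated, a misspelled or BART-only key (for example `num_trees_mean`) now fails fast with a `ValueError` rather than being silently ignored.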