From ef7b83cf7763ec3f2aa694deec510f74ebb9db18 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 8 Jun 2025 14:36:17 -0600 Subject: [PATCH 01/12] Added support for weighted sampling without replacement in C++ with R wrappers and benchmark code --- NAMESPACE | 1 + R/cpp11.R | 4 ++ R/model.R | 18 +++++ include/stochtree/discrete_sampler.h | 65 +++++++++++++++++++ man/sample_without_replacement.Rd | 31 +++++++++ src/cpp11.cpp | 8 +++ src/sampler.cpp | 38 ++++++++++- src/stochtree_types.h | 1 + tools/debug/sampling_without_replacement.R | 7 ++ .../sampling_without_replacement_comparison.R | 23 +++++++ ...pling_without_replacement_microbenchmark.R | 13 ++++ 11 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 include/stochtree/discrete_sampler.h create mode 100644 man/sample_without_replacement.Rd create mode 100644 tools/debug/sampling_without_replacement.R create mode 100644 tools/perf/sampling_without_replacement_comparison.R create mode 100644 tools/perf/sampling_without_replacement_microbenchmark.R diff --git a/NAMESPACE b/NAMESPACE index a78a5123..bad6d23c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,6 +57,7 @@ export(rootResetRandomEffectsModel) export(rootResetRandomEffectsTracker) export(sampleGlobalErrorVarianceOneIteration) export(sampleLeafVarianceOneIteration) +export(sample_without_replacement) export(saveBARTModelToJson) export(saveBARTModelToJsonFile) export(saveBARTModelToJsonString) diff --git a/R/cpp11.R b/R/cpp11.R index 2260cad6..c6f147ba 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -640,6 +640,10 @@ forest_tracker_cpp <- function(data, feature_types, num_trees, n) { .Call(`_stochtree_forest_tracker_cpp`, data, feature_types, num_trees, n) } +sample_without_replacement_integer_cpp <- function(population_vector, sampling_probs, sample_size) { + .Call(`_stochtree_sample_without_replacement_integer_cpp`, population_vector, sampling_probs, sample_size) +} + init_json_cpp <- function() { .Call(`_stochtree_init_json_cpp`) } diff --git a/R/model.R b/R/model.R index dfa88fc9..9519591c 100644 --- a/R/model.R +++ b/R/model.R @@ -271,3 +271,21 @@ createForestModel <- function(forest_dataset, forest_model_config, global_model_ ))) } + +#' Draw `sample_size` samples from `population_vector` without replacement, weighted by `sampling_probabilities` +#' +#' @param population_vector Vector from which to draw samples. +#' @param sampling_probabilities Vector of probabilities of drawing each element of `population_vector`. +#' @param sample_size Number of samples to draw from `population_vector`. Must be less than or equal to `length(population_vector)` +#' +#' @returns Vector of size `sample_size` +#' @export +#' +#' @examples +#' a <- c(4,3,2,5,1,9,7) +#' p <- c(0.7,0.2,0.05,0.02,0.01,0.01,0.01) +#' num_samples <- 5 +#' sample_without_replacement(a, p, num_samples) +sample_without_replacement <- function(population_vector, sampling_probabilities, sample_size) { + return(sample_without_replacement_integer_cpp(population_vector, sampling_probabilities, sample_size)) +} diff --git a/include/stochtree/discrete_sampler.h b/include/stochtree/discrete_sampler.h new file mode 100644 index 00000000..c2a55882 --- /dev/null +++ b/include/stochtree/discrete_sampler.h @@ -0,0 +1,65 @@ +/*! + * Copyright (c) 2024 stochtree authors. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. 
+ */
+#ifndef STOCHTREE_DISCRETE_SAMPLER_H_
+#define STOCHTREE_DISCRETE_SAMPLER_H_
+#include <algorithm>
+#include <cstring>
+#include <numeric>
+#include <random>
+#include <vector>
+
+namespace StochTree {
+
+/*! \brief Sample without replacement according to a set of probability weights.
+ * This template function is a C++ variant of numpy's implementation:
+ * https://github.com/numpy/numpy/blob/031f44252d613f4524ad181e3eb2ae2791e22187/numpy/random/_generator.pyx#L925
+ */
+template <typename container_type, typename prob_type>
+void sample_without_replacement(container_type* output, prob_type* p, container_type* a, int population_size, int sample_size, std::mt19937& gen) {
+  // Copy the probability weights so that weights of already-sampled elements can be zeroed out
+  std::vector<prob_type> p_copy(population_size);
+  std::memcpy(p_copy.data(), p, sizeof(prob_type) * population_size);
+  std::vector<int> indices(sample_size);
+  std::uniform_real_distribution<> unif(0.0, 1.0);
+  std::vector<double> unif_samples(sample_size);
+  std::vector<prob_type> cdf(population_size);
+
+  int fulfilled_sample_count = 0;
+  int remaining_sample_count = sample_size - fulfilled_sample_count;
+  while (fulfilled_sample_count < sample_size) {
+    if (fulfilled_sample_count > 0) {
+      for (int i = 0; i < fulfilled_sample_count; i++) p_copy[indices[i]] = 0.0;
+    }
+    std::generate(unif_samples.begin(), unif_samples.begin() + remaining_sample_count, [&gen, &unif](){
+      return unif(gen);
+    });
+    // Build and renormalize the CDF of the remaining (not-yet-sampled) weights
+    std::partial_sum(p_copy.cbegin(), p_copy.cend(), cdf.begin());
+    for (int i = 0; i < cdf.size(); i++) {
+      cdf[i] = cdf[i] / cdf[cdf.size()-1];
+    }
+    // Invert each uniform draw through the CDF, retaining only the unique indices found
+    std::vector<int> matches(remaining_sample_count);
+    for (int i = 0; i < remaining_sample_count; i++) {
+      auto match = std::upper_bound(cdf.cbegin(), cdf.cend(), unif_samples[i]);
+      if (match != cdf.cend()) {
+        matches[i] = std::distance(cdf.cbegin(), match);
+      } else {
+        matches[i] = std::distance(cdf.cbegin(), cdf.cend());
+      }
+    }
+    std::sort(matches.begin(), matches.end());
+    auto last_unique = std::unique(matches.begin(), matches.end());
+    matches.erase(last_unique, matches.end());
+    for (int i = 0; i < matches.size(); i++) {
+      indices[fulfilled_sample_count + i] = matches[i];
+    }
+    fulfilled_sample_count += matches.size();
+    remaining_sample_count -= matches.size();
+  }
+  for (int i = 0; i < sample_size; i++) {
+    output[i] = a[indices[i]];
+  }
+}
+
+} // namespace StochTree
+
+#endif // STOCHTREE_DISCRETE_SAMPLER_H_
diff --git a/man/sample_without_replacement.Rd b/man/sample_without_replacement.Rd
new file mode 100644
index 00000000..36041e7d
--- /dev/null
+++ b/man/sample_without_replacement.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/model.R
+\name{sample_without_replacement}
+\alias{sample_without_replacement}
+\title{Draw \code{sample_size} samples from \code{population_vector} without replacement, weighted by \code{sampling_probabilities}}
+\usage{
+sample_without_replacement(
+  population_vector,
+  sampling_probabilities,
+  sample_size
+)
+}
+\arguments{
+\item{population_vector}{Vector from which to draw samples.}
+
+\item{sampling_probabilities}{Vector of probabilities of drawing each element of \code{population_vector}.}
+
+\item{sample_size}{Number of samples to draw from \code{population_vector}.
Must be less than or equal to \code{length(population_vector)}} +} +\value{ +Vector of size \code{sample_size} +} +\description{ +Draw \code{sample_size} samples from \code{population_vector} without replacement, weighted by \code{sampling_probabilities} +} +\examples{ +a <- c(4,3,2,5,1,9,7) +p <- c(0.7,0.2,0.05,0.02,0.01,0.01,0.01) +num_samples <- 5 +sample_without_replacement(a, p, num_samples) +} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index d32eba1f..362656fa 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -1186,6 +1186,13 @@ extern "C" SEXP _stochtree_forest_tracker_cpp(SEXP data, SEXP feature_types, SEX return cpp11::as_sexp(forest_tracker_cpp(cpp11::as_cpp>>(data), cpp11::as_cpp>(feature_types), cpp11::as_cpp>(num_trees), cpp11::as_cpp>(n))); END_CPP11 } +// sampler.cpp +cpp11::writable::integers sample_without_replacement_integer_cpp(cpp11::integers population_vector, cpp11::doubles sampling_probs, int sample_size); +extern "C" SEXP _stochtree_sample_without_replacement_integer_cpp(SEXP population_vector, SEXP sampling_probs, SEXP sample_size) { + BEGIN_CPP11 + return cpp11::as_sexp(sample_without_replacement_integer_cpp(cpp11::as_cpp>(population_vector), cpp11::as_cpp>(sampling_probs), cpp11::as_cpp>(sample_size))); + END_CPP11 +} // serialization.cpp cpp11::external_pointer init_json_cpp(); extern "C" SEXP _stochtree_init_json_cpp() { @@ -1673,6 +1680,7 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_sample_mcmc_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_mcmc_one_iteration_cpp, 17}, {"_stochtree_sample_sigma2_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_sigma2_one_iteration_cpp, 5}, {"_stochtree_sample_tau_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_tau_one_iteration_cpp, 4}, + {"_stochtree_sample_without_replacement_integer_cpp", (DL_FUNC) &_stochtree_sample_without_replacement_integer_cpp, 3}, {"_stochtree_set_leaf_value_active_forest_cpp", (DL_FUNC) &_stochtree_set_leaf_value_active_forest_cpp, 2}, {"_stochtree_set_leaf_value_forest_container_cpp", (DL_FUNC) &_stochtree_set_leaf_value_forest_container_cpp, 2}, {"_stochtree_set_leaf_vector_active_forest_cpp", (DL_FUNC) &_stochtree_set_leaf_vector_active_forest_cpp, 2}, diff --git a/src/sampler.cpp b/src/sampler.cpp index 514ae006..30ff2540 100644 --- a/src/sampler.cpp +++ b/src/sampler.cpp @@ -1,6 +1,7 @@ #include #include "stochtree_types.h" #include +#include #include #include #include @@ -281,4 +282,39 @@ cpp11::external_pointer forest_tracker_cpp(cpp11::exte // Release management of the pointer to R session return cpp11::external_pointer(tracker_ptr_.release()); -} \ No newline at end of file +} + +[[cpp11::register]] +cpp11::writable::integers sample_without_replacement_integer_cpp( + cpp11::integers population_vector, + cpp11::doubles sampling_probs, + int sample_size +) { + // Unpack pointer to population vector + int population_size = population_vector.size(); + int* population_vector_ptr = INTEGER(PROTECT(population_vector)); + + // Unpack pointer to sampling probabilities + double* sampling_probs_ptr = REAL(PROTECT(sampling_probs)); + + // Create output vector + cpp11::writable::integers output(sample_size); + + // Unpack pointer to output vector + int* output_ptr = INTEGER(PROTECT(output)); + + // Create C++ RNG + std::random_device rd; + std::mt19937 gen(rd()); + + // Run the sampler + StochTree::sample_without_replacement( + output_ptr, sampling_probs_ptr, population_vector_ptr, population_size, sample_size, gen + ); + + // Unprotect raw pointers + UNPROTECT(3); + + // Return 
result + return(output); +} diff --git a/src/stochtree_types.h b/src/stochtree_types.h index 33a8fb61..d3d6327c 100644 --- a/src/stochtree_types.h +++ b/src/stochtree_types.h @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/tools/debug/sampling_without_replacement.R b/tools/debug/sampling_without_replacement.R new file mode 100644 index 00000000..9f32081e --- /dev/null +++ b/tools/debug/sampling_without_replacement.R @@ -0,0 +1,7 @@ +library(stochtree) + +# Run sampler +a <- c(4,3,2,5,1,9,7) +p <- c(0.7,0.2,0.05,0.02,0.01,0.01,0.01) +num_samples <- 5 +sample_without_replacement(as.integer(a), p, num_samples) diff --git a/tools/perf/sampling_without_replacement_comparison.R b/tools/perf/sampling_without_replacement_comparison.R new file mode 100644 index 00000000..1dc9c60a --- /dev/null +++ b/tools/perf/sampling_without_replacement_comparison.R @@ -0,0 +1,23 @@ +library(stochtree) + +a <- c(4,3,2,5,1,9,7) +p <- c(0.7,0.2,0.05,0.02,0.01,0.01,0.01) +num_mc <- 100000 +num_samples <- 5 +results_stochtree <- matrix(NA_integer_, nrow = num_mc, ncol = num_samples) +results_base_R <- matrix(NA_integer_, nrow = num_mc, ncol = num_samples) + +for (i in 1:num_mc) { + results_stochtree[i,] <- sample_without_replacement(as.integer(a), p, num_samples) + results_base_R[i,] <- sample(a, num_samples, replace = F, prob = p) +} + + +count_elems_stochtree <- rep(NA_integer_, length(a)) +count_elems_base_R <- rep(NA_integer_, length(a)) +for (i in 1:length(a)) { + count_elems_stochtree[i] <- sum(rowSums(results_stochtree == a[i])) + count_elems_base_R[i] <- sum(rowSums(results_base_R == a[i])) +} +count_elems_stochtree +count_elems_base_R diff --git a/tools/perf/sampling_without_replacement_microbenchmark.R b/tools/perf/sampling_without_replacement_microbenchmark.R new file mode 100644 index 00000000..254ed05a --- /dev/null +++ b/tools/perf/sampling_without_replacement_microbenchmark.R @@ -0,0 +1,13 @@ +library(microbenchmark) +library(stochtree) + +# Run microbenchmark +num_elements <- 10000 +a <- 1:num_elements +p <- runif(num_elements) +p <- p / sum(p) +num_samples <- 500 +(bench_results <- microbenchmark( + sample(a, num_samples, replace = F, prob = p), + sample_without_replacement(as.integer(a), p, num_samples) +)) From d82432e383220659ff9bb001ce6f30caa512c35b Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 8 Jun 2025 14:54:00 -0600 Subject: [PATCH 02/12] Fixed R sampling without replacement example --- R/model.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/model.R b/R/model.R index 9519591c..f1ea8e54 100644 --- a/R/model.R +++ b/R/model.R @@ -282,7 +282,7 @@ createForestModel <- function(forest_dataset, forest_model_config, global_model_ #' @export #' #' @examples -#' a <- c(4,3,2,5,1,9,7) +#' a <- as.integer(c(4,3,2,5,1,9,7)) #' p <- c(0.7,0.2,0.05,0.02,0.01,0.01,0.01) #' num_samples <- 5 #' sample_without_replacement(a, p, num_samples) From 1fe4101d1d0759f3e216a46406057bdb68b550c1 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 8 Jun 2025 15:27:33 -0600 Subject: [PATCH 03/12] Updated docs --- man/sample_without_replacement.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/sample_without_replacement.Rd b/man/sample_without_replacement.Rd index 36041e7d..8f23d8e1 100644 --- a/man/sample_without_replacement.Rd +++ b/man/sample_without_replacement.Rd @@ -24,7 +24,7 @@ Vector of size \code{sample_size} Draw \code{sample_size} samples from \code{population_vector} without replacement, weighted by 
\code{sampling_probabilities} } \examples{ -a <- c(4,3,2,5,1,9,7) +a <- as.integer(c(4,3,2,5,1,9,7)) p <- c(0.7,0.2,0.05,0.02,0.01,0.01,0.01) num_samples <- 5 sample_without_replacement(a, p, num_samples) From 4cbcc5a4169ad02dbb17c66a6f97c11f1a1250e8 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 9 Jun 2025 16:05:09 -0500 Subject: [PATCH 04/12] Added support for GFR feature subsampling in the BART R API --- R/bart.R | 24 +++++-- R/config.R | 42 +++++++++++- R/cpp11.R | 4 +- R/model.R | 3 +- include/stochtree/tree_sampler.h | 47 ++++++++++---- man/ForestModelConfig.Rd | 40 +++++++++++- man/bart.Rd | 3 + man/createForestModelConfig.Rd | 5 +- src/cpp11.cpp | 8 +-- src/sampler.cpp | 10 +-- tools/debug/gfr_feature_subsample_debug.R | 36 +++++++++++ .../gfr_feature_subsample_microbenchmark.R | 64 +++++++++++++++++++ 12 files changed, 252 insertions(+), 34 deletions(-) create mode 100644 tools/debug/gfr_feature_subsample_debug.R create mode 100644 tools/perf/gfr_feature_subsample_microbenchmark.R diff --git a/R/bart.R b/R/bart.R index 224432ed..6576c83e 100644 --- a/R/bart.R +++ b/R/bart.R @@ -59,6 +59,7 @@ #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. #' - `probit_outcome_model` Whether or not the outcome should be modeled as explicitly binary via a probit link. If `TRUE`, `y` must only contain the values `0` and `1`. Default: `FALSE`. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. #' #' @param variance_forest_params (Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional. #' @@ -73,6 +74,7 @@ #' - `var_forest_prior_scale` Scale parameter in the `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance model (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / leaf_prior_calibration_param^2` if not set. #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. #' #' @return List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk). 
#' @export @@ -98,6 +100,7 @@ #' X_train <- X[train_inds,] #' y_test <- y[test_inds] #' y_train <- y[train_inds] +#' #' bart_model <- bart(X_train = X_train, y_train = y_train, X_test = X_test, #' num_gfr = 10, num_burnin = 0, num_mcmc = 10) bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train = NULL, @@ -127,7 +130,8 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train sample_sigma2_leaf = TRUE, sigma2_leaf_init = NULL, sigma2_leaf_shape = 3, sigma2_leaf_scale = NULL, keep_vars = NULL, drop_vars = NULL, - probit_outcome_model = FALSE + probit_outcome_model = FALSE, + num_features_subsample = NULL ) mean_forest_params_updated <- preprocessParams( mean_forest_params_default, mean_forest_params @@ -141,7 +145,8 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train var_forest_leaf_init = NULL, var_forest_prior_shape = NULL, var_forest_prior_scale = NULL, - keep_vars = NULL, drop_vars = NULL + keep_vars = NULL, drop_vars = NULL, + num_features_subsample = NULL ) variance_forest_params_updated <- preprocessParams( variance_forest_params_default, variance_forest_params @@ -176,6 +181,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train keep_vars_mean <- mean_forest_params_updated$keep_vars drop_vars_mean <- mean_forest_params_updated$drop_vars probit_outcome_model <- mean_forest_params_updated$probit_outcome_model + num_features_subsample_mean <- mean_forest_params_updated$num_features_subsample # 3. Variance forest parameters num_trees_variance <- variance_forest_params_updated$num_trees @@ -189,6 +195,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train b_forest <- variance_forest_params_updated$var_forest_prior_scale keep_vars_variance <- variance_forest_params_updated$keep_vars drop_vars_variance <- variance_forest_params_updated$drop_vars + num_features_subsample_variance <- variance_forest_params_updated$num_features_subsample # Check if there are enough GFR samples to seed num_chains samplers if (num_gfr > 0) { @@ -373,6 +380,15 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train variable_weights_variance <- variable_weights_variance[original_var_indices]*variable_weights_adj variable_weights_variance[!(original_var_indices %in% variable_subset_variance)] <- 0 } + + # Set num_features_subsample to default, ncol(X_train), if not already set + if (is.null(num_features_subsample_mean)) { + num_features_subsample_mean <- ncol(X_train) + } + if (is.null(num_features_subsample_variance)) { + num_features_subsample_variance <- ncol(X_train) + } + # Convert all input data to matrices if not already converted if ((is.null(dim(leaf_basis_train))) && (!is.null(leaf_basis_train))) { @@ -633,7 +649,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train num_observations=nrow(X_train), variable_weights=variable_weights_mean, leaf_dimension=leaf_dimension, alpha=alpha_mean, beta=beta_mean, min_samples_leaf=min_samples_leaf_mean, max_depth=max_depth_mean, leaf_model_type=leaf_model_mean_forest, leaf_model_scale=current_leaf_scale, - cutpoint_grid_size=cutpoint_grid_size) + cutpoint_grid_size=cutpoint_grid_size, num_features_subsample=num_features_subsample_mean) forest_model_mean <- createForestModel(forest_dataset_train, forest_model_config_mean, global_model_config) } if (include_variance_forest) { @@ -641,7 +657,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train 
num_observations=nrow(X_train), variable_weights=variable_weights_variance, leaf_dimension=1, alpha=alpha_variance, beta=beta_variance, min_samples_leaf=min_samples_leaf_variance, max_depth=max_depth_variance, leaf_model_type=leaf_model_variance_forest, - cutpoint_grid_size=cutpoint_grid_size) + cutpoint_grid_size=cutpoint_grid_size, num_features_subsample=num_features_subsample_variance) forest_model_variance <- createForestModel(forest_dataset_train, forest_model_config_variance, global_model_config) } diff --git a/R/config.R b/R/config.R index a5982cac..53c259ea 100644 --- a/R/config.R +++ b/R/config.R @@ -61,6 +61,9 @@ ForestModelConfig <- R6::R6Class( #' @field cutpoint_grid_size Number of unique cutpoints to consider cutpoint_grid_size = NULL, + + #' @field num_features_subsample Number of features to subsample for the GFR algorithm + num_features_subsample = NULL, #' Create a new ForestModelConfig object. #' @@ -80,13 +83,14 @@ ForestModelConfig <- R6::R6Class( #' @param variance_forest_shape Shape parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1`. #' @param variance_forest_scale Scale parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1`. #' @param cutpoint_grid_size Number of unique cutpoints to consider (default: `100`) + #' @param num_features_subsample Number of features to subsample for the GFR algorithm #' #' @return A new ForestModelConfig object. initialize = function(feature_types = NULL, sweep_update_indices = NULL, num_trees = NULL, num_features = NULL, num_observations = NULL, variable_weights = NULL, leaf_dimension = 1, alpha = 0.95, beta = 2.0, min_samples_leaf = 5, max_depth = -1, leaf_model_type = 1, leaf_model_scale = NULL, variance_forest_shape = 1.0, - variance_forest_scale = 1.0, cutpoint_grid_size = 100) { + variance_forest_scale = 1.0, cutpoint_grid_size = 100, num_features_subsample = NULL) { if (is.null(feature_types)) { if (is.null(num_features)) { stop("Neither of `num_features` nor `feature_types` (a vector from which `num_features` can be inferred) was provided. 
Please provide at least one of these inputs when creating a ForestModelConfig object.") @@ -132,6 +136,16 @@ ForestModelConfig <- R6::R6Class( self$variance_forest_shape <- variance_forest_shape self$variance_forest_scale <- variance_forest_scale self$cutpoint_grid_size <- cutpoint_grid_size + if (is.null(num_features_subsample)) { + num_features_subsample <- num_features + } + if (num_features_subsample > num_features) { + stop("`num_features_subsample` cannot be larger than `num_features`") + } + if (num_features_subsample <= 0) { + stop("`num_features_subsample` must at least 1") + } + self$num_features_subsample <- num_features_subsample if (!(as.integer(leaf_model_type) == leaf_model_type)) { stop("`leaf_model_type` must be an integer between 0 and 3") @@ -255,6 +269,19 @@ ForestModelConfig <- R6::R6Class( self$cutpoint_grid_size <- cutpoint_grid_size }, + #' @description + #' Update number of features to subsample for the GFR algorithm + #' @param num_features_subsample Number of features to subsample for the GFR algorithm + update_num_features_subsample = function(num_features_subsample) { + if (num_features_subsample > self$num_features) { + stop("`num_features_subsample` cannot be larger than `num_features`") + } + if (num_features_subsample <= 0) { + stop("`num_features_subsample` must at least 1") + } + self$num_features_subsample <- num_features_subsample + }, + #' @description #' Query feature types for this ForestModelConfig object #' @returns Vector of integer-coded feature types (integers where 0 = numeric, 1 = ordered categorical, 2 = unordered categorical) @@ -358,6 +385,13 @@ ForestModelConfig <- R6::R6Class( #' @returns Number of unique cutpoints to consider get_cutpoint_grid_size = function() { return(self$cutpoint_grid_size) + }, + + #' @description + #' Query number of features to subsample for the GFR algorithm + #' @returns Number of features to subsample for the GFR algorithm + get_num_features_subsample = function() { + return(self$num_features_subsample) } ) ) @@ -424,6 +458,7 @@ GlobalModelConfig <- R6::R6Class( #' @param variance_forest_shape Shape parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1`. #' @param variance_forest_scale Scale parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1`. 
#' @param cutpoint_grid_size Number of unique cutpoints to consider (default: `100`) +#' @param num_features_subsample Number of features to subsample for the GFR algorithm #' @return ForestModelConfig object #' @export #' @@ -433,12 +468,13 @@ createForestModelConfig <- function(feature_types = NULL, sweep_update_indices = num_observations = NULL, variable_weights = NULL, leaf_dimension = 1, alpha = 0.95, beta = 2.0, min_samples_leaf = 5, max_depth = -1, leaf_model_type = 1, leaf_model_scale = NULL, variance_forest_shape = 1.0, - variance_forest_scale = 1.0, cutpoint_grid_size = 100){ + variance_forest_scale = 1.0, cutpoint_grid_size = 100, + num_features_subsample = NULL){ return(invisible(( ForestModelConfig$new(feature_types, sweep_update_indices, num_trees, num_features, num_observations, variable_weights, leaf_dimension, alpha, beta, min_samples_leaf, max_depth, leaf_model_type, leaf_model_scale, variance_forest_shape, - variance_forest_scale, cutpoint_grid_size) + variance_forest_scale, cutpoint_grid_size, num_features_subsample) ))) } diff --git a/R/cpp11.R b/R/cpp11.R index c6f147ba..943206f1 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -580,8 +580,8 @@ compute_leaf_indices_cpp <- function(forest_container, covariates, forest_nums) .Call(`_stochtree_compute_leaf_indices_cpp`, forest_container, covariates, forest_nums) } -sample_gfr_one_iteration_cpp <- function(data, residual, forest_samples, active_forest, tracker, split_prior, rng, sweep_indices, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, keep_forest) { - invisible(.Call(`_stochtree_sample_gfr_one_iteration_cpp`, data, residual, forest_samples, active_forest, tracker, split_prior, rng, sweep_indices, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, keep_forest)) +sample_gfr_one_iteration_cpp <- function(data, residual, forest_samples, active_forest, tracker, split_prior, rng, sweep_indices, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, keep_forest, num_features_subsample) { + invisible(.Call(`_stochtree_sample_gfr_one_iteration_cpp`, data, residual, forest_samples, active_forest, tracker, split_prior, rng, sweep_indices, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, keep_forest, num_features_subsample)) } sample_mcmc_one_iteration_cpp <- function(data, residual, forest_samples, active_forest, tracker, split_prior, rng, sweep_indices, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, keep_forest) { diff --git a/R/model.R b/R/model.R index f1ea8e54..955037b0 100644 --- a/R/model.R +++ b/R/model.R @@ -86,6 +86,7 @@ ForestModel <- R6::R6Class( b_forest <- forest_model_config$variance_forest_scale global_scale <- global_model_config$global_error_variance cutpoint_grid_size <- forest_model_config$cutpoint_grid_size + num_features_subsample <- forest_model_config$num_features_subsample # Default to empty integer vector if sweep_update_indices is NULL if (is.null(sweep_update_indices)) { @@ -113,7 +114,7 @@ ForestModel <- R6::R6Class( forest_dataset$data_ptr, residual$data_ptr, forest_samples$forest_container_ptr, active_forest$forest_ptr, self$tracker_ptr, self$tree_prior_ptr, rng$rng_ptr, sweep_update_indices, feature_types, 
cutpoint_grid_size, leaf_model_scale, - variable_weights, a_forest, b_forest, global_scale, leaf_model_int, keep_forest + variable_weights, a_forest, b_forest, global_scale, leaf_model_int, keep_forest, num_features_subsample ) } else { sample_mcmc_one_iteration_cpp( diff --git a/include/stochtree/tree_sampler.h b/include/stochtree/tree_sampler.h index 5c234d23..750364d7 100644 --- a/include/stochtree/tree_sampler.h +++ b/include/stochtree/tree_sampler.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -469,10 +471,10 @@ static inline void AdjustStateAfterTreeSampling(ForestTracker& tracker, LeafMode template static inline void EvaluateAllPossibleSplits( - ForestDataset& dataset, ForestTracker& tracker, ColumnVector& residual, TreePrior& tree_prior, LeafModel& leaf_model, double global_variance, int tree_num, int split_node_id, + ForestDataset& dataset, ForestTracker& tracker, ColumnVector& residual, TreePrior& tree_prior, std::mt19937& gen, LeafModel& leaf_model, double global_variance, int tree_num, int split_node_id, std::vector& log_cutpoint_evaluations, std::vector& cutpoint_features, std::vector& cutpoint_values, std::vector& cutpoint_feature_types, data_size_t& valid_cutpoint_count, CutpointGridContainer& cutpoint_grid_container, data_size_t node_begin, data_size_t node_end, std::vector& variable_weights, - std::vector& feature_types, LeafSuffStatConstructorArgs&... leaf_suff_stat_args + std::vector& feature_types, std::vector& feature_subset, LeafSuffStatConstructorArgs&... leaf_suff_stat_args ) { // Initialize sufficient statistics LeafSuffStat node_suff_stat = LeafSuffStat(leaf_suff_stat_args...); @@ -510,7 +512,7 @@ static inline void EvaluateAllPossibleSplits( double split_log_ml; for (int j = 0; j < covariates.cols(); j++) { - if (std::abs(variable_weights.at(j)) > kEpsilon) { + if ((std::abs(variable_weights.at(j)) > kEpsilon) && (feature_subset[j])) { // Enumerate cutpoint strides cutpoint_grid_container.CalculateStrides(covariates, outcome, tracker.GetSortedNodeSampleTracker(), split_node_id, node_begin, node_end, j, feature_types); @@ -571,14 +573,15 @@ static inline void EvaluateCutpoints(Tree* tree, ForestTracker& tracker, LeafMod std::mt19937& gen, int tree_num, double global_variance, int cutpoint_grid_size, int node_id, data_size_t node_begin, data_size_t node_end, std::vector& log_cutpoint_evaluations, std::vector& cutpoint_features, std::vector& cutpoint_values, std::vector& cutpoint_feature_types, data_size_t& valid_cutpoint_count, std::vector& variable_weights, - std::vector& feature_types, CutpointGridContainer& cutpoint_grid_container, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { + std::vector& feature_types, CutpointGridContainer& cutpoint_grid_container, + std::vector& feature_subset, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { // Evaluate all possible cutpoints according to the leaf node model, // recording their log-likelihood and other split information in a series of vectors. // The last element of these vectors concerns the "no-split" option. 
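  // Note: candidate splits are only enumerated for features with a nonzero variable weight
  // that are also flagged in feature_subset, the per-tree random subsample drawn in
  // GFRSampleTreeOneIter when num_features_subsample is smaller than the number of features.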
EvaluateAllPossibleSplits( - dataset, tracker, residual, tree_prior, leaf_model, global_variance, tree_num, node_id, log_cutpoint_evaluations, + dataset, tracker, residual, tree_prior, gen, leaf_model, global_variance, tree_num, node_id, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, - node_begin, node_end, variable_weights, feature_types, leaf_suff_stat_args... + node_begin, node_end, variable_weights, feature_types, feature_subset, leaf_suff_stat_args... ); // Compute an adjustment to reflect the no split prior probability and the number of cutpoints @@ -599,7 +602,7 @@ static inline void SampleSplitRule(Tree* tree, ForestTracker& tracker, LeafModel TreePrior& tree_prior, std::mt19937& gen, int tree_num, double global_variance, int cutpoint_grid_size, std::unordered_map>& node_index_map, std::deque& split_queue, int node_id, data_size_t node_begin, data_size_t node_end, std::vector& variable_weights, - std::vector& feature_types, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { + std::vector& feature_types, std::vector feature_subset, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { // Leaf depth int leaf_depth = tree->GetDepth(node_id); @@ -619,7 +622,7 @@ static inline void SampleSplitRule(Tree* tree, ForestTracker& tracker, LeafModel tree, tracker, leaf_model, dataset, residual, tree_prior, gen, tree_num, global_variance, cutpoint_grid_size, node_id, node_begin, node_end, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, cutpoint_feature_types, valid_cutpoint_count, variable_weights, feature_types, - cutpoint_grid_container, leaf_suff_stat_args... + cutpoint_grid_container, feature_subset, leaf_suff_stat_args... ); // TODO: maybe add some checks here? @@ -702,12 +705,32 @@ template & variable_weights, int tree_num, double global_variance, std::vector& feature_types, int cutpoint_grid_size, - LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { + int num_features_subsample, LeafSuffStatConstructorArgs&... 
leaf_suff_stat_args) { int root_id = Tree::kRoot; int curr_node_id; data_size_t curr_node_begin; data_size_t curr_node_end; data_size_t n = dataset.GetCovariates().rows(); + int p = dataset.GetCovariates().cols(); + + // Subsample features (if requested) + std::vector feature_subset(p, true); + if (num_features_subsample < p) { + std::vector feature_indices(p); + std::iota(feature_indices.begin(), feature_indices.end(), 0); + std::vector features_selected(num_features_subsample); + sample_without_replacement( + features_selected.data(), variable_weights.data(), feature_indices.data(), + p, num_features_subsample, gen + ); + for (int i = 0; i < p; i++) { + feature_subset.at(i) = false; + } + for (const auto& feat : features_selected) { + feature_subset.at(feat) = true; + } + } + // Mapping from node id to start and end points of sorted indices std::unordered_map> node_index_map; node_index_map.insert({root_id, std::make_pair(0, n)}); @@ -728,7 +751,7 @@ static inline void GFRSampleTreeOneIter(Tree* tree, ForestTracker& tracker, Fore SampleSplitRule( tree, tracker, leaf_model, dataset, residual, tree_prior, gen, tree_num, global_variance, cutpoint_grid_size, node_index_map, split_queue, curr_node_id, curr_node_begin, curr_node_end, variable_weights, feature_types, - leaf_suff_stat_args...); + feature_subset, leaf_suff_stat_args...); } } @@ -765,7 +788,7 @@ template & variable_weights, std::vector& sweep_update_indices, double global_variance, std::vector& feature_types, int cutpoint_grid_size, - bool keep_forest, bool pre_initialized, bool backfitting, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { + bool keep_forest, bool pre_initialized, bool backfitting, int num_features_subsample, LeafSuffStatConstructorArgs&... leaf_suff_stat_args) { // Run the GFR algorithm for each tree int num_trees = forests.NumTrees(); for (const int& i : sweep_update_indices) { @@ -785,7 +808,7 @@ static inline void GFRSampleOneIter(TreeEnsemble& active_forest, ForestTracker& GFRSampleTreeOneIter( tree, tracker, forests, leaf_model, dataset, residual, tree_prior, gen, variable_weights, i, global_variance, feature_types, cutpoint_grid_size, - leaf_suff_stat_args... + num_features_subsample, leaf_suff_stat_args... ); // Sample leaf parameters for tree i diff --git a/man/ForestModelConfig.Rd b/man/ForestModelConfig.Rd index 5fe5c755..a20bf36a 100644 --- a/man/ForestModelConfig.Rd +++ b/man/ForestModelConfig.Rd @@ -34,6 +34,8 @@ Shape parameter for IG leaf models Scale parameter for IG leaf models Number of unique cutpoints to consider + +Number of features to subsample for the GFR algorithm } \description{ The "low-level" stochtree interface enables a high degreee of sampler @@ -76,7 +78,9 @@ forest model they wish to run. 
\item{\code{variance_forest_scale}}{Scale parameter for IG leaf models (applicable when \code{leaf_model_type = 3})} -\item{\code{cutpoint_grid_size}}{Number of unique cutpoints to consider +\item{\code{cutpoint_grid_size}}{Number of unique cutpoints to consider} + +\item{\code{num_features_subsample}}{Number of features to subsample for the GFR algorithm Create a new ForestModelConfig object.} } \if{html}{\out{}} @@ -96,6 +100,7 @@ Create a new ForestModelConfig object.} \item \href{#method-ForestModelConfig-update_variance_forest_shape}{\code{ForestModelConfig$update_variance_forest_shape()}} \item \href{#method-ForestModelConfig-update_variance_forest_scale}{\code{ForestModelConfig$update_variance_forest_scale()}} \item \href{#method-ForestModelConfig-update_cutpoint_grid_size}{\code{ForestModelConfig$update_cutpoint_grid_size()}} +\item \href{#method-ForestModelConfig-update_num_features_subsample}{\code{ForestModelConfig$update_num_features_subsample()}} \item \href{#method-ForestModelConfig-get_feature_types}{\code{ForestModelConfig$get_feature_types()}} \item \href{#method-ForestModelConfig-get_sweep_indices}{\code{ForestModelConfig$get_sweep_indices()}} \item \href{#method-ForestModelConfig-get_variable_weights}{\code{ForestModelConfig$get_variable_weights()}} @@ -111,6 +116,7 @@ Create a new ForestModelConfig object.} \item \href{#method-ForestModelConfig-get_variance_forest_shape}{\code{ForestModelConfig$get_variance_forest_shape()}} \item \href{#method-ForestModelConfig-get_variance_forest_scale}{\code{ForestModelConfig$get_variance_forest_scale()}} \item \href{#method-ForestModelConfig-get_cutpoint_grid_size}{\code{ForestModelConfig$get_cutpoint_grid_size()}} +\item \href{#method-ForestModelConfig-get_num_features_subsample}{\code{ForestModelConfig$get_num_features_subsample()}} } } \if{html}{\out{
}} @@ -134,7 +140,8 @@ Create a new ForestModelConfig object.} leaf_model_scale = NULL, variance_forest_shape = 1, variance_forest_scale = 1, - cutpoint_grid_size = 100 + cutpoint_grid_size = 100, + num_features_subsample = NULL )}\if{html}{\out{}} } @@ -172,6 +179,8 @@ Create a new ForestModelConfig object.} \item{\code{variance_forest_scale}}{Scale parameter for IG leaf models (applicable when \code{leaf_model_type = 3}). Default: \code{1}.} \item{\code{cutpoint_grid_size}}{Number of unique cutpoints to consider (default: \code{100})} + +\item{\code{num_features_subsample}}{Number of features to subsample for the GFR algorithm} } \if{html}{\out{}} } @@ -367,6 +376,23 @@ Update number of unique cutpoints to consider } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModelConfig-update_num_features_subsample}{}}} +\subsection{Method \code{update_num_features_subsample()}}{ +Update number of features to subsample for the GFR algorithm +\subsection{Usage}{ +\if{html}{\out{
<div class="r">}}\preformatted{ForestModelConfig$update_num_features_subsample(num_features_subsample)}\if{html}{\out{</div>
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
<div class="arguments">}}
\describe{
\item{\code{num_features_subsample}}{Number of features to subsample for the GFR algorithm}
}
\if{html}{\out{</div>
}} +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-ForestModelConfig-get_feature_types}{}}} \subsection{Method \code{get_feature_types()}}{ @@ -515,5 +541,15 @@ Query number of unique cutpoints to consider for this ForestModelConfig object \if{html}{\out{
<div class="r">}}\preformatted{ForestModelConfig$get_cutpoint_grid_size()}\if{html}{\out{</div>
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ForestModelConfig-get_num_features_subsample}{}}} +\subsection{Method \code{get_num_features_subsample()}}{ +Query number of features to subsample for the GFR algorithm +\subsection{Usage}{ +\if{html}{\out{
<div class="r">}}\preformatted{ForestModelConfig$get_num_features_subsample()}\if{html}{\out{</div>
}} +} + } } diff --git a/man/bart.Rd b/man/bart.Rd index 7a6b8006..0ea4d966 100644 --- a/man/bart.Rd +++ b/man/bart.Rd @@ -99,6 +99,7 @@ that were not in the training set.} \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. \item \code{probit_outcome_model} Whether or not the outcome should be modeled as explicitly binary via a probit link. If \code{TRUE}, \code{y} must only contain the values \code{0} and \code{1}. Default: \code{FALSE}. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. }} \item{variance_forest_params}{(Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional. @@ -114,6 +115,7 @@ that were not in the training set.} \item \code{var_forest_prior_scale} Scale parameter in the \code{IG(var_forest_prior_shape, var_forest_prior_scale)} conditional error variance model (which is only sampled if \code{num_trees > 0}). Calibrated internally as \code{num_trees / leaf_prior_calibration_param^2} if not set. \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. }} } \value{ @@ -143,6 +145,7 @@ X_test <- X[test_inds,] X_train <- X[train_inds,] y_test <- y[test_inds] y_train <- y[train_inds] + bart_model <- bart(X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = 10, num_burnin = 0, num_mcmc = 10) } diff --git a/man/createForestModelConfig.Rd b/man/createForestModelConfig.Rd index 235552aa..b46be0e5 100644 --- a/man/createForestModelConfig.Rd +++ b/man/createForestModelConfig.Rd @@ -20,7 +20,8 @@ createForestModelConfig( leaf_model_scale = NULL, variance_forest_shape = 1, variance_forest_scale = 1, - cutpoint_grid_size = 100 + cutpoint_grid_size = 100, + num_features_subsample = NULL ) } \arguments{ @@ -55,6 +56,8 @@ createForestModelConfig( \item{variance_forest_scale}{Scale parameter for IG leaf models (applicable when \code{leaf_model_type = 3}). 
Default: \code{1}.} \item{cutpoint_grid_size}{Number of unique cutpoints to consider (default: \code{100})} + +\item{num_features_subsample}{Number of features to subsample for the GFR algorithm} } \value{ ForestModelConfig object diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 362656fa..aea80bc6 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -1076,10 +1076,10 @@ extern "C" SEXP _stochtree_compute_leaf_indices_cpp(SEXP forest_container, SEXP END_CPP11 } // sampler.cpp -void sample_gfr_one_iteration_cpp(cpp11::external_pointer data, cpp11::external_pointer residual, cpp11::external_pointer forest_samples, cpp11::external_pointer active_forest, cpp11::external_pointer tracker, cpp11::external_pointer split_prior, cpp11::external_pointer rng, cpp11::integers sweep_indices, cpp11::integers feature_types, int cutpoint_grid_size, cpp11::doubles_matrix<> leaf_model_scale_input, cpp11::doubles variable_weights, double a_forest, double b_forest, double global_variance, int leaf_model_int, bool keep_forest); -extern "C" SEXP _stochtree_sample_gfr_one_iteration_cpp(SEXP data, SEXP residual, SEXP forest_samples, SEXP active_forest, SEXP tracker, SEXP split_prior, SEXP rng, SEXP sweep_indices, SEXP feature_types, SEXP cutpoint_grid_size, SEXP leaf_model_scale_input, SEXP variable_weights, SEXP a_forest, SEXP b_forest, SEXP global_variance, SEXP leaf_model_int, SEXP keep_forest) { +void sample_gfr_one_iteration_cpp(cpp11::external_pointer data, cpp11::external_pointer residual, cpp11::external_pointer forest_samples, cpp11::external_pointer active_forest, cpp11::external_pointer tracker, cpp11::external_pointer split_prior, cpp11::external_pointer rng, cpp11::integers sweep_indices, cpp11::integers feature_types, int cutpoint_grid_size, cpp11::doubles_matrix<> leaf_model_scale_input, cpp11::doubles variable_weights, double a_forest, double b_forest, double global_variance, int leaf_model_int, bool keep_forest, int num_features_subsample); +extern "C" SEXP _stochtree_sample_gfr_one_iteration_cpp(SEXP data, SEXP residual, SEXP forest_samples, SEXP active_forest, SEXP tracker, SEXP split_prior, SEXP rng, SEXP sweep_indices, SEXP feature_types, SEXP cutpoint_grid_size, SEXP leaf_model_scale_input, SEXP variable_weights, SEXP a_forest, SEXP b_forest, SEXP global_variance, SEXP leaf_model_int, SEXP keep_forest, SEXP num_features_subsample) { BEGIN_CPP11 - sample_gfr_one_iteration_cpp(cpp11::as_cpp>>(data), cpp11::as_cpp>>(residual), cpp11::as_cpp>>(forest_samples), cpp11::as_cpp>>(active_forest), cpp11::as_cpp>>(tracker), cpp11::as_cpp>>(split_prior), cpp11::as_cpp>>(rng), cpp11::as_cpp>(sweep_indices), cpp11::as_cpp>(feature_types), cpp11::as_cpp>(cutpoint_grid_size), cpp11::as_cpp>>(leaf_model_scale_input), cpp11::as_cpp>(variable_weights), cpp11::as_cpp>(a_forest), cpp11::as_cpp>(b_forest), cpp11::as_cpp>(global_variance), cpp11::as_cpp>(leaf_model_int), cpp11::as_cpp>(keep_forest)); + sample_gfr_one_iteration_cpp(cpp11::as_cpp>>(data), cpp11::as_cpp>>(residual), cpp11::as_cpp>>(forest_samples), cpp11::as_cpp>>(active_forest), cpp11::as_cpp>>(tracker), cpp11::as_cpp>>(split_prior), cpp11::as_cpp>>(rng), cpp11::as_cpp>(sweep_indices), cpp11::as_cpp>(feature_types), cpp11::as_cpp>(cutpoint_grid_size), cpp11::as_cpp>>(leaf_model_scale_input), cpp11::as_cpp>(variable_weights), cpp11::as_cpp>(a_forest), cpp11::as_cpp>(b_forest), cpp11::as_cpp>(global_variance), cpp11::as_cpp>(leaf_model_int), cpp11::as_cpp>(keep_forest), cpp11::as_cpp>(num_features_subsample)); return R_NilValue; END_CPP11 } @@ -1676,7 
+1676,7 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_rng_cpp", (DL_FUNC) &_stochtree_rng_cpp, 1}, {"_stochtree_root_reset_active_forest_cpp", (DL_FUNC) &_stochtree_root_reset_active_forest_cpp, 1}, {"_stochtree_root_reset_rfx_tracker_cpp", (DL_FUNC) &_stochtree_root_reset_rfx_tracker_cpp, 4}, - {"_stochtree_sample_gfr_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_gfr_one_iteration_cpp, 17}, + {"_stochtree_sample_gfr_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_gfr_one_iteration_cpp, 18}, {"_stochtree_sample_mcmc_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_mcmc_one_iteration_cpp, 17}, {"_stochtree_sample_sigma2_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_sigma2_one_iteration_cpp, 5}, {"_stochtree_sample_tau_one_iteration_cpp", (DL_FUNC) &_stochtree_sample_tau_one_iteration_cpp, 4}, diff --git a/src/sampler.cpp b/src/sampler.cpp index 30ff2540..1a5a5bb5 100644 --- a/src/sampler.cpp +++ b/src/sampler.cpp @@ -26,7 +26,7 @@ void sample_gfr_one_iteration_cpp(cpp11::external_pointer(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true); + StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample); } else if (model_type == StochTree::ModelType::kUnivariateRegressionLeafGaussian) { - StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true); + StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample); } else if (model_type == StochTree::ModelType::kMultivariateRegressionLeafGaussian) { - StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_basis); + StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample, num_basis); } else if (model_type == StochTree::ModelType::kLogLinearVariance) { - StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, false); + StochTree::GFRSampleOneIter(*active_forest, *tracker, *forest_samples, std::get(leaf_model), *data, *residual, *split_prior, *rng, var_weights_vector, sweep_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, false, num_features_subsample); } } diff --git a/tools/debug/gfr_feature_subsample_debug.R b/tools/debug/gfr_feature_subsample_debug.R new file mode 100644 
index 00000000..e724c6f5 --- /dev/null +++ b/tools/debug/gfr_feature_subsample_debug.R @@ -0,0 +1,36 @@ +# Load libraries +library(stochtree) + +# Generate the data +n <- 500 +p <- 20 +snr <- 2 +X <- matrix(runif(n*p), ncol = p) +f_XW <- sin(4*pi*X[,1]) + cos(4*pi*X[,2]) + sin(4*pi*X[,3]) + cos(4*pi*X[,4]) +noise_sd <- sd(f_XW) / snr +y <- f_XW + rnorm(n, 0, 1)*noise_sd + +# Split data into test and train sets +test_set_pct <- 0.2 +n_test <- round(test_set_pct*n) +n_train <- n - n_test +test_inds <- sort(sample(1:n, n_test, replace = FALSE)) +train_inds <- (1:n)[!((1:n) %in% test_inds)] +X_test <- as.data.frame(X[test_inds,]) +X_train <- as.data.frame(X[train_inds,]) +y_test <- y[test_inds] +y_train <- y[train_inds] + +# Sampler settings +num_gfr <- 100 +num_burnin <- 0 +num_mcmc <- 20 + +# Run the sampler +general_params <- list(random_seed = 1234, standardize = F, sample_sigma2_global = T) +mean_params <- list(num_trees = 100, num_features_subsample = 5) +bart_model <- stochtree::bart( + X_train = X, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, mean_forest_params = mean_params +) \ No newline at end of file diff --git a/tools/perf/gfr_feature_subsample_microbenchmark.R b/tools/perf/gfr_feature_subsample_microbenchmark.R new file mode 100644 index 00000000..fa289a28 --- /dev/null +++ b/tools/perf/gfr_feature_subsample_microbenchmark.R @@ -0,0 +1,64 @@ +# Load libraries +library(stochtree) +library(microbenchmark) + +# Generate the data +n <- 5000 +p <- 100 +snr <- 2 +X <- matrix(runif(n*p), ncol = p) +f_XW <- sin(4*pi*X[,1]) + cos(4*pi*X[,2]) + sin(4*pi*X[,3]) + cos(4*pi*X[,4]) +noise_sd <- sd(f_XW) / snr +y <- f_XW + rnorm(n, 0, 1)*noise_sd + +# Split data into test and train sets +test_set_pct <- 0.2 +n_test <- round(test_set_pct*n) +n_train <- n - n_test +test_inds <- sort(sample(1:n, n_test, replace = FALSE)) +train_inds <- (1:n)[!((1:n) %in% test_inds)] +X_test <- as.data.frame(X[test_inds,]) +X_train <- as.data.frame(X[train_inds,]) +y_test <- y[test_inds] +y_train <- y[train_inds] + +# Sampler settings +num_gfr <- 100 +num_burnin <- 0 +num_mcmc <- 0 +general_params <- list(sample_sigma2_global = T) +mean_params_a <- list(num_trees = 100, num_features_subsample = 5) +mean_params_b <- list(num_trees = 100) + +# Benchmark sampler with and without feature subsampling +microbenchmark::microbenchmark( + stochtree::bart( + X_train = X, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, mean_forest_params = mean_params_a + ), + stochtree::bart( + X_train = X, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, mean_forest_params = mean_params_b + ), + times = 5 +) + +Rprof() +stochtree::bart( + X_train = X, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, mean_forest_params = mean_params_a +) +Rprof(NULL) +summaryRprof() + +Rprof() +stochtree::bart( + X_train = X, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, mean_forest_params = mean_params_b +) +Rprof(NULL) +summaryRprof() From d09729364446f977d3ed0b8994a775098663f273 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 9 Jun 2025 18:33:58 -0500 Subject: [PATCH 05/12] Updated Python BART interface --- R/bart.R | 13 ++-- R/config.R | 2 +- .../supervised_learning_feature_subsets.py | 54 ++++++++++++++++ 
src/py_stochtree.cpp | 10 +-- stochtree/bart.py | 18 +++++- stochtree/config.py | 62 ++++++++++++++++++- stochtree/sampler.py | 1 + vignettes/BayesianSupervisedLearning.Rmd | 10 ++- 8 files changed, 148 insertions(+), 22 deletions(-) create mode 100644 demo/debug/supervised_learning_feature_subsets.py diff --git a/R/bart.R b/R/bart.R index 6576c83e..1b28ee83 100644 --- a/R/bart.R +++ b/R/bart.R @@ -44,6 +44,7 @@ #' - `keep_every` How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Default `1`. Setting `keep_every <- k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples. #' - `num_chains` How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Default: `1`. #' - `verbose` Whether or not to print progress during the sampling loops. Default: `FALSE`. +#' - `probit_outcome_model` Whether or not the outcome should be modeled as explicitly binary via a probit link. If `TRUE`, `y` must only contain the values `0` and `1`. Default: `FALSE`. #' #' @param mean_forest_params (Optional) A list of mean forest model parameters, each of which has a default value processed internally, so this argument list is optional. #' @@ -58,8 +59,7 @@ #' - `sigma2_leaf_scale` Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees` if not set here. #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. -#' - `probit_outcome_model` Whether or not the outcome should be modeled as explicitly binary via a probit link. If `TRUE`, `y` must only contain the values `0` and `1`. Default: `FALSE`. -#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. #' #' @param variance_forest_params (Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional. #' @@ -74,7 +74,7 @@ #' - `var_forest_prior_scale` Scale parameter in the `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance model (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / leaf_prior_calibration_param^2` if not set. #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. 
-#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. #' #' @return List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk). #' @export @@ -117,7 +117,8 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train sigma2_global_shape = 0, sigma2_global_scale = 0, variable_weights = NULL, random_seed = -1, keep_burnin = FALSE, keep_gfr = FALSE, keep_every = 1, - num_chains = 1, verbose = FALSE + num_chains = 1, verbose = FALSE, + probit_outcome_model = FALSE ) general_params_updated <- preprocessParams( general_params_default, general_params @@ -130,7 +131,6 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train sample_sigma2_leaf = TRUE, sigma2_leaf_init = NULL, sigma2_leaf_shape = 3, sigma2_leaf_scale = NULL, keep_vars = NULL, drop_vars = NULL, - probit_outcome_model = FALSE, num_features_subsample = NULL ) mean_forest_params_updated <- preprocessParams( @@ -167,6 +167,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train keep_every <- general_params_updated$keep_every num_chains <- general_params_updated$num_chains verbose <- general_params_updated$verbose + probit_outcome_model <- general_params_updated$probit_outcome_model # 2. Mean forest parameters num_trees_mean <- mean_forest_params_updated$num_trees @@ -180,7 +181,6 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train b_leaf <- mean_forest_params_updated$sigma2_leaf_scale keep_vars_mean <- mean_forest_params_updated$keep_vars drop_vars_mean <- mean_forest_params_updated$drop_vars - probit_outcome_model <- mean_forest_params_updated$probit_outcome_model num_features_subsample_mean <- mean_forest_params_updated$num_features_subsample # 3. 
Variance forest parameters @@ -388,7 +388,6 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train if (is.null(num_features_subsample_variance)) { num_features_subsample_variance <- ncol(X_train) } - # Convert all input data to matrices if not already converted if ((is.null(dim(leaf_basis_train))) && (!is.null(leaf_basis_train))) { diff --git a/R/config.R b/R/config.R index 53c259ea..0e2a0b40 100644 --- a/R/config.R +++ b/R/config.R @@ -143,7 +143,7 @@ ForestModelConfig <- R6::R6Class( stop("`num_features_subsample` cannot be larger than `num_features`") } if (num_features_subsample <= 0) { - stop("`num_features_subsample` must at least 1") + stop("`num_features_subsample` must be at least 1") } self$num_features_subsample <- num_features_subsample diff --git a/demo/debug/supervised_learning_feature_subsets.py b/demo/debug/supervised_learning_feature_subsets.py new file mode 100644 index 00000000..e3d80bc5 --- /dev/null +++ b/demo/debug/supervised_learning_feature_subsets.py @@ -0,0 +1,54 @@ +# Supervised Learning Demo Script + +# Load necessary libraries +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from stochtree import BARTModel +from sklearn.model_selection import train_test_split + +# Generate sample data +# RNG +random_seed = 1234 +rng = np.random.default_rng(random_seed) + +# Generate covariates and basis +n = 1000 +p_X = 20 +X = rng.uniform(0, 1, (n, p_X)) + +# Define the outcome mean function +def outcome_mean(X): + return np.where( + (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5, + np.where( + (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5, + np.where( + (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5, + 7.5 + ) + ) + ) + +# Generate outcome +epsilon = rng.normal(0, 1, n) +y = outcome_mean(X) + epsilon + +# Test-train split +sample_inds = np.arange(n) +train_inds, test_inds = train_test_split(sample_inds, test_size=0.2) +X_train = X[train_inds,:] +X_test = X[test_inds,:] +y_train = y[train_inds] +y_test = y[test_inds] + +# Run XBART with the full feature set +bart_model_a = BARTModel() +forest_config_a = {"num_trees": 100} +bart_model_a.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_a) + +# Run XBART with each tree considering random subsets of 5 features +bart_model_b = BARTModel() +forest_config_b = {"num_trees": 100, "num_features_subsample": 5} +bart_model_b.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_b) diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp index 916bffa9..5ff25e72 100644 --- a/src/py_stochtree.cpp +++ b/src/py_stochtree.cpp @@ -1028,7 +1028,7 @@ class ForestSamplerCpp { void SampleOneIteration(ForestContainerCpp& forest_samples, ForestCpp& forest, ForestDatasetCpp& dataset, ResidualCpp& residual, RngCpp& rng, py::array_t feature_types, py::array_t sweep_update_indices, int cutpoint_grid_size, py::array_t leaf_model_scale_input, py::array_t variable_weights, double a_forest, double b_forest, double global_variance, - int leaf_model_int, bool keep_forest = true, bool gfr = true) { + int leaf_model_int, int num_features_subsample, bool keep_forest = true, bool gfr = true) { // Refactoring completely out of the Python interface. // Intention to refactor out of the C++ interface in the future. 
bool pre_initialized = true; @@ -1090,13 +1090,13 @@ class ForestSamplerCpp { std::mt19937* rng_ptr = rng.GetRng(); if (gfr) { if (model_type == StochTree::ModelType::kConstantLeafGaussian) { - StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true); + StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample); } else if (model_type == StochTree::ModelType::kUnivariateRegressionLeafGaussian) { - StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true); + StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample); } else if (model_type == StochTree::ModelType::kMultivariateRegressionLeafGaussian) { - StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_basis); + StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample, num_basis); } else if (model_type == StochTree::ModelType::kLogLinearVariance) { - StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, false); + StochTree::GFRSampleOneIter(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, false, num_features_subsample); } } else { if (model_type == StochTree::ModelType::kConstantLeafGaussian) { diff --git a/stochtree/bart.py b/stochtree/bart.py index 0539129a..1b51491e 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -131,6 +131,7 @@ def sample( * `keep_gfr` (`bool`): Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to `False`. Ignored if `num_mcmc == 0`. 
* `keep_every` (`int`): How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Defaults to `1`. Setting `keep_every = k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples. * `num_chains` (`int`): How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Defaults to `1`. + * `probit_outcome_model` (`bool`): Whether or not the outcome should be modeled as explicitly binary via a probit link. If `True`, `y` must only contain the values `0` and `1`. Default: `False`. mean_forest_params : dict, optional Dictionary of mean forest model parameters, each of which has a default value processed internally, so this argument is optional. @@ -146,7 +147,7 @@ def sample( * `sigma2_leaf_scale` (`float`): Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees` if not set here. * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the mean forest. Defaults to `None`. * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the mean forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. - * `probit_outcome_model` (`bool`): Whether or not the outcome should be modeled as explicitly binary via a probit link. If `True`, `y` must only contain the values `0` and `1`. Default: `False`. + * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. variance_forest_params : dict, optional Dictionary of variance forest model parameters, each of which has a default value processed internally, so this argument is optional. @@ -162,6 +163,7 @@ def sample( * `var_forest_prior_scale` (`float`): Scale parameter in the [optional] `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance forest (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / leaf_prior_calibration_param^2` if not set here. * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the variance forest. Defaults to `None`. * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the variance forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. + * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. previous_model_json : str, optional JSON string containing a previous BART model. This can be used to "continue" a sampler interactively after inspecting the samples or to run parallel chains "warm-started" from existing forest samples. Defaults to `None`. 
@@ -206,6 +208,7 @@ def sample( "sigma2_leaf_scale": None, "keep_vars": None, "drop_vars": None, + "num_features_subsample": None, } mean_forest_params_updated = _preprocess_params( mean_forest_params_default, mean_forest_params @@ -224,6 +227,7 @@ def sample( "var_forest_prior_scale": None, "keep_vars": None, "drop_vars": None, + "num_features_subsample": None, } variance_forest_params_updated = _preprocess_params( variance_forest_params_default, variance_forest_params @@ -257,6 +261,7 @@ def sample( b_leaf = mean_forest_params_updated["sigma2_leaf_scale"] keep_vars_mean = mean_forest_params_updated["keep_vars"] drop_vars_mean = mean_forest_params_updated["drop_vars"] + num_features_subsample_mean = mean_forest_params_updated["num_features_subsample"] # 3. Variance forest parameters num_trees_variance = variance_forest_params_updated["num_trees"] @@ -272,6 +277,7 @@ def sample( b_forest = variance_forest_params_updated["var_forest_prior_scale"] keep_vars_variance = variance_forest_params_updated["keep_vars"] drop_vars_variance = variance_forest_params_updated["drop_vars"] + num_features_subsample_variance = variance_forest_params_updated["num_features_subsample"] # Override keep_gfr if there are no MCMC samples if num_mcmc == 0: @@ -714,6 +720,12 @@ def sample( [variable_subset_variance.count(i) == 0 for i in original_var_indices] ] = 0 + # Set num_features_subsample to default, ncol(X_train), if not already set + if num_features_subsample_mean is None: + num_features_subsample_mean = X_train.shape[1] + if num_features_subsample_variance is None: + num_features_subsample_variance = X_train.shape[1] + # Preliminary runtime checks for probit link if not self.include_mean_forest: self.probit_outcome_model = False @@ -1048,7 +1060,8 @@ def sample( max_depth=max_depth_mean, leaf_model_type=leaf_model_mean_forest, leaf_model_scale=current_leaf_scale, - cutpoint_grid_size=cutpoint_grid_size, + cutpoint_grid_size=cutpoint_grid_size, + num_features_subsample=num_features_subsample_mean ) forest_sampler_mean = ForestSampler( forest_dataset_train, @@ -1071,6 +1084,7 @@ def sample( cutpoint_grid_size=cutpoint_grid_size, variance_forest_shape=a_forest, variance_forest_scale=b_forest, + num_features_subsample=num_features_subsample_variance ) forest_sampler_variance = ForestSampler( forest_dataset_train, diff --git a/stochtree/config.py b/stochtree/config.py index 7651e088..72cae512 100644 --- a/stochtree/config.py +++ b/stochtree/config.py @@ -52,7 +52,9 @@ class ForestModelConfig: variance_forest_scale : int, optional Scale parameter for IG leaf models (applicable when `leaf_model_type = 3`). Default: `1`. cutpoint_grid_size : int, optional - Number of unique cutpoints to consider (default: `100`) + Number of unique cutpoints to consider (default: `100`). + num_features_subsample : int, optional + Number of features to subsample for the GFR algorithm (default: `None`). 
""" def __init__( @@ -73,6 +75,7 @@ def __init__( variance_forest_shape=1.0, variance_forest_scale=1.0, cutpoint_grid_size=100, + num_features_subsample=None, ) -> None: # Preprocess inputs and run some error checks if feature_types is None: @@ -150,6 +153,29 @@ def __init__( ) self.sweep_update_indices = sweep_update_indices + if sweep_update_indices is not None: + sweep_update_indices = _standardize_array_to_np(sweep_update_indices) + if np.min(sweep_update_indices) < 0: + raise ValueError( + "sweep_update_indices must be a list / np.array of indices >= 0 and < num_trees", + ) + if np.max(sweep_update_indices) >= num_trees: + raise ValueError( + "sweep_update_indices must be a list / np.array of indices >= 0 and < num_trees", + ) + + if num_features_subsample is None: + num_features_subsample = num_features + if num_features_subsample > num_features: + raise ValueError( + "`num_features_subsample` cannot be larger than `num_features`", + ) + if num_features_subsample <= 0: + raise ValueError( + "`num_features_subsample` must be at least 1", + ) + self.num_features_subsample = num_features_subsample + # Set internal config values self.num_trees = num_trees self.num_features = num_features @@ -372,6 +398,29 @@ def update_cutpoint_grid_size(self, cutpoint_grid_size: int) -> None: """ self.cutpoint_grid_size = cutpoint_grid_size + def update_num_features_subsample(self, num_features_subsample: int) -> None: + """ + Update number of features to subsample for the GFR algorithm. + + Parameters + ---------- + num_features_subsample : int + Number of features to subsample for the GFR algorithm. + + Returns + ------- + self + """ + if num_features_subsample > self.num_features: + raise ValueError( + "`num_features_subsample` cannot be larger than `num_features`", + ) + if num_features_subsample <= 0: + raise ValueError( + "`num_features_subsample` must be at least 1", + ) + self.num_features_subsample = num_features_subsample + def get_feature_types(self) -> np.ndarray: """ Query feature types (integer-coded so that 0 = numeric, 1 = ordered categorical, 2 = unordered categorical) @@ -537,6 +586,17 @@ def get_cutpoint_grid_size(self) -> int: """ return self.cutpoint_grid_size + def get_num_features_subsample(self) -> int: + """ + Query number of features to subsample for the GFR algorithm + + Returns + ------- + num_features_subsample : int + Number of features to subsample for the GFR algorithm + """ + return self.num_features_subsample + class GlobalModelConfig: """ diff --git a/stochtree/sampler.py b/stochtree/sampler.py index 3586c24a..be55286a 100644 --- a/stochtree/sampler.py +++ b/stochtree/sampler.py @@ -170,6 +170,7 @@ def sample_one_iteration( forest_config.get_variance_forest_scale(), global_config.get_global_error_variance(), forest_config.get_leaf_model_type(), + forest_config.get_num_features_subsample(), keep_forest, gfr, ) diff --git a/vignettes/BayesianSupervisedLearning.Rmd b/vignettes/BayesianSupervisedLearning.Rmd index 250f74c1..4270fed4 100644 --- a/vignettes/BayesianSupervisedLearning.Rmd +++ b/vignettes/BayesianSupervisedLearning.Rmd @@ -374,9 +374,8 @@ num_gfr <- 10 num_burnin <- 0 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc -general_params <- list(sample_sigma2_global = F) -mean_forest_params <- list(sample_sigma2_leaf = F, num_trees = 100, - probit_outcome_model = T) +general_params <- list(sample_sigma2_global = F, probit_outcome_model = T) +mean_forest_params <- list(sample_sigma2_leaf = F, num_trees = 100) bart_model_warmstart <- stochtree::bart( 
X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, @@ -447,9 +446,8 @@ num_gfr <- 0 num_burnin <- 100 num_mcmc <- 100 num_samples <- num_gfr + num_burnin + num_mcmc -general_params <- list(sample_sigma2_global = F) -mean_forest_params <- list(sample_sigma2_leaf = F, num_trees = 100, - probit_outcome_model = T) +general_params <- list(sample_sigma2_global = F, probit_outcome_model = T) +mean_forest_params <- list(sample_sigma2_leaf = F, num_trees = 100) bart_model_root <- stochtree::bart( X_train = X_train, y_train = y_train, X_test = X_test, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, From ff61ed81fb857776bc1ab3d5cb24015cb3c230df Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 13:56:39 -0500 Subject: [PATCH 06/12] Updated examples and docs --- demo/debug/supervised_learning_feature_subsets.py | 9 ++++++++- man/bart.Rd | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/demo/debug/supervised_learning_feature_subsets.py b/demo/debug/supervised_learning_feature_subsets.py index e3d80bc5..9ddbb447 100644 --- a/demo/debug/supervised_learning_feature_subsets.py +++ b/demo/debug/supervised_learning_feature_subsets.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt from stochtree import BARTModel from sklearn.model_selection import train_test_split +import timeit # Generate sample data # RNG @@ -15,7 +16,7 @@ # Generate covariates and basis n = 1000 -p_X = 20 +p_X = 100 X = rng.uniform(0, 1, (n, p_X)) # Define the outcome mean function @@ -44,11 +45,17 @@ def outcome_mean(X): y_test = y[test_inds] # Run XBART with the full feature set +s = """\ bart_model_a = BARTModel() forest_config_a = {"num_trees": 100} bart_model_a.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_a) +""" +print(timeit.timeit(stmt=s, number=5, globals=globals())) # Run XBART with each tree considering random subsets of 5 features +s = """\ bart_model_b = BARTModel() forest_config_b = {"num_trees": 100, "num_features_subsample": 5} bart_model_b.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_b) +""" +print(timeit.timeit(stmt=s, number=5, globals=globals())) diff --git a/man/bart.Rd b/man/bart.Rd index 0ea4d966..5712271e 100644 --- a/man/bart.Rd +++ b/man/bart.Rd @@ -83,6 +83,7 @@ that were not in the training set.} \item \code{keep_every} How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Default \code{1}. Setting \code{keep_every <- k} for some \code{k > 1} will "thin" the MCMC samples by retaining every \code{k}-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples. \item \code{num_chains} How many independent MCMC chains should be sampled. If \code{num_mcmc = 0}, this is ignored. If \code{num_gfr = 0}, then each chain is run from root for \code{num_mcmc * keep_every + num_burnin} iterations, with \code{num_mcmc} samples retained. If \code{num_gfr > 0}, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that \code{num_gfr >= num_chains}. Default: \code{1}. \item \code{verbose} Whether or not to print progress during the sampling loops. Default: \code{FALSE}. +\item \code{probit_outcome_model} Whether or not the outcome should be modeled as explicitly binary via a probit link. 
If \code{TRUE}, \code{y} must only contain the values \code{0} and \code{1}. Default: \code{FALSE}. }} \item{mean_forest_params}{(Optional) A list of mean forest model parameters, each of which has a default value processed internally, so this argument list is optional. @@ -98,8 +99,7 @@ that were not in the training set.} \item \code{sigma2_leaf_scale} Scale parameter in the \code{IG(sigma2_leaf_shape, sigma2_leaf_scale)} leaf node parameter variance model. Calibrated internally as \code{0.5/num_trees} if not set here. \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. -\item \code{probit_outcome_model} Whether or not the outcome should be modeled as explicitly binary via a probit link. If \code{TRUE}, \code{y} must only contain the values \code{0} and \code{1}. Default: \code{FALSE}. -\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. }} \item{variance_forest_params}{(Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional. @@ -115,7 +115,7 @@ that were not in the training set.} \item \code{var_forest_prior_scale} Scale parameter in the \code{IG(var_forest_prior_shape, var_forest_prior_scale)} conditional error variance model (which is only sampled if \code{num_trees > 0}). Calibrated internally as \code{num_trees / leaf_prior_calibration_param^2} if not set. \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. -\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. }} } \value{ From 98ebce15b231379777837040bee06fbdbfcacf90 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 14:02:12 -0500 Subject: [PATCH 07/12] Updated BCF interface to allow for feature subsampling --- R/bcf.R | 34 +++++++++++++++++++++++++++------- man/bcf.Rd | 3 +++ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/R/bcf.R b/R/bcf.R index b9842c5d..11696f79 100644 --- a/R/bcf.R +++ b/R/bcf.R @@ -62,6 +62,7 @@ #' - `sigma2_leaf_scale` Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees` if not set here. #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. 
#' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. #' #' @param treatment_effect_forest_params (Optional) A list of treatment effect forest model parameters, each of which has a default value processed internally, so this argument list is optional. #' @@ -78,6 +79,7 @@ #' - `delta_max` Maximum plausible conditional distributional treatment effect (i.e. P(Y(1) = 1 | X) - P(Y(0) = 1 | X)) when the outcome is binary. Only used when the outcome is specified as a probit model in `general_params`. Must be > 0 and < 1. Default: `0.9`. Ignored if `sigma2_leaf_init` is set directly, as this parameter is used to calibrate `sigma2_leaf_init`. #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. #' #' @param variance_forest_params (Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional. #' @@ -92,6 +94,7 @@ #' - `var_forest_prior_scale` Scale parameter in the `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance model (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / 1.5^2` if not set. #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`. #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. +#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. #' #' @return List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk). 
#' @export @@ -171,7 +174,8 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id min_samples_leaf = 5, max_depth = 10, sample_sigma2_leaf = TRUE, sigma2_leaf_init = NULL, sigma2_leaf_shape = 3, sigma2_leaf_scale = NULL, - keep_vars = NULL, drop_vars = NULL + keep_vars = NULL, drop_vars = NULL, + num_features_subsample = NULL ) prognostic_forest_params_updated <- preprocessParams( prognostic_forest_params_default, prognostic_forest_params @@ -183,8 +187,8 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id min_samples_leaf = 5, max_depth = 5, sample_sigma2_leaf = FALSE, sigma2_leaf_init = NULL, sigma2_leaf_shape = 3, sigma2_leaf_scale = NULL, - keep_vars = NULL, drop_vars = NULL, - delta_max = 0.9 + keep_vars = NULL, drop_vars = NULL, delta_max = 0.9, + num_features_subsample = NULL ) treatment_effect_forest_params_updated <- preprocessParams( treatment_effect_forest_params_default, treatment_effect_forest_params @@ -198,7 +202,8 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id variance_forest_init = NULL, var_forest_prior_shape = NULL, var_forest_prior_scale = NULL, - keep_vars = NULL, drop_vars = NULL + keep_vars = NULL, drop_vars = NULL, + num_features_subsample = NULL ) variance_forest_params_updated <- preprocessParams( variance_forest_params_default, variance_forest_params @@ -238,6 +243,7 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id b_leaf_mu <- prognostic_forest_params_updated$sigma2_leaf_scale keep_vars_mu <- prognostic_forest_params_updated$keep_vars drop_vars_mu <- prognostic_forest_params_updated$drop_vars + num_features_subsample_mu <- prognostic_forest_params_updated$num_features_subsample # 3. Tau forest parameters num_trees_tau <- treatment_effect_forest_params_updated$num_trees @@ -252,6 +258,7 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id keep_vars_tau <- treatment_effect_forest_params_updated$keep_vars drop_vars_tau <- treatment_effect_forest_params_updated$drop_vars delta_max <- treatment_effect_forest_params_updated$delta_max + num_features_subsample_tau <- treatment_effect_forest_params_updated$num_features_subsample # 4. 
Variance forest parameters num_trees_variance <- variance_forest_params_updated$num_trees @@ -265,6 +272,7 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id b_forest <- variance_forest_params_updated$var_forest_prior_scale keep_vars_variance <- variance_forest_params_updated$keep_vars drop_vars_variance <- variance_forest_params_updated$drop_vars + num_features_subsample_variance <- variance_forest_params_updated$num_features_subsample # Check if there are enough GFR samples to seed num_chains samplers if (num_gfr > 0) { @@ -477,6 +485,17 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id X_test_raw <- X_test if (!is.null(X_test)) X_test <- preprocessPredictionData(X_test, X_train_metadata) + # Set num_features_subsample to default, ncol(X_train), if not already set + if (is.null(num_features_subsample_mu)) { + num_features_subsample_mu <- ncol(X_train) + } + if (is.null(num_features_subsample_tau)) { + num_features_subsample_tau <- ncol(X_train) + } + if (is.null(num_features_subsample_variance)) { + num_features_subsample_variance <- ncol(X_train) + } + # Convert all input data to matrices if not already converted if ((is.null(dim(Z_train))) && (!is.null(Z_train))) { Z_train <- as.matrix(as.numeric(Z_train)) @@ -899,12 +918,12 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id num_observations=nrow(X_train), variable_weights=variable_weights_mu, leaf_dimension=leaf_dimension_mu_forest, alpha=alpha_mu, beta=beta_mu, min_samples_leaf=min_samples_leaf_mu, max_depth=max_depth_mu, leaf_model_type=leaf_model_mu_forest, leaf_model_scale=current_leaf_scale_mu, - cutpoint_grid_size=cutpoint_grid_size) + cutpoint_grid_size=cutpoint_grid_size, num_features_subsample = num_features_subsample_mu) forest_model_config_tau <- createForestModelConfig(feature_types=feature_types, num_trees=num_trees_tau, num_features=ncol(X_train), num_observations=nrow(X_train), variable_weights=variable_weights_tau, leaf_dimension=leaf_dimension_tau_forest, alpha=alpha_tau, beta=beta_tau, min_samples_leaf=min_samples_leaf_tau, max_depth=max_depth_tau, leaf_model_type=leaf_model_tau_forest, leaf_model_scale=current_leaf_scale_tau, - cutpoint_grid_size=cutpoint_grid_size) + cutpoint_grid_size=cutpoint_grid_size, num_features_subsample = num_features_subsample_tau) forest_model_mu <- createForestModel(forest_dataset_train, forest_model_config_mu, global_model_config) forest_model_tau <- createForestModel(forest_dataset_train, forest_model_config_tau, global_model_config) if (include_variance_forest) { @@ -912,7 +931,8 @@ bcf <- function(X_train, Z_train, y_train, propensity_train = NULL, rfx_group_id num_observations=nrow(X_train), variable_weights=variable_weights_variance, leaf_dimension=leaf_dimension_variance_forest, alpha=alpha_variance, beta=beta_variance, min_samples_leaf=min_samples_leaf_variance, max_depth=max_depth_variance, - leaf_model_type=leaf_model_variance_forest, cutpoint_grid_size=cutpoint_grid_size) + leaf_model_type=leaf_model_variance_forest, cutpoint_grid_size=cutpoint_grid_size, + num_features_subsample=num_features_subsample_variance) forest_model_variance <- createForestModel(forest_dataset_train, forest_model_config_variance, global_model_config) } diff --git a/man/bcf.Rd b/man/bcf.Rd index a7811b6a..14698735 100644 --- a/man/bcf.Rd +++ b/man/bcf.Rd @@ -107,6 +107,7 @@ that were not in the training set.} \item \code{sigma2_leaf_scale} Scale parameter in the \code{IG(sigma2_leaf_shape, 
sigma2_leaf_scale)} leaf node parameter variance model. Calibrated internally as \code{0.5/num_trees} if not set here. \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. }} \item{treatment_effect_forest_params}{(Optional) A list of treatment effect forest model parameters, each of which has a default value processed internally, so this argument list is optional. @@ -124,6 +125,7 @@ that were not in the training set.} \item \code{delta_max} Maximum plausible conditional distributional treatment effect (i.e. P(Y(1) = 1 | X) - P(Y(0) = 1 | X)) when the outcome is binary. Only used when the outcome is specified as a probit model in \code{general_params}. Must be > 0 and < 1. Default: \code{0.9}. Ignored if \code{sigma2_leaf_init} is set directly, as this parameter is used to calibrate \code{sigma2_leaf_init}. \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. }} \item{variance_forest_params}{(Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional. @@ -139,6 +141,7 @@ that were not in the training set.} \item \code{var_forest_prior_scale} Scale parameter in the \code{IG(var_forest_prior_shape, var_forest_prior_scale)} conditional error variance model (which is only sampled if \code{num_trees > 0}). Calibrated internally as \code{num_trees / 1.5^2} if not set. \item \code{keep_vars} Vector of variable names or column indices denoting variables that should be included in the forest. Default: \code{NULL}. \item \code{drop_vars} Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: \code{NULL}. If both \code{drop_vars} and \code{keep_vars} are set, \code{drop_vars} will be ignored. +\item \code{num_features_subsample} How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. }} } \value{ From cf03f4dd765533ef2b0aefc4d18f3b1eef677465 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 14:11:17 -0500 Subject: [PATCH 08/12] Updated Python BCF interface --- stochtree/bcf.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/stochtree/bcf.py b/stochtree/bcf.py index e97e1a88..c4b67232 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -168,6 +168,7 @@ def sample( * `sigma2_leaf_scale` (`float`): Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. 
Calibrated internally as `0.5/num_trees` if not set here. * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the prognostic (`mu(X)`) forest. Defaults to `None`. * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the prognostic (`mu(X)`) forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. + * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. treatment_effect_forest_params : dict, optional Dictionary of treatment effect forest model parameters, each of which has a default value processed internally, so this argument is optional. @@ -184,6 +185,7 @@ def sample( * `delta_max` (`float`): Maximum plausible conditional distributional treatment effect (i.e. P(Y(1) = 1 | X) - P(Y(0) = 1 | X)) when the outcome is binary. Only used when the outcome is specified as a probit model in `general_params`. Must be > 0 and < 1. Defaults to `0.9`. Ignored if `sigma2_leaf_init` is set directly, as this parameter is used to calibrate `sigma2_leaf_init`. * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the treatment effect (`tau(X)`) forest. Defaults to `None`. * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (`tau(X)`) forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. + * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. variance_forest_params : dict, optional Dictionary of variance forest model parameters, each of which has a default value processed internally, so this argument is optional. @@ -199,6 +201,7 @@ def sample( * `var_forest_prior_scale` (`float`): Scale parameter in the [optional] `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance forest (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / 1.5^2` if not set here. * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the variance forest. Defaults to `None`. * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the variance forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored. + * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset. 
Returns ------- @@ -242,6 +245,7 @@ def sample( "sigma2_leaf_scale": None, "keep_vars": None, "drop_vars": None, + "num_features_subsample": None, } prognostic_forest_params_updated = _preprocess_params( prognostic_forest_params_default, prognostic_forest_params @@ -261,6 +265,7 @@ def sample( "delta_max": 0.9, "keep_vars": None, "drop_vars": None, + "num_features_subsample": None, } treatment_effect_forest_params_updated = _preprocess_params( treatment_effect_forest_params_default, treatment_effect_forest_params @@ -279,6 +284,7 @@ def sample( "var_forest_prior_scale": None, "keep_vars": None, "drop_vars": None, + "num_features_subsample": None, } variance_forest_params_updated = _preprocess_params( variance_forest_params_default, variance_forest_params @@ -316,6 +322,7 @@ def sample( b_leaf_mu = prognostic_forest_params_updated["sigma2_leaf_scale"] keep_vars_mu = prognostic_forest_params_updated["keep_vars"] drop_vars_mu = prognostic_forest_params_updated["drop_vars"] + num_features_subsample_mu = prognostic_forest_params_updated["num_features_subsample"] # 3. Tau forest parameters num_trees_tau = treatment_effect_forest_params_updated["num_trees"] @@ -334,6 +341,7 @@ def sample( delta_max = treatment_effect_forest_params_updated["delta_max"] keep_vars_tau = treatment_effect_forest_params_updated["keep_vars"] drop_vars_tau = treatment_effect_forest_params_updated["drop_vars"] + num_features_subsample_tau = treatment_effect_forest_params_updated["num_features_subsample"] # 4. Variance forest parameters num_trees_variance = variance_forest_params_updated["num_trees"] @@ -349,6 +357,7 @@ def sample( b_forest = variance_forest_params_updated["var_forest_prior_scale"] keep_vars_variance = variance_forest_params_updated["keep_vars"] drop_vars_variance = variance_forest_params_updated["drop_vars"] + num_features_subsample_variance = variance_forest_params_updated["num_features_subsample"] # Override keep_gfr if there are no MCMC samples if num_mcmc == 0: @@ -744,6 +753,19 @@ def sample( if not isinstance(keep_gfr, bool): raise ValueError("keep_gfr must be a bool") + # Covariate preprocessing + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.fit(X_train) + X_train_processed = self._covariate_preprocessor.transform(X_train) + if X_test is not None: + X_test_processed = self._covariate_preprocessor.transform(X_test) + feature_types = np.asarray( + self._covariate_preprocessor._processed_feature_types + ) + original_var_indices = ( + self._covariate_preprocessor.fetch_original_feature_indices() + ) + # Standardize the keep variable lists to numeric indices if keep_vars_mu is not None: if isinstance(keep_vars_mu, list): @@ -1052,18 +1074,13 @@ def sample( else: variable_subset_variance = [i for i in range(X_train.shape[1])] - # Covariate preprocessing - self._covariate_preprocessor = CovariatePreprocessor() - self._covariate_preprocessor.fit(X_train) - X_train_processed = self._covariate_preprocessor.transform(X_train) - if X_test is not None: - X_test_processed = self._covariate_preprocessor.transform(X_test) - feature_types = np.asarray( - self._covariate_preprocessor._processed_feature_types - ) - original_var_indices = ( - self._covariate_preprocessor.fetch_original_feature_indices() - ) + # Set num_features_subsample to default, ncol(X_train), if not already set + if num_features_subsample_mu is None: + num_features_subsample_mu = X_train.shape[1] + if num_features_subsample_tau is None: + num_features_subsample_tau = X_train.shape[1] + if 
num_features_subsample_variance is None: + num_features_subsample_variance = X_train.shape[1] # Determine whether a test set is provided self.has_test = X_test is not None @@ -1519,6 +1536,7 @@ def sample( leaf_model_type=leaf_model_mu, leaf_model_scale=current_leaf_scale_mu, cutpoint_grid_size=cutpoint_grid_size, + num_features_subsample=num_features_subsample_mu, ) forest_sampler_mu = ForestSampler( forest_dataset_train, @@ -1539,6 +1557,7 @@ def sample( leaf_model_type=leaf_model_tau, leaf_model_scale=current_leaf_scale_tau, cutpoint_grid_size=cutpoint_grid_size, + num_features_subsample=num_features_subsample_tau, ) forest_sampler_tau = ForestSampler( forest_dataset_train, @@ -1561,6 +1580,7 @@ def sample( cutpoint_grid_size=cutpoint_grid_size, variance_forest_shape=a_forest, variance_forest_scale=b_forest, + num_features_subsample=num_features_subsample_variance, ) forest_sampler_variance = ForestSampler( forest_dataset_train, global_model_config, forest_model_config_variance From 3a576e11a9c35e2247249c3527e0cee6bbfbb553 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 14:59:09 -0500 Subject: [PATCH 09/12] Updated feature subsampling benchmark script --- .../gfr_feature_subsample_microbenchmark.R | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/perf/gfr_feature_subsample_microbenchmark.R b/tools/perf/gfr_feature_subsample_microbenchmark.R index fa289a28..7327ca8a 100644 --- a/tools/perf/gfr_feature_subsample_microbenchmark.R +++ b/tools/perf/gfr_feature_subsample_microbenchmark.R @@ -3,7 +3,7 @@ library(stochtree) library(microbenchmark) # Generate the data -n <- 5000 +n <- 1000 p <- 100 snr <- 2 X <- matrix(runif(n*p), ncol = p) @@ -46,8 +46,8 @@ microbenchmark::microbenchmark( ) Rprof() -stochtree::bart( - X_train = X, y_train = y, +model_subsampling <- stochtree::bart( + X_train = X_train, y_train = y_train, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, general_params = general_params, mean_forest_params = mean_params_a ) @@ -55,10 +55,20 @@ Rprof(NULL) summaryRprof() Rprof() -stochtree::bart( - X_train = X, y_train = y, +model_no_subsampling <- stochtree::bart( + X_train = X_train, y_train = y_train, num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, general_params = general_params, mean_forest_params = mean_params_b ) Rprof(NULL) summaryRprof() + +# Compare out of sample RMSE of the two models +y_hat_test_subsampling <- rowMeans(predict(model_subsampling, X = X_test)$y_hat) +rmse_subsampling <- ( + sqrt(mean((y_hat_test_subsampling - y_test)^2)) +) +y_hat_test_no_subsampling <- rowMeans(predict(model_no_subsampling, X = X_test)$y_hat) +rmse_no_subsampling <- ( + sqrt(mean((y_hat_test_no_subsampling - y_test)^2)) +) From 21f2897d75cbc2a5067afb819ad728a077f8d434 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 15:43:09 -0500 Subject: [PATCH 10/12] Updated benchmarking scripts --- .../supervised_learning_feature_subsets.py | 45 ++++++---- ...bcf_gfr_feature_subsample_microbenchmark.R | 89 +++++++++++++++++++ 2 files changed, 116 insertions(+), 18 deletions(-) create mode 100644 tools/perf/bcf_gfr_feature_subsample_microbenchmark.R diff --git a/demo/debug/supervised_learning_feature_subsets.py b/demo/debug/supervised_learning_feature_subsets.py index 9ddbb447..85e26dd1 100644 --- a/demo/debug/supervised_learning_feature_subsets.py +++ b/demo/debug/supervised_learning_feature_subsets.py @@ -2,9 +2,6 @@ # Load necessary libraries import numpy as np -import pandas as pd -import 
seaborn as sns -import matplotlib.pyplot as plt from stochtree import BARTModel from sklearn.model_selection import train_test_split import timeit @@ -16,25 +13,21 @@ # Generate covariates and basis n = 1000 -p_X = 100 -X = rng.uniform(0, 1, (n, p_X)) +p = 100 +X = rng.uniform(0, 1, (n, p)) # Define the outcome mean function def outcome_mean(X): - return np.where( - (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5, - np.where( - (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5, - np.where( - (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5, - 7.5 - ) - ) + return ( + np.sin(4*np.pi*X[:,0]) + np.cos(4*np.pi*X[:,1]) + np.sin(4*np.pi*X[:,2]) + np.cos(4*np.pi*X[:,3]) ) # Generate outcome -epsilon = rng.normal(0, 1, n) -y = outcome_mean(X) + epsilon +snr = 2 +f_X = outcome_mean(X) +noise_sd = np.std(f_X) / snr +epsilon = rng.normal(0, 1, n) * noise_sd +y = f_X + epsilon # Test-train split sample_inds = np.arange(n) @@ -50,7 +43,8 @@ def outcome_mean(X): forest_config_a = {"num_trees": 100} bart_model_a.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_a) """ -print(timeit.timeit(stmt=s, number=5, globals=globals())) +timing_no_subsampling = timeit.timeit(stmt=s, number=5, globals=globals()) +print(f"Average runtime, without feature subsampling (p = {p:d}): {timing_no_subsampling:.2f}") # Run XBART with each tree considering random subsets of 5 features s = """\ @@ -58,4 +52,19 @@ def outcome_mean(X): forest_config_b = {"num_trees": 100, "num_features_subsample": 5} bart_model_b.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_b) """ -print(timeit.timeit(stmt=s, number=5, globals=globals())) +timing_subsampling = timeit.timeit(stmt=s, number=5, globals=globals()) +print(f"Average runtime, subsampling 5 out of {p:d} features: {timing_subsampling:.2f}") + +# Compare RMSEs of each model +bart_model_a = BARTModel() +forest_config_a = {"num_trees": 100} +bart_model_a.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_a) +bart_model_b = BARTModel() +forest_config_b = {"num_trees": 100, "num_features_subsample": 5} +bart_model_b.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_b) +y_hat_test_a = np.squeeze(bart_model_a.y_hat_test).mean(axis = 1) +rmse_no_subsampling = np.sqrt(np.mean(np.power(y_test - y_hat_test_a,2))) +print(f"Test set RMSE, no subsampling (p = {p:d}): {rmse_no_subsampling:.2f}") +y_hat_test_b = np.squeeze(bart_model_b.y_hat_test).mean(axis = 1) +rmse_subsampling = np.sqrt(np.mean(np.power(y_test - y_hat_test_b,2))) +print(f"Test set RMSE, subsampling 5 out of {p:d} features: {rmse_subsampling:.2f}") diff --git a/tools/perf/bcf_gfr_feature_subsample_microbenchmark.R b/tools/perf/bcf_gfr_feature_subsample_microbenchmark.R new file mode 100644 index 00000000..21c239b9 --- /dev/null +++ b/tools/perf/bcf_gfr_feature_subsample_microbenchmark.R @@ -0,0 +1,89 @@ +# Load libraries +library(stochtree) +library(microbenchmark) + +# Generate the data +n <- 1000 +p <- 100 +snr <- 2 +X <- matrix(rnorm(n*p), ncol = p) +mu_x <- 1 + 2*X[,1] - 4*(X[,2] < 0) + 4*(X[,2] >= 0) + 3*(abs(X[,3]) - sqrt(2/pi)) +tau_x <- 1 + 2*X[,4] +u <- runif(n) +pi_x <- ((mu_x-1)/4) + 4*(u-0.5) +Z <- pi_x + rnorm(n,0,1) +E_XZ <- mu_x + Z*tau_x +noise_sd <- sd(E_XZ) / snr +y <- E_XZ + rnorm(n, 0, 1)*noise_sd + +# Split data into test and train sets +test_set_pct <- 0.2 +n_test <- 
round(test_set_pct*n) +n_train <- n - n_test +test_inds <- sort(sample(1:n, n_test, replace = FALSE)) +train_inds <- (1:n)[!((1:n) %in% test_inds)] +X_test <- X[test_inds,] +X_train <- X[train_inds,] +Z_test <- Z[test_inds] +Z_train <- Z[train_inds] +pi_x_test <- pi_x[test_inds] +pi_x_train <- pi_x[train_inds] +y_test <- y[test_inds] +y_train <- y[train_inds] + +# Sampler settings +num_gfr <- 100 +num_burnin <- 0 +num_mcmc <- 0 +general_params <- list(sample_sigma2_global = T) +prog_params_a <- list(num_trees = 100, num_features_subsample = 5) +trt_params_a <- list(num_trees = 100, num_features_subsample = 5) +prog_params_b <- list(num_trees = 50) +trt_params_b <- list(num_trees = 50) + +# Benchmark sampler with and without feature subsampling +microbenchmark::microbenchmark( + stochtree::bcf( + X_train = X, Z_train = Z, propensity_train = pi_x, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, prognostic_forest_params = prog_params_a, + treatment_effect_forest_params = trt_params_a + ), + stochtree::bcf( + X_train = X, Z_train = Z, propensity_train = pi_x, y_train = y, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, prognostic_forest_params = prog_params_b, + treatment_effect_forest_params = trt_params_b + ), + times = 5 +) + +Rprof() +model_subsampling <- stochtree::bcf( + X_train = X_train, Z_train = Z_train, propensity_train = pi_x_train, y_train = y_train, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, prognostic_forest_params = prog_params_a, + treatment_effect_forest_params = trt_params_a +) +Rprof(NULL) +summaryRprof() + +Rprof() +model_no_subsampling <- stochtree::bcf( + X_train = X_train, Z_train = Z_train, propensity_train = pi_x_train, y_train = y_train, + num_gfr = num_gfr, num_burnin = num_burnin, num_mcmc = num_mcmc, + general_params = general_params, prognostic_forest_params = prog_params_b, + treatment_effect_forest_params = trt_params_b +) +Rprof(NULL) +summaryRprof() + +# Compare out of sample RMSE of the two models +y_hat_test_subsampling <- rowMeans(predict(model_subsampling, X = X_test, Z = Z_test, propensity = pi_x_test)$y_hat) +rmse_subsampling <- ( + sqrt(mean((y_hat_test_subsampling - y_test)^2)) +) +y_hat_test_no_subsampling <- rowMeans(predict(model_no_subsampling, X = X_test, Z = Z_test, propensity = pi_x_test)$y_hat) +rmse_no_subsampling <- ( + sqrt(mean((y_hat_test_no_subsampling - y_test)^2)) +) From e9953002bd6b4709ee2cbb3d9e26495edccd1e81 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 16:02:55 -0500 Subject: [PATCH 11/12] Added python demo and benchmark script --- .../debug/causal_inference_feature_subsets.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 demo/debug/causal_inference_feature_subsets.py diff --git a/demo/debug/causal_inference_feature_subsets.py b/demo/debug/causal_inference_feature_subsets.py new file mode 100644 index 00000000..00cab8b8 --- /dev/null +++ b/demo/debug/causal_inference_feature_subsets.py @@ -0,0 +1,76 @@ +# Supervised Learning Demo Script + +# Load necessary libraries +import numpy as np +from stochtree import BCFModel +from sklearn.model_selection import train_test_split +import timeit + +# Generate sample data +# RNG +random_seed = 1234 +rng = np.random.default_rng(random_seed) + +# Generate covariates and basis +n = 1000 +p = 100 +X = rng.normal(0, 1, (n, p)) + +# Generate outcome +snr = 2 +mu_x = 1 + 
2*X[:,0] + np.where(X[:,1] < 0, -4, 4) + 3*(np.abs(X[:,2]) - np.sqrt(2/np.pi)) +tau_x = 1 + 2*X[:,3] +u = rng.uniform(0, 1, n) +pi_x = ((mu_x-1.)/4.) + 4*(u-0.5) +Z = pi_x + rng.normal(0, 1, n) +E_XZ = mu_x + Z*tau_x +noise_sd = np.std(E_XZ) / snr +y = E_XZ + rng.normal(0, 1, n)*noise_sd + +# Test-train split +sample_inds = np.arange(n) +train_inds, test_inds = train_test_split(sample_inds, test_size=0.2) +X_train = X[train_inds,:] +X_test = X[test_inds,:] +pi_x_train = pi_x[train_inds] +pi_x_test = pi_x[test_inds] +Z_train = Z[train_inds] +Z_test = Z[test_inds] +y_train = y[train_inds] +y_test = y[test_inds] + +# Run XBART with the full feature set +s = """\ +bcf_model_a = BCFModel() +prog_forest_config_a = {"num_trees": 100} +trt_forest_config_a = {"num_trees": 50} +bcf_model_a.sample(X_train=X_train, Z_train=Z_train, pi_train=pi_x_train, y_train=y_train, X_test=X_test, Z_test=Z_test, pi_test=pi_x_test, num_gfr=100, num_mcmc=0, prognostic_forest_params=prog_forest_config_a, treatment_effect_forest_params=trt_forest_config_a) +""" +timing_no_subsampling = timeit.timeit(stmt=s, number=5, globals=globals()) +print(f"Average runtime, without feature subsampling (p = {p:d}): {timing_no_subsampling:.2f}") + +# Run XBART with each tree considering random subsets of 5 features +s = """\ +bcf_model_b = BCFModel() +prog_forest_config_b = {"num_trees": 100, "num_features_subsample": 5} +trt_forest_config_b = {"num_trees": 50, "num_features_subsample": 5} +bcf_model_b.sample(X_train=X_train, Z_train=Z_train, pi_train=pi_x_train, y_train=y_train, X_test=X_test, Z_test=Z_test, pi_test=pi_x_test, num_gfr=100, num_mcmc=0, prognostic_forest_params=prog_forest_config_b, treatment_effect_forest_params=trt_forest_config_b) +""" +timing_subsampling = timeit.timeit(stmt=s, number=5, globals=globals()) +print(f"Average runtime, subsampling 5 out of {p:d} features: {timing_subsampling:.2f}") + +# Compare RMSEs of each model +bcf_model_a = BCFModel() +prog_forest_config_a = {"num_trees": 100} +trt_forest_config_a = {"num_trees": 50} +bcf_model_a.sample(X_train=X_train, Z_train=Z_train, pi_train=pi_x_train, y_train=y_train, X_test=X_test, Z_test=Z_test, pi_test=pi_x_test, num_gfr=100, num_mcmc=0, prognostic_forest_params=prog_forest_config_a, treatment_effect_forest_params=trt_forest_config_a) +bcf_model_b = BCFModel() +prog_forest_config_b = {"num_trees": 100, "num_features_subsample": 5} +trt_forest_config_b = {"num_trees": 50, "num_features_subsample": 5} +bcf_model_b.sample(X_train=X_train, Z_train=Z_train, pi_train=pi_x_train, y_train=y_train, X_test=X_test, Z_test=Z_test, pi_test=pi_x_test, num_gfr=100, num_mcmc=0, prognostic_forest_params=prog_forest_config_b, treatment_effect_forest_params=trt_forest_config_b) +y_hat_test_a = np.squeeze(bcf_model_a.y_hat_test).mean(axis = 1) +rmse_no_subsampling = np.sqrt(np.mean(np.power(y_test - y_hat_test_a,2))) +print(f"Test set RMSE, no subsampling (p = {p:d}): {rmse_no_subsampling:.2f}") +y_hat_test_b = np.squeeze(bcf_model_b.y_hat_test).mean(axis = 1) +rmse_subsampling = np.sqrt(np.mean(np.power(y_test - y_hat_test_b,2))) +print(f"Test set RMSE, subsampling 5 out of {p:d} features: {rmse_subsampling:.2f}") From ee95a2f23a9091fef1998d1cd463bb627de72d87 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 10 Jun 2025 16:18:00 -0500 Subject: [PATCH 12/12] Updated C++ unit tests and standalone program --- debug/api_debug.cpp | 9 +++++---- test/cpp/test_model.cpp | 28 ++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git 
a/debug/api_debug.cpp b/debug/api_debug.cpp index f31e8739..8958bd93 100644 --- a/debug/api_debug.cpp +++ b/debug/api_debug.cpp @@ -668,6 +668,7 @@ void RunDebug(int dgp_num = 0, const ModelType model_type = kConstantLeafGaussia // Prepare the samplers LeafModelVariant leaf_model = leafModelFactory(model_type, leaf_scale, leaf_scale_matrix, a_forest, b_forest); + int num_features_subsample = x_cols; // Initialize vector of sweep update indices std::vector sweep_indices(num_trees); @@ -687,13 +688,13 @@ void RunDebug(int dgp_num = 0, const ModelType model_type = kConstantLeafGaussia // Sample tree ensemble if (model_type == ModelType::kConstantLeafGaussian) { - GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, true); + GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, true, num_features_subsample); } else if (model_type == ModelType::kUnivariateRegressionLeafGaussian) { - GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, true); + GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, true, num_features_subsample); } else if (model_type == ModelType::kMultivariateRegressionLeafGaussian) { - GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, true, omega_cols); + GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, true, num_features_subsample, omega_cols); } else if (model_type == ModelType::kLogLinearVariance) { - GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, false); + GFRSampleOneIter(active_forest, tracker, forest_samples, std::get(leaf_model), dataset, residual, tree_prior, gen, variable_weights, sweep_indices, global_variance, feature_types, cutpoint_grid_size, true, true, false, num_features_subsample); } if (rfx_included) { diff --git a/test/cpp/test_model.cpp b/test/cpp/test_model.cpp index 0e729bef..dbe85b7e 100644 --- a/test/cpp/test_model.cpp +++ b/test/cpp/test_model.cpp @@ -15,6 +15,9 @@ TEST(LeafConstantModel, FullEnumeration) { test_dataset = StochTree::TestUtils::LoadSmallDatasetUnivariateBasis(); std::vector feature_types(test_dataset.x_cols, StochTree::FeatureType::kNumeric); std::vector variable_weights(test_dataset.x_cols, 1./test_dataset.x_cols); + std::vector feature_subset(test_dataset.x_cols, true); + std::random_device rd; + std::mt19937 gen(rd()); // Construct datasets using data_size_t = StochTree::data_size_t; @@ -51,8 +54,8 @@ TEST(LeafConstantModel, FullEnumeration) { // Evaluate all possible cutpoints StochTree::EvaluateAllPossibleSplits( - dataset, 
tracker, residual, tree_prior, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, - cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types + dataset, tracker, residual, tree_prior, gen, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, + cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types, feature_subset ); // Check that there are (n - 2*min_samples_leaf + 1)*p + 1 cutpoints considered @@ -74,6 +77,9 @@ TEST(LeafConstantModel, CutpointThinning) { test_dataset = StochTree::TestUtils::LoadSmallDatasetUnivariateBasis(); std::vector feature_types(test_dataset.x_cols, StochTree::FeatureType::kNumeric); std::vector variable_weights(test_dataset.x_cols, 1./test_dataset.x_cols); + std::vector feature_subset(test_dataset.x_cols, true); + std::random_device rd; + std::mt19937 gen(rd()); // Construct datasets using data_size_t = StochTree::data_size_t; @@ -110,8 +116,8 @@ TEST(LeafConstantModel, CutpointThinning) { // Evaluate all possible cutpoints StochTree::EvaluateAllPossibleSplits( - dataset, tracker, residual, tree_prior, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, - cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types + dataset, tracker, residual, tree_prior, gen, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, + cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types, feature_subset ); // Check that there are (n - 2*min_samples_leaf + 1)*p + 1 cutpoints considered @@ -132,6 +138,9 @@ TEST(LeafUnivariateRegressionModel, FullEnumeration) { test_dataset = StochTree::TestUtils::LoadSmallDatasetUnivariateBasis(); std::vector feature_types(test_dataset.x_cols, StochTree::FeatureType::kNumeric); std::vector variable_weights(test_dataset.x_cols, 1./test_dataset.x_cols); + std::vector feature_subset(test_dataset.x_cols, true); + std::random_device rd; + std::mt19937 gen(rd()); // Construct datasets using data_size_t = StochTree::data_size_t; @@ -169,8 +178,8 @@ TEST(LeafUnivariateRegressionModel, FullEnumeration) { // Evaluate all possible cutpoints StochTree::EvaluateAllPossibleSplits( - dataset, tracker, residual, tree_prior, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, - cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types + dataset, tracker, residual, tree_prior, gen, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, + cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types, feature_subset ); // Check that there are (n - 2*min_samples_leaf + 1)*p + 1 cutpoints considered @@ -192,6 +201,9 @@ TEST(LeafUnivariateRegressionModel, CutpointThinning) { test_dataset = StochTree::TestUtils::LoadSmallDatasetUnivariateBasis(); std::vector feature_types(test_dataset.x_cols, StochTree::FeatureType::kNumeric); std::vector variable_weights(test_dataset.x_cols, 1./test_dataset.x_cols); + std::vector feature_subset(test_dataset.x_cols, true); + std::random_device rd; + std::mt19937 gen(rd()); // Construct datasets using data_size_t = StochTree::data_size_t; @@ -229,8 +241,8 @@ 
TEST(LeafUnivariateRegressionModel, CutpointThinning) { // Evaluate all possible cutpoints StochTree::EvaluateAllPossibleSplits( - dataset, tracker, residual, tree_prior, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, - cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types + dataset, tracker, residual, tree_prior, gen, leaf_model, global_variance, 0, 0, log_cutpoint_evaluations, cutpoint_features, cutpoint_values, + cutpoint_feature_types, valid_cutpoint_count, cutpoint_grid_container, 0, n, variable_weights, feature_types, feature_subset );
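// ---------------------------------------------------------------------------
// Aside (not part of the patch above): a minimal sketch of how a per-tree
// `feature_subset` mask like the one passed to EvaluateAllPossibleSplits in
// these tests might be constructed from `num_features_subsample`. The helper
// name `DrawFeatureSubset` is hypothetical, and the uniform-weight shuffle
// below is only one way to draw the subset; the sampler itself could instead
// rely on the weighted sample_without_replacement utility added earlier in
// this series.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

std::vector<bool> DrawFeatureSubset(int num_features, int num_features_subsample, std::mt19937& gen) {
  // When we subsample at least as many features as exist, every feature stays eligible
  if (num_features_subsample >= num_features) return std::vector<bool>(num_features, true);
  // Otherwise, shuffle the feature indices and mark the first num_features_subsample as eligible
  std::vector<int> indices(num_features);
  std::iota(indices.begin(), indices.end(), 0);
  std::shuffle(indices.begin(), indices.end(), gen);
  std::vector<bool> feature_subset(num_features, false);
  for (int i = 0; i < num_features_subsample; i++) feature_subset[indices[i]] = true;
  return feature_subset;
}
// Usage sketch in the test context above: auto feature_subset = DrawFeatureSubset(test_dataset.x_cols, 5, gen);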