Merge pull request #59 from StochasticTree/python_bcf_var_subsets_mu_tau
Enabling Python BCF to use different subsets of features in the mu and tau forests
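
A minimal usage sketch, condensed from the demo notebook added in this PR (demo/notebooks/causal_inference_feature_subsets.ipynb): keep_vars_tau restricts the treatment effect (tau) forest to the listed covariate columns, while the prognostic (mu) forest still uses every covariate. Reusing the training data as the "test" arguments below is purely for brevity.

# Sketch condensed from the new demo notebook; keep_vars_tau limits the
# treatment effect forest to covariate columns 0 and 1.
import numpy as np
from stochtree import BCFModel

rng = np.random.default_rng()
n, p_X = 1000, 10
X = rng.uniform(0, 1, (n, p_X))
pi_X = 0.25 + 0.5 * X[:, 0]                      # propensity
Z = rng.binomial(1, pi_X, n).astype(float)       # treatment assignment
mu_X = pi_X * 5 + 2 * X[:, 2]                    # prognostic function
tau_X = 1 - 2 * X[:, 0] + 2 * X[:, 1] + X[:, 0] * X[:, 1]  # treatment effect
y = mu_X + tau_X * Z + rng.normal(0, 1, n)

bcf_model_subset = BCFModel()
# Arguments follow the notebook's call signature; the training data
# doubles as the test set here to keep the example short.
bcf_model_subset.sample(X, Z, y, pi_X, X, Z, pi_X,
                        num_gfr=10, num_mcmc=100, keep_vars_tau=[0, 1])
tau_hat = bcf_model_subset.tau_hat_test          # posterior draws of tau(X)
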
andrewherren authored Jun 25, 2024
2 parents 3e2f200 + 31312b9 commit 0f61602
Showing 5 changed files with 533 additions and 105 deletions.
2 changes: 0 additions & 2 deletions R/utils.R
@@ -64,7 +64,6 @@ preprocessPredictionData <- function(input_data, metadata) {
#' Returns a list including a matrix of preprocessed covariate values and associated tracking.
#'
#' @param input_matrix Covariate matrix.
#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable
#'
#' @return List with preprocessed (unmodified) data and details on the number of each type
#' of variable, unique categories associated with categorical variables, and the
@@ -142,7 +141,6 @@ preprocessPredictionMatrix <- function(input_matrix, metadata) {
#'
#' @param input_df Dataframe of covariates. Users must pre-process any
#' categorical variables as factors (ordered for ordered categorical).
#' @param variable_weights Numeric weights reflecting the relative probability of splitting on each variable
#'
#' @return List with preprocessed data and details on the number of each type
#' of variable, unique categories associated with categorical variables, and the
302 changes: 302 additions & 0 deletions demo/notebooks/causal_inference_feature_subsets.ipynb
@@ -0,0 +1,302 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Causal Inference with Feature Subsets Demo Notebook\n",
"\n",
"This is a duplicate of the main causal inference demo which shows how a user might decide to use only a subset of covariates in the treatment effect forest. \n",
"Why might we want to do that? Well, in many cases it is plausible that some covariates (for example age, income, etc...) influence the outcome of interest \n",
"in a causal problem, but do not **moderate** the treatment effect. In this case, we'd need to include these variables in the prognostic forest for deconfounding \n",
"but we don't necessarily need to include them in the treatment effect forest."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load necessary libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from stochtree import BCFModel\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Generate sample data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# RNG\n",
"rng = np.random.default_rng()\n",
"\n",
"# Generate covariates and basis\n",
"n = 1000\n",
"p_X = 10\n",
"X = rng.uniform(0, 1, (n, p_X))\n",
"pi_X = 0.25 + 0.5*X[:,0]\n",
"Z = rng.binomial(1, pi_X, n).astype(float)\n",
"\n",
"# Define the outcome mean functions (prognostic and treatment effects)\n",
"mu_X = pi_X*5 + 2*X[:,2]\n",
"tau_X = 1 - 2*X[:,0] + 2*X[:,1] + 1*X[:,0]*X[:,1]\n",
"\n",
"# Generate outcome\n",
"epsilon = rng.normal(0, 1, n)\n",
"y = mu_X + tau_X*Z + epsilon"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Test-train split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_inds = np.arange(n)\n",
"train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)\n",
"X_train = X[train_inds,:]\n",
"X_test = X[test_inds,:]\n",
"Z_train = Z[train_inds]\n",
"Z_test = Z[test_inds]\n",
"y_train = y[train_inds]\n",
"y_test = y[test_inds]\n",
"pi_train = pi_X[train_inds]\n",
"pi_test = pi_X[test_inds]\n",
"mu_train = mu_X[train_inds]\n",
"mu_test = mu_X[test_inds]\n",
"tau_train = tau_X[train_inds]\n",
"tau_test = tau_X[test_inds]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run BCF without feature subsetting for $\\tau(X)$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bcf_model = BCFModel()\n",
"bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Inspect the MCMC (BART) samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"forest_preds_y_mcmc = bcf_model.y_hat_test\n",
"y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n",
"y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n",
"sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n",
"plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"forest_preds_tau_mcmc = bcf_model.tau_hat_test\n",
"tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n",
"tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n",
"sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n",
"plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"forest_preds_mu_mcmc = bcf_model.mu_hat_test\n",
"mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n",
"mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n",
"sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n",
"plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Sigma\"])\n",
"sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model.num_samples - bcf_model.num_gfr),axis=1), np.expand_dims(bcf_model.b0_samples[bcf_model.num_gfr:],axis=1), np.expand_dims(bcf_model.b1_samples[bcf_model.num_gfr:],axis=1)), axis = 1), columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n",
"sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n",
"sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run BCF, subsetting to the two features that show up in $\\tau(X)$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bcf_model_subset = BCFModel()\n",
"bcf_model_subset.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100, keep_vars_tau=[0,1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Inspect the MCMC (BART) samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"forest_preds_y_mcmc = bcf_model_subset.y_hat_test\n",
"y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis = 1, keepdims = True)\n",
"y_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(y_test,1), y_avg_mcmc), axis = 1), columns=[\"True outcome\", \"Average estimated outcome\"])\n",
"sns.scatterplot(data=y_df_mcmc, x=\"Average estimated outcome\", y=\"True outcome\")\n",
"plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"forest_preds_tau_mcmc = bcf_model_subset.tau_hat_test\n",
"tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis = 1, keepdims = True)\n",
"tau_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(tau_test,1), tau_avg_mcmc), axis = 1), columns=[\"True tau\", \"Average estimated tau\"])\n",
"sns.scatterplot(data=tau_df_mcmc, x=\"True tau\", y=\"Average estimated tau\")\n",
"plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"forest_preds_mu_mcmc = bcf_model_subset.mu_hat_test\n",
"mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis = 1, keepdims = True)\n",
"mu_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(mu_test,1), mu_avg_mcmc), axis = 1), columns=[\"True mu\", \"Average estimated mu\"])\n",
"sns.scatterplot(data=mu_df_mcmc, x=\"True mu\", y=\"Average estimated mu\")\n",
"plt.axline((0, 0), slope=1, color=\"black\", linestyle=(0, (3,3)))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sigma_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n",
" np.expand_dims(bcf_model_subset.global_var_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n",
" columns=[\"Sample\", \"Sigma\"])\n",
"sns.scatterplot(data=sigma_df_mcmc, x=\"Sample\", y=\"Sigma\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"b_df_mcmc = pd.DataFrame(np.concatenate((np.expand_dims(np.arange(bcf_model_subset.num_samples - bcf_model_subset.num_gfr),axis=1), \n",
" np.expand_dims(bcf_model_subset.b0_samples[bcf_model_subset.num_gfr:],axis=1), \n",
" np.expand_dims(bcf_model_subset.b1_samples[bcf_model_subset.num_gfr:],axis=1)), axis = 1), \n",
" columns=[\"Sample\", \"Beta_0\", \"Beta_1\"])\n",
"sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_0\")\n",
"sns.scatterplot(data=b_df_mcmc, x=\"Sample\", y=\"Beta_1\")\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "stochtree-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
24 changes: 6 additions & 18 deletions include/stochtree/tree_sampler.h
@@ -113,15 +113,9 @@ static inline bool NodeNonConstant(ForestDataset& dataset, ForestTracker& tracke

static inline void AddSplitToModel(ForestTracker& tracker, ForestDataset& dataset, TreePrior& tree_prior, TreeSplit& split, std::mt19937& gen, Tree* tree, int tree_num, int leaf_node, int feature_split, bool keep_sorted = false) {
// Use zeros as a "temporary" leaf values since we draw leaf parameters after tree sampling is complete
int basis_dim = dataset.NumBasis();
if (dataset.HasBasis()) {
if (basis_dim > 1) {
std::vector<double> temp_leaf_values(basis_dim, 0.);
tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_values, temp_leaf_values);
} else {
double temp_leaf_value = 0.;
tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_value, temp_leaf_value);
}
if (tree->OutputDimension() > 1) {
std::vector<double> temp_leaf_values(tree->OutputDimension(), 0.);
tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_values, temp_leaf_values);
} else {
double temp_leaf_value = 0.;
tree->ExpandNode(leaf_node, feature_split, split, temp_leaf_value, temp_leaf_value);
@@ -135,15 +129,9 @@ static inline void AddSplitToModel(ForestTracker& tracker, ForestDataset& datase

static inline void RemoveSplitFromModel(ForestTracker& tracker, ForestDataset& dataset, TreePrior& tree_prior, std::mt19937& gen, Tree* tree, int tree_num, int leaf_node, int left_node, int right_node, bool keep_sorted = false) {
// Use zeros as a "temporary" leaf values since we draw leaf parameters after tree sampling is complete
int basis_dim = dataset.NumBasis();
if (dataset.HasBasis()) {
if (basis_dim > 1) {
std::vector<double> temp_leaf_values(basis_dim, 0.);
tree->CollapseToLeaf(leaf_node, temp_leaf_values);
} else {
double temp_leaf_value = 0.;
tree->CollapseToLeaf(leaf_node, temp_leaf_value);
}
if (tree->OutputDimension() > 1) {
std::vector<double> temp_leaf_values(tree->OutputDimension(), 0.);
tree->CollapseToLeaf(leaf_node, temp_leaf_values);
} else {
double temp_leaf_value = 0.;
tree->CollapseToLeaf(leaf_node, temp_leaf_value);